| 
258 | 258 |     operator: "All"  | 
259 | 259 |     targetReplicatedJobs:  | 
260 | 260 |     - {args.targetReplicatedJob}  | 
 | 261 | +  startupPolicy:  | 
 | 262 | +    startupPolicyOrder: InOrder  | 
261 | 263 |   replicatedJobs:  | 
262 |  | -    - name: worker  | 
263 |  | -      replicas: {args.num_slices}  | 
264 |  | -      template:  | 
265 |  | -        metadata:  | 
266 |  | -          annotations:  | 
267 |  | -            alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool  | 
268 |  | -          labels:  | 
269 |  | -            xpk.google.com/workload: {args.workload}  | 
270 |  | -        spec:  | 
271 |  | -          backoffLimit: {backoff_limit}  | 
272 |  | -          completions: {system.vms_per_slice}  | 
273 |  | -          parallelism: {system.vms_per_slice}  | 
274 |  | -          template:  | 
275 |  | -            metadata:  | 
276 |  | -              annotations:  | 
277 |  | -                {storage_annotations}  | 
278 |  | -            spec:  | 
279 |  | -              terminationGracePeriodSeconds: {args.termination_grace_period_seconds}  | 
280 |  | -              serviceAccountName: {service_account}  | 
281 |  | -              containers:  | 
282 |  | -              - args:  | 
283 |  | -                {pathways_worker_args}  | 
284 |  | -                image: {args.server_image}  | 
285 |  | -                imagePullPolicy: Always  | 
286 |  | -                name: pathways-worker  | 
287 |  | -                ports:  | 
288 |  | -                - containerPort: 29001  | 
289 |  | -                - containerPort: 8471  | 
290 |  | -                - containerPort: 8080  | 
291 |  | -                resources:  | 
292 |  | -                  limits:  | 
293 |  | -                    {resource_type}: {system.chips_per_vm}  | 
294 |  | -                securityContext:  | 
295 |  | -                  privileged: true  | 
296 |  | -                volumeMounts:  | 
297 |  | -                - mountPath: /tmp  | 
298 |  | -                  name: shared-tmp  | 
299 |  | -                {storage_volume_mounts}  | 
300 |  | -                env:  | 
301 |  | -                  - name: PROJECT_ID  | 
302 |  | -                    value: {args.project}  | 
303 |  | -                  - name: LOCATION  | 
304 |  | -                    value: {args.zone}  | 
305 |  | -                  - name: CLUSTER_NAME  | 
306 |  | -                    value: {args.cluster}  | 
307 |  | -                  - name: POD_NAME  | 
308 |  | -                    valueFrom:  | 
309 |  | -                      fieldRef:  | 
310 |  | -                        fieldPath: metadata.name  | 
311 |  | -                  - name: CONTAINER_NAME  | 
312 |  | -                    value: "pathways-worker"  | 
313 |  | -                  - name: NAMESPACE  | 
314 |  | -                    valueFrom:  | 
315 |  | -                      fieldRef:  | 
316 |  | -                        fieldPath: metadata.namespace  | 
317 |  | -                  # Workaround for v6e  | 
318 |  | -                  - name: MEGASCALE_GRPC_ENABLE_XOR_TRACER  | 
319 |  | -                    value: "false"  | 
320 |  | -                  - name: MEGASCALE_NUM_SLICES  | 
321 |  | -                    valueFrom:  | 
322 |  | -                      fieldRef:  | 
323 |  | -                        fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']"  | 
324 |  | -                  - name: JOBSET_NAME  | 
325 |  | -                    valueFrom:  | 
326 |  | -                      fieldRef:  | 
327 |  | -                        fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']  | 
328 |  | -                  - name: REPLICATED_JOB_NAME  | 
329 |  | -                    valueFrom:  | 
330 |  | -                      fieldRef:  | 
331 |  | -                        fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']  | 
332 |  | -                  - name: MEGASCALE_SLICE_ID  | 
333 |  | -                    valueFrom:  | 
334 |  | -                      fieldRef:  | 
335 |  | -                        fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']"  | 
336 |  | -                  - name: MEGASCALE_COORDINATOR_ADDRESS  | 
337 |  | -                    value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)"  | 
338 |  | -              {pathways_sidecar_container}  | 
339 |  | -              nodeSelector:  | 
340 |  | -                {accelerator_label}  | 
341 |  | -                {machine_label}  | 
342 |  | -                {autoprovisioning_args}  | 
343 |  | -              priorityClassName: {args.priority}  | 
344 |  | -              hostNetwork: true  | 
345 |  | -              dnsPolicy: ClusterFirstWithHostNet  | 
346 |  | -              volumes:  | 
347 |  | -              - hostPath:  | 
348 |  | -                  path: /tmp  | 
349 |  | -                  type: DirectoryOrCreate  | 
350 |  | -                name: shared-tmp  | 
351 |  | -              {storage_volumes}  | 
352 | 264 |     - name: rm  | 
353 | 265 |       replicas: 1  | 
354 | 266 |       template:  | 
 | 
365 | 277 |               - args:  | 
366 | 278 |                 {pathways_rm_args}  | 
367 | 279 |                 env:  | 
 | 280 | +                - name: REPLICATED_JOB_NAME  | 
 | 281 | +                  valueFrom:  | 
 | 282 | +                    fieldRef:  | 
 | 283 | +                      fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']  | 
 | 284 | +                - name: JOBSET_NAME  | 
 | 285 | +                  valueFrom:  | 
 | 286 | +                    fieldRef:  | 
 | 287 | +                      fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']  | 
 | 288 | +                - name: HOST_ADDRESS  | 
 | 289 | +                  value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)  | 
 | 290 | +                - name: TPU_SKIP_MDS_QUERY  | 
 | 291 | +                  value: "true"  | 
368 | 292 |                 - name: PROJECT_ID  | 
369 | 293 |                   value: {args.project}  | 
370 | 294 |                 - name: LOCATION  | 
 | 
381 | 305 |                   valueFrom:  | 
382 | 306 |                     fieldRef:  | 
383 | 307 |                       fieldPath: metadata.namespace  | 
384 |  | -                - name: REPLICATED_JOB_NAME  | 
385 |  | -                  valueFrom:  | 
386 |  | -                    fieldRef:  | 
387 |  | -                      fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']  | 
388 |  | -                - name: JOBSET_NAME  | 
389 |  | -                  valueFrom:  | 
390 |  | -                    fieldRef:  | 
391 |  | -                      fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']  | 
392 |  | -                - name: HOST_ADDRESS  | 
393 |  | -                  value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)  | 
394 |  | -                - name: TPU_SKIP_MDS_QUERY  | 
395 |  | -                  value: "true"  | 
396 |  | -                image: {args.server_image}  | 
397 | 308 |                 imagePullPolicy: Always  | 
398 | 309 |                 name: pathways-rm  | 
399 | 310 |                 ports:  | 
 | 
454 | 365 |               nodeSelector:  | 
455 | 366 |                 cloud.google.com/gke-nodepool: cpu-proxy-np  | 
456 | 367 |     {user_workload}  | 
 | 368 | +    - name: worker  | 
 | 369 | +      replicas: {args.num_slices}  | 
 | 370 | +      template:  | 
 | 371 | +        metadata:  | 
 | 372 | +          annotations:  | 
 | 373 | +            alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool  | 
 | 374 | +          labels:  | 
 | 375 | +            xpk.google.com/workload: {args.workload}  | 
 | 376 | +        spec:  | 
 | 377 | +          backoffLimit: {backoff_limit}  | 
 | 378 | +          completions: {system.vms_per_slice}  | 
 | 379 | +          parallelism: {system.vms_per_slice}  | 
 | 380 | +          template:  | 
 | 381 | +            metadata:  | 
 | 382 | +              annotations:  | 
 | 383 | +                {storage_annotations}  | 
 | 384 | +            spec:  | 
 | 385 | +              terminationGracePeriodSeconds: {args.termination_grace_period_seconds}  | 
 | 386 | +              serviceAccountName: {service_account}  | 
 | 387 | +              containers:  | 
 | 388 | +              - args:  | 
 | 389 | +                {pathways_worker_args}  | 
 | 390 | +                image: {args.server_image}  | 
 | 391 | +                imagePullPolicy: Always  | 
 | 392 | +                name: pathways-worker  | 
 | 393 | +                ports:  | 
 | 394 | +                - containerPort: 29001  | 
 | 395 | +                - containerPort: 8471  | 
 | 396 | +                - containerPort: 8080  | 
 | 397 | +                resources:  | 
 | 398 | +                  limits:  | 
 | 399 | +                    {resource_type}: {system.chips_per_vm}  | 
 | 400 | +                securityContext:  | 
 | 401 | +                  privileged: true  | 
 | 402 | +                volumeMounts:  | 
 | 403 | +                - mountPath: /tmp  | 
 | 404 | +                  name: shared-tmp  | 
 | 405 | +                {storage_volume_mounts}  | 
 | 406 | +                env:  | 
 | 407 | +                  - name: PROJECT_ID  | 
 | 408 | +                    value: {args.project}  | 
 | 409 | +                  - name: LOCATION  | 
 | 410 | +                    value: {args.zone}  | 
 | 411 | +                  - name: CLUSTER_NAME  | 
 | 412 | +                    value: {args.cluster}  | 
 | 413 | +                  - name: POD_NAME  | 
 | 414 | +                    valueFrom:  | 
 | 415 | +                      fieldRef:  | 
 | 416 | +                        fieldPath: metadata.name  | 
 | 417 | +                  - name: CONTAINER_NAME  | 
 | 418 | +                    value: "pathways-worker"  | 
 | 419 | +                  - name: NAMESPACE  | 
 | 420 | +                    valueFrom:  | 
 | 421 | +                      fieldRef:  | 
 | 422 | +                        fieldPath: metadata.namespace  | 
 | 423 | +                  # Workaround for v6e  | 
 | 424 | +                  - name: MEGASCALE_GRPC_ENABLE_XOR_TRACER  | 
 | 425 | +                    value: "false"  | 
 | 426 | +                  - name: MEGASCALE_NUM_SLICES  | 
 | 427 | +                    valueFrom:  | 
 | 428 | +                      fieldRef:  | 
 | 429 | +                        fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']"  | 
 | 430 | +                  - name: JOBSET_NAME  | 
 | 431 | +                    valueFrom:  | 
 | 432 | +                      fieldRef:  | 
 | 433 | +                        fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']  | 
 | 434 | +                  - name: REPLICATED_JOB_NAME  | 
 | 435 | +                    valueFrom:  | 
 | 436 | +                      fieldRef:  | 
 | 437 | +                        fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']  | 
 | 438 | +                  - name: MEGASCALE_SLICE_ID  | 
 | 439 | +                    valueFrom:  | 
 | 440 | +                      fieldRef:  | 
 | 441 | +                        fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']"  | 
 | 442 | +                  - name: MEGASCALE_COORDINATOR_ADDRESS  | 
 | 443 | +                    value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)"  | 
 | 444 | +              {pathways_sidecar_container}  | 
 | 445 | +              nodeSelector:  | 
 | 446 | +                {accelerator_label}  | 
 | 447 | +                {machine_label}  | 
 | 448 | +                {autoprovisioning_args}  | 
 | 449 | +              priorityClassName: {args.priority}  | 
 | 450 | +              hostNetwork: true  | 
 | 451 | +              dnsPolicy: ClusterFirstWithHostNet  | 
 | 452 | +              volumes:  | 
 | 453 | +              - hostPath:  | 
 | 454 | +                  path: /tmp  | 
 | 455 | +                  type: DirectoryOrCreate  | 
 | 456 | +                name: shared-tmp  | 
 | 457 | +              {storage_volumes}  | 
457 | 458 | """  | 
458 | 459 | 
 
  | 
459 | 460 | 
 
  | 
@@ -742,8 +743,7 @@ def workload_create(args) -> None:  | 
742 | 743 |           ' done! ******* '  | 
743 | 744 |       )  | 
744 | 745 |       xpk_print(  | 
745 |  | -          'Steps to connect to the proxy: kubectl get pods | grep proxy ;'  | 
746 |  | -          ' kubectl port-forward <proxy-pod-name> 29000:29000; '  | 
 | 746 | +          'Steps to connect to the proxy: kubectl get pods | grep {args.workload}-proxy-0 | awk "{print $1}" | xargs -I {} kubectl port-forward {} 29000:29000 &'  | 
747 | 747 |           ' JAX_PLATFORMS=proxy; JAX_BACKEND_TARGET=grpc://127.0.0.1:29000;'  | 
748 | 748 |           " python -c 'import pathwaysutils; import jax; print(jax.devices())'"  | 
749 | 749 |       )  | 
 | 
0 commit comments