|
258 | 258 | operator: "All"
|
259 | 259 | targetReplicatedJobs:
|
260 | 260 | - {args.targetReplicatedJob}
|
| 261 | + startupPolicy: |
| 262 | + startupPolicyOrder: InOrder |
261 | 263 | replicatedJobs:
|
262 |
| - - name: worker |
263 |
| - replicas: {args.num_slices} |
264 |
| - template: |
265 |
| - metadata: |
266 |
| - annotations: |
267 |
| - alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool |
268 |
| - labels: |
269 |
| - xpk.google.com/workload: {args.workload} |
270 |
| - spec: |
271 |
| - backoffLimit: {backoff_limit} |
272 |
| - completions: {system.vms_per_slice} |
273 |
| - parallelism: {system.vms_per_slice} |
274 |
| - template: |
275 |
| - metadata: |
276 |
| - annotations: |
277 |
| - {storage_annotations} |
278 |
| - spec: |
279 |
| - terminationGracePeriodSeconds: {args.termination_grace_period_seconds} |
280 |
| - serviceAccountName: {service_account} |
281 |
| - containers: |
282 |
| - - args: |
283 |
| - {pathways_worker_args} |
284 |
| - image: {args.server_image} |
285 |
| - imagePullPolicy: Always |
286 |
| - name: pathways-worker |
287 |
| - ports: |
288 |
| - - containerPort: 29001 |
289 |
| - - containerPort: 8471 |
290 |
| - - containerPort: 8080 |
291 |
| - resources: |
292 |
| - limits: |
293 |
| - {resource_type}: {system.chips_per_vm} |
294 |
| - securityContext: |
295 |
| - privileged: true |
296 |
| - volumeMounts: |
297 |
| - - mountPath: /tmp |
298 |
| - name: shared-tmp |
299 |
| - {storage_volume_mounts} |
300 |
| - env: |
301 |
| - - name: PROJECT_ID |
302 |
| - value: {args.project} |
303 |
| - - name: LOCATION |
304 |
| - value: {args.zone} |
305 |
| - - name: CLUSTER_NAME |
306 |
| - value: {args.cluster} |
307 |
| - - name: POD_NAME |
308 |
| - valueFrom: |
309 |
| - fieldRef: |
310 |
| - fieldPath: metadata.name |
311 |
| - - name: CONTAINER_NAME |
312 |
| - value: "pathways-worker" |
313 |
| - - name: NAMESPACE |
314 |
| - valueFrom: |
315 |
| - fieldRef: |
316 |
| - fieldPath: metadata.namespace |
317 |
| - # Workaround for v6e |
318 |
| - - name: MEGASCALE_GRPC_ENABLE_XOR_TRACER |
319 |
| - value: "false" |
320 |
| - - name: MEGASCALE_NUM_SLICES |
321 |
| - valueFrom: |
322 |
| - fieldRef: |
323 |
| - fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']" |
324 |
| - - name: JOBSET_NAME |
325 |
| - valueFrom: |
326 |
| - fieldRef: |
327 |
| - fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] |
328 |
| - - name: REPLICATED_JOB_NAME |
329 |
| - valueFrom: |
330 |
| - fieldRef: |
331 |
| - fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] |
332 |
| - - name: MEGASCALE_SLICE_ID |
333 |
| - valueFrom: |
334 |
| - fieldRef: |
335 |
| - fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']" |
336 |
| - - name: MEGASCALE_COORDINATOR_ADDRESS |
337 |
| - value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)" |
338 |
| - {pathways_sidecar_container} |
339 |
| - nodeSelector: |
340 |
| - {accelerator_label} |
341 |
| - {machine_label} |
342 |
| - {autoprovisioning_args} |
343 |
| - priorityClassName: {args.priority} |
344 |
| - hostNetwork: true |
345 |
| - dnsPolicy: ClusterFirstWithHostNet |
346 |
| - volumes: |
347 |
| - - hostPath: |
348 |
| - path: /tmp |
349 |
| - type: DirectoryOrCreate |
350 |
| - name: shared-tmp |
351 |
| - {storage_volumes} |
352 | 264 | - name: rm
|
353 | 265 | replicas: 1
|
354 | 266 | template:
|
|
365 | 277 | - args:
|
366 | 278 | {pathways_rm_args}
|
367 | 279 | env:
|
| 280 | + - name: REPLICATED_JOB_NAME |
| 281 | + valueFrom: |
| 282 | + fieldRef: |
| 283 | + fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] |
| 284 | + - name: JOBSET_NAME |
| 285 | + valueFrom: |
| 286 | + fieldRef: |
| 287 | + fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] |
| 288 | + - name: HOST_ADDRESS |
| 289 | + value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME) |
| 290 | + - name: TPU_SKIP_MDS_QUERY |
| 291 | + value: "true" |
368 | 292 | - name: PROJECT_ID
|
369 | 293 | value: {args.project}
|
370 | 294 | - name: LOCATION
|
|
381 | 305 | valueFrom:
|
382 | 306 | fieldRef:
|
383 | 307 | fieldPath: metadata.namespace
|
384 |
| - - name: REPLICATED_JOB_NAME |
385 |
| - valueFrom: |
386 |
| - fieldRef: |
387 |
| - fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] |
388 |
| - - name: JOBSET_NAME |
389 |
| - valueFrom: |
390 |
| - fieldRef: |
391 |
| - fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] |
392 |
| - - name: HOST_ADDRESS |
393 |
| - value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME) |
394 |
| - - name: TPU_SKIP_MDS_QUERY |
395 |
| - value: "true" |
396 |
| - image: {args.server_image} |
397 | 308 | imagePullPolicy: Always
|
398 | 309 | name: pathways-rm
|
399 | 310 | ports:
|
|
454 | 365 | nodeSelector:
|
455 | 366 | cloud.google.com/gke-nodepool: cpu-proxy-np
|
456 | 367 | {user_workload}
|
| 368 | + - name: worker |
| 369 | + replicas: {args.num_slices} |
| 370 | + template: |
| 371 | + metadata: |
| 372 | + annotations: |
| 373 | + alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool |
| 374 | + labels: |
| 375 | + xpk.google.com/workload: {args.workload} |
| 376 | + spec: |
| 377 | + backoffLimit: {backoff_limit} |
| 378 | + completions: {system.vms_per_slice} |
| 379 | + parallelism: {system.vms_per_slice} |
| 380 | + template: |
| 381 | + metadata: |
| 382 | + annotations: |
| 383 | + {storage_annotations} |
| 384 | + spec: |
| 385 | + terminationGracePeriodSeconds: {args.termination_grace_period_seconds} |
| 386 | + serviceAccountName: {service_account} |
| 387 | + containers: |
| 388 | + - args: |
| 389 | + {pathways_worker_args} |
| 390 | + image: {args.server_image} |
| 391 | + imagePullPolicy: Always |
| 392 | + name: pathways-worker |
| 393 | + ports: |
| 394 | + - containerPort: 29001 |
| 395 | + - containerPort: 8471 |
| 396 | + - containerPort: 8080 |
| 397 | + resources: |
| 398 | + limits: |
| 399 | + {resource_type}: {system.chips_per_vm} |
| 400 | + securityContext: |
| 401 | + privileged: true |
| 402 | + volumeMounts: |
| 403 | + - mountPath: /tmp |
| 404 | + name: shared-tmp |
| 405 | + {storage_volume_mounts} |
| 406 | + env: |
| 407 | + - name: PROJECT_ID |
| 408 | + value: {args.project} |
| 409 | + - name: LOCATION |
| 410 | + value: {args.zone} |
| 411 | + - name: CLUSTER_NAME |
| 412 | + value: {args.cluster} |
| 413 | + - name: POD_NAME |
| 414 | + valueFrom: |
| 415 | + fieldRef: |
| 416 | + fieldPath: metadata.name |
| 417 | + - name: CONTAINER_NAME |
| 418 | + value: "pathways-worker" |
| 419 | + - name: NAMESPACE |
| 420 | + valueFrom: |
| 421 | + fieldRef: |
| 422 | + fieldPath: metadata.namespace |
| 423 | + # Workaround for v6e |
| 424 | + - name: MEGASCALE_GRPC_ENABLE_XOR_TRACER |
| 425 | + value: "false" |
| 426 | + - name: MEGASCALE_NUM_SLICES |
| 427 | + valueFrom: |
| 428 | + fieldRef: |
| 429 | + fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']" |
| 430 | + - name: JOBSET_NAME |
| 431 | + valueFrom: |
| 432 | + fieldRef: |
| 433 | + fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] |
| 434 | + - name: REPLICATED_JOB_NAME |
| 435 | + valueFrom: |
| 436 | + fieldRef: |
| 437 | + fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] |
| 438 | + - name: MEGASCALE_SLICE_ID |
| 439 | + valueFrom: |
| 440 | + fieldRef: |
| 441 | + fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']" |
| 442 | + - name: MEGASCALE_COORDINATOR_ADDRESS |
| 443 | + value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)" |
| 444 | + {pathways_sidecar_container} |
| 445 | + nodeSelector: |
| 446 | + {accelerator_label} |
| 447 | + {machine_label} |
| 448 | + {autoprovisioning_args} |
| 449 | + priorityClassName: {args.priority} |
| 450 | + hostNetwork: true |
| 451 | + dnsPolicy: ClusterFirstWithHostNet |
| 452 | + volumes: |
| 453 | + - hostPath: |
| 454 | + path: /tmp |
| 455 | + type: DirectoryOrCreate |
| 456 | + name: shared-tmp |
| 457 | + {storage_volumes} |
457 | 458 | """
|
458 | 459 |
|
459 | 460 |
|
@@ -742,8 +743,7 @@ def workload_create(args) -> None:
|
742 | 743 | ' done! ******* '
|
743 | 744 | )
|
744 | 745 | xpk_print(
|
745 |
| - 'Steps to connect to the proxy: kubectl get pods | grep proxy ;' |
746 |
| - ' kubectl port-forward <proxy-pod-name> 29000:29000; ' |
| 746 | + 'Steps to connect to the proxy: kubectl get pods | grep {args.workload}-proxy-0 | awk "{print $1}" | xargs -I {} kubectl port-forward {} 29000:29000 &' |
747 | 747 | ' JAX_PLATFORMS=proxy; JAX_BACKEND_TARGET=grpc://127.0.0.1:29000;'
|
748 | 748 | " python -c 'import pathwaysutils; import jax; print(jax.devices())'"
|
749 | 749 | )
|
|
0 commit comments