Skip to content

Commit a26a868

Browse files
committed
Add startupPolicy: InOrder to the pathways containers along with streamz changes
Minor edit to the interactive workload connection command
1 parent d9ce2cd commit a26a868

File tree

1 file changed

+105
-105
lines changed

1 file changed

+105
-105
lines changed

src/xpk/commands/workload.py

+105 -105
Original file line number | Diff line number | Diff line change
@@ -258,97 +258,9 @@
258258
operator: "All"
259259
targetReplicatedJobs:
260260
- {args.targetReplicatedJob}
261+
startupPolicy:
262+
startupPolicyOrder: InOrder
261263
replicatedJobs:
262-
- name: worker
263-
replicas: {args.num_slices}
264-
template:
265-
metadata:
266-
annotations:
267-
alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
268-
labels:
269-
xpk.google.com/workload: {args.workload}
270-
spec:
271-
backoffLimit: {backoff_limit}
272-
completions: {system.vms_per_slice}
273-
parallelism: {system.vms_per_slice}
274-
template:
275-
metadata:
276-
annotations:
277-
{storage_annotations}
278-
spec:
279-
terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
280-
serviceAccountName: {service_account}
281-
containers:
282-
- args:
283-
{pathways_worker_args}
284-
image: {args.server_image}
285-
imagePullPolicy: Always
286-
name: pathways-worker
287-
ports:
288-
- containerPort: 29001
289-
- containerPort: 8471
290-
- containerPort: 8080
291-
resources:
292-
limits:
293-
{resource_type}: {system.chips_per_vm}
294-
securityContext:
295-
privileged: true
296-
volumeMounts:
297-
- mountPath: /tmp
298-
name: shared-tmp
299-
{storage_volume_mounts}
300-
env:
301-
- name: PROJECT_ID
302-
value: {args.project}
303-
- name: LOCATION
304-
value: {args.zone}
305-
- name: CLUSTER_NAME
306-
value: {args.cluster}
307-
- name: POD_NAME
308-
valueFrom:
309-
fieldRef:
310-
fieldPath: metadata.name
311-
- name: CONTAINER_NAME
312-
value: "pathways-worker"
313-
- name: NAMESPACE
314-
valueFrom:
315-
fieldRef:
316-
fieldPath: metadata.namespace
317-
# Workaround for v6e
318-
- name: MEGASCALE_GRPC_ENABLE_XOR_TRACER
319-
value: "false"
320-
- name: MEGASCALE_NUM_SLICES
321-
valueFrom:
322-
fieldRef:
323-
fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']"
324-
- name: JOBSET_NAME
325-
valueFrom:
326-
fieldRef:
327-
fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
328-
- name: REPLICATED_JOB_NAME
329-
valueFrom:
330-
fieldRef:
331-
fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
332-
- name: MEGASCALE_SLICE_ID
333-
valueFrom:
334-
fieldRef:
335-
fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']"
336-
- name: MEGASCALE_COORDINATOR_ADDRESS
337-
value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)"
338-
{pathways_sidecar_container}
339-
nodeSelector:
340-
{accelerator_label}
341-
{machine_label}
342-
{autoprovisioning_args}
343-
priorityClassName: {args.priority}
344-
hostNetwork: true
345-
dnsPolicy: ClusterFirstWithHostNet
346-
volumes:
347-
- hostPath:
348-
path: /tmp
349-
type: DirectoryOrCreate
350-
name: shared-tmp
351-
{storage_volumes}
352264
- name: rm
353265
replicas: 1
354266
template:
@@ -365,6 +277,18 @@
365277
- args:
366278
{pathways_rm_args}
367279
env:
280+
- name: REPLICATED_JOB_NAME
281+
valueFrom:
282+
fieldRef:
283+
fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
284+
- name: JOBSET_NAME
285+
valueFrom:
286+
fieldRef:
287+
fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
288+
- name: HOST_ADDRESS
289+
value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
290+
- name: TPU_SKIP_MDS_QUERY
291+
value: "true"
368292
- name: PROJECT_ID
369293
value: {args.project}
370294
- name: LOCATION
@@ -381,19 +305,6 @@
381305
valueFrom:
382306
fieldRef:
383307
fieldPath: metadata.namespace
384-
- name: REPLICATED_JOB_NAME
385-
valueFrom:
386-
fieldRef:
387-
fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
388-
- name: JOBSET_NAME
389-
valueFrom:
390-
fieldRef:
391-
fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
392-
- name: HOST_ADDRESS
393-
value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
394-
- name: TPU_SKIP_MDS_QUERY
395-
value: "true"
396-
image: {args.server_image}
397308
imagePullPolicy: Always
398309
name: pathways-rm
399310
ports:
@@ -454,6 +365,96 @@
454365
nodeSelector:
455366
cloud.google.com/gke-nodepool: cpu-proxy-np
456367
{user_workload}
368+
- name: worker
369+
replicas: {args.num_slices}
370+
template:
371+
metadata:
372+
annotations:
373+
alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
374+
labels:
375+
xpk.google.com/workload: {args.workload}
376+
spec:
377+
backoffLimit: {backoff_limit}
378+
completions: {system.vms_per_slice}
379+
parallelism: {system.vms_per_slice}
380+
template:
381+
metadata:
382+
annotations:
383+
{storage_annotations}
384+
spec:
385+
terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
386+
serviceAccountName: {service_account}
387+
containers:
388+
- args:
389+
{pathways_worker_args}
390+
image: {args.server_image}
391+
imagePullPolicy: Always
392+
name: pathways-worker
393+
ports:
394+
- containerPort: 29001
395+
- containerPort: 8471
396+
- containerPort: 8080
397+
resources:
398+
limits:
399+
{resource_type}: {system.chips_per_vm}
400+
securityContext:
401+
privileged: true
402+
volumeMounts:
403+
- mountPath: /tmp
404+
name: shared-tmp
405+
{storage_volume_mounts}
406+
env:
407+
- name: PROJECT_ID
408+
value: {args.project}
409+
- name: LOCATION
410+
value: {args.zone}
411+
- name: CLUSTER_NAME
412+
value: {args.cluster}
413+
- name: POD_NAME
414+
valueFrom:
415+
fieldRef:
416+
fieldPath: metadata.name
417+
- name: CONTAINER_NAME
418+
value: "pathways-worker"
419+
- name: NAMESPACE
420+
valueFrom:
421+
fieldRef:
422+
fieldPath: metadata.namespace
423+
# Workaround for v6e
424+
- name: MEGASCALE_GRPC_ENABLE_XOR_TRACER
425+
value: "false"
426+
- name: MEGASCALE_NUM_SLICES
427+
valueFrom:
428+
fieldRef:
429+
fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']"
430+
- name: JOBSET_NAME
431+
valueFrom:
432+
fieldRef:
433+
fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
434+
- name: REPLICATED_JOB_NAME
435+
valueFrom:
436+
fieldRef:
437+
fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
438+
- name: MEGASCALE_SLICE_ID
439+
valueFrom:
440+
fieldRef:
441+
fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']"
442+
- name: MEGASCALE_COORDINATOR_ADDRESS
443+
value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)"
444+
{pathways_sidecar_container}
445+
nodeSelector:
446+
{accelerator_label}
447+
{machine_label}
448+
{autoprovisioning_args}
449+
priorityClassName: {args.priority}
450+
hostNetwork: true
451+
dnsPolicy: ClusterFirstWithHostNet
452+
volumes:
453+
- hostPath:
454+
path: /tmp
455+
type: DirectoryOrCreate
456+
name: shared-tmp
457+
{storage_volumes}
457458
"""
458459

459460

@@ -742,8 +743,7 @@ def workload_create(args) -> None:
742743
' done! ******* '
743744
)
744745
xpk_print(
745-
'Steps to connect to the proxy: kubectl get pods | grep proxy ;'
746-
' kubectl port-forward <proxy-pod-name> 29000:29000; '
746+
'Steps to connect to the proxy: kubectl get pods | grep {args.workload}-proxy-0 | awk "{print $1}" | xargs -I {} kubectl port-forward {} 29000:29000 &'
747747
' JAX_PLATFORMS=proxy; JAX_BACKEND_TARGET=grpc://127.0.0.1:29000;'
748748
" python -c 'import pathwaysutils; import jax; print(jax.devices())'"
749749
)

0 commit comments

Comments
 (0)