Scale testing fixes to Pathways XPK integration #442

Open · wants to merge 1 commit into develop
210 changes: 105 additions & 105 deletions src/xpk/commands/workload.py
@@ -258,97 +258,9 @@
operator: "All"
targetReplicatedJobs:
- {args.targetReplicatedJob}
startupPolicy:
startupPolicyOrder: InOrder
replicatedJobs:
- name: worker
replicas: {args.num_slices}
template:
metadata:
annotations:
alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
labels:
xpk.google.com/workload: {args.workload}
spec:
backoffLimit: {backoff_limit}
completions: {system.vms_per_slice}
parallelism: {system.vms_per_slice}
template:
metadata:
annotations:
{storage_annotations}
spec:
terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
serviceAccountName: {service_account}
containers:
- args:
{pathways_worker_args}
image: {args.server_image}
imagePullPolicy: Always
name: pathways-worker
ports:
- containerPort: 29001
- containerPort: 8471
- containerPort: 8080
resources:
limits:
{resource_type}: {system.chips_per_vm}
securityContext:
privileged: true
volumeMounts:
- mountPath: /tmp
name: shared-tmp
{storage_volume_mounts}
env:
- name: PROJECT_ID
value: {args.project}
- name: LOCATION
value: {args.zone}
- name: CLUSTER_NAME
value: {args.cluster}
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: CONTAINER_NAME
value: "pathways-worker"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
# Workaround for v6e
- name: MEGASCALE_GRPC_ENABLE_XOR_TRACER
value: "false"
- name: MEGASCALE_NUM_SLICES
valueFrom:
fieldRef:
fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']"
- name: JOBSET_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
- name: REPLICATED_JOB_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
- name: MEGASCALE_SLICE_ID
valueFrom:
fieldRef:
fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']"
- name: MEGASCALE_COORDINATOR_ADDRESS
value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)"
{pathways_sidecar_container}
nodeSelector:
{accelerator_label}
{machine_label}
{autoprovisioning_args}
priorityClassName: {args.priority}
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
volumes:
- hostPath:
path: /tmp
type: DirectoryOrCreate
name: shared-tmp
{storage_volumes}
- name: rm
replicas: 1
template:
@@ -365,6 +277,18 @@
- args:
{pathways_rm_args}
env:
- name: REPLICATED_JOB_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
- name: JOBSET_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
- name: HOST_ADDRESS
value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
- name: TPU_SKIP_MDS_QUERY
value: "true"
- name: PROJECT_ID
value: {args.project}
- name: LOCATION
@@ -381,19 +305,6 @@
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: REPLICATED_JOB_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
- name: JOBSET_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
- name: HOST_ADDRESS
value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
- name: TPU_SKIP_MDS_QUERY
value: "true"
image: {args.server_image}
imagePullPolicy: Always
name: pathways-rm
ports:
@@ -454,6 +365,96 @@
nodeSelector:
cloud.google.com/gke-nodepool: cpu-proxy-np
{user_workload}
- name: worker
replicas: {args.num_slices}
template:
metadata:
annotations:
alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
labels:
xpk.google.com/workload: {args.workload}
spec:
backoffLimit: {backoff_limit}
completions: {system.vms_per_slice}
parallelism: {system.vms_per_slice}
template:
metadata:
annotations:
{storage_annotations}
spec:
terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
serviceAccountName: {service_account}
containers:
- args:
{pathways_worker_args}
image: {args.server_image}
imagePullPolicy: Always
name: pathways-worker
ports:
- containerPort: 29001
- containerPort: 8471
- containerPort: 8080
resources:
limits:
{resource_type}: {system.chips_per_vm}
securityContext:
privileged: true
volumeMounts:
- mountPath: /tmp
name: shared-tmp
{storage_volume_mounts}
env:
- name: PROJECT_ID
value: {args.project}
- name: LOCATION
value: {args.zone}
- name: CLUSTER_NAME
value: {args.cluster}
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: CONTAINER_NAME
value: "pathways-worker"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
# Workaround for v6e
- name: MEGASCALE_GRPC_ENABLE_XOR_TRACER
value: "false"
- name: MEGASCALE_NUM_SLICES
valueFrom:
fieldRef:
fieldPath: "metadata.labels['jobset.sigs.k8s.io/replicatedjob-replicas']"
- name: JOBSET_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
- name: REPLICATED_JOB_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
- name: MEGASCALE_SLICE_ID
valueFrom:
fieldRef:
fieldPath: "metadata.labels['jobset.sigs.k8s.io/job-index']"
- name: MEGASCALE_COORDINATOR_ADDRESS
value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-$(MEGASCALE_SLICE_ID)-0.$(JOBSET_NAME)"
{pathways_sidecar_container}
nodeSelector:
{accelerator_label}
{machine_label}
{autoprovisioning_args}
priorityClassName: {args.priority}
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
volumes:
- hostPath:
path: /tmp
type: DirectoryOrCreate
name: shared-tmp
{storage_volumes}
"""


@@ -742,8 +743,7 @@ def workload_create(args) -> None:
' done! ******* '
)
xpk_print(
'Steps to connect to the proxy: kubectl get pods | grep proxy ;'
' kubectl port-forward <proxy-pod-name> 29000:29000; '
'Steps to connect to the proxy: kubectl get pods | grep {args.workload}-proxy-0 | awk "{print $1}" | xargs -I {} kubectl port-forward {} 29000:29000 &'
' JAX_PLATFORMS=proxy; JAX_BACKEND_TARGET=grpc://127.0.0.1:29000;'
" python -c 'import pathwaysutils; import jax; print(jax.devices())'"
)