Skip to content

Commit b75b5c6

Browse files
committed
Added the changes to the jobset for elastic training
1 parent d850d97 commit b75b5c6

File tree

1 file changed

+11
-4
lines changed

1 file changed

+11
-4
lines changed

axlearn/cloud/gcp/pathways_utils.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,8 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
320320
f"--resource_manager_address=localhost:{_PATHWAYS_RESOURCE_MANAGER_PORT}",
321321
f"--server_port={_PATHWAYS_PROXY_PORT}",
322322
f"--gcs_scratch_location={staging_location}",
323+
# TODO: Make the number of elastic slices configurable; it is currently tied to cfg.accelerator.num_replicas.
324+
f"--num_elastic_slices={cfg.accelerator.num_replicas}"
323325
]
324326
cmd_args.extend(xla_flags_from_options(self._xla_options).split())
325327

@@ -581,14 +583,19 @@ def _build_pathways_worker_job(
581583
annotations.update(
582584
{"alpha.jobset.sigs.k8s.io/exclusive-topology": "cloud.google.com/gke-nodepool"}
583585
)
586+
# Default value for suspend and resume.
587+
# References:
588+
# https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
589+
# Default formula: backoffLimit = system.vms_per_slice * 4
590+
591+
# NOTE: The backoffLimit below is deliberately inflated beyond the default, solely to verify elastic fast-resume.
592+
large_number = 1000
593+
backoffLimit = system.vms_per_slice * 4 * large_number
584594

585595
spec = dict(
586596
parallelism=system.vms_per_slice,
587597
completions=system.vms_per_slice,
588-
# Default value for suspend and resume.
589-
# References:
590-
# https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
591-
backoffLimit=system.vms_per_slice * 4,
598+
backoffLimit=backoffLimit,
592599
template=self._build_pathways_worker_pod(pathways_worker_replicated_job_index),
593600
)
594601
worker_job = dict(

0 commit comments

Comments
 (0)