Added the changes to the jobset for elastic training

lukebaumann · lukebaumann · commit b75b5c6fbfaf · 2025-06-17T22:38:13.000Z
diff --git a/axlearn/cloud/gcp/pathways_utils.py b/axlearn/cloud/gcp/pathways_utils.py
@@ -320,6 +320,8 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
             f"--resource_manager_address=localhost:{_PATHWAYS_RESOURCE_MANAGER_PORT}",
             f"--server_port={_PATHWAYS_PROXY_PORT}",
             f"--gcs_scratch_location={staging_location}",
+            # This should be made configurable
+            f"--num_elastic_slices={cfg.accelerator.num_replicas}"
         ]
         cmd_args.extend(xla_flags_from_options(self._xla_options).split())
 
@@ -581,14 +583,19 @@ def _build_pathways_worker_job(
         annotations.update(
             {"alpha.jobset.sigs.k8s.io/exclusive-topology": "cloud.google.com/gke-nodepool"}
         )
+        # Default value for suspend and resume.
+        # References:
+        # https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
+        # backoffLimit = system.vms_per_slice * 4
+
+        # This backoffLimit is just for verifying elastic fast-resume
+        large_number = 1000
+        backoffLimit = system.vms_per_slice * 4 * large_number
 
         spec = dict(
             parallelism=system.vms_per_slice,
             completions=system.vms_per_slice,
-            # Default value for suspend and resume.
-            # References:
-            # https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
-            backoffLimit=system.vms_per_slice * 4,
+            backoffLimit=backoffLimit,
             template=self._build_pathways_worker_pod(pathways_worker_replicated_job_index),
         )
         worker_job = dict(