@@ -320,6 +320,8 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
320
320
f"--resource_manager_address=localhost:{ _PATHWAYS_RESOURCE_MANAGER_PORT } " ,
321
321
f"--server_port={ _PATHWAYS_PROXY_PORT } " ,
322
322
f"--gcs_scratch_location={ staging_location } " ,
323
+ # TODO: make the number of elastic slices configurable instead of always using cfg.accelerator.num_replicas.
324
+ f"--num_elastic_slices={ cfg .accelerator .num_replicas } "
323
325
]
324
326
cmd_args .extend (xla_flags_from_options (self ._xla_options ).split ())
325
327
@@ -581,14 +583,19 @@ def _build_pathways_worker_job(
581
583
annotations .update (
582
584
{"alpha.jobset.sigs.k8s.io/exclusive-topology" : "cloud.google.com/gke-nodepool" }
583
585
)
586
+ # Default value for suspend and resume.
587
+ # References:
588
+ # https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
589
+ # backoffLimit = system.vms_per_slice * 4
590
+
591
+ # NOTE: temporary override. The default backoffLimit is multiplied by large_number only to verify elastic fast-resume.
592
+ large_number = 1000
593
+ backoffLimit = system .vms_per_slice * 4 * large_number
584
594
585
595
spec = dict (
586
596
parallelism = system .vms_per_slice ,
587
597
completions = system .vms_per_slice ,
588
- # Default value for suspend and resume.
589
- # References:
590
- # https://github.com/google/pathways-job/blob/4417de7aa23d3c2316e400a3a327512834374475/internal/controller/pathwaysjob_controller.go#L651
591
- backoffLimit = system .vms_per_slice * 4 ,
598
+ backoffLimit = backoffLimit ,
592
599
template = self ._build_pathways_worker_pod (pathways_worker_replicated_job_index ),
593
600
)
594
601
worker_job = dict (
0 commit comments