Skip to content

Commit ca6a23f

Browse files
committed
Adding an extra slice to the Pathways cluster to swap in when there is a slice failure
1 parent 1e1fd3e commit ca6a23f

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

axlearn/cloud/gcp/pathways_utils.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -356,7 +356,7 @@ def _build_pathways_head_sidecar_containers(self) -> list[Nested[Any]]:
356 356   args=[
357 357   f"--server_port={_PATHWAYS_RESOURCE_MANAGER_PORT}",
358 358   "--node_type=resource_manager",
359     - f"--instance_count={pathways_instance_count}",
    359 + f"--instance_count={pathways_instance_count + 1}",
360 360   f"--instance_type={pathways_tpu_version}:{system.topology}",
361 361   f"--gcs_scratch_location={staging_location}",
362 362   "--alsologtostderr",
@@ -626,7 +626,7 @@ def __call__(self) -> Sequence[Nested[Any]]:
626 626   ),
627 627   dict(
628 628   name=_PATHWAYS_WORKER_REPLICATED_JOB_NAME,
629     - replicas=cfg.accelerator.num_replicas,
    629 + replicas=cfg.accelerator.num_replicas + 1,
630 630   template=self._build_pathways_worker_job(),
631 631   ),
632 632   ]

0 commit comments

Comments
 (0)