Skip to content

Commit 11728f8

Browse files
committed
Merge branch 'develop'
2 parents ff1e09b + 8d8ff1d commit 11728f8

File tree

8 files changed

+307
-27
lines changed

8 files changed

+307
-27
lines changed

src/xpk/commands/cluster.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from tabulate import tabulate
1818

1919
from ..utils.feature_flags import FeatureFlags
20-
from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
20+
from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE, get_reservation_deployment_type
2121
from ..core.cluster import (
2222
get_all_clusters_programmatic,
2323
get_cluster_credentials,
@@ -204,6 +204,38 @@ def cluster_adapt(args) -> None:
204204
def _validate_cluster_create_args(args, system: SystemCharacteristics):
205205
if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing:
206206
validate_sub_slicing_system(system)
207+
_validate_sub_slicing_reservation(args)
208+
209+
210+
def _validate_sub_slicing_reservation(args):
211+
if args.reservation is None:
212+
xpk_print(
213+
'Error: Validation failed: Sub-slicing cluster creation requires'
214+
' Cluster Director reservation to be specified.'
215+
)
216+
xpk_exit(1)
217+
218+
deployment_type = get_reservation_deployment_type(
219+
reservation=args.reservation, project=args.project, zone=args.zone
220+
)
221+
if deployment_type != 'DENSE':
222+
xpk_print(
223+
'Error: Validation failed: The specified reservation'
224+
f' "{args.reservation}" is not a Cluster Director reservation.'
225+
)
226+
xpk_print(
227+
'Please provide a reservation created for Cluster Director to proceed.'
228+
)
229+
xpk_print('To list valid Cluster Director reservations, run:')
230+
xpk_print(
231+
' gcloud compute reservations list --filter="deploymentType=DENSE"'
232+
)
233+
xpk_print(
234+
'Refer to the documentation for more information on creating Cluster'
235+
' Director reservations:'
236+
' https://cloud.google.com/cluster-director/docs/reserve-capacity'
237+
)
238+
xpk_exit(1)
207239

208240

209241
def cluster_create(args) -> None:

src/xpk/commands/cluster_test.py

Lines changed: 67 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@
2727
@dataclass
2828
class _Mocks:
2929
common_print_mock: MagicMock
30-
common_exit_mock: MagicMock
30+
commands_print_mock: MagicMock
31+
commands_get_reservation_deployment_type: MagicMock
3132

3233

3334
@pytest.fixture
@@ -36,12 +37,17 @@ def mock_common_print_and_exit(mocker):
3637
'xpk.commands.common.xpk_print',
3738
return_value=None,
3839
)
39-
common_exit_mock = mocker.patch(
40-
'xpk.commands.common.xpk_exit',
41-
return_value=None,
40+
commands_print_mock = mocker.patch(
41+
'xpk.commands.cluster.xpk_print', return_value=None
42+
)
43+
commands_get_reservation_deployment_type = mocker.patch(
44+
'xpk.commands.cluster.get_reservation_deployment_type',
45+
return_value='DENSE',
4246
)
4347
return _Mocks(
44-
common_print_mock=common_print_mock, common_exit_mock=common_exit_mock
48+
common_print_mock=common_print_mock,
49+
commands_get_reservation_deployment_type=commands_get_reservation_deployment_type,
50+
commands_print_mock=commands_print_mock,
4551
)
4652

4753

@@ -61,32 +67,82 @@ def test_validate_cluster_create_args_for_correct_args_pass(
6167
_validate_cluster_create_args(args, DEFAULT_TEST_SYSTEM)
6268

6369
assert mock_common_print_and_exit.common_print_mock.call_count == 0
64-
assert mock_common_print_and_exit.common_exit_mock.call_count == 0
6570

6671

6772
def test_validate_cluster_create_args_for_correct_sub_slicing_args_pass(
6873
mock_common_print_and_exit: _Mocks,
6974
):
7075
FeatureFlags.SUB_SLICING_ENABLED = True
71-
args = Namespace(sub_slicing=True)
76+
args = Namespace(
77+
sub_slicing=True,
78+
reservation='test-reservation',
79+
project='project',
80+
zone='zone',
81+
)
7282

7383
_validate_cluster_create_args(args, SUB_SLICING_SYSTEM)
7484

7585
assert mock_common_print_and_exit.common_print_mock.call_count == 0
76-
assert mock_common_print_and_exit.common_exit_mock.call_count == 0
7786

7887

7988
def test_validate_cluster_create_args_for_not_supported_system_throws(
8089
mock_common_print_and_exit: _Mocks,
8190
):
8291
FeatureFlags.SUB_SLICING_ENABLED = True
83-
args = Namespace(sub_slicing=True)
92+
args = Namespace(
93+
sub_slicing=True,
94+
reservation='test-reservation',
95+
project='project',
96+
zone='zone',
97+
)
8498

85-
_validate_cluster_create_args(args, DEFAULT_TEST_SYSTEM)
99+
with pytest.raises(SystemExit):
100+
_validate_cluster_create_args(args, DEFAULT_TEST_SYSTEM)
86101

87102
assert mock_common_print_and_exit.common_print_mock.call_count == 1
88103
assert (
89104
mock_common_print_and_exit.common_print_mock.call_args[0][0]
90105
== 'Error: l4-1 does not support Sub-slicing.'
91106
)
92-
assert mock_common_print_and_exit.common_exit_mock.call_count == 1
107+
108+
109+
def test_validate_cluster_create_args_for_missing_reservation(
110+
mock_common_print_and_exit: _Mocks,
111+
):
112+
FeatureFlags.SUB_SLICING_ENABLED = True
113+
args = Namespace(
114+
sub_slicing=True, project='project', zone='zone', reservation=None
115+
)
116+
117+
with pytest.raises(SystemExit):
118+
_validate_cluster_create_args(args, SUB_SLICING_SYSTEM)
119+
120+
assert mock_common_print_and_exit.commands_print_mock.call_count == 1
121+
assert (
122+
'Validation failed: Sub-slicing cluster creation requires'
123+
in mock_common_print_and_exit.commands_print_mock.call_args[0][0]
124+
)
125+
126+
127+
def test_validate_cluster_create_args_for_invalid_reservation(
128+
mock_common_print_and_exit: _Mocks,
129+
):
130+
FeatureFlags.SUB_SLICING_ENABLED = True
131+
args = Namespace(
132+
sub_slicing=True,
133+
project='project',
134+
zone='zone',
135+
reservation='test-reservation',
136+
)
137+
mock_common_print_and_exit.commands_get_reservation_deployment_type.return_value = (
138+
'SPARSE'
139+
)
140+
141+
with pytest.raises(SystemExit):
142+
_validate_cluster_create_args(args, SUB_SLICING_SYSTEM)
143+
144+
assert mock_common_print_and_exit.commands_print_mock.call_count == 5
145+
assert (
146+
'Refer to the documentation for more information on creating Cluster'
147+
in mock_common_print_and_exit.commands_print_mock.call_args[0][0]
148+
)

src/xpk/commands/workload.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
get_cluster_credentials,
2727
setup_k8s_env,
2828
)
29-
from ..core.commands import run_command_with_updates, run_commands
29+
from ..core.commands import run_command_with_updates, run_commands, run_command_for_value
30+
from ..core.kueue_manager import KueueManager, SUB_SLICE_TOPOLOGY_NAME
3031
from ..core.config import (VERTEX_TENSORBOARD_FEATURE_FLAG, XPK_CURRENT_VERSION)
3132
from ..core.docker_container import (
3233
get_main_container_docker_image,
@@ -95,6 +96,7 @@
9596
tcpxo_decorator,
9697
)
9798
from ..utils.console import get_user_input, xpk_exit, xpk_print
99+
from packaging.version import Version
98100
from ..utils.file import write_tmp_file
99101
from ..utils.execution_context import is_dry_run
100102
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
@@ -283,6 +285,7 @@
283285
"""
284286

285287
SUB_SLICING_TOPOLOGIES = ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16']
288+
SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
286289

287290

288291
def workload_create_pathways(args) -> None:
@@ -340,6 +343,7 @@ def workload_create(args) -> None:
340343
xpk_exit(return_code)
341344

342345
if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing_topology is not None:
346+
_validate_sub_slicing_availability()
343347
_validate_sub_slicing_topology(system, args.sub_slicing_topology)
344348

345349
if not check_if_workload_can_schedule(args, system):
@@ -678,6 +682,43 @@ def workload_create(args) -> None:
678682
xpk_exit(0)
679683

680684

685+
def _validate_sub_slicing_availability():
686+
return_code, value = run_command_for_value(
687+
command='kubectl get topology', task='Get defined topologies'
688+
)
689+
690+
if return_code != 0:
691+
xpk_print(
692+
'Error: Unable to validate sub-slicing support on a given cluster.'
693+
)
694+
xpk_exit(1)
695+
696+
if SUB_SLICE_TOPOLOGY_NAME not in value:
697+
xpk_print(
698+
'Error: Cluster has not been not set up for Sub-slicing. Please enable'
699+
' --sub-slicing in "cluster create" command first.'
700+
)
701+
xpk_exit(1)
702+
703+
kueue_manager = KueueManager()
704+
return_code, current_version = kueue_manager.get_installed_kueue_version()
705+
if return_code != 0:
706+
xpk_print(
707+
'Error: Unable to validate sub-slicing support on a given cluster.'
708+
)
709+
xpk_exit(1)
710+
711+
if current_version < SUB_SLICING_MINIMUM_KUEUE_VERSION:
712+
xpk_print(
713+
"Error: Current Kueue version ({current_version}) doesn't support"
714+
' Sub-slicing. The minimal required version is'
715+
' v{SUB_SLICING_MINIMUM_KUEUE_VERSION}. Please either update Kueue'
716+
' manually, or run "cluster create --sub-slicing" on the existing'
717+
' cluster.'
718+
)
719+
xpk_exit(1)
720+
721+
681722
def _validate_sub_slicing_topology(
682723
system_characteristics: SystemCharacteristics, sub_slicing_topology: str
683724
) -> None:

src/xpk/commands/workload_test.py

Lines changed: 86 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
from unittest.mock import MagicMock, patch
1919
import pytest
2020
from ..core.system_characteristics import SystemCharacteristics
21-
from .workload import _validate_sub_slicing_topology
21+
from .workload import _validate_sub_slicing_topology, _validate_sub_slicing_availability
22+
from packaging.version import Version
2223

2324

2425
SYSTEM_CHARACTERISTICS = SystemCharacteristics(
@@ -40,7 +41,7 @@ def xpk_print(mocker):
4041

4142

4243
def test_validate_sub_slicing_topology_exits_for_unsupported_topology(
43-
xpk_print,
44+
xpk_print: MagicMock,
4445
):
4546
with pytest.raises(SystemExit):
4647
_validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '2x1')
@@ -50,7 +51,9 @@ def test_validate_sub_slicing_topology_exits_for_unsupported_topology(
5051
)
5152

5253

53-
def test_validate_sub_slicing_topology_exits_for_too_large_topology(xpk_print):
54+
def test_validate_sub_slicing_topology_exits_for_too_large_topology(
55+
xpk_print: MagicMock,
56+
):
5457
with pytest.raises(SystemExit):
5558
_validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '16x16')
5659

@@ -64,6 +67,86 @@ def test_validate_sub_slicing_topology_does_nothing_for_supported_topology():
6467
_validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '4x4')
6568

6669

70+
def test_validate_sub_slicing_availability_exits_when_getting_topologies_fails(
71+
xpk_print: MagicMock, mocker
72+
):
73+
mocker.patch(
74+
'xpk.commands.workload.run_command_for_value',
75+
return_value=(1, ''),
76+
)
77+
with pytest.raises(SystemExit):
78+
_validate_sub_slicing_availability()
79+
80+
assert (
81+
'Unable to validate sub-slicing support'
82+
in xpk_print.mock_calls[0].args[0]
83+
)
84+
85+
86+
def test_validate_sub_slicing_availability_exits_when_subslicing_topology_is_not_defined(
87+
xpk_print: MagicMock, mocker
88+
):
89+
mocker.patch(
90+
'xpk.commands.workload.run_command_for_value',
91+
return_value=(0, ''),
92+
)
93+
with pytest.raises(SystemExit):
94+
_validate_sub_slicing_availability()
95+
96+
assert (
97+
'Cluster has not been not set up for Sub-slicing.'
98+
in xpk_print.mock_calls[0].args[0]
99+
)
100+
101+
102+
def test_validate_sub_slicing_availability_exits_when_kueue_version_cannot_be_determined(
103+
xpk_print: MagicMock, mocker
104+
):
105+
mocker.patch(
106+
'xpk.commands.workload.run_command_for_value',
107+
return_value=(0, 'sub-slice-topology'),
108+
)
109+
mocker.patch(
110+
'xpk.commands.workload.KueueManager.get_installed_kueue_version',
111+
return_value=(1, None),
112+
)
113+
with pytest.raises(SystemExit):
114+
_validate_sub_slicing_availability()
115+
116+
assert 'Unable to validate sub-slicing' in xpk_print.mock_calls[0].args[0]
117+
118+
119+
def test_validate_sub_slicing_availability_exits_when_kueue_version_does_not_meet_minimum_requirements(
120+
xpk_print: MagicMock, mocker
121+
):
122+
mocker.patch(
123+
'xpk.commands.workload.run_command_for_value',
124+
return_value=(0, 'sub-slice-topology'),
125+
)
126+
mocker.patch(
127+
'xpk.commands.workload.KueueManager.get_installed_kueue_version',
128+
return_value=(0, Version('0.0.0')),
129+
)
130+
with pytest.raises(SystemExit):
131+
_validate_sub_slicing_availability()
132+
133+
assert 'The minimal required version is' in xpk_print.mock_calls[0].args[0]
134+
135+
136+
def test_validate_sub_slicing_availability_does_nothing_when_cluster_is_correctly_configured_for_subslicing(
137+
mocker,
138+
):
139+
mocker.patch(
140+
'xpk.commands.workload.run_command_for_value',
141+
return_value=(0, 'sub-slice-topology'),
142+
)
143+
mocker.patch(
144+
'xpk.commands.workload.KueueManager.get_installed_kueue_version',
145+
return_value=(0, Version('0.13.0')),
146+
)
147+
_validate_sub_slicing_availability()
148+
149+
67150
@patch('xpk.commands.common.xpk_print')
68151
def test_validate_sub_slicing_topology_fails_for_unsupported_system(
69152
common_xpk_print: MagicMock,

src/xpk/core/capacity.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,23 @@ def get_reservation_placement_policy(
152152
return output.strip()
153153

154154

155+
def get_reservation_deployment_type(
156+
reservation: str, zone: str, project: str
157+
) -> str:
158+
"""Get reservation deployment type."""
159+
command = (
160+
f'gcloud beta compute reservations describe {reservation}'
161+
f' --project={project} --zone={zone} --format="value(deploymentType)"'
162+
)
163+
return_code, output = run_command_for_value(
164+
command, 'Get reservation deployment type', dry_run_return_val='DENSE'
165+
)
166+
if return_code != 0:
167+
xpk_print(f'Get reservation deployment type ERROR {return_code}')
168+
xpk_exit(1)
169+
return output.strip()
170+
171+
155172
def verify_reservation_exists(args) -> int:
156173
"""Verify the reservation exists.
157174

0 commit comments

Comments
 (0)