Skip to content

Commit 9e7e27f

Browse files
committed
Refactoring: Organize functions from core.py into classes
1 parent e2ccd20 commit 9e7e27f

29 files changed

+2260
-2293
lines changed

src/xpk/commands/batch.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717
from argparse import Namespace
1818

1919
from ..core.commands import run_command_for_value
20-
from ..core.gcloud_context import add_zone_and_project
20+
from ..core.gcloud_context import GCloudContextManager
2121
from ..core.kjob import AppProfileDefaults
2222
from ..core.kueue import LOCAL_QUEUE_NAME
2323
from ..utils.console import xpk_exit, xpk_print
@@ -34,7 +34,7 @@ def batch(args: Namespace) -> None:
3434
None
3535
"""
3636
if not args.kind_cluster:
37-
add_zone_and_project(args)
37+
GCloudContextManager.add_zone_and_project(args)
3838
set_cluster_command_code = set_cluster_command(args)
3939
else:
4040
set_cluster_command_code = set_local_cluster_command(args)

src/xpk/commands/cluster.py

+49-49
Original file line number | Diff line number | Diff line change
@@ -16,21 +16,12 @@
1616

1717
from tabulate import tabulate
1818

19-
from ..core.capacity import H100_DEVICE_TYPE
20-
from ..core.cluster import (
21-
get_all_clusters_programmatic,
22-
install_nccl_on_cluster,
23-
set_jobset_on_cluster,
24-
)
19+
from ..core.capacity import CapacityManager, DeviceType
20+
from ..core.cluster import ClusterManager
2521
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
2622
from ..core.commands import run_command_for_value, run_command_with_updates
2723
from ..core.config import VERTEX_TENSORBOARD_FEATURE_FLAG
28-
from ..core.gcloud_context import (
29-
add_zone_and_project,
30-
get_gke_control_plane_version,
31-
get_gke_server_config,
32-
zone_to_region,
33-
)
24+
from ..core.gcloud_context import GCloudContextManager, GKEVersionManager
3425
from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
3526
from ..core.kueue import (
3627
cluster_preheat_yml,
@@ -39,21 +30,17 @@
3930
wait_for_kueue_available,
4031
)
4132
from ..core.nap import enable_autoprovisioning_on_cluster
42-
from ..core.network import (
43-
create_cluster_network_config,
44-
delete_cluster_subnets,
45-
set_up_cluster_network_for_gpu,
46-
)
47-
from ..core.nodepool import get_gke_node_pool_version, run_gke_node_pool_create_command
33+
from ..core.network import ClusterNetworkManager
34+
from ..core.nodepool import NodePoolManager
4835
from ..core.ray import install_ray_cluster
49-
from ..core.resources import create_cluster_configmaps
36+
from ..core.resources import ResourceManager
5037
from ..core.system_characteristics import (
5138
AcceleratorType,
5239
AcceleratorTypeToAcceleratorCharacteristics,
5340
SystemCharacteristics,
5441
get_system_characteristics,
5542
)
56-
from ..core.vertex import create_vertex_tensorboard
43+
from ..core.vertex import VertexAI
5744
from ..core.workload import get_workload_list
5845
from ..utils.console import get_user_input, xpk_exit, xpk_print
5946
from ..utils.file import write_tmp_file
@@ -71,13 +58,26 @@ def cluster_create(args) -> None:
7158
0 if successful and 1 otherwise.
7259
"""
7360
system, return_code = get_system_characteristics(args)
61+
cluster_manager = ClusterManager(args, system)
62+
capacity_manager = CapacityManager(args)
63+
resource_manager = ResourceManager(args, capacity_manager, system)
64+
nodepools_manager = NodePoolManager(
65+
args, system, resource_manager, capacity_manager
66+
)
67+
network_manager = ClusterNetworkManager(args)
68+
vertex_ai = VertexAI(args, resource_manager)
69+
try:
70+
version_manager = GKEVersionManager(args)
71+
except RuntimeError as e:
72+
xpk_print(e)
73+
xpk_exit(1)
7474

7575
if return_code > 0:
7676
xpk_print('Fetching system characteristics failed!')
7777
xpk_exit(return_code)
7878

7979
xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
80-
add_zone_and_project(args)
80+
GCloudContextManager.add_zone_and_project(args)
8181

8282
if system.device_type in cluster_gcluster.supported_device_types:
8383
xpk_print(
@@ -87,12 +87,8 @@ def cluster_create(args) -> None:
8787
cluster_gcluster.cluster_create(args)
8888
xpk_exit(0)
8989

90-
return_code, gke_server_config = get_gke_server_config(args)
91-
if return_code != 0:
92-
xpk_exit(return_code)
93-
94-
return_code, gke_control_plane_version = get_gke_control_plane_version(
95-
args, gke_server_config
90+
return_code, gke_control_plane_version = (
91+
version_manager.get_gke_control_plane_version()
9692
)
9793
if return_code != 0:
9894
xpk_exit(return_code)
@@ -118,33 +114,34 @@ def cluster_create(args) -> None:
118114
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
119115
tensorboard_config = {}
120116
if VERTEX_TENSORBOARD_FEATURE_FLAG and args.create_vertex_tensorboard:
121-
tensorboard_config = create_vertex_tensorboard(args)
117+
tensorboard_config = vertex_ai.create_vertex_tensorboard()
122118
# exit if failed to create Tensorboard in Vertex AI
123119
if not tensorboard_config:
124120
xpk_exit(1)
125121

126122
if system.accelerator_type == AcceleratorType['GPU']:
127123
xpk_print('Setting up Network for cluster')
128-
set_up_cluster_network_code = set_up_cluster_network_for_gpu(args, system)
124+
set_up_cluster_network_code = network_manager.set_up_network_for_gpu(system)
129125
if set_up_cluster_network_code != 0:
130126
xpk_exit(set_up_cluster_network_code)
131127

132-
if system.device_type == H100_DEVICE_TYPE:
128+
if system.device_type == DeviceType.H100.value:
133129
xpk_print('Creating Network Config for cluster')
134-
create_cluster_network_config_code = create_cluster_network_config(args)
130+
create_cluster_network_config_code = network_manager.create_network_config()
135131
if create_cluster_network_config_code != 0:
136132
xpk_exit(create_cluster_network_config_code)
137133

138134
# Check the control plane version of the cluster and determine the node pool
139135
# version to use.
140-
return_code, gke_node_pool_version = get_gke_node_pool_version(
141-
args, gke_server_config
136+
137+
return_code, gke_node_pool_version = nodepools_manager.get_node_pool_version(
138+
version_manager
142139
)
143140
if return_code != 0:
144141
xpk_exit(return_code)
145142

146-
run_gke_node_pool_create_command_code = run_gke_node_pool_create_command(
147-
args, system, gke_node_pool_version
143+
run_gke_node_pool_create_command_code = nodepools_manager.create_node_pool(
144+
gke_node_pool_version
148145
)
149146
if run_gke_node_pool_create_command_code != 0:
150147
xpk_exit(run_gke_node_pool_create_command_code)
@@ -153,7 +150,7 @@ def cluster_create(args) -> None:
153150
'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
154151
' globally available'
155152
)
156-
set_jobset_on_cluster_code = set_jobset_on_cluster(args)
153+
set_jobset_on_cluster_code = cluster_manager.set_jobset_on_cluster()
157154
if set_jobset_on_cluster_code != 0:
158155
xpk_exit(set_jobset_on_cluster_code)
159156

@@ -201,13 +198,13 @@ def cluster_create(args) -> None:
201198

202199
if system.accelerator_type == AcceleratorType['GPU']:
203200
xpk_print('Installing NCCL Plugin for cluster')
204-
install_nccl_code = install_nccl_on_cluster(args, system)
201+
install_nccl_code = cluster_manager.install_nccl_on_cluster()
205202
if install_nccl_code != 0:
206203
xpk_exit(install_nccl_code)
207204

208205
xpk_print('Creating ConfigMap for cluster')
209-
create_cluster_configmaps_code = create_cluster_configmaps(
210-
args, system, tensorboard_config, autoprovisioning_config
206+
create_cluster_configmaps_code = resource_manager.create_cluster_configmaps(
207+
tensorboard_config, autoprovisioning_config
211208
)
212209
if create_cluster_configmaps_code != 0:
213210
xpk_exit(create_cluster_configmaps_code)
@@ -222,7 +219,7 @@ def cluster_create(args) -> None:
222219
xpk_print(
223220
'See your GKE Cluster here:'
224221
# pylint: disable=line-too-long
225-
f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
222+
f' https://console.cloud.google.com/kubernetes/clusters/details/{GCloudContextManager.zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
226223
)
227224
xpk_exit(0)
228225

@@ -237,7 +234,7 @@ def cluster_delete(args) -> None:
237234
0 if successful and 1 otherwise.
238235
"""
239236
xpk_print(f'Starting cluster delete for cluster: {args.cluster}', flush=True)
240-
add_zone_and_project(args)
237+
GCloudContextManager.add_zone_and_project(args)
241238

242239
if cluster_gcluster.created_by_gcluster(args):
243240
xpk_print(f'Deleting {args.cluster} cluster using Cluster Toolkit...')
@@ -263,7 +260,7 @@ def cluster_cacheimage(args) -> None:
263260
xpk_print(
264261
f'Starting cluster cacheimage for cluster: {args.cluster}', flush=True
265262
)
266-
add_zone_and_project(args)
263+
GCloudContextManager.add_zone_and_project(args)
267264

268265
set_cluster_command_code = set_cluster_command(args)
269266
if set_cluster_command_code != 0:
@@ -314,7 +311,7 @@ def cluster_describe(args) -> None:
314311
0 if successful and 1 otherwise.
315312
"""
316313
xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
317-
add_zone_and_project(args)
314+
GCloudContextManager.add_zone_and_project(args)
318315

319316
set_cluster_command_code = set_cluster_command(args)
320317
if set_cluster_command_code != 0:
@@ -547,7 +544,7 @@ def cluster_list(args) -> None:
547544
Returns:
548545
0 if successful and 1 otherwise.
549546
"""
550-
add_zone_and_project(args)
547+
GCloudContextManager.add_zone_and_project(args)
551548
xpk_print(f'For project {args.project} and zone {args.zone}:', flush=True)
552549
if run_gke_clusters_list_command(args):
553550
xpk_exit(1)
@@ -595,7 +592,8 @@ def create_cluster_if_necessary(
595592
Returns:
596593
0 if successful and 1 otherwise.
597594
"""
598-
all_clusters, return_code = get_all_clusters_programmatic(args)
595+
cluster_manager = ClusterManager(args, system)
596+
all_clusters, return_code = cluster_manager.get_all_clusters()
599597
if return_code > 0:
600598
xpk_print('Listing all clusters failed!')
601599
return 1
@@ -617,6 +615,8 @@ def run_gke_cluster_delete_command(args) -> int:
617615
Returns:
618616
0 if successful and 1 otherwise.
619617
"""
618+
network_manager = ClusterNetworkManager(args)
619+
620620
if not args.force:
621621
xpk_print('Get the name of the workloads in the cluster.')
622622
args.filter_by_status = 'EVERYTHING'
@@ -639,15 +639,15 @@ def run_gke_cluster_delete_command(args) -> int:
639639
command = (
640640
'gcloud beta container clusters delete'
641641
f' {args.cluster} --project={args.project}'
642-
f' --region={zone_to_region(args.zone)} --quiet'
642+
f' --region={GCloudContextManager.zone_to_region(args.zone)} --quiet'
643643
)
644644

645645
return_code = run_command_with_updates(command, 'Cluster Delete', args)
646646
if return_code != 0:
647647
xpk_print(f'Cluster delete request returned ERROR {return_code}')
648648
return 1
649649

650-
return_code = delete_cluster_subnets(args)
650+
return_code = network_manager.delete_subnets()
651651
if return_code != 0:
652652
return return_code
653653

@@ -665,7 +665,7 @@ def run_gke_clusters_list_command(args) -> int:
665665
"""
666666
command = (
667667
'gcloud container clusters list'
668-
f' --project={args.project} --region={zone_to_region(args.zone)}'
668+
f' --project={args.project} --region={GCloudContextManager.zone_to_region(args.zone)}'
669669
)
670670
return_code = run_command_with_updates(command, 'Cluster List', args)
671671
if return_code != 0:
@@ -712,7 +712,7 @@ def run_gke_cluster_create_command(
712712
command = (
713713
'gcloud beta container clusters create'
714714
f' {args.cluster} --project={args.project}'
715-
f' --region={zone_to_region(args.zone)}'
715+
f' --region={GCloudContextManager.zone_to_region(args.zone)}'
716716
f' --node-locations={args.zone}'
717717
f' --cluster-version={gke_control_plane_version}'
718718
f' --machine-type={machine_type}'

src/xpk/commands/cluster_gcluster.py

+9-8
Original file line number | Diff line number | Diff line change
@@ -23,9 +23,9 @@
2323
a3ultra_device_type,
2424
supported_device_types,
2525
)
26-
from ..core.capacity import get_capacity_type
26+
from ..core.capacity import CapacityManager
2727
from ..core.docker_manager import DockerManager
28-
from ..core.gcloud_context import zone_to_region
28+
from ..core.gcloud_context import GCloudContextManager
2929
from ..core.gcluster_manager import GclusterManager
3030
from ..utils.console import xpk_exit, xpk_print
3131
from ..utils.file import ensure_directory_exists
@@ -50,7 +50,7 @@ def cluster_create(args) -> None:
5050
check_gcloud_authenticated()
5151
prepare_directories()
5252
gcm = prepare_gcluster_manager()
53-
region = zone_to_region(args.zone)
53+
region = GCloudContextManager.zone_to_region(args.zone)
5454

5555
# unique_name uses shortened hash string, so still name collision is possible
5656
unique_name = get_unique_name(args.project, region, args.cluster)
@@ -90,7 +90,7 @@ def cluster_delete(args) -> None:
9090
check_gcloud_authenticated()
9191
prepare_directories()
9292
gcm = prepare_gcluster_manager()
93-
region = zone_to_region(args.zone)
93+
region = GCloudContextManager.zone_to_region(args.zone)
9494

9595
# unique_name uses shortened hash string, so still name collision is possible
9696
unique_name = get_unique_name(args.project, region, args.cluster)
@@ -104,7 +104,7 @@ def cluster_delete(args) -> None:
104104

105105
def created_by_gcluster(args) -> bool:
106106
prepare_directories()
107-
region = zone_to_region(args.zone)
107+
region = GCloudContextManager.zone_to_region(args.zone)
108108
unique_name = get_unique_name(args.project, region, args.cluster)
109109
prefix = get_prefix_path(args.project, region)
110110
bpg = prepare_blueprint_generator()
@@ -155,7 +155,8 @@ def prepare_blueprint_generator() -> BlueprintGenerator:
155155
def generate_blueprint(
156156
blueprint_name, args, prefix=None
157157
) -> BlueprintGeneratorOutput:
158-
capacity_type, return_code = get_capacity_type(args)
158+
capacity_manager = CapacityManager(args)
159+
capacity_type, return_code = capacity_manager.get_capacity_type()
159160
if return_code != 0:
160161
xpk_print('Capacity type is invalid.')
161162
xpk_exit(return_code)
@@ -169,7 +170,7 @@ def generate_blueprint(
169170
blueprint_name=blueprint_name,
170171
prefix=prefix,
171172
cluster_name=args.cluster,
172-
region=zone_to_region(args.zone),
173+
region=GCloudContextManager.zone_to_region(args.zone),
173174
project_id=args.project,
174175
zone=args.zone,
175176
auth_cidr=all_IPs_cidr,
@@ -185,7 +186,7 @@ def generate_blueprint(
185186
blueprint_name=blueprint_name,
186187
prefix=prefix,
187188
cluster_name=args.cluster,
188-
region=zone_to_region(args.zone),
189+
region=GCloudContextManager.zone_to_region(args.zone),
189190
project_id=args.project,
190191
zone=args.zone,
191192
auth_cidr=all_IPs_cidr,

src/xpk/commands/common.py

+2-3
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
"""
1616

1717
from ..core.commands import run_command_with_updates_retry
18-
from ..core.gcloud_context import zone_to_region
18+
from ..core.gcloud_context import GCloudContextManager
1919
from ..utils.console import xpk_print
2020

2121

@@ -30,8 +30,7 @@ def set_cluster_command(args) -> int:
3030
"""
3131
command = (
3232
'gcloud container clusters get-credentials'
33-
f' {args.cluster} --region={zone_to_region(args.zone)}'
34-
f' --project={args.project} &&'
33+
f' {args.cluster} --region={GCloudContextManager.zone_to_region(args.zone)} --project={args.project} &&'
3534
' kubectl config view && kubectl config set-context --current'
3635
' --namespace=default'
3736
)

src/xpk/commands/info.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020
from tabulate import tabulate
2121

2222
from ..core.commands import run_command_for_value
23-
from ..core.gcloud_context import add_zone_and_project
23+
from ..core.gcloud_context import GCloudContextManager
2424
from ..core.kueue import verify_kueuectl
2525
from ..utils.console import xpk_exit, xpk_print
2626
from .common import set_cluster_command
@@ -36,7 +36,7 @@ def info(args: Namespace) -> None:
3636
Returns:
3737
None
3838
"""
39-
add_zone_and_project(args)
39+
GCloudContextManager.add_zone_and_project(args)
4040
set_cluster_command_code = set_cluster_command(args)
4141
if set_cluster_command_code != 0:
4242
xpk_exit(set_cluster_command_code)

0 commit comments

Comments
 (0)