16
16
17
17
from tabulate import tabulate
18
18
19
- from ..core .capacity import H100_DEVICE_TYPE
20
- from ..core .cluster import (
21
- get_all_clusters_programmatic ,
22
- install_nccl_on_cluster ,
23
- set_jobset_on_cluster ,
24
- )
19
+ from ..core .capacity import CapacityManager , DeviceType
20
+ from ..core .cluster import ClusterManager
25
21
from ..core .cluster_private import authorize_private_cluster_access_if_necessary
26
22
from ..core .commands import run_command_for_value , run_command_with_updates
27
23
from ..core .config import VERTEX_TENSORBOARD_FEATURE_FLAG
28
- from ..core .gcloud_context import (
29
- add_zone_and_project ,
30
- get_gke_control_plane_version ,
31
- get_gke_server_config ,
32
- zone_to_region ,
33
- )
24
+ from ..core .gcloud_context import GCloudContextManager , GKEVersionManager
34
25
from ..core .kjob import apply_kjob_crds , prepare_kjob , verify_kjob_installed
35
26
from ..core .kueue import (
36
27
cluster_preheat_yml ,
39
30
wait_for_kueue_available ,
40
31
)
41
32
from ..core .nap import enable_autoprovisioning_on_cluster
42
- from ..core .network import (
43
- create_cluster_network_config ,
44
- delete_cluster_subnets ,
45
- set_up_cluster_network_for_gpu ,
46
- )
47
- from ..core .nodepool import get_gke_node_pool_version , run_gke_node_pool_create_command
33
+ from ..core .network import ClusterNetworkManager
34
+ from ..core .nodepool import NodePoolManager
48
35
from ..core .ray import install_ray_cluster
49
- from ..core .resources import create_cluster_configmaps
36
+ from ..core .resources import ResourceManager
50
37
from ..core .system_characteristics import (
51
38
AcceleratorType ,
52
39
AcceleratorTypeToAcceleratorCharacteristics ,
53
40
SystemCharacteristics ,
54
41
get_system_characteristics ,
55
42
)
56
- from ..core .vertex import create_vertex_tensorboard
43
+ from ..core .vertex import VertexAI
57
44
from ..core .workload import get_workload_list
58
45
from ..utils .console import get_user_input , xpk_exit , xpk_print
59
46
from ..utils .file import write_tmp_file
@@ -71,13 +58,26 @@ def cluster_create(args) -> None:
71
58
0 if successful and 1 otherwise.
72
59
"""
73
60
system , return_code = get_system_characteristics (args )
61
+ cluster_manager = ClusterManager (args , system )
62
+ capacity_manager = CapacityManager (args )
63
+ resource_manager = ResourceManager (args , capacity_manager , system )
64
+ nodepools_manager = NodePoolManager (
65
+ args , system , resource_manager , capacity_manager
66
+ )
67
+ network_manager = ClusterNetworkManager (args )
68
+ vertex_ai = VertexAI (args , resource_manager )
69
+ try :
70
+ version_manager = GKEVersionManager (args )
71
+ except RuntimeError as e :
72
+ xpk_print (e )
73
+ xpk_exit (1 )
74
74
75
75
if return_code > 0 :
76
76
xpk_print ('Fetching system characteristics failed!' )
77
77
xpk_exit (return_code )
78
78
79
79
xpk_print (f'Starting cluster create for cluster { args .cluster } :' , flush = True )
80
- add_zone_and_project (args )
80
+ GCloudContextManager . add_zone_and_project (args )
81
81
82
82
if system .device_type in cluster_gcluster .supported_device_types :
83
83
xpk_print (
@@ -87,12 +87,8 @@ def cluster_create(args) -> None:
87
87
cluster_gcluster .cluster_create (args )
88
88
xpk_exit (0 )
89
89
90
- return_code , gke_server_config = get_gke_server_config (args )
91
- if return_code != 0 :
92
- xpk_exit (return_code )
93
-
94
- return_code , gke_control_plane_version = get_gke_control_plane_version (
95
- args , gke_server_config
90
+ return_code , gke_control_plane_version = (
91
+ version_manager .get_gke_control_plane_version ()
96
92
)
97
93
if return_code != 0 :
98
94
xpk_exit (return_code )
@@ -118,33 +114,34 @@ def cluster_create(args) -> None:
118
114
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
119
115
tensorboard_config = {}
120
116
if VERTEX_TENSORBOARD_FEATURE_FLAG and args .create_vertex_tensorboard :
121
- tensorboard_config = create_vertex_tensorboard (args )
117
+ tensorboard_config = vertex_ai . create_vertex_tensorboard ()
122
118
# exit if failed to create Tensorboard in Vertex AI
123
119
if not tensorboard_config :
124
120
xpk_exit (1 )
125
121
126
122
if system .accelerator_type == AcceleratorType ['GPU' ]:
127
123
xpk_print ('Setting up Network for cluster' )
128
- set_up_cluster_network_code = set_up_cluster_network_for_gpu ( args , system )
124
+ set_up_cluster_network_code = network_manager . set_up_network_for_gpu ( system )
129
125
if set_up_cluster_network_code != 0 :
130
126
xpk_exit (set_up_cluster_network_code )
131
127
132
- if system .device_type == H100_DEVICE_TYPE :
128
+ if system .device_type == DeviceType . H100 . value :
133
129
xpk_print ('Creating Network Config for cluster' )
134
- create_cluster_network_config_code = create_cluster_network_config ( args )
130
+ create_cluster_network_config_code = network_manager . create_network_config ( )
135
131
if create_cluster_network_config_code != 0 :
136
132
xpk_exit (create_cluster_network_config_code )
137
133
138
134
# Check the control plane version of the cluster and determine the node pool
139
135
# version to use.
140
- return_code , gke_node_pool_version = get_gke_node_pool_version (
141
- args , gke_server_config
136
+
137
+ return_code , gke_node_pool_version = nodepools_manager .get_node_pool_version (
138
+ version_manager
142
139
)
143
140
if return_code != 0 :
144
141
xpk_exit (return_code )
145
142
146
- run_gke_node_pool_create_command_code = run_gke_node_pool_create_command (
147
- args , system , gke_node_pool_version
143
+ run_gke_node_pool_create_command_code = nodepools_manager . create_node_pool (
144
+ gke_node_pool_version
148
145
)
149
146
if run_gke_node_pool_create_command_code != 0 :
150
147
xpk_exit (run_gke_node_pool_create_command_code )
@@ -153,7 +150,7 @@ def cluster_create(args) -> None:
153
150
'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
154
151
' globally available'
155
152
)
156
- set_jobset_on_cluster_code = set_jobset_on_cluster (args )
153
+ set_jobset_on_cluster_code = cluster_manager . set_jobset_on_cluster ()
157
154
if set_jobset_on_cluster_code != 0 :
158
155
xpk_exit (set_jobset_on_cluster_code )
159
156
@@ -201,13 +198,13 @@ def cluster_create(args) -> None:
201
198
202
199
if system .accelerator_type == AcceleratorType ['GPU' ]:
203
200
xpk_print ('Installing NCCL Plugin for cluster' )
204
- install_nccl_code = install_nccl_on_cluster (args , system )
201
+ install_nccl_code = cluster_manager . install_nccl_on_cluster ()
205
202
if install_nccl_code != 0 :
206
203
xpk_exit (install_nccl_code )
207
204
208
205
xpk_print ('Creating ConfigMap for cluster' )
209
- create_cluster_configmaps_code = create_cluster_configmaps (
210
- args , system , tensorboard_config , autoprovisioning_config
206
+ create_cluster_configmaps_code = resource_manager . create_cluster_configmaps (
207
+ tensorboard_config , autoprovisioning_config
211
208
)
212
209
if create_cluster_configmaps_code != 0 :
213
210
xpk_exit (create_cluster_configmaps_code )
@@ -222,7 +219,7 @@ def cluster_create(args) -> None:
222
219
xpk_print (
223
220
'See your GKE Cluster here:'
224
221
# pylint: disable=line-too-long
225
- f' https://console.cloud.google.com/kubernetes/clusters/details/{ zone_to_region (args .zone )} /{ args .cluster } /details?project={ args .project } '
222
+ f' https://console.cloud.google.com/kubernetes/clusters/details/{ GCloudContextManager . zone_to_region (args .zone )} /{ args .cluster } /details?project={ args .project } '
226
223
)
227
224
xpk_exit (0 )
228
225
@@ -237,7 +234,7 @@ def cluster_delete(args) -> None:
237
234
0 if successful and 1 otherwise.
238
235
"""
239
236
xpk_print (f'Starting cluster delete for cluster: { args .cluster } ' , flush = True )
240
- add_zone_and_project (args )
237
+ GCloudContextManager . add_zone_and_project (args )
241
238
242
239
if cluster_gcluster .created_by_gcluster (args ):
243
240
xpk_print (f'Deleting { args .cluster } cluster using Cluster Toolkit...' )
@@ -263,7 +260,7 @@ def cluster_cacheimage(args) -> None:
263
260
xpk_print (
264
261
f'Starting cluster cacheimage for cluster: { args .cluster } ' , flush = True
265
262
)
266
- add_zone_and_project (args )
263
+ GCloudContextManager . add_zone_and_project (args )
267
264
268
265
set_cluster_command_code = set_cluster_command (args )
269
266
if set_cluster_command_code != 0 :
@@ -314,7 +311,7 @@ def cluster_describe(args) -> None:
314
311
0 if successful and 1 otherwise.
315
312
"""
316
313
xpk_print (f'Starting nodepool list for cluster: { args .cluster } ' , flush = True )
317
- add_zone_and_project (args )
314
+ GCloudContextManager . add_zone_and_project (args )
318
315
319
316
set_cluster_command_code = set_cluster_command (args )
320
317
if set_cluster_command_code != 0 :
@@ -547,7 +544,7 @@ def cluster_list(args) -> None:
547
544
Returns:
548
545
0 if successful and 1 otherwise.
549
546
"""
550
- add_zone_and_project (args )
547
+ GCloudContextManager . add_zone_and_project (args )
551
548
xpk_print (f'For project { args .project } and zone { args .zone } :' , flush = True )
552
549
if run_gke_clusters_list_command (args ):
553
550
xpk_exit (1 )
@@ -595,7 +592,8 @@ def create_cluster_if_necessary(
595
592
Returns:
596
593
0 if successful and 1 otherwise.
597
594
"""
598
- all_clusters , return_code = get_all_clusters_programmatic (args )
595
+ cluster_manager = ClusterManager (args , system )
596
+ all_clusters , return_code = cluster_manager .get_all_clusters ()
599
597
if return_code > 0 :
600
598
xpk_print ('Listing all clusters failed!' )
601
599
return 1
@@ -617,6 +615,8 @@ def run_gke_cluster_delete_command(args) -> int:
617
615
Returns:
618
616
0 if successful and 1 otherwise.
619
617
"""
618
+ network_manager = ClusterNetworkManager (args )
619
+
620
620
if not args .force :
621
621
xpk_print ('Get the name of the workloads in the cluster.' )
622
622
args .filter_by_status = 'EVERYTHING'
@@ -639,15 +639,15 @@ def run_gke_cluster_delete_command(args) -> int:
639
639
command = (
640
640
'gcloud beta container clusters delete'
641
641
f' { args .cluster } --project={ args .project } '
642
- f' --region={ zone_to_region (args .zone )} --quiet'
642
+ f' --region={ GCloudContextManager . zone_to_region (args .zone )} --quiet'
643
643
)
644
644
645
645
return_code = run_command_with_updates (command , 'Cluster Delete' , args )
646
646
if return_code != 0 :
647
647
xpk_print (f'Cluster delete request returned ERROR { return_code } ' )
648
648
return 1
649
649
650
- return_code = delete_cluster_subnets ( args )
650
+ return_code = network_manager . delete_subnets ( )
651
651
if return_code != 0 :
652
652
return return_code
653
653
@@ -665,7 +665,7 @@ def run_gke_clusters_list_command(args) -> int:
665
665
"""
666
666
command = (
667
667
'gcloud container clusters list'
668
- f' --project={ args .project } --region={ zone_to_region (args .zone )} '
668
+ f' --project={ args .project } --region={ GCloudContextManager . zone_to_region (args .zone )} '
669
669
)
670
670
return_code = run_command_with_updates (command , 'Cluster List' , args )
671
671
if return_code != 0 :
@@ -712,7 +712,7 @@ def run_gke_cluster_create_command(
712
712
command = (
713
713
'gcloud beta container clusters create'
714
714
f' { args .cluster } --project={ args .project } '
715
- f' --region={ zone_to_region (args .zone )} '
715
+ f' --region={ GCloudContextManager . zone_to_region (args .zone )} '
716
716
f' --node-locations={ args .zone } '
717
717
f' --cluster-version={ gke_control_plane_version } '
718
718
f' --machine-type={ machine_type } '
0 commit comments