16
16
17
17
from tabulate import tabulate
18
18
19
- from ..core .capacity import H100_DEVICE_TYPE
20
- from ..core .cluster import (
21
- get_all_clusters_programmatic ,
22
- get_cluster_credentials ,
23
- install_nccl_on_cluster ,
24
- set_jobset_on_cluster ,
25
- setup_k8s_env ,
26
- update_cluster_with_gcsfuse_driver_if_necessary ,
27
- update_cluster_with_workload_identity_if_necessary ,
28
- )
19
+ from ..core .capacity import CapacityManager , DeviceType
20
+ from ..core .cluster import ClusterManager
29
21
from ..core .cluster_private import authorize_private_cluster_access_if_necessary
30
22
from ..core .commands import run_command_for_value , run_command_with_updates
31
23
from ..core .config import VERTEX_TENSORBOARD_FEATURE_FLAG
32
- from ..core .gcloud_context import (
33
- add_zone_and_project ,
34
- get_gke_control_plane_version ,
35
- get_gke_server_config ,
36
- zone_to_region ,
37
- )
24
+ from ..core .gcloud_context import GCloudContextManager , GKEVersionManager
38
25
from ..core .kjob import apply_kjob_crds , prepare_kjob , verify_kjob_installed
39
26
from ..core .kueue import (
40
27
cluster_preheat_yml ,
43
30
wait_for_kueue_available ,
44
31
)
45
32
from ..core .nap import enable_autoprovisioning_on_cluster
46
- from ..core .network import (
47
- create_cluster_network_config ,
48
- delete_cluster_subnets ,
49
- set_up_cluster_network_for_gpu ,
50
- )
51
- from ..core .nodepool import get_gke_node_pool_version , run_gke_node_pool_create_command
33
+ from ..core .network import ClusterNetworkManager
34
+ from ..core .nodepool import NodePoolManager
52
35
from ..core .ray import install_ray_cluster
53
- from ..core .resources import create_cluster_configmaps
36
+ from ..core .resources import ResourceManager
54
37
from ..core .storage import install_storage_crd
55
38
from ..core .system_characteristics import (
56
39
AcceleratorType ,
57
40
AcceleratorTypeToAcceleratorCharacteristics ,
58
41
SystemCharacteristics ,
59
42
get_system_characteristics ,
60
43
)
61
- from ..core .vertex import create_vertex_tensorboard
44
+ from ..core .vertex import VertexAI
62
45
from ..core .workload import get_workload_list
63
46
from ..utils .console import get_user_input , xpk_exit , xpk_print
64
47
from ..utils .file import write_tmp_file
65
48
from . import cluster_gcluster
66
49
from .common import set_cluster_command
67
- from ..core .cluster import update_cluster_with_gcpfilestore_driver_if_necessary
68
50
69
51
70
52
def cluster_create (args ) -> None :
@@ -77,13 +59,25 @@ def cluster_create(args) -> None:
77
59
0 if successful and 1 otherwise.
78
60
"""
79
61
system , return_code = get_system_characteristics (args )
80
-
81
62
if return_code > 0 :
82
63
xpk_print ('Fetching system characteristics failed!' )
83
64
xpk_exit (return_code )
84
65
85
66
xpk_print (f'Starting cluster create for cluster { args .cluster } :' , flush = True )
86
- add_zone_and_project (args )
67
+ GCloudContextManager .add_zone_and_project (args )
68
+ cluster_manager = ClusterManager (args , system )
69
+ capacity_manager = CapacityManager (args )
70
+ resource_manager = ResourceManager (args , capacity_manager , system )
71
+ nodepools_manager = NodePoolManager (
72
+ args , system , resource_manager , capacity_manager
73
+ )
74
+ network_manager = ClusterNetworkManager (args )
75
+ vertex_ai = VertexAI (args , resource_manager )
76
+ try :
77
+ version_manager = GKEVersionManager (args )
78
+ except RuntimeError as e :
79
+ xpk_print (e )
80
+ xpk_exit (1 )
87
81
88
82
if system .device_type in cluster_gcluster .supported_device_types :
89
83
xpk_print (
@@ -93,12 +87,8 @@ def cluster_create(args) -> None:
93
87
cluster_gcluster .cluster_create (args )
94
88
xpk_exit (0 )
95
89
96
- return_code , gke_server_config = get_gke_server_config (args )
97
- if return_code != 0 :
98
- xpk_exit (return_code )
99
-
100
- return_code , gke_control_plane_version = get_gke_control_plane_version (
101
- args , gke_server_config
90
+ return_code , gke_control_plane_version = (
91
+ version_manager .get_gke_control_plane_version ()
102
92
)
103
93
if return_code != 0 :
104
94
xpk_exit (return_code )
@@ -123,60 +113,61 @@ def cluster_create(args) -> None:
123
113
or args .enable_gcpfilestore_csi_driver
124
114
):
125
115
update_cluster_command_code = (
126
- update_cluster_with_workload_identity_if_necessary (args )
116
+ cluster_manager . update_cluster_with_workload_identity_if_necessary ()
127
117
)
128
118
if update_cluster_command_code != 0 :
129
119
xpk_exit (update_cluster_command_code )
130
120
131
121
# Enable GCSFuse CSI Driver if not enabled already.
132
122
if args .enable_gcsfuse_csi_driver :
133
123
update_cluster_command_code = (
134
- update_cluster_with_gcsfuse_driver_if_necessary (args )
124
+ cluster_manager . update_cluster_with_gcsfuse_driver_if_necessary ()
135
125
)
136
126
if update_cluster_command_code != 0 :
137
127
xpk_exit (update_cluster_command_code )
138
128
139
129
if args .enable_gcpfilestore_csi_driver :
140
130
update_cluster_command_code = (
141
- update_cluster_with_gcpfilestore_driver_if_necessary (args )
131
+ cluster_manager . update_cluster_with_gcpfilestore_driver_if_necessary ()
142
132
)
143
133
if update_cluster_command_code != 0 :
144
134
xpk_exit (update_cluster_command_code )
145
135
146
136
# Update Pathways clusters with CloudDNS if not enabled already.
147
137
148
- get_cluster_credentials (args )
138
+ cluster_manager . get_cluster_credentials ()
149
139
150
140
# create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
151
141
tensorboard_config = {}
152
142
if VERTEX_TENSORBOARD_FEATURE_FLAG and args .create_vertex_tensorboard :
153
- tensorboard_config = create_vertex_tensorboard (args )
143
+ tensorboard_config = vertex_ai . create_vertex_tensorboard ()
154
144
# exit if failed to create Tensorboard in Vertex AI
155
145
if not tensorboard_config :
156
146
xpk_exit (1 )
157
147
158
148
if system .accelerator_type == AcceleratorType ['GPU' ]:
159
149
xpk_print ('Setting up Network for cluster' )
160
- set_up_cluster_network_code = set_up_cluster_network_for_gpu ( args , system )
150
+ set_up_cluster_network_code = network_manager . set_up_network_for_gpu ( system )
161
151
if set_up_cluster_network_code != 0 :
162
152
xpk_exit (set_up_cluster_network_code )
163
153
164
- if system .device_type == H100_DEVICE_TYPE :
154
+ if system .device_type == DeviceType . H100 . value :
165
155
xpk_print ('Creating Network Config for cluster' )
166
- create_cluster_network_config_code = create_cluster_network_config ( args )
156
+ create_cluster_network_config_code = network_manager . create_network_config ( )
167
157
if create_cluster_network_config_code != 0 :
168
158
xpk_exit (create_cluster_network_config_code )
169
159
170
160
# Check the control plane version of the cluster and determine the node pool
171
161
# version to use.
172
- return_code , gke_node_pool_version = get_gke_node_pool_version (
173
- args , gke_server_config
162
+
163
+ return_code , gke_node_pool_version = (
164
+ nodepools_manager .get_gke_node_pool_version (version_manager )
174
165
)
175
166
if return_code != 0 :
176
167
xpk_exit (return_code )
177
168
178
- run_gke_node_pool_create_command_code = run_gke_node_pool_create_command (
179
- args , system , gke_node_pool_version
169
+ run_gke_node_pool_create_command_code = (
170
+ nodepools_manager . run_gke_node_pool_create_command ( gke_node_pool_version )
180
171
)
181
172
if run_gke_node_pool_create_command_code != 0 :
182
173
xpk_exit (run_gke_node_pool_create_command_code )
@@ -193,8 +184,8 @@ def cluster_create(args) -> None:
193
184
xpk_exit (return_code )
194
185
195
186
xpk_print ('Creating ConfigMap for cluster' )
196
- create_cluster_configmaps_code = create_cluster_configmaps (
197
- args , system , tensorboard_config , autoprovisioning_config
187
+ create_cluster_configmaps_code = resource_manager . create_cluster_configmaps (
188
+ tensorboard_config , autoprovisioning_config
198
189
)
199
190
if create_cluster_configmaps_code != 0 :
200
191
xpk_exit (create_cluster_configmaps_code )
@@ -203,7 +194,7 @@ def cluster_create(args) -> None:
203
194
'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
204
195
' globally available'
205
196
)
206
- set_jobset_on_cluster_code = set_jobset_on_cluster (args )
197
+ set_jobset_on_cluster_code = cluster_manager . set_jobset_on_cluster ()
207
198
if set_jobset_on_cluster_code != 0 :
208
199
xpk_exit (set_jobset_on_cluster_code )
209
200
@@ -226,7 +217,7 @@ def cluster_create(args) -> None:
226
217
if err_code > 0 :
227
218
xpk_exit (err_code )
228
219
229
- k8s_client = setup_k8s_env (args )
220
+ k8s_client = cluster_manager . setup_k8s_env ()
230
221
install_storage_crd (k8s_client )
231
222
232
223
xpk_print ('Wait for Kueue to be fully available' )
@@ -243,7 +234,7 @@ def cluster_create(args) -> None:
243
234
244
235
if system .accelerator_type == AcceleratorType ['GPU' ]:
245
236
xpk_print ('Installing NCCL Plugin for cluster' )
246
- install_nccl_code = install_nccl_on_cluster (args , system )
237
+ install_nccl_code = cluster_manager . install_nccl_on_cluster ()
247
238
if install_nccl_code != 0 :
248
239
xpk_exit (install_nccl_code )
249
240
@@ -257,7 +248,7 @@ def cluster_create(args) -> None:
257
248
xpk_print (
258
249
'See your GKE Cluster here:'
259
250
# pylint: disable=line-too-long
260
- f' https://console.cloud.google.com/kubernetes/clusters/details/{ zone_to_region (args .zone )} /{ args .cluster } /details?project={ args .project } '
251
+ f' https://console.cloud.google.com/kubernetes/clusters/details/{ GCloudContextManager . zone_to_region (args .zone )} /{ args .cluster } /details?project={ args .project } '
261
252
)
262
253
xpk_exit (0 )
263
254
@@ -272,7 +263,7 @@ def cluster_delete(args) -> None:
272
263
0 if successful and 1 otherwise.
273
264
"""
274
265
xpk_print (f'Starting cluster delete for cluster: { args .cluster } ' , flush = True )
275
- add_zone_and_project (args )
266
+ GCloudContextManager . add_zone_and_project (args )
276
267
277
268
if cluster_gcluster .created_by_gcluster (args ):
278
269
xpk_print (f'Deleting { args .cluster } cluster using Cluster Toolkit...' )
@@ -303,14 +294,13 @@ def cluster_cacheimage(args) -> None:
303
294
xpk_print (
304
295
f'Starting cluster cacheimage for cluster: { args .cluster } ' , flush = True
305
296
)
306
- add_zone_and_project (args )
307
-
308
- get_cluster_credentials (args )
297
+ GCloudContextManager .add_zone_and_project (args )
309
298
system , return_code = get_system_characteristics (args )
310
-
311
299
if return_code > 0 :
312
300
xpk_print ('Fetching system characteristics failed!' )
313
301
xpk_exit (return_code )
302
+ cluster_manager = ClusterManager (args , system )
303
+ cluster_manager .get_cluster_credentials ()
314
304
315
305
node_selector_key = AcceleratorTypeToAcceleratorCharacteristics [
316
306
system .accelerator_type
@@ -352,9 +342,13 @@ def cluster_describe(args) -> None:
352
342
0 if successful and 1 otherwise.
353
343
"""
354
344
xpk_print (f'Starting nodepool list for cluster: { args .cluster } ' , flush = True )
355
- add_zone_and_project (args )
356
-
357
- get_cluster_credentials (args )
345
+ GCloudContextManager .add_zone_and_project (args )
346
+ system , return_code = get_system_characteristics (args )
347
+ if return_code > 0 :
348
+ xpk_print ('Fetching system characteristics failed!' )
349
+ xpk_exit (return_code )
350
+ cluster_manager = ClusterManager (args , system )
351
+ cluster_manager .get_cluster_credentials ()
358
352
359
353
return_code , data_table = nodepools_build_table (args )
360
354
if return_code != 0 :
@@ -583,7 +577,7 @@ def cluster_list(args) -> None:
583
577
Returns:
584
578
0 if successful and 1 otherwise.
585
579
"""
586
- add_zone_and_project (args )
580
+ GCloudContextManager . add_zone_and_project (args )
587
581
xpk_print (f'For project { args .project } and zone { args .zone } :' , flush = True )
588
582
if run_gke_clusters_list_command (args ):
589
583
xpk_exit (1 )
@@ -631,7 +625,8 @@ def create_cluster_if_necessary(
631
625
Returns:
632
626
0 if successful and 1 otherwise.
633
627
"""
634
- all_clusters , return_code = get_all_clusters_programmatic (args )
628
+ cluster_manager = ClusterManager (args , system )
629
+ all_clusters , return_code = cluster_manager .get_all_clusters_programmatic ()
635
630
if return_code > 0 :
636
631
xpk_print ('Listing all clusters failed!' )
637
632
return 1
@@ -653,6 +648,8 @@ def run_gke_cluster_delete_command(args) -> int:
653
648
Returns:
654
649
0 if successful and 1 otherwise.
655
650
"""
651
+ network_manager = ClusterNetworkManager (args )
652
+
656
653
if not args .force :
657
654
xpk_print ('Get the name of the workloads in the cluster.' )
658
655
args .filter_by_status = 'EVERYTHING'
@@ -675,15 +672,15 @@ def run_gke_cluster_delete_command(args) -> int:
675
672
command = (
676
673
'gcloud beta container clusters delete'
677
674
f' { args .cluster } --project={ args .project } '
678
- f' --region={ zone_to_region (args .zone )} --quiet'
675
+ f' --region={ GCloudContextManager . zone_to_region (args .zone )} --quiet'
679
676
)
680
677
681
678
return_code = run_command_with_updates (command , 'Cluster Delete' , args )
682
679
if return_code != 0 :
683
680
xpk_print (f'Cluster delete request returned ERROR { return_code } ' )
684
681
return 1
685
682
686
- return_code = delete_cluster_subnets ( args )
683
+ return_code = network_manager . delete_subnets ( )
687
684
if return_code != 0 :
688
685
return return_code
689
686
@@ -701,7 +698,7 @@ def run_gke_clusters_list_command(args) -> int:
701
698
"""
702
699
command = (
703
700
'gcloud container clusters list'
704
- f' --project={ args .project } --region={ zone_to_region (args .zone )} '
701
+ f' --project={ args .project } --region={ GCloudContextManager . zone_to_region (args .zone )} '
705
702
)
706
703
return_code = run_command_with_updates (command , 'Cluster List' , args )
707
704
if return_code != 0 :
@@ -748,7 +745,7 @@ def run_gke_cluster_create_command(
748
745
command = (
749
746
'gcloud beta container clusters create'
750
747
f' { args .cluster } --project={ args .project } '
751
- f' --region={ zone_to_region (args .zone )} '
748
+ f' --region={ GCloudContextManager . zone_to_region (args .zone )} '
752
749
f' --node-locations={ args .zone } '
753
750
f' --cluster-version={ gke_control_plane_version } '
754
751
f' --machine-type={ machine_type } '
0 commit comments