Skip to content

Commit 6e5d23a

Browse files
committed
Fix autoprovisioning with spot nodes
1 parent 8737b38 commit 6e5d23a

File tree

3 files changed

+117
-33
lines changed

3 files changed

+117
-33
lines changed

src/xpk/commands/workload.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
from ..core.kueue import LOCAL_QUEUE_NAME
4949
from ..core.nap import (
5050
get_autoprovisioning_node_selector_args,
51+
get_autoprovisioning_tolerations,
5152
is_autoprovisioning_enabled,
5253
)
5354
from ..core.pathways import (
@@ -101,6 +102,8 @@
101102
hostNetwork: true
102103
dnsPolicy: ClusterFirstWithHostNet
103104
terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
105+
tolerations:
106+
{autoprovisioning_tolerations}
104107
containers:
105108
{container}
106109
volumes:
@@ -395,6 +398,7 @@ def workload_create(args) -> None:
395398

396399
# Currently autoprovisioning is not enabled for Pathways workloads.
397400
autoprovisioning_args = ''
401+
autoprovisioning_tolerations = ''
398402
autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
399403
args, system
400404
)
@@ -407,6 +411,11 @@ def workload_create(args) -> None:
407411
)
408412
if return_code != 0:
409413
xpk_exit(return_code)
414+
autoprovisioning_tolerations, return_code = (
415+
get_autoprovisioning_tolerations(args)
416+
)
417+
if return_code != 0:
418+
xpk_exit(return_code)
410419

411420
# Create the workload file based on accelerator type or workload type.
412421
if system.accelerator_type == AcceleratorType['GPU']:
@@ -467,6 +476,7 @@ def workload_create(args) -> None:
467476
local_queue_name=LOCAL_QUEUE_NAME,
468477
autoprovisioning_args=autoprovisioning_args,
469478
volumes=get_volumes(args, system),
479+
autoprovisioning_tolerations=autoprovisioning_tolerations,
470480
)
471481
tmp = write_tmp_file(yml_string)
472482
command = f'kubectl apply -f {str(tmp.file.name)}'

src/xpk/core/core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -763,7 +763,7 @@ def get_capacity_node_selectors_from_capacity_type(
763763
case CapacityType.ON_DEMAND.name:
764764
node_selector = ''
765765
case CapacityType.SPOT.name:
766-
node_selector = 'cloud.google.com/gke-spot="true"'
766+
node_selector = 'cloud.google.com/gke-spot: "true"'
767767
case CapacityType.RESERVATION.name:
768768
node_selector = f'cloud.google.com/reservation-name: {args.reservation}'
769769
case _:

src/xpk/core/nap.py

Lines changed: 106 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,29 @@ def create_autoprovisioning_config(
240240
return autoprovisioning_config, 0
241241

242242

243+
def get_cluster_metadata_configmap(args) -> tuple[dict | None, int]:
  """Gets the cluster metadata configmap.

  Looks up the configmap named `{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}`
  and, when it cannot be found, prints guidance telling the user to pass an
  explicit capacity type instead.

  Args:
    args: user provided arguments for running the command.

  Returns:
    Tuple of the configmap and 0 if found, or None and 1 otherwise.
  """
  configmap = get_cluster_configmap(
      args, f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
  )
  if configmap is None:
    # Without the stored metadata there is no cluster default to fall back on,
    # so the user has to state the capacity type explicitly.
    xpk_print(
        'Unable to find config map. Please specify a capacity type'
        ' (--on-demand, --spot, --reservation=$RESERVATION_ID) to continue'
        ' to use autoprovisioning (--enable-autoprovisioning).'
    )
    return None, 1

  return configmap, 0
264+
265+
243266
def is_autoprovisioning_enabled(
244267
args, system: SystemCharacteristics
245268
) -> tuple[bool, int]:
@@ -285,6 +308,42 @@ def is_autoprovisioning_enabled(
285308
return False, 1
286309

287310

311+
def get_capacity_type_str_from_args_or_cluster_default(args) -> tuple[str, int]:
  """Resolve the capacity type name from user arguments or the cluster default.

  Args:
    args: user provided arguments for running the command.

  Returns:
    Tuple of the capacity type name and a return code. The code is 0 on
    success; on any failure the name is CapacityType.UNKNOWN.name and the code
    is non-zero.
  """
  unknown_name = CapacityType.UNKNOWN.name

  # A capacity type given by the user takes precedence over cluster settings.
  requested_type, status = get_capacity_type(args)
  if status != 0:
    xpk_print('Unable to get capacity type.')
    return unknown_name, status
  if requested_type != CapacityType.UNKNOWN:
    return requested_type.name, 0

  # Nothing was requested explicitly, so fall back to the default recorded at
  # cluster creation. A missing metadata config map is an error here because
  # there is then no default to use for autoprovisioning.
  metadata_map, status = get_cluster_metadata_configmap(args)
  if status != 0:
    return unknown_name, 1

  # Note: get_value_from_map returns (return_code, value), in that order.
  status, stored_type_name = get_value_from_map(
      CAPACITY_TYPE_CONFIG_KEY, metadata_map
  )
  if status != 0:
    return unknown_name, status
  return stored_type_name, 0
345+
346+
288347
def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
289348
"""Determine the capacity type when autoprovisioning is enabled.
290349
@@ -297,44 +356,26 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
297356
"""
298357
return_code = 0
299358
node_selector_args = ''
300-
# If the user doesn't specify args, then use the cluster settings.
301-
capacity_type, return_code = get_capacity_type(args)
302-
capacity_type_str = capacity_type.name
359+
capacity_type_str, return_code = (
360+
get_capacity_type_str_from_args_or_cluster_default(args)
361+
)
303362
if return_code != 0:
304-
xpk_print('Unable to get capacity type.')
305363
return node_selector_args, return_code
306364

307-
if capacity_type_str == CapacityType.UNKNOWN.name:
308-
# Use default settings from cluster creation.
309-
metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
310-
cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
311-
312-
# Error out if the metadata config map doesn't exist, and is attempting to use
313-
# autoprovisioning.
314-
if cluster_config_map is None:
315-
xpk_print(
316-
'Unable to find config map. Please specify a capacity type'
317-
' --on-demand, --spot, --reservation=$RESERVATION_ID) to continue'
318-
' to use autoprovisioning (--enable-autoprovisioning).'
319-
)
320-
return node_selector_args, 1
321-
322-
return_code, capacity_type_str = get_value_from_map(
323-
CAPACITY_TYPE_CONFIG_KEY, cluster_config_map
365+
cluster_config_map, return_code = get_cluster_metadata_configmap(args)
366+
if return_code != 0:
367+
return node_selector_args, 1
368+
369+
if capacity_type_str == CapacityType.RESERVATION.name:
370+
return_code, args.reservation = get_value_from_map(
371+
RESERVATION_CONFIG_KEY, cluster_config_map
324372
)
325373
if return_code != 0:
326374
return node_selector_args, return_code
327-
328-
if capacity_type_str == CapacityType.RESERVATION.name:
329-
return_code, args.reservation = get_value_from_map(
330-
RESERVATION_CONFIG_KEY, cluster_config_map
331-
)
332-
if return_code != 0:
333-
return node_selector_args, return_code
334-
return_code = verify_reservation_exists(args)
335-
if return_code > 0:
336-
xpk_print('Unable to verify reservation name saved in config map.')
337-
return node_selector_args, return_code
375+
return_code = verify_reservation_exists(args)
376+
if return_code > 0:
377+
xpk_print('Unable to verify reservation name saved in config map.')
378+
return node_selector_args, return_code
338379

339380
# Check if reservation id is valid. Shared function with cluster creation.
340381
node_selector_args, return_code = (
@@ -345,3 +386,36 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
345386
return node_selector_args, return_code
346387

347388
return node_selector_args, return_code
389+
390+
391+
def get_autoprovisioning_tolerations(args) -> tuple[str, int]:
392+
"""Determine the pod tolerations when autoprovisioning is enabled.
393+
394+
Args:
395+
args: user provided arguments for running the command.
396+
397+
Returns:
398+
Tuple of the toleration YAML string (empty unless capacity type is spot)
399+
and a return code: 0 on success, non-zero if the capacity lookup failed.
400+
"""
401+
capacity_type_str, return_code = (
402+
get_capacity_type_str_from_args_or_cluster_default(args)
403+
)
404+
if return_code != 0:
405+
return '', return_code
406+
407+
if capacity_type_str == CapacityType.SPOT.name:
408+
# https://cloud.google.com/kubernetes-engine/docs/concepts/node-auto-provisioning#support_for_spot_vms
409+
#
410+
# > Creating node pools based on Spot VMs is only considered if
411+
# > unschedulable pods with a toleration for the
412+
# > cloud.google.com/gke-spot="true":NoSchedule taint exist
413+
return (
414+
'''- key: "cloud.google.com/gke-spot"
415+
operator: "Equal"
416+
value: "true"
417+
effect: "NoSchedule"''',
418+
0,
419+
)
420+
421+
# Any capacity type other than spot gets no extra toleration.
return '', 0

0 commit comments

Comments
 (0)