Skip to content

Commit 5ca8df3

Browse files
committed
Fix autoprovisioning with spot nodes
1 parent 8737b38 commit 5ca8df3

File tree

3 files changed

+88
-33
lines changed

3 files changed

+88
-33
lines changed

src/xpk/commands/workload.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
from ..core.kueue import LOCAL_QUEUE_NAME
4949
from ..core.nap import (
5050
get_autoprovisioning_node_selector_args,
51+
get_autoprovisioning_tolerations,
5152
is_autoprovisioning_enabled,
5253
)
5354
from ..core.pathways import (
@@ -101,6 +102,8 @@
101102
hostNetwork: true
102103
dnsPolicy: ClusterFirstWithHostNet
103104
terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
105+
tolerations:
106+
{autoprovisioning_tolerations}
104107
containers:
105108
{container}
106109
volumes:
@@ -395,6 +398,7 @@ def workload_create(args) -> None:
395398

396399
# Currently autoprovisioning is not enabled for Pathways workloads.
397400
autoprovisioning_args = ''
401+
autoprovisioning_tolerations = ''
398402
autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
399403
args, system
400404
)
@@ -407,6 +411,11 @@ def workload_create(args) -> None:
407411
)
408412
if return_code != 0:
409413
xpk_exit(return_code)
414+
autoprovisioning_tolerations, return_code = (
415+
get_autoprovisioning_tolerations(args)
416+
)
417+
if return_code != 0:
418+
xpk_exit(return_code)
410419

411420
# Create the workload file based on accelerator type or workload type.
412421
if system.accelerator_type == AcceleratorType['GPU']:
@@ -467,6 +476,7 @@ def workload_create(args) -> None:
467476
local_queue_name=LOCAL_QUEUE_NAME,
468477
autoprovisioning_args=autoprovisioning_args,
469478
volumes=get_volumes(args, system),
479+
autoprovisioning_tolerations=autoprovisioning_tolerations,
470480
)
471481
tmp = write_tmp_file(yml_string)
472482
command = f'kubectl apply -f {str(tmp.file.name)}'

src/xpk/core/core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -763,7 +763,7 @@ def get_capacity_node_selectors_from_capacity_type(
763763
case CapacityType.ON_DEMAND.name:
764764
node_selector = ''
765765
case CapacityType.SPOT.name:
766-
node_selector = 'cloud.google.com/gke-spot="true"'
766+
node_selector = 'cloud.google.com/gke-spot: "true"'
767767
case CapacityType.RESERVATION.name:
768768
node_selector = f'cloud.google.com/reservation-name: {args.reservation}'
769769
case _:

src/xpk/core/nap.py

Lines changed: 77 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,47 @@ def is_autoprovisioning_enabled(
285285
return False, 1
286286

287287

288+
def get_capacity_type_str_with_cluster_default(args) -> tuple[str, int]:
  """Resolve the capacity type name, falling back to the cluster default.

  The capacity type derived from the user arguments wins when it is not
  UNKNOWN. Otherwise the value stored under CAPACITY_TYPE_CONFIG_KEY in the
  cluster metadata config map is used.

  Args:
    args: user provided arguments for running the command.

  Returns:
    Tuple with string of the capacity type name and
    int of 0 if successful and a non-zero return code otherwise. On failure
    the string is CapacityType.UNKNOWN.name.
  """
  unknown_name = CapacityType.UNKNOWN.name

  # Capacity type requested through the user arguments takes precedence.
  requested_type, status = get_capacity_type(args)
  if status != 0:
    xpk_print('Unable to get capacity type.')
    return unknown_name, status
  if requested_type != CapacityType.UNKNOWN:
    return requested_type.name, 0

  # Nothing was requested: fall back to the settings saved at cluster creation.
  config_map = get_cluster_configmap(
      args, f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
  )
  if config_map is None:
    # Without the metadata config map there is no default to fall back on.
    # NOTE(review): the message below has a closing parenthesis without a
    # matching opening one.
    xpk_print(
        'Unable to find config map. Please specify a capacity type'
        ' --on-demand, --spot, --reservation=$RESERVATION_ID) to continue'
        ' to use autoprovisioning (--enable-autoprovisioning).'
    )
    return unknown_name, 1

  status, stored_name = get_value_from_map(
      CAPACITY_TYPE_CONFIG_KEY, config_map
  )
  return (stored_name, 0) if status == 0 else (unknown_name, status)
328+
288329
def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
289330
"""Determine the capacity type when autoprovisioning is enabled.
290331
@@ -297,44 +338,20 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
297338
"""
298339
return_code = 0
299340
node_selector_args = ''
300-
# If the user doesn't specify args, then use the cluster settings.
301-
capacity_type, return_code = get_capacity_type(args)
302-
capacity_type_str = capacity_type.name
341+
capacity_type_str, return_code = get_capacity_type_str_with_cluster_default(args)
303342
if return_code != 0:
304-
xpk_print('Unable to get capacity type.')
305343
return node_selector_args, return_code
306344

307-
if capacity_type_str == CapacityType.UNKNOWN.name:
308-
# Use default settings from cluster creation.
309-
metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
310-
cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
311-
312-
# Error out if the metadata config map doesn't exist, and is attempting to use
313-
# autoprovisioning.
314-
if cluster_config_map is None:
315-
xpk_print(
316-
'Unable to find config map. Please specify a capacity type'
317-
' --on-demand, --spot, --reservation=$RESERVATION_ID) to continue'
318-
' to use autoprovisioning (--enable-autoprovisioning).'
319-
)
320-
return node_selector_args, 1
321-
322-
return_code, capacity_type_str = get_value_from_map(
323-
CAPACITY_TYPE_CONFIG_KEY, cluster_config_map
345+
if capacity_type_str == CapacityType.RESERVATION.name:
346+
return_code, args.reservation = get_value_from_map(
347+
RESERVATION_CONFIG_KEY, cluster_config_map
324348
)
325349
if return_code != 0:
326350
return node_selector_args, return_code
327-
328-
if capacity_type_str == CapacityType.RESERVATION.name:
329-
return_code, args.reservation = get_value_from_map(
330-
RESERVATION_CONFIG_KEY, cluster_config_map
331-
)
332-
if return_code != 0:
333-
return node_selector_args, return_code
334-
return_code = verify_reservation_exists(args)
335-
if return_code > 0:
336-
xpk_print('Unable to verify reservation name saved in config map.')
337-
return node_selector_args, return_code
351+
return_code = verify_reservation_exists(args)
352+
if return_code > 0:
353+
xpk_print('Unable to verify reservation name saved in config map.')
354+
return node_selector_args, return_code
338355

339356
# Check if reservation id is valid. Shared function with cluster creation.
340357
node_selector_args, return_code = (
@@ -345,3 +362,31 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
345362
return node_selector_args, return_code
346363

347364
return node_selector_args, return_code
365+
366+
367+
def get_autoprovisioning_tolerations(args) -> tuple[str, int]:
  """Determine the pod tolerations when autoprovisioning is enabled.

  Args:
    args: user provided arguments for running the command.

  Returns:
    Tuple with string of autoprovisioning tolerations and
    int of 0 if successful and a non-zero return code otherwise. The string
    is a single YAML sequence item for spot capacity and '' for every other
    capacity type.
  """
  capacity_type_str, return_code = get_capacity_type_str_with_cluster_default(
      args
  )
  if return_code != 0:
    return '', return_code

  if capacity_type_str != CapacityType.SPOT.name:
    return '', 0

  # https://cloud.google.com/kubernetes-engine/docs/concepts/node-auto-provisioning#support_for_spot_vms
  #
  # > Creating node pools based on Spot VMs is only considered if
  # > unschedulable pods with a toleration for the
  # > cloud.google.com/gke-spot="true":NoSchedule taint exist
  #
  # The toleration is emitted as a one-line YAML flow mapping so that it stays
  # valid at whatever indentation the surrounding template places it; a
  # multi-line block mapping would need its continuation lines to match the
  # template's nesting depth exactly.
  spot_toleration = (
      '- {key: "cloud.google.com/gke-spot", operator: "Equal",'
      ' value: "true", effect: "NoSchedule"}'
  )
  return spot_toleration, 0

0 commit comments

Comments
 (0)