Skip to content

Commit de5280c

Browse files
committed
Add local tests with Kind
1 parent dd721f5 commit de5280c

20 files changed

+429
-166
lines changed

Makefile

+4
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ run-unittests:
3737
run-integrationtests:
3838
pytest src/xpk/core/tests/integration/
3939

40+
run-kindtests:
41+
chmod +x ./tools/run-kind-tests.sh
42+
./tools/run-kind-tests.sh
43+
4044
.PHONY: mkdir-bin
4145
mkdir-bin:
4246
mkdir -p $(BIN_PATH)

README.md

+7-2
Original file line numberDiff line numberDiff line change
@@ -1349,13 +1349,18 @@ xpk interfaces seamlessly with kind to manage Kubernetes clusters locally, facil
13491349
13501350
## Local Testing Basics
13511351
1352-
Local testing is available exclusively through the `batch` and `job` commands of xpk with the `--kind-cluster` flag. This allows you to simulate training jobs locally:
1352+
Local testing is available for most `xpk` commands, except those that require Pathways, such as `cluster create-pathways` or `workload create-pathways`. Pass the `--kind-cluster` flag to run these operations locally against a cluster managed by the `kind` tool.
1353+
1354+
This example demonstrates how to run a batch job locally using the `--kind-cluster` flag:
13531355
13541356
```shell
13551357
python xpk.py batch [other-options] --kind-cluster script
13561358
```
13571359
1358-
Please note that all other xpk subcommands are intended for use with cloud systems on Google Cloud Engine (GCE) and don't support local testing. This includes commands like cluster, info, inspector, etc.
1360+
Although the `--kind-cluster` flag extends local testing to several commands, commands that depend on specific Google Cloud Platform (GCP) features might not yet be fully supported when run locally. Future updates may provide enhanced support for these GCP-specific features.
1361+
1362+
Currently supported local testing cases can be reviewed in the script: `tools/run-kind-tests.sh`.
1363+
13591364
13601365
# Other advanced usage
13611366
[Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md)

src/xpk/commands/cluster.py

+13-12
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
limitations under the License.
1515
"""
1616

17+
from tabulate import tabulate
18+
19+
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
1720
from ..core.commands import run_command_for_value, run_command_with_updates
1821
from ..core.core import (
1922
VERTEX_TENSORBOARD_FEATURE_FLAG,
@@ -26,20 +29,15 @@
2629
get_gke_control_plane_version,
2730
get_gke_node_pool_version,
2831
get_gke_server_config,
32+
get_user_input,
2933
h100_device_type,
3034
install_nccl_on_cluster,
3135
run_gke_node_pool_create_command,
3236
set_jobset_on_cluster,
3337
set_up_cluster_network_for_gpu,
3438
zone_to_region,
35-
get_user_input,
36-
)
37-
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
38-
from ..core.kjob import (
39-
verify_kjob_installed,
40-
prepare_kjob,
41-
apply_kjob_crds,
4239
)
40+
from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
4341
from ..core.kueue import (
4442
cluster_preheat_yml,
4543
install_kueue_crs,
@@ -55,12 +53,11 @@
5553
get_system_characteristics,
5654
)
5755
from ..core.workload import get_workload_list
58-
from ..utils.file import write_tmp_file
5956
from ..utils.console import xpk_exit, xpk_print
57+
from ..utils.file import write_tmp_file
6058
from . import cluster_gcluster
6159
from .common import set_cluster_command
62-
63-
from tabulate import tabulate
60+
from .kind import set_local_cluster_command
6461

6562

6663
def cluster_create(args) -> None:
@@ -316,9 +313,13 @@ def cluster_describe(args) -> None:
316313
0 if successful and 1 otherwise.
317314
"""
318315
xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
319-
add_zone_and_project(args)
320316

321-
set_cluster_command_code = set_cluster_command(args)
317+
if not args.kind_cluster:
318+
add_zone_and_project(args)
319+
set_cluster_command_code = set_cluster_command(args)
320+
else:
321+
set_cluster_command_code = set_local_cluster_command(args)
322+
322323
if set_cluster_command_code != 0:
323324
xpk_exit(set_cluster_command_code)
324325

src/xpk/commands/info.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from ..core.core import (
2424
add_zone_and_project,
2525
)
26+
from .kind import set_local_cluster_command
2627
import json
2728
from tabulate import tabulate
2829
from argparse import Namespace
@@ -38,8 +39,12 @@ def info(args: Namespace) -> None:
3839
Returns:
3940
None
4041
"""
41-
add_zone_and_project(args)
42-
set_cluster_command_code = set_cluster_command(args)
42+
if not args.kind_cluster:
43+
add_zone_and_project(args)
44+
set_cluster_command_code = set_cluster_command(args)
45+
else:
46+
set_cluster_command_code = set_local_cluster_command(args)
47+
4348
if set_cluster_command_code != 0:
4449
xpk_exit(set_cluster_command_code)
4550

@@ -84,7 +89,7 @@ def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
8489
spec = cq['spec']
8590
cq_name = cq['metadata']['name']
8691
quotas[cq_name] = {}
87-
for rg in spec['resourceGroups']:
92+
for rg in spec.get('resourceGroups', []):
8893
for flavor in rg['flavors']:
8994
name = flavor['name']
9095
for resource in flavor['resources']:

src/xpk/commands/inspector.py

+88-72
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from ..utils.console import xpk_exit, xpk_print
2727
from .common import set_cluster_command
2828
from .workload import get_workload_list
29+
from .kind import set_local_cluster_command
2930

3031

3132
def inspector_run_command_helper(
@@ -124,52 +125,63 @@ def inspector(args) -> None:
124125
final_return_code = 0
125126
xpk_print(args)
126127

127-
add_zone_and_project(args)
128-
set_cluster_command_code = set_cluster_command(args)
128+
if not args.kind_cluster:
129+
add_zone_and_project(args)
130+
set_cluster_command_code = set_cluster_command(args)
131+
else:
132+
set_cluster_command_code = set_local_cluster_command(args)
133+
129134
if set_cluster_command_code != 0:
130135
xpk_exit(set_cluster_command_code)
131136

132137
inspector_file = write_tmp_file(
133138
'==================\nXPK inspector OUTPUT:\n==================\n'
134139
)
135-
command_and_descriptions = [
136-
('gcloud version', 'Local Setup: gcloud version'),
137-
(
138-
(
139-
'gcloud config get project; gcloud config get compute/zone;'
140-
' gcloud config get compute/region'
141-
),
142-
'Local Setup: Project / Zone / Region',
143-
),
144-
(
145-
(
146-
'gcloud beta container clusters list --project'
147-
f' {args.project} --region {zone_to_region(args.zone)} | grep -e'
148-
f' NAME -e {args.cluster}'
149-
),
150-
'GKE: Cluster Details',
151-
),
152-
(
153-
(
154-
'kubectl get configmap'
155-
f' {args.cluster}-{CLUSTER_METADATA_CONFIGMAP} -o yaml'
156-
),
157-
'GKE: Cluster Metadata ConfigMap Details',
158-
),
159-
(
160-
(
161-
'kubectl get configmap'
162-
f' {args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP} -o yaml'
163-
),
164-
'GKE: Cluster Resources ConfigMap Details',
165-
),
166-
(
167-
(
168-
f'gcloud beta container node-pools list --cluster {args.cluster} '
169-
f' --project={args.project} --region={zone_to_region(args.zone)}'
170-
),
171-
'GKE: Node pool Details',
172-
),
140+
141+
gcloud_commands_and_descriptions = []
142+
if not args.kind_cluster:
143+
gcloud_commands_and_descriptions = [
144+
('gcloud version', 'Local Setup: gcloud version'),
145+
(
146+
(
147+
'gcloud config get project; gcloud config get compute/zone;'
148+
' gcloud config get compute/region'
149+
),
150+
'Local Setup: Project / Zone / Region',
151+
),
152+
(
153+
(
154+
'gcloud beta container clusters list --project'
155+
f' {args.project} --region {zone_to_region(args.zone)} | grep'
156+
f' -e NAME -e {args.cluster}'
157+
),
158+
'GKE: Cluster Details',
159+
),
160+
(
161+
(
162+
'kubectl get configmap'
163+
f' {args.cluster}-{CLUSTER_METADATA_CONFIGMAP} -o yaml'
164+
),
165+
'GKE: Cluster Metadata ConfigMap Details',
166+
),
167+
(
168+
(
169+
'kubectl get configmap'
170+
f' {args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP} -o yaml'
171+
),
172+
'GKE: Cluster Resources ConfigMap Details',
173+
),
174+
(
175+
(
176+
'gcloud beta container node-pools list --cluster'
177+
f' {args.cluster} '
178+
f' --project={args.project} --region={zone_to_region(args.zone)}'
179+
),
180+
'GKE: Node pool Details',
181+
),
182+
]
183+
184+
kubectl_commands_and_descriptions = [
173185
(
174186
(
175187
"kubectl get node -o custom-columns='NODE_NAME:metadata.name,"
@@ -234,6 +246,9 @@ def inspector(args) -> None:
234246
),
235247
]
236248

249+
command_and_descriptions = (
250+
gcloud_commands_and_descriptions + kubectl_commands_and_descriptions
251+
)
237252
for command, description in command_and_descriptions:
238253
return_code = inspector_run_command_helper(
239254
args, command, description, inspector_file
@@ -311,45 +326,46 @@ def inspector(args) -> None:
311326

312327
# Cloud Console Links:
313328
workload_links = []
314-
if args.workload:
329+
if args.workload and not args.kind_cluster:
315330
workload_links = [(
316331
f'Cloud Console for the workload {args.workload}',
317332
# pylint: disable=line-too-long
318333
f'https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}',
319334
)]
320335

321-
links = [
322-
(
323-
'Cloud Console for the GKE Cluster',
324-
# pylint: disable=line-too-long
325-
f'https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}',
326-
),
327-
(
328-
'Cloud Console for all workloads in GKE Cluster',
329-
# pylint: disable=line-too-long
330-
f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{zone_to_region(args.zone)}%2F{args.cluster}))',
331-
),
332-
(
333-
'Cloud Console for IAM Permissions',
334-
f'https://console.cloud.google.com/iam-admin/iam?project={args.project}',
335-
),
336-
(
337-
'Cloud Console for Quotas',
338-
f'https://console.cloud.google.com/iam-admin/quotas?project={args.project}',
339-
),
340-
]
341-
links.extend(workload_links)
342-
343-
for description, workload_link in links:
344-
return_code = inspector_output_link_helper(
345-
args, workload_link, description, inspector_file
346-
)
347-
if return_code != 0:
348-
final_return_code = return_code
349-
xpk_print(
350-
f'inspector failed in link: {workload_link} description:'
351-
f' {description} return code: {return_code}'
336+
if not args.kind_cluster:
337+
links = [
338+
(
339+
'Cloud Console for the GKE Cluster',
340+
# pylint: disable=line-too-long
341+
f'https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}',
342+
),
343+
(
344+
'Cloud Console for all workloads in GKE Cluster',
345+
# pylint: disable=line-too-long
346+
f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{zone_to_region(args.zone)}%2F{args.cluster}))',
347+
),
348+
(
349+
'Cloud Console for IAM Permissions',
350+
f'https://console.cloud.google.com/iam-admin/iam?project={args.project}',
351+
),
352+
(
353+
'Cloud Console for Quotas',
354+
f'https://console.cloud.google.com/iam-admin/quotas?project={args.project}',
355+
),
356+
]
357+
links.extend(workload_links)
358+
359+
for description, workload_link in links:
360+
return_code = inspector_output_link_helper(
361+
args, workload_link, description, inspector_file
352362
)
363+
if return_code != 0:
364+
final_return_code = return_code
365+
xpk_print(
366+
f'inspector failed in link: {workload_link} description:'
367+
f' {description} return code: {return_code}'
368+
)
353369

354370
# Summarize inspector:
355371
xpk_print(f'Find xpk inspector output file: {inspector_file.name}')

src/xpk/commands/kind.py

+42-1
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,16 @@
2828
)
2929
from ..core.kueue import (
3030
install_kueue_on_cluster,
31+
wait_for_kueue_available,
32+
install_kueue_crs,
33+
)
34+
from ..core.system_characteristics import (
35+
get_system_characteristics,
3136
)
3237
from ..utils.console import (xpk_exit, xpk_print)
38+
from ..utils.file import write_tmp_file
39+
40+
import yaml
3341

3442

3543
def cluster_create(args) -> None:
@@ -79,6 +87,22 @@ def cluster_create(args) -> None:
7987
if err_code > 0:
8088
xpk_exit(err_code)
8189

90+
xpk_print('Wait for Kueue to be fully available')
91+
wait_for_kueue_available_code = wait_for_kueue_available(args)
92+
if wait_for_kueue_available_code != 0:
93+
xpk_exit(wait_for_kueue_available_code)
94+
95+
args.kind_cluster = True
96+
system, return_code = get_system_characteristics(args)
97+
if return_code > 0:
98+
xpk_print('Fetching system characteristics failed!')
99+
xpk_exit(return_code)
100+
101+
xpk_print('Install Kueue Custom Resources')
102+
enable_kueue_credentials_code = install_kueue_crs(args, system, None)
103+
if enable_kueue_credentials_code != 0:
104+
xpk_exit(enable_kueue_credentials_code)
105+
82106
xpk_print('Kind commands done! Resources are created.')
83107
xpk_exit(0)
84108

@@ -184,7 +208,24 @@ def run_kind_cluster_create_command(args) -> int:
184208
Returns:
185209
0 if successful and 1 otherwise.
186210
"""
187-
command = 'kind create cluster'
211+
# Simulating GCP Environments in Kind
212+
# Configures kind nodes to mimic GCP features such as node pools, TPU accelerators or instance types.
213+
# These modifications aid in testing the compatibility and behavior of applications outside real GCP.
214+
# For more details on kind configurations, visit: https://kind.sigs.k8s.io/docs/user/configuration/
215+
kind_config = {
216+
'kind': 'Cluster',
217+
'apiVersion': 'kind.x-k8s.io/v1alpha4',
218+
'nodes': [
219+
{'role': 'control-plane'},
220+
{
221+
'role': 'worker',
222+
'labels': {'cloud.google.com/gke-nodepool': 'kind-pool-0'},
223+
},
224+
],
225+
}
226+
yaml_str = yaml.dump(kind_config)
227+
tmp = write_tmp_file(yaml_str)
228+
command = f'kind create cluster --config {tmp.file.name}'
188229

189230
if args.cluster:
190231
command += f' --name={args.cluster}'

0 commit comments

Comments
 (0)