Skip to content

Commit e280ca9

Browse files
committed
Add local tests with Kind
1 parent f401848 commit e280ca9

23 files changed

+479
-223
lines changed

Makefile

+4
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ run-unittests:
3737
run-integrationtests:
3838
pytest src/xpk/core/tests/integration/
3939

40+
run-kindtests:
41+
chmod +x ./tools/run-kind-tests.sh
42+
./tools/run-kind-tests.sh
43+
4044
.PHONY: mkdir-bin
4145
mkdir-bin:
4246
mkdir -p $(BIN_PATH)

README.md

+7-2
Original file line numberDiff line numberDiff line change
@@ -1349,13 +1349,18 @@ xpk interfaces seamlessly with kind to manage Kubernetes clusters locally, facil
13491349
13501350
## Local Testing Basics
13511351
1352-
Local testing is available exclusively through the `batch` and `job` commands of xpk with the `--kind-cluster` flag. This allows you to simulate training jobs locally:
1352+
Local testing is available for most `xpk` commands, except those that require Pathways, such as `cluster create-pathways` or `workload create-pathways`. To enable it, pass the `--kind-cluster` flag, which lets you simulate operations locally using the `kind` tool.
1353+
1354+
This example demonstrates how to run a batch job locally using the `--kind-cluster` flag:
13531355
13541356
```shell
13551357
python xpk.py batch [other-options] --kind-cluster script
13561358
```
13571359
1358-
Please note that all other xpk subcommands are intended for use with cloud systems on Google Cloud Engine (GCE) and don't support local testing. This includes commands like cluster, info, inspector, etc.
1360+
While the `--kind-cluster` flag does extend local testing capabilities to several commands, please be aware that commands requiring specific features from Google Cloud Platform (GCP) might not yet be fully supported when tested locally. Future updates may provide enhanced support for these GCP-specific features.
1361+
1362+
The currently supported local test cases are listed in the script `tools/run-kind-tests.sh`.
1363+
13591364
13601365
# Other advanced usage
13611366
[Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md)

src/xpk/commands/batch.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,12 @@
1616

1717
from argparse import Namespace
1818

19+
from ..core.commands import run_command_for_value
20+
from ..core.core import add_zone_and_project
21+
from ..core.kjob import AppProfileDefaults
1922
from ..core.kueue import LOCAL_QUEUE_NAME
2023
from ..utils.console import xpk_exit, xpk_print
2124
from .common import set_cluster_command
22-
from ..core.core import add_zone_and_project
23-
from ..core.kjob import AppProfileDefaults
24-
from ..core.commands import run_command_for_value
2525
from .kind import set_local_cluster_command
2626

2727

@@ -33,7 +33,7 @@ def batch(args: Namespace) -> None:
3333
Returns:
3434
None
3535
"""
36-
if not args.kind_cluster:
36+
if not getattr(args, 'kind_cluster', None):
3737
add_zone_and_project(args)
3838
set_cluster_command_code = set_cluster_command(args)
3939
else:

src/xpk/commands/cluster.py

+13-12
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
limitations under the License.
1515
"""
1616

17+
from tabulate import tabulate
18+
19+
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
1720
from ..core.commands import run_command_for_value, run_command_with_updates
1821
from ..core.core import (
1922
VERTEX_TENSORBOARD_FEATURE_FLAG,
@@ -26,20 +29,15 @@
2629
get_gke_control_plane_version,
2730
get_gke_node_pool_version,
2831
get_gke_server_config,
32+
get_user_input,
2933
h100_device_type,
3034
install_nccl_on_cluster,
3135
run_gke_node_pool_create_command,
3236
set_jobset_on_cluster,
3337
set_up_cluster_network_for_gpu,
3438
zone_to_region,
35-
get_user_input,
36-
)
37-
from ..core.cluster_private import authorize_private_cluster_access_if_necessary
38-
from ..core.kjob import (
39-
verify_kjob_installed,
40-
prepare_kjob,
41-
apply_kjob_crds,
4239
)
40+
from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
4341
from ..core.kueue import (
4442
cluster_preheat_yml,
4543
install_kueue_crs,
@@ -55,12 +53,11 @@
5553
get_system_characteristics,
5654
)
5755
from ..core.workload import get_workload_list
58-
from ..utils.file import write_tmp_file
5956
from ..utils.console import xpk_exit, xpk_print
57+
from ..utils.file import write_tmp_file
6058
from . import cluster_gcluster
6159
from .common import set_cluster_command
62-
63-
from tabulate import tabulate
60+
from .kind import set_local_cluster_command
6461

6562

6663
def cluster_create(args) -> None:
@@ -316,9 +313,13 @@ def cluster_describe(args) -> None:
316313
0 if successful and 1 otherwise.
317314
"""
318315
xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
319-
add_zone_and_project(args)
320316

321-
set_cluster_command_code = set_cluster_command(args)
317+
if not getattr(args, 'kind_cluster', None):
318+
add_zone_and_project(args)
319+
set_cluster_command_code = set_cluster_command(args)
320+
else:
321+
set_cluster_command_code = set_local_cluster_command(args)
322+
322323
if set_cluster_command_code != 0:
323324
xpk_exit(set_cluster_command_code)
324325

src/xpk/commands/info.py

+16-13
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,18 @@
1414
limitations under the License.
1515
"""
1616

17-
from ..utils.console import xpk_exit, xpk_print
18-
from ..core.kueue import verify_kueuectl
19-
from .common import set_cluster_command
20-
from ..core.commands import (
21-
run_command_for_value,
22-
)
23-
from ..core.core import (
24-
add_zone_and_project,
25-
)
2617
import json
27-
from tabulate import tabulate
2818
from argparse import Namespace
2919

20+
from tabulate import tabulate
21+
22+
from ..core.commands import run_command_for_value
23+
from ..core.core import add_zone_and_project
24+
from ..core.kueue import verify_kueuectl
25+
from ..utils.console import xpk_exit, xpk_print
26+
from .common import set_cluster_command
27+
from .kind import set_local_cluster_command
28+
3029
table_fmt = 'plain'
3130

3231

@@ -38,8 +37,12 @@ def info(args: Namespace) -> None:
3837
Returns:
3938
None
4039
"""
41-
add_zone_and_project(args)
42-
set_cluster_command_code = set_cluster_command(args)
40+
if not getattr(args, 'kind_cluster', None):
41+
add_zone_and_project(args)
42+
set_cluster_command_code = set_cluster_command(args)
43+
else:
44+
set_cluster_command_code = set_local_cluster_command(args)
45+
4346
if set_cluster_command_code != 0:
4447
xpk_exit(set_cluster_command_code)
4548

@@ -84,7 +87,7 @@ def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
8487
spec = cq['spec']
8588
cq_name = cq['metadata']['name']
8689
quotas[cq_name] = {}
87-
for rg in spec['resourceGroups']:
90+
for rg in spec.get('resourceGroups', []):
8891
for flavor in rg['flavors']:
8992
name = flavor['name']
9093
for resource in flavor['resources']:

src/xpk/commands/inspector.py

+89-73
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@
2222
zone_to_region,
2323
)
2424
from ..core.kueue import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
25-
from ..utils.file import append_tmp_file, write_tmp_file
2625
from ..utils.console import xpk_exit, xpk_print
26+
from ..utils.file import append_tmp_file, write_tmp_file
2727
from .common import set_cluster_command
28+
from .kind import set_local_cluster_command
2829
from .workload import get_workload_list
2930

3031

@@ -124,52 +125,63 @@ def inspector(args) -> None:
124125
final_return_code = 0
125126
xpk_print(args)
126127

127-
add_zone_and_project(args)
128-
set_cluster_command_code = set_cluster_command(args)
128+
if not getattr(args, 'kind_cluster', None):
129+
add_zone_and_project(args)
130+
set_cluster_command_code = set_cluster_command(args)
131+
else:
132+
set_cluster_command_code = set_local_cluster_command(args)
133+
129134
if set_cluster_command_code != 0:
130135
xpk_exit(set_cluster_command_code)
131136

132137
inspector_file = write_tmp_file(
133138
'==================\nXPK inspector OUTPUT:\n==================\n'
134139
)
135-
command_and_descriptions = [
136-
('gcloud version', 'Local Setup: gcloud version'),
137-
(
138-
(
139-
'gcloud config get project; gcloud config get compute/zone;'
140-
' gcloud config get compute/region'
141-
),
142-
'Local Setup: Project / Zone / Region',
143-
),
144-
(
145-
(
146-
'gcloud beta container clusters list --project'
147-
f' {args.project} --region {zone_to_region(args.zone)} | grep -e'
148-
f' NAME -e {args.cluster}'
149-
),
150-
'GKE: Cluster Details',
151-
),
152-
(
153-
(
154-
'kubectl get configmap'
155-
f' {args.cluster}-{CLUSTER_METADATA_CONFIGMAP} -o yaml'
156-
),
157-
'GKE: Cluster Metadata ConfigMap Details',
158-
),
159-
(
160-
(
161-
'kubectl get configmap'
162-
f' {args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP} -o yaml'
163-
),
164-
'GKE: Cluster Resources ConfigMap Details',
165-
),
166-
(
167-
(
168-
f'gcloud beta container node-pools list --cluster {args.cluster} '
169-
f' --project={args.project} --region={zone_to_region(args.zone)}'
170-
),
171-
'GKE: Node pool Details',
172-
),
140+
141+
gcloud_commands_and_descriptions = []
142+
if not getattr(args, 'kind_cluster', None):
143+
gcloud_commands_and_descriptions = [
144+
('gcloud version', 'Local Setup: gcloud version'),
145+
(
146+
(
147+
'gcloud config get project; gcloud config get compute/zone;'
148+
' gcloud config get compute/region'
149+
),
150+
'Local Setup: Project / Zone / Region',
151+
),
152+
(
153+
(
154+
'gcloud beta container clusters list --project'
155+
f' {args.project} --region {zone_to_region(args.zone)} | grep'
156+
f' -e NAME -e {args.cluster}'
157+
),
158+
'GKE: Cluster Details',
159+
),
160+
(
161+
(
162+
'kubectl get configmap'
163+
f' {args.cluster}-{CLUSTER_METADATA_CONFIGMAP} -o yaml'
164+
),
165+
'GKE: Cluster Metadata ConfigMap Details',
166+
),
167+
(
168+
(
169+
'kubectl get configmap'
170+
f' {args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP} -o yaml'
171+
),
172+
'GKE: Cluster Resources ConfigMap Details',
173+
),
174+
(
175+
(
176+
'gcloud beta container node-pools list --cluster'
177+
f' {args.cluster} '
178+
f' --project={args.project} --region={zone_to_region(args.zone)}'
179+
),
180+
'GKE: Node pool Details',
181+
),
182+
]
183+
184+
kubectl_commands_and_descriptions = [
173185
(
174186
(
175187
"kubectl get node -o custom-columns='NODE_NAME:metadata.name,"
@@ -234,6 +246,9 @@ def inspector(args) -> None:
234246
),
235247
]
236248

249+
command_and_descriptions = (
250+
gcloud_commands_and_descriptions + kubectl_commands_and_descriptions
251+
)
237252
for command, description in command_and_descriptions:
238253
return_code = inspector_run_command_helper(
239254
args, command, description, inspector_file
@@ -311,45 +326,46 @@ def inspector(args) -> None:
311326

312327
# Cloud Console Links:
313328
workload_links = []
314-
if args.workload:
329+
if args.workload and not getattr(args, 'kind_cluster', None):
315330
workload_links = [(
316331
f'Cloud Console for the workload {args.workload}',
317332
# pylint: disable=line-too-long
318333
f'https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}',
319334
)]
320335

321-
links = [
322-
(
323-
'Cloud Console for the GKE Cluster',
324-
# pylint: disable=line-too-long
325-
f'https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}',
326-
),
327-
(
328-
'Cloud Console for all workloads in GKE Cluster',
329-
# pylint: disable=line-too-long
330-
f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{zone_to_region(args.zone)}%2F{args.cluster}))',
331-
),
332-
(
333-
'Cloud Console for IAM Permissions',
334-
f'https://console.cloud.google.com/iam-admin/iam?project={args.project}',
335-
),
336-
(
337-
'Cloud Console for Quotas',
338-
f'https://console.cloud.google.com/iam-admin/quotas?project={args.project}',
339-
),
340-
]
341-
links.extend(workload_links)
342-
343-
for description, workload_link in links:
344-
return_code = inspector_output_link_helper(
345-
args, workload_link, description, inspector_file
346-
)
347-
if return_code != 0:
348-
final_return_code = return_code
349-
xpk_print(
350-
f'inspector failed in link: {workload_link} description:'
351-
f' {description} return code: {return_code}'
336+
if not getattr(args, 'kind_cluster', None):
337+
links = [
338+
(
339+
'Cloud Console for the GKE Cluster',
340+
# pylint: disable=line-too-long
341+
f'https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}',
342+
),
343+
(
344+
'Cloud Console for all workloads in GKE Cluster',
345+
# pylint: disable=line-too-long
346+
f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{zone_to_region(args.zone)}%2F{args.cluster}))',
347+
),
348+
(
349+
'Cloud Console for IAM Permissions',
350+
f'https://console.cloud.google.com/iam-admin/iam?project={args.project}',
351+
),
352+
(
353+
'Cloud Console for Quotas',
354+
f'https://console.cloud.google.com/iam-admin/quotas?project={args.project}',
355+
),
356+
]
357+
links.extend(workload_links)
358+
359+
for description, workload_link in links:
360+
return_code = inspector_output_link_helper(
361+
args, workload_link, description, inspector_file
352362
)
363+
if return_code != 0:
364+
final_return_code = return_code
365+
xpk_print(
366+
f'inspector failed in link: {workload_link} description:'
367+
f' {description} return code: {return_code}'
368+
)
353369

354370
# Summarize inspector:
355371
xpk_print(f'Find xpk inspector output file: {inspector_file.name}')

0 commit comments

Comments
 (0)