Skip to content

Commit cf86443

Browse files
committed
Add local tests with Kind
1 parent 49f1ece commit cf86443

25 files changed

+450
-174
lines changed

Makefile

+4
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ run-unittests:
3737
run-integrationtests:
3838
pytest src/xpk/core/tests/integration/
3939

40+
run-kindtests:
41+
chmod +x ./tools/run-kind-tests.sh
42+
./tools/run-kind-tests.sh
43+
4044
.PHONY: mkdir-bin
4145
mkdir-bin:
4246
mkdir -p $(BIN_PATH)

README.md

+7-2
Original file line numberDiff line numberDiff line change
@@ -1400,13 +1400,18 @@ xpk interfaces seamlessly with kind to manage Kubernetes clusters locally, facil
14001400
14011401
## Local Testing Basics
14021402
1403-
Local testing is available exclusively through the `batch` and `job` commands of xpk with the `--kind-cluster` flag. This allows you to simulate training jobs locally:
1403+
Local testing is available for most `xpk` commands, except those that require Pathways, such as `cluster create-pathways` or `workload create-pathways`. It is enabled by the `--kind-cluster` flag, which lets you simulate operations locally using the `kind` tool.
1404+
1405+
This example demonstrates how to run a batch job locally using the `--kind-cluster` flag:
14041406
14051407
```shell
14061408
python xpk.py batch [other-options] --kind-cluster script
14071409
```
14081410
1409-
Please note that all other xpk subcommands are intended for use with cloud systems on Google Cloud Engine (GCE) and don't support local testing. This includes commands like cluster, info, inspector, etc.
1411+
While the `--kind-cluster` flag does extend local testing capabilities to several commands, please be aware that commands requiring specific features from Google Cloud Platform (GCP) might not yet be fully supported when tested locally. Future updates may provide enhanced support for these GCP-specific features.
1412+
1413+
The currently supported local test cases are listed in the script `tools/run-kind-tests.sh`.
1414+
14101415
14111416
# Other advanced usage
14121417
[Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md)

src/xpk/commands/batch.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def batch(args: Namespace) -> None:
3333
Returns:
3434
None
3535
"""
36-
if not args.kind_cluster:
36+
if not getattr(args, 'kind_cluster', None):
3737
add_zone_and_project(args)
3838
set_cluster_command_code = set_cluster_command(args)
3939
else:

src/xpk/commands/cluster.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
from ..utils.console import get_user_input, xpk_exit, xpk_print
6464
from ..utils.file import write_tmp_file
6565
from . import cluster_gcluster
66+
from .kind import set_local_cluster_command
6667

6768

6869
def cluster_create(args) -> None:
@@ -331,9 +332,14 @@ def cluster_describe(args) -> None:
331332
0 if successful and 1 otherwise.
332333
"""
333334
xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
334-
add_zone_and_project(args)
335335

336-
get_cluster_credentials(args)
336+
if not getattr(args, 'kind_cluster', None):
337+
add_zone_and_project(args)
338+
get_cluster_credentials(args)
339+
else:
340+
set_cluster_command_code = set_local_cluster_command(args)
341+
if set_cluster_command_code != 0:
342+
xpk_exit(set_cluster_command_code)
337343

338344
return_code, data_table = nodepools_build_table(args)
339345
if return_code != 0:

src/xpk/commands/info.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from ..core.kueue import verify_kueuectl
2525
from ..utils.console import xpk_exit, xpk_print
2626
from .common import set_cluster_command
27+
from .kind import set_local_cluster_command
2728

2829
table_fmt = 'plain'
2930

@@ -36,8 +37,12 @@ def info(args: Namespace) -> None:
3637
Returns:
3738
None
3839
"""
39-
add_zone_and_project(args)
40-
set_cluster_command_code = set_cluster_command(args)
40+
if not getattr(args, 'kind_cluster', None):
41+
add_zone_and_project(args)
42+
set_cluster_command_code = set_cluster_command(args)
43+
else:
44+
set_cluster_command_code = set_local_cluster_command(args)
45+
4146
if set_cluster_command_code != 0:
4247
xpk_exit(set_cluster_command_code)
4348

@@ -82,7 +87,7 @@ def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
8287
spec = cq['spec']
8388
cq_name = cq['metadata']['name']
8489
quotas[cq_name] = {}
85-
for rg in spec['resourceGroups']:
90+
for rg in spec.get('resourceGroups', []):
8691
for flavor in rg['flavors']:
8792
name = flavor['name']
8893
for resource in flavor['resources']:

src/xpk/commands/inspector.py

+89-72
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from ..core.resources import CLUSTER_METADATA_CONFIGMAP, CLUSTER_RESOURCES_CONFIGMAP
2222
from ..utils.console import xpk_exit, xpk_print
2323
from ..utils.file import append_tmp_file, write_tmp_file
24+
from .kind import set_local_cluster_command
2425
from .workload import get_workload_list
2526

2627

@@ -120,50 +121,62 @@ def inspector(args) -> None:
120121
final_return_code = 0
121122
xpk_print(args)
122123

123-
add_zone_and_project(args)
124-
get_cluster_credentials(args)
124+
if not getattr(args, 'kind_cluster', None):
125+
add_zone_and_project(args)
126+
get_cluster_credentials(args)
127+
else:
128+
set_cluster_command_code = set_local_cluster_command(args)
129+
if set_cluster_command_code != 0:
130+
xpk_exit(set_cluster_command_code)
125131

126132
inspector_file = write_tmp_file(
127133
'==================\nXPK inspector OUTPUT:\n==================\n'
128134
)
129-
command_and_descriptions = [
130-
('gcloud version', 'Local Setup: gcloud version'),
131-
(
132-
(
133-
'gcloud config get project; gcloud config get compute/zone;'
134-
' gcloud config get compute/region'
135-
),
136-
'Local Setup: Project / Zone / Region',
137-
),
138-
(
139-
(
140-
'gcloud beta container clusters list --project'
141-
f' {args.project} --region {zone_to_region(args.zone)} | grep -e'
142-
f' NAME -e {args.cluster}'
143-
),
144-
'GKE: Cluster Details',
145-
),
146-
(
147-
(
148-
'kubectl get configmap'
149-
f' {args.cluster}-{CLUSTER_METADATA_CONFIGMAP} -o yaml'
150-
),
151-
'GKE: Cluster Metadata ConfigMap Details',
152-
),
153-
(
154-
(
155-
'kubectl get configmap'
156-
f' {args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP} -o yaml'
157-
),
158-
'GKE: Cluster Resources ConfigMap Details',
159-
),
160-
(
161-
(
162-
f'gcloud beta container node-pools list --cluster {args.cluster} '
163-
f' --project={args.project} --region={zone_to_region(args.zone)}'
164-
),
165-
'GKE: Node pool Details',
166-
),
135+
136+
gcloud_commands_and_descriptions = []
137+
if not getattr(args, 'kind_cluster', None):
138+
gcloud_commands_and_descriptions = [
139+
('gcloud version', 'Local Setup: gcloud version'),
140+
(
141+
(
142+
'gcloud config get project; gcloud config get compute/zone;'
143+
' gcloud config get compute/region'
144+
),
145+
'Local Setup: Project / Zone / Region',
146+
),
147+
(
148+
(
149+
'gcloud beta container clusters list --project'
150+
f' {args.project} --region {zone_to_region(args.zone)} | grep'
151+
f' -e NAME -e {args.cluster}'
152+
),
153+
'GKE: Cluster Details',
154+
),
155+
(
156+
(
157+
'kubectl get configmap'
158+
f' {args.cluster}-{CLUSTER_METADATA_CONFIGMAP} -o yaml'
159+
),
160+
'GKE: Cluster Metadata ConfigMap Details',
161+
),
162+
(
163+
(
164+
'kubectl get configmap'
165+
f' {args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP} -o yaml'
166+
),
167+
'GKE: Cluster Resources ConfigMap Details',
168+
),
169+
(
170+
(
171+
'gcloud beta container node-pools list --cluster'
172+
f' {args.cluster} '
173+
f' --project={args.project} --region={zone_to_region(args.zone)}'
174+
),
175+
'GKE: Node pool Details',
176+
),
177+
]
178+
179+
kubectl_commands_and_descriptions = [
167180
(
168181
(
169182
"kubectl get node -o custom-columns='NODE_NAME:metadata.name,"
@@ -228,6 +241,9 @@ def inspector(args) -> None:
228241
),
229242
]
230243

244+
command_and_descriptions = (
245+
gcloud_commands_and_descriptions + kubectl_commands_and_descriptions
246+
)
231247
for command, description in command_and_descriptions:
232248
return_code = inspector_run_command_helper(
233249
args, command, description, inspector_file
@@ -305,45 +321,46 @@ def inspector(args) -> None:
305321

306322
# Cloud Console Links:
307323
workload_links = []
308-
if args.workload:
324+
if args.workload and not getattr(args, 'kind_cluster', None):
309325
workload_links = [(
310326
f'Cloud Console for the workload {args.workload}',
311327
# pylint: disable=line-too-long
312328
f'https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}',
313329
)]
314330

315-
links = [
316-
(
317-
'Cloud Console for the GKE Cluster',
318-
# pylint: disable=line-too-long
319-
f'https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}',
320-
),
321-
(
322-
'Cloud Console for all workloads in GKE Cluster',
323-
# pylint: disable=line-too-long
324-
f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{zone_to_region(args.zone)}%2F{args.cluster}))',
325-
),
326-
(
327-
'Cloud Console for IAM Permissions',
328-
f'https://console.cloud.google.com/iam-admin/iam?project={args.project}',
329-
),
330-
(
331-
'Cloud Console for Quotas',
332-
f'https://console.cloud.google.com/iam-admin/quotas?project={args.project}',
333-
),
334-
]
335-
links.extend(workload_links)
336-
337-
for description, workload_link in links:
338-
return_code = inspector_output_link_helper(
339-
args, workload_link, description, inspector_file
340-
)
341-
if return_code != 0:
342-
final_return_code = return_code
343-
xpk_print(
344-
f'inspector failed in link: {workload_link} description:'
345-
f' {description} return code: {return_code}'
331+
if not getattr(args, 'kind_cluster', None):
332+
links = [
333+
(
334+
'Cloud Console for the GKE Cluster',
335+
# pylint: disable=line-too-long
336+
f'https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}',
337+
),
338+
(
339+
'Cloud Console for all workloads in GKE Cluster',
340+
# pylint: disable=line-too-long
341+
f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{zone_to_region(args.zone)}%2F{args.cluster}))',
342+
),
343+
(
344+
'Cloud Console for IAM Permissions',
345+
f'https://console.cloud.google.com/iam-admin/iam?project={args.project}',
346+
),
347+
(
348+
'Cloud Console for Quotas',
349+
f'https://console.cloud.google.com/iam-admin/quotas?project={args.project}',
350+
),
351+
]
352+
links.extend(workload_links)
353+
354+
for description, workload_link in links:
355+
return_code = inspector_output_link_helper(
356+
args, workload_link, description, inspector_file
346357
)
358+
if return_code != 0:
359+
final_return_code = return_code
360+
xpk_print(
361+
f'inspector failed in link: {workload_link} description:'
362+
f' {description} return code: {return_code}'
363+
)
347364

348365
# Summarize inspector:
349366
xpk_print(f'Find xpk inspector output file: {inspector_file.name}')

src/xpk/commands/job.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def job_list(args) -> None:
141141
Returns:
142142
None
143143
"""
144-
if not args.kind_cluster:
144+
if not getattr(args, 'kind_cluster', None):
145145
add_zone_and_project(args)
146146
set_cluster_command_code = set_cluster_command(args)
147147
msg = f'Listing jobs for project {args.project} and zone {args.zone}:'
@@ -176,7 +176,7 @@ def job_cancel(args) -> None:
176176
None
177177
"""
178178
xpk_print(f'Starting job cancel for job: {args.name}', flush=True)
179-
if not args.kind_cluster:
179+
if not getattr(args, 'kind_cluster', None):
180180
add_zone_and_project(args)
181181
set_cluster_command_code = set_cluster_command(args)
182182
else:

src/xpk/commands/kind.py

+44-14
Original file line numberDiff line numberDiff line change
@@ -14,22 +14,19 @@
1414
limitations under the License.
1515
"""
1616

17-
from ..core.commands import (
18-
run_command_for_value,
19-
run_command_with_updates,
20-
)
21-
from ..core.cluster import (
22-
set_jobset_on_cluster,
23-
)
24-
from ..core.kjob import (
25-
verify_kjob_installed,
26-
prepare_kjob,
27-
apply_kjob_crds,
28-
)
17+
import yaml
18+
19+
from ..core.cluster import set_jobset_on_cluster
20+
from ..core.commands import run_command_for_value, run_command_with_updates
21+
from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
2922
from ..core.kueue import (
23+
install_kueue_crs,
3024
install_kueue_on_cluster,
25+
wait_for_kueue_available,
3126
)
32-
from ..utils.console import (xpk_exit, xpk_print)
27+
from ..core.system_characteristics import get_system_characteristics
28+
from ..utils.console import xpk_exit, xpk_print
29+
from ..utils.file import write_tmp_file
3330

3431

3532
def cluster_create(args) -> None:
@@ -79,6 +76,22 @@ def cluster_create(args) -> None:
7976
if err_code > 0:
8077
xpk_exit(err_code)
8178

79+
xpk_print('Wait for Kueue to be fully available')
80+
wait_for_kueue_available_code = wait_for_kueue_available(args)
81+
if wait_for_kueue_available_code != 0:
82+
xpk_exit(wait_for_kueue_available_code)
83+
84+
args.kind_cluster = True
85+
system, return_code = get_system_characteristics(args)
86+
if return_code > 0:
87+
xpk_print('Fetching system characteristics failed!')
88+
xpk_exit(return_code)
89+
90+
xpk_print('Install Kueue Custom Resources')
91+
enable_kueue_credentials_code = install_kueue_crs(args, system, None)
92+
if enable_kueue_credentials_code != 0:
93+
xpk_exit(enable_kueue_credentials_code)
94+
8295
xpk_print('Kind commands done! Resources are created.')
8396
xpk_exit(0)
8497

@@ -184,7 +197,24 @@ def run_kind_cluster_create_command(args) -> int:
184197
Returns:
185198
0 if successful and 1 otherwise.
186199
"""
187-
command = 'kind create cluster'
200+
# Simulating GCP Environments in Kind
201+
# Configures kind nodes to mimic GCP features such as node pools, TPU accelerators or instance types.
202+
# These modifications aid in testing the compatibility and behavior of applications outside real GCP.
203+
# For more details on kind configurations, visit: https://kind.sigs.k8s.io/docs/user/configuration/
204+
kind_config = {
205+
'kind': 'Cluster',
206+
'apiVersion': 'kind.x-k8s.io/v1alpha4',
207+
'nodes': [
208+
{'role': 'control-plane'},
209+
{
210+
'role': 'worker',
211+
'labels': {'cloud.google.com/gke-nodepool': 'kind-pool-0'},
212+
},
213+
],
214+
}
215+
yaml_str = yaml.dump(kind_config)
216+
tmp = write_tmp_file(yaml_str)
217+
command = f'kind create cluster --config {tmp.file.name}'
188218

189219
if args.cluster:
190220
command += f' --name={args.cluster}'

0 commit comments

Comments
 (0)