Skip to content

Add local tests with Kind #350

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ run-unittests:
run-integrationtests:
pytest src/xpk/core/tests/integration/

run-kindtests:
chmod +x ./tools/run-kind-tests.sh
./tools/run-kind-tests.sh

.PHONY: mkdir-bin
mkdir-bin:
mkdir -p $(BIN_PATH)
Expand Down
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1458,13 +1458,18 @@ xpk interfaces seamlessly with kind to manage Kubernetes clusters locally, facil

## Local Testing Basics

Local testing is available exclusively through the `batch` and `job` commands of xpk with the `--kind-cluster` flag. This allows you to simulate training jobs locally:
Local testing is available for most `xpk` commands, except those that require Pathways, such as `cluster create-pathways` or `workload create-pathways`. To test locally, pass the `--kind-cluster` flag, which lets you simulate operations locally using the `kind` tool.

This example demonstrates how to run a batch job locally using the `--kind-cluster` flag:

```shell
python xpk.py batch [other-options] --kind-cluster script
```

Please note that all other xpk subcommands are intended for use with cloud systems on Google Compute Engine (GCE) and don't support local testing. This includes commands like `cluster`, `info`, `inspector`, etc.
Although the `--kind-cluster` flag extends local testing to several commands, commands that depend on features specific to Google Cloud Platform (GCP) may not yet be fully supported when tested locally. Future updates may add support for these GCP-specific features.

Currently supported local testing cases can be reviewed in the script: `tools/run-kind-tests.sh`.


# Other advanced usage
[Use a Jupyter notebook to interact with a Cloud TPU cluster](xpk-notebooks.md)
2 changes: 1 addition & 1 deletion src/xpk/commands/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def batch(args: Namespace) -> None:
Returns:
None
"""
if not args.kind_cluster:
if not getattr(args, 'kind_cluster', None):
add_zone_and_project(args)
set_cluster_command_code = set_cluster_command(args)
else:
Expand Down
12 changes: 9 additions & 3 deletions src/xpk/commands/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
install_nccl_on_cluster,
set_jobset_on_cluster,
setup_k8s_env,
update_cluster_with_gcpfilestore_driver_if_necessary,
update_cluster_with_gcsfuse_driver_if_necessary,
update_cluster_with_workload_identity_if_necessary,
)
Expand Down Expand Up @@ -63,7 +64,7 @@
from ..utils.console import get_user_input, xpk_exit, xpk_print
from ..utils.file import write_tmp_file
from . import cluster_gcluster
from ..core.cluster import update_cluster_with_gcpfilestore_driver_if_necessary
from .kind import set_local_cluster_command


def cluster_create(args) -> None:
Expand Down Expand Up @@ -345,9 +346,14 @@ def cluster_describe(args) -> None:
0 if successful and 1 otherwise.
"""
xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
add_zone_and_project(args)

get_cluster_credentials(args)
if not getattr(args, 'kind_cluster', None):
add_zone_and_project(args)
get_cluster_credentials(args)
else:
set_cluster_command_code = set_local_cluster_command(args)
if set_cluster_command_code != 0:
xpk_exit(set_cluster_command_code)

return_code, data_table = nodepools_build_table(args)
if return_code != 0:
Expand Down
11 changes: 8 additions & 3 deletions src/xpk/commands/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from ..core.kueue import verify_kueuectl
from ..utils.console import xpk_exit, xpk_print
from .common import set_cluster_command
from .kind import set_local_cluster_command

table_fmt = 'plain'

Expand All @@ -36,8 +37,12 @@ def info(args: Namespace) -> None:
Returns:
None
"""
add_zone_and_project(args)
set_cluster_command_code = set_cluster_command(args)
if not getattr(args, 'kind_cluster', None):
add_zone_and_project(args)
set_cluster_command_code = set_cluster_command(args)
else:
set_cluster_command_code = set_local_cluster_command(args)

if set_cluster_command_code != 0:
xpk_exit(set_cluster_command_code)

Expand Down Expand Up @@ -82,7 +87,7 @@ def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
spec = cq['spec']
cq_name = cq['metadata']['name']
quotas[cq_name] = {}
for rg in spec['resourceGroups']:
for rg in spec.get('resourceGroups', []):
for flavor in rg['flavors']:
name = flavor['name']
for resource in flavor['resources']:
Expand Down
161 changes: 89 additions & 72 deletions src/xpk/commands/inspector.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from ..core.resources import CLUSTER_METADATA_CONFIGMAP, CLUSTER_RESOURCES_CONFIGMAP
from ..utils.console import xpk_exit, xpk_print
from ..utils.file import append_tmp_file, write_tmp_file
from .kind import set_local_cluster_command
from .workload import get_workload_list


Expand Down Expand Up @@ -120,50 +121,62 @@ def inspector(args) -> None:
final_return_code = 0
xpk_print(args)

add_zone_and_project(args)
get_cluster_credentials(args)
if not getattr(args, 'kind_cluster', None):
add_zone_and_project(args)
get_cluster_credentials(args)
else:
set_cluster_command_code = set_local_cluster_command(args)
if set_cluster_command_code != 0:
xpk_exit(set_cluster_command_code)

inspector_file = write_tmp_file(
'==================\nXPK inspector OUTPUT:\n==================\n'
)
command_and_descriptions = [
('gcloud version', 'Local Setup: gcloud version'),
(
(
'gcloud config get project; gcloud config get compute/zone;'
' gcloud config get compute/region'
),
'Local Setup: Project / Zone / Region',
),
(
(
'gcloud beta container clusters list --project'
f' {args.project} --region {zone_to_region(args.zone)} | grep -e'
f' NAME -e {args.cluster}'
),
'GKE: Cluster Details',
),
(
(
'kubectl get configmap'
f' {args.cluster}-{CLUSTER_METADATA_CONFIGMAP} -o yaml'
),
'GKE: Cluster Metadata ConfigMap Details',
),
(
(
'kubectl get configmap'
f' {args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP} -o yaml'
),
'GKE: Cluster Resources ConfigMap Details',
),
(
(
f'gcloud beta container node-pools list --cluster {args.cluster} '
f' --project={args.project} --region={zone_to_region(args.zone)}'
),
'GKE: Node pool Details',
),

gcloud_commands_and_descriptions = []
if not getattr(args, 'kind_cluster', None):
gcloud_commands_and_descriptions = [
('gcloud version', 'Local Setup: gcloud version'),
(
(
'gcloud config get project; gcloud config get compute/zone;'
' gcloud config get compute/region'
),
'Local Setup: Project / Zone / Region',
),
(
(
'gcloud beta container clusters list --project'
f' {args.project} --region {zone_to_region(args.zone)} | grep'
f' -e NAME -e {args.cluster}'
),
'GKE: Cluster Details',
),
(
(
'kubectl get configmap'
f' {args.cluster}-{CLUSTER_METADATA_CONFIGMAP} -o yaml'
),
'GKE: Cluster Metadata ConfigMap Details',
),
(
(
'kubectl get configmap'
f' {args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP} -o yaml'
),
'GKE: Cluster Resources ConfigMap Details',
),
(
(
'gcloud beta container node-pools list --cluster'
f' {args.cluster} '
f' --project={args.project} --region={zone_to_region(args.zone)}'
),
'GKE: Node pool Details',
),
]

kubectl_commands_and_descriptions = [
(
(
"kubectl get node -o custom-columns='NODE_NAME:metadata.name,"
Expand Down Expand Up @@ -228,6 +241,9 @@ def inspector(args) -> None:
),
]

command_and_descriptions = (
gcloud_commands_and_descriptions + kubectl_commands_and_descriptions
)
for command, description in command_and_descriptions:
return_code = inspector_run_command_helper(
args, command, description, inspector_file
Expand Down Expand Up @@ -305,45 +321,46 @@ def inspector(args) -> None:

# Cloud Console Links:
workload_links = []
if args.workload:
if args.workload and not getattr(args, 'kind_cluster', None):
workload_links = [(
f'Cloud Console for the workload {args.workload}',
# pylint: disable=line-too-long
f'https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}',
)]

links = [
(
'Cloud Console for the GKE Cluster',
# pylint: disable=line-too-long
f'https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}',
),
(
'Cloud Console for all workloads in GKE Cluster',
# pylint: disable=line-too-long
f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{zone_to_region(args.zone)}%2F{args.cluster}))',
),
(
'Cloud Console for IAM Permissions',
f'https://console.cloud.google.com/iam-admin/iam?project={args.project}',
),
(
'Cloud Console for Quotas',
f'https://console.cloud.google.com/iam-admin/quotas?project={args.project}',
),
]
links.extend(workload_links)

for description, workload_link in links:
return_code = inspector_output_link_helper(
args, workload_link, description, inspector_file
)
if return_code != 0:
final_return_code = return_code
xpk_print(
f'inspector failed in link: {workload_link} description:'
f' {description} return code: {return_code}'
if not getattr(args, 'kind_cluster', None):
links = [
(
'Cloud Console for the GKE Cluster',
# pylint: disable=line-too-long
f'https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}',
),
(
'Cloud Console for all workloads in GKE Cluster',
# pylint: disable=line-too-long
f'https://console.cloud.google.com/kubernetes/workload/overview?project={args.project}&pageState=((gke%2F{zone_to_region(args.zone)}%2F{args.cluster}))',
),
(
'Cloud Console for IAM Permissions',
f'https://console.cloud.google.com/iam-admin/iam?project={args.project}',
),
(
'Cloud Console for Quotas',
f'https://console.cloud.google.com/iam-admin/quotas?project={args.project}',
),
]
links.extend(workload_links)

for description, workload_link in links:
return_code = inspector_output_link_helper(
args, workload_link, description, inspector_file
)
if return_code != 0:
final_return_code = return_code
xpk_print(
f'inspector failed in link: {workload_link} description:'
f' {description} return code: {return_code}'
)

# Summarize inspector:
xpk_print(f'Find xpk inspector output file: {inspector_file.name}')
Expand Down
4 changes: 2 additions & 2 deletions src/xpk/commands/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def job_list(args) -> None:
Returns:
None
"""
if not args.kind_cluster:
if not getattr(args, 'kind_cluster', None):
add_zone_and_project(args)
set_cluster_command_code = set_cluster_command(args)
msg = f'Listing jobs for project {args.project} and zone {args.zone}:'
Expand Down Expand Up @@ -176,7 +176,7 @@ def job_cancel(args) -> None:
None
"""
xpk_print(f'Starting job cancel for job: {args.name}', flush=True)
if not args.kind_cluster:
if not getattr(args, 'kind_cluster', None):
add_zone_and_project(args)
set_cluster_command_code = set_cluster_command(args)
else:
Expand Down
Loading
Loading