Skip to content

Commit f401848

Browse files
authored
Revert "Merge gcsfue into main (#317)" (#369)
This reverts commit c4e64e4.
1 parent c4e64e4 commit f401848

35 files changed

+67
-1894
lines changed

.github/workflows/build_tests.yaml

+12 -47
Original file line number | Diff line number | Diff line change
@@ -35,15 +35,13 @@ env:
3535
TPU_CLUSTER_NAME: build-xpk-2-nodepools
3636
WORKLOAD_NAME: xpktest-build-${{ github.run_attempt }}
3737
PATHWAYS_WORKLOAD_NAME: xpkpw-build-${{ github.run_attempt }}
38-
RUN_ID: "pr-${{ github.event.number }}"
38+
CLUSTER_ARGUMENTS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}} --maintenance-window=23:50"
3939
PROJECT_ID: ${{secrets.PROJECT_NAME}}
4040
A3_MEGA_TEST_CLUSTER_NAME: "xpk-mega-ctk-int"
4141
A3_ULTRA_TEST_CLUSTER_NAME: "xpk-ultra-ctk-int"
4242
GKE_ML_TEST_CLUSTER_NAME: "xpk-gke-ml"
4343
ZONE: us-central2-a
4444
REGION: us-central2
45-
STORAGE_NAME: test-storage
46-
PW_CLUSTER_ARGUMENTS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}} --maintenance-window=23:50"
4745

4846
jobs:
4947
run-unit-tests:
@@ -134,10 +132,6 @@ jobs:
134132
with:
135133
version: '>= 363.0.0'
136134
install_components: 'beta,gke-gcloud-auth-plugin'
137-
- name: Generate random seed
138-
run: |
139-
RANDOM_SEED=$((RANDOM % 10000)) # Generate a random number between 0 and 9999
140-
echo "RANDOM_SEED=$RANDOM_SEED" >> $GITHUB_ENV
141135
- name: Verify gcp setup
142136
run: gcloud info
143137
- name: Set Google Cloud CLI properties to a unused zone to verify --zone arg is passed properly in commands.
@@ -152,56 +146,31 @@ jobs:
152146
run: gcloud components install kubectl && gcloud components install gke-gcloud-auth-plugin
153147
- name: Check xpk installation
154148
run: xpk --help
155-
- name: Create a private Pathways-enabled XPK Cluster with 2x v4-8 nodepools. Larger num-nodes to avoid master resizing.
156-
run: |
157-
python3 xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --private --tpu-type=v4-8 --num-slices=2 \
158-
--zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 \
159-
--reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --enable-workload-identity --enable-gcsfuse-csi-driver --custom-cluster-arguments="${PW_CLUSTER_ARGUMENTS}"\
149+
- name: Create a private Pathways-enabled XPK Cluster with 2x $TPU_TYPE nodepools. Larger num-nodes to avoid master resizing.
150+
run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --private --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
151+
- name: Verify the created cluster is private
152+
run: gcloud container clusters describe $TPU_CLUSTER_NAME --region=us-central2 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
160153
- name: List out the nodepools on the cluster
161154
run: python xpk.py cluster describe --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep -P '^(?=.*NODEPOOL_NAME)(?=.*SLICE)(?=.*TYPE)(?=.*EXPECTED_HEALTHY_NODES)(?=.*ACTUAL_HEALTHY_NODES)(?=.*TOTAL_NODES)'
162155
- name: Authenticate Docker
163156
run: gcloud auth configure-docker --quiet
164-
- name: Create auto-mount Storage instance
165-
run: |
166-
python3 xpk.py storage create $STORAGE_NAME --cluster=$TPU_CLUSTER_NAME --zone=us-central2-b --type=gcsfuse \
167-
--auto-mount=true \
168-
--mount-point='/test-mount-point' --readonly=false --manifest='./tests/data/pv-pvc-templates.yaml'
169-
- name: List and verify existing Storages
170-
run: python3 xpk.py storage list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee output.txt | grep 'test-storage' || (echo 'No storage found' && cat output.txt && exit 1)
171157
- name: Create test script to execute in workloads
172-
run: |
173-
echo -e \
174-
'#!/bin/bash \n
175-
echo "Hello world from a test script!"
176-
cd ~/../test-mount-point && echo "Hello world from a Github Action CI/CD test script!" > '$RANDOM_SEED'.txt' \
177-
> test.sh
178-
- name: Create a test script to execute in jobs.
179-
run: echo -e '#!/bin/bash \nsleep 3 \necho "Hello world from a test script!"' > job_test.sh
158+
run: echo -e '#!/bin/bash \nsleep 3 \necho "Hello world from a test script!"' > workload.sh
180159
- name: Run a job
181160
run: |
182161
python xpk.py run --cluster $TPU_CLUSTER_NAME --zone=us-central2-b workload.sh | awk '/Starting log streaming for pod xpk-def-app-profile-slurm-[a-zA-Z0-9]+-[0-9]+-[a-zA-Z0-9]+.../,/Job logs streaming finished./ { print }'
183-
# TODO: fix Delete the job failing step
184-
# - name: Delete the job
185-
# run: |
186-
# JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}')
187-
# kubectl delete job ${JOB_NAME}
188-
- name: Run a base-docker-image workload
162+
- name: Delete the job
189163
run: |
190-
python3 xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash test.sh" \
191-
--tpu-type=v4-8 --num-slices=2 --zone=us-central2-b
164+
JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}')
165+
kubectl delete job ${JOB_NAME}
166+
- name: Run a base-docker-image workload
167+
run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b
192168
- name: Run xpk inspector with the workload created above
193169
run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --workload $WORKLOAD_NAME
194170
- name: Wait for workload completion and confirm it succeeded
195171
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
196172
- name: Run a Pathways workload on Ubuntu base image
197-
run: |
198-
python3 xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME \
199-
--docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b \
200-
--command "echo \"Hello world from a test script! \""
201-
- name: Verify if the file was created in the GCS bucket
202-
run: gsutil cp gs://xpk-ci-cd-tests/$RANDOM_SEED.txt .
203-
- name: Check if the file contains desired content
204-
run: grep 'Hello world from a Github Action CI/CD test script!' $RANDOM_SEED.txt
173+
run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=$TPU_TYPE --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""
205174
- name: Wait for Pathways workload completion and confirm it succeeded
206175
run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $PATHWAYS_WORKLOAD_NAME --timeout 300
207176
- name: List out the workloads on the cluster
@@ -212,10 +181,6 @@ jobs:
212181
run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
213182
- name: Delete the Pathways workload on the cluster
214183
run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
215-
- name: Delete created GCS file
216-
run: gsutil rm gs://xpk-ci-cd-tests/$RANDOM_SEED.txt
217-
- name: Delete existing Storage
218-
run: python3 xpk.py storage delete $STORAGE_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
219184
- name: Create test script to execute in batch
220185
run: echo -e '#!/bin/bash \n#SBATCH --unknown-flag=value\n echo "Hello world from a test script!"' > batch.sh
221186
- name: Run a batch job on the cluster

.github/workflows/lint_and_format.yml

+1 -1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ on:
2222

2323
jobs:
2424
build-and-test:
25-
runs-on: ubuntu-22.04
25+
runs-on: ubuntu-latest
2626
strategy:
2727
matrix:
2828
python-version: ["3.10", "3.11"]

README.md

+2 -51
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,7 @@ and the following GPU types:
5656
and the following CPU types:
5757
* n2-standard-32
5858

59-
xpk also supports Google Cloud Storage solutions:
60-
* [Cloud Storage FUSE](https://cloud.google.com/storage/docs/gcs-fuse)
61-
62-
# Permissions needed on Cloud Console:
59+
# Cloud Console Permissions on the user or service account needed to run XPK:
6360

6461
* Artifact Registry Writer
6562
* Compute Admin
@@ -164,8 +161,6 @@ cleanup with a `Cluster Delete`.
164161
If you have failures with workloads not running, use `xpk inspector` to investigate
165162
more.
166163

167-
If you need your Workloads to have persistent storage, use `xpk storage` to find out more.
168-
169164
## Cluster Create
170165

171166
First set the project and zone through gcloud config or xpk arguments.
@@ -440,58 +435,14 @@ Currently, the below flags/arguments are supported for A3-Mega and A3-Ultra mach
440435
* --on-demand (only A3-Mega)
441436

442437

443-
## Storage
444-
Currently xpk supports Cloud Storage FUSE. A FUSE adapter that lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
445-
446-
To use the GCS FUSE with XPK user needs to:
447-
- create a [Storage Bucket](https://pantheon.corp.google.com/storage/)
448-
- create a manifest with PersistentVolume and PersistentVolumeClaim that mounts to the Bucket. To learn how to properly
449-
set up PersistentVolume and PersistentVolumeClaim visit [GKE Cloud Storage documentation](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver#provision-static)
450-
451-
Once it's ready user can define:
452-
453-
`--type` - defines a type of a storage, currently xpk supports `gcsfuse` only.
454-
`--auto-mount` - if set to true means that all workloads should have a given storage mounted by default.
455-
`--mount-point` - defines the path on which a given storage should be mounted for a workload.
456-
`--manifest` - defines the path to manifest which contains PersistentVolume and PersistentVolumeClaim definitions
457-
`--readonly` - if set to true, workload can only read from storage.
458-
459-
* Create a simple Storage
460-
461-
```shell
462-
python3 xpk.py storage create test-storage --project=$PROJECT
463-
--cluster=xpk-test --type=gcsfuse --auto-mount=false \
464-
--mount-point='/test-mount-point' --readonly=false \
465-
--manifest='pv-pvc-auto-mount.yaml'
466-
```
467-
468-
* Create a simple Workload with Storage attached
469-
```shell
470-
python3 xpk.py workload create \
471-
--workload xpk-test-workload --command "echo goodbye" \
472-
--cluster xpk-test \
473-
--tpu-type=v5litepod-16 \
474-
--storage test-storage --projet=$PROJECT
475-
```
476-
477-
* List Storage
478-
```shell
479-
python3 xpk.py storage list --cluster xpk-test --zone=us-central2-b --projet=$PROJECT
480-
```
481-
482-
* Delete Storage
483-
```shell
484-
python3 xpk.py storage delete test-storage --cluster xpk-test --zone=us-central2-b --projet=$PROJECT
485-
```
486-
487438
## Workload Create
488439
* Workload Create (submit training job):
489440

490441
```shell
491442
python3 xpk.py workload create \
492443
--workload xpk-test-workload --command "echo goodbye" \
493444
--cluster xpk-test \
494-
--tpu-type=v5litepod-16 --projet=$PROJECT
445+
--tpu-type=v5litepod-16
495446
```
496447

497448
* Workload Create for Pathways:

docs/contributing.md

-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ source $VENV_DIR/bin/activate
4545

4646
```shell
4747
git clone https://github.com/google/xpk.git
48-
cd xpk
4948
pip install .[dev]
5049
```
5150

pyproject.toml

+1 -5
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,6 @@ dependencies = [
3535
"ruamel.yaml",
3636
"pyyaml",
3737
"docker",
38-
"kubernetes==31.0.0",
39-
"google-cloud",
40-
"google-api-core",
4138
"packaging"
4239
]
4340

@@ -67,9 +64,8 @@ dev = [
6764
version = {attr = "xpk.core.core.__version__"}
6865

6966
[tool.setuptools]
70-
packages = ["xpk", "xpk.parser", "xpk.core", "xpk.commands", "xpk.api", "xpk.templates", "xpk.utils", "xpk.core.blueprint", "xpk.core.workload_decorators"]
67+
packages = ["xpk", "xpk.parser", "xpk.core", "xpk.commands", "xpk.utils", "xpk.core.blueprint", "xpk.core.workload_decorators"]
7168
package-dir = {"" = "src"}
72-
package-data = {"xpk.api" = ["storage_crd.yaml"], "xpk.templates" = ["storage.yaml"]}
7369

7470
[tool.pyink]
7571
# Formatting configuration to follow Google style-guide.

pytype-conf.cfg

+1 -2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ exclude =
2020
src/xpk/core/kjob.py
2121
src/xpk/core/kueue.py
2222
src/xpk/core/nap.py
23-
src/xpk/core/storage.py
2423
src/xpk/core/pathways.py
2524
src/xpk/core/system_characteristics.py
2625
src/xpk/parser
@@ -29,7 +28,7 @@ exclude =
2928
# Keep going past errors to analyze as many files as possible.
3029
keep_going = True
3130

32-
jobs = 5
31+
jobs = 10
3332

3433
# Platform (e.g., "linux", "win32") that the target code runs on.
3534
platform = linux

src/xpk/api/__init__.py

-15
This file was deleted.

src/xpk/api/storage_crd.yaml

-52
This file was deleted.

src/xpk/blueprints/a3mega/storage_crd.yaml

-52
This file was deleted.

0 commit comments

Comments
 (0)