Skip to content

Commit b1fe289

Browse files
authored
Merge pull request #220 from klueska/update-gke-demo
Update GKE deployment script for kubernetes 1.32
2 parents 6c34f5f + 737b187 commit b1fe289

File tree

3 files changed: 24 additions (+24) and 19 deletions (-19)

3 files changed: 24 additions (+24) and 19 deletions (-19)

demo/clusters/gke/create-cluster.sh

+18 -15
Original file line number | Diff line number | Diff line change
@@ -35,35 +35,38 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")
3535

3636
NETWORK_NAME="${DRIVER_NAME}-net"
3737
CLUSTER_NAME="${DRIVER_NAME}-cluster"
38-
NODE_VERSION="1.31.1"
38+
NODE_VERSION="1.32"
39+
ROUTER_REGION="us-central1"
40+
REGION="us-central1-c"
3941

4042
## Create the Network for the cluster
4143
gcloud compute networks create "${NETWORK_NAME}" \
4244
--quiet \
4345
--project="${PROJECT_NAME}" \
44-
--description=Manually\ created\ network\ for\ TMS\ DRA\ Alpha\ cluster \
46+
--description="Manually created network for DRA beta test cluster" \
4547
--subnet-mode=auto \
4648
--mtu=1460 \
4749
--bgp-routing-mode=regional
4850

4951
## Create the cluster
5052
gcloud container clusters create "${CLUSTER_NAME}" \
5153
--quiet \
52-
--enable-kubernetes-alpha \
54+
--enable-kubernetes-unstable-apis="resource.k8s.io/v1beta1/deviceclasses,resource.k8s.io/v1beta1/resourceclaims,resource.k8s.io/v1beta1/resourceclaimtemplates,resource.k8s.io/v1beta1/resourceslices" \
55+
--release-channel=rapid \
5356
--no-enable-autorepair \
54-
--no-enable-autoupgrade \
55-
--region us-west1 \
57+
--enable-autoupgrade \
58+
--region "${REGION}" \
5659
--num-nodes "1" \
5760
--network "${NETWORK_NAME}" \
5861
--cluster-version "${NODE_VERSION}" \
59-
--node-version "${NODE_VERSION}"
62+
--node-version "${NODE_VERSION}" \
6063

6164
# Create t4 node pool
6265
gcloud beta container node-pools create "pool-1" \
6366
--quiet \
6467
--project "${PROJECT_NAME}" \
6568
--cluster "${CLUSTER_NAME}" \
66-
--region "us-west1" \
69+
--region "${REGION}" \
6770
--node-version "${NODE_VERSION}" \
6871
--machine-type "n1-standard-8" \
6972
--accelerator "type=nvidia-tesla-t4,count=1" \
@@ -77,19 +80,19 @@ gcloud beta container node-pools create "pool-1" \
7780
--min-nodes "2" \
7881
--max-nodes "6" \
7982
--location-policy "ANY" \
80-
--no-enable-autoupgrade \
83+
--enable-autoupgrade \
8184
--no-enable-autorepair \
8285
--max-surge-upgrade 1 \
8386
--max-unavailable-upgrade 0 \
84-
--node-locations "us-west1-a" \
87+
--node-locations "${REGION}" \
8588
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu.present=true
8689

8790
# Create v100 node pool
8891
gcloud beta container node-pools create "pool-2" \
8992
--quiet \
9093
--project "${PROJECT_NAME}" \
9194
--cluster "${CLUSTER_NAME}" \
92-
--region "us-west1" \
95+
--region "${REGION}" \
9396
--node-version "${NODE_VERSION}" \
9497
--machine-type "n1-standard-8" \
9598
--accelerator "type=nvidia-tesla-v100,count=1" \
@@ -103,30 +106,30 @@ gcloud beta container node-pools create "pool-2" \
103106
--min-nodes "1" \
104107
--max-nodes "6" \
105108
--location-policy "ANY" \
106-
--no-enable-autoupgrade \
109+
--enable-autoupgrade \
107110
--no-enable-autorepair \
108111
--max-surge-upgrade 1 \
109112
--max-unavailable-upgrade 0 \
110-
--node-locations "us-west1-a" \
113+
--node-locations "${REGION}" \
111114
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu.present=true
112115

113116
## Allow the GPU nodes access to the internet
114117
gcloud compute routers create ${NETWORK_NAME}-nat-router \
115118
--quiet \
116119
--project "${PROJECT_NAME}" \
117120
--network "${NETWORK_NAME}" \
118-
--region "us-west1"
121+
--region "${ROUTER_REGION}" \
119122

120123
gcloud compute routers nats create "${NETWORK_NAME}-nat-config" \
121124
--quiet \
122125
--project "${PROJECT_NAME}" \
123126
--router "${NETWORK_NAME}-nat-router" \
124127
--nat-all-subnet-ip-ranges \
125128
--auto-allocate-nat-external-ips \
126-
--router-region "us-west1"
129+
--router-region "${ROUTER_REGION}" \
127130

128131
## Start using this cluster for kubectl
129-
gcloud container clusters get-credentials "${CLUSTER_NAME}" --location="us-west1"
132+
gcloud container clusters get-credentials "${CLUSTER_NAME}" --location="${REGION}"
130133

131134
## Launch the nvidia-driver-installer daemonset to install the GPU drivers on any GPU nodes that come online:
132135
kubectl label node --overwrite -l nvidia.com/gpu.present=true cloud.google.com/gke-gpu-driver-version-

demo/clusters/gke/delete-cluster.sh

+5 -3
Original file line number | Diff line number | Diff line change
@@ -29,25 +29,27 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")
2929

3030
NETWORK_NAME="${DRIVER_NAME}-net"
3131
CLUSTER_NAME="${DRIVER_NAME}-cluster"
32+
ROUTER_REGION="us-central1"
33+
REGION="us-central1-c"
3234

3335
## Delete the cluster
3436
gcloud container clusters delete "${CLUSTER_NAME}" \
3537
--quiet \
3638
--project "${PROJECT_NAME}" \
37-
--region "us-west1"
39+
--region "${REGION}"
3840

3941
## Delete the nat config
4042
gcloud compute routers nats delete "${NETWORK_NAME}-nat-config" \
4143
--quiet \
4244
--project "${PROJECT_NAME}" \
4345
--router "${NETWORK_NAME}-nat-router" \
44-
--router-region "us-west1"
46+
--router-region "${ROUTER_REGION}"
4547

4648
## Delete the nat router
4749
gcloud compute routers delete ${NETWORK_NAME}-nat-router \
4850
--quiet \
4951
--project "${PROJECT_NAME}" \
50-
--region "us-west1"
52+
--region "${ROUTER_REGION}"
5153

5254
## Delete the network
5355
gcloud compute networks delete "${NETWORK_NAME}" \

demo/clusters/gke/install-dra-driver.sh

+1 -1
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")
2727

2828
: ${IMAGE_REGISTRY:=ghcr.io/nvidia}
2929
: ${IMAGE_NAME:=${DRIVER_NAME}}
30-
: ${IMAGE_TAG:=32805fec-ubi8}
30+
: ${IMAGE_TAG:=6c34f5fb-ubi8}
3131

3232
helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \
3333
--set image.repository=${IMAGE_REGISTRY}/${IMAGE_NAME} \

0 commit comments

Comments
 (0)