File tree 4 files changed +41
-9
lines changed
deployments/helm/k8s-dra-driver
4 files changed +41
-9
lines changed Original file line number Diff line number Diff line change @@ -29,6 +29,7 @@ helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJEC
29
29
--set deviceClasses=" {${deviceClasses} }" \
30
30
${NVIDIA_CTK_PATH: +--set nvidiaCtkPath=${NVIDIA_CTK_PATH} } \
31
31
${NVIDIA_DRIVER_ROOT: +--set nvidiaDriverRoot=${NVIDIA_DRIVER_ROOT} } \
32
+ ${MASK_NVIDIA_DRIVER_PARAMS: +--set maskNvidiaDriverParams=${MASK_NVIDIA_DRIVER_PARAMS} } \
32
33
--wait
33
34
34
35
set +x
Load Diff This file was deleted.
Original file line number Diff line number Diff line change
1
+ #! /usr/bin/env bash
2
+
3
+ # Copyright 2024 NVIDIA CORPORATION.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ # A reference to the current directory where this script is located
18
+ CURRENT_DIR=" $( cd -- " $( dirname -- " ${BASH_SOURCE[0]} " ) " & > /dev/null && pwd) "
19
+
20
+ : ${MASK_NVIDIA_DRIVER_PARAMS:= " true" }
21
+ export MASK_NVIDIA_DRIVER_PARAMS
22
+ exec ${CURRENT_DIR} /../kind/install-dra-driver.sh
Original file line number Diff line number Diff line change @@ -56,14 +56,11 @@ spec:
56
56
command : ["bash", "-c"]
57
57
args :
58
58
- |-
59
- # TODO: Masking of the params file is done below to allow nvkind to
60
- # selectively exclude certain GPUs from being visible to the driver.
61
- # At present, this is only feasible with a host-mounted driver where
62
- # /dev in this container already has GPU devices present (as brought
63
- # in via the --privileged flag from docker/podman when using nvkind).
64
- # In the future we should revisit this to find a more robust method
65
- # of supporting this.
66
- if [ "${NVIDIA_DRIVER_ROOT}" = "/" ]; then
59
+ # Conditionally mask the params file to prevent this container from
60
+ # recreating any missing GPU device nodes. This is necessary, for
61
+ # example, when running under nvkind to limit the set of GPUs governed
62
+ # by the plugin even though it has cgroup access to all of them.
63
+ if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then
67
64
cp /proc/driver/nvidia/params root/gpu-params
68
65
sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params
69
66
mount --bind root/gpu-params /proc/driver/nvidia/params
72
69
resources :
73
70
{{- toYaml .Values.kubeletPlugin.containers.plugin.resources | nindent 10 }}
74
71
env :
72
+ - name : MASK_NVIDIA_DRIVER_PARAMS
73
+ value : " {{ .Values.maskNvidiaDriverParams }}"
75
74
- name : NVIDIA_CTK_PATH
76
75
value : " {{ .Values.nvidiaCtkPath }}"
77
76
- name : NVIDIA_DRIVER_ROOT
Original file line number Diff line number Diff line change @@ -36,6 +36,17 @@ allowDefaultNamespace: false
36
36
37
37
deviceClasses : ["gpu", "mig", "imex"]
38
38
39
+ # Masking of the params file is typically done to allow nvkind to
40
+ # selectively exclude certain GPUs from being visible to the
41
+ # underlying GPU driver. Unfortunately, kind doesn't let you choose
42
+ # which device nodes to inject into each worker node (they all come in
43
+ # via the --privileged flag passed to docker/podman). Because of
44
+ # this, all workers see all GPUs by default. By masking the params
45
+ # file we can prevent a container from recreating any missing GPU
46
+ # device nodes and limit its view to only those device nodes that
47
+ # nvkind decided to allow in.
48
+ maskNvidiaDriverParams : false
49
+
39
50
imagePullSecrets : []
40
51
image :
41
52
repository : nvcr.io/nvidia/cloud-native/k8s-dra-driver
You can’t perform that action at this time.
0 commit comments