Skip to content

Commit 32805fe

Browse files
authored
Merge pull request #198 from klueska/explicit-envvar-for-mask
Add explicit envvar to control if we mask /proc/driver/nvidia/params
2 parents dfe844b + 6f2c2aa commit 32805fe

File tree

4 files changed

+41
-9
lines changed

4 files changed

+41
-9
lines changed

demo/clusters/kind/install-dra-driver.sh

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJEC
2929
--set deviceClasses="{${deviceClasses}}" \
3030
${NVIDIA_CTK_PATH:+--set nvidiaCtkPath=${NVIDIA_CTK_PATH}} \
3131
${NVIDIA_DRIVER_ROOT:+--set nvidiaDriverRoot=${NVIDIA_DRIVER_ROOT}} \
32+
${MASK_NVIDIA_DRIVER_PARAMS:+--set maskNvidiaDriverParams=${MASK_NVIDIA_DRIVER_PARAMS}} \
3233
--wait
3334

3435
set +x

demo/clusters/nvkind/install-dra-driver.sh

-1
This file was deleted.
+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env bash
2+
3+
# Copyright 2024 NVIDIA CORPORATION.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# A reference to the current directory where this script is located
18+
CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
19+
20+
# Default MASK_NVIDIA_DRIVER_PARAMS to "true" unless the caller already set a
# non-empty value. The expansion is quoted so that a caller-supplied value is
# never word-split or glob-expanded when passed to the ':' no-op builtin.
: "${MASK_NVIDIA_DRIVER_PARAMS:=true}"
21+
export MASK_NVIDIA_DRIVER_PARAMS
22+
# Replace this process with the kind installer script. The path is quoted so
# that a checkout directory containing spaces or glob characters still
# resolves to a single, literal path.
exec "${CURRENT_DIR}/../kind/install-dra-driver.sh"

deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml

+7-8
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,11 @@ spec:
5656
command: ["bash", "-c"]
5757
args:
5858
- |-
59-
# TODO: Masking of the params file is done below to allow nvkind to
60-
# selectively exclude certain GPUs from being visible to the driver.
61-
# At present, this is only feasible with a host-mounted driver where
62-
# /dev in this container already has GPU devices present (as brought
63-
# in via the --privileged flag from docker/podman when using nvkind).
64-
# In the future we should revisit this to find a more robust method
65-
# of supporting this.
66-
if [ "${NVIDIA_DRIVER_ROOT}" = "/" ]; then
59+
# Conditionally mask the params file to prevent this container from
60+
# recreating any missing GPU device nodes. This is necessary, for
61+
# example, when running under nvkind to limit the set of GPUs governed
62+
# by the plugin even though it has cgroup access to all of them.
63+
if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then
6764
cp /proc/driver/nvidia/params root/gpu-params
6865
sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params
6966
mount --bind root/gpu-params /proc/driver/nvidia/params
@@ -72,6 +69,8 @@ spec:
7269
resources:
7370
{{- toYaml .Values.kubeletPlugin.containers.plugin.resources | nindent 10 }}
7471
env:
72+
- name: MASK_NVIDIA_DRIVER_PARAMS
73+
value: "{{ .Values.maskNvidiaDriverParams }}"
7574
- name: NVIDIA_CTK_PATH
7675
value: "{{ .Values.nvidiaCtkPath }}"
7776
- name: NVIDIA_DRIVER_ROOT

deployments/helm/k8s-dra-driver/values.yaml

+11
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,17 @@ allowDefaultNamespace: false
3636

3737
deviceClasses: ["gpu", "mig", "imex"]
3838

39+
# Masking of the params file is typically done to allow nvkind to
40+
# selectively exclude certain GPUs from being visible to the
41+
# underlying GPU driver. Unfortunately, kind doesn't let you choose
42+
# which device nodes to inject into each worker node (they all come in
43+
# via the --privileged flag passed to docker/podman). Because of
44+
# this, all workers see all GPUs by default. By masking the params
45+
# file we can prevent a container from recreating any missing GPU
46+
# device nodes and limit its view to only those device nodes that
47+
# nvkind decided to allow in.
48+
maskNvidiaDriverParams: false
49+
3950
imagePullSecrets: []
4051
image:
4152
repository: nvcr.io/nvidia/cloud-native/k8s-dra-driver

0 commit comments

Comments
 (0)