Skip to content

Commit 54ea4a2

Browse files
committed
Load NVIDIA Kernel Modules for JIT-CDI mode
This change attempts to load the nvidia, nvidia-uvm, and nvidia-modeset kernel modules before generating the automatic (jit) CDI specification. The kernel modules can be controlled by the nvidia-container-runtime.modes.jit-cdi.load-kernel-modules config option. If this is set to the empty list, then no kernel modules are loaded. Errors in loading the kernel modules are logged, but ignored. Signed-off-by: Evan Lezar <[email protected]>
1 parent e436533 commit 54ea4a2

File tree

8 files changed

+87
-10
lines changed

8 files changed

+87
-10
lines changed

cmd/nvidia-ctk-installer/main_test.go

+15
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,9 @@ swarm-resource = ""
141141
[nvidia-container-runtime.modes.csv]
142142
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
143143
144+
[nvidia-container-runtime.modes.jit-cdi]
145+
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
146+
144147
[nvidia-container-runtime-hook]
145148
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
146149
skip-mode-detection = true
@@ -202,6 +205,9 @@ swarm-resource = ""
202205
[nvidia-container-runtime.modes.csv]
203206
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
204207
208+
[nvidia-container-runtime.modes.jit-cdi]
209+
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
210+
205211
[nvidia-container-runtime-hook]
206212
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
207213
skip-mode-detection = true
@@ -266,6 +272,9 @@ swarm-resource = ""
266272
[nvidia-container-runtime.modes.csv]
267273
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
268274
275+
[nvidia-container-runtime.modes.jit-cdi]
276+
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
277+
269278
[nvidia-container-runtime-hook]
270279
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
271280
skip-mode-detection = true
@@ -327,6 +336,9 @@ swarm-resource = ""
327336
[nvidia-container-runtime.modes.csv]
328337
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
329338
339+
[nvidia-container-runtime.modes.jit-cdi]
340+
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
341+
330342
[nvidia-container-runtime-hook]
331343
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
332344
skip-mode-detection = true
@@ -410,6 +422,9 @@ swarm-resource = ""
410422
[nvidia-container-runtime.modes.csv]
411423
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
412424
425+
[nvidia-container-runtime.modes.jit-cdi]
426+
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
427+
413428
[nvidia-container-runtime-hook]
414429
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
415430
skip-mode-detection = true

internal/config/config.go

+3
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,9 @@ func GetDefault() (*Config, error) {
121121
AnnotationPrefixes: []string{cdi.AnnotationPrefix},
122122
SpecDirs: cdi.DefaultSpecDirs,
123123
},
124+
JitCDI: jitCDIModeConfig{
125+
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
126+
},
124127
},
125128
},
126129
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{

internal/config/config_test.go

+21
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) {
7474
AnnotationPrefixes: []string{"cdi.k8s.io/"},
7575
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
7676
},
77+
JitCDI: jitCDIModeConfig{
78+
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
79+
},
7780
},
7881
},
7982
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -102,6 +105,7 @@ func TestGetConfig(t *testing.T) {
102105
"nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
103106
"nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
104107
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
108+
"nvidia-container-runtime.modes.jit-cdi.load-kernel-modules = [\"foo\"]",
105109
"nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"",
106110
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
107111
},
@@ -134,6 +138,9 @@ func TestGetConfig(t *testing.T) {
134138
"/not/var/run/cdi",
135139
},
136140
},
141+
JitCDI: jitCDIModeConfig{
142+
LoadKernelModules: []string{"foo"},
143+
},
137144
},
138145
},
139146
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -178,6 +185,9 @@ func TestGetConfig(t *testing.T) {
178185
"/var/run/cdi",
179186
},
180187
},
188+
JitCDI: jitCDIModeConfig{
189+
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
190+
},
181191
},
182192
},
183193
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -213,6 +223,8 @@ func TestGetConfig(t *testing.T) {
213223
"spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
214224
"[nvidia-container-runtime.modes.csv]",
215225
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
226+
"[nvidia-container-runtime.modes.jit-cdi]",
227+
"load-kernel-modules = [\"foo\"]",
216228
"[nvidia-container-runtime-hook]",
217229
"path = \"/foo/bar/nvidia-container-runtime-hook\"",
218230
"[nvidia-ctk]",
@@ -247,6 +259,9 @@ func TestGetConfig(t *testing.T) {
247259
"/not/var/run/cdi",
248260
},
249261
},
262+
JitCDI: jitCDIModeConfig{
263+
LoadKernelModules: []string{"foo"},
264+
},
250265
},
251266
},
252267
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -283,6 +298,9 @@ func TestGetConfig(t *testing.T) {
283298
AnnotationPrefixes: []string{"cdi.k8s.io/"},
284299
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
285300
},
301+
JitCDI: jitCDIModeConfig{
302+
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
303+
},
286304
},
287305
},
288306
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -322,6 +340,9 @@ func TestGetConfig(t *testing.T) {
322340
AnnotationPrefixes: []string{"cdi.k8s.io/"},
323341
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
324342
},
343+
JitCDI: jitCDIModeConfig{
344+
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
345+
},
325346
},
326347
},
327348
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{

internal/config/runtime.go

+11-2
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,9 @@ type RuntimeConfig struct {
2929

3030
// modesConfig defines (optional) per-mode configs
3131
type modesConfig struct {
32-
CSV csvModeConfig `toml:"csv"`
33-
CDI cdiModeConfig `toml:"cdi"`
32+
CSV csvModeConfig `toml:"csv"`
33+
CDI cdiModeConfig `toml:"cdi"`
34+
JitCDI jitCDIModeConfig `toml:"jit-cdi"`
3435
}
3536

3637
type cdiModeConfig struct {
@@ -45,3 +46,11 @@ type cdiModeConfig struct {
4546
type csvModeConfig struct {
4647
MountSpecPath string `toml:"mount-spec-path"`
4748
}
49+
50+
type jitCDIModeConfig struct {
51+
// LoadKernelModules defines the names of the kernel modules that should be
52+
// loaded before generating a just-in-time CDI specification.
53+
// The module names must start with `nvidia`. If no modules are specified,
54+
// no kernel modules are loaded.
55+
LoadKernelModules []string `toml:"load-kernel-modules"`
56+
}

internal/config/toml_test.go

+3
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"]
7474
[nvidia-container-runtime.modes.csv]
7575
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
7676
77+
[nvidia-container-runtime.modes.jit-cdi]
78+
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
79+
7780
[nvidia-container-runtime-hook]
7881
path = "nvidia-container-runtime-hook"
7982
skip-mode-detection = false

internal/lookup/root/root.go

+20
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,15 @@
1717
package root
1818

1919
import (
20+
"errors"
21+
"fmt"
2022
"os"
2123
"path/filepath"
2224
"strings"
2325

2426
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
2527
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
28+
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvmodules"
2629
)
2730

2831
// Driver represents a filesystem in which a set of drivers or devices is defined.
@@ -125,3 +128,20 @@ func xdgDataDirs() []string {
125128

126129
return []string{"/usr/local/share", "/usr/share"}
127130
}
131+
132+
// LoadKernelModules loads the specified kernel modules in the driver root.
133+
// Errors in loading a module do not prevent other modules from being attempted.
134+
func (r *Driver) LoadKernelModules(moduleNames ...string) error {
135+
modules := nvmodules.New(
136+
nvmodules.WithLogger(r.logger),
137+
nvmodules.WithRoot(r.Root),
138+
)
139+
140+
var errs error
141+
for _, moduleName := range moduleNames {
142+
if err := modules.Load(moduleName); err != nil {
143+
errs = errors.Join(errs, fmt.Errorf("failed to load kernel module %q: %w", moduleName, err))
144+
}
145+
}
146+
return errs
147+
}

internal/modifier/cdi.go

+11-5
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
2626
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
2727
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
28+
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
2829
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
2930
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
3031
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
@@ -34,7 +35,7 @@ import (
3435
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
3536
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
3637
// used to select the devices to include.
37-
func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
38+
func NewCDIModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec) (oci.SpecModifier, error) {
3839
devices, err := getDevicesFromSpec(logger, ociSpec, cfg)
3940
if err != nil {
4041
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
@@ -50,7 +51,7 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe
5051
return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
5152
}
5253
if len(automaticDevices) > 0 {
53-
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
54+
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, driver, automaticDevices)
5455
if err == nil {
5556
return automaticModifier, nil
5657
}
@@ -163,9 +164,9 @@ func filterAutomaticDevices(devices []string) []string {
163164
return automatic
164165
}
165166

166-
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
167+
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (oci.SpecModifier, error) {
167168
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
168-
spec, err := generateAutomaticCDISpec(logger, cfg, devices)
169+
spec, err := generateAutomaticCDISpec(logger, cfg, driver, devices)
169170
if err != nil {
170171
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
171172
}
@@ -180,7 +181,7 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de
180181
return cdiModifier, nil
181182
}
182183

183-
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
184+
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (spec.Interface, error) {
184185
cdilib, err := nvcdi.New(
185186
nvcdi.WithLogger(logger),
186187
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
@@ -192,6 +193,11 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic
192193
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
193194
}
194195

196+
// TODO: Consider moving this into the nvcdi API.
197+
if err := driver.LoadKernelModules(cfg.NVIDIAContainerRuntimeConfig.Modes.JitCDI.LoadKernelModules...); err != nil {
198+
logger.Warningf("Ignoring error(s) loading kernel modules: %v", err)
199+
}
200+
195201
identifiers := []string{}
196202
for _, device := range devices {
197203
_, _, id := parser.ParseDevice(device)

internal/runtime/runtime_factory.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
7777
mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image)
7878
// We update the mode here so that we can continue passing just the config to other functions.
7979
cfg.NVIDIAContainerRuntimeConfig.Mode = mode
80-
modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, image)
80+
modeModifier, err := newModeModifier(logger, mode, cfg, driver, ociSpec, image)
8181
if err != nil {
8282
return nil, err
8383
}
@@ -107,14 +107,14 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
107107
return modifiers, nil
108108
}
109109

110-
func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
110+
func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
111111
switch mode {
112112
case "legacy":
113113
return modifier.NewStableRuntimeModifier(logger, cfg.NVIDIAContainerRuntimeHookConfig.Path), nil
114114
case "csv":
115115
return modifier.NewCSVModifier(logger, cfg, image)
116116
case "cdi":
117-
return modifier.NewCDIModifier(logger, cfg, ociSpec)
117+
return modifier.NewCDIModifier(logger, cfg, driver, ociSpec)
118118
}
119119

120120
return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode)

0 commit comments

Comments
 (0)