Skip to content

Load NVIDIA Kernel Modules for JIT-CDI mode #975

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions cmd/nvidia-ctk-installer/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down Expand Up @@ -202,6 +205,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down Expand Up @@ -266,6 +272,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down Expand Up @@ -327,6 +336,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down Expand Up @@ -410,6 +422,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down
3 changes: 3 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@ func GetDefault() (*Config, error) {
AnnotationPrefixes: []string{cdi.AnnotationPrefix},
SpecDirs: cdi.DefaultSpecDirs,
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down
21 changes: 21 additions & 0 deletions internal/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) {
AnnotationPrefixes: []string{"cdi.k8s.io/"},
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down Expand Up @@ -102,6 +105,7 @@ func TestGetConfig(t *testing.T) {
"nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
"nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"nvidia-container-runtime.modes.jit-cdi.load-kernel-modules = [\"foo\"]",
"nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"",
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
},
Expand Down Expand Up @@ -134,6 +138,9 @@ func TestGetConfig(t *testing.T) {
"/not/var/run/cdi",
},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"foo"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down Expand Up @@ -178,6 +185,9 @@ func TestGetConfig(t *testing.T) {
"/var/run/cdi",
},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down Expand Up @@ -213,6 +223,8 @@ func TestGetConfig(t *testing.T) {
"spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
"[nvidia-container-runtime.modes.csv]",
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"[nvidia-container-runtime.modes.jit-cdi]",
"load-kernel-modules = [\"foo\"]",
"[nvidia-container-runtime-hook]",
"path = \"/foo/bar/nvidia-container-runtime-hook\"",
"[nvidia-ctk]",
Expand Down Expand Up @@ -247,6 +259,9 @@ func TestGetConfig(t *testing.T) {
"/not/var/run/cdi",
},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"foo"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down Expand Up @@ -283,6 +298,9 @@ func TestGetConfig(t *testing.T) {
AnnotationPrefixes: []string{"cdi.k8s.io/"},
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down Expand Up @@ -322,6 +340,9 @@ func TestGetConfig(t *testing.T) {
AnnotationPrefixes: []string{"cdi.k8s.io/"},
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down
13 changes: 11 additions & 2 deletions internal/config/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ type RuntimeConfig struct {

// modesConfig defines (optional) per-mode configs
type modesConfig struct {
CSV csvModeConfig `toml:"csv"`
CDI cdiModeConfig `toml:"cdi"`
CSV csvModeConfig `toml:"csv"`
CDI cdiModeConfig `toml:"cdi"`
JitCDI jitCDIModeConfig `toml:"jit-cdi"`
}

type cdiModeConfig struct {
Expand All @@ -45,3 +46,11 @@ type cdiModeConfig struct {
type csvModeConfig struct {
MountSpecPath string `toml:"mount-spec-path"`
}

type jitCDIModeConfig struct {
// LoadKernelModules defines the names of the kernel modules that should be
// loaded before generating a just-in-time CDI specification.
// The module names must start with `nvidia` and if no modules are specified
// no kernel modules are loaded.
LoadKernelModules []string `toml:"load-kernel-modules"`
}
3 changes: 3 additions & 0 deletions internal/config/toml_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"]
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How did we end up making this fine selection? 🍷 🍇 🧀

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

@elezar elezar Apr 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is actually a typo. The module names should be nvidia, nvidia_uvm and nvidia_modeset.


[nvidia-container-runtime-hook]
path = "nvidia-container-runtime-hook"
skip-mode-detection = false
Expand Down
20 changes: 20 additions & 0 deletions internal/lookup/root/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,15 @@
package root

import (
"errors"
"fmt"
"os"
"path/filepath"
"strings"

"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvmodules"
)

// Driver represents a filesystem in which a set of drivers or devices is defined.
Expand Down Expand Up @@ -125,3 +128,20 @@ func xdgDataDirs() []string {

return []string{"/usr/local/share", "/usr/share"}
}

// LoadKmods loads the specified kernel modules in the driver root.
// Errors in loading a module do not prevent other modules from being attempted.
func (r *Driver) LoadKernelModules(moduleNames ...string) error {
modules := nvmodules.New(
nvmodules.WithLogger(r.logger),
nvmodules.WithRoot(r.Root),
)

var errs error
for _, moduleName := range moduleNames {
if err := modules.Load(moduleName); err != nil {
errs = errors.Join(errs, fmt.Errorf("failed to load kernel module %q: %w", moduleName, err))
}
}
return errs
}
16 changes: 11 additions & 5 deletions internal/modifier/cdi.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
Expand All @@ -34,7 +35,7 @@ import (
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
// used to select the devices to include.
func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
func NewCDIModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec) (oci.SpecModifier, error) {
devices, err := getDevicesFromSpec(logger, ociSpec, cfg)
if err != nil {
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
Expand All @@ -50,7 +51,7 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe
return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
}
if len(automaticDevices) > 0 {
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, driver, automaticDevices)
if err == nil {
return automaticModifier, nil
}
Expand Down Expand Up @@ -163,9 +164,9 @@ func filterAutomaticDevices(devices []string) []string {
return automatic
}

func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (oci.SpecModifier, error) {
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
spec, err := generateAutomaticCDISpec(logger, cfg, devices)
spec, err := generateAutomaticCDISpec(logger, cfg, driver, devices)
if err != nil {
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
}
Expand All @@ -180,7 +181,7 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de
return cdiModifier, nil
}

func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (spec.Interface, error) {
cdilib, err := nvcdi.New(
nvcdi.WithLogger(logger),
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
Expand All @@ -192,6 +193,11 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
}

// TODO: Consider moving this into the nvcdi API.
if err := driver.LoadKernelModules(cfg.NVIDIAContainerRuntimeConfig.Modes.JitCDI.LoadKernelModules...); err != nil {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@klueska are there any cases where we DON'T want to load / try to load the kernel modules? Note that we aslo skip this when running in a user namespace in libnvidia-container.

logger.Warningf("Ignoring error(s) loading kernel modules: %v", err)
}

identifiers := []string{}
for _, device := range devices {
_, _, id := parser.ParseDevice(device)
Expand Down
6 changes: 3 additions & 3 deletions internal/runtime/runtime_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image)
// We update the mode here so that we can continue passing just the config to other functions.
cfg.NVIDIAContainerRuntimeConfig.Mode = mode
modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, image)
modeModifier, err := newModeModifier(logger, mode, cfg, driver, ociSpec, image)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -107,14 +107,14 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
return modifiers, nil
}

func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
switch mode {
case "legacy":
return modifier.NewStableRuntimeModifier(logger, cfg.NVIDIAContainerRuntimeHookConfig.Path), nil
case "csv":
return modifier.NewCSVModifier(logger, cfg, image)
case "cdi":
return modifier.NewCDIModifier(logger, cfg, ociSpec)
return modifier.NewCDIModifier(logger, cfg, driver, ociSpec)
}

return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode)
Expand Down