-
Notifications
You must be signed in to change notification settings - Fork 373
Load NVIDIA Kernel Modules for JIT-CDI mode #975
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -17,12 +17,15 @@ | |||||
package root | ||||||
|
||||||
import ( | ||||||
"errors" | ||||||
"fmt" | ||||||
"os" | ||||||
"path/filepath" | ||||||
"strings" | ||||||
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger" | ||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" | ||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvmodules" | ||||||
) | ||||||
|
||||||
// Driver represents a filesystem in which a set of drivers or devices is defined. | ||||||
|
@@ -125,3 +128,20 @@ func xdgDataDirs() []string { | |||||
|
||||||
return []string{"/usr/local/share", "/usr/share"} | ||||||
} | ||||||
|
||||||
// LoadKernelModules loads the specified kernel modules in the driver root. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
// Errors in loading a module do not prevent other modules from being attempted. | ||||||
func (r *Driver) LoadKernelModules(moduleNames ...string) error { | ||||||
modules := nvmodules.New( | ||||||
nvmodules.WithLogger(r.logger), | ||||||
nvmodules.WithRoot(r.Root), | ||||||
) | ||||||
|
||||||
var errs error | ||||||
for _, moduleName := range moduleNames { | ||||||
if err := modules.Load(moduleName); err != nil { | ||||||
errs = errors.Join(errs, fmt.Errorf("failed to load kernel module %q: %w", moduleName, err)) | ||||||
} | ||||||
} | ||||||
return errs | ||||||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ import ( | |
"github.com/NVIDIA/nvidia-container-toolkit/internal/config" | ||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" | ||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger" | ||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" | ||
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi" | ||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci" | ||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" | ||
|
@@ -34,7 +35,7 @@ import ( | |
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the | ||
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is | ||
// used to select the devices to include. | ||
func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) { | ||
func NewCDIModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec) (oci.SpecModifier, error) { | ||
devices, err := getDevicesFromSpec(logger, ociSpec, cfg) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err) | ||
|
@@ -50,7 +51,7 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe | |
return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices") | ||
} | ||
if len(automaticDevices) > 0 { | ||
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices) | ||
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, driver, automaticDevices) | ||
if err == nil { | ||
return automaticModifier, nil | ||
} | ||
|
@@ -163,9 +164,9 @@ func filterAutomaticDevices(devices []string) []string { | |
return automatic | ||
} | ||
|
||
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) { | ||
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (oci.SpecModifier, error) { | ||
logger.Debugf("Generating in-memory CDI specs for devices %v", devices) | ||
spec, err := generateAutomaticCDISpec(logger, cfg, devices) | ||
spec, err := generateAutomaticCDISpec(logger, cfg, driver, devices) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to generate CDI spec: %w", err) | ||
} | ||
|
@@ -180,7 +181,7 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de | |
return cdiModifier, nil | ||
} | ||
|
||
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) { | ||
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (spec.Interface, error) { | ||
cdilib, err := nvcdi.New( | ||
nvcdi.WithLogger(logger), | ||
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path), | ||
|
@@ -192,6 +193,11 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic | |
return nil, fmt.Errorf("failed to construct CDI library: %w", err) | ||
} | ||
|
||
// TODO: Consider moving this into the nvcdi API. | ||
if err := driver.LoadKernelModules(cfg.NVIDIAContainerRuntimeConfig.Modes.JitCDI.LoadKernelModules...); err != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @klueska are there any cases where we DON'T want to load / try to load the kernel modules? Note that we also skip this when running in a user namespace in |
||
logger.Warningf("Ignoring error(s) loading kernel modules: %v", err) | ||
} | ||
|
||
identifiers := []string{} | ||
for _, device := range devices { | ||
_, _, id := parser.ParseDevice(device) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How did we end up making this fine selection? 🍷 🍇 🧀
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These are the kernel modules that are required for various GPU functionalities. In the
nvidia-container-cli
we do this through nvidia-modprobe:
nvidia: https://github.com/NVIDIA/libnvidia-container/blob/95d3e86522976061e856724867ebcaf75c4e9b60/src/nvc.c#L279
nvidia-uvm: https://github.com/NVIDIA/libnvidia-container/blob/95d3e86522976061e856724867ebcaf75c4e9b60/src/nvc.c#L305
nvidia-modeset: https://github.com/NVIDIA/libnvidia-container/blob/95d3e86522976061e856724867ebcaf75c4e9b60/src/nvc.c#L314
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is actually a typo. The module names should be nvidia, nvidia_uvm and nvidia_modeset.