Use native CDI in container runtimes when supported #1285
base: main
Changes from all commits: 9f867bb, 72756e7, 56998b2, 7a07a99, 78007bc, 0c440fb
@@ -180,6 +180,8 @@ const (
	// DriverInstallDirCtrPathEnvName is the name of the envvar used by the driver-validator to represent the path
	// of the driver install dir mounted in the container
	DriverInstallDirCtrPathEnvName = "DRIVER_INSTALL_DIR_CTR_PATH"
+	// NvidiaRuntimeSetAsDefaultEnvName is the name of the toolkit container env for configuring NVIDIA Container Runtime as the default runtime
+	NvidiaRuntimeSetAsDefaultEnvName = "NVIDIA_RUNTIME_SET_AS_DEFAULT"

Review comment on the added constant: Question. Out of scope: Should we sort these constants to allow us to find them more easily?

)

// ContainerProbe defines container probe types
@@ -939,7 +941,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	// update env required for MIG support
	applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)
@@ -1195,6 +1197,28 @@ func getProxyEnv(proxyConfig *apiconfigv1.Proxy) []corev1.EnvVar {
	return envVars
}

+func transformToolkitForCDI(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) {
+	if !config.CDI.IsEnabled() {
+		return
+	}
+
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CrioConfigModeEnvName, "config")
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi")
+
+	if !n.runtimeSupportsCDI {
+		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeCDIPrefixesEnvName, "nvidia.cdi.k8s.io/")
+	}
+
+	// When the container runtime supports CDI, we do not configure 'nvidia' as the default runtime.
+	// Instead, we leverage native CDI support in containerd / cri-o to inject GPUs into workloads.
+	// The 'nvidia' runtime will be set as the runtime class for our management containers so that they
+	// get access to all GPUs.
+	if n.runtimeSupportsCDI {
+		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaRuntimeSetAsDefaultEnvName, "false")
+	}
+}

Review thread on lines +1209 to +1219:
- So we're saying that we could pull this logic into the toolkit container instead?
- Yes, that is what I was suggesting. We would need to incorporate this logic in both the toolkit container and device-plugin actually since the configuration of both components is dependent on whether native-CDI is supported.

// TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy
func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
	// update validation container
@@ -1233,14 +1257,7 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n
	}

	// update env required for CDI support
-	if config.CDI.IsEnabled() {
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeCDIPrefixesEnvName, "nvidia.cdi.k8s.io/")
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CrioConfigModeEnvName, "config")
-		if config.CDI.IsDefault() {
-			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi")
-		}
-	}
+	transformToolkitForCDI(obj, config, n)

	// set install directory for the toolkit
	if config.Toolkit.InstallDir != "" && config.Toolkit.InstallDir != DefaultToolkitInstallDir {
@@ -1352,6 +1369,29 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
	return nil
}

+func transformDevicePluginForCDI(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) {
+	if !config.CDI.IsEnabled() {
+		return
+	}
+
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
+	if config.Toolkit.IsEnabled() {
+		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook"))
+	}
+
+	// When the container runtime supports CDI, we leverage native CDI support in containerd / cri-o
+	// to inject GPUs into workloads. If native CDI is not supported, we leverage CDI support in
+	// NVIDIA Container Toolkit.
+	deviceListStrategy := "cdi-cri"
+	cdiAnnotationPrefix := "cdi.k8s.io/"
+	if !n.runtimeSupportsCDI {
+		deviceListStrategy = "envvar,cdi-annotations"
+		cdiAnnotationPrefix = "nvidia.cdi.k8s.io/"
+	}
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, deviceListStrategy)
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, cdiAnnotationPrefix)
+}

// TransformDevicePlugin transforms k8s-device-plugin daemonset with required config as per ClusterPolicy
func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
	// update validation container
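The two device-list strategies selected above differ in how an allocated GPU is surfaced to the runtime. The following is a rough, illustrative sketch only (not code from this PR): it shows how the strategies roughly map onto the kubelet device-plugin API, where "cdi-cri" returns CDI device names that containerd / cri-o resolve natively, while "envvar,cdi-annotations" falls back to NVIDIA_VISIBLE_DEVICES plus a prefixed annotation that the NVIDIA Container Runtime resolves itself. The annotation key suffix ("devices") is an assumption made for illustration.

```go
package main

import (
	"fmt"

	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

// buildAllocateResponse sketches how each device-list strategy surfaces an
// allocated GPU in the kubelet Allocate response. The field names come from
// the upstream device-plugin API; the annotation key suffix is illustrative.
func buildAllocateResponse(strategy, annotationPrefix, deviceID string) *pluginapi.ContainerAllocateResponse {
	resp := &pluginapi.ContainerAllocateResponse{}
	switch strategy {
	case "cdi-cri":
		// Native path: the runtime resolves the fully-qualified CDI device name itself.
		resp.CDIDevices = []*pluginapi.CDIDevice{{Name: "nvidia.com/gpu=" + deviceID}}
	case "envvar,cdi-annotations":
		// Fallback path: NVIDIA Container Runtime reads the env var and the prefixed annotation.
		resp.Envs = map[string]string{"NVIDIA_VISIBLE_DEVICES": deviceID}
		resp.Annotations = map[string]string{annotationPrefix + "devices": "nvidia.com/gpu=" + deviceID}
	}
	return resp
}

func main() {
	fmt.Println(buildAllocateResponse("cdi-cri", "cdi.k8s.io/", "0"))
	fmt.Println(buildAllocateResponse("envvar,cdi-annotations", "nvidia.cdi.k8s.io/", "0"))
}
```

This also shows why the annotation prefix changes with the strategy: the unprefixed "cdi.k8s.io/" form is the one runtimes understand natively, while the "nvidia.cdi.k8s.io/" prefix keeps the annotations private to the NVIDIA runtime when native CDI is unavailable.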
@@ -1406,20 +1446,13 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	// update env required for MIG support
	applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)

	// update env required for CDI support
-	if config.CDI.IsEnabled() {
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, "envvar,cdi-annotations")
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, "nvidia.cdi.k8s.io/")
-		if config.Toolkit.IsEnabled() {
-			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook"))
-		}
-	}
+	transformDevicePluginForCDI(obj, config, n)

	// update MPS volumes and set MPS_ROOT env var if a custom MPS root is configured
	if config.DevicePlugin.MPS != nil && config.DevicePlugin.MPS.Root != "" &&
@@ -1494,7 +1527,7 @@ func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	// update env required for MIG support
	applyMIGConfiguration(mainContainer, config.MIG.Strategy)
@@ -1608,7 +1641,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	// mount configmap for custom metrics if provided by user
	if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" {
@@ -1725,7 +1758,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	return nil
}
@@ -1775,7 +1808,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	// set ConfigMap name for "mig-parted-config" Volume
	for i, vol := range obj.Spec.Template.Spec.Volumes {
@@ -2060,7 +2093,7 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	var validatorErr error
	// apply changes for individual component validators(initContainers)
@@ -2392,13 +2425,15 @@ func getRuntimeClass(config *gpuv1.ClusterPolicySpec) string {
	return DefaultRuntimeClass
}

-func setRuntimeClass(podSpec *corev1.PodSpec, runtime gpuv1.Runtime, runtimeClass string) {
-	if runtime == gpuv1.Containerd {
-		if runtimeClass == "" {
-			runtimeClass = DefaultRuntimeClass
-		}
-		podSpec.RuntimeClassName = &runtimeClass
-	}
-}
+func setRuntimeClass(podSpec *corev1.PodSpec, n ClusterPolicyController, runtimeClass string) {
+	if !n.singleton.Spec.CDI.IsEnabled() && n.runtime != gpuv1.Containerd {
+		return
+	}
+
+	if runtimeClass == "" {
+		runtimeClass = DefaultRuntimeClass
+	}
+	podSpec.RuntimeClassName = &runtimeClass
+}

Review thread on the new guard condition:
- Are we not also expecting to use a runtimeclass for …
- My intent was to retain the existing behavior when CDI is disabled. That is, use the hook for …
- One slightly unrelated comment is that this is incompatible with where we want to get with the NVIDIA Container Toolkit, and we should definitely look at transitioning to a config-based mechanism for CRI-O too.

func setContainerProbe(container *corev1.Container, probe *gpuv1.ContainerProbeSpec, probeType ContainerProbe) {
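To make the double-negative guard above easier to read, here is a small standalone sketch (reimplemented for illustration, not code from this PR) of when a RuntimeClassName ends up being set: always when CDI is enabled, and only for containerd when CDI is disabled, which preserves the pre-existing hook-based behavior for cri-o discussed in the thread.

```go
package main

import "fmt"

// wouldSetRuntimeClass mirrors the guard in setRuntimeClass above: the pod spec
// is left untouched only when CDI is disabled AND the detected runtime is not containerd.
func wouldSetRuntimeClass(cdiEnabled bool, runtime string) bool {
	if !cdiEnabled && runtime != "containerd" {
		return false
	}
	return true
}

func main() {
	for _, tc := range []struct {
		cdi     bool
		runtime string
	}{
		{true, "containerd"},
		{true, "cri-o"},
		{false, "containerd"},
		{false, "cri-o"},
	} {
		fmt.Printf("CDI=%-5v runtime=%-10s -> set RuntimeClassName: %v\n",
			tc.cdi, tc.runtime, wouldSetRuntimeClass(tc.cdi, tc.runtime))
	}
}
```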
@@ -161,10 +161,11 @@ type ClusterPolicyController struct {
	openshift        string
	ocpDriverToolkit OpenShiftDriverToolkit

-	runtime        gpuv1.Runtime
-	hasGPUNodes    bool
-	hasNFDLabels   bool
-	sandboxEnabled bool
+	runtime            gpuv1.Runtime
+	runtimeSupportsCDI bool
+	hasGPUNodes        bool
+	hasNFDLabels       bool
+	sandboxEnabled     bool
}

Review comment on lines +164 to +165: Is …

func addState(n *ClusterPolicyController, path string) {
@@ -580,7 +581,7 @@ func (n *ClusterPolicyController) labelGPUNodes() (bool, int, error) {
	return clusterHasNFDLabels, gpuNodesTotal, nil
}

-func getRuntimeString(node corev1.Node) (gpuv1.Runtime, error) {
+func getRuntimeVersionString(node corev1.Node) (gpuv1.Runtime, string, error) {
	// ContainerRuntimeVersion string will look like <runtime>://<x.y.z>
	runtimeVer := node.Status.NodeInfo.ContainerRuntimeVersion
	var runtime gpuv1.Runtime

@@ -592,9 +593,11 @@ func getRuntimeString(node corev1.Node) (gpuv1.Runtime, error) {
	case strings.HasPrefix(runtimeVer, "cri-o"):
		runtime = gpuv1.CRIO
	default:
-		return "", fmt.Errorf("runtime not recognized: %s", runtimeVer)
+		return "", "", fmt.Errorf("runtime not recognized: %s", runtimeVer)
	}
-	return runtime, nil
+	version := strings.SplitAfter(runtimeVer, "//")[1]
+	vVersion := "v" + strings.TrimPrefix(version, "v")
+	return runtime, vVersion, nil
}

func (n *ClusterPolicyController) setPodSecurityLabelsForNamespace() error {
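For reference, a minimal standalone sketch (not code from this PR) of how the reported ContainerRuntimeVersion strings parse and how the v1.7.0 gate applied further down in getContainerRuntimeInfo behaves. The sample version strings are made up; golang.org/x/mod/semver is the same comparison package the diff uses.

```go
package main

import (
	"fmt"
	"strings"

	"golang.org/x/mod/semver"
)

// parseContainerRuntimeVersion mirrors the parsing above: the kubelet reports
// node.Status.NodeInfo.ContainerRuntimeVersion as "<runtime>://<x.y.z>".
func parseContainerRuntimeVersion(runtimeVer string) (runtime, version string, err error) {
	parts := strings.SplitN(runtimeVer, "://", 2)
	if len(parts) != 2 {
		return "", "", fmt.Errorf("unexpected ContainerRuntimeVersion: %q", runtimeVer)
	}
	// golang.org/x/mod/semver requires a leading "v" on version strings.
	return parts[0], "v" + strings.TrimPrefix(parts[1], "v"), nil
}

func main() {
	for _, v := range []string{"containerd://1.6.33", "containerd://1.7.27", "cri-o://1.30.4"} {
		rt, ver, err := parseContainerRuntimeVersion(v)
		if err != nil {
			fmt.Println(err)
			continue
		}
		// Native CDI is assumed unless containerd older than 1.7.0 is detected,
		// matching the semver.Compare(version, "v1.7.0") < 0 check in the diff.
		supportsCDI := !(rt == "containerd" && semver.Compare(ver, "v1.7.0") < 0)
		fmt.Printf("%-22s runtime=%-10s version=%-8s nativeCDI=%t\n", v, rt, ver, supportsCDI)
	}
}
```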
@@ -706,13 +709,14 @@ func (n *ClusterPolicyController) ocpEnsureNamespaceMonitoring() error {
	return nil
}

-// getRuntime will detect the container runtime used by nodes in the
-// cluster and correctly set the value for clusterPolicyController.runtime
-// For openshift, set runtime to crio. Otherwise, the default runtime is
-// containerd -- if >=1 node is configured with containerd, set
-// clusterPolicyController.runtime = containerd
-func (n *ClusterPolicyController) getRuntime() error {
+// getContainerRuntimeInfo will detect the container runtime version used by nodes
+// in the cluster and correctly set the value for clusterPolicyController.runtime
+// and clusterPolicyController.runtimeSupportsCDI. On OpenShift, the runtime
+// is always assumed to be cri-o. We assume the runtime supports CDI unless
+// containerd < 1.7.0 is detected.
+func (n *ClusterPolicyController) getContainerRuntimeInfo() error {
	ctx := n.ctx
+	n.runtimeSupportsCDI = true
	// assume crio for openshift clusters
	if n.openshift != "" {
		n.runtime = gpuv1.CRIO
@@ -725,27 +729,26 @@ func (n *ClusterPolicyController) getRuntime() error {
	list := &corev1.NodeList{}
	err := n.client.List(ctx, list, opts...)
	if err != nil {
-		return fmt.Errorf("Unable to list nodes prior to checking container runtime: %v", err)
+		return fmt.Errorf("failed to list nodes: %w", err)
	}

	var runtime gpuv1.Runtime
-	for _, node := range list.Items {
-		rt, err := getRuntimeString(node)
+	for i, node := range list.Items {
+		rt, version, err := getRuntimeVersionString(node)
		if err != nil {
-			n.logger.Info(fmt.Sprintf("Unable to get runtime info for node %s: %v", node.Name, err))
-			continue
+			return fmt.Errorf("failed to get runtime info for node %s: %w", node.Name, err)
		}
-		runtime = rt
-		if runtime == gpuv1.Containerd {
-			// default to containerd if >=1 node running containerd
-			break
-		}
+		if i == 0 {
+			runtime = rt
+		} else if rt != runtime {
+			n.logger.Error(nil, "Different runtimes on different worker nodes is not supported")
+			return fmt.Errorf("different runtimes on different worker nodes is not supported")
+		}
+		if runtime == gpuv1.Containerd && semver.Compare(version, "v1.7.0") < 0 {
+			n.runtimeSupportsCDI = false
+		}
	}

-	if runtime.String() == "" {
-		n.logger.Info("Unable to get runtime info from the cluster, defaulting to containerd")
-		runtime = gpuv1.Containerd
-	}
	n.runtime = runtime
	return nil
}

Review thread on the semver.Compare check against v1.7.0:
- Is it possible to query the CRI API and see if CDI is enabled? Or a similar check? That would be a more robust check IMO. It's possible that we may have forks of containerd running (rehashes based on a vanilla containerd with different versioning). This check wouldn't yield the expected result in those cases.
- I think there was some talk about exposing this for use in DRA too, but it may not officially be part of the CRI. If I recall correctly, runtimes could send this information in the CRI status: https://github.com/containerd/containerd/blob/6f652853f01ef9ba340a860c2f39edf1701102d1/internal/cri/server/status.go#L34 and https://github.com/cri-o/cri-o/blob/02f3400b358159265d28a37df61be430404925e9/server/runtime_status.go#L15 I would be surprised if the …
- Assuming this information is not available via the CRI, is there any other way we could potentially get this information instead of checking version strings here in the controller?
- For containerd, one could do a config dump and check whether … via the CRI api. Not sure if this is visible in the context of the GPU Operator though.
- Ah, I see. What are your thoughts on adding this logic to the toolkit container? That is, the toolkit container would check if CDI is supported in containerd (by doing a config dump), and if supported it would ensure native-CDI is used for workloads by NOT configuring …
- I think that sounds reasonable. Do we just assume native CDI support for CRI-O? In this mode of operation we would: … For … Some questions: …
- (Reply, answering the questions above point by point:)
  - From the perspective of GPU Operator, I am comfortable saying yes since our next operator release will not support a K8s version that supports docker.
  - This is what I was envisioning, as CRI-O has supported CDI since 1.23.2. Since we are shortening our K8s support matrix to n-3 at the time of release, and CRI-O follows the K8s release cycle with respect to minor versions, I think it is relatively safe to always assume native-CDI support for CRI-O. But maybe I am overlooking something.
  - I propose triggering this behavior whenever …
  - I am assuming this is with respect to the device list strategy we configure in the plugin. I believe if we push the "native-CDI" detection logic to the toolkit container, we would have to do something similar in the plugin. That is, the behavior of the toolkit container and device-plugin when …

  I'll try to write up this proposal in more detail before the weekend.
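The thread above asks whether native CDI support could be probed via the CRI instead of comparing version strings. As a rough sketch of that idea only (it is not part of this PR): call the CRI Status endpoint with Verbose set and inspect the config blob containerd returns in the Info map, which is roughly what crictl info prints. The socket path and the "config" / "enableCDI" key names are assumptions here and may vary by runtime and version.

```go
package main

import (
	"context"
	"encoding/json"
	"fmt"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// probeNativeCDI asks the CRI runtime for its verbose status and looks for an
// "enableCDI" field in the config blob that containerd exposes under the
// "config" key of the Info map. Key and field names are assumptions based on
// containerd's verbose status output; cri-o would need its own handling.
func probeNativeCDI(ctx context.Context, endpoint string) (bool, error) {
	conn, err := grpc.NewClient(endpoint, grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		return false, fmt.Errorf("failed to connect to CRI endpoint %s: %w", endpoint, err)
	}
	defer conn.Close()

	client := runtimeapi.NewRuntimeServiceClient(conn)
	resp, err := client.Status(ctx, &runtimeapi.StatusRequest{Verbose: true})
	if err != nil {
		return false, fmt.Errorf("CRI Status call failed: %w", err)
	}

	cfgJSON, ok := resp.GetInfo()["config"]
	if !ok {
		return false, fmt.Errorf("verbose status did not include a config blob")
	}
	var cfg struct {
		EnableCDI bool `json:"enableCDI"`
	}
	if err := json.Unmarshal([]byte(cfgJSON), &cfg); err != nil {
		return false, fmt.Errorf("failed to parse runtime config: %w", err)
	}
	return cfg.EnableCDI, nil
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	// The endpoint is an assumption; the node's CRI socket would have to be mounted
	// into whichever component (controller or toolkit container) runs this probe.
	enabled, err := probeNativeCDI(ctx, "unix:///run/containerd/containerd.sock")
	if err != nil {
		fmt.Println("probe failed:", err)
		return
	}
	fmt.Println("runtime reports CDI enabled:", enabled)
}
```

As the thread notes, whether such a probe belongs in the controller or in the toolkit container (which already talks to the runtime on each node) is the open design question.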
@@ -868,11 +871,12 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP
	}

	// detect the container runtime on worker nodes
-	err = n.getRuntime()
+	err = n.getContainerRuntimeInfo()
	if err != nil {
-		return err
+		return fmt.Errorf("failed to get container runtime info: %w", err)
	}
	n.logger.Info(fmt.Sprintf("Using container runtime: %s", n.runtime.String()))
+	n.logger.Info(fmt.Sprintf("Container runtime supports CDI: %t", n.runtimeSupportsCDI))

	// fetch all kernel versions from the GPU nodes in the cluster
	if n.singleton.Spec.Driver.IsEnabled() && n.singleton.Spec.Driver.UsePrecompiledDrivers() {