Use native CDI in container runtimes when supported #1285
base: main
Changes from all commits: 9f867bb, 72756e7, 56998b2, 7a07a99, 78007bc, 0c440fb
@@ -180,6 +180,8 @@ const (
	// DriverInstallDirCtrPathEnvName is the name of the envvar used by the driver-validator to represent the path
	// of the driver install dir mounted in the container
	DriverInstallDirCtrPathEnvName = "DRIVER_INSTALL_DIR_CTR_PATH"
+	// NvidiaRuntimeSetAsDefaultEnvName is the name of the toolkit container env for configuring NVIDIA Container Runtime as the default runtime
+	NvidiaRuntimeSetAsDefaultEnvName = "NVIDIA_RUNTIME_SET_AS_DEFAULT"

Review comment on the added constant: Question. Out of scope: Should we sort these constants to allow us to find them more easily?

)

// ContainerProbe defines container probe types
@@ -939,7 +941,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	// update env required for MIG support
	applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)
@@ -1195,6 +1197,28 @@ func getProxyEnv(proxyConfig *apiconfigv1.Proxy) []corev1.EnvVar {
	return envVars
}

+func transformToolkitForCDI(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) {
+	if !config.CDI.IsEnabled() {
+		return
+	}
+
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CrioConfigModeEnvName, "config")
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi")
+
+	if !n.runtimeSupportsCDI {
+		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeCDIPrefixesEnvName, "nvidia.cdi.k8s.io/")
+	}
+
+	// When the container runtime supports CDI, we do not configure 'nvidia' as the default runtime.
+	// Instead, we leverage native CDI support in containerd / cri-o to inject GPUs into workloads.
+	// The 'nvidia' runtime will be set as the runtime class for our management containers so that they
+	// get access to all GPUs.
+	if n.runtimeSupportsCDI {
+		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaRuntimeSetAsDefaultEnvName, "false")
+	}
+}

Review thread on lines +1209 to +1219:
- So we're saying that we could pull this logic into the toolkit container instead?
- Yes, that is what I was suggesting. We would need to incorporate this logic in both the toolkit container and device-plugin actually since the configuration of both components is dependent on whether native-CDI is supported.

// TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy
func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
	// update validation container
@@ -1233,14 +1257,7 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n
	}

	// update env required for CDI support
-	if config.CDI.IsEnabled() {
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeCDIPrefixesEnvName, "nvidia.cdi.k8s.io/")
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CrioConfigModeEnvName, "config")
-		if config.CDI.IsDefault() {
-			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi")
-		}
-	}
+	transformToolkitForCDI(obj, config, n)

	// set install directory for the toolkit
	if config.Toolkit.InstallDir != "" && config.Toolkit.InstallDir != DefaultToolkitInstallDir {
@@ -1352,6 +1369,29 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
	return nil
}

+func transformDevicePluginForCDI(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) {
+	if !config.CDI.IsEnabled() {
+		return
+	}
+
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
+	if config.Toolkit.IsEnabled() {
+		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook"))
+	}
+
+	// When the container runtime supports CDI, we leverage native CDI support in containerd / cri-o
+	// to inject GPUs into workloads. If native CDI is not supported, we leverage CDI support in
+	// NVIDIA Container Toolkit.
+	deviceListStrategy := "cdi-cri"
+	cdiAnnotationPrefix := "cdi.k8s.io/"
+	if !n.runtimeSupportsCDI {
+		deviceListStrategy = "envvar,cdi-annotations"
+		cdiAnnotationPrefix = "nvidia.cdi.k8s.io/"
+	}
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, deviceListStrategy)
+	setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, cdiAnnotationPrefix)
+}

// TransformDevicePlugin transforms k8s-device-plugin daemonset with required config as per ClusterPolicy
func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
	// update validation container
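The two device-list strategies selected above differ in how an allocated GPU is surfaced to the runtime. The following is a rough, illustrative sketch only (not code from this PR): it shows how the strategies roughly map onto the kubelet device-plugin API, where "cdi-cri" returns CDI device names that containerd / cri-o resolve natively, while "envvar,cdi-annotations" falls back to NVIDIA_VISIBLE_DEVICES plus a prefixed annotation that the NVIDIA Container Runtime resolves itself. The annotation key suffix ("devices") is an assumption made for illustration.

```go
package main

import (
	"fmt"

	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

// buildAllocateResponse sketches how each device-list strategy surfaces an
// allocated GPU in the kubelet Allocate response. The field names come from
// the upstream device-plugin API; the annotation key suffix is illustrative.
func buildAllocateResponse(strategy, annotationPrefix, deviceID string) *pluginapi.ContainerAllocateResponse {
	resp := &pluginapi.ContainerAllocateResponse{}
	switch strategy {
	case "cdi-cri":
		// Native path: the runtime resolves the fully-qualified CDI device name itself.
		resp.CDIDevices = []*pluginapi.CDIDevice{{Name: "nvidia.com/gpu=" + deviceID}}
	case "envvar,cdi-annotations":
		// Fallback path: NVIDIA Container Runtime reads the env var and the prefixed annotation.
		resp.Envs = map[string]string{"NVIDIA_VISIBLE_DEVICES": deviceID}
		resp.Annotations = map[string]string{annotationPrefix + "devices": "nvidia.com/gpu=" + deviceID}
	}
	return resp
}

func main() {
	fmt.Println(buildAllocateResponse("cdi-cri", "cdi.k8s.io/", "0"))
	fmt.Println(buildAllocateResponse("envvar,cdi-annotations", "nvidia.cdi.k8s.io/", "0"))
}
```

This also shows why the annotation prefix changes with the strategy: the unprefixed "cdi.k8s.io/" form is the one runtimes understand natively, while the "nvidia.cdi.k8s.io/" prefix keeps the annotations private to the NVIDIA runtime when native CDI is unavailable.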
@@ -1406,20 +1446,13 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	// update env required for MIG support
	applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)

	// update env required for CDI support
-	if config.CDI.IsEnabled() {
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, "envvar,cdi-annotations")
-		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, "nvidia.cdi.k8s.io/")
-		if config.Toolkit.IsEnabled() {
-			setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook"))
-		}
-	}
+	transformDevicePluginForCDI(obj, config, n)

	// update MPS volumes and set MPS_ROOT env var if a custom MPS root is configured
	if config.DevicePlugin.MPS != nil && config.DevicePlugin.MPS.Root != "" &&
@@ -1494,7 +1527,7 @@ func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	// update env required for MIG support
	applyMIGConfiguration(mainContainer, config.MIG.Strategy)
@@ -1608,7 +1641,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	// mount configmap for custom metrics if provided by user
	if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" {
@@ -1725,7 +1758,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	return nil
}
@@ -1775,7 +1808,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	// set ConfigMap name for "mig-parted-config" Volume
	for i, vol := range obj.Spec.Template.Spec.Volumes {
@@ -2060,7 +2093,7 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
	}

	// set RuntimeClass for supported runtimes
-	setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
+	setRuntimeClass(&obj.Spec.Template.Spec, n, config.Operator.RuntimeClass)

	var validatorErr error
	// apply changes for individual component validators(initContainers)
@@ -2392,13 +2425,15 @@ func getRuntimeClass(config *gpuv1.ClusterPolicySpec) string {
	return DefaultRuntimeClass
}

-func setRuntimeClass(podSpec *corev1.PodSpec, runtime gpuv1.Runtime, runtimeClass string) {
-	if runtime == gpuv1.Containerd {
-		if runtimeClass == "" {
-			runtimeClass = DefaultRuntimeClass
-		}
-		podSpec.RuntimeClassName = &runtimeClass
-	}
-}
+func setRuntimeClass(podSpec *corev1.PodSpec, n ClusterPolicyController, runtimeClass string) {
+	if !n.singleton.Spec.CDI.IsEnabled() && n.runtime != gpuv1.Containerd {
+		return
+	}
+
+	if runtimeClass == "" {
+		runtimeClass = DefaultRuntimeClass
+	}
+	podSpec.RuntimeClassName = &runtimeClass
+}

Review thread on the new guard condition:
- Are we not also expecting to use a runtimeclass for …
- My intent was to retain the existing behavior when CDI is disabled. That is, use the hook for …
- One slightly unrelated comment is that this is incompatible with where we want to get with the NVIDIA Container Toolkit, and we should definitely look at transitioning to a config-based mechanism for CRI-O too.

func setContainerProbe(container *corev1.Container, probe *gpuv1.ContainerProbeSpec, probeType ContainerProbe) {
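To make the double-negative guard above easier to read, here is a small standalone sketch (reimplemented for illustration, not code from this PR) of when a RuntimeClassName ends up being set: always when CDI is enabled, and only for containerd when CDI is disabled, which preserves the pre-existing hook-based behavior for cri-o discussed in the thread.

```go
package main

import "fmt"

// wouldSetRuntimeClass mirrors the guard in setRuntimeClass above: the pod spec
// is left untouched only when CDI is disabled AND the detected runtime is not containerd.
func wouldSetRuntimeClass(cdiEnabled bool, runtime string) bool {
	if !cdiEnabled && runtime != "containerd" {
		return false
	}
	return true
}

func main() {
	for _, tc := range []struct {
		cdi     bool
		runtime string
	}{
		{true, "containerd"},
		{true, "cri-o"},
		{false, "containerd"},
		{false, "cri-o"},
	} {
		fmt.Printf("CDI=%-5v runtime=%-10s -> set RuntimeClassName: %v\n",
			tc.cdi, tc.runtime, wouldSetRuntimeClass(tc.cdi, tc.runtime))
	}
}
```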
@@ -161,10 +161,11 @@ type ClusterPolicyController struct {
	openshift        string
	ocpDriverToolkit OpenShiftDriverToolkit

-	runtime        gpuv1.Runtime
-	hasGPUNodes    bool
-	hasNFDLabels   bool
-	sandboxEnabled bool
+	runtime            gpuv1.Runtime
+	runtimeSupportsCDI bool
+	hasGPUNodes        bool
+	hasNFDLabels       bool
+	sandboxEnabled     bool
}

Review comment on lines +164 to +165: Is …

func addState(n *ClusterPolicyController, path string) {
@@ -580,7 +581,7 @@ func (n *ClusterPolicyController) labelGPUNodes() (bool, int, error) {
	return clusterHasNFDLabels, gpuNodesTotal, nil
}

-func getRuntimeString(node corev1.Node) (gpuv1.Runtime, error) {
+func getRuntimeVersionString(node corev1.Node) (gpuv1.Runtime, string, error) {
	// ContainerRuntimeVersion string will look like <runtime>://<x.y.z>
	runtimeVer := node.Status.NodeInfo.ContainerRuntimeVersion
	var runtime gpuv1.Runtime

@@ -592,9 +593,11 @@ func getRuntimeString(node corev1.Node) (gpuv1.Runtime, error) {
	case strings.HasPrefix(runtimeVer, "cri-o"):
		runtime = gpuv1.CRIO
	default:
-		return "", fmt.Errorf("runtime not recognized: %s", runtimeVer)
+		return "", "", fmt.Errorf("runtime not recognized: %s", runtimeVer)
	}
-	return runtime, nil
+	version := strings.SplitAfter(runtimeVer, "//")[1]
+	vVersion := "v" + strings.TrimPrefix(version, "v")
+	return runtime, vVersion, nil
}

func (n *ClusterPolicyController) setPodSecurityLabelsForNamespace() error {
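For reference, a minimal standalone sketch (not code from this PR) of how the reported ContainerRuntimeVersion strings parse and how the v1.7.0 gate applied further down in getContainerRuntimeInfo behaves. The sample version strings are made up; golang.org/x/mod/semver is the same comparison package the diff uses.

```go
package main

import (
	"fmt"
	"strings"

	"golang.org/x/mod/semver"
)

// parseContainerRuntimeVersion mirrors the parsing above: the kubelet reports
// node.Status.NodeInfo.ContainerRuntimeVersion as "<runtime>://<x.y.z>".
func parseContainerRuntimeVersion(runtimeVer string) (runtime, version string, err error) {
	parts := strings.SplitN(runtimeVer, "://", 2)
	if len(parts) != 2 {
		return "", "", fmt.Errorf("unexpected ContainerRuntimeVersion: %q", runtimeVer)
	}
	// golang.org/x/mod/semver requires a leading "v" on version strings.
	return parts[0], "v" + strings.TrimPrefix(parts[1], "v"), nil
}

func main() {
	for _, v := range []string{"containerd://1.6.33", "containerd://1.7.27", "cri-o://1.30.4"} {
		rt, ver, err := parseContainerRuntimeVersion(v)
		if err != nil {
			fmt.Println(err)
			continue
		}
		// Native CDI is assumed unless containerd older than 1.7.0 is detected,
		// matching the semver.Compare(version, "v1.7.0") < 0 check in the diff.
		supportsCDI := !(rt == "containerd" && semver.Compare(ver, "v1.7.0") < 0)
		fmt.Printf("%-22s runtime=%-10s version=%-8s nativeCDI=%t\n", v, rt, ver, supportsCDI)
	}
}
```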
@@ -706,13 +709,14 @@ func (n *ClusterPolicyController) ocpEnsureNamespaceMonitoring() error {
	return nil
}

-// getRuntime will detect the container runtime used by nodes in the
-// cluster and correctly set the value for clusterPolicyController.runtime
-// For openshift, set runtime to crio. Otherwise, the default runtime is
-// containerd -- if >=1 node is configured with containerd, set
-// clusterPolicyController.runtime = containerd
-func (n *ClusterPolicyController) getRuntime() error {
+// getContainerRuntimeInfo will detect the container runtime version used by nodes
+// in the cluster and correctly set the value for clusterPolicyController.runtime
+// and clusterPolicyController.runtimeSupportsCDI. On OpenShift, the runtime
+// is always assumed to be cri-o. We assume the runtime supports CDI unless
+// containerd < 1.7.0 is detected.
+func (n *ClusterPolicyController) getContainerRuntimeInfo() error {
	ctx := n.ctx
+	n.runtimeSupportsCDI = true
	// assume crio for openshift clusters
	if n.openshift != "" {
		n.runtime = gpuv1.CRIO
@@ -725,27 +729,26 @@ func (n *ClusterPolicyController) getRuntime() error {
	list := &corev1.NodeList{}
	err := n.client.List(ctx, list, opts...)
	if err != nil {
-		return fmt.Errorf("Unable to list nodes prior to checking container runtime: %v", err)
+		return fmt.Errorf("failed to list nodes: %w", err)
	}

	var runtime gpuv1.Runtime
-	for _, node := range list.Items {
-		rt, err := getRuntimeString(node)
+	for i, node := range list.Items {
+		rt, version, err := getRuntimeVersionString(node)
		if err != nil {
-			n.logger.Info(fmt.Sprintf("Unable to get runtime info for node %s: %v", node.Name, err))
-			continue
+			return fmt.Errorf("failed to get runtime info for node %s: %w", node.Name, err)
		}
-		runtime = rt
-		if runtime == gpuv1.Containerd {
-			// default to containerd if >=1 node running containerd
-			break
-		}
+		if i == 0 {
+			runtime = rt
+		} else if rt != runtime {
+			n.logger.Error(nil, "Different runtimes on different worker nodes is not supported")
+			return fmt.Errorf("different runtimes on different worker nodes is not supported")
+		}
+		if runtime == gpuv1.Containerd && semver.Compare(version, "v1.7.0") < 0 {
+			n.runtimeSupportsCDI = false
+		}
	}

-	if runtime.String() == "" {
-		n.logger.Info("Unable to get runtime info from the cluster, defaulting to containerd")
-		runtime = gpuv1.Containerd
-	}
	n.runtime = runtime
	return nil
}

Review thread on the semver.Compare check against v1.7.0:
- Is it possible to query the CRI API and see if CDI is enabled? Or a similar check? That would be a more robust check IMO. It's possible that we may have forks of containerd running (rehashes based on a vanilla containerd with different versioning). This check wouldn't yield the expected result in those cases.
- I think there was some talk about exposing this for use in DRA too, but it may not officially be part of the CRI. If I recall correctly, runtimes could send this information in the CRI status: https://github.com/containerd/containerd/blob/6f652853f01ef9ba340a860c2f39edf1701102d1/internal/cri/server/status.go#L34 and https://github.com/cri-o/cri-o/blob/02f3400b358159265d28a37df61be430404925e9/server/runtime_status.go#L15 I would be surprised if the …
- Assuming this information is not available via the CRI, is there any other way we could potentially get this information instead of checking version strings here in the controller?
- For containerd, one could do a config dump and check whether … via the CRI api. Not sure if this is visible in the context of the GPU Operator though.
- Ah, I see. What are your thoughts on adding this logic to the toolkit container? That is, the toolkit container would check if CDI is supported in containerd (by doing a config dump), and if supported it would ensure native-CDI is used for workloads by NOT configuring …
- I think that sounds reasonable. Do we just assume native CDI support for CRI-O? In this mode of operation we would: … For … Some questions: …
- (Reply, answering the questions above point by point:)
  - From the perspective of GPU Operator, I am comfortable saying yes since our next operator release will not support a K8s version that supports docker.
  - This is what I was envisioning, as CRI-O has supported CDI since 1.23.2. Since we are shortening our K8s support matrix to n-3 at the time of release, and CRI-O follows the K8s release cycle with respect to minor versions, I think it is relatively safe to always assume native-CDI support for CRI-O. But maybe I am overlooking something.
  - I propose triggering this behavior whenever …
  - I am assuming this is with respect to the device list strategy we configure in the plugin. I believe if we push the "native-CDI" detection logic to the toolkit container, we would have to do something similar in the plugin. That is, the behavior of the toolkit container and device-plugin when …

  I'll try to write up this proposal in more detail before the weekend.
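The thread above asks whether native CDI support could be probed via the CRI instead of comparing version strings. As a rough sketch of that idea only (it is not part of this PR): call the CRI Status endpoint with Verbose set and inspect the config blob containerd returns in the Info map, which is roughly what crictl info prints. The socket path and the "config" / "enableCDI" key names are assumptions here and may vary by runtime and version.

```go
package main

import (
	"context"
	"encoding/json"
	"fmt"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// probeNativeCDI asks the CRI runtime for its verbose status and looks for an
// "enableCDI" field in the config blob that containerd exposes under the
// "config" key of the Info map. Key and field names are assumptions based on
// containerd's verbose status output; cri-o would need its own handling.
func probeNativeCDI(ctx context.Context, endpoint string) (bool, error) {
	conn, err := grpc.NewClient(endpoint, grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		return false, fmt.Errorf("failed to connect to CRI endpoint %s: %w", endpoint, err)
	}
	defer conn.Close()

	client := runtimeapi.NewRuntimeServiceClient(conn)
	resp, err := client.Status(ctx, &runtimeapi.StatusRequest{Verbose: true})
	if err != nil {
		return false, fmt.Errorf("CRI Status call failed: %w", err)
	}

	cfgJSON, ok := resp.GetInfo()["config"]
	if !ok {
		return false, fmt.Errorf("verbose status did not include a config blob")
	}
	var cfg struct {
		EnableCDI bool `json:"enableCDI"`
	}
	if err := json.Unmarshal([]byte(cfgJSON), &cfg); err != nil {
		return false, fmt.Errorf("failed to parse runtime config: %w", err)
	}
	return cfg.EnableCDI, nil
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	// The endpoint is an assumption; the node's CRI socket would have to be mounted
	// into whichever component (controller or toolkit container) runs this probe.
	enabled, err := probeNativeCDI(ctx, "unix:///run/containerd/containerd.sock")
	if err != nil {
		fmt.Println("probe failed:", err)
		return
	}
	fmt.Println("runtime reports CDI enabled:", enabled)
}
```

As the thread notes, whether such a probe belongs in the controller or in the toolkit container (which already talks to the runtime on each node) is the open design question.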
@@ -868,11 +871,12 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP
	}

	// detect the container runtime on worker nodes
-	err = n.getRuntime()
+	err = n.getContainerRuntimeInfo()
	if err != nil {
-		return err
+		return fmt.Errorf("failed to get container runtime info: %w", err)
	}
	n.logger.Info(fmt.Sprintf("Using container runtime: %s", n.runtime.String()))
+	n.logger.Info(fmt.Sprintf("Container runtime supports CDI: %t", n.runtimeSupportsCDI))

	// fetch all kernel versions from the GPU nodes in the cluster
	if n.singleton.Spec.Driver.IsEnabled() && n.singleton.Spec.Driver.UsePrecompiledDrivers() {