@@ -45,6 +45,10 @@ const (
45
45
DefaultDockerConfigFile = "/etc/docker/daemon.json"
46
46
// DefaultDockerSocketFile indicates default docker socket file
47
47
DefaultDockerSocketFile = "/var/run/docker.sock"
48
+ // DefaultCRIOConfigFile indicates default config file path for cri-o.
49
+ // Note, config files in the drop-in directory, /etc/crio/crio.conf.d,
50
+ // have a higher priority than the default /etc/crio/crio.conf file.
51
+ DefaultCRIOConfigFile = "/etc/crio/crio.conf.d/99-nvidia.conf"
48
52
// TrustedCAConfigMapName indicates configmap with custom user CA injected
49
53
TrustedCAConfigMapName = "gpu-operator-trusted-ca"
50
54
// TrustedCABundleFileName indicates custom user ca certificate filename
@@ -111,18 +115,26 @@ const (
111
115
ServiceMonitorCRDName = "servicemonitors.monitoring.coreos.com"
112
116
// DefaultToolkitInstallDir is the default toolkit installation directory on the host
113
117
DefaultToolkitInstallDir = "/usr/local/nvidia"
118
+ // ToolkitInstallDirEnvName is the name of the toolkit container env for configuring where NVIDIA Container Toolkit is installed
119
+ ToolkitInstallDirEnvName = "ROOT"
114
120
// VgpuDMDefaultConfigMapName indicates name of ConfigMap containing default vGPU devices configuration
115
121
VgpuDMDefaultConfigMapName = "default-vgpu-devices-config"
116
122
// VgpuDMDefaultConfigName indicates name of default configuration in the vGPU devices config file
117
123
VgpuDMDefaultConfigName = "default"
118
124
// NvidiaCtrRuntimeModeEnvName is the name of the toolkit container env for configuring the NVIDIA Container Runtime mode
119
125
NvidiaCtrRuntimeModeEnvName = "NVIDIA_CONTAINER_RUNTIME_MODE"
126
+ // NvidiaCtrRuntimeCDIPrefixesEnvName is the name of the toolkit container env for configuring the CDI annotation prefixes
127
+ NvidiaCtrRuntimeCDIPrefixesEnvName = "NVIDIA_CONTAINER_RUNTIME_MODES_CDI_ANNOTATION_PREFIXES"
120
128
// CDIEnabledEnvName is the name of the envvar used to enable CDI in the operands
121
129
CDIEnabledEnvName = "CDI_ENABLED"
122
130
// NvidiaCTKPathEnvName is the name of the envvar specifying the path to the 'nvidia-ctk' binary
123
131
NvidiaCTKPathEnvName = "NVIDIA_CTK_PATH"
124
132
// CrioConfigModeEnvName is the name of the envvar controlling how the toolkit container updates the cri-o configuration
125
133
CrioConfigModeEnvName = "CRIO_CONFIG_MODE"
134
+ // DeviceListStrategyEnvName is the name of the envvar for configuring the device-list-strategy in the device-plugin
135
+ DeviceListStrategyEnvName = "DEVICE_LIST_STRATEGY"
136
+ // CDIAnnotationPrefixEnvName is the name of the device-plugin envvar for configuring the CDI annotation prefix
137
+ CDIAnnotationPrefixEnvName = "CDI_ANNOTATION_PREFIX"
126
138
)
127
139
128
140
// RepoConfigPathMap indicates standard OS specific paths for repository configuration files
@@ -1047,6 +1059,7 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n
1047
1059
// update env required for CDI support
1048
1060
if config .CDI .IsEnabled () {
1049
1061
setContainerEnv (& (obj .Spec .Template .Spec .Containers [0 ]), CDIEnabledEnvName , "true" )
1062
+ setContainerEnv (& (obj .Spec .Template .Spec .Containers [0 ]), NvidiaCtrRuntimeCDIPrefixesEnvName , "nvidia.cdi.k8s.io/" )
1050
1063
setContainerEnv (& (obj .Spec .Template .Spec .Containers [0 ]), CrioConfigModeEnvName , "config" )
1051
1064
if config .CDI .IsDefault () {
1052
1065
setContainerEnv (& (obj .Spec .Template .Spec .Containers [0 ]), NvidiaCtrRuntimeModeEnvName , "cdi" )
@@ -1055,13 +1068,8 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n
1055
1068
1056
1069
// set install directory for the toolkit
1057
1070
if config .Toolkit .InstallDir != "" && config .Toolkit .InstallDir != DefaultToolkitInstallDir {
1058
- // set args for the toolkit
1059
- toolkitArgStrFmt := "[[ -f /run/nvidia/validations/host-driver-ready ]] && driver_root=/ || driver_root=/run/nvidia/driver; export NVIDIA_DRIVER_ROOT=$driver_root; sleep 5; exec nvidia-toolkit %s"
1060
- toolkitArg := fmt .Sprintf (toolkitArgStrFmt , config .Toolkit .InstallDir )
1061
- args := []string {toolkitArg }
1062
- obj .Spec .Template .Spec .Containers [0 ].Args = args
1071
+ setContainerEnv (& (obj .Spec .Template .Spec .Containers [0 ]), ToolkitInstallDirEnvName , config .Toolkit .InstallDir )
1063
1072
1064
- // update install directory for the toolkit
1065
1073
for i , volume := range obj .Spec .Template .Spec .Volumes {
1066
1074
if volume .Name == "toolkit-install-dir" {
1067
1075
obj .Spec .Template .Spec .Volumes [i ].HostPath .Path = config .Toolkit .InstallDir
@@ -1086,27 +1094,30 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n
1086
1094
setContainerEnv (& (obj .Spec .Template .Spec .Containers [0 ]), "CONTAINERD_RUNTIME_CLASS" , getRuntimeClass (config ))
1087
1095
}
1088
1096
1089
- // setup mounts for runtime config file and socket file
1090
- if runtime == gpuv1 . Docker . String () || runtime == gpuv1 . Containerd . String () {
1091
- runtimeConfigFile := getRuntimeConfigFile ( & ( obj . Spec . Template . Spec . Containers [ 0 ]), runtime )
1092
- runtimeSocketFile := getRuntimeSocketFile ( & ( obj . Spec . Template . Spec . Containers [ 0 ]), runtime )
1093
-
1094
- sourceSocketFileName := path .Base (runtimeSocketFile )
1095
- sourceConfigFileName := path . Base ( runtimeConfigFile )
1097
+ // setup mounts for runtime config file
1098
+ runtimeConfigFile , err := getRuntimeConfigFile ( & ( obj . Spec . Template . Spec . Containers [ 0 ]), runtime )
1099
+ if err != nil {
1100
+ return fmt . Errorf ( "error getting path to runtime config file: %v" , err )
1101
+ }
1102
+ sourceConfigFileName := path .Base (runtimeConfigFile )
1103
+ runtimeArgs := "--config " + DefaultRuntimeConfigTargetDir + sourceConfigFileName
1096
1104
1097
- // docker needs socket file as runtime arg
1098
- setContainerEnv ( & ( obj . Spec . Template . Spec . Containers [ 0 ]), "RUNTIME_ARGS" ,
1099
- "--socket " + DefaultRuntimeSocketTargetDir + sourceSocketFileName + " --config " + DefaultRuntimeConfigTargetDir + sourceConfigFileName )
1105
+ volMountConfigName := fmt . Sprintf ( "%s-config" , runtime )
1106
+ volMountConfig := corev1. VolumeMount { Name : volMountConfigName , MountPath : DefaultRuntimeConfigTargetDir }
1107
+ obj . Spec . Template . Spec . Containers [ 0 ]. VolumeMounts = append ( obj . Spec . Template . Spec . Containers [ 0 ]. VolumeMounts , volMountConfig )
1100
1108
1101
- // setup config file mount
1102
- volMountConfigName := fmt .Sprintf ("%s-config" , runtime )
1103
- volMountConfig := corev1.VolumeMount {Name : volMountConfigName , MountPath : DefaultRuntimeConfigTargetDir }
1104
- obj .Spec .Template .Spec .Containers [0 ].VolumeMounts = append (obj .Spec .Template .Spec .Containers [0 ].VolumeMounts , volMountConfig )
1109
+ configVol := corev1.Volume {Name : volMountConfigName , VolumeSource : corev1.VolumeSource {HostPath : & corev1.HostPathVolumeSource {Path : path .Dir (runtimeConfigFile ), Type : newHostPathType (corev1 .HostPathDirectoryOrCreate )}}}
1110
+ obj .Spec .Template .Spec .Volumes = append (obj .Spec .Template .Spec .Volumes , configVol )
1105
1111
1106
- configVol := corev1.Volume {Name : volMountConfigName , VolumeSource : corev1.VolumeSource {HostPath : & corev1.HostPathVolumeSource {Path : path .Dir (runtimeConfigFile )}}}
1107
- obj .Spec .Template .Spec .Volumes = append (obj .Spec .Template .Spec .Volumes , configVol )
1112
+ // setup mounts for runtime socket file
1113
+ if runtime == gpuv1 .Docker .String () || runtime == gpuv1 .Containerd .String () {
1114
+ runtimeSocketFile , err := getRuntimeSocketFile (& (obj .Spec .Template .Spec .Containers [0 ]), runtime )
1115
+ if err != nil {
1116
+ return fmt .Errorf ("error getting path to runtime socket: %v" , err )
1117
+ }
1118
+ sourceSocketFileName := path .Base (runtimeSocketFile )
1119
+ runtimeArgs += " --socket " + DefaultRuntimeSocketTargetDir + sourceSocketFileName
1108
1120
1109
- // setup socket file mount
1110
1121
volMountSocketName := fmt .Sprintf ("%s-socket" , runtime )
1111
1122
volMountSocket := corev1.VolumeMount {Name : volMountSocketName , MountPath : DefaultRuntimeSocketTargetDir }
1112
1123
obj .Spec .Template .Spec .Containers [0 ].VolumeMounts = append (obj .Spec .Template .Spec .Containers [0 ].VolumeMounts , volMountSocket )
@@ -1115,6 +1126,9 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n
1115
1126
obj .Spec .Template .Spec .Volumes = append (obj .Spec .Template .Spec .Volumes , socketVol )
1116
1127
}
1117
1128
1129
+ // update runtime args
1130
+ setContainerEnv (& (obj .Spec .Template .Spec .Containers [0 ]), "RUNTIME_ARGS" , runtimeArgs )
1131
+
1118
1132
// Update CRI-O hooks path to use default path for non OCP cases
1119
1133
if n .openshift == "" && n .runtime == gpuv1 .CRIO {
1120
1134
for index , volume := range obj .Spec .Template .Spec .Volumes {
@@ -1187,6 +1201,8 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
1187
1201
// update env required for CDI support
1188
1202
if config .CDI .IsEnabled () {
1189
1203
setContainerEnv (& (obj .Spec .Template .Spec .Containers [0 ]), CDIEnabledEnvName , "true" )
1204
+ setContainerEnv (& (obj .Spec .Template .Spec .Containers [0 ]), DeviceListStrategyEnvName , "envvar,cdi-annotations" )
1205
+ setContainerEnv (& (obj .Spec .Template .Spec .Containers [0 ]), CDIAnnotationPrefixEnvName , "nvidia.cdi.k8s.io/" )
1190
1206
if config .Toolkit .IsEnabled () {
1191
1207
setContainerEnv (& (obj .Spec .Template .Spec .Containers [0 ]), NvidiaCTKPathEnvName , filepath .Join (config .Toolkit .InstallDir , "toolkit/nvidia-ctk" ))
1192
1208
}
@@ -1506,6 +1522,11 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
1506
1522
break
1507
1523
}
1508
1524
1525
+ // update env required for CDI support
1526
+ if config .CDI .IsEnabled () {
1527
+ setContainerEnv (& (obj .Spec .Template .Spec .Containers [0 ]), CDIEnabledEnvName , "true" )
1528
+ }
1529
+
1509
1530
return nil
1510
1531
}
1511
1532
@@ -1856,35 +1877,50 @@ func TransformNodeStatusExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
1856
1877
}
1857
1878
1858
1879
// get runtime(docker, containerd) config file path based on toolkit container env or default
1859
- func getRuntimeConfigFile (c * corev1.Container , runtime string ) (runtimeConfigFile string ) {
1860
- if runtime == gpuv1 .Docker .String () {
1880
+ func getRuntimeConfigFile (c * corev1.Container , runtime string ) (string , error ) {
1881
+ var runtimeConfigFile string
1882
+ switch runtime {
1883
+ case gpuv1 .Docker .String ():
1861
1884
runtimeConfigFile = DefaultDockerConfigFile
1862
- if getContainerEnv (c , "DOCKER_CONFIG" ) != "" {
1863
- runtimeConfigFile = getContainerEnv ( c , "DOCKER_CONFIG" )
1885
+ if value := getContainerEnv (c , "DOCKER_CONFIG" ); value != "" {
1886
+ runtimeConfigFile = value
1864
1887
}
1865
- } else if runtime == gpuv1 .Containerd .String () {
1888
+ case gpuv1 .Containerd .String ():
1866
1889
runtimeConfigFile = DefaultContainerdConfigFile
1867
- if getContainerEnv (c , "CONTAINERD_CONFIG" ) != "" {
1868
- runtimeConfigFile = getContainerEnv ( c , "CONTAINERD_CONFIG" )
1890
+ if value := getContainerEnv (c , "CONTAINERD_CONFIG" ); value != "" {
1891
+ runtimeConfigFile = value
1869
1892
}
1893
+ case gpuv1 .CRIO .String ():
1894
+ runtimeConfigFile = DefaultCRIOConfigFile
1895
+ if value := getContainerEnv (c , "CRIO_CONFIG" ); value != "" {
1896
+ runtimeConfigFile = value
1897
+ }
1898
+ default :
1899
+ return "" , fmt .Errorf ("invalid runtime: %s" , runtime )
1870
1900
}
1871
- return runtimeConfigFile
1901
+
1902
+ return runtimeConfigFile , nil
1872
1903
}
1873
1904
1874
1905
// get runtime(docker, containerd) socket file path based on toolkit container env or default
1875
- func getRuntimeSocketFile (c * corev1.Container , runtime string ) (runtimeSocketFile string ) {
1876
- if runtime == gpuv1 .Docker .String () {
1906
+ func getRuntimeSocketFile (c * corev1.Container , runtime string ) (string , error ) {
1907
+ var runtimeSocketFile string
1908
+ switch runtime {
1909
+ case gpuv1 .Docker .String ():
1877
1910
runtimeSocketFile = DefaultDockerSocketFile
1878
1911
if getContainerEnv (c , "DOCKER_SOCKET" ) != "" {
1879
1912
runtimeSocketFile = getContainerEnv (c , "DOCKER_SOCKET" )
1880
1913
}
1881
- } else if runtime == gpuv1 .Containerd .String () {
1914
+ case gpuv1 .Containerd .String ():
1882
1915
runtimeSocketFile = DefaultContainerdSocketFile
1883
1916
if getContainerEnv (c , "CONTAINERD_SOCKET" ) != "" {
1884
1917
runtimeSocketFile = getContainerEnv (c , "CONTAINERD_SOCKET" )
1885
1918
}
1919
+ default :
1920
+ return "" , fmt .Errorf ("invalid runtime: %s" , runtime )
1886
1921
}
1887
- return runtimeSocketFile
1922
+
1923
+ return runtimeSocketFile , nil
1888
1924
}
1889
1925
1890
1926
func getContainerEnv (c * corev1.Container , key string ) string {
0 commit comments