@@ -77,6 +77,10 @@ const (
77
77
DCGMDefaultHostPort = 5555
78
78
// GPUDirectRDMAEnabledEnvName indicates if GPU direct RDMA is enabled through GPU operator
79
79
GPUDirectRDMAEnabledEnvName = "GPU_DIRECT_RDMA_ENABLED"
80
+ // MetricsConfigMountPath indicates mount path for custom dcgm metrics file
81
+ MetricsConfigMountPath = "/etc/dcgm-exporter/" + MetricsConfigFileName
82
+ // MetricsConfigFileName indicates custom dcgm metrics file name
83
+ MetricsConfigFileName = "dcgm-metrics.csv"
80
84
)
81
85
82
86
// controlFunc is a slice of functions, each of which accepts a
// ClusterPolicyController and returns a gpuv1.State together with an error.
type controlFunc []func(n ClusterPolicyController) (gpuv1.State, error)
@@ -412,7 +416,6 @@ func parseOSRelease() (map[string]string, error) {
412
416
release [m [1 ]] = strings .Trim (m [2 ], `"` )
413
417
}
414
418
}
415
-
416
419
return release , nil
417
420
}
418
421
@@ -774,16 +777,39 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
774
777
// set RuntimeClass for supported runtimes
775
778
setRuntimeClass (& obj .Spec .Template .Spec , config .Operator .DefaultRuntime , config .Operator .RuntimeClass )
776
779
777
- kvers , osTag , _ := kernelFullVersion (n )
778
- if kvers == "" {
779
- return fmt .Errorf ("ERROR: Could not find kernel full version: ('%s', '%s')" , kvers , osTag )
780
+ // mount configmap for custom metrics if provided by user
781
+ if config .DCGMExporter .MetricsConfig != nil && config .DCGMExporter .MetricsConfig .Name != "" {
782
+ metricsConfigVolMount := corev1.VolumeMount {Name : "metrics-config" , ReadOnly : true , MountPath : MetricsConfigMountPath , SubPath : MetricsConfigFileName }
783
+ obj .Spec .Template .Spec .Containers [0 ].VolumeMounts = append (obj .Spec .Template .Spec .Containers [0 ].VolumeMounts , metricsConfigVolMount )
784
+
785
+ metricsConfigVolumeSource := corev1.VolumeSource {
786
+ ConfigMap : & corev1.ConfigMapVolumeSource {
787
+ LocalObjectReference : corev1.LocalObjectReference {
788
+ Name : config .DCGMExporter .MetricsConfig .Name ,
789
+ },
790
+ Items : []corev1.KeyToPath {
791
+ {
792
+ Key : MetricsConfigFileName ,
793
+ Path : MetricsConfigFileName ,
794
+ },
795
+ },
796
+ },
797
+ }
798
+ metricsConfigVol := corev1.Volume {Name : "metrics-config" , VolumeSource : metricsConfigVolumeSource }
799
+ obj .Spec .Template .Spec .Volumes = append (obj .Spec .Template .Spec .Volumes , metricsConfigVol )
780
800
}
781
801
782
- if ! strings .Contains (osTag , "rhel" ) && ! strings .Contains (osTag , "rhcos" ) {
802
+ release , err := parseOSRelease ()
803
+ if err != nil {
804
+ return fmt .Errorf ("ERROR: failed to get os-release: %s" , err )
805
+ }
806
+
807
+ // skip SELinux changes if not an OCP cluster
808
+ if _ , ok := release ["OPENSHIFT_VERSION" ]; ! ok {
783
809
return nil
784
810
}
785
811
786
- // update init container config for per pod specific resources
812
+ // Add initContainer for OCP to set proper SELinux context on /var/lib/kubelet/pod-resources
787
813
initImage , err := gpuv1 .ImagePath (& config .Operator .InitContainer )
788
814
if err != nil {
789
815
return err
@@ -801,7 +827,6 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
801
827
Privileged : & privileged ,
802
828
}
803
829
804
- // Add initContainer for OCP to set proper SELinux context on /var/lib/kubelet/pod-resources
805
830
initContainer .SecurityContext = securityContext
806
831
807
832
volMountSockName , volMountSockPath := "pod-gpu-resources" , "/var/lib/kubelet/pod-resources"
@@ -1294,9 +1319,8 @@ func resolveDriverTag(n ClusterPolicyController, driverSpec *gpuv1.DriverSpec) (
1294
1319
}
1295
1320
1296
1321
func transformDriverContainer (obj * appsv1.DaemonSet , config * gpuv1.ClusterPolicySpec , n ClusterPolicyController ) error {
1297
- logger := n .rec .Log .WithValues ("Daemonset" , obj .Name )
1298
1322
for i , container := range obj .Spec .Template .Spec .Containers {
1299
- // skip if not nvidia-peermem
1323
+ // skip if not nvidia-driver container
1300
1324
if ! strings .Contains (container .Name , "nvidia-driver" ) {
1301
1325
continue
1302
1326
}
@@ -1394,41 +1418,28 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy
1394
1418
obj .Spec .Template .Spec .Volumes = append (obj .Spec .Template .Spec .Volumes , topologyConfigVol )
1395
1419
}
1396
1420
1397
- // Inject EUS kernel RPM's as an override to the entrypoint
1398
- // Add Env Vars needed by nvidia-driver to enable the right releasever and rpm repo
1399
- kvers , osTag , _ := kernelFullVersion (n )
1400
- if kvers == "" {
1401
- return fmt .Errorf ("ERROR: Could not find kernel full version: ('%s', '%s')" , kvers , osTag )
1402
- }
1403
-
1404
- if ! strings .Contains (osTag , "rhel" ) && ! strings .Contains (osTag , "rhcos" ) {
1405
- return nil
1406
- }
1407
-
1408
1421
release , err := parseOSRelease ()
1409
1422
if err != nil {
1410
1423
return fmt .Errorf ("ERROR: failed to get os-release: %s" , err )
1411
1424
}
1412
1425
1413
- ocpV , err := OpenshiftVersion ()
1414
- if err != nil {
1415
- // might be RHEL node using upstream k8s, don't error out.
1416
- logger .Info (fmt .Sprintf ("ERROR: failed to get OpenShift version: %s" , err ))
1426
+ // skip proxy and env settings if not ocp cluster
1427
+ if _ , ok := release ["OPENSHIFT_VERSION" ]; ! ok {
1428
+ return nil
1417
1429
}
1418
1430
1431
+ // Add env vars needed by nvidia-driver to enable the right releasever and EUS rpm repos
1419
1432
rhelVersion := corev1.EnvVar {Name : "RHEL_VERSION" , Value : release ["RHEL_VERSION" ]}
1420
- ocpVersion := corev1.EnvVar {Name : "OPENSHIFT_VERSION" , Value : ocpV }
1433
+ ocpVersion := corev1.EnvVar {Name : "OPENSHIFT_VERSION" , Value : release [ "OPENSHIFT_VERSION" ] }
1421
1434
1422
1435
obj .Spec .Template .Spec .Containers [i ].Env = append (obj .Spec .Template .Spec .Containers [i ].Env , rhelVersion )
1423
1436
obj .Spec .Template .Spec .Containers [i ].Env = append (obj .Spec .Template .Spec .Containers [i ].Env , ocpVersion )
1424
1437
1425
- if ocpV != "" {
1426
- // Automatically apply proxy settings for OCP and inject custom CA if configured by user
1427
- // https://docs.openshift.com/container-platform/4.6/networking/configuring-a-custom-pki.html
1428
- err = applyOCPProxySpec (n , & obj .Spec .Template .Spec )
1429
- if err != nil {
1430
- return err
1431
- }
1438
+ // Automatically apply proxy settings for OCP and inject custom CA if configured by user
1439
+ // https://docs.openshift.com/container-platform/4.6/networking/configuring-a-custom-pki.html
1440
+ err = applyOCPProxySpec (n , & obj .Spec .Template .Spec )
1441
+ if err != nil {
1442
+ return err
1432
1443
}
1433
1444
}
1434
1445
return nil
0 commit comments