Skip to content

Commit 992bfc1

Browse files
committed
Merge branch 'allow_custom_dcgm_metrics' into 'master'
Allow custom dcgm metrics. See merge request nvidia/kubernetes/gpu-operator!282
2 parents ae38747 + e2fc23e commit 992bfc1

11 files changed

+121
-35
lines changed

api/v1/clusterpolicy_types.go

+15
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,21 @@ type DCGMExporterSpec struct {
510510
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Environment Variables"
511511
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
512512
Env []corev1.EnvVar `json:"env,omitempty"`
513+
514+
// Optional: Custom metrics configuration for DCGM exporter
515+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
516+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Custom Metrics Configuration For DCGM Exporter"
517+
MetricsConfig *DCGMExporterMetricsConfig `json:"config,omitempty"`
518+
}
519+
520+
// DCGMExporterMetricsConfig defines metrics to be collected by DCGM Exporter
521+
type DCGMExporterMetricsConfig struct {
522+
// ConfigMap name with file dcgm-metrics.csv for metrics to be collected by DCGM exporter
523+
// +kubebuilder:validation:Optional
524+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
525+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ConfigMap name with file dcgm-metrics.csv"
526+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
527+
Name string `json:"name,omitempty"`
513528
}
514529

515530
// DCGMSpec defines the properties for DCGM deployment

api/v1/zz_generated.deepcopy.go

+20
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/gpu-operator.clusterserviceversion.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ metadata:
1616
"image": "dcgm-exporter",
1717
"imagePullSecrets": [],
1818
"repository": "nvcr.io/nvidia/k8s",
19+
"config": {
20+
"name": ""
21+
},
1922
"env": [
2023
{
2124
"name": "DCGM_EXPORTER_LISTEN",

bundle/manifests/nvidia.com_clusterpolicies.yaml

+9
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,15 @@ spec:
405405
items:
406406
type: string
407407
type: array
408+
config:
409+
description: 'Optional: Custom metrics configuration for DCGM
410+
exporter'
411+
properties:
412+
name:
413+
description: ConfigMap name with file dcgm-metrics.csv for
414+
metrics to be collected by DCGM exporter
415+
type: string
416+
type: object
408417
env:
409418
description: 'Optional: List of environment variables'
410419
items:

config/crd/bases/nvidia.com_clusterpolicies.yaml

+9
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,15 @@ spec:
405405
items:
406406
type: string
407407
type: array
408+
config:
409+
description: 'Optional: Custom metrics configuration for DCGM
410+
exporter'
411+
properties:
412+
name:
413+
description: ConfigMap name with file dcgm-metrics.csv for
414+
metrics to be collected by DCGM exporter
415+
type: string
416+
type: object
408417
env:
409418
description: 'Optional: List of environment variables'
410419
items:

config/rbac/role.yaml

+3-1
Original file line numberDiff line numberDiff line change
@@ -77,12 +77,14 @@ rules:
7777
- apiGroups:
7878
- monitoring.coreos.com
7979
resources:
80-
- servicemonitors
8180
- prometheusrule
81+
- servicemonitors
8282
verbs:
8383
- create
8484
- get
8585
- list
86+
- patch
87+
- update
8688
- watch
8789
- apiGroups:
8890
- nvidia.com

config/samples/v1_clusterpolicy.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ spec:
140140
value: "/etc/dcgm-exporter/dcp-metrics-included.csv"
141141
securityContext: {}
142142
resources: {}
143+
# configmap name for custom dcgm metrics
144+
config:
145+
name: ""
143146

144147
gfd:
145148
repository: nvcr.io/nvidia

controllers/clusterpolicy_controller.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,16 @@ package controllers
1818

1919
import (
2020
"context"
21+
2122
"github.com/go-logr/logr"
2223
"github.com/prometheus/common/log"
2324
corev1 "k8s.io/api/core/v1"
2425
"k8s.io/apimachinery/pkg/api/errors"
2526
"k8s.io/apimachinery/pkg/runtime"
2627
"k8s.io/apimachinery/pkg/types"
2728

29+
"time"
30+
2831
ctrl "sigs.k8s.io/controller-runtime"
2932
"sigs.k8s.io/controller-runtime/pkg/client"
3033
"sigs.k8s.io/controller-runtime/pkg/controller"
@@ -34,7 +37,6 @@ import (
3437
"sigs.k8s.io/controller-runtime/pkg/predicate"
3538
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3639
"sigs.k8s.io/controller-runtime/pkg/source"
37-
"time"
3840

3941
gpuv1 "github.com/NVIDIA/gpu-operator/api/v1"
4042
)
@@ -57,7 +59,7 @@ type ClusterPolicyReconciler struct {
5759
// +kubebuilder:rbac:groups="",resources=namespaces;serviceaccounts;pods;services;services/finalizers;endpoints,verbs=get;list;watch;create;update;patch;delete
5860
// +kubebuilder:rbac:groups="",resources=persistentvolumeclaims;events;configmaps;secrets;nodes,verbs=get;list;watch;create;update;patch;delete
5961
// +kubebuilder:rbac:groups=apps,resources=deployments;daemonsets;replicasets;statefulsets,verbs=get;list;watch;create;update;patch;delete
60-
// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors,verbs=get;list;watch;create
62+
// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors;prometheusrule,verbs=get;list;watch;create;update;patch
6163
// +kubebuilder:rbac:groups=scheduling.k8s.io,resources=priorityclasses,verbs=get;list;watch;create
6264
// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch
6365
// +kubebuilder:rbac:groups=route.openshift.io,resources=routes,verbs=get;list;watch;create;update;patch

controllers/object_controls.go

+43-32
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ const (
7777
DCGMDefaultHostPort = 5555
7878
// GPUDirectRDMAEnabledEnvName indicates if GPU direct RDMA is enabled through GPU operator
7979
GPUDirectRDMAEnabledEnvName = "GPU_DIRECT_RDMA_ENABLED"
80+
// MetricsConfigMountPath indicates mount path for custom dcgm metrics file
81+
MetricsConfigMountPath = "/etc/dcgm-exporter/" + MetricsConfigFileName
82+
// MetricsConfigFileName indicates custom dcgm metrics file name
83+
MetricsConfigFileName = "dcgm-metrics.csv"
8084
)
8185

8286
type controlFunc []func(n ClusterPolicyController) (gpuv1.State, error)
@@ -412,7 +416,6 @@ func parseOSRelease() (map[string]string, error) {
412416
release[m[1]] = strings.Trim(m[2], `"`)
413417
}
414418
}
415-
416419
return release, nil
417420
}
418421

@@ -774,16 +777,39 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
774777
// set RuntimeClass for supported runtimes
775778
setRuntimeClass(&obj.Spec.Template.Spec, config.Operator.DefaultRuntime, config.Operator.RuntimeClass)
776779

777-
kvers, osTag, _ := kernelFullVersion(n)
778-
if kvers == "" {
779-
return fmt.Errorf("ERROR: Could not find kernel full version: ('%s', '%s')", kvers, osTag)
780+
// mount configmap for custom metrics if provided by user
781+
if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" {
782+
metricsConfigVolMount := corev1.VolumeMount{Name: "metrics-config", ReadOnly: true, MountPath: MetricsConfigMountPath, SubPath: MetricsConfigFileName}
783+
obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(obj.Spec.Template.Spec.Containers[0].VolumeMounts, metricsConfigVolMount)
784+
785+
metricsConfigVolumeSource := corev1.VolumeSource{
786+
ConfigMap: &corev1.ConfigMapVolumeSource{
787+
LocalObjectReference: corev1.LocalObjectReference{
788+
Name: config.DCGMExporter.MetricsConfig.Name,
789+
},
790+
Items: []corev1.KeyToPath{
791+
{
792+
Key: MetricsConfigFileName,
793+
Path: MetricsConfigFileName,
794+
},
795+
},
796+
},
797+
}
798+
metricsConfigVol := corev1.Volume{Name: "metrics-config", VolumeSource: metricsConfigVolumeSource}
799+
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, metricsConfigVol)
780800
}
781801

782-
if !strings.Contains(osTag, "rhel") && !strings.Contains(osTag, "rhcos") {
802+
release, err := parseOSRelease()
803+
if err != nil {
804+
return fmt.Errorf("ERROR: failed to get os-release: %s", err)
805+
}
806+
807+
// skip SELinux changes if not an OCP cluster
808+
if _, ok := release["OPENSHIFT_VERSION"]; !ok {
783809
return nil
784810
}
785811

786-
// update init container config for per pod specific resources
812+
// Add initContainer for OCP to set proper SELinux context on /var/lib/kubelet/pod-resources
787813
initImage, err := gpuv1.ImagePath(&config.Operator.InitContainer)
788814
if err != nil {
789815
return err
@@ -801,7 +827,6 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
801827
Privileged: &privileged,
802828
}
803829

804-
// Add initContainer for OCP to set proper SELinux context on /var/lib/kubelet/pod-resources
805830
initContainer.SecurityContext = securityContext
806831

807832
volMountSockName, volMountSockPath := "pod-gpu-resources", "/var/lib/kubelet/pod-resources"
@@ -1294,9 +1319,8 @@ func resolveDriverTag(n ClusterPolicyController, driverSpec *gpuv1.DriverSpec) (
12941319
}
12951320

12961321
func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
1297-
logger := n.rec.Log.WithValues("Daemonset", obj.Name)
12981322
for i, container := range obj.Spec.Template.Spec.Containers {
1299-
// skip if not nvidia-peermem
1323+
// skip if not nvidia-driver container
13001324
if !strings.Contains(container.Name, "nvidia-driver") {
13011325
continue
13021326
}
@@ -1394,41 +1418,28 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy
13941418
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, topologyConfigVol)
13951419
}
13961420

1397-
// Inject EUS kernel RPM's as an override to the entrypoint
1398-
// Add Env Vars needed by nvidia-driver to enable the right releasever and rpm repo
1399-
kvers, osTag, _ := kernelFullVersion(n)
1400-
if kvers == "" {
1401-
return fmt.Errorf("ERROR: Could not find kernel full version: ('%s', '%s')", kvers, osTag)
1402-
}
1403-
1404-
if !strings.Contains(osTag, "rhel") && !strings.Contains(osTag, "rhcos") {
1405-
return nil
1406-
}
1407-
14081421
release, err := parseOSRelease()
14091422
if err != nil {
14101423
return fmt.Errorf("ERROR: failed to get os-release: %s", err)
14111424
}
14121425

1413-
ocpV, err := OpenshiftVersion()
1414-
if err != nil {
1415-
// might be RHEL node using upstream k8s, don't error out.
1416-
logger.Info(fmt.Sprintf("ERROR: failed to get OpenShift version: %s", err))
1426+
// skip proxy and env settings if not an OCP cluster
1427+
if _, ok := release["OPENSHIFT_VERSION"]; !ok {
1428+
return nil
14171429
}
14181430

1431+
// Add env vars needed by nvidia-driver to enable the right releasever and EUS rpm repos
14191432
rhelVersion := corev1.EnvVar{Name: "RHEL_VERSION", Value: release["RHEL_VERSION"]}
1420-
ocpVersion := corev1.EnvVar{Name: "OPENSHIFT_VERSION", Value: ocpV}
1433+
ocpVersion := corev1.EnvVar{Name: "OPENSHIFT_VERSION", Value: release["OPENSHIFT_VERSION"]}
14211434

14221435
obj.Spec.Template.Spec.Containers[i].Env = append(obj.Spec.Template.Spec.Containers[i].Env, rhelVersion)
14231436
obj.Spec.Template.Spec.Containers[i].Env = append(obj.Spec.Template.Spec.Containers[i].Env, ocpVersion)
14241437

1425-
if ocpV != "" {
1426-
// Automatically apply proxy settings for OCP and inject custom CA if configured by user
1427-
// https://docs.openshift.com/container-platform/4.6/networking/configuring-a-custom-pki.html
1428-
err = applyOCPProxySpec(n, &obj.Spec.Template.Spec)
1429-
if err != nil {
1430-
return err
1431-
}
1438+
// Automatically apply proxy settings for OCP and inject custom CA if configured by user
1439+
// https://docs.openshift.com/container-platform/4.6/networking/configuring-a-custom-pki.html
1440+
err = applyOCPProxySpec(n, &obj.Spec.Template.Spec)
1441+
if err != nil {
1442+
return err
14321443
}
14331444
}
14341445
return nil

deployments/gpu-operator/crds/nvidia.com_clusterpolicies_crd.yaml

+9
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,15 @@ spec:
405405
items:
406406
type: string
407407
type: array
408+
config:
409+
description: 'Optional: Custom metrics configuration for DCGM
410+
exporter'
411+
properties:
412+
name:
413+
description: ConfigMap name with file dcgm-metrics.csv for
414+
metrics to be collected by DCGM exporter
415+
type: string
416+
type: object
408417
env:
409418
description: 'Optional: List of environment variables'
410419
items:

deployments/gpu-operator/templates/clusterpolicy.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,9 @@ spec:
267267
{{- if .Values.dcgmExporter.args }}
268268
args: {{ toYaml .Values.dcgmExporter.args | nindent 6 }}
269269
{{- end }}
270+
{{- if .Values.dcgmExporter.config }}
271+
config: {{ toYaml .Values.dcgmExporter.config | nindent 6 }}
272+
{{- end }}
270273
gfd:
271274
{{- if .Values.gfd.repository }}
272275
repository: {{ .Values.gfd.repository }}

0 commit comments

Comments
 (0)