Skip to content

Commit 832f317

Browse files
committed
Merge branch 'make_dcgm_remote_optional' into 'master'
Make it optional to run DCGM hostengine as a separate container. See merge request nvidia/kubernetes/gpu-operator!283
2 parents 5279290 + f489ca5 commit 832f317

11 files changed

+47
-7
lines changed

api/v1/clusterpolicy_types.go

+15
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,12 @@ type DCGMExporterMetricsConfig struct {
529529

530530
// DCGMSpec defines the properties for DCGM deployment
531531
type DCGMSpec struct {
532+
// Enabled indicates if deployment of DCGM hostengine as a separate pod is enabled.
533+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
534+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable DCGM hostengine as a separate Pod"
535+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
536+
Enabled *bool `json:"enabled,omitempty"`
537+
532538
// DCGM image repository
533539
// +kubebuilder:validation:Optional
534540
Repository string `json:"repository"`
@@ -983,3 +989,12 @@ func (g *GPUDirectRDMASpec) IsEnabled() bool {
983989
}
984990
return *g.Enabled
985991
}
992+
993+
// IsEnabled reports whether deployment of the DCGM hostengine as a separate Pod is enabled through gpu-operator.
994+
func (dcgm *DCGMSpec) IsEnabled() bool {
995+
if dcgm.Enabled == nil {
996+
// Enabled was not set explicitly; the separate DCGM hostengine Pod is enabled by default
997+
return true
998+
}
999+
return *dcgm.Enabled
1000+
}

api/v1/zz_generated.deepcopy.go

+5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/gpu-operator.clusterserviceversion.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ metadata:
3939
"version": "sha256:e37404194fa2bc2275827411049422b93d1493991fb925957f170b4b842846ff"
4040
},
4141
"dcgm": {
42+
"enabled": true,
4243
"image": "dcgm",
4344
"imagePullSecrets": [],
4445
"repository": "nvcr.io/nvidia/cloud-native",

bundle/manifests/nvidia.com_clusterpolicies.yaml

+4
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@ spec:
9090
items:
9191
type: string
9292
type: array
93+
enabled:
94+
description: Enabled indicates if deployment of DCGM hostengine
95+
as a separate pod is enabled.
96+
type: boolean
9397
env:
9498
description: 'Optional: List of environment variables'
9599
items:

config/crd/bases/nvidia.com_clusterpolicies.yaml

+4
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@ spec:
9090
items:
9191
type: string
9292
type: array
93+
enabled:
94+
description: Enabled indicates if deployment of DCGM hostengine
95+
as a separate pod is enabled.
96+
type: boolean
9397
env:
9498
description: 'Optional: List of environment variables'
9599
items:

config/samples/v1_clusterpolicy.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ spec:
115115
resources: {}
116116

117117
dcgm:
118+
enabled: true
118119
repository: nvcr.io/nvidia/cloud-native
119120
image: dcgm
120121
version: 2.2.3-ubi8

controllers/object_controls.go

+8-6
Original file line numberDiff line numberDiff line change
@@ -767,13 +767,15 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
767767
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
768768
}
769769
}
770-
// set DCGM host engine env. NODE_IP will be substituted during pod runtime
771-
dcgmHostPort := int32(DCGMDefaultHostPort)
772-
if config.DCGM.HostPort != 0 {
773-
dcgmHostPort = config.DCGM.HostPort
770+
// check if DCGM hostengine is enabled as a separate Pod and set up the env accordingly
771+
if config.DCGM.IsEnabled() {
772+
// set DCGM host engine env. NODE_IP will be substituted during pod runtime
773+
dcgmHostPort := int32(DCGMDefaultHostPort)
774+
if config.DCGM.HostPort != 0 {
775+
dcgmHostPort = config.DCGM.HostPort
776+
}
777+
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DCGMRemoteEngineEnvName, fmt.Sprintf("$(NODE_IP):%d", dcgmHostPort))
774778
}
775-
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DCGMRemoteEngineEnvName, fmt.Sprintf("$(NODE_IP):%d", dcgmHostPort))
776-
777779
// set RuntimeClass for supported runtimes
778780
setRuntimeClass(&obj.Spec.Template.Spec, config.Operator.DefaultRuntime, config.Operator.RuntimeClass)
779781

controllers/state_manager.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,9 @@ func (n *ClusterPolicyController) init(reconciler *ClusterPolicyReconciler, clus
317317
}
318318
addState(n, "/opt/gpu-operator/state-operator-validation")
319319
addState(n, "/opt/gpu-operator/state-device-plugin")
320-
addState(n, "/opt/gpu-operator/state-dcgm")
320+
if clusterPolicy.Spec.DCGM.IsEnabled() {
321+
addState(n, "/opt/gpu-operator/state-dcgm")
322+
}
321323
addState(n, "/opt/gpu-operator/state-dcgm-exporter")
322324

323325
addState(n, "/opt/gpu-operator/gpu-feature-discovery")

deployments/gpu-operator/crds/nvidia.com_clusterpolicies_crd.yaml

+4
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@ spec:
9090
items:
9191
type: string
9292
type: array
93+
enabled:
94+
description: Enabled indicates if deployment of DCGM hostengine
95+
as a separate pod is enabled.
96+
type: boolean
9397
env:
9498
description: 'Optional: List of environment variables'
9599
items:

deployments/gpu-operator/templates/clusterpolicy.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,7 @@ spec:
209209
args: {{ toYaml .Values.devicePlugin.args | nindent 6 }}
210210
{{- end }}
211211
dcgm:
212+
enabled: {{ .Values.dcgm.enabled }}
212213
{{- if .Values.dcgm.repository }}
213214
repository: {{ .Values.dcgm.repository }}
214215
{{- end }}

deployments/gpu-operator/values.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ devicePlugin:
162162
resources: {}
163163

164164
dcgm:
165+
enabled: true
165166
repository: nvcr.io/nvidia/cloud-native
166167
image: dcgm
167168
version: 2.2.3-ubuntu20.04

0 commit comments

Comments
 (0)