Skip to content

Commit 9c0a8ce

Browse files
committed
Merge branch 'cherry_pick_e27d9800b574d57e40f73f5b55ae6580315c3274' into 'release-1.9'
Cherry-pick: controllers/state_manager.go: ensure OpenShift namespace monitoring. See merge request nvidia/kubernetes/gpu-operator!349
2 parents d458e23 + 46922c2 commit 9c0a8ce

File tree

3 files changed

+73
-1
lines changed

3 files changed

+73
-1
lines changed

bundle/manifests/gpu-operator.clusterserviceversion.yaml

-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ metadata:
7676
]
7777
operators.operatorframework.io/builder: operator-sdk-v1.4.0
7878
operators.operatorframework.io/project_layout: go.kubebuilder.io/v3
79-
operatorframework.io/cluster-monitoring: "true"
8079
operatorframework.io/suggested-namespace: nvidia-gpu-operator
8180
capabilities: Basic Install
8281
categories: AI/Machine Learning, OpenShift Optional

controllers/state_manager.go

+70
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ const (
3333
ocpDriverToolkitVersionLabel = "openshift.driver-toolkit.rhcos"
3434
ocpDriverToolkitIdentificationLabel = "openshift.driver-toolkit"
3535
ocpDriverToolkitIdentificationValue = "true"
36+
ocpNamespaceMonitoringLabelKey = "openshift.io/cluster-monitoring"
37+
ocpNamespaceMonitoringLabelValue = "true"
38+
// see bundle/manifests/gpu-operator.clusterserviceversion.yaml
39+
// --> ClusterServiceVersion.metadata.annotations.operatorframework.io/suggested-namespace
40+
ocpSuggestedNamespace = "nvidia-gpu-operator"
3641
)
3742

3843
var gpuStateLabels = map[string]string{
@@ -337,6 +342,64 @@ func getRuntimeString(node corev1.Node) (gpuv1.Runtime, error) {
337342
return runtime, nil
338343
}
339344

345+
func (n *ClusterPolicyController) ocpEnsureNamespaceMonitoring() error {
346+
namespaceName := clusterPolicyCtrl.operatorNamespace
347+
348+
if namespaceName != ocpSuggestedNamespace {
349+
// The GPU Operator is not installed in the suggested
350+
// namespace, so the namespace may be shared with other
351+
// untrusted operators. Do not enable namespace monitoring in
352+
// this case, as per OpenShift/Prometheus best practices.
353+
n.rec.Log.Info("GPU Operator not installed in the suggested namespace, skipping namespace monitoring verification",
354+
"namespace", namespaceName,
355+
"suggested namespace", ocpSuggestedNamespace)
356+
return nil
357+
}
358+
359+
ns := &corev1.Namespace{}
360+
opts := client.ObjectKey{Name: namespaceName}
361+
err := n.rec.Client.Get(context.TODO(), opts, ns)
362+
if err != nil {
363+
return fmt.Errorf("ERROR: could not get Namespace %s from client: %v", namespaceName, err)
364+
}
365+
366+
val, ok := ns.ObjectMeta.Labels[ocpNamespaceMonitoringLabelKey]
367+
if ok {
368+
// label already defined, do not change it
369+
var msg string
370+
if val == ocpNamespaceMonitoringLabelValue {
371+
msg = "OpenShift monitoring is enabled on the GPU Operator namespace"
372+
} else {
373+
msg = "WARNING: OpenShift monitoring currently disabled on user request"
374+
}
375+
n.rec.Log.Info(msg,
376+
"namespace", namespaceName,
377+
"label", ocpNamespaceMonitoringLabelKey,
378+
"value", val,
379+
"excepted value", ocpNamespaceMonitoringLabelValue)
380+
381+
return nil
382+
}
383+
384+
// label not defined, enable monitoring
385+
n.rec.Log.Info("Enabling OpenShift monitoring")
386+
n.rec.Log.Info("DEBUG: Adding monitoring label to the operator namespace",
387+
"namespace", namespaceName,
388+
"label", ocpNamespaceMonitoringLabelKey,
389+
"value", ocpNamespaceMonitoringLabelValue)
390+
n.rec.Log.Info("Monitoring can be disabled by setting the namespace label " +
391+
ocpNamespaceMonitoringLabelKey + "=false")
392+
patch := client.MergeFrom(ns.DeepCopy())
393+
ns.ObjectMeta.Labels[ocpNamespaceMonitoringLabelKey] = ocpNamespaceMonitoringLabelValue
394+
err = n.rec.Client.Patch(context.TODO(), ns, patch)
395+
if err != nil {
396+
return fmt.Errorf("Unable to label namespace %s for the GPU Operator monitoring, err %s",
397+
namespaceName, err.Error())
398+
}
399+
400+
return nil
401+
}
402+
340403
// getRuntime will detect the container runtime used by nodes in the
341404
// cluster and correctly set the value for clusterPolicyController.runtime
342405
// For openshift, set runtime to crio. Otherwise, the default runtime is
@@ -507,6 +570,13 @@ func (n *ClusterPolicyController) init(reconciler *ClusterPolicyReconciler, clus
507570
n.operatorMetrics.openshiftDriverToolkitNfdTooOld.Set(1)
508571
}
509572
}
573+
574+
if n.openshift != "" {
575+
if err := n.ocpEnsureNamespaceMonitoring(); err != nil {
576+
return err
577+
}
578+
}
579+
510580
return nil
511581
}
512582

deployments/gpu-operator/templates/role.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,10 @@ rules:
4242
- namespaces
4343
verbs:
4444
- get
45+
- list
4546
- create
47+
- watch
48+
- update
4649
- apiGroups:
4750
- apps
4851
resources:

0 commit comments

Comments
 (0)