@@ -33,6 +33,11 @@ const (
33
33
ocpDriverToolkitVersionLabel = "openshift.driver-toolkit.rhcos"
34
34
ocpDriverToolkitIdentificationLabel = "openshift.driver-toolkit"
35
35
ocpDriverToolkitIdentificationValue = "true"
36
+ ocpNamespaceMonitoringLabelKey = "openshift.io/cluster-monitoring"
37
+ ocpNamespaceMonitoringLabelValue = "true"
38
+ // see bundle/manifests/gpu-operator.clusterserviceversion.yaml
39
+ // --> ClusterServiceVersion.metadata.annotations.operatorframework.io/suggested-namespace
40
+ ocpSuggestedNamespace = "nvidia-gpu-operator"
36
41
)
37
42
38
43
var gpuStateLabels = map [string ]string {
@@ -337,6 +342,64 @@ func getRuntimeString(node corev1.Node) (gpuv1.Runtime, error) {
337
342
return runtime , nil
338
343
}
339
344
345
+ func (n * ClusterPolicyController ) ocpEnsureNamespaceMonitoring () error {
346
+ namespaceName := clusterPolicyCtrl .operatorNamespace
347
+
348
+ if namespaceName != ocpSuggestedNamespace {
349
+ // The GPU Operator is not installed in the suggested
350
+ // namespace, so the namespace may be shared with other
351
+ // untrusted operators. Do not enable namespace monitoring in
352
+ // this case, as per OpenShift/Prometheus best practices.
353
+ n .rec .Log .Info ("GPU Operator not installed in the suggested namespace, skipping namespace monitoring verification" ,
354
+ "namespace" , namespaceName ,
355
+ "suggested namespace" , ocpSuggestedNamespace )
356
+ return nil
357
+ }
358
+
359
+ ns := & corev1.Namespace {}
360
+ opts := client.ObjectKey {Name : namespaceName }
361
+ err := n .rec .Client .Get (context .TODO (), opts , ns )
362
+ if err != nil {
363
+ return fmt .Errorf ("ERROR: could not get Namespace %s from client: %v" , namespaceName , err )
364
+ }
365
+
366
+ val , ok := ns .ObjectMeta .Labels [ocpNamespaceMonitoringLabelKey ]
367
+ if ok {
368
+ // label already defined, do not change it
369
+ var msg string
370
+ if val == ocpNamespaceMonitoringLabelValue {
371
+ msg = "OpenShift monitoring is enabled on the GPU Operator namespace"
372
+ } else {
373
+ msg = "WARNING: OpenShift monitoring currently disabled on user request"
374
+ }
375
+ n .rec .Log .Info (msg ,
376
+ "namespace" , namespaceName ,
377
+ "label" , ocpNamespaceMonitoringLabelKey ,
378
+ "value" , val ,
379
+ "excepted value" , ocpNamespaceMonitoringLabelValue )
380
+
381
+ return nil
382
+ }
383
+
384
+ // label not defined, enable monitoring
385
+ n .rec .Log .Info ("Enabling OpenShift monitoring" )
386
+ n .rec .Log .Info ("DEBUG: Adding monitoring label to the operator namespace" ,
387
+ "namespace" , namespaceName ,
388
+ "label" , ocpNamespaceMonitoringLabelKey ,
389
+ "value" , ocpNamespaceMonitoringLabelValue )
390
+ n .rec .Log .Info ("Monitoring can be disabled by setting the namespace label " +
391
+ ocpNamespaceMonitoringLabelKey + "=false" )
392
+ patch := client .MergeFrom (ns .DeepCopy ())
393
+ ns .ObjectMeta .Labels [ocpNamespaceMonitoringLabelKey ] = ocpNamespaceMonitoringLabelValue
394
+ err = n .rec .Client .Patch (context .TODO (), ns , patch )
395
+ if err != nil {
396
+ return fmt .Errorf ("Unable to label namespace %s for the GPU Operator monitoring, err %s" ,
397
+ namespaceName , err .Error ())
398
+ }
399
+
400
+ return nil
401
+ }
402
+
340
403
// getRuntime will detect the container runtime used by nodes in the
341
404
// cluster and correctly set the value for clusterPolicyController.runtime
342
405
// For openshift, set runtime to crio. Otherwise, the default runtime is
@@ -507,6 +570,13 @@ func (n *ClusterPolicyController) init(reconciler *ClusterPolicyReconciler, clus
507
570
n .operatorMetrics .openshiftDriverToolkitNfdTooOld .Set (1 )
508
571
}
509
572
}
573
+
574
+ if n .openshift != "" {
575
+ if err := n .ocpEnsureNamespaceMonitoring (); err != nil {
576
+ return err
577
+ }
578
+ }
579
+
510
580
return nil
511
581
}
512
582
0 commit comments