Skip to content

Commit 2ad39e6

Browse files
authored
Merge pull request #894 from NVIDIA/cherry-pick_fixes-for-openshift
Cherry-pick fixes for OpenShift
2 parents 1d0a78c + 2032e2d commit 2ad39e6

File tree

7 files changed

+32
-2
lines changed

7 files changed

+32
-2
lines changed

assets/state-node-status-exporter/0200_role.yaml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,3 +19,11 @@ rules:
1919
verbs:
2020
- get
2121
- list
22+
- apiGroups:
23+
- apps
24+
resources:
25+
- daemonsets
26+
verbs:
27+
- get
28+
- list
29+
- watch

assets/state-node-status-exporter/0700_daemonset.yaml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,10 @@ spec:
4040
valueFrom:
4141
fieldRef:
4242
fieldPath: spec.nodeName
43+
- name: OPERATOR_NAMESPACE
44+
valueFrom:
45+
fieldRef:
46+
fieldPath: metadata.namespace
4347
ports:
4448
- name: node-status
4549
containerPort: 8000

cmd/gpu-operator/main.go

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ import (
2727
// to ensure that exec-entrypoint and run can make use of them.
2828
"go.uber.org/zap/zapcore"
2929
_ "k8s.io/client-go/plugin/pkg/client/auth"
30+
"sigs.k8s.io/controller-runtime/pkg/cache"
3031

3132
apiconfigv1 "github.com/openshift/api/config/v1"
3233
apiimagev1 "github.com/openshift/api/image/v1"
@@ -49,6 +50,7 @@ import (
4950
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
5051
"github.com/NVIDIA/gpu-operator/controllers"
5152
"github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
53+
"github.com/NVIDIA/gpu-operator/internal/consts"
5254
"github.com/NVIDIA/gpu-operator/internal/info"
5355
// +kubebuilder:scaffold:imports
5456
)
@@ -104,13 +106,24 @@ func main() {
104106
Port: 9443,
105107
})
106108

109+
operatorNamespace := os.Getenv("OPERATOR_NAMESPACE")
110+
openshiftNamespace := consts.OpenshiftNamespace
111+
cacheOptions := cache.Options{
112+
DefaultNamespaces: map[string]cache.Config{
113+
operatorNamespace: {},
114+
// Also cache resources in the openshift namespace so that ImageStreams can be retrieved when running on an OpenShift cluster
115+
openshiftNamespace: {},
116+
},
117+
}
118+
107119
options := ctrl.Options{
108120
Scheme: scheme,
109121
Metrics: metricsOptions,
110122
HealthProbeBindAddress: probeAddr,
111123
LeaderElection: enableLeaderElection,
112124
LeaderElectionID: "53822513.nvidia.com",
113125
WebhookServer: webhookServer,
126+
Cache: cacheOptions,
114127
}
115128

116129
if enableLeaderElection && int(renewDeadline) != 0 {

controllers/clusterinfo/clusterinfo.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -341,7 +341,7 @@ func getOpenshiftDTKImages(ctx context.Context, c *rest.Config) map[string]strin
341341
logger := log.FromContext(ctx)
342342

343343
name := "driver-toolkit"
344-
namespace := "openshift"
344+
namespace := consts.OpenshiftNamespace
345345

346346
ocpImageClient, err := imagesv1.NewForConfig(c)
347347
if err != nil {

controllers/object_controls.go

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@ import (
5252
"sigs.k8s.io/yaml"
5353

5454
gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
55+
"github.com/NVIDIA/gpu-operator/internal/consts"
5556
"github.com/NVIDIA/gpu-operator/internal/utils"
5657
)
5758

@@ -3705,7 +3706,7 @@ func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) {
37053706
ctx := n.ctx
37063707
found := &apiimagev1.ImageStream{}
37073708
name := "driver-toolkit"
3708-
namespace := "openshift"
3709+
namespace := consts.OpenshiftNamespace
37093710
err := n.client.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, found)
37103711
if err != nil {
37113712
if apierrors.IsNotFound(err) {

internal/consts/consts.go

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,9 @@ const (
3939
// Containerd runtime
4040
Containerd = "containerd"
4141

42+
// OpenshiftNamespace is the name of the OpenShift namespace in which the driver-toolkit ImageStream is looked up
43+
OpenshiftNamespace = "openshift"
44+
4245
OcpDriverToolkitVersionLabel = "openshift.driver-toolkit.rhcos"
4346
OcpDriverToolkitIdentificationLabel = "openshift.driver-toolkit"
4447
NfdOSTreeVersionLabelKey = "feature.node.kubernetes.io/system-os_release.OSTREE_VERSION"

validator/metrics.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -238,6 +238,7 @@ func (nm *NodeMetrics) watchDriverValidation() {
238238
nm.driverValidation.Set(1)
239239
nm.driverValidationLastSuccess.Set(float64(time.Now().Unix()))
240240
} else {
241+
log.Errorf("failed to validate driver: %v", err)
241242
nm.driverValidation.Set(0)
242243
}
243244
time.Sleep(driverValidationCheckDelaySeconds * time.Second)

0 commit comments

Comments
 (0)