Skip to content

Commit 9487493

Browse files
committed
Merge branch '22.9.2-cherry-picks' into 'release-22.09'
Create all /dev/char symlinks in driver validator See merge request nvidia/kubernetes/gpu-operator!633
2 parents a702511 + a4952b7 commit 9487493

File tree

46 files changed

+38829
-151
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+38829
-151
lines changed

assets/state-container-toolkit/0400_container_toolkit.yml

+5
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ spec:
5151
mountPath: /host
5252
readOnly: true
5353
mountPropagation: HostToContainer
54+
- name: host-dev-char
55+
mountPath: /host-dev-char
5456
containers:
5557
- image: "FILLED BY THE OPERATOR"
5658
command: [bash, -c]
@@ -97,3 +99,6 @@ spec:
9799
- name: crio-hooks
98100
hostPath:
99101
path: /run/containers/oci/hooks.d
102+
- name: host-dev-char
103+
hostPath:
104+
path: /dev/char

assets/state-operator-validation/0500_daemonset.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ spec:
5050
- name: run-nvidia-validations
5151
mountPath: /run/nvidia/validations
5252
mountPropagation: Bidirectional
53+
- name: host-dev-char
54+
mountPath: /host-dev-char
5355
- name: nvidia-fs-validation
5456
image: "FILLED BY THE OPERATOR"
5557
command: ['sh', '-c']
@@ -162,3 +164,6 @@ spec:
162164
- name: host-root
163165
hostPath:
164166
path: /
167+
- name: host-dev-char
168+
hostPath:
169+
path: /dev/char

bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml

+4-4
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ metadata:
99
pod-security.kubernetes.io/warn: privileged
1010
annotations:
1111
operators.openshift.io/infrastructure-features: '["Disconnected"]'
12-
olm.skipRange: '>=1.9.0 <22.9.1'
12+
olm.skipRange: '>=1.9.0 <22.9.2'
1313
alm-examples: |-
1414
[
1515
{
@@ -143,7 +143,7 @@ metadata:
143143
provider: NVIDIA
144144
repository: http://github.com/NVIDIA/gpu-operator
145145
support: NVIDIA
146-
name: gpu-operator-certified.v22.9.0
146+
name: gpu-operator-certified.v22.9.2
147147
namespace: placeholder
148148
spec:
149149
apiservicedefinitions: {}
@@ -855,5 +855,5 @@ spec:
855855
maturity: stable
856856
provider:
857857
name: NVIDIA Corporation
858-
version: 22.9.0
859-
replaces: gpu-operator-certified.v1.11.1
858+
version: 22.9.2
859+
replaces: gpu-operator-certified.v22.9.1

controllers/state_manager.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,9 @@ func (n *ClusterPolicyController) applyDriverAutoUpgradeAnnotation() error {
410410
updateRequired := false
411411
value := "true"
412412
annotationValue, annotationExists := node.ObjectMeta.Annotations[driverAutoUpgradeAnnotationKey]
413-
if n.singleton.Spec.Driver.UpgradePolicy != nil && n.singleton.Spec.Driver.UpgradePolicy.AutoUpgrade {
413+
if n.singleton.Spec.Driver.UpgradePolicy != nil &&
414+
n.singleton.Spec.Driver.UpgradePolicy.AutoUpgrade &&
415+
!n.sandboxEnabled {
414416
// check if we need to add the annotation
415417
if !annotationExists {
416418
updateRequired = true

controllers/upgrade_controller.go

+13-7
Original file line numberDiff line numberDiff line change
@@ -89,18 +89,24 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
8989
return reconcile.Result{}, err
9090
}
9191

92+
if clusterPolicy.Spec.SandboxWorkloads.IsEnabled() {
93+
reqLogger.V(consts.LogLevelInfo).Info("Advanced driver upgrade policy is not supported when 'sandboxWorkloads.enabled=true' " +
94+
"in ClusterPolicy, cleaning up upgrade state and skipping reconciliation")
95+
// disable driver upgrade metrics
96+
if clusterPolicyCtrl.operatorMetrics != nil {
97+
clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled)
98+
}
99+
return ctrl.Result{}, r.removeNodeUpgradeStateLabels(ctx)
100+
}
101+
92102
if clusterPolicy.Spec.Driver.UpgradePolicy == nil ||
93103
!clusterPolicy.Spec.Driver.UpgradePolicy.AutoUpgrade {
94-
reqLogger.V(1).Info("Driver Upgrade Policy is disabled, skipping driver upgrade")
104+
reqLogger.V(consts.LogLevelInfo).Info("Advanced driver upgrade policy is disabled, cleaning up upgrade state and skipping reconciliation")
95105
// disable driver upgrade metrics
96106
if clusterPolicyCtrl.operatorMetrics != nil {
97107
clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeDisabled)
98108
}
99-
err = r.removeNodeUpgradeStateLabels(ctx)
100-
if err != nil {
101-
return ctrl.Result{}, err
102-
}
103-
return ctrl.Result{}, nil
109+
return ctrl.Result{}, r.removeNodeUpgradeStateLabels(ctx)
104110
}
105111
// enable driver upgrade metrics
106112
if clusterPolicyCtrl.operatorMetrics != nil {
@@ -122,7 +128,7 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
122128
}
123129

124130
reqLogger.Info("Propagate state to state manager")
125-
reqLogger.V(1).Info("Current cluster upgrade state", "state", state)
131+
reqLogger.V(consts.LogLevelInfo).Info("Current cluster upgrade state", "state", state)
126132

127133
// log metrics with the current state
128134
if clusterPolicyCtrl.operatorMetrics != nil {

go.mod

+4-3
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,17 @@ go 1.19
44

55
require (
66
github.com/NVIDIA/k8s-operator-libs v0.0.0-20230118182127-8e73980389ca
7+
github.com/NVIDIA/nvidia-container-toolkit v1.12.0-rc.2.0.20230127101129-9fc2c5912242
78
github.com/go-logr/logr v1.2.3
89
github.com/mitchellh/hashstructure v1.1.0
910
github.com/openshift/api v0.0.0-20210924154557-a4f696157341
1011
github.com/openshift/client-go v0.0.0-20210916133943-9acee1a0fb83
1112
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.56.3
1213
github.com/prometheus/client_golang v1.13.0
13-
github.com/sirupsen/logrus v1.8.1
14+
github.com/sirupsen/logrus v1.9.0
1415
github.com/stretchr/testify v1.8.1
1516
github.com/urfave/cli/v2 v2.3.0
16-
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20220525163429-038b3f8b475d
17+
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230119114711-6fe07bb33342
1718
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4
1819
k8s.io/api v0.25.3
1920
k8s.io/apiextensions-apiserver v0.25.0
@@ -56,7 +57,7 @@ require (
5657
github.com/google/go-cmp v0.5.9 // indirect
5758
github.com/google/gofuzz v1.1.0 // indirect
5859
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect
59-
github.com/google/uuid v1.1.2 // indirect
60+
github.com/google/uuid v1.3.0 // indirect
6061
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect
6162
github.com/imdario/mergo v0.3.12 // indirect
6263
github.com/inconshreveable/mousetrap v1.0.0 // indirect

go.sum

+12-7
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,11 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03
6969
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
7070
github.com/MakeNowJust/heredoc v1.0.0 h1:cXCdzVdstXyiTqTvfqk9SDHpKNjxuom+DOlyEeQ4pzQ=
7171
github.com/MakeNowJust/heredoc v1.0.0/go.mod h1:mG5amYoWBHf8vpLOuehzbGGw0EHxpZZ6lCpQ4fNJ8LE=
72+
github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
7273
github.com/NVIDIA/k8s-operator-libs v0.0.0-20230118182127-8e73980389ca h1:WW+3Hs5oLau3ofkf0IQzupL4dxJ3HSOuAwwJ8L+lnC4=
7374
github.com/NVIDIA/k8s-operator-libs v0.0.0-20230118182127-8e73980389ca/go.mod h1:oIo3/7miWTtrzlcLrpceTYc6/59RtCoZjSLgquHikEg=
75+
github.com/NVIDIA/nvidia-container-toolkit v1.12.0-rc.2.0.20230127101129-9fc2c5912242 h1:HF3fCuJBd1GVqd/HgLIU02t3TOR258Ww7phlLQSatEA=
76+
github.com/NVIDIA/nvidia-container-toolkit v1.12.0-rc.2.0.20230127101129-9fc2c5912242/go.mod h1:Og2VJA5PQsCmE16OayRT2/dhsAkB/E80q+E2ueUbMhA=
7477
github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=
7578
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
7679
github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tNFfI=
@@ -268,8 +271,9 @@ github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm4
268271
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4=
269272
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ=
270273
github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
271-
github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y=
272274
github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
275+
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
276+
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
273277
github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg=
274278
github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk=
275279
github.com/googleapis/gax-go/v2 v2.1.0/go.mod h1:Q3nei7sK6ybPYH7twZdmQpAd1MKb7pfu6SK+H1/DsU0=
@@ -308,6 +312,7 @@ github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxv
308312
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
309313
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
310314
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
315+
github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs=
311316
github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
312317
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
313318
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
@@ -346,7 +351,6 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8m
346351
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
347352
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
348353
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
349-
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs=
350354
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
351355
github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A=
352356
github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE=
@@ -414,8 +418,8 @@ github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeV
414418
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
415419
github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
416420
github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88=
417-
github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE=
418-
github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
421+
github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0=
422+
github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
419423
github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
420424
github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk=
421425
github.com/spf13/cobra v1.4.0 h1:y+wJpx64xcgO1V+RcnwW0LEHxTKRi2ZDPSBjWnrg88Q=
@@ -449,8 +453,8 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
449453
github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
450454
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
451455
github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
452-
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20220525163429-038b3f8b475d h1:WLnoEFzgCutNeWSdVTOU6U/55FA/FS0EbXNifmOAPHA=
453-
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20220525163429-038b3f8b475d/go.mod h1:TBB3sR7/jg4RCThC/cgT4fB8mAbbMO307TycfgeR59w=
456+
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230119114711-6fe07bb33342 h1:083n9fJt2dWOpJd/X/q9Xgl5XtQLL22uSFYbzVqJssg=
457+
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230119114711-6fe07bb33342/go.mod h1:GStidGxhaqJhYFW1YpOnLvYCbL2EsM0od7IW4u7+JgU=
454458
go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
455459
go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
456460
go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
@@ -661,6 +665,7 @@ golang.org/x/sys v0.0.0-20210908233432-aa78b53d3365/go.mod h1:oPkhp1MJrh7nUepCBc
661665
golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
662666
golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
663667
golang.org/x/sys v0.0.0-20220412211240-33da011f77ad/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
668+
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
664669
golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U=
665670
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
666671
golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw=
@@ -886,8 +891,8 @@ gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLks
886891
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
887892
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
888893
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
889-
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU=
890894
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
895+
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
891896
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
892897
gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
893898
gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=

validator/main.go

+77-34
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,12 @@ import (
2828
"syscall"
2929
"time"
3030

31+
devchar "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/hook/create-dev-char-symlinks"
3132
log "github.com/sirupsen/logrus"
3233
cli "github.com/urfave/cli/v2"
3334
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvmdev"
3435
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci"
36+
3537
v1 "k8s.io/api/core/v1"
3638
"k8s.io/apimachinery/pkg/api/resource"
3739
meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -99,18 +101,19 @@ type VGPUDevices struct {
99101
}
100102

101103
var (
102-
kubeconfigFlag string
103-
nodeNameFlag string
104-
namespaceFlag string
105-
withWaitFlag bool
106-
withWorkloadFlag bool
107-
componentFlag string
108-
cleanupAllFlag bool
109-
outputDirFlag string
110-
sleepIntervalSecondsFlag int
111-
migStrategyFlag string
112-
metricsPort int
113-
defaultGPUWorkloadConfigFlag string
104+
kubeconfigFlag string
105+
nodeNameFlag string
106+
namespaceFlag string
107+
withWaitFlag bool
108+
withWorkloadFlag bool
109+
componentFlag string
110+
cleanupAllFlag bool
111+
outputDirFlag string
112+
sleepIntervalSecondsFlag int
113+
migStrategyFlag string
114+
metricsPort int
115+
defaultGPUWorkloadConfigFlag string
116+
disableDevCharSymlinkCreation bool
114117
)
115118

116119
// defaultGPUWorkloadConfig is "vm-passthrough" unless
@@ -124,6 +127,10 @@ const (
124127
defaultSleepIntervalSeconds = 5
125128
// defaultMetricsPort indicates the port on which the metrics will be exposed.
126129
defaultMetricsPort = 0
130+
// hostDevCharPath indicates the path in the container where the host '/dev/char' directory is mounted to
131+
hostDevCharPath = "/host-dev-char"
132+
// driverContainerRoot indicates the path on the host where driver container mounts its root filesystem
133+
driverContainerRoot = "/run/nvidia/driver"
127134
// driverStatusFile indicates status file for containerized driver readiness
128135
driverStatusFile = "driver-ready"
129136
// hostDriverStatusFile indicates status file for host driver readiness
@@ -298,6 +305,13 @@ func main() {
298305
Destination: &defaultGPUWorkloadConfigFlag,
299306
EnvVars: []string{"DEFAULT_GPU_WORKLOAD_CONFIG"},
300307
},
308+
&cli.BoolFlag{
309+
Name: "disable-dev-char-symlink-creation",
310+
Value: false,
311+
Usage: "disable creation of symlinks under /dev/char corresponding to NVIDIA character devices",
312+
Destination: &disableDevCharSymlinkCreation,
313+
EnvVars: []string{"DISABLE_DEV_CHAR_SYMLINK_CREATION"},
314+
},
301315
}
302316

303317
// Handle signals
@@ -574,7 +588,7 @@ func getDriverRoot() (string, bool) {
574588
return "/host", true
575589
}
576590

577-
return "/run/nvidia/driver", false
591+
return driverContainerRoot, false
578592
}
579593

580594
// For driver container installs, check existence of .driver-ctr-ready to confirm running driver
@@ -625,10 +639,32 @@ func (d *Driver) validate() error {
625639

626640
hostDriver, err := d.runValidation(false)
627641
if err != nil {
628-
fmt.Println("driver is not ready")
642+
log.Error("driver is not ready")
629643
return err
630644
}
631645

646+
if !disableDevCharSymlinkCreation {
647+
log.Info("creating symlinks under /dev/char that correspond to NVIDIA character devices")
648+
err = createDevCharSymlinks(hostDriver)
649+
if err != nil {
650+
msg := strings.Join([]string{
651+
"Failed to create symlinks under /dev/char that point to all possible NVIDIA character devices.",
652+
"The existence of these symlinks is required to address the following bug:",
653+
"",
654+
" https://github.com/NVIDIA/gpu-operator/issues/430",
655+
"",
656+
"This bug impacts container runtimes configured with systemd cgroup management enabled.",
657+
"To disable the symlink creation, set the following envvar in ClusterPolicy:",
658+
"",
659+
" validator:",
660+
" driver:",
661+
" env:",
662+
" - name: DISABLE_DEV_CHAR_SYMLINK_CREATION",
663+
" value: \"true\""}, "\n")
664+
return fmt.Errorf("%v\n\n%s", err, msg)
665+
}
666+
}
667+
632668
statusFile := driverStatusFile
633669
if hostDriver {
634670
statusFile = hostDriverStatusFile
@@ -642,6 +678,29 @@ func (d *Driver) validate() error {
642678
return nil
643679
}
644680

681+
func createDevCharSymlinks(hostDriver bool) error {
682+
driverRoot := driverContainerRoot
683+
if hostDriver {
684+
driverRoot = "/"
685+
}
686+
687+
creator, err := devchar.NewSymlinkCreator(
688+
devchar.WithDriverRoot(driverRoot),
689+
devchar.WithDevCharPath(hostDevCharPath),
690+
devchar.WithCreateAll(true),
691+
)
692+
if err != nil {
693+
return fmt.Errorf("error creating symlink creator: %v", err)
694+
}
695+
696+
err = creator.CreateLinks()
697+
if err != nil {
698+
return fmt.Errorf("error creating symlinks: %v", err)
699+
}
700+
701+
return nil
702+
}
703+
645704
func createStatusFile(statusFile string) error {
646705
_, err := os.Create(statusFile)
647706
if err != nil {
@@ -1276,33 +1335,17 @@ func (v *VfioPCI) validate() error {
12761335

12771336
func (v *VfioPCI) runValidation(silent bool) error {
12781337
nvpci := nvpci.New()
1279-
nvdevices, err := nvpci.GetAllDevices()
1338+
nvdevices, err := nvpci.GetGPUs()
12801339
if err != nil {
12811340
return fmt.Errorf("error getting NVIDIA PCI devices: %v", err)
12821341
}
12831342

12841343
for _, dev := range nvdevices {
1285-
path := filepath.Join(dev.Path, "driver")
1286-
fileInfo, err := os.Lstat(path)
1287-
if err != nil {
1288-
if os.IsNotExist(err) {
1289-
return fmt.Errorf("device %s is not bound to any driver", dev.Address)
1290-
}
1291-
return fmt.Errorf("failed to get file info for %s: %v", path, err)
1292-
}
1293-
1294-
driverName := ""
1295-
if fileInfo.Mode()&os.ModeSymlink == os.ModeSymlink {
1296-
link, _ := filepath.EvalSymlinks(path)
1297-
driverName = filepath.Base(link)
1298-
} else {
1299-
return fmt.Errorf("%s is malinformed: %v", path, err)
1300-
}
1301-
1302-
if driverName != "vfio-pci" {
1303-
return fmt.Errorf("device %s is bound to driver '%s'", dev.Address, driverName)
1344+
if dev.Driver != "vfio-pci" {
1345+
return fmt.Errorf("device not bound to 'vfio-pci'; device: %s driver: '%s'", dev.Address, dev.Driver)
13041346
}
13051347
}
1348+
13061349
return nil
13071350
}
13081351

0 commit comments

Comments
 (0)