Skip to content

Commit 6c34f5f

Browse files
authored
Merge pull request #218 from klueska/update-1.32
Update to work with kubernetes 1.32
2 parents c0b728c + 1258b53 commit 6c34f5f

File tree

1,237 files changed

+54928
-38469
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

1,237 files changed

+54928
-38469
lines changed

README.md

+5-4
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,14 @@ We now install the NVIDIA GPU DRA driver:
7979
./demo/clusters/kind/install-dra-driver.sh
8080
```
8181

82-
This should show two pods running in the `nvidia-dra-driver` namespace:
82+
This should show two pods running in the `nvidia` namespace:
8383
```console
84-
kubectl get pods -n nvidia-dra-driver
84+
kubectl get pods -n nvidia
8585
```
8686
```
87-
NAME READY STATUS RESTARTS AGE
88-
nvidia-k8s-dra-driver-kubelet-plugin-t5qgz 1/1 Running 0 44s
87+
NAME READY STATUS RESTARTS AGE
88+
nvidia-dra-driver-k8s-dra-driver-controller-844fcb94b-ktbkc 1/1 Running 0 69s
89+
nvidia-dra-driver-k8s-dra-driver-kubelet-plugin-5vfp9 1/1 Running 0 69s
8990
```
9091

9192
### Run the examples by following the steps in the demo script

cmd/nvidia-dra-controller/imex.go

+15-21
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ import (
2525
"time"
2626

2727
v1 "k8s.io/api/core/v1"
28-
resourceapi "k8s.io/api/resource/v1alpha3"
28+
resourceapi "k8s.io/api/resource/v1beta1"
2929
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3030
"k8s.io/apimachinery/pkg/labels"
3131
"k8s.io/apimachinery/pkg/selection"
@@ -60,7 +60,6 @@ type ImexManager struct {
6060
waitGroup sync.WaitGroup
6161
clientset kubernetes.Interface
6262
imexDomainOffsets imexDomainOffsets
63-
owner resourceslice.Owner
6463
driverResources *resourceslice.DriverResources
6564
}
6665

@@ -77,20 +76,6 @@ func StartIMEXManager(ctx context.Context, config *Config) (*ImexManager, error)
7776
return nil, fmt.Errorf("error creating dynamic client: %w", err)
7877
}
7978

80-
// Fetch the current Pod object
81-
pod, err := clientset.CoreV1().Pods(config.flags.namespace).Get(ctx, config.flags.podName, metav1.GetOptions{})
82-
if err != nil {
83-
return nil, fmt.Errorf("error fetching pod: %w", err)
84-
}
85-
86-
// Set the owner of the ResourceSlices we will create
87-
owner := resourceslice.Owner{
88-
APIVersion: "v1",
89-
Kind: "Pod",
90-
Name: pod.Name,
91-
UID: pod.UID,
92-
}
93-
9479
// Create a new set of DriverResources
9580
driverResources := &resourceslice.DriverResources{
9681
Pools: make(map[string]resourceslice.Pool),
@@ -103,7 +88,6 @@ func StartIMEXManager(ctx context.Context, config *Config) (*ImexManager, error)
10388
driverImexChannelLimit: DriverImexChannelLimit,
10489
retryTimeout: RetryTimeout,
10590
clientset: clientset,
106-
owner: owner,
10791
driverResources: driverResources,
10892
imexDomainOffsets: make(imexDomainOffsets),
10993
}
@@ -125,8 +109,14 @@ func (m *ImexManager) manageResourceSlices(ctx context.Context) error {
125109
return fmt.Errorf("error streaming IMEX domains: %w", err)
126110
}
127111

112+
options := resourceslice.Options{
113+
DriverName: m.driverName,
114+
KubeClient: m.clientset,
115+
Resources: m.driverResources,
116+
}
117+
128118
klog.Info("Start publishing IMEX channels to ResourceSlices...")
129-
controller, err := resourceslice.StartController(ctx, m.clientset, m.driverName, m.owner, m.driverResources)
119+
controller, err := resourceslice.StartController(ctx, options)
130120
if err != nil {
131121
return fmt.Errorf("error starting resource slice controller: %w", err)
132122
}
@@ -310,13 +300,13 @@ func (m *ImexManager) cleanupResourceSlices() error {
310300
ops := metav1.ListOptions{
311301
FieldSelector: fmt.Sprintf("%s=%s", resourceapi.ResourceSliceSelectorDriver, DriverName),
312302
}
313-
l, err := m.clientset.ResourceV1alpha3().ResourceSlices().List(context.Background(), ops)
303+
l, err := m.clientset.ResourceV1beta1().ResourceSlices().List(context.Background(), ops)
314304
if err != nil {
315305
return fmt.Errorf("error listing resource slices: %w", err)
316306
}
317307

318308
for _, rs := range l.Items {
319-
err := m.clientset.ResourceV1alpha3().ResourceSlices().Delete(context.Background(), rs.Name, metav1.DeleteOptions{})
309+
err := m.clientset.ResourceV1beta1().ResourceSlices().Delete(context.Background(), rs.Name, metav1.DeleteOptions{})
320310
if err != nil {
321311
return fmt.Errorf("error deleting resource slice %s: %w", rs.Name, err)
322312
}
@@ -415,7 +405,11 @@ func generateImexChannelPool(imexDomain string, startChannel int, numChannels in
415405
},
416406
},
417407
},
418-
Devices: devices,
408+
Slices: []resourceslice.Slice{
409+
{
410+
Devices: devices,
411+
},
412+
},
419413
}
420414

421415
return pool

cmd/nvidia-dra-plugin/allocatable.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ package main
1919
import (
2020
"slices"
2121

22-
resourceapi "k8s.io/api/resource/v1alpha3"
22+
resourceapi "k8s.io/api/resource/v1beta1"
2323
)
2424

2525
type AllocatableDevices map[string]*AllocatableDevice

cmd/nvidia-dra-plugin/device_state.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@ import (
2222
"slices"
2323
"sync"
2424

25-
resourceapi "k8s.io/api/resource/v1alpha3"
25+
resourceapi "k8s.io/api/resource/v1beta1"
2626
"k8s.io/apimachinery/pkg/runtime"
2727
"k8s.io/klog/v2"
28-
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
28+
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
2929
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
3030
cdiapi "tags.cncf.io/container-device-interface/pkg/cdi"
3131

cmd/nvidia-dra-plugin/deviceinfo.go

+18-12
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ import (
2222
"github.com/Masterminds/semver"
2323
nvdev "github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
2424
"github.com/NVIDIA/go-nvml/pkg/nvml"
25-
resourceapi "k8s.io/api/resource/v1alpha3"
25+
resourceapi "k8s.io/api/resource/v1beta1"
2626
"k8s.io/apimachinery/pkg/api/resource"
2727
"k8s.io/utils/ptr"
2828
)
@@ -131,8 +131,10 @@ func (d *GpuInfo) GetDevice() resourceapi.Device {
131131
VersionValue: ptr.To(semver.MustParse(d.cudaDriverVersion).String()),
132132
},
133133
},
134-
Capacity: map[resourceapi.QualifiedName]resource.Quantity{
135-
"memory": *resource.NewQuantity(int64(d.memoryBytes), resource.BinarySI),
134+
Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{
135+
"memory": {
136+
Value: *resource.NewQuantity(int64(d.memoryBytes), resource.BinarySI),
137+
},
136138
},
137139
},
138140
}
@@ -181,20 +183,24 @@ func (d *MigDeviceInfo) GetDevice() resourceapi.Device {
181183
VersionValue: ptr.To(semver.MustParse(d.parent.cudaDriverVersion).String()),
182184
},
183185
},
184-
Capacity: map[resourceapi.QualifiedName]resource.Quantity{
185-
"multiprocessors": *resource.NewQuantity(int64(d.giProfileInfo.MultiprocessorCount), resource.BinarySI),
186-
"copyEngines": *resource.NewQuantity(int64(d.giProfileInfo.CopyEngineCount), resource.BinarySI),
187-
"decoders": *resource.NewQuantity(int64(d.giProfileInfo.DecoderCount), resource.BinarySI),
188-
"encoders": *resource.NewQuantity(int64(d.giProfileInfo.EncoderCount), resource.BinarySI),
189-
"jpegEngines": *resource.NewQuantity(int64(d.giProfileInfo.JpegCount), resource.BinarySI),
190-
"ofaEngines": *resource.NewQuantity(int64(d.giProfileInfo.OfaCount), resource.BinarySI),
191-
"memory": *resource.NewQuantity(int64(d.giProfileInfo.MemorySizeMB*1024*1024), resource.BinarySI),
186+
Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{
187+
"multiprocessors": {
188+
Value: *resource.NewQuantity(int64(d.giProfileInfo.MultiprocessorCount), resource.BinarySI),
189+
},
190+
"copyEngines": {Value: *resource.NewQuantity(int64(d.giProfileInfo.CopyEngineCount), resource.BinarySI)},
191+
"decoders": {Value: *resource.NewQuantity(int64(d.giProfileInfo.DecoderCount), resource.BinarySI)},
192+
"encoders": {Value: *resource.NewQuantity(int64(d.giProfileInfo.EncoderCount), resource.BinarySI)},
193+
"jpegEngines": {Value: *resource.NewQuantity(int64(d.giProfileInfo.JpegCount), resource.BinarySI)},
194+
"ofaEngines": {Value: *resource.NewQuantity(int64(d.giProfileInfo.OfaCount), resource.BinarySI)},
195+
"memory": {Value: *resource.NewQuantity(int64(d.giProfileInfo.MemorySizeMB*1024*1024), resource.BinarySI)},
192196
},
193197
},
194198
}
195199
for i := d.placement.Start; i < d.placement.Start+d.placement.Size; i++ {
196200
capacity := resourceapi.QualifiedName(fmt.Sprintf("memorySlice%d", i))
197-
device.Basic.Capacity[capacity] = *resource.NewQuantity(1, resource.BinarySI)
201+
device.Basic.Capacity[capacity] = resourceapi.DeviceCapacity{
202+
Value: *resource.NewQuantity(1, resource.BinarySI),
203+
}
198204
}
199205
return device
200206
}

cmd/nvidia-dra-plugin/driver.go

+5-3
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,11 @@ import (
2525
coreclientset "k8s.io/client-go/kubernetes"
2626
"k8s.io/dynamic-resource-allocation/kubeletplugin"
2727
"k8s.io/klog/v2"
28-
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
28+
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
2929
)
3030

31+
var _ drapbv1.DRAPluginServer = &driver{}
32+
3133
type driver struct {
3234
sync.Mutex
3335
client coreclientset.Interface
@@ -48,7 +50,7 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) {
4850

4951
plugin, err := kubeletplugin.Start(
5052
ctx,
51-
driver,
53+
[]any{driver},
5254
kubeletplugin.KubeClient(driver.client),
5355
kubeletplugin.NodeName(config.flags.nodeName),
5456
kubeletplugin.DriverName(DriverName),
@@ -117,7 +119,7 @@ func (d *driver) nodePrepareResource(ctx context.Context, claim *drapbv1.Claim)
117119
d.Lock()
118120
defer d.Unlock()
119121

120-
resourceClaim, err := d.client.ResourceV1alpha3().ResourceClaims(claim.Namespace).Get(
122+
resourceClaim, err := d.client.ResourceV1beta1().ResourceClaims(claim.Namespace).Get(
121123
ctx,
122124
claim.Name,
123125
metav1.GetOptions{})

cmd/nvidia-dra-plugin/prepared.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ package main
1919
import (
2020
"slices"
2121

22-
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
22+
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
2323
)
2424

2525
type PreparedDeviceList []PreparedDevice

demo/clusters/kind/scripts/common.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ DRIVER_IMAGE_VERSION=$(from_versions_mk "VERSION")
3939
# From https://github.com/kubernetes/kubernetes/tags
4040
# See also https://hub.docker.com/r/kindest/node/tags
4141
: ${KIND_K8S_REPO:="https://github.com/kubernetes/kubernetes.git"}
42-
: ${KIND_K8S_TAG:="v1.31.0"}
42+
: ${KIND_K8S_TAG:="v1.32.0"}
4343

4444
# The name of the kind cluster to create
4545
: ${KIND_CLUSTER_NAME:="${DRIVER_NAME}-cluster"}

demo/clusters/kind/scripts/kind-cluster-config.yaml

+5-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ nodes:
3232
kind: ClusterConfiguration
3333
apiServer:
3434
extraArgs:
35-
runtime-config: "resource.k8s.io/v1alpha3=true"
35+
runtime-config: "resource.k8s.io/v1beta1=true"
3636
scheduler:
3737
extraArgs:
3838
v: "1"
@@ -66,3 +66,7 @@ nodes:
6666
# on the kind nodes.
6767
- hostPath: /usr/bin/nvidia-ctk
6868
containerPath: /usr/bin/nvidia-ctk
69+
# We need to inject the fabricmanager socket to support MIG with toolkit 1.16.2
70+
# TODO: Remove this once we have a version of the toolkit where this is not required
71+
- hostPath: /run/nvidia-fabricmanager/socket
72+
containerPath: /run/nvidia-fabricmanager/socket

demo/clusters/nvkind/scripts/kind-cluster-config.yaml

+12-3
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ nodes:
3232
kind: ClusterConfiguration
3333
apiServer:
3434
extraArgs:
35-
runtime-config: "resource.k8s.io/v1alpha3=true"
35+
runtime-config: "resource.k8s.io/v1beta1=true"
3636
scheduler:
3737
extraArgs:
3838
v: "1"
@@ -48,8 +48,6 @@ nodes:
4848
- role: worker
4949
labels:
5050
node-role.x-k8s.io/worker: ""
51-
nvidia.com/gpu.clusteruuid: "0f884867-ba2f-4294-9155-b495ff367eea"
52-
nvidia.com/gpu.cliqueid: "{{ add 1 (mod $gpu 2) }}"
5351
kubeadmConfigPatches:
5452
- |
5553
kind: JoinConfiguration
@@ -62,4 +60,15 @@ nodes:
6260
# in `/etc/nvidia-container-runtime/config.toml`
6361
- hostPath: /dev/null
6462
containerPath: /var/run/nvidia-container-devices/cdi/runtime.nvidia.com/gpu/{{ $gpu }}
63+
# The generated CDI specification assumes that `nvidia-ctk` is available on a
64+
# node -- specifically for the `nvidia-ctk hook` subcommand. As a workaround,
65+
# we mount it from the host.
66+
# TODO: Remove this once we have a more stable solution to make `nvidia-ctk`
67+
# available on the kind nodes.
68+
- hostPath: /usr/bin/nvidia-ctk
69+
containerPath: /usr/bin/nvidia-ctk
70+
# We need to inject the fabricmanager socket to support MIG with toolkit 1.16.2
71+
# TODO: Remove this once we have a version of the toolkit where this is not required
72+
- hostPath: /run/nvidia-fabricmanager/socket
73+
containerPath: /run/nvidia-fabricmanager/socket
6574
{{- end }}

demo/specs/quickstart/gpu-test-mps.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ kind: Namespace
55
metadata:
66
name: gpu-test-mps
77
---
8-
apiVersion: resource.k8s.io/v1alpha3
8+
apiVersion: resource.k8s.io/v1beta1
99
kind: ResourceClaimTemplate
1010
metadata:
1111
namespace: gpu-test-mps

demo/specs/quickstart/gpu-test1.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ metadata:
88
name: gpu-test1
99

1010
---
11-
apiVersion: resource.k8s.io/v1alpha3
11+
apiVersion: resource.k8s.io/v1beta1
1212
kind: ResourceClaimTemplate
1313
metadata:
1414
namespace: gpu-test1

demo/specs/quickstart/gpu-test2.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ metadata:
88
name: gpu-test2
99

1010
---
11-
apiVersion: resource.k8s.io/v1alpha3
11+
apiVersion: resource.k8s.io/v1beta1
1212
kind: ResourceClaimTemplate
1313
metadata:
1414
namespace: gpu-test2

demo/specs/quickstart/gpu-test3.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ metadata:
88
name: gpu-test3
99

1010
---
11-
apiVersion: resource.k8s.io/v1alpha3
11+
apiVersion: resource.k8s.io/v1beta1
1212
kind: ResourceClaim
1313
metadata:
1414
namespace: gpu-test3

demo/specs/quickstart/gpu-test4.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ metadata:
99
name: gpu-test4
1010

1111
---
12-
apiVersion: resource.k8s.io/v1alpha3
12+
apiVersion: resource.k8s.io/v1beta1
1313
kind: ResourceClaimTemplate
1414
metadata:
1515
namespace: gpu-test4

demo/specs/quickstart/gpu-test5.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ metadata:
88
name: gpu-test5
99

1010
---
11-
apiVersion: resource.k8s.io/v1alpha3
11+
apiVersion: resource.k8s.io/v1beta1
1212
kind: ResourceClaimTemplate
1313
metadata:
1414
namespace: gpu-test5

demo/specs/quickstart/gpu-test6.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ metadata:
99
name: gpu-test6
1010

1111
---
12-
apiVersion: resource.k8s.io/v1alpha3
12+
apiVersion: resource.k8s.io/v1beta1
1313
kind: ResourceClaimTemplate
1414
metadata:
1515
namespace: gpu-test6

demo/specs/quickstart/imex-test1.yaml

+5-5
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ metadata:
88
name: imex-test1
99

1010
---
11-
apiVersion: resource.k8s.io/v1alpha3
11+
apiVersion: resource.k8s.io/v1beta1
1212
kind: ResourceClaim
1313
metadata:
1414
namespace: imex-test1
@@ -20,7 +20,7 @@ spec:
2020
deviceClassName: imex.nvidia.com
2121

2222
---
23-
apiVersion: resource.k8s.io/v1alpha3
23+
apiVersion: resource.k8s.io/v1beta1
2424
kind: ResourceClaim
2525
metadata:
2626
namespace: imex-test1
@@ -32,7 +32,7 @@ spec:
3232
deviceClassName: imex.nvidia.com
3333

3434
---
35-
apiVersion: resource.k8s.io/v1alpha3
35+
apiVersion: resource.k8s.io/v1beta1
3636
kind: ResourceClaim
3737
metadata:
3838
namespace: imex-test1
@@ -44,7 +44,7 @@ spec:
4444
deviceClassName: imex.nvidia.com
4545

4646
---
47-
apiVersion: resource.k8s.io/v1alpha3
47+
apiVersion: resource.k8s.io/v1beta1
4848
kind: ResourceClaim
4949
metadata:
5050
namespace: imex-test1
@@ -56,7 +56,7 @@ spec:
5656
deviceClassName: imex.nvidia.com
5757

5858
---
59-
apiVersion: resource.k8s.io/v1alpha3
59+
apiVersion: resource.k8s.io/v1beta1
6060
kind: ResourceClaimTemplate
6161
metadata:
6262
namespace: imex-test1

0 commit comments

Comments
 (0)