Skip to content

Commit 86de1cb

Browse files
committed
GPU sharing on cuda compute capability >=7.5
Signed-off-by: Swati Gupta <[email protected]>
1 parent 54334a1 commit 86de1cb

File tree

1 file changed

+21
-2
lines changed

1 file changed

+21
-2
lines changed

cmd/nvidia-dra-plugin/device_state.go

+21-2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"fmt"
2222
"slices"
23+
"strings"
2324
"sync"
2425

2526
resourceapi "k8s.io/api/resource/v1beta1"
@@ -29,6 +30,8 @@ import (
2930
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
3031
cdiapi "tags.cncf.io/container-device-interface/pkg/cdi"
3132

33+
"golang.org/x/mod/semver"
34+
3235
configapi "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/v1alpha1"
3336
)
3437

@@ -390,6 +393,21 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
390393
allocatableDevices[r.Device] = s.allocatable[r.Device]
391394
}
392395

396+
// Allow only devices with CUDA compute capability >= 7.5, as time-slicing and MPS do not work on older architectures.
397+
shareableAllocatableDevices := make(AllocatableDevices)
398+
for device, deviceType := range allocatableDevices {
399+
if deviceType.Gpu != nil {
400+
cudaCCv := "v" + strings.TrimPrefix(deviceType.Gpu.cudaComputeCapability, "v")
401+
gpuUUID := deviceType.Gpu.UUID
402+
if semver.Compare(semver.Canonical(cudaCCv), semver.Canonical("v7.5")) >= 0 {
403+
klog.Infof("GPU sharing is available on this device UUID=%v with CudaComputeCapability=%v", gpuUUID, cudaCCv)
404+
shareableAllocatableDevices[device] = deviceType
405+
} else {
406+
return nil, fmt.Errorf("GPU sharing is not available on this device UUID=%v", gpuUUID)
407+
}
408+
}
409+
}
410+
393411
// Declare a device group state object to populate.
394412
var configState DeviceConfigState
395413

@@ -400,7 +418,7 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
400418
return nil, fmt.Errorf("error getting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
401419
}
402420
if tsc != nil {
403-
err = s.tsManager.SetTimeSlice(allocatableDevices, tsc)
421+
err = s.tsManager.SetTimeSlice(shareableAllocatableDevices, tsc)
404422
if err != nil {
405423
return nil, fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
406424
}
@@ -413,7 +431,8 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
413431
if err != nil {
414432
return nil, fmt.Errorf("error getting MPS configuration: %w", err)
415433
}
416-
mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(string(claim.UID), allocatableDevices)
434+
435+
mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(string(claim.UID), shareableAllocatableDevices)
417436
if err := mpsControlDaemon.Start(ctx, mpsc); err != nil {
418437
return nil, fmt.Errorf("error starting MPS control daemon: %w", err)
419438
}

0 commit comments

Comments
 (0)