Skip to content

Commit b46450f

Browse files
committed
skip old architecture version GPU settings time slice
Signed-off-by: wawa0210 <[email protected]>
1 parent 58ae162 commit b46450f

File tree

1 file changed

+50
-1
lines changed

1 file changed

+50
-1
lines changed

cmd/nvidia-dra-plugin/sharing.go

+50-1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"fmt"
2323
"os"
2424
"os/exec"
25+
"strconv"
2526
"strings"
2627
"text/template"
2728
"time"
@@ -101,6 +102,21 @@ func (t *TimeSlicingManager) SetTimeSlice(devices *PreparedDevices, config *nasc
101102
return fmt.Errorf("setting a TimeSlice duration on MIG devices is unsupported")
102103
}
103104

105+
var noSupportTimeSliceIDs []string
106+
for _, gpu := range devices.Gpu.Devices {
107+
err, isSupportTimeSlice := detectSupportTimeSliceByCudaComputeCapability(gpu.cudaComputeCapability)
108+
if err != nil {
109+
return fmt.Errorf("failed to detectSupportTimeSliceByCudaComputeCapability : %w", err)
110+
}
111+
if !isSupportTimeSlice {
112+
klog.InfoS("the current card does not support setting time slices and will be ignored.", "arch", gpu.architecture, "uuid", gpu.uuid, "cudaComputeCapability", gpu.cudaComputeCapability)
113+
noSupportTimeSliceIDs = append(noSupportTimeSliceIDs, gpu.uuid)
114+
continue
115+
}
116+
}
117+
118+
supportTimeSliceIDs := difference(devices.UUIDs(), noSupportTimeSliceIDs)
119+
104120
timeSlice := nascrd.DefaultTimeSlice
105121
if config != nil && config.TimeSlice != nil {
106122
timeSlice = *config.TimeSlice
@@ -111,7 +127,7 @@ func (t *TimeSlicingManager) SetTimeSlice(devices *PreparedDevices, config *nasc
111127
return fmt.Errorf("error setting compute mode: %w", err)
112128
}
113129

114-
err = t.nvdevlib.setTimeSlice(devices.UUIDs(), timeSlice.Int())
130+
err = t.nvdevlib.setTimeSlice(supportTimeSliceIDs, timeSlice.Int())
115131
if err != nil {
116132
return fmt.Errorf("error setting time slice: %w", err)
117133
}
@@ -389,3 +405,36 @@ func (m *MpsControlDaemon) Stop(ctx context.Context) error {
389405

390406
return nil
391407
}
408+
409+
// detectSupportTimeSliceByCudaComputeCapability reports whether a GPU with the
// given CUDA compute capability supports setting a time slice.
//
// The capability is expected in "<major>.<minor>" form; only the major
// component (the text before the first ".") is inspected, and a major version
// of 7 or higher is treated as supported.
//
// When the major component cannot be parsed as an integer, a non-nil error
// wrapping the parse failure is returned together with false. Note that the
// error is the FIRST result and the bool the second.
func detectSupportTimeSliceByCudaComputeCapability(cudaComputeCapability string) (error, bool) {
	// ref https://github.com/NVIDIA/k8s-dra-driver/pull/58#discussion_r1469338562
	// We believe time-slicing is available on Volta+ architectures, so the
	// check is simply cudaComputeCapability >= 7.0.
	// Per https://github.com/NVIDIA/go-nvlib/blob/main/pkg/nvlib/device/device.go#L149
	// the CUDA major and minor versions are concatenated with a `.`.
	const minTimeSliceMajor = 7

	// SplitN always yields at least one element, so indexing [0] is safe even
	// for an empty input (which then fails in Atoi below).
	majorPart := strings.SplitN(cudaComputeCapability, ".", 2)[0]
	major, err := strconv.Atoi(majorPart)
	if err != nil {
		// Wrap the parse error so callers can inspect the underlying cause.
		return fmt.Errorf("error to get cudaComputeCapability major version %v: %w", cudaComputeCapability, err), false
	}
	return nil, major >= minTimeSliceMajor
}
426+
427+
// difference reports every element of a that does not occur in b. The order
// of a is kept and duplicates in a are retained. The result is nil when no
// element of a survives.
func difference(a, b []string) []string {
	// Build a set of the values to exclude for constant-time membership checks.
	excluded := make(map[string]struct{}, len(b))
	for i := range b {
		excluded[b[i]] = struct{}{}
	}

	var remaining []string
	for i := range a {
		if _, skip := excluded[a[i]]; skip {
			continue
		}
		remaining = append(remaining, a[i])
	}
	return remaining
}

0 commit comments

Comments
 (0)