Skip to content

Commit 395dabc

Browse files
committed
skip old architecture version GPU settings time slice
Signed-off-by: wawa0210 <[email protected]>
1 parent 0e01612 commit 395dabc

File tree

1 file changed

+33
-1
lines changed

1 file changed

+33
-1
lines changed

cmd/nvidia-dra-plugin/sharing.go

+33-1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"fmt"
2323
"os"
2424
"os/exec"
25+
"strconv"
2526
"strings"
2627
"text/template"
2728
"time"
@@ -102,6 +103,19 @@ func (t *TimeSlicingManager) SetTimeSlice(devices *PreparedDevices, config *shar
102103
return fmt.Errorf("setting a TimeSlice duration on MIG devices is unsupported")
103104
}
104105

106+
var supportTimeSliceIDs []string
107+
for _, gpu := range devices.Gpu.Devices {
108+
err, isSupportTimeSlice := detectSupportTimeSliceByCudaComputeCapability(gpu.cudaComputeCapability)
109+
if err != nil {
110+
return fmt.Errorf("failed to detectSupportTimeSliceByCudaComputeCapability : %w", err)
111+
}
112+
if isSupportTimeSlice {
113+
supportTimeSliceIDs = append(supportTimeSliceIDs, gpu.uuid)
114+
continue
115+
}
116+
klog.InfoS("the current card does not support setting time slices and will be ignored.", "arch", gpu.architecture, "uuid", gpu.uuid, "cudaComputeCapability", gpu.cudaComputeCapability)
117+
}
118+
105119
timeSlice := sharing.DefaultTimeSlice
106120
if config != nil && config.TimeSlice != nil {
107121
timeSlice = *config.TimeSlice
@@ -112,7 +126,7 @@ func (t *TimeSlicingManager) SetTimeSlice(devices *PreparedDevices, config *shar
112126
return fmt.Errorf("error setting compute mode: %w", err)
113127
}
114128

115-
err = t.nvdevlib.setTimeSlice(devices.UUIDs(), timeSlice.Int())
129+
err = t.nvdevlib.setTimeSlice(supportTimeSliceIDs, timeSlice.Int())
116130
if err != nil {
117131
return fmt.Errorf("error setting time slice: %w", err)
118132
}
@@ -390,3 +404,21 @@ func (m *MpsControlDaemon) Stop(ctx context.Context) error {
390404

391405
return nil
392406
}
407+
408+
// detectSupportTimeSliceByCudaComputeCapability reports whether a GPU with the
// given CUDA compute capability is treated as supporting a configurable time
// slice.
//
// cudaComputeCapability is expected to be in "<major>.<minor>" form; only the
// text before the first "." is inspected. The capability is considered
// supported when the major version is >= 7 (time-slicing is believed to be
// available on Volta and newer architectures).
//
// NOTE: the (error, bool) return order is unusual for Go but is kept as-is
// because existing callers destructure the results in that order. On a
// non-nil error the boolean is always false.
func detectSupportTimeSliceByCudaComputeCapability(cudaComputeCapability string) (error, bool) {
	// ref https://github.com/NVIDIA/k8s-dra-driver/pull/58#discussion_r1469338562
	// Time-slicing is believed to be available on Volta+ architectures, so the
	// check is simply "compute capability >= 7.0".
	// Per https://github.com/NVIDIA/go-nvlib/blob/main/pkg/nvlib/device/device.go#L149,
	// the CUDA major and minor versions are concatenated with ".".

	// Only the major component matters, so split off at most one separator.
	majorText := strings.SplitN(cudaComputeCapability, ".", 2)[0]
	major, err := strconv.Atoi(majorText)
	if err != nil {
		// Wrap the parse error so callers can inspect the underlying cause.
		return fmt.Errorf("parsing major version of cudaComputeCapability %q: %w", cudaComputeCapability, err), false
	}
	return nil, major >= 7
}

0 commit comments

Comments
 (0)