Skip to content

Commit ac04af7

Browse files
authored
feat: add GPU info configuration and TFLOPS-based resource limiting (#84)
- feat: add GPU info configuration and TFLOPS-based resource limiting
- fix lint
1 parent 3990470 commit ac04af7

File tree

9 files changed

+62
-14
lines changed

9 files changed

+62
-14
lines changed

charts/tensor-fusion/Chart.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.2.13
18+
version: 1.2.14
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/templates/controller-deployment.yaml

+7
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ spec:
5757
- name: cloud-vendor-credentials
5858
mountPath: /tmp/secret
5959
readOnly: true
60+
- name: gpu-info
61+
mountPath: /etc/tensor-fusion
62+
readOnly: true
6063
{{- if .Values.agent.agentId }}
6164
- name: cluster-agent
6265
image: "{{ .Values.agent.image.repository }}:{{ .Values.agent.image.tag | default "latest" }}"
@@ -105,6 +108,10 @@ spec:
105108
secret:
106109
secretName: tf-cloud-vendor-credentials
107110
defaultMode: 420
111+
- configMap:
112+
defaultMode: 420
113+
name: {{ .Release.Name }}-public-gpu-info
114+
name: gpu-info
108115
- name: logs
109116
emptyDir: {}
110117
{{- with .Values.controller.affinity }}

cmd/main.go

+11
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ import (
4141

4242
tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion/api/v1"
4343
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
44+
"github.com/NexusGPU/tensor-fusion/internal/config"
4445
"github.com/NexusGPU/tensor-fusion/internal/controller"
4546
"github.com/NexusGPU/tensor-fusion/internal/scheduler"
4647
"github.com/NexusGPU/tensor-fusion/internal/server"
@@ -69,6 +70,7 @@ func main() {
6970
var secureMetrics bool
7071
var enableHTTP2 bool
7172
var tlsOpts []func(*tls.Config)
73+
var gpuInfoConfig string
7274

7375
flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
7476
"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
@@ -80,6 +82,8 @@ func main() {
8082
"If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead.")
8183
flag.BoolVar(&enableHTTP2, "enable-http2", false,
8284
"If set, HTTP/2 will be enabled for the metrics and webhook servers")
85+
flag.StringVar(&gpuInfoConfig, "gpu-info-config",
86+
"/etc/tensor-fusion/gpu-info.yaml", "specify the path to gpuInfoConfig file")
8387
opts := zap.Options{
8488
Development: true,
8589
}
@@ -107,6 +111,12 @@ func main() {
107111
TLSOpts: tlsOpts,
108112
})
109113

114+
gpuInfos, err := config.LoadGpuInfoFromFile(gpuInfoConfig)
115+
if err != nil {
116+
ctrl.Log.Error(err, "unable to read gpuInfoConfig file")
117+
gpuInfos = make([]config.GpuInfo, 0)
118+
}
119+
110120
// Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server.
111121
// More info:
112122
// - https://pkg.go.dev/sigs.k8s.io/[email protected]/pkg/metrics/server
@@ -260,6 +270,7 @@ func main() {
260270
Scheme: mgr.GetScheme(),
261271
Scheduler: scheduler,
262272
Recorder: mgr.GetEventRecorderFor("tensorfusionworkload"),
273+
GpuInfos: gpuInfos,
263274
}).SetupWithManager(mgr); err != nil {
264275
setupLog.Error(err, "unable to create controller", "controller", "TensorFusionWorkload")
265276
os.Exit(1)

cmd/nodediscovery/main.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ func main() {
109109

110110
allDeviceIDs := make([]string, 0)
111111

112-
for i := 0; i < count; i++ {
112+
for i := range count {
113113
device, ret := nvml.DeviceGetHandleByIndex(i)
114114
if ret != nvml.SUCCESS {
115115
ctrl.Log.Error(errors.New(nvml.ErrorString(ret)), "unable to get device", "index", i)

internal/config/gpu_info.go

+12
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,15 @@ func LoadGpuInfoFromFile(filename string) ([]GpuInfo, error) {
2727
}
2828
return infos, nil
2929
}
30+
31+
func MockGpuInfo() []GpuInfo {
32+
return []GpuInfo{
33+
{
34+
Model: "mock",
35+
Vendor: "mock",
36+
CostPerHour: 0.1,
37+
Fp16TFlops: resource.MustParse("1000"),
38+
FullModelName: "mock",
39+
},
40+
}
41+
}

internal/constants/constants.go

+7-6
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,13 @@ const (
4545
ConnectionNameEnv = "TENSOR_FUSION_CONNECTION_NAME"
4646
ConnectionNamespaceEnv = "TENSOR_FUSION_CONNECTION_NAMESPACE"
4747

48-
WorkerPortEnv = "TENSOR_FUSION_WORKER_PORT"
49-
WorkerCudaUpLimitEnv = "TENSOR_FUSION_CUDA_UP_LIMIT"
50-
WorkerCudaMemLimitEnv = "TENSOR_FUSION_CUDA_MEM_LIMIT"
51-
WorkerPodNameEnv = "POD_NAME"
52-
NamespaceEnv = "OPERATOR_NAMESPACE"
53-
NamespaceDefaultVal = "tensor-fusion-sys"
48+
WorkerPortEnv = "TENSOR_FUSION_WORKER_PORT"
49+
WorkerCudaUpLimitTflopsEnv = "TENSOR_FUSION_CUDA_UP_LIMIT_TFLOPS"
50+
WorkerCudaUpLimitEnv = "TENSOR_FUSION_CUDA_UP_LIMIT"
51+
WorkerCudaMemLimitEnv = "TENSOR_FUSION_CUDA_MEM_LIMIT"
52+
WorkerPodNameEnv = "POD_NAME"
53+
NamespaceEnv = "OPERATOR_NAMESPACE"
54+
NamespaceDefaultVal = "tensor-fusion-sys"
5455
)
5556

5657
const (

internal/controller/tensorfusionworkload_controller.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import (
3535

3636
tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion/api/v1"
3737
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
38+
"github.com/NexusGPU/tensor-fusion/internal/config"
3839
"github.com/NexusGPU/tensor-fusion/internal/constants"
3940
"github.com/NexusGPU/tensor-fusion/internal/metrics"
4041
scheduler "github.com/NexusGPU/tensor-fusion/internal/scheduler"
@@ -49,6 +50,7 @@ type TensorFusionWorkloadReconciler struct {
4950
Scheme *runtime.Scheme
5051
Scheduler scheduler.Scheduler
5152
Recorder record.EventRecorder
53+
GpuInfos []config.GpuInfo
5254
}
5355

5456
// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionworkloads,verbs=get;list;watch;create;update;patch;delete
@@ -106,7 +108,7 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl
106108
}
107109

108110
// Create worker generator
109-
workerGenerator := &worker.WorkerGenerator{WorkerConfig: pool.Spec.ComponentConfig.Worker}
111+
workerGenerator := &worker.WorkerGenerator{WorkerConfig: pool.Spec.ComponentConfig.Worker, GpuInfos: r.GpuInfos}
110112

111113
podTemplateHash, err := workerGenerator.PodTemplateHash(workload.Spec.Resources.Limits)
112114
if err != nil {

internal/controller/tensorfusionworkload_controller_test.go

+5-2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333

3434
tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion/api/v1"
3535
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
36+
"github.com/NexusGPU/tensor-fusion/internal/config"
3637
"github.com/NexusGPU/tensor-fusion/internal/constants"
3738
scheduler "github.com/NexusGPU/tensor-fusion/internal/scheduler"
3839
)
@@ -70,8 +71,9 @@ var _ = Describe("TensorFusionWorkload Controller", func() {
7071
}
7172
Expect(k8sClient.Create(ctx, gpu)).To(Succeed())
7273
gpu.Status = tfv1.GPUStatus{
73-
Phase: tfv1.TensorFusionGPUPhaseRunning,
74-
UUID: "mock-gpu",
74+
Phase: tfv1.TensorFusionGPUPhaseRunning,
75+
UUID: "mock-gpu",
76+
GPUModel: "mock",
7577
NodeSelector: map[string]string{
7678
"kubernetes.io/hostname": "mock-node",
7779
},
@@ -92,6 +94,7 @@ var _ = Describe("TensorFusionWorkload Controller", func() {
9294
Scheme: k8sClient.Scheme(),
9395
Scheduler: scheduler.NewScheduler(k8sClient),
9496
Recorder: record.NewFakeRecorder(3),
97+
GpuInfos: config.MockGpuInfo(),
9598
}
9699

97100
// Clean up any pods from previous tests

internal/worker/worker.go

+15-3
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@ import (
44
"context"
55
"encoding/json"
66
"fmt"
7+
"math"
78
"strconv"
89
"time"
910

1011
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
12+
"github.com/NexusGPU/tensor-fusion/internal/config"
1113
"github.com/NexusGPU/tensor-fusion/internal/constants"
1214
"github.com/NexusGPU/tensor-fusion/internal/utils"
1315
"github.com/samber/lo"
@@ -22,6 +24,7 @@ func init() {
2224
}
2325

2426
type WorkerGenerator struct {
27+
GpuInfos []config.GpuInfo
2528
WorkerConfig *tfv1.WorkerConfig
2629
}
2730

@@ -84,16 +87,25 @@ func (wg *WorkerGenerator) GenerateWorkerPod(
8487
SubPathExpr: fmt.Sprintf("${%s}", constants.WorkerPodNameEnv),
8588
})
8689

90+
info, ok := lo.Find(wg.GpuInfos, func(info config.GpuInfo) bool {
91+
return info.FullModelName == gpu.Status.GPUModel
92+
})
93+
if !ok {
94+
return nil, "", fmt.Errorf("gpu info(%s) not found", gpu.Status.GPUModel)
95+
}
96+
8797
spec.Containers[0].Env = append(spec.Containers[0].Env, corev1.EnvVar{
8898
Name: "NVIDIA_VISIBLE_DEVICES",
8999
Value: gpu.Status.UUID,
90100
}, corev1.EnvVar{
91101
Name: constants.WorkerPortEnv,
92102
Value: strconv.Itoa(port),
93103
}, corev1.EnvVar{
94-
Name: constants.WorkerCudaUpLimitEnv,
95-
// TODO: convert tflops to percent
96-
Value: "100",
104+
Name: constants.WorkerCudaUpLimitTflopsEnv,
105+
Value: strconv.FormatInt(info.Fp16TFlops.Value(), 10),
106+
}, corev1.EnvVar{
107+
Name: constants.WorkerCudaUpLimitEnv,
108+
Value: strconv.FormatInt(int64(math.Ceil(float64(limits.Tflops.Value())/float64(info.Fp16TFlops.Value())*100)), 10),
97109
}, corev1.EnvVar{
98110
Name: constants.WorkerCudaMemLimitEnv,
99111
// bytesize

0 commit comments

Comments
 (0)