@@ -41,6 +41,7 @@ import (
41
41
42
42
tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion/api/v1"
43
43
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
44
+ "github.com/NexusGPU/tensor-fusion/internal/config"
44
45
"github.com/NexusGPU/tensor-fusion/internal/controller"
45
46
"github.com/NexusGPU/tensor-fusion/internal/scheduler"
46
47
"github.com/NexusGPU/tensor-fusion/internal/server"
@@ -69,6 +70,7 @@ func main() {
69
70
var secureMetrics bool
70
71
var enableHTTP2 bool
71
72
var tlsOpts []func (* tls.Config )
73
+ var gpuInfoConfig string
72
74
73
75
flag .StringVar (& metricsAddr , "metrics-bind-address" , "0" , "The address the metrics endpoint binds to. " +
74
76
"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service." )
@@ -80,6 +82,8 @@ func main() {
80
82
"If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead." )
81
83
flag .BoolVar (& enableHTTP2 , "enable-http2" , false ,
82
84
"If set, HTTP/2 will be enabled for the metrics and webhook servers" )
85
+ flag .StringVar (& gpuInfoConfig , "gpu-info-config" ,
86
+ "/etc/tensor-fusion/gpu-info.yaml" , "specify the path to gpuInfoConfig file" )
83
87
opts := zap.Options {
84
88
Development : true ,
85
89
}
@@ -107,6 +111,12 @@ func main() {
107
111
TLSOpts : tlsOpts ,
108
112
})
109
113
114
+ gpuInfos , err := config .LoadGpuInfoFromFile (gpuInfoConfig )
115
+ if err != nil {
116
+ ctrl .Log .Error (err , "unable to read gpuInfoConfig file" )
117
+ gpuInfos = make ([]config.GpuInfo , 0 )
118
+ }
119
+
110
120
// Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server.
111
121
// More info:
112
122
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/metrics/server
@@ -260,6 +270,7 @@ func main() {
260
270
Scheme : mgr .GetScheme (),
261
271
Scheduler : scheduler ,
262
272
Recorder : mgr .GetEventRecorderFor ("tensorfusionworkload" ),
273
+ GpuInfos : gpuInfos ,
263
274
}).SetupWithManager (mgr ); err != nil {
264
275
setupLog .Error (err , "unable to create controller" , "controller" , "TensorFusionWorkload" )
265
276
os .Exit (1 )
0 commit comments