NexusGPU · Code2Life · Aug 25, 2025 · Aug 15, 2025 · Aug 15, 2025 · Aug 18, 2025
diff --git a/.gitignore b/.gitignore
@@ -39,3 +39,5 @@ __debug*
 
 vendor
 logs
+
+*.prof
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -68,6 +68,7 @@
                 "--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml",
                 "--dynamic-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
                 "--scheduler-config", "${workspaceFolder}/config/samples/scheduler-config.yaml",
+                "--enable-alert",
                 "-v", "4"
             ],
             "program": "${workspaceFolder}/cmd/main.go",

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -22,6 +22,7 @@
         "certificaterequests",
         "certmanager",
         "clientcmd",
+        "clientcmdapi",
         "clientgoscheme",
         "clientset",
         "cloudnative",

diff --git a/api/v1/gpupool_types.go b/api/v1/gpupool_types.go
@@ -389,14 +389,6 @@ type GPUPoolStatus struct {
 	// when the progress is 100, the component version or config is fully updated.
 	ComponentStatus PoolComponentStatus `json:"componentStatus"`
 
-	// TODO: calculated every 1h/1d/1w average
-	UtilizedTFlopsPercent string `json:"utilizedTFlopsPercent,omitempty"`
-	UtilizedVRAMPercent   string `json:"utilizedVRAMPercent,omitempty"`
-
-	// TODO: updated with interval
-	AllocatedTFlopsPercent string `json:"allocatedTFlopsPercent,omitempty"`
-	AllocatedVRAMPercent   string `json:"allocatedVRAMPercent,omitempty"`
-
 	// TODO: aggregated with interval
 	SavedCostsPerMonth       string `json:"savedCostsPerMonth,omitempty"`
 	PotentialSavingsPerMonth string `json:"potentialSavingsPerMonth,omitempty"`

diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.5.4
+version: 1.5.5
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to

diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml
@@ -601,10 +601,6 @@ spec:
           status:
             description: GPUPoolStatus defines the observed state of GPUPool.
             properties:
-              allocatedTFlopsPercent:
-                type: string
-              allocatedVRAMPercent:
-                type: string
               availableTFlops:
                 anyOf:
                 - type: integer
@@ -760,10 +756,6 @@ spec:
                 - type: string
                 pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                 x-kubernetes-int-or-string: true
-              utilizedTFlopsPercent:
-                type: string
-              utilizedVRAMPercent:
-                type: string
               virtualAvailableTFlops:
                 anyOf:
                 - type: integer

diff --git a/charts/tensor-fusion/templates/alert-manager.yaml b/charts/tensor-fusion/templates/alert-manager.yaml
@@ -32,25 +32,52 @@ spec:
         {{- include "tensor-fusion.labels" . | nindent 8 }}
     spec:
       enableServiceLinks: false
+      {{- if gt (.Values.alert.replicaCount | int) 1 }}
+      affinity:
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                - key: tensor-fusion.ai/component
+                  operator: In
+                  values:
+                  - alert-manager
+              topologyKey: kubernetes.io/hostname
+      {{- end }}
       volumes:
         - name: config
           configMap:
             name: {{ .Release.Name }}-alert-manager-config
             defaultMode: 420
+        {{- if not .Values.alert.persistence.enabled }}
         - name: storage
-          hostPath:
-            path: /data/alertmanager
-            type: DirectoryOrCreate
+          emptyDir: {}
+        {{- end }}
       containers:
         - name: alertmanager
           image: "{{ .Values.alert.image.repository }}:{{ .Values.alert.image.tag }}"
           args:
             - '--storage.path=/alertmanager'
             - '--config.file=/etc/alertmanager/alertmanager.yml'
+            - '--web.listen-address=0.0.0.0:9093'
+            {{- if gt (.Values.alert.replicaCount | int) 1 }}
+            - '--cluster.listen-address=0.0.0.0:9094'
+            - '--cluster.advertise-address=$(POD_IP):9094'
+            {{- range $i := until (.Values.alert.replicaCount | int) }}
+            - '--cluster.peer={{ $.Release.Name }}-alert-manager-{{ $i }}.alert-manager-headless.{{ include "tensor-fusion.namespace" $ }}.svc.cluster.local:9094'
+            {{- end }}
+            {{- end }}
           ports:
             - name: http
               containerPort: 9093
               protocol: TCP
+            {{- if gt (.Values.alert.replicaCount | int) 1 }}
+            - name: gossip
+              containerPort: 9094
+              protocol: TCP
+            {{- end }}
           env:
             - name: POD_IP
               valueFrom:
@@ -62,8 +89,13 @@ spec:
           volumeMounts:
             - name: config
               mountPath: /etc/alertmanager
+            {{- if .Values.alert.persistence.enabled }}
+            - name: alertmanager-storage
+              mountPath: /alertmanager
+            {{- else }}
             - name: storage
               mountPath: /alertmanager
+            {{- end }}
           livenessProbe:
             httpGet:
               path: /
@@ -85,6 +117,20 @@ spec:
       restartPolicy: Always
       serviceAccountName: alert-manager
   serviceName: alert-manager-headless
+  {{- if .Values.alert.persistence.enabled }}
+  volumeClaimTemplates: # claim backing the alertmanager-storage mount at /alertmanager
+  - metadata:
+      name: alertmanager-storage
+    spec:
+      accessModes:
+        - ReadWriteOnce
+      {{- if .Values.alert.persistence.storageClass }}
+      storageClassName: {{ .Values.alert.persistence.storageClass | quote }} # quoted so numeric-looking names stay strings
+      {{- end }}
+      resources:
+        requests:
+          storage: {{ .Values.alert.persistence.size | default "10Gi" | quote }} # values.yaml leaves size unset; never render an empty quantity
+  {{- end }}
   updateStrategy:
     type: RollingUpdate
     rollingUpdate:
@@ -105,6 +151,12 @@ spec:
       protocol: TCP
       port: 9093
       targetPort: http
+    {{- if gt (.Values.alert.replicaCount | int) 1 }}
+    - name: gossip
+      protocol: TCP
+      port: 9094
+      targetPort: gossip
+    {{- end }}
   selector:
     tensor-fusion.ai/component: alert-manager
   type: ClusterIP
@@ -125,6 +177,12 @@ spec:
       protocol: TCP
       port: 9093
       targetPort: http
+    {{- if gt (.Values.alert.replicaCount | int) 1 }}
+    - name: gossip
+      protocol: TCP
+      port: 9094
+      targetPort: gossip
+    {{- end }}
   selector:
     tensor-fusion.ai/component: alert-manager
   clusterIP: None

diff --git a/charts/tensor-fusion/values-production.yaml b/charts/tensor-fusion/values-production.yaml
@@ -29,4 +29,7 @@ alert:
       cpu: 200m
     limits:
       memory: 1Gi
-      cpu: 2000m
+      cpu: 2000m
+  persistence:
+    enabled: true
+    size: 5Gi
diff --git a/charts/tensor-fusion/values.schema.json b/charts/tensor-fusion/values.schema.json
@@ -430,6 +430,27 @@
             }
           }
         },
+        "persistence": {
+          "type": "object",
+          "description": "Persistent storage configuration for alerting",
+          "properties": {
+            "enabled": {
+              "type": "boolean",
+              "description": "Enable a persistent volume claim for Alertmanager data; when false an emptyDir volume is used",
+              "default": false
+            },
+            "storageClass": {
+              "type": "string",
+              "description": "StorageClass name for the persistent volume claim; leave empty to omit storageClassName from the claim",
+              "default": ""
+            },
+            "size": {
+              "type": "string",
+              "description": "Requested size of the persistent volume claim as a Kubernetes quantity, e.g. 10Gi",
+              "default": ""
+            }
+          }
+        },
         "alertManagerConfig": {
           "type": "object",
           "description": "Alertmanager configuration"

diff --git a/charts/tensor-fusion/values.yaml b/charts/tensor-fusion/values.yaml
@@ -148,6 +148,10 @@ alert:
     limits:
       memory: 1Gi
       cpu: 1500m
+  persistence:
+    enabled: false
+    # storageClass: "gp3"
+    # size: 10Gi
   alertManagerConfig:
     global: {}
     receivers:
@@ -347,7 +351,7 @@ dynamicConfig:
     - name: NodeTFlopsAllocationCritical
       query: | 
         SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY node, pool
         HAVING tflops_available < {{ .Threshold }}
@@ -362,7 +366,7 @@ dynamicConfig:
     - name: NodeTFlopsAllocationWarning
       query: | 
         SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY node, pool
         HAVING tflops_available < {{ .Threshold }}
@@ -378,7 +382,7 @@ dynamicConfig:
     - name: PoolTotalTFlopsAllocationCritical
       query: |
         SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY pool
         HAVING tflops_available < {{ .Threshold }}
@@ -393,7 +397,7 @@ dynamicConfig:
     - name: PoolTotalTFlopsAllocationWarning
       query: |
         SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY pool
         HAVING tflops_available < {{ .Threshold }}
@@ -409,7 +413,7 @@ dynamicConfig:
     - name: NodeVRAMAllocationCritical
       query: |
         SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY node, pool
         HAVING vram_available < {{ .Threshold }}
@@ -424,7 +428,7 @@ dynamicConfig:
     - name: NodeVRAMAllocationWarning
       query: |
         SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY node, pool
         HAVING vram_available < {{ .Threshold }}
@@ -440,7 +444,7 @@ dynamicConfig:
     - name: PoolVRAMAllocationWarning
       query: |
         SELECT pool, (100 - avg(allocated_vram_percent)) as vram_available
-        FROM tf_node_resources
+        FROM tf_node_metrics
         WHERE {{ .Conditions }}
         GROUP BY pool
         HAVING vram_available < {{ .Threshold }}
@@ -456,7 +460,7 @@ dynamicConfig:
     - name: EmptyGPU
       query: |
         SELECT DISTINCT node 
-        FROM tf_node_resources 
+        FROM tf_node_metrics 
         WHERE {{ .Conditions }} AND node NOT IN (
             SELECT DISTINCT node 
             FROM tf_worker_usage 

diff --git a/cmd/main.go b/cmd/main.go
@@ -94,6 +94,7 @@ var timeSeriesDB *metrics.TimeSeriesDB
 var dynamicConfigPath string
 var alertEvaluator *alert.AlertEvaluator
 var schedulerConfigPath string
+var alertEvaluatorReady chan struct{} // closed once alertEvaluator is constructed; wait on it before using alertEvaluator. NOTE(review): appears never closed if setup returns early — confirm waiters cannot block forever
 
 func init() {
 	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
@@ -203,6 +204,7 @@ func main() {
 	_ = os.Setenv(constants.KubeApiVersionMajorEnv, version.Major)
 	_ = os.Setenv(constants.KubeApiVersionMinorEnv, version.Minor)
 
+	alertEvaluatorReady = make(chan struct{})
 	setupTimeSeriesAndWatchGlobalConfigChanges(ctx, mgr)
 
 	if autoScaleCanBeEnabled && enableAutoScale {
@@ -471,7 +473,7 @@ func startScheduler(
 		gpuTopoPlugin.NewWithDeps(allocator, mgr.GetClient()),
 	)
 
-	cc, scheduler, err := sched.SetupScheduler(ctx, mgr, schedulerConfigPath, gpuResourceFitOpt, gpuTopoOpt)
+	cc, scheduler, err := sched.SetupScheduler(ctx, mgr, schedulerConfigPath, false, gpuResourceFitOpt, gpuTopoOpt)
 	if err != nil {
 		setupLog.Error(err, "unable to create tensor fusion scheduler")
 		os.Exit(1)
@@ -500,18 +502,11 @@ func setupTimeSeriesAndWatchGlobalConfigChanges(ctx context.Context, mgr manager
 			return nil
 		}
 		timeSeriesDB = setupTimeSeriesDB()
-		if timeSeriesDB != nil {
-			if err := timeSeriesDB.SetupTables(mgr.GetClient()); err != nil {
-				setupLog.Error(err, "unable to init timeseries tables")
-			} else {
-				autoScaleCanBeEnabled = true
-				alertCanBeEnabled = true
-
-				setupLog.Info("time series db setup successfully.")
-			}
-		}
-
 		alertEvaluator = alert.NewAlertEvaluator(ctx, timeSeriesDB, config.GetGlobalConfig().AlertRules, alertManagerAddr)
+		autoScaleCanBeEnabled = true
+		alertCanBeEnabled = true
+		close(alertEvaluatorReady)
+		setupLog.Info("time series db setup successfully.")
 		return nil
 	}))
 	if err != nil {
@@ -542,7 +537,7 @@ func watchAndHandleConfigChanges(ctx context.Context, mgr manager.Manager, needT
 
 		// handle alert rules update
 		go func() {
-			<-mgr.Elected()
+			<-alertEvaluatorReady
 			if alertCanBeEnabled && enableAlert {
 				err = alertEvaluator.UpdateAlertRules(globalConfig.AlertRules)
 				if err != nil {

diff --git a/cmd/sched/setup.go b/cmd/sched/setup.go
@@ -49,11 +49,17 @@ func SetupScheduler(
 	ctx context.Context,
 	mgr manager.Manager,
 	schedulerConfigPath string,
+	disableHttpEndpoint bool,
 	outOfTreeRegistryOptions ...app.Option,
 ) (*schedulerserverconfig.CompletedConfig, *scheduler.Scheduler, error) {
 	opts := options.NewOptions()
 	schedulerConfigFlag := opts.Flags.FlagSet(schedulerConfigFlagSet).Lookup(schedulerConfigFlag)
 	schedulerConfigFlag.Changed = true
+
+	if disableHttpEndpoint {
+		opts.SecureServing.BindPort = 0
+	}
+
 	cfgPath, err := preHandleConfig(schedulerConfigPath)
 	if err != nil {
 		return nil, nil, err
@@ -169,7 +175,9 @@ func RunScheduler(ctx context.Context,
 	startInformersAndWaitForSync(ctx)
 
 	go func() {
-		<-mgr.Elected()
+		if mgr != nil {
+			<-mgr.Elected()
+		}
 		logger.Info("Starting scheduling cycle")
 		sched.Run(ctx)
 		cc.EventBroadcaster.Shutdown()