Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,5 @@ __debug*

vendor
logs

*.prof
1 change: 1 addition & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
"--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml",
"--dynamic-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
"--scheduler-config", "${workspaceFolder}/config/samples/scheduler-config.yaml",
"--enable-alert",
"-v", "4"
],
"program": "${workspaceFolder}/cmd/main.go",
Expand Down
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"certificaterequests",
"certmanager",
"clientcmd",
"clientcmdapi",
"clientgoscheme",
"clientset",
"cloudnative",
Expand Down
8 changes: 0 additions & 8 deletions api/v1/gpupool_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -389,14 +389,6 @@ type GPUPoolStatus struct {
// when the progress is 100, the component version or config is fully updated.
ComponentStatus PoolComponentStatus `json:"componentStatus"`

// TODO: calculated every 1h/1d/1w average
UtilizedTFlopsPercent string `json:"utilizedTFlopsPercent,omitempty"`
UtilizedVRAMPercent string `json:"utilizedVRAMPercent,omitempty"`

// TODO: updated with interval
AllocatedTFlopsPercent string `json:"allocatedTFlopsPercent,omitempty"`
AllocatedVRAMPercent string `json:"allocatedVRAMPercent,omitempty"`

// TODO: aggregate these values at a regular interval
SavedCostsPerMonth string `json:"savedCostsPerMonth,omitempty"`
PotentialSavingsPerMonth string `json:"potentialSavingsPerMonth,omitempty"`
Expand Down
2 changes: 1 addition & 1 deletion charts/tensor-fusion/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.5.4
version: 1.5.5

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
8 changes: 0 additions & 8 deletions charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -601,10 +601,6 @@ spec:
status:
description: GPUPoolStatus defines the observed state of GPUPool.
properties:
allocatedTFlopsPercent:
type: string
allocatedVRAMPercent:
type: string
availableTFlops:
anyOf:
- type: integer
Expand Down Expand Up @@ -760,10 +756,6 @@ spec:
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
utilizedTFlopsPercent:
type: string
utilizedVRAMPercent:
type: string
virtualAvailableTFlops:
anyOf:
- type: integer
Expand Down
64 changes: 61 additions & 3 deletions charts/tensor-fusion/templates/alert-manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,25 +32,52 @@ spec:
{{- include "tensor-fusion.labels" . | nindent 8 }}
spec:
enableServiceLinks: false
{{- if gt (.Values.alert.replicaCount | int) 1 }}
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: tensor-fusion.ai/component
operator: In
values:
- alert-manager
topologyKey: kubernetes.io/hostname
{{- end }}
volumes:
- name: config
configMap:
name: {{ .Release.Name }}-alert-manager-config
defaultMode: 420
{{- if not .Values.alert.persistence.enabled }}
- name: storage
hostPath:
path: /data/alertmanager
type: DirectoryOrCreate
emptyDir: {}
{{- end }}
containers:
- name: alertmanager
image: "{{ .Values.alert.image.repository }}:{{ .Values.alert.image.tag }}"
args:
- '--storage.path=/alertmanager'
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--web.listen-address=0.0.0.0:9093'
{{- if gt (.Values.alert.replicaCount | int) 1 }}
- '--cluster.listen-address=0.0.0.0:9094'
- '--cluster.advertise-address=$(POD_IP):9094'
{{- range $i := until (.Values.alert.replicaCount | int) }}
- '--cluster.peer={{ $.Release.Name }}-alert-manager-{{ $i }}.alert-manager-headless.{{ include "tensor-fusion.namespace" $ }}.svc.cluster.local:9094'
{{- end }}
{{- end }}
ports:
- name: http
containerPort: 9093
protocol: TCP
{{- if gt (.Values.alert.replicaCount | int) 1 }}
- name: gossip
containerPort: 9094
protocol: TCP
{{- end }}
env:
- name: POD_IP
valueFrom:
Expand All @@ -62,8 +89,13 @@ spec:
volumeMounts:
- name: config
mountPath: /etc/alertmanager
{{- if .Values.alert.persistence.enabled }}
- name: alertmanager-storage
mountPath: /alertmanager
{{- else }}
- name: storage
mountPath: /alertmanager
{{- end }}
livenessProbe:
httpGet:
path: /
Expand All @@ -85,6 +117,20 @@ spec:
restartPolicy: Always
serviceAccountName: alert-manager
serviceName: alert-manager-headless
{{- if .Values.alert.persistence.enabled }}
volumeClaimTemplates:
- metadata:
name: alertmanager-storage
spec:
accessModes:
- ReadWriteOnce
{{- if .Values.alert.persistence.storageClass }}
storageClassName: {{ .Values.alert.persistence.storageClass }}
{{- end }}
resources:
requests:
storage: {{ .Values.alert.persistence.size }}
{{- end }}
updateStrategy:
type: RollingUpdate
rollingUpdate:
Expand All @@ -105,6 +151,12 @@ spec:
protocol: TCP
port: 9093
targetPort: http
{{- if gt (.Values.alert.replicaCount | int) 1 }}
- name: gossip
protocol: TCP
port: 9094
targetPort: gossip
{{- end }}
selector:
tensor-fusion.ai/component: alert-manager
type: ClusterIP
Expand All @@ -125,6 +177,12 @@ spec:
protocol: TCP
port: 9093
targetPort: http
{{- if gt (.Values.alert.replicaCount | int) 1 }}
- name: gossip
protocol: TCP
port: 9094
targetPort: gossip
{{- end }}
selector:
tensor-fusion.ai/component: alert-manager
clusterIP: None
Expand Down
5 changes: 4 additions & 1 deletion charts/tensor-fusion/values-production.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,7 @@ alert:
cpu: 200m
limits:
memory: 1Gi
cpu: 2000m
cpu: 2000m
persistence:
enabled: true
size: 5Gi
21 changes: 21 additions & 0 deletions charts/tensor-fusion/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,27 @@
}
}
},
"persistence": {
"type": "object",
"description": "Persistent storage configuration for alerting",
"properties": {
"enabled": {
"type": "boolean",
"description": "Enable persistent storage for alerting",
"default": false
},
"storageClass": {
"type": "string",
"description": "Storage class for persistent storage",
"default": ""
},
"size": {
"type": "string",
"description": "Size of persistent storage",
"default": ""
}
}
},
"alertManagerConfig": {
"type": "object",
"description": "Alertmanager configuration"
Expand Down
20 changes: 12 additions & 8 deletions charts/tensor-fusion/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,10 @@ alert:
limits:
memory: 1Gi
cpu: 1500m
persistence:
enabled: false
# storageClass: "gp3"
# size: 10Gi
alertManagerConfig:
global: {}
receivers:
Expand Down Expand Up @@ -347,7 +351,7 @@ dynamicConfig:
- name: NodeTFlopsAllocationCritical
query: |
SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
FROM tf_node_resources
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY node, pool
HAVING tflops_available < {{ .Threshold }}
Expand All @@ -362,7 +366,7 @@ dynamicConfig:
- name: NodeTFlopsAllocationWarning
query: |
SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
FROM tf_node_resources
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY node, pool
HAVING tflops_available < {{ .Threshold }}
Expand All @@ -378,7 +382,7 @@ dynamicConfig:
- name: PoolTotalTFlopsAllocationCritical
query: |
SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
FROM tf_node_resources
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY pool
HAVING tflops_available < {{ .Threshold }}
Expand All @@ -393,7 +397,7 @@ dynamicConfig:
- name: PoolTotalTFlopsAllocationWarning
query: |
SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
FROM tf_node_resources
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY pool
HAVING tflops_available < {{ .Threshold }}
Expand All @@ -409,7 +413,7 @@ dynamicConfig:
- name: NodeVRAMAllocationCritical
query: |
SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
FROM tf_node_resources
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY node, pool
HAVING vram_available < {{ .Threshold }}
Expand All @@ -424,7 +428,7 @@ dynamicConfig:
- name: NodeVRAMAllocationWarning
query: |
SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
FROM tf_node_resources
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY node, pool
HAVING vram_available < {{ .Threshold }}
Expand All @@ -440,7 +444,7 @@ dynamicConfig:
- name: PoolVRAMAllocationWarning
query: |
SELECT pool, (100 - avg(allocated_vram_percent)) as vram_available
FROM tf_node_resources
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY pool
HAVING vram_available < {{ .Threshold }}
Expand All @@ -456,7 +460,7 @@ dynamicConfig:
- name: EmptyGPU
query: |
SELECT DISTINCT node
FROM tf_node_resources
FROM tf_node_metrics
WHERE {{ .Conditions }} AND node NOT IN (
SELECT DISTINCT node
FROM tf_worker_usage
Expand Down
21 changes: 8 additions & 13 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ var timeSeriesDB *metrics.TimeSeriesDB
var dynamicConfigPath string
var alertEvaluator *alert.AlertEvaluator
var schedulerConfigPath string
var alertEvaluatorReady chan struct{}

func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
Expand Down Expand Up @@ -203,6 +204,7 @@ func main() {
_ = os.Setenv(constants.KubeApiVersionMajorEnv, version.Major)
_ = os.Setenv(constants.KubeApiVersionMinorEnv, version.Minor)

alertEvaluatorReady = make(chan struct{})
setupTimeSeriesAndWatchGlobalConfigChanges(ctx, mgr)

if autoScaleCanBeEnabled && enableAutoScale {
Expand Down Expand Up @@ -471,7 +473,7 @@ func startScheduler(
gpuTopoPlugin.NewWithDeps(allocator, mgr.GetClient()),
)

cc, scheduler, err := sched.SetupScheduler(ctx, mgr, schedulerConfigPath, gpuResourceFitOpt, gpuTopoOpt)
cc, scheduler, err := sched.SetupScheduler(ctx, mgr, schedulerConfigPath, false, gpuResourceFitOpt, gpuTopoOpt)
if err != nil {
setupLog.Error(err, "unable to create tensor fusion scheduler")
os.Exit(1)
Expand Down Expand Up @@ -500,18 +502,11 @@ func setupTimeSeriesAndWatchGlobalConfigChanges(ctx context.Context, mgr manager
return nil
}
timeSeriesDB = setupTimeSeriesDB()
if timeSeriesDB != nil {
if err := timeSeriesDB.SetupTables(mgr.GetClient()); err != nil {
setupLog.Error(err, "unable to init timeseries tables")
} else {
autoScaleCanBeEnabled = true
alertCanBeEnabled = true

setupLog.Info("time series db setup successfully.")
}
}

alertEvaluator = alert.NewAlertEvaluator(ctx, timeSeriesDB, config.GetGlobalConfig().AlertRules, alertManagerAddr)
autoScaleCanBeEnabled = true
alertCanBeEnabled = true
close(alertEvaluatorReady)
setupLog.Info("time series db setup successfully.")
return nil
}))
if err != nil {
Expand Down Expand Up @@ -542,7 +537,7 @@ func watchAndHandleConfigChanges(ctx context.Context, mgr manager.Manager, needT

// handle alert rules update
go func() {
<-mgr.Elected()
<-alertEvaluatorReady
if alertCanBeEnabled && enableAlert {
err = alertEvaluator.UpdateAlertRules(globalConfig.AlertRules)
if err != nil {
Expand Down
10 changes: 9 additions & 1 deletion cmd/sched/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,17 @@ func SetupScheduler(
ctx context.Context,
mgr manager.Manager,
schedulerConfigPath string,
disableHttpEndpoint bool,
outOfTreeRegistryOptions ...app.Option,
) (*schedulerserverconfig.CompletedConfig, *scheduler.Scheduler, error) {
opts := options.NewOptions()
schedulerConfigFlag := opts.Flags.FlagSet(schedulerConfigFlagSet).Lookup(schedulerConfigFlag)
schedulerConfigFlag.Changed = true

if disableHttpEndpoint {
opts.SecureServing.BindPort = 0
}

cfgPath, err := preHandleConfig(schedulerConfigPath)
if err != nil {
return nil, nil, err
Expand Down Expand Up @@ -169,7 +175,9 @@ func RunScheduler(ctx context.Context,
startInformersAndWaitForSync(ctx)

go func() {
<-mgr.Elected()
if mgr != nil {
<-mgr.Elected()
}
logger.Info("Starting scheduling cycle")
sched.Run(ctx)
cc.EventBroadcaster.Shutdown()
Expand Down
Loading