Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 553b223

Browse files
committed Feb 12, 2025 ·
fix: node discovery bug
1 parent 6b7c60e commit 553b223

File tree

7 files changed

+44
-4
lines changed

7 files changed

+44
-4
lines changed
 

‎charts/tensor-fusion/Chart.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.1.0
18+
version: 1.1.1
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.12.0"
24+
appVersion: "1.12.1"

‎charts/tensor-fusion/templates/gpu-public-gpu-info.yaml

+16-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
apiVersion: v1
22
kind: ConfigMap
33
metadata:
4-
name: {{ include "tensor-fusion.fullname" . }}-public-gpu-info
4+
name: {{ .Release.Name }}-public-gpu-info
55
namespace: {{ include "tensor-fusion.namespace" . }}
66
labels:
77
{{- include "tensor-fusion.labels" . | nindent 4 }}
@@ -22,81 +22,96 @@ data:
2222
2323
# Turing Architecture Series
2424
- model: T4
25+
fullModelName: "Tesla T4"
2526
vendor: NVIDIA
2627
costPerHour: 0.53
2728
fp16TFlops: 65
2829
2930
# Ampere Architecture Series
3031
- model: A100_SXM4
32+
fullModelName: "A100 SXM4"
3133
vendor: NVIDIA
3234
costPerHour: 1.89
3335
fp16TFlops: 312
3436
3537
- model: A100_PCIe
38+
fullModelName: "A100 PCIe"
3639
vendor: NVIDIA
3740
costPerHour: 1.64
3841
fp16TFlops: 312
3942
4043
- model: A10
44+
fullModelName: "A10"
4145
vendor: NVIDIA
4246
costPerHour: 0.9
4347
fp16TFlops: 125
4448
4549
# A10G has fewer CUDA cores than A10, but has RT cores for rendering use cases
4650
- model: A10G
51+
fullModelName: "A10G"
4752
vendor: NVIDIA
4853
costPerHour: 0.75 # from lambda labs
4954
fp16TFlops: 125
5055
5156
- model: A40
57+
fullModelName: "A40"
5258
vendor: NVIDIA
5359
costPerHour: 0.44
5460
fp16TFlops: 125
5561
5662
- model: RTX3090
63+
fullModelName: "RTX3090"
5764
vendor: NVIDIA
5865
costPerHour: 0.43
5966
fp16TFlops: 143
6067
6168
# Ada Lovelace Architecture Series
6269
- model: L4
70+
fullModelName: "L4"
6371
vendor: NVIDIA
6472
costPerHour: 0.43
6573
fp16TFlops: 121
6674
6775
- model: L40
76+
fullModelName: "L40"
6877
vendor: NVIDIA
6978
costPerHour: 0.86 # should be a bit cheaper than L40s
7079
fp16TFlops: 362
7180
7281
- model: L40s
82+
fullModelName: "L40s"
7383
vendor: NVIDIA
7484
costPerHour: 0.86
7585
fp16TFlops: 362
7686
7787
- model: RTX4090
88+
fullModelName: "RTX4090"
7889
vendor: NVIDIA
7990
costPerHour: 0.69
8091
fp16TFlops: 330
8192
8293
# Hopper Architecture Series
8394
- model: H100_SXM4
95+
fullModelName: "H100 SXM4"
8496
vendor: NVIDIA
8597
costPerHour: 2.99
8698
fp16TFlops: 989
8799
88100
- model: H100_PCIe
101+
fullModelName: "H100 PCIe"
89102
vendor: NVIDIA
90103
costPerHour: 2.39
91104
fp16TFlops: 835
92105
93106
# Blackwell Architecture Series
94107
- model: B200_SXM4
108+
fullModelName: "B200 SXM4"
95109
vendor: NVIDIA
96110
costPerHour: 10.99 # price unknown, available on request
97111
fp16TFlops: 2250
98112
99113
- model: RTX5090
114+
fullModelName: "RTX5090"
100115
vendor: NVIDIA
101116
costPerHour: 2.99
102117
fp16TFlops: 838

‎charts/tensor-fusion/templates/rbac.yaml

+11
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,21 @@ rules:
6767
- batch
6868
resources:
6969
- cronjobs
70+
verbs:
71+
- get
72+
- list
73+
- watch
74+
- apiGroups:
75+
- batch
76+
resources:
7077
- jobs
7178
verbs:
79+
- create
80+
- delete
7281
- get
7382
- list
83+
- patch
84+
- update
7485
- watch
7586
- apiGroups:
7687
- tensor-fusion.ai

‎config/rbac/role.yaml

+11
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,21 @@ rules:
6868
- batch
6969
resources:
7070
- cronjobs
71+
verbs:
72+
- get
73+
- list
74+
- watch
75+
- apiGroups:
76+
- batch
77+
resources:
7178
- jobs
7279
verbs:
80+
- create
81+
- delete
7382
- get
7483
- list
84+
- patch
85+
- update
7586
- watch
7687
- apiGroups:
7788
- tensor-fusion.ai

‎internal/controller/gpupool_controller.go

+1
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ func (r *GPUPoolReconciler) startNodeDiscovery(
202202
if err != nil {
203203
return fmt.Errorf("unmarshal pod template: %w", err)
204204
}
205+
205206
selector := labels.NewSelector()
206207
poolReq, err := labels.NewRequirement(fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, pool.Name), selection.DoubleEquals, []string{"true"})
207208
if err != nil {

‎internal/controller/node_controller.go

+2
Original file line numberDiff line numberDiff line change
@@ -128,4 +128,6 @@ func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
128128
For(&corev1.Node{}, builder.WithPredicates(p)).
129129
Named("node").
130130
Complete(r)
131+
// When the Pool changes, all nodes should be regenerated and the ones that no longer match deleted
132+
//
131133
}

‎internal/controller/tensorfusioncluster_controller.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ type TensorFusionClusterReconciler struct {
5151
// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionclusters/finalizers,verbs=update
5252
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
5353
// +kubebuilder:rbac:groups=apps,resources=deployments;namespaces;configmaps;secrets,verbs=get;list;watch;create;update;patch;delete
54-
// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch
54+
// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete
5555
// +kubebuilder:rbac:groups=batch,resources=cronjobs,verbs=get;list;watch
5656
// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch
5757
// +kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch;create;update;patch;delete

0 commit comments

Comments
 (0)
Please sign in to comment.