@@ -24,6 +24,7 @@ import (
24
24
25
25
tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
26
26
"github.com/NexusGPU/tensor-fusion-operator/internal/constants"
27
+ "github.com/NexusGPU/tensor-fusion-operator/internal/utils"
27
28
corev1 "k8s.io/api/core/v1"
28
29
"k8s.io/apimachinery/pkg/api/errors"
29
30
"k8s.io/apimachinery/pkg/api/resource"
@@ -33,8 +34,11 @@ import (
33
34
"sigs.k8s.io/controller-runtime/pkg/builder"
34
35
"sigs.k8s.io/controller-runtime/pkg/client"
35
36
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
37
+ "sigs.k8s.io/controller-runtime/pkg/event"
38
+ "sigs.k8s.io/controller-runtime/pkg/handler"
36
39
"sigs.k8s.io/controller-runtime/pkg/log"
37
40
"sigs.k8s.io/controller-runtime/pkg/predicate"
41
+ "sigs.k8s.io/controller-runtime/pkg/reconcile"
38
42
39
43
schedulingcorev1 "k8s.io/component-helpers/scheduling/corev1"
40
44
)
@@ -95,7 +99,14 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
95
99
return ctrl.Result {}, err
96
100
}
97
101
if ! matched {
98
- log .Info ("No matched GPU pool found, skip reconcile the Node" , "node" , node .Name , "labels" , node .Labels )
102
+ // Delete the GPUNode named after this Node when no GPU pool matches it.
103
+ if err := r .Client .Delete (ctx , & tfv1.GPUNode {
104
+ ObjectMeta : metav1.ObjectMeta {
105
+ Name : node .Name ,
106
+ },
107
+ }); err != nil {
108
+ return ctrl.Result {}, fmt .Errorf ("can not delete gpuNode(%s) : %w" , node .Name , err )
109
+ }
99
110
return ctrl.Result {}, nil
100
111
}
101
112
@@ -169,11 +180,36 @@ func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
169
180
if err != nil {
170
181
return fmt .Errorf ("unable to create predicate: %w" , err )
171
182
}
183
+
172
184
return ctrl .NewControllerManagedBy (mgr ).
173
185
For (& corev1.Node {}, builder .WithPredicates (p )).
174
186
Named ("node" ).
187
+ Watches (& tfv1.GPUPool {}, handler .EnqueueRequestsFromMapFunc (func (ctx context.Context , obj client.Object ) []reconcile.Request {
188
+ nodelist := & tfv1.GPUNodeList {}
189
+ if err := mgr .GetClient ().List (ctx , nodelist , client.MatchingLabels {
190
+ selectors [0 ]: selectors [1 ],
191
+ }); err != nil {
192
+ log .FromContext (ctx ).Error (err , "failed to list GPUNode" )
193
+ return []reconcile.Request {}
194
+ }
195
+ var requests []reconcile.Request
196
+ for _ , n := range nodelist .Items {
197
+ requests = append (requests , reconcile.Request {NamespacedName : client.ObjectKey {Name : n .Name }})
198
+ }
199
+ return requests
200
+ }), builder .WithPredicates (predicate.Funcs {
201
+ UpdateFunc : func (e event.UpdateEvent ) bool {
202
+ oldObj , ok1 := e .ObjectOld .(* tfv1.GPUPool )
203
+ newObj , ok2 := e .ObjectNew .(* tfv1.GPUPool )
204
+ if ! ok1 || ! ok2 {
205
+ return false
206
+ }
207
+ oldNodeSelector := oldObj .Spec .NodeManagerConfig .NodeSelector
208
+ newNodeSelector := newObj .Spec .NodeManagerConfig .NodeSelector
209
+ return utils .GetObjectHash (oldNodeSelector ) != utils .GetObjectHash (newNodeSelector )
210
+ },
211
+ })).
175
212
Complete (r )
176
- // TODO: When Pool changed, all nodes should re-generated, delete not matched ones, this logic should be added into GPUPool controller
177
213
}
178
214
179
215
func getMatchedPoolName (node * corev1.Node , poolList []tfv1.GPUPool ) (* tfv1.GPUPool , bool , error ) {
0 commit comments