@@ -38,11 +38,9 @@ import (
38
38
"k8s.io/client-go/util/retry"
39
39
"k8s.io/utils/ptr"
40
40
ctrl "sigs.k8s.io/controller-runtime"
41
- "sigs.k8s.io/controller-runtime/pkg/builder"
42
41
"sigs.k8s.io/controller-runtime/pkg/client"
43
42
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
44
43
"sigs.k8s.io/controller-runtime/pkg/log"
45
- "sigs.k8s.io/controller-runtime/pkg/predicate"
46
44
)
47
45
48
46
// GPUNodeReconciler reconciles a GPUNode object
@@ -74,7 +72,6 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
74
72
}
75
73
76
74
deleted , err := utils .HandleFinalizer (ctx , node , r .Client , func (ctx context.Context , node * tfv1.GPUNode ) (bool , error ) {
77
-
78
75
if node .Status .Phase != tfv1 .TensorFusionGPUNodePhaseDestroying {
79
76
node .Status .Phase = tfv1 .TensorFusionGPUNodePhaseDestroying
80
77
if err := r .Status ().Update (ctx , node ); err != nil {
@@ -136,13 +133,39 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
136
133
break
137
134
}
138
135
}
136
+ if poolName == "" {
137
+ log .Error (nil , "failed to get pool name" , "node" , node .Name )
138
+ return ctrl.Result {}, nil
139
+ }
139
140
140
141
poolObj := & tfv1.GPUPool {}
141
142
err = r .Client .Get (ctx , client.ObjectKey {Name : poolName }, poolObj )
142
143
if err != nil {
143
144
return ctrl.Result {}, fmt .Errorf ("failed to get tensor-fusion pool, can not create node discovery job, pool: %s" , poolName )
144
145
}
145
146
147
+ if node .Spec .ManageMode != tfv1 .GPUNodeManageModeProvisioned {
148
+ // Check if the Kubernetes node exists; if not, the GPUNode should delete itself.
149
+ if node .Status .KubernetesNodeName != "" {
150
+ // Try to get the Kubernetes node
151
+ coreNode := & corev1.Node {}
152
+ err := r .Get (ctx , client.ObjectKey {Name : node .Status .KubernetesNodeName }, coreNode )
153
+ if err != nil {
154
+ if errors .IsNotFound (err ) {
155
+ // The Kubernetes node does not exist, delete the GPUNode
156
+ log .Info ("Kubernetes node does not exist, deleting GPUNode" ,
157
+ "kubernetesNodeName" , node .Status .KubernetesNodeName )
158
+ if err := r .Delete (ctx , node ); err != nil {
159
+ return ctrl.Result {}, fmt .Errorf ("failed to delete GPUNode after Kubernetes node was deleted: %w" , err )
160
+ }
161
+ // Return early since we've deleted the resource
162
+ return ctrl.Result {}, nil
163
+ }
164
+ return ctrl.Result {}, fmt .Errorf ("failed to get Kubernetes node %s: %w" ,
165
+ node .Status .KubernetesNodeName , err )
166
+ }
167
+ }
168
+ }
146
169
if err := r .reconcileCloudVendorNode (ctx , node , poolObj ); err != nil {
147
170
return ctrl.Result {}, err
148
171
}
@@ -151,10 +174,6 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
151
174
if node .Status .KubernetesNodeName == "" {
152
175
return ctrl.Result {RequeueAfter : 10 * time .Second }, nil
153
176
}
154
- if poolName == "" {
155
- log .Error (nil , "failed to get pool name" , "node" , node .Name )
156
- return ctrl.Result {}, nil
157
- }
158
177
159
178
if err := r .reconcileNodeDiscoveryJob (ctx , node , poolObj ); err != nil {
160
179
return ctrl.Result {}, err
@@ -415,7 +434,6 @@ func (r *GPUNodeReconciler) reconcileHypervisorPod(ctx context.Context, node *tf
415
434
}
416
435
417
436
func (r * GPUNodeReconciler ) reconcileCloudVendorNode (ctx context.Context , node * tfv1.GPUNode , pool * tfv1.GPUPool ) error {
418
-
419
437
// Avoid creating duplicate cloud vendor nodes; if not working, keep pending status
420
438
if node .Status .NodeInfo .InstanceID != "" {
421
439
// node already created, check status
@@ -532,7 +550,7 @@ func (r *GPUNodeReconciler) CalculateVirtualCapacity(node *tfv1.GPUNode, pool *t
532
550
// SetupWithManager sets up the controller with the Manager.
533
551
func (r * GPUNodeReconciler ) SetupWithManager (mgr ctrl.Manager ) error {
534
552
return ctrl .NewControllerManagedBy (mgr ).
535
- For (& tfv1.GPUNode {}, builder . WithPredicates (predicate. GenerationChangedPredicate {}) ).
553
+ For (& tfv1.GPUNode {}).
536
554
Named ("gpunode" ).
537
555
Owns (& corev1.Node {}).
538
556
Owns (& batchv1.Job {}).
0 commit comments