Skip to content

Commit 25c74ca

Browse files
authored
fix: delete GPUNode when Kubernetes node no longer exists (#81)
* fix: delete GPUNode when Kubernetes node no longer exists
* fix: test
1 parent 4d61ca1 commit 25c74ca

File tree

8 files changed

+47
-17
lines changed

8 files changed

+47
-17
lines changed

api/v1/gpupool_types.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,7 @@ type GPUPoolStatus struct {
363363

364364
TotalNodes int32 `json:"totalNodes,omitempty"`
365365
TotalGPUs int32 `json:"totalGPUs,omitempty"`
366-
ReadyNodes int32 `json:"readyNodes,omitempty"`
366+
ReadyNodes int32 `json:"readyNodes"`
367367
NotReadyNodes int32 `json:"notReadyNodes"`
368368

369369
TotalTFlops resource.Quantity `json:"totalTFlops"`

charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -942,6 +942,7 @@ spec:
942942
- componentStatus
943943
- notReadyNodes
944944
- phase
945+
- readyNodes
945946
- totalTFlops
946947
- totalVRAM
947948
- virtualTFlops

config/crd/bases/tensor-fusion.ai_gpupools.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -942,6 +942,7 @@ spec:
942942
- componentStatus
943943
- notReadyNodes
944944
- phase
945+
- readyNodes
945946
- totalTFlops
946947
- totalVRAM
947948
- virtualTFlops

internal/constants/constants.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ const (
5050
WorkerCudaMemLimitEnv = "TENSOR_FUSION_CUDA_MEM_LIMIT"
5151
WorkerPodNameEnv = "POD_NAME"
5252
NamespaceEnv = "OPERATOR_NAMESPACE"
53-
NamespaceDefaultVal = "tensor-fusion"
53+
NamespaceDefaultVal = "tensor-fusion-sys"
5454
)
5555

5656
const (

internal/controller/gpunode_controller.go

+27-9
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,9 @@ import (
3838
"k8s.io/client-go/util/retry"
3939
"k8s.io/utils/ptr"
4040
ctrl "sigs.k8s.io/controller-runtime"
41-
"sigs.k8s.io/controller-runtime/pkg/builder"
4241
"sigs.k8s.io/controller-runtime/pkg/client"
4342
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
4443
"sigs.k8s.io/controller-runtime/pkg/log"
45-
"sigs.k8s.io/controller-runtime/pkg/predicate"
4644
)
4745

4846
// GPUNodeReconciler reconciles a GPUNode object
@@ -74,7 +72,6 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
7472
}
7573

7674
deleted, err := utils.HandleFinalizer(ctx, node, r.Client, func(ctx context.Context, node *tfv1.GPUNode) (bool, error) {
77-
7875
if node.Status.Phase != tfv1.TensorFusionGPUNodePhaseDestroying {
7976
node.Status.Phase = tfv1.TensorFusionGPUNodePhaseDestroying
8077
if err := r.Status().Update(ctx, node); err != nil {
@@ -136,13 +133,39 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
136133
break
137134
}
138135
}
136+
if poolName == "" {
137+
log.Error(nil, "failed to get pool name", "node", node.Name)
138+
return ctrl.Result{}, nil
139+
}
139140

140141
poolObj := &tfv1.GPUPool{}
141142
err = r.Client.Get(ctx, client.ObjectKey{Name: poolName}, poolObj)
142143
if err != nil {
143144
return ctrl.Result{}, fmt.Errorf("failed to get tensor-fusion pool, can not create node discovery job, pool: %s", poolName)
144145
}
145146

147+
if node.Spec.ManageMode != tfv1.GPUNodeManageModeProvisioned {
148+
// Check if the Kubernetes node exists; if not, the GPUNode should delete itself.
149+
if node.Status.KubernetesNodeName != "" {
150+
// Try to get the Kubernetes node
151+
coreNode := &corev1.Node{}
152+
err := r.Get(ctx, client.ObjectKey{Name: node.Status.KubernetesNodeName}, coreNode)
153+
if err != nil {
154+
if errors.IsNotFound(err) {
155+
// The Kubernetes node does not exist, delete the GPUNode
156+
log.Info("Kubernetes node does not exist, deleting GPUNode",
157+
"kubernetesNodeName", node.Status.KubernetesNodeName)
158+
if err := r.Delete(ctx, node); err != nil {
159+
return ctrl.Result{}, fmt.Errorf("failed to delete GPUNode after Kubernetes node was deleted: %w", err)
160+
}
161+
// Return early since we've deleted the resource
162+
return ctrl.Result{}, nil
163+
}
164+
return ctrl.Result{}, fmt.Errorf("failed to get Kubernetes node %s: %w",
165+
node.Status.KubernetesNodeName, err)
166+
}
167+
}
168+
}
146169
if err := r.reconcileCloudVendorNode(ctx, node, poolObj); err != nil {
147170
return ctrl.Result{}, err
148171
}
@@ -151,10 +174,6 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
151174
if node.Status.KubernetesNodeName == "" {
152175
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
153176
}
154-
if poolName == "" {
155-
log.Error(nil, "failed to get pool name", "node", node.Name)
156-
return ctrl.Result{}, nil
157-
}
158177

159178
if err := r.reconcileNodeDiscoveryJob(ctx, node, poolObj); err != nil {
160179
return ctrl.Result{}, err
@@ -415,7 +434,6 @@ func (r *GPUNodeReconciler) reconcileHypervisorPod(ctx context.Context, node *tf
415434
}
416435

417436
func (r *GPUNodeReconciler) reconcileCloudVendorNode(ctx context.Context, node *tfv1.GPUNode, pool *tfv1.GPUPool) error {
418-
419437
// Avoid creating duplicated cloud vendor nodes, if not working, keep pending status
420438
if node.Status.NodeInfo.InstanceID != "" {
421439
// node already created, check status
@@ -532,7 +550,7 @@ func (r *GPUNodeReconciler) CalculateVirtualCapacity(node *tfv1.GPUNode, pool *t
532550
// SetupWithManager sets up the controller with the Manager.
533551
func (r *GPUNodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
534552
return ctrl.NewControllerManagedBy(mgr).
535-
For(&tfv1.GPUNode{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
553+
For(&tfv1.GPUNode{}).
536554
Named("gpunode").
537555
Owns(&corev1.Node{}).
538556
Owns(&batchv1.Job{}).

internal/controller/gpunode_controller_test.go

+13
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,14 @@ var _ = Describe("GPUNode Controller", func() {
6363
resource.Status.Phase = tfv1.TensorFusionGPUNodePhaseRunning
6464
Expect(k8sClient.Status().Update(ctx, resource)).To(Succeed())
6565
}
66+
By("creating the core node")
67+
coreNode := &corev1.Node{
68+
ObjectMeta: metav1.ObjectMeta{
69+
Name: resourceName,
70+
},
71+
Spec: corev1.NodeSpec{},
72+
}
73+
Expect(k8sClient.Create(ctx, coreNode)).To(Succeed())
6674
})
6775

6876
AfterEach(func() {
@@ -72,6 +80,11 @@ var _ = Describe("GPUNode Controller", func() {
7280

7381
By("Cleanup the specific resource instance GPUNode")
7482
Expect(k8sClient.Delete(ctx, resource)).To(Succeed())
83+
By("Cleanup the core node")
84+
coreNode := &corev1.Node{ObjectMeta: metav1.ObjectMeta{
85+
Name: resourceName,
86+
}}
87+
Expect(k8sClient.Delete(ctx, coreNode)).To(Succeed())
7588
})
7689

7790
It("should successfully reconcile the resource", func() {

internal/controller/gpupool_controller.go

+1
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ func (r *GPUPoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
5555

5656
runNow, alreadyQueued, waitTime := utils.DebouncedReconcileCheck(ctx, &r.LastProcessedItems, req.NamespacedName)
5757
if alreadyQueued {
58+
log.Info("GPUPool already queued for reconcile", "name", req.NamespacedName.Name)
5859
return ctrl.Result{}, nil
5960
}
6061
if !runNow {

internal/controller/node_controller.go

+2-6
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,6 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
105105
return ctrl.Result{}, err
106106
}
107107
if !matched {
108-
109108
// delete gpunode if no matched pool
110109
if err := r.Client.Delete(ctx, &tfv1.GPUNode{
111110
ObjectMeta: metav1.ObjectMeta{
@@ -122,10 +121,6 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
122121
if err := r.Client.Get(ctx, client.ObjectKey{Name: node.Name}, gpuNode); err != nil {
123122
if errors.IsNotFound(err) {
124123
gpuNode = r.generateGPUNode(node, pool)
125-
// Set owner reference to cascade delete after GPU node created
126-
if err := controllerutil.SetControllerReference(node, gpuNode, r.Scheme); err != nil {
127-
return ctrl.Result{}, fmt.Errorf("failed to set controller reference: %w", err)
128-
}
129124
_, e := controllerutil.CreateOrUpdate(ctx, r.Client, gpuNode, func() error { return nil })
130125
if e != nil {
131126
return ctrl.Result{}, fmt.Errorf("failed to create or patch GPUNode: %w", e)
@@ -159,6 +154,7 @@ func (r *NodeReconciler) generateGPUNode(node *corev1.Node, pool *tfv1.GPUPool)
159154
ObjectMeta: metav1.ObjectMeta{
160155
Name: node.Name,
161156
Labels: map[string]string{
157+
constants.LabelKeyOwner: pool.Name,
162158
fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, pool.Name): "true",
163159
},
164160
},
@@ -167,7 +163,7 @@ func (r *NodeReconciler) generateGPUNode(node *corev1.Node, pool *tfv1.GPUPool)
167163
},
168164
}
169165

170-
_ = controllerutil.SetOwnerReference(pool, gpuNode, r.Scheme)
166+
_ = controllerutil.SetControllerReference(pool, gpuNode, r.Scheme)
171167
return gpuNode
172168
}
173169

0 commit comments

Comments (0)