Skip to content

Commit b45f6f6

Browse files
authored
fix: update the status to WorkerPending when worker selection fails (#76)
1 parent 810fa3c commit b45f6f6

File tree

2 files changed

+100
-0
lines changed

2 files changed

+100
-0
lines changed

internal/controller/tensorfusionconnection_controller.go

+5
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,11 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
8484
s, err := worker.SelectWorker(ctx, r.Client, workload, 1)
8585
if err != nil {
8686
r.Recorder.Eventf(connection, corev1.EventTypeWarning, "WorkerSelectionFailed", "Failed to select worker: %v", err)
87+
// Set the phase to WorkerPending and persist it before returning the selection error
88+
connection.Status.Phase = tfv1.WorkerPending
89+
if updateErr := r.Status().Update(ctx, connection); updateErr != nil {
90+
return ctrl.Result{}, fmt.Errorf("failed to select worker: %w, failed to update status: %v", err, updateErr)
91+
}
8792
return ctrl.Result{}, err
8893
}
8994
workerStatus = *s

internal/controller/tensorfusionconnection_controller_test.go

+95
Original file line numberDiff line numberDiff line change
@@ -271,5 +271,100 @@ var _ = Describe("TensorFusionConnection Controller", func() {
271271
connection.Status.ConnectionURL == "native+192.168.1.2+8081+test-worker-2-0"
272272
}, time.Second*5, time.Millisecond*100).Should(BeTrue())
273273
})
274+
275+
It("should update status to WorkerPending when worker selection fails", func() {
276+
By("Creating a TensorFusionWorkload without worker status")
277+
278+
// Create a workload with no workers (empty WorkerStatuses)
279+
failWorkloadName := "test-workload-no-workers"
280+
failWorkloadNamespacedName := types.NamespacedName{
281+
Name: failWorkloadName,
282+
Namespace: "default",
283+
}
284+
285+
failWorkload := &tfv1.TensorFusionWorkload{
286+
ObjectMeta: metav1.ObjectMeta{
287+
Name: failWorkloadName,
288+
Namespace: "default",
289+
},
290+
Spec: tfv1.TensorFusionWorkloadSpec{
291+
PoolName: "mock-empty",
292+
Resources: tfv1.Resources{
293+
Requests: tfv1.Resource{
294+
Tflops: resource.MustParse("1"),
295+
Vram: resource.MustParse("1Gi"),
296+
},
297+
Limits: tfv1.Resource{
298+
Tflops: resource.MustParse("1"),
299+
Vram: resource.MustParse("1Gi"),
300+
},
301+
},
302+
},
303+
Status: tfv1.TensorFusionWorkloadStatus{
304+
Replicas: 0,
305+
ReadyReplicas: 0,
306+
// Empty WorkerStatuses to force selection failure
307+
WorkerStatuses: []tfv1.WorkerStatus{},
308+
},
309+
}
310+
Expect(k8sClient.Create(ctx, failWorkload)).To(Succeed())
311+
// Push the workload's status (zero replicas, empty WorkerStatuses) through the status subresource
312+
Expect(k8sClient.Status().Update(ctx, failWorkload)).To(Succeed())
313+
314+
// Verify workload was created properly
315+
createdWorkload := &tfv1.TensorFusionWorkload{}
316+
Eventually(func() bool {
317+
if err := k8sClient.Get(ctx, failWorkloadNamespacedName, createdWorkload); err != nil {
318+
return false
319+
}
320+
return len(createdWorkload.Status.WorkerStatuses) == 0
321+
}, time.Second*5, time.Millisecond*100).Should(BeTrue())
322+
323+
By("Creating a connection to the workload with no workers")
324+
failConnectionName := "test-connection-fail"
325+
failConnectionNamespacedName := types.NamespacedName{
326+
Name: failConnectionName,
327+
Namespace: "default",
328+
}
329+
330+
failConnection := &tfv1.TensorFusionConnection{
331+
ObjectMeta: metav1.ObjectMeta{
332+
Name: failConnectionName,
333+
Namespace: "default",
334+
Labels: map[string]string{
335+
constants.WorkloadKey: failWorkloadName,
336+
},
337+
},
338+
Spec: tfv1.TensorFusionConnectionSpec{
339+
WorkloadName: failWorkloadName,
340+
},
341+
}
342+
Expect(k8sClient.Create(ctx, failConnection)).To(Succeed())
343+
344+
By("Reconciling the connection to trigger worker selection failure")
345+
controllerReconciler := &TensorFusionConnectionReconciler{
346+
Client: k8sClient,
347+
Scheme: k8sClient.Scheme(),
348+
Recorder: record.NewFakeRecorder(10),
349+
}
350+
351+
_, err := controllerReconciler.Reconcile(ctx, reconcile.Request{
352+
NamespacedName: failConnectionNamespacedName,
353+
})
354+
// We expect an error since worker selection should fail
355+
Expect(err).To(HaveOccurred())
356+
357+
By("Verifying the connection status is updated to WorkerPending")
358+
Eventually(func() bool {
359+
if err := k8sClient.Get(ctx, failConnectionNamespacedName, failConnection); err != nil {
360+
return false
361+
}
362+
return failConnection.Status.Phase == tfv1.WorkerPending
363+
}, time.Second*5, time.Millisecond*100).Should(BeTrue())
364+
365+
By("Cleaning up test resources")
366+
Expect(k8sClient.Delete(ctx, failConnection)).To(Succeed())
367+
Expect(k8sClient.Delete(ctx, failWorkload)).To(Succeed())
368+
})
274369
})
275370
})

0 commit comments

Comments
 (0)