@@ -164,8 +164,12 @@ func (r *TensorFusionWorkloadReconciler) Reconcile(ctx context.Context, req ctrl

 		// Calculate how many pods need to be added
 		podsToAdd := int(desiredReplicas - currentReplicas)
-		if err := r.scaleUpWorkers(ctx, workerGenerator, workload, podsToAdd); err != nil {
-			return ctrl.Result{}, err
+		result, err := r.scaleUpWorkers(ctx, workerGenerator, workload, podsToAdd)
+		if err != nil {
+			return ctrl.Result{}, fmt.Errorf("scale up workers: %w", err)
+		}
+		if !result.IsZero() {
+			return result, nil
 		}
 	} else if currentReplicas > desiredReplicas {
 		log.Info("Scaling down workers", "from", currentReplicas, "to", desiredReplicas)
@@ -306,7 +310,7 @@ func (r *TensorFusionWorkloadReconciler) deletePod(ctx context.Context, pod *cor
 }

 // scaleUpWorkers handles the scaling up of worker pods
-func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, workerGenerator *worker.WorkerGenerator, workload *tfv1.TensorFusionWorkload, count int) error {
+func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, workerGenerator *worker.WorkerGenerator, workload *tfv1.TensorFusionWorkload, count int) (ctrl.Result, error) {
 	log := log.FromContext(ctx)

 	// Create worker pods
@@ -315,7 +319,7 @@ func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, wor
 		gpu, err := r.Scheduler.Schedule(ctx, workload.Spec.PoolName, workload.Spec.Resources.Requests)
 		if err != nil {
 			r.Recorder.Eventf(workload, corev1.EventTypeWarning, "ScheduleGPUFailed", "Failed to schedule GPU: %v", err)
-			return fmt.Errorf("schedule GPU: %w", err)
+			return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil
 		}

 		pod, err := r.tryStartWorker(ctx, workerGenerator, gpu, workload)
@@ -325,7 +329,7 @@ func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, wor
 			if releaseErr != nil {
 				log.Error(releaseErr, "Failed to release GPU after pod creation failure")
 			}
-			return fmt.Errorf("create worker pod: %w", err)
+			return ctrl.Result{}, fmt.Errorf("create worker pod: %w", err)
 		}

 		labels := prometheus.Labels{
@@ -339,7 +343,7 @@ func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, wor
 		metrics.VramBytesLimit.With(labels).Set(workload.Spec.Resources.Limits.Vram.AsApproximateFloat64())
 	}

-	return nil
+	return ctrl.Result{}, nil
 }

 // updateStatus updates the WorkerStatuses and readyReplicas field in the workload status
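The change above swaps an error return for a requeue: when GPU scheduling fails transiently, scaleUpWorkers now returns a non-zero ctrl.Result instead of an error, so the reconciler retries after a fixed delay rather than entering controller-runtime's exponential error backoff. A minimal standalone sketch of that pattern follows, assuming a 3-second PendingRequeueDuration (the real value lives in the repo's constants package) and a hypothetical scheduleOne helper standing in for r.Scheduler.Schedule; it is illustrative, not the project's actual code.

package main

import (
	"fmt"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

// Assumed value for illustration; the diff references constants.PendingRequeueDuration.
const PendingRequeueDuration = 3 * time.Second

// scheduleOne is a hypothetical stand-in for r.Scheduler.Schedule: it can
// fail transiently when no GPU currently has free capacity.
func scheduleOne() error { return fmt.Errorf("no GPU with free capacity") }

// scaleUpSketch mirrors the new scaleUpWorkers signature: a transient
// scheduling failure becomes a requeue (nil error), while a hard failure
// would surface as an error and trigger backoff.
func scaleUpSketch(count int) (ctrl.Result, error) {
	for i := 0; i < count; i++ {
		if err := scheduleOne(); err != nil {
			// Transient condition: ask controller-runtime to retry later.
			return ctrl.Result{RequeueAfter: PendingRequeueDuration}, nil
		}
	}
	return ctrl.Result{}, nil
}

func main() {
	// Caller side, as in the Reconcile hunk: propagate a non-zero Result
	// unchanged so the work item is requeued instead of treated as failed.
	result, err := scaleUpSketch(2)
	if err != nil {
		fmt.Println("scale up workers:", err)
		return
	}
	if !result.IsZero() {
		fmt.Println("requeue after", result.RequeueAfter)
	}
}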