Skip to content

Commit 319dd4c

Browse files
fix: prevent job termination on node lookup fail
Change IsNodeDrain() and IsNodeDrained() to fail-closed behavior. Previously, these functions returned true (drained) when Slurm node lookups failed, causing immediate pod deletion and job termination. Now returns false and propagates errors, allowing the operator to retry until it can properly verify drain status. This protects running jobs from premature termination during transient failures or node name mismatches.
1 parent 812ad79 commit 319dd4c

File tree

3 files changed

+67
-11
lines changed

3 files changed

+67
-11
lines changed

internal/controller/nodeset/nodeset_sync_test.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -864,7 +864,8 @@ func TestNodeSetReconciler_processCondemned(t *testing.T) {
864864
condemned: pods,
865865
i: 0,
866866
},
867-
wantErr: false,
867+
// Expect error when Slurm node is not found
868+
wantErr: true,
868869
wantDrain: false,
869870
wantDelete: true,
870871
}
@@ -1054,7 +1055,9 @@ func TestNodeSetReconciler_processCondemned(t *testing.T) {
10541055
}
10551056
pod := tt.args.condemned[tt.args.i]
10561057
if isDrain, err := r.slurmControl.IsNodeDrain(tt.args.ctx, tt.args.nodeset, pod); err != nil {
1057-
t.Errorf("slurmControl.IsNodeDrain() error = %v", err)
1058+
if !tt.wantDelete {
1059+
t.Errorf("slurmControl.IsNodeDrain() error = %v", err)
1060+
}
10581061
} else if isDrain != tt.wantDrain && !tt.wantDelete {
10591062
t.Errorf("slurmControl.IsNodeDrain() = %v, wantDrain %v", isDrain, tt.wantDrain)
10601063
}
@@ -1897,7 +1900,8 @@ func TestNodeSetReconciler_syncRollingUpdate(t *testing.T) {
18971900
pods: []*corev1.Pod{pod1, pod2},
18981901
hash: hash,
18991902
},
1900-
wantErr: false,
1903+
// Expect error when Slurm node is not found
1904+
wantErr: true,
19011905
}
19021906
}(),
19031907
}

internal/controller/nodeset/slurmcontrol/slurmcontrol.go

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -280,15 +280,12 @@ func (r *realSlurmControl) IsNodeDrain(ctx context.Context, nodeset *slinkyv1bet
280280
if slurmClient == nil {
281281
logger.V(2).Info("no client for nodeset, cannot do IsNodeDrain()",
282282
"pod", klog.KObj(pod))
283-
return true, nil
283+
return false, nil
284284
}
285285

286286
slurmNode := &slurmtypes.V0044Node{}
287287
key := slurmobject.ObjectKey(nodesetutils.GetNodeName(pod))
288288
if err := slurmClient.Get(ctx, key, slurmNode); err != nil {
289-
if tolerateError(err) {
290-
return true, nil
291-
}
292289
return false, err
293290
}
294291

@@ -304,15 +301,12 @@ func (r *realSlurmControl) IsNodeDrained(ctx context.Context, nodeset *slinkyv1b
304301
if slurmClient == nil {
305302
logger.V(2).Info("no client for nodeset, cannot do IsNodeDrained()",
306303
"pod", klog.KObj(pod))
307-
return true, nil
304+
return false, nil
308305
}
309306

310307
slurmNode := &slurmtypes.V0044Node{}
311308
key := slurmobject.ObjectKey(nodesetutils.GetNodeName(pod))
312309
if err := slurmClient.Get(ctx, key, slurmNode); err != nil {
313-
if tolerateError(err) {
314-
return true, nil
315-
}
316310
return false, err
317311
}
318312

internal/controller/nodeset/slurmcontrol/slurmcontrol_test.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,35 @@ func Test_realSlurmControl_IsNodeDrain(t *testing.T) {
465465
want: true,
466466
wantErr: false,
467467
},
468+
{
469+
name: "No Slurm client - fail closed",
470+
fields: fields{
471+
clientMap: clientmap.NewClientMap(),
472+
},
473+
args: args{
474+
ctx: ctx,
475+
nodeset: nodeset,
476+
pod: pod,
477+
},
478+
want: false,
479+
wantErr: false,
480+
},
481+
{
482+
name: "Node not found - fail closed",
483+
fields: func() fields {
484+
sclient := fake.NewClientBuilder().Build()
485+
return fields{
486+
clientMap: newSlurmClientMap(controller.Name, sclient),
487+
}
488+
}(),
489+
args: args{
490+
ctx: ctx,
491+
nodeset: nodeset,
492+
pod: pod,
493+
},
494+
want: false,
495+
wantErr: true,
496+
},
468497
}
469498
for _, tt := range tests {
470499
t.Run(tt.name, func(t *testing.T) {
@@ -750,6 +779,35 @@ func Test_realSlurmControl_IsNodeDrained(t *testing.T) {
750779
},
751780
want: false,
752781
},
782+
{
783+
name: "No Slurm client - fail closed",
784+
fields: fields{
785+
clientMap: clientmap.NewClientMap(),
786+
},
787+
args: args{
788+
ctx: ctx,
789+
nodeset: nodeset,
790+
pod: pod,
791+
},
792+
want: false,
793+
wantErr: false,
794+
},
795+
{
796+
name: "Node not found - fail closed",
797+
fields: func() fields {
798+
sclient := fake.NewClientBuilder().Build()
799+
return fields{
800+
clientMap: newSlurmClientMap(controller.Name, sclient),
801+
}
802+
}(),
803+
args: args{
804+
ctx: ctx,
805+
nodeset: nodeset,
806+
pod: pod,
807+
},
808+
want: false,
809+
wantErr: true,
810+
},
753811
}
754812
for _, tt := range tests {
755813
t.Run(tt.name, func(t *testing.T) {

0 commit comments

Comments
 (0)