Skip to content

Commit 881c029

Browse files
authored
Merge pull request #55 from artem-zinnatullin/az/delete-hanging-jobs
Delete hanging jobs (DeadlineExceeded, Active == 1 but actually completed)
2 parents 33cf7c3 + 2e8df24 commit 881c029

File tree

3 files changed

+76
-14
lines changed

3 files changed

+76
-14
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@ bin/
1111

1212
# Output of the go coverage tool, specifically when used with LiteIDE
1313
*.out
14+
15+
# Coverage report generated by `make test`.
16+
coverage.txt

pkg/controller/job.go

+17-6
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,6 @@ import (
88
)
99

1010
func shouldDeleteJob(job *batchv1.Job, deleteSuccessfulAfter, deleteFailedAfter time.Duration, ignoreCronJobs bool) bool {
11-
// skip the job if it has any active pods
12-
if job.Status.Active > 0 {
13-
return false
14-
}
15-
1611
if ignoreCronJobs {
1712
owners := getJobOwnerKinds(job)
1813
if isOwnedByCronJob(owners) {
@@ -33,7 +28,7 @@ func shouldDeleteJob(job *batchv1.Job, deleteSuccessfulAfter, deleteFailedAfter
3328
return true
3429
}
3530
}
36-
if job.Status.Failed > 0 {
31+
if isFailed(job) {
3732
if deleteFailedAfter > 0 && timeSinceFinish >= deleteFailedAfter {
3833
return true
3934
}
@@ -65,6 +60,22 @@ func jobFinishTime(jobObj *batchv1.Job) time.Time {
6560
return time.Time{}
6661
}
6762

63+
func isFailed(jobObj *batchv1.Job) bool {
64+
if jobObj.Status.Failed > 0 {
65+
return true
66+
}
67+
68+
// In case when Job fails due to the Deadline set on it (DeadlineExceeded), the job.Status.Failed does not contain a value
69+
// Thus we have to iterate over job.Status.Conditions and look for a JobFailed in state "true".
70+
for _, jc := range jobObj.Status.Conditions {
71+
if jc.Type == batchv1.JobFailed && jc.Status == corev1.ConditionTrue {
72+
return true
73+
}
74+
}
75+
76+
return false
77+
}
78+
6879
// isOwnedByCronJob returns true if and only if job has a single owner CronJob
6980
// and this owners kind is CronJob
7081
func isOwnedByCronJob(ownerKinds []string) bool {

pkg/controller/job_test.go

+56-8
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@ import (
55
"time"
66

77
batchv1 "k8s.io/api/batch/v1"
8+
corev1 "k8s.io/api/core/v1"
89
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
910
)
1011

11-
func createJob(ownedByCronJob bool, completed time.Time, active, succeeded, failed int32) *batchv1.Job {
12+
func createJob(ownedByCronJob bool, completed time.Time, active, succeeded, failed int32, conditions []batchv1.JobCondition) *batchv1.Job {
1213
ts := metav1.NewTime(completed)
1314
job := batchv1.Job{
1415
Spec: batchv1.JobSpec{},
@@ -17,6 +18,7 @@ func createJob(ownedByCronJob bool, completed time.Time, active, succeeded, fail
1718
Active: active,
1819
Succeeded: succeeded,
1920
Failed: failed,
21+
Conditions: conditions,
2022
},
2123
}
2224
if ownedByCronJob {
@@ -39,54 +41,100 @@ func TestKleaner_DeleteJob(t *testing.T) {
3941
expected bool
4042
}{
4143
"jobs owned by cronjobs should be ignored": {
42-
jobSpec: createJob(true, ts.Add(-time.Minute), 0, 0, 0),
44+
jobSpec: createJob(true, ts.Add(-time.Minute), 0, 0, 0, []batchv1.JobCondition{}),
4345
successful: time.Second,
4446
failed: time.Second,
4547
ignoreCron: true,
4648
expected: false,
4749
},
4850
"jobs owned by cronjobs should be deleted": {
49-
jobSpec: createJob(true, ts.Add(-time.Minute), 0, 0, 0),
51+
jobSpec: createJob(true, ts.Add(-time.Minute), 0, 0, 0, []batchv1.JobCondition{}),
5052
successful: time.Second,
5153
failed: time.Second,
5254
ignoreCron: false,
5355
expected: false,
5456
},
5557
"jobs with active pods should not be deleted": {
56-
jobSpec: createJob(false, ts.Add(-time.Minute), 1, 0, 0), // job.Status.Active > 0
58+
jobSpec: createJob(false, ts.Add(-time.Minute), 1, 0, 0, []batchv1.JobCondition{}), // job.Status.Active > 0
5759
successful: time.Second,
5860
failed: time.Second,
5961
ignoreCron: false,
6062
expected: false,
6163
},
6264
"expired successful jobs should be deleted": {
63-
jobSpec: createJob(false, ts.Add(-time.Minute), 0, 1, 0),
65+
jobSpec: createJob(false, ts.Add(-time.Minute), 0, 1, 0, []batchv1.JobCondition{}),
6466
successful: time.Second,
6567
failed: time.Second,
6668
ignoreCron: false,
6769
expected: true,
6870
},
6971
"non-expired successful jobs should not be deleted": {
70-
jobSpec: createJob(false, ts.Add(-time.Minute), 0, 1, 0),
72+
jobSpec: createJob(false, ts.Add(-time.Minute), 0, 1, 0, []batchv1.JobCondition{}),
7173
successful: time.Minute * 2,
7274
failed: time.Second,
7375
ignoreCron: false,
7476
expected: false,
7577
},
7678
"expired failed jobs should be deleted": {
77-
jobSpec: createJob(false, ts.Add(-time.Minute), 0, 0, 1),
79+
jobSpec: createJob(false, ts.Add(-time.Minute), 0, 0, 1, []batchv1.JobCondition{}),
7880
successful: time.Second,
7981
failed: time.Second,
8082
ignoreCron: false,
8183
expected: true,
8284
},
8385
"non-expired failed jobs should not be deleted": {
84-
jobSpec: createJob(false, ts.Add(-time.Minute), 0, 0, 1),
86+
jobSpec: createJob(false, ts.Add(-time.Minute), 0, 0, 1, []batchv1.JobCondition{}),
8587
successful: time.Second,
8688
failed: time.Minute * 2,
8789
ignoreCron: false,
8890
expected: false,
8991
},
92+
"failed (based on JobCondition) but not marked as failed jobs should be deleted": {
93+
jobSpec: createJob(false, time.Time{}, 0, 0, 0, []batchv1.JobCondition{
94+
batchv1.JobCondition{
95+
Type: batchv1.JobFailed,
96+
Status: corev1.ConditionTrue,
97+
LastProbeTime: metav1.NewTime(ts),
98+
LastTransitionTime: metav1.NewTime(ts.Add(-time.Minute)),
99+
Reason: "DeadlineExceeded",
100+
Message: "Job was active longer than specified deadline",
101+
},
102+
}),
103+
successful: time.Second,
104+
failed: time.Second,
105+
ignoreCron: false,
106+
expected: true,
107+
},
108+
"successful but 'active' jobs should be deleted": {
109+
jobSpec: createJob(false, ts.Add(-time.Minute), 1, 1, 0, []batchv1.JobCondition{}),
110+
successful: time.Second,
111+
failed: time.Second,
112+
ignoreCron: false,
113+
expected: true,
114+
},
115+
"failed but 'active' jobs should be deleted": {
116+
jobSpec: createJob(false, ts.Add(-time.Minute), 1, 0, 1, []batchv1.JobCondition{}),
117+
successful: time.Second,
118+
failed: time.Second,
119+
ignoreCron: false,
120+
expected: true,
121+
},
122+
"failed (based on JobCondition) but 'active' jobs should be deleted": {
123+
jobSpec: createJob(false, ts.Add(-time.Minute), 1, 0, 0, []batchv1.JobCondition{
124+
batchv1.JobCondition{
125+
Type: batchv1.JobFailed,
126+
Status: corev1.ConditionTrue,
127+
LastProbeTime: metav1.NewTime(ts),
128+
LastTransitionTime: metav1.NewTime(ts.Add(-time.Minute)),
129+
Reason: "DeadlineExceeded",
130+
Message: "Job was active longer than specified deadline",
131+
},
132+
}),
133+
successful: time.Second,
134+
failed: time.Second,
135+
ignoreCron: false,
136+
expected: true,
137+
},
90138
}
91139
for name, tc := range testCases {
92140
t.Run(name, func(t *testing.T) {

0 commit comments

Comments
 (0)