@@ -18,6 +18,7 @@ package status
18
18
import (
19
19
"context"
20
20
"fmt"
21
+ "regexp"
21
22
"strings"
22
23
23
24
"github.com/devfile/devworkspace-operator/pkg/common"
@@ -145,10 +146,15 @@ func CheckPodEvents(pod *corev1.Pod, workspaceID string, ignoredEvents []string,
145
146
if maxCount , isUnrecoverableEvent := unrecoverablePodEventReasons [ev .Reason ]; isUnrecoverableEvent {
146
147
if ! checkIfUnrecoverableEventIgnored (ev .Reason , ignoredEvents ) && getEventCount (ev ) >= maxCount {
147
148
var msg string
149
+ eventMessage := ev .Message // Original Kubelet message from the event
150
+ if ev .Reason == "FailedPostStartHook" {
151
+ eventMessage = getConcisePostStartFailureMessage (ev .Message )
152
+ }
153
+
148
154
if getEventCount (ev ) > 1 {
149
- msg = fmt .Sprintf ("Detected unrecoverable event %s %d times: %s. " , ev .Reason , getEventCount (ev ), ev . Message )
155
+ msg = fmt .Sprintf ("Detected unrecoverable event %s %d times: %s" , ev .Reason , getEventCount (ev ), eventMessage )
150
156
} else {
151
- msg = fmt .Sprintf ("Detected unrecoverable event %s: %s. " , ev .Reason , ev . Message )
157
+ msg = fmt .Sprintf ("Detected unrecoverable event %s: %s" , ev .Reason , eventMessage )
152
158
}
153
159
return msg , nil
154
160
}
@@ -157,22 +163,110 @@ func CheckPodEvents(pod *corev1.Pod, workspaceID string, ignoredEvents []string,
157
163
return "" , nil
158
164
}
159
165
166
// Patterns used by getConcisePostStartFailureMessage. They are compiled once at
// package initialization rather than on every call.
var (
	// postStartScriptOutputPatterns match the messages printed by the postStart
	// wrapper script, listed in the order in which they are tried:
	//   - "[postStart hook] Commands terminated by SIGTERM (likely timed out after ...s). Exit code 143."
	//   - "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ...s expired). Exit code 137."
	//   - "[postStart hook] Commands failed with exit code ..." (any other script-reported exit code)
	postStartScriptOutputPatterns = []*regexp.Regexp{
		regexp.MustCompile(`(\[postStart hook\] Commands terminated by SIGTERM \(likely timed out after [^)]+?\)\. Exit code 143\.)`),
		regexp.MustCompile(`(\[postStart hook\] Commands forcefully killed by SIGKILL \(likely after --kill-after [^)]+?\)\. Exit code 137\.)`),
		regexp.MustCompile(`(\[postStart hook\] Commands failed with exit code \d+\.)`),
	}

	// kubeletMessageFieldPattern captures the content of an explicit
	// `message: "..."` field inside the Kubelet message, if there is one.
	kubeletMessageFieldPattern = regexp.MustCompile(`message:\s*"([^"]*)"`)

	// kubeletExitCodePattern captures the exit code from an "exited with <code>:" fragment.
	kubeletExitCodePattern = regexp.MustCompile(`exited with (\d+):`)
)

// findPostStartScriptOutput returns the first postStart script message found in
// text, trying postStartScriptOutputPatterns in order, or "" if none matches.
func findPostStartScriptOutput(text string) string {
	for _, pattern := range postStartScriptOutputPatterns {
		if match := pattern.FindString(text); match != "" {
			return match
		}
	}
	return ""
}

// getConcisePostStartFailureMessage tries to reduce the Kubelet's verbose message
// for a failed postStart hook to a short, user-friendly one.
//
// The following sources are consulted in order, and the first hit is returned:
//  1. a known script message inside the `message: "..."` field of kubeletMsg;
//  2. the exit code from an "exited with <code>:" fragment (143, 137, or any
//     other non-zero code);
//  3. a known script message anywhere in kubeletMsg;
//  4. a generic fallback message.
func getConcisePostStartFailureMessage(kubeletMsg string) string {
	// 1: look for the specific script output inside the explicit message field.
	if field := kubeletMessageFieldPattern.FindStringSubmatch(kubeletMsg); len(field) > 1 && field[1] != "" {
		if match := findPostStartScriptOutput(field[1]); match != "" {
			return match
		}
	}

	// 2: fall back to the exit code reported for the whole hook command.
	if code := kubeletExitCodePattern.FindStringSubmatch(kubeletMsg); len(code) > 1 {
		var exitCode int
		// The pattern guarantees digits only, so parsing can fail only when the
		// number does not fit in an int; in that case skip to the next step.
		if _, err := fmt.Sscanf(code[1], "%d", &exitCode); err == nil {
			switch exitCode {
			case 143: // SIGTERM
				return "[postStart hook] Commands terminated by SIGTERM due to timeout"
			case 137: // SIGKILL
				return "[postStart hook] Commands forcefully killed by SIGKILL due to timeout"
			case 0:
				// A zero exit code says nothing about the failure; keep looking.
			default: // other non-zero exit codes (e.g., 124, 127)
				return fmt.Sprintf("[postStart hook] Commands failed (Kubelet reported exit code %s)", code[1])
			}
		}
	}

	// 3: try the specific script outputs against the entire Kubelet message.
	if match := findPostStartScriptOutput(kubeletMsg); match != "" {
		return match
	}

	// 4: nothing recognizable was found.
	return "[postStart hook] failed with an unknown error (see pod events or container logs for more details)"
}
237
+
160
238
func CheckContainerStatusForFailure (containerStatus * corev1.ContainerStatus , ignoredEvents []string ) (ok bool , reason string ) {
161
239
if containerStatus .State .Waiting != nil {
240
+ // Explicitly check for PostStartHookError
241
+ if containerStatus .State .Waiting .Reason == "PostStartHookError" { // Kubelet uses this reason
242
+ conciseMsg := getConcisePostStartFailureMessage (containerStatus .State .Waiting .Message )
243
+ return checkIfUnrecoverableEventIgnored ("FailedPostStartHook" , ignoredEvents ), conciseMsg
244
+ }
245
+ // Check against other generic failure reasons
162
246
for _ , failureReason := range containerFailureStateReasons {
163
247
if containerStatus .State .Waiting .Reason == failureReason {
164
- return checkIfUnrecoverableEventIgnored (containerStatus .State .Waiting .Reason , ignoredEvents ), containerStatus .State .Waiting .Reason
248
+ return checkIfUnrecoverableEventIgnored (containerStatus .State .Waiting .Reason , ignoredEvents ),
249
+ containerStatus .State .Waiting .Reason
165
250
}
166
251
}
167
252
}
168
253
169
254
if containerStatus .State .Terminated != nil {
255
+ // Check if termination was due to a generic error, which might include postStart issues
256
+ // if the container failed to run.
257
+ if containerStatus .State .Terminated .Reason == "Error" || containerStatus .State .Terminated .Reason == "ContainerCannotRun" {
258
+ return checkIfUnrecoverableEventIgnored (containerStatus .State .Terminated .Reason , ignoredEvents ),
259
+ fmt .Sprintf ("%s: %s" , containerStatus .State .Terminated .Reason , containerStatus .State .Terminated .Message )
260
+ }
261
+ // Check against other generic failure reasons for terminated state
170
262
for _ , failureReason := range containerFailureStateReasons {
171
263
if containerStatus .State .Terminated .Reason == failureReason {
172
- return checkIfUnrecoverableEventIgnored (containerStatus .State .Terminated .Reason , ignoredEvents ), containerStatus .State .Terminated .Reason
264
+ return checkIfUnrecoverableEventIgnored (containerStatus .State .Terminated .Reason , ignoredEvents ),
265
+ containerStatus .State .Terminated .Reason
173
266
}
174
267
}
175
268
}
269
+
176
270
return true , ""
177
271
}
178
272
0 commit comments