Skip to content

Commit 0d16ee5

Browse files
committed
feat: timeout the postStart hook commands
Signed-off-by: Oleksii Kurinnyi <[email protected]>
1 parent 1a65450 commit 0d16ee5

File tree

6 files changed

+173
-18
lines changed

6 files changed

+173
-18
lines changed

apis/controller/v1alpha1/devworkspaceoperatorconfig_types.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,16 @@ type WorkspaceConfig struct {
186186
RuntimeClassName *string `json:"runtimeClassName,omitempty"`
187187
// CleanupCronJobConfig defines configuration options for a cron job that automatically cleans up stale DevWorkspaces.
188188
CleanupCronJob *CleanupCronJobConfig `json:"cleanupCronJob,omitempty"`
189+
// PostStartTimeout defines the maximum duration the PostStart hook can run
190+
// before it is automatically failed. This timeout is used for the postStart lifecycle hook
191+
// that is used to run commands in the workspace container. The timeout is specified in seconds.
192+
// If not specified, the timeout is disabled (0 seconds).
193+
// +kubebuilder:validation:Minimum=0
194+
// +kubebuilder:default:=0
195+
// +kubebuilder:validation:Optional
196+
// +kubebuilder:validation:Type=integer
197+
// +kubebuilder:validation:Format=int32
198+
PostStartTimeout *int32 `json:"postStartTimeout,omitempty"`
189199
}
190200

191201
type WebhookConfig struct {

controllers/workspace/devworkspace_controller.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,9 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request
329329
&workspace.Spec.Template,
330330
workspace.Config.Workspace.ContainerSecurityContext,
331331
workspace.Config.Workspace.ImagePullPolicy,
332-
workspace.Config.Workspace.DefaultContainerResources)
332+
workspace.Config.Workspace.DefaultContainerResources,
333+
workspace.Config.Workspace.PostStartTimeout,
334+
)
333335
if err != nil {
334336
return r.failWorkspace(workspace, fmt.Sprintf("Error processing devfile: %s", err), metrics.ReasonBadRequest, reqLogger, &reconcileStatus), nil
335337
}

pkg/config/sync.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,10 @@ func mergeConfig(from, to *controller.OperatorConfiguration) {
428428
to.Workspace.CleanupCronJob.Schedule = from.Workspace.CleanupCronJob.Schedule
429429
}
430430
}
431+
432+
if from.Workspace.PostStartTimeout != nil {
433+
to.Workspace.PostStartTimeout = from.Workspace.PostStartTimeout
434+
}
431435
}
432436
}
433437

pkg/library/container/container.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ import (
4545
// rewritten as Volumes are added to PodAdditions, in order to support e.g. using one PVC to hold all volumes
4646
//
4747
// Note: Requires DevWorkspace to be flattened (i.e. the DevWorkspace contains no Parent or Components of type Plugin)
48-
func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements) (*v1alpha1.PodAdditions, error) {
48+
func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements, postStartTimeout *int32) (*v1alpha1.PodAdditions, error) {
4949
if !flatten.DevWorkspaceIsFlattened(workspace, nil) {
5050
return nil, fmt.Errorf("devfile is not flattened")
5151
}
@@ -77,7 +77,7 @@ func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securi
7777
podAdditions.Containers = append(podAdditions.Containers, *k8sContainer)
7878
}
7979

80-
if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers); err != nil {
80+
if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers, postStartTimeout); err != nil {
8181
return nil, err
8282
}
8383

pkg/library/lifecycle/poststart.go

Lines changed: 56 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,20 @@ import (
2121
corev1 "k8s.io/api/core/v1"
2222
)
2323

24-
const redirectOutputFmt = `{
25-
%s
26-
} 1>/tmp/poststart-stdout.txt 2>/tmp/poststart-stderr.txt
24+
const (
25+
// `tee` both stdout and stderr to files and to the main output streams.
26+
redirectOutputFmt = `{
27+
# This script block ensures its exit code is preserved
28+
# while its stdout and stderr are tee'd.
29+
_script_to_run() {
30+
%s # This will be replaced by scriptWithTimeout
31+
}
32+
_script_to_run
33+
} 1> >(tee -a "/tmp/poststart-stdout.txt") 2> >(tee -a "/tmp/poststart-stderr.txt" >&2)
2734
`
35+
)
2836

29-
func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container) error {
37+
func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout *int32) error {
3038
if wksp.Events == nil || len(wksp.Events.PostStart) == 0 {
3139
return nil
3240
}
@@ -54,7 +62,7 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []
5462
return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err)
5563
}
5664

57-
postStartHandler, err := processCommandsForPostStart(commands)
65+
postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout)
5866
if err != nil {
5967
return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err)
6068
}
@@ -79,27 +87,64 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []
7987
// - |
8088
// cd <workingDir>
8189
// <commandline>
82-
func processCommandsForPostStart(commands []dw.Command) (*corev1.LifecycleHandler, error) {
83-
var dwCommands []string
90+
func processCommandsForPostStart(commands []dw.Command, postStartTimeout *int32) (*corev1.LifecycleHandler, error) {
91+
var commandScriptLines []string
8492
for _, command := range commands {
8593
execCmd := command.Exec
8694
if len(execCmd.Env) > 0 {
8795
return nil, fmt.Errorf("env vars in postStart command %s are unsupported", command.Id)
8896
}
97+
var singleCommandParts []string
8998
if execCmd.WorkingDir != "" {
90-
dwCommands = append(dwCommands, fmt.Sprintf("cd %s", execCmd.WorkingDir))
99+
// Safely quote the working directory path
100+
safeWorkingDir := strings.ReplaceAll(execCmd.WorkingDir, "'", `'\''`)
101+
singleCommandParts = append(singleCommandParts, fmt.Sprintf("cd '%s'", safeWorkingDir))
102+
}
103+
if execCmd.CommandLine != "" {
104+
singleCommandParts = append(singleCommandParts, execCmd.CommandLine)
105+
}
106+
if len(singleCommandParts) > 0 {
107+
commandScriptLines = append(commandScriptLines, strings.Join(singleCommandParts, " && "))
91108
}
92-
dwCommands = append(dwCommands, execCmd.CommandLine)
93109
}
94110

95-
joinedCommands := strings.Join(dwCommands, "\n")
111+
originalUserScript := strings.Join(commandScriptLines, "\n")
112+
113+
scriptToExecute := "set -e\n" + originalUserScript
114+
escapedUserScript := strings.ReplaceAll(scriptToExecute, "'", `'\''`)
115+
116+
scriptWithTimeout := fmt.Sprintf(`
117+
export POSTSTART_TIMEOUT_DURATION="%d"
118+
export POSTSTART_KILL_AFTER_DURATION="5"
119+
120+
echo "[postStart hook] Executing commands with timeout: ${POSTSTART_TIMEOUT_DURATION} s, kill after: ${POSTSTART_KILL_AFTER_DURATION} s" >&2
121+
122+
# Run the user's script under the 'timeout' utility.
123+
timeout --preserve-status --kill-after="${POSTSTART_KILL_AFTER_DURATION}" "${POSTSTART_TIMEOUT_DURATION}" /bin/sh -c '%s'
124+
exit_code=$?
125+
126+
# Check the exit code from 'timeout'
127+
if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM)
128+
echo "[postStart hook] Commands terminated by SIGTERM (likely timed out after ${POSTSTART_TIMEOUT_DURATION}s). Exit code 143." >&2
129+
elif [ $exit_code -eq 137 ]; then # 128 + 9 (SIGKILL)
130+
echo "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ${POSTSTART_KILL_AFTER_DURATION}s expired). Exit code 137." >&2
131+
elif [ $exit_code -ne 0 ]; then # Catches any other non-zero exit code, including 124
132+
echo "[postStart hook] Commands failed with exit code $exit_code." >&2
133+
else
134+
echo "[postStart hook] Commands completed successfully within the time limit." >&2
135+
fi
136+
137+
exit $exit_code
138+
`, *postStartTimeout, escapedUserScript)
139+
140+
finalScriptForHook := fmt.Sprintf(redirectOutputFmt, scriptWithTimeout)
96141

97142
handler := &corev1.LifecycleHandler{
98143
Exec: &corev1.ExecAction{
99144
Command: []string{
100145
"/bin/sh",
101146
"-c",
102-
fmt.Sprintf(redirectOutputFmt, joinedCommands),
147+
finalScriptForHook,
103148
},
104149
},
105150
}

pkg/library/status/check.go

Lines changed: 98 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package status
1818
import (
1919
"context"
2020
"fmt"
21+
"regexp"
2122
"strings"
2223

2324
"github.com/devfile/devworkspace-operator/pkg/common"
@@ -145,10 +146,15 @@ func CheckPodEvents(pod *corev1.Pod, workspaceID string, ignoredEvents []string,
145146
if maxCount, isUnrecoverableEvent := unrecoverablePodEventReasons[ev.Reason]; isUnrecoverableEvent {
146147
if !checkIfUnrecoverableEventIgnored(ev.Reason, ignoredEvents) && getEventCount(ev) >= maxCount {
147148
var msg string
149+
eventMessage := ev.Message // Original Kubelet message from the event
150+
if ev.Reason == "FailedPostStartHook" {
151+
eventMessage = getConcisePostStartFailureMessage(ev.Message)
152+
}
153+
148154
if getEventCount(ev) > 1 {
149-
msg = fmt.Sprintf("Detected unrecoverable event %s %d times: %s.", ev.Reason, getEventCount(ev), ev.Message)
155+
msg = fmt.Sprintf("Detected unrecoverable event %s %d times: %s", ev.Reason, getEventCount(ev), eventMessage)
150156
} else {
151-
msg = fmt.Sprintf("Detected unrecoverable event %s: %s.", ev.Reason, ev.Message)
157+
msg = fmt.Sprintf("Detected unrecoverable event %s: %s", ev.Reason, eventMessage)
152158
}
153159
return msg, nil
154160
}
@@ -157,22 +163,110 @@ func CheckPodEvents(pod *corev1.Pod, workspaceID string, ignoredEvents []string,
157163
return "", nil
158164
}
159165

166+
// getConcisePostStartFailureMessage tries to parse the Kubelet's verbose message
167+
// for a PostStartHookError into a more user-friendly one.
168+
func getConcisePostStartFailureMessage(kubeletMsg string) string {
169+
170+
/* regexes for specific messages from our postStart script's output */
171+
172+
// matches: "[postStart hook] Commands terminated by SIGTERM (likely timed out after ...s). Exit code 143."
173+
reTerminatedSigterm := regexp.MustCompile(`(\[postStart hook\] Commands terminated by SIGTERM \(likely timed out after [^)]+?\)\. Exit code 143\.)`)
174+
175+
// matches: "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ...s expired). Exit code 137."
176+
reKilledSigkill := regexp.MustCompile(`(\[postStart hook\] Commands forcefully killed by SIGKILL \(likely after --kill-after [^)]+?\)\. Exit code 137\.)`)
177+
178+
// matches: "[postStart hook] Commands failed with exit code ..." (for any other script-reported non-zero exit code)
179+
reGenericFailedExitCode := regexp.MustCompile(`(\[postStart hook\] Commands failed with exit code \d+\.)`)
180+
181+
// regex to capture Kubelet's explicit message field content if it exists
182+
reKubeletInternalMessage := regexp.MustCompile(`message:\s*"([^"]*)"`)
183+
184+
// regex to capture Kubelet's reported exit code for the hook command
185+
reKubeletExitCode := regexp.MustCompile(`exited with (\d+):`)
186+
187+
/* 1: check Kubelet's explicit `message: "..."` field for the specific output */
188+
189+
kubeletInternalMsgMatch := reKubeletInternalMessage.FindStringSubmatch(kubeletMsg)
190+
if len(kubeletInternalMsgMatch) > 1 && kubeletInternalMsgMatch[1] != "" {
191+
internalMsg := kubeletInternalMsgMatch[1]
192+
if match := reTerminatedSigterm.FindString(internalMsg); match != "" {
193+
return match
194+
}
195+
if match := reKilledSigkill.FindString(internalMsg); match != "" {
196+
return match
197+
}
198+
if match := reGenericFailedExitCode.FindString(internalMsg); match != "" {
199+
return match
200+
}
201+
}
202+
203+
/* 2: parse Kubelet's reported exit code for the entire hook command */
204+
205+
matchesKubeletExitCode := reKubeletExitCode.FindStringSubmatch(kubeletMsg)
206+
if len(matchesKubeletExitCode) > 1 {
207+
exitCodeStr := matchesKubeletExitCode[1]
208+
var exitCode int
209+
fmt.Sscanf(exitCodeStr, "%d", &exitCode)
210+
211+
// generate messages indicating the source is Kubelet's reported exit code
212+
if exitCode == 143 { // SIGTERM
213+
return "[postStart hook] Commands terminated by SIGTERM due to timeout"
214+
} else if exitCode == 137 { // SIGKILL
215+
return "[postStart hook] Commands forcefully killed by SIGKILL due to timeout"
216+
} else if exitCode != 0 { // Other non-zero exit codes (e.g., 124, 127)
217+
return fmt.Sprintf("[postStart hook] Commands failed (Kubelet reported exit code %s)", exitCodeStr)
218+
}
219+
}
220+
221+
/* 3: try to match specific script outputs against the *entire* Kubelet message */
222+
223+
if match := reTerminatedSigterm.FindString(kubeletMsg); match != "" {
224+
return match
225+
}
226+
if match := reKilledSigkill.FindString(kubeletMsg); match != "" {
227+
return match
228+
}
229+
if match := reGenericFailedExitCode.FindString(kubeletMsg); match != "" {
230+
return match
231+
}
232+
233+
/* 4: fallback */
234+
235+
return "[postStart hook] failed with an unknown error (see pod events or container logs for more details)"
236+
}
237+
160238
func CheckContainerStatusForFailure(containerStatus *corev1.ContainerStatus, ignoredEvents []string) (ok bool, reason string) {
161239
if containerStatus.State.Waiting != nil {
240+
// Explicitly check for PostStartHookError
241+
if containerStatus.State.Waiting.Reason == "PostStartHookError" { // Kubelet uses this reason
242+
conciseMsg := getConcisePostStartFailureMessage(containerStatus.State.Waiting.Message)
243+
return checkIfUnrecoverableEventIgnored("FailedPostStartHook", ignoredEvents), conciseMsg
244+
}
245+
// Check against other generic failure reasons
162246
for _, failureReason := range containerFailureStateReasons {
163247
if containerStatus.State.Waiting.Reason == failureReason {
164-
return checkIfUnrecoverableEventIgnored(containerStatus.State.Waiting.Reason, ignoredEvents), containerStatus.State.Waiting.Reason
248+
return checkIfUnrecoverableEventIgnored(containerStatus.State.Waiting.Reason, ignoredEvents),
249+
containerStatus.State.Waiting.Reason
165250
}
166251
}
167252
}
168253

169254
if containerStatus.State.Terminated != nil {
255+
// Check if termination was due to a generic error, which might include postStart issues
256+
// if the container failed to run.
257+
if containerStatus.State.Terminated.Reason == "Error" || containerStatus.State.Terminated.Reason == "ContainerCannotRun" {
258+
return checkIfUnrecoverableEventIgnored(containerStatus.State.Terminated.Reason, ignoredEvents),
259+
fmt.Sprintf("%s: %s", containerStatus.State.Terminated.Reason, containerStatus.State.Terminated.Message)
260+
}
261+
// Check against other generic failure reasons for terminated state
170262
for _, failureReason := range containerFailureStateReasons {
171263
if containerStatus.State.Terminated.Reason == failureReason {
172-
return checkIfUnrecoverableEventIgnored(containerStatus.State.Terminated.Reason, ignoredEvents), containerStatus.State.Terminated.Reason
264+
return checkIfUnrecoverableEventIgnored(containerStatus.State.Terminated.Reason, ignoredEvents),
265+
containerStatus.State.Terminated.Reason
173266
}
174267
}
175268
}
269+
176270
return true, ""
177271
}
178272

0 commit comments

Comments
 (0)