diff --git a/apis/controller/v1alpha1/devworkspaceoperatorconfig_types.go b/apis/controller/v1alpha1/devworkspaceoperatorconfig_types.go index 4d0303630..81aa5987e 100644 --- a/apis/controller/v1alpha1/devworkspaceoperatorconfig_types.go +++ b/apis/controller/v1alpha1/devworkspaceoperatorconfig_types.go @@ -189,6 +189,13 @@ type WorkspaceConfig struct { RuntimeClassName *string `json:"runtimeClassName,omitempty"` // CleanupCronJobConfig defines configuration options for a cron job that automatically cleans up stale DevWorkspaces. CleanupCronJob *CleanupCronJobConfig `json:"cleanupCronJob,omitempty"` + // PostStartTimeout defines the maximum duration the PostStart hook can run + // before it is automatically failed. This timeout is used for the postStart lifecycle hook + // that is used to run commands in the workspace container. The timeout is applied with whole-second granularity. + // Duration should be specified in a format parseable by Go's time package, e.g. "20s", "2m". + // If not specified or "0", the timeout is disabled. 
+ // +kubebuilder:validation:Optional + PostStartTimeout string `json:"postStartTimeout,omitempty"` } type WebhookConfig struct { diff --git a/controllers/workspace/devworkspace_controller.go b/controllers/workspace/devworkspace_controller.go index 0fecaa364..31fc05d36 100644 --- a/controllers/workspace/devworkspace_controller.go +++ b/controllers/workspace/devworkspace_controller.go @@ -327,7 +327,9 @@ func (r *DevWorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request &workspace.Spec.Template, workspace.Config.Workspace.ContainerSecurityContext, workspace.Config.Workspace.ImagePullPolicy, - workspace.Config.Workspace.DefaultContainerResources) + workspace.Config.Workspace.DefaultContainerResources, + workspace.Config.Workspace.PostStartTimeout, + ) if err != nil { return r.failWorkspace(workspace, fmt.Sprintf("Error processing devfile: %s", err), metrics.ReasonBadRequest, reqLogger, &reconcileStatus), nil } diff --git a/deploy/bundle/manifests/controller.devfile.io_devworkspaceoperatorconfigs.yaml b/deploy/bundle/manifests/controller.devfile.io_devworkspaceoperatorconfigs.yaml index 4aee38cd5..073cd8f7a 100644 --- a/deploy/bundle/manifests/controller.devfile.io_devworkspaceoperatorconfigs.yaml +++ b/deploy/bundle/manifests/controller.devfile.io_devworkspaceoperatorconfigs.yaml @@ -2764,6 +2764,14 @@ spec: type: string type: object type: object + postStartTimeout: + description: |- + PostStartTimeout defines the maximum duration the PostStart hook can run + before it is automatically failed. This timeout is used for the postStart lifecycle hook + that is used to run commands in the workspace container. The timeout is applied with whole-second granularity. + Duration should be specified in a format parseable by Go's time package, e.g. "20s", "2m". + If not specified or "0", the timeout is disabled. 
+ type: string progressTimeout: description: |- ProgressTimeout determines the maximum duration a DevWorkspace can be in diff --git a/deploy/deployment/kubernetes/combined.yaml b/deploy/deployment/kubernetes/combined.yaml index 007fa31bd..98c860a86 100644 --- a/deploy/deployment/kubernetes/combined.yaml +++ b/deploy/deployment/kubernetes/combined.yaml @@ -2900,6 +2900,14 @@ spec: type: string type: object type: object + postStartTimeout: + description: |- + PostStartTimeout defines the maximum duration the PostStart hook can run + before it is automatically failed. This timeout is used for the postStart lifecycle hook + that is used to run commands in the workspace container. The timeout is applied with whole-second granularity. + Duration should be specified in a format parseable by Go's time package, e.g. "20s", "2m". + If not specified or "0", the timeout is disabled. + type: string progressTimeout: description: |- ProgressTimeout determines the maximum duration a DevWorkspace can be in diff --git a/deploy/deployment/kubernetes/objects/devworkspaceoperatorconfigs.controller.devfile.io.CustomResourceDefinition.yaml b/deploy/deployment/kubernetes/objects/devworkspaceoperatorconfigs.controller.devfile.io.CustomResourceDefinition.yaml index c826f4c11..2718a363e 100644 --- a/deploy/deployment/kubernetes/objects/devworkspaceoperatorconfigs.controller.devfile.io.CustomResourceDefinition.yaml +++ b/deploy/deployment/kubernetes/objects/devworkspaceoperatorconfigs.controller.devfile.io.CustomResourceDefinition.yaml @@ -2900,6 +2900,14 @@ spec: type: string type: object type: object + postStartTimeout: + description: |- + PostStartTimeout defines the maximum duration the PostStart hook can run + before it is automatically failed. This timeout is used for the postStart lifecycle hook + that is used to run commands in the workspace container. The timeout is applied with whole-second granularity. + Duration should be specified in a format parseable by Go's time package, e.g. "20s", "2m". 
+ If not specified or "0", the timeout is disabled. + type: string progressTimeout: description: |- ProgressTimeout determines the maximum duration a DevWorkspace can be in diff --git a/deploy/deployment/openshift/combined.yaml b/deploy/deployment/openshift/combined.yaml index 26c4d1373..dfae8130f 100644 --- a/deploy/deployment/openshift/combined.yaml +++ b/deploy/deployment/openshift/combined.yaml @@ -2900,6 +2900,14 @@ spec: type: string type: object type: object + postStartTimeout: + description: |- + PostStartTimeout defines the maximum duration the PostStart hook can run + before it is automatically failed. This timeout is used for the postStart lifecycle hook + that is used to run commands in the workspace container. The timeout is applied with whole-second granularity. + Duration should be specified in a format parseable by Go's time package, e.g. "20s", "2m". + If not specified or "0", the timeout is disabled. + type: string progressTimeout: description: |- ProgressTimeout determines the maximum duration a DevWorkspace can be in diff --git a/deploy/deployment/openshift/objects/devworkspaceoperatorconfigs.controller.devfile.io.CustomResourceDefinition.yaml b/deploy/deployment/openshift/objects/devworkspaceoperatorconfigs.controller.devfile.io.CustomResourceDefinition.yaml index c826f4c11..2718a363e 100644 --- a/deploy/deployment/openshift/objects/devworkspaceoperatorconfigs.controller.devfile.io.CustomResourceDefinition.yaml +++ b/deploy/deployment/openshift/objects/devworkspaceoperatorconfigs.controller.devfile.io.CustomResourceDefinition.yaml @@ -2900,6 +2900,14 @@ spec: type: string type: object type: object + postStartTimeout: + description: |- + PostStartTimeout defines the maximum duration the PostStart hook can run + before it is automatically failed. This timeout is used for the postStart lifecycle hook + that is used to run commands in the workspace container. The timeout is applied with whole-second granularity. 
+ Duration should be specified in a format parseable by Go's time package, e.g. "20s", "2m". + If not specified or "0", the timeout is disabled. + type: string progressTimeout: description: |- ProgressTimeout determines the maximum duration a DevWorkspace can be in diff --git a/deploy/templates/crd/bases/controller.devfile.io_devworkspaceoperatorconfigs.yaml b/deploy/templates/crd/bases/controller.devfile.io_devworkspaceoperatorconfigs.yaml index f41ef271f..6927c9195 100644 --- a/deploy/templates/crd/bases/controller.devfile.io_devworkspaceoperatorconfigs.yaml +++ b/deploy/templates/crd/bases/controller.devfile.io_devworkspaceoperatorconfigs.yaml @@ -2898,6 +2898,14 @@ spec: type: string type: object type: object + postStartTimeout: + description: |- + PostStartTimeout defines the maximum duration the PostStart hook can run + before it is automatically failed. This timeout is used for the postStart lifecycle hook + that is used to run commands in the workspace container. The timeout is applied with whole-second granularity. + Duration should be specified in a format parseable by Go's time package, e.g. "20s", "2m". + If not specified or "0", the timeout is disabled. 
+ type: string progressTimeout: description: |- ProgressTimeout determines the maximum duration a DevWorkspace can be in diff --git a/pkg/config/sync.go b/pkg/config/sync.go index c5b8150f7..3b774f129 100644 --- a/pkg/config/sync.go +++ b/pkg/config/sync.go @@ -431,6 +431,10 @@ func mergeConfig(from, to *controller.OperatorConfiguration) { to.Workspace.CleanupCronJob.Schedule = from.Workspace.CleanupCronJob.Schedule } } + + if from.Workspace.PostStartTimeout != "" { + to.Workspace.PostStartTimeout = from.Workspace.PostStartTimeout + } } } @@ -601,6 +605,9 @@ func GetCurrentConfigString(currConfig *controller.OperatorConfiguration) string if workspace.IdleTimeout != defaultConfig.Workspace.IdleTimeout { config = append(config, fmt.Sprintf("workspace.idleTimeout=%s", workspace.IdleTimeout)) } + if workspace.PostStartTimeout != defaultConfig.Workspace.PostStartTimeout { + config = append(config, fmt.Sprintf("workspace.postStartTimeout=%s", workspace.PostStartTimeout)) + } if workspace.ProgressTimeout != "" && workspace.ProgressTimeout != defaultConfig.Workspace.ProgressTimeout { config = append(config, fmt.Sprintf("workspace.progressTimeout=%s", workspace.ProgressTimeout)) } diff --git a/pkg/library/container/container.go b/pkg/library/container/container.go index c69bc4b40..03132b90c 100644 --- a/pkg/library/container/container.go +++ b/pkg/library/container/container.go @@ -45,7 +45,7 @@ import ( // rewritten as Volumes are added to PodAdditions, in order to support e.g. using one PVC to hold all volumes // // Note: Requires DevWorkspace to be flattened (i.e. 
the DevWorkspace contains no Parent or Components of type Plugin) -func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements) (*v1alpha1.PodAdditions, error) { +func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securityContext *corev1.SecurityContext, pullPolicy string, defaultResources *corev1.ResourceRequirements, postStartTimeout string) (*v1alpha1.PodAdditions, error) { if !flatten.DevWorkspaceIsFlattened(workspace, nil) { return nil, fmt.Errorf("devfile is not flattened") } @@ -77,7 +77,7 @@ func GetKubeContainersFromDevfile(workspace *dw.DevWorkspaceTemplateSpec, securi podAdditions.Containers = append(podAdditions.Containers, *k8sContainer) } - if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers); err != nil { + if err := lifecycle.AddPostStartLifecycleHooks(workspace, podAdditions.Containers, postStartTimeout); err != nil { return nil, err } diff --git a/pkg/library/container/container_test.go b/pkg/library/container/container_test.go index 6b6f23463..36bf47738 100644 --- a/pkg/library/container/container_test.go +++ b/pkg/library/container/container_test.go @@ -87,7 +87,7 @@ func TestGetKubeContainersFromDevfile(t *testing.T) { t.Run(tt.Name, func(t *testing.T) { // sanity check that file is read correctly. 
assert.True(t, len(tt.Input.Components) > 0, "Input defines no components") - gotPodAdditions, err := GetKubeContainersFromDevfile(tt.Input, nil, testImagePullPolicy, defaultResources) + gotPodAdditions, err := GetKubeContainersFromDevfile(tt.Input, nil, testImagePullPolicy, defaultResources, "") if tt.Output.ErrRegexp != nil && assert.Error(t, err) { assert.Regexp(t, *tt.Output.ErrRegexp, err.Error(), "Error message should match") } else { diff --git a/pkg/library/lifecycle/poststart.go b/pkg/library/lifecycle/poststart.go index 572efa60c..f2a78f232 100644 --- a/pkg/library/lifecycle/poststart.go +++ b/pkg/library/lifecycle/poststart.go @@ -16,17 +16,32 @@ package lifecycle import ( "fmt" "strings" + "time" dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2" corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/log" ) -const redirectOutputFmt = `{ +const ( + // `tee` both stdout and stderr to files and to the main output streams. + redirectOutputFmt = `{ + # This script block ensures its exit code is preserved + # while its stdout and stderr are tee'd. 
+ _script_to_run() { + %s # This will be replaced by scriptWithTimeout + } + _script_to_run +} 1> >(tee -a "/tmp/poststart-stdout.txt") 2> >(tee -a "/tmp/poststart-stderr.txt" >&2) +` + + noTimeoutRedirectOutputFmt = `{ %s } 1>/tmp/poststart-stdout.txt 2>/tmp/poststart-stderr.txt ` +) -func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container) error { +func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout string) error { if wksp.Events == nil || len(wksp.Events.PostStart) == 0 { return nil } @@ -54,7 +69,7 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers [] return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err) } - postStartHandler, err := processCommandsForPostStart(commands) + postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout) if err != nil { return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err) } @@ -68,7 +83,41 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers [] return nil } -// processCommandsForPostStart builds a lifecycle handler that runs the provided command(s) +// processCommandsForPostStart builds the corev1.LifecycleHandler for the PostStart lifecycle hook from the given DevWorkspace commands. +// With a non-empty postStartTimeout the commands are wrapped by generateScriptWithTimeout and redirectOutputFmt and run via `/bin/sh -c`. NOTE(review): redirectOutputFmt relies on process substitution `>(tee ...)`, which POSIX sh does not specify - confirm that /bin/sh in workspace images accepts it. +func processCommandsForPostStart(commands []dw.Command, postStartTimeout string) (*corev1.LifecycleHandler, error) { + if postStartTimeout == "" { + // No timeout was propagated: fall back to the handler without the timeout wrapper. 
+ return processCommandsWithoutTimeoutFallback(commands) + } + + originalUserScript, err := buildUserScript(commands) + if err != nil { + return nil, fmt.Errorf("failed to build aggregated user script: %w", err) + } + + // The user script is prefixed with 'set -e' to ensure it exits on error. + // generateScriptWithTimeout embeds it in `/bin/sh -c '...'`, so every single quote is rewritten as '\'' first. 
+ scriptToExecute := "set -e\n" + originalUserScript + escapedUserScriptForTimeoutWrapper := strings.ReplaceAll(scriptToExecute, "'", `'\''`) + + fullScriptWithTimeout := generateScriptWithTimeout(escapedUserScriptForTimeoutWrapper, postStartTimeout) + + finalScriptForHook := fmt.Sprintf(redirectOutputFmt, fullScriptWithTimeout) + + handler := &corev1.LifecycleHandler{ + Exec: &corev1.ExecAction{ + Command: []string{ + "/bin/sh", + "-c", + finalScriptForHook, + }, + }, + } + return handler, nil +} + +// processCommandsWithoutTimeoutFallback builds a lifecycle handler that runs the provided command(s) // The command has the format // // exec: @@ -79,7 +128,7 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers [] // - | // cd // -func processCommandsForPostStart(commands []dw.Command) (*corev1.LifecycleHandler, error) { +func processCommandsWithoutTimeoutFallback(commands []dw.Command) (*corev1.LifecycleHandler, error) { var dwCommands []string for _, command := range commands { execCmd := command.Exec @@ -99,9 +148,96 @@ func processCommandsForPostStart(commands []dw.Command) (*corev1.LifecycleHandle Command: []string{ "/bin/sh", "-c", - fmt.Sprintf(redirectOutputFmt, joinedCommands), + fmt.Sprintf(noTimeoutRedirectOutputFmt, joinedCommands), }, }, } return handler, nil } + +// buildUserScript joins the given DevWorkspace commands into one shell script with one line per command: env exports, a `cd` into the +// working dir and the command line are chained with "&&". A command without Exec is an error; a command with nothing to run adds no line. 
+func buildUserScript(commands []dw.Command) (string, error) { + scriptLines := make([]string, 0, len(commands)) + for _, cmd := range commands { + if cmd.Exec == nil { + // Earlier validation is expected to reject this; guard anyway. + return "", fmt.Errorf("exec command is nil for command ID %s", cmd.Id) + } + + var steps []string + for _, env := range cmd.Exec.Env { + steps = append(steps, fmt.Sprintf("export %s=%q", env.Name, env.Value)) + } + if dir := cmd.Exec.WorkingDir; dir != "" { + steps = append(steps, fmt.Sprintf("cd %q", dir)) + } + if line := cmd.Exec.CommandLine; line != "" { + steps = append(steps, line) + } + if len(steps) == 0 { + continue + } + scriptLines = append(scriptLines, strings.Join(steps, " && ")) + } + return strings.Join(scriptLines, "\n"), nil +} + +// generateScriptWithTimeout renders the shell snippet that runs escapedUserScript through `/bin/sh -c '...'`, +// prefixed with the `timeout` utility when the container provides one, and that exits with the exit code of that invocation. +// postStartTimeout is converted to whole seconds; the kill-after grace period is fixed at 5 seconds (POSTSTART_KILL_AFTER_DURATION). +// NOTE(review): `--preserve-status` and `--kill-after` are GNU coreutils options - confirm the `timeout` shipped in workspace images accepts them. 
+func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string) string { + // Convert `postStartTimeout` into whole seconds for the `timeout` utility; 0 disables the timeout. + var timeoutSeconds int64 + if postStartTimeout != "" && postStartTimeout != "0" { + duration, err := time.ParseDuration(postStartTimeout) + switch { + case err != nil: + log.Log.Error(err, "Could not parse post-start timeout, disabling timeout", "value", postStartTimeout) + case duration < 0: + // A negative value would reach `timeout` as an invalid option and fail every hook. + log.Log.Info("Negative post-start timeout, disabling timeout", "value", postStartTimeout) + case duration > 0: + // Round up to whole seconds: truncating a sub-second value such as "500ms" to 0 would silently disable the timeout. + timeoutSeconds = int64((duration-1)/time.Second) + 1 + } + } + + return fmt.Sprintf(` +export POSTSTART_TIMEOUT_DURATION="%d" +export POSTSTART_KILL_AFTER_DURATION="5" + +_TIMEOUT_COMMAND_PART="" +_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean + +if command -v timeout >/dev/null 2>&1; then + echo "[postStart hook] Executing commands with timeout: ${POSTSTART_TIMEOUT_DURATION} seconds, kill after: ${POSTSTART_KILL_AFTER_DURATION} seconds" >&2 + _TIMEOUT_COMMAND_PART="timeout --preserve-status --kill-after=${POSTSTART_KILL_AFTER_DURATION} ${POSTSTART_TIMEOUT_DURATION}" + _WAS_TIMEOUT_USED="true" +else + echo "[postStart hook] WARNING: 'timeout' utility not found. Executing commands without timeout." >&2 +fi + +# Execute the user's script +${_TIMEOUT_COMMAND_PART} /bin/sh -c '%s' +exit_code=$? + +# Check the exit code based on whether timeout was attempted +if [ "$_WAS_TIMEOUT_USED" = "true" ]; then + if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) + echo "[postStart hook] Commands terminated by SIGTERM (likely timed out after ${POSTSTART_TIMEOUT_DURATION}s). Exit code 143." 
>&2 + elif [ $exit_code -eq 137 ]; then # 128 + 9 (SIGKILL) + echo "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ${POSTSTART_KILL_AFTER_DURATION}s expired). Exit code 137." >&2 + elif [ $exit_code -ne 0 ]; then # Catches any other non-zero exit code + echo "[postStart hook] Commands failed with exit code $exit_code." 
>&2 + else + echo "[postStart hook] Commands completed successfully within the time limit." >&2 + fi +else + if [ $exit_code -ne 0 ]; then + echo "[postStart hook] Commands failed with exit code $exit_code (no timeout)." >&2 + else + echo "[postStart hook] Commands completed successfully (no timeout)." >&2 + fi +fi + +exit $exit_code +`, timeoutSeconds, escapedUserScript) +} diff --git a/pkg/library/lifecycle/poststart_test.go b/pkg/library/lifecycle/poststart_test.go index e49197163..b487858ea 100644 --- a/pkg/library/lifecycle/poststart_test.go +++ b/pkg/library/lifecycle/poststart_test.go @@ -75,7 +75,8 @@ func TestAddPostStartLifecycleHooks(t *testing.T) { tests := loadAllPostStartTestCasesOrPanic(t, "./testdata/postStart") for _, tt := range tests { t.Run(fmt.Sprintf("%s (%s)", tt.Name, tt.testPath), func(t *testing.T) { - err := AddPostStartLifecycleHooks(tt.Input.Devfile, tt.Input.Containers) + var timeout string + err := AddPostStartLifecycleHooks(tt.Input.Devfile, tt.Input.Containers, timeout) if tt.Output.ErrRegexp != nil && assert.Error(t, err) { assert.Regexp(t, *tt.Output.ErrRegexp, err.Error(), "Error message should match") } else { @@ -87,3 +88,451 @@ func TestAddPostStartLifecycleHooks(t *testing.T) { }) } } + +func TestBuildUserScript(t *testing.T) { + tests := []struct { + name string + commands []dw.Command + expectedScript string + expectedErr string + }{ + { + name: "No commands", + commands: []dw.Command{}, + expectedScript: "", + expectedErr: "", + }, + { + name: "Single command without workingDir", + commands: []dw.Command{ + { + Id: "cmd1", + CommandUnion: dw.CommandUnion{ + Exec: &dw.ExecCommand{ + CommandLine: "echo hello", + Component: "tools", + }, + }, + }, + }, + expectedScript: "echo hello", + expectedErr: "", + }, + { + name: "Single command with workingDir", + commands: []dw.Command{ + { + Id: "cmd1", + CommandUnion: dw.CommandUnion{ + Exec: &dw.ExecCommand{ + CommandLine: "ls -la", + WorkingDir: "/projects/app", + Component: 
"tools", + }, + }, + }, + }, + expectedScript: "cd \"/projects/app\" && ls -la", + expectedErr: "", + }, + { + name: "Single command with only workingDir", + commands: []dw.Command{ + { + Id: "cmd1", + CommandUnion: dw.CommandUnion{ + Exec: &dw.ExecCommand{ + WorkingDir: "/data", + Component: "tools", + }, + }, + }, + }, + expectedScript: "cd \"/data\"", + expectedErr: "", + }, + { + name: "Single command with workingDir containing single quote", + commands: []dw.Command{ + { + Id: "cmd1", + CommandUnion: dw.CommandUnion{ + Exec: &dw.ExecCommand{ + CommandLine: "cat file.txt", + WorkingDir: "/projects/app's", + Component: "tools", + }, + }, + }, + }, + expectedScript: "cd \"/projects/app's\" && cat file.txt", + expectedErr: "", + }, + { + name: "Multiple commands", + commands: []dw.Command{ + { + Id: "cmd1", + CommandUnion: dw.CommandUnion{ + Exec: &dw.ExecCommand{ + CommandLine: "npm install", + WorkingDir: "/projects/frontend", + Component: "tools", + }, + }, + }, + { + Id: "cmd2", + CommandUnion: dw.CommandUnion{ + Exec: &dw.ExecCommand{ + CommandLine: "npm start", + Component: "tools", + }, + }, + }, + { + Id: "cmd3", + CommandUnion: dw.CommandUnion{ + Exec: &dw.ExecCommand{ + WorkingDir: "/projects/backend", + CommandLine: "mvn spring-boot:run", + Component: "tools", + }, + }, + }, + }, + expectedScript: "cd \"/projects/frontend\" && npm install\nnpm start\ncd \"/projects/backend\" && mvn spring-boot:run", + expectedErr: "", + }, + { + name: "Command with Env vars", + commands: []dw.Command{ + { + Id: "cmd-with-env", + CommandUnion: dw.CommandUnion{ + Exec: &dw.ExecCommand{ + CommandLine: "echo $MY_VAR", + Component: "tools", + Env: []dw.EnvVar{ + {Name: "MY_VAR", Value: "test"}, + }, + }, + }, + }, + }, + expectedScript: "export MY_VAR=\"test\" && echo $MY_VAR", + expectedErr: "", + }, + { + name: "Command with nil Exec field", + commands: []dw.Command{ + { + Id: "cmd-nil-exec", + CommandUnion: dw.CommandUnion{Exec: nil}, + }, + }, + expectedScript: "", + 
expectedErr: "exec command is nil for command ID cmd-nil-exec", + }, + { + name: "Command with empty CommandLine and no WorkingDir", + commands: []dw.Command{ + { + Id: "cmd-empty", + CommandUnion: dw.CommandUnion{ + Exec: &dw.ExecCommand{ + CommandLine: "", + WorkingDir: "", + Component: "tools", + }, + }, + }, + { + Id: "cmd-after-empty", + CommandUnion: dw.CommandUnion{ + Exec: &dw.ExecCommand{ + CommandLine: "echo 'still works'", + Component: "tools", + }, + }, + }, + }, + expectedScript: "echo 'still works'", // The empty command should result in no line + expectedErr: "", + }, + { + name: "Command with only CommandLine (empty working dir)", + commands: []dw.Command{ + { + Id: "cmd-empty-wdir", + CommandUnion: dw.CommandUnion{ + Exec: &dw.ExecCommand{ + CommandLine: "pwd", + WorkingDir: "", + Component: "tools", + }, + }, + }, + }, + expectedScript: "pwd", + expectedErr: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + script, err := buildUserScript(tt.commands) + + if tt.expectedErr != "" { + assert.Error(t, err) + assert.Contains(t, err.Error(), tt.expectedErr) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.expectedScript, script) + } + }) + } +} + +func TestGenerateScriptWithTimeout(t *testing.T) { + tests := []struct { + name string + escapedUserScript string + timeout string + expectedScript string + }{ + { + name: "Basic script with timeout", + escapedUserScript: "echo 'hello world'\nsleep 1", + timeout: "10s", + expectedScript: ` +export POSTSTART_TIMEOUT_DURATION="10" +export POSTSTART_KILL_AFTER_DURATION="5" + +_TIMEOUT_COMMAND_PART="" +_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean + +if command -v timeout >/dev/null 2>&1; then + echo "[postStart hook] Executing commands with timeout: ${POSTSTART_TIMEOUT_DURATION} seconds, kill after: ${POSTSTART_KILL_AFTER_DURATION} seconds" >&2 + _TIMEOUT_COMMAND_PART="timeout --preserve-status --kill-after=${POSTSTART_KILL_AFTER_DURATION} 
${POSTSTART_TIMEOUT_DURATION}" + _WAS_TIMEOUT_USED="true" +else + echo "[postStart hook] WARNING: 'timeout' utility not found. Executing commands without timeout." >&2 +fi + +# Execute the user's script +${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'hello world' +sleep 1' +exit_code=$? + +# Check the exit code based on whether timeout was attempted +if [ "$_WAS_TIMEOUT_USED" = "true" ]; then + if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) + echo "[postStart hook] Commands terminated by SIGTERM (likely timed out after ${POSTSTART_TIMEOUT_DURATION}s). Exit code 143." >&2 + elif [ $exit_code -eq 137 ]; then # 128 + 9 (SIGKILL) + echo "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ${POSTSTART_KILL_AFTER_DURATION}s expired). Exit code 137." >&2 + elif [ $exit_code -ne 0 ]; then # Catches any other non-zero exit code + echo "[postStart hook] Commands failed with exit code $exit_code." >&2 + else + echo "[postStart hook] Commands completed successfully within the time limit." >&2 + fi +else + if [ $exit_code -ne 0 ]; then + echo "[postStart hook] Commands failed with exit code $exit_code (no timeout)." >&2 + else + echo "[postStart hook] Commands completed successfully (no timeout)." 
>&2 + fi +fi + +exit $exit_code +`, + }, + { + name: "Script with zero timeout (no timeout)", + escapedUserScript: "echo 'running indefinitely...'", + timeout: "0s", + expectedScript: ` +export POSTSTART_TIMEOUT_DURATION="0" +export POSTSTART_KILL_AFTER_DURATION="5" + +_TIMEOUT_COMMAND_PART="" +_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean + +if command -v timeout >/dev/null 2>&1; then + echo "[postStart hook] Executing commands with timeout: ${POSTSTART_TIMEOUT_DURATION} seconds, kill after: ${POSTSTART_KILL_AFTER_DURATION} seconds" >&2 + _TIMEOUT_COMMAND_PART="timeout --preserve-status --kill-after=${POSTSTART_KILL_AFTER_DURATION} ${POSTSTART_TIMEOUT_DURATION}" + _WAS_TIMEOUT_USED="true" +else + echo "[postStart hook] WARNING: 'timeout' utility not found. Executing commands without timeout." >&2 +fi + +# Execute the user's script +${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'running indefinitely...'' +exit_code=$? + +# Check the exit code based on whether timeout was attempted +if [ "$_WAS_TIMEOUT_USED" = "true" ]; then + if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) + echo "[postStart hook] Commands terminated by SIGTERM (likely timed out after ${POSTSTART_TIMEOUT_DURATION}s). Exit code 143." >&2 + elif [ $exit_code -eq 137 ]; then # 128 + 9 (SIGKILL) + echo "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ${POSTSTART_KILL_AFTER_DURATION}s expired). Exit code 137." >&2 + elif [ $exit_code -ne 0 ]; then # Catches any other non-zero exit code + echo "[postStart hook] Commands failed with exit code $exit_code." >&2 + else + echo "[postStart hook] Commands completed successfully within the time limit." >&2 + fi +else + if [ $exit_code -ne 0 ]; then + echo "[postStart hook] Commands failed with exit code $exit_code (no timeout)." >&2 + else + echo "[postStart hook] Commands completed successfully (no timeout)." 
>&2 + fi +fi + +exit $exit_code +`, + }, + { + name: "Empty user script", + escapedUserScript: "", + timeout: "5s", + expectedScript: ` +export POSTSTART_TIMEOUT_DURATION="5" +export POSTSTART_KILL_AFTER_DURATION="5" + +_TIMEOUT_COMMAND_PART="" +_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean + +if command -v timeout >/dev/null 2>&1; then + echo "[postStart hook] Executing commands with timeout: ${POSTSTART_TIMEOUT_DURATION} seconds, kill after: ${POSTSTART_KILL_AFTER_DURATION} seconds" >&2 + _TIMEOUT_COMMAND_PART="timeout --preserve-status --kill-after=${POSTSTART_KILL_AFTER_DURATION} ${POSTSTART_TIMEOUT_DURATION}" + _WAS_TIMEOUT_USED="true" +else + echo "[postStart hook] WARNING: 'timeout' utility not found. Executing commands without timeout." >&2 +fi + +# Execute the user's script +${_TIMEOUT_COMMAND_PART} /bin/sh -c '' +exit_code=$? + +# Check the exit code based on whether timeout was attempted +if [ "$_WAS_TIMEOUT_USED" = "true" ]; then + if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) + echo "[postStart hook] Commands terminated by SIGTERM (likely timed out after ${POSTSTART_TIMEOUT_DURATION}s). Exit code 143." >&2 + elif [ $exit_code -eq 137 ]; then # 128 + 9 (SIGKILL) + echo "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ${POSTSTART_KILL_AFTER_DURATION}s expired). Exit code 137." >&2 + elif [ $exit_code -ne 0 ]; then # Catches any other non-zero exit code + echo "[postStart hook] Commands failed with exit code $exit_code." >&2 + else + echo "[postStart hook] Commands completed successfully within the time limit." >&2 + fi +else + if [ $exit_code -ne 0 ]; then + echo "[postStart hook] Commands failed with exit code $exit_code (no timeout)." >&2 + else + echo "[postStart hook] Commands completed successfully (no timeout)." 
>&2 + fi +fi + +exit $exit_code +`, + }, + { + name: "User script with already escaped single quotes", + escapedUserScript: "echo 'it'\\''s complex'", + timeout: "30s", + expectedScript: ` +export POSTSTART_TIMEOUT_DURATION="30" +export POSTSTART_KILL_AFTER_DURATION="5" + +_TIMEOUT_COMMAND_PART="" +_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean + +if command -v timeout >/dev/null 2>&1; then + echo "[postStart hook] Executing commands with timeout: ${POSTSTART_TIMEOUT_DURATION} seconds, kill after: ${POSTSTART_KILL_AFTER_DURATION} seconds" >&2 + _TIMEOUT_COMMAND_PART="timeout --preserve-status --kill-after=${POSTSTART_KILL_AFTER_DURATION} ${POSTSTART_TIMEOUT_DURATION}" + _WAS_TIMEOUT_USED="true" +else + echo "[postStart hook] WARNING: 'timeout' utility not found. Executing commands without timeout." >&2 +fi + +# Execute the user's script +${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'it'\''s complex'' +exit_code=$? + +# Check the exit code based on whether timeout was attempted +if [ "$_WAS_TIMEOUT_USED" = "true" ]; then + if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) + echo "[postStart hook] Commands terminated by SIGTERM (likely timed out after ${POSTSTART_TIMEOUT_DURATION}s). Exit code 143." >&2 + elif [ $exit_code -eq 137 ]; then # 128 + 9 (SIGKILL) + echo "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ${POSTSTART_KILL_AFTER_DURATION}s expired). Exit code 137." >&2 + elif [ $exit_code -ne 0 ]; then # Catches any other non-zero exit code + echo "[postStart hook] Commands failed with exit code $exit_code." >&2 + else + echo "[postStart hook] Commands completed successfully within the time limit." >&2 + fi +else + if [ $exit_code -ne 0 ]; then + echo "[postStart hook] Commands failed with exit code $exit_code (no timeout)." >&2 + else + echo "[postStart hook] Commands completed successfully (no timeout)." 
>&2 + fi +fi + +exit $exit_code +`, + }, + { + name: "User script with minute timeout", + escapedUserScript: "echo 'wait for it...'", + timeout: "2m", + expectedScript: ` +export POSTSTART_TIMEOUT_DURATION="120" +export POSTSTART_KILL_AFTER_DURATION="5" + +_TIMEOUT_COMMAND_PART="" +_WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean + +if command -v timeout >/dev/null 2>&1; then + echo "[postStart hook] Executing commands with timeout: ${POSTSTART_TIMEOUT_DURATION} seconds, kill after: ${POSTSTART_KILL_AFTER_DURATION} seconds" >&2 + _TIMEOUT_COMMAND_PART="timeout --preserve-status --kill-after=${POSTSTART_KILL_AFTER_DURATION} ${POSTSTART_TIMEOUT_DURATION}" + _WAS_TIMEOUT_USED="true" +else + echo "[postStart hook] WARNING: 'timeout' utility not found. Executing commands without timeout." >&2 +fi + +# Execute the user's script +${_TIMEOUT_COMMAND_PART} /bin/sh -c 'echo 'wait for it...'' +exit_code=$? + +# Check the exit code based on whether timeout was attempted +if [ "$_WAS_TIMEOUT_USED" = "true" ]; then + if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) + echo "[postStart hook] Commands terminated by SIGTERM (likely timed out after ${POSTSTART_TIMEOUT_DURATION}s). Exit code 143." >&2 + elif [ $exit_code -eq 137 ]; then # 128 + 9 (SIGKILL) + echo "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ${POSTSTART_KILL_AFTER_DURATION}s expired). Exit code 137." >&2 + elif [ $exit_code -ne 0 ]; then # Catches any other non-zero exit code + echo "[postStart hook] Commands failed with exit code $exit_code." >&2 + else + echo "[postStart hook] Commands completed successfully within the time limit." >&2 + fi +else + if [ $exit_code -ne 0 ]; then + echo "[postStart hook] Commands failed with exit code $exit_code (no timeout)." >&2 + else + echo "[postStart hook] Commands completed successfully (no timeout)." 
>&2 + fi +fi + +exit $exit_code +`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + script := generateScriptWithTimeout(tt.escapedUserScript, tt.timeout) + assert.Equal(t, tt.expectedScript, script) + }) + } +} diff --git a/pkg/library/lifecycle/testdata/postStart/adds_all_postStart_commands.yaml b/pkg/library/lifecycle/testdata/postStart/adds_all_postStart_commands.yaml index 7da46f12e..e2a9bf371 100644 --- a/pkg/library/lifecycle/testdata/postStart/adds_all_postStart_commands.yaml +++ b/pkg/library/lifecycle/testdata/postStart/adds_all_postStart_commands.yaml @@ -33,8 +33,8 @@ output: postStart: exec: command: - - "/bin/sh" - - "-c" + - /bin/sh + - -c - | { echo 'hello world 1' @@ -45,13 +45,12 @@ output: postStart: exec: command: - - "/bin/sh" - - "-c" + - /bin/sh + - -c - | { cd /tmp/test-dir echo 'hello world 2' } 1>/tmp/poststart-stdout.txt 2>/tmp/poststart-stderr.txt - - name: test-component-3 image: test-img diff --git a/pkg/library/lifecycle/testdata/postStart/basic_postStart.yaml b/pkg/library/lifecycle/testdata/postStart/basic_postStart.yaml index 30ced94b2..acbf1ae36 100644 --- a/pkg/library/lifecycle/testdata/postStart/basic_postStart.yaml +++ b/pkg/library/lifecycle/testdata/postStart/basic_postStart.yaml @@ -22,8 +22,8 @@ output: postStart: exec: command: - - "/bin/sh" - - "-c" + - /bin/sh + - -c - | { echo 'hello world' diff --git a/pkg/library/lifecycle/testdata/postStart/multiple_poststart_commands.yaml b/pkg/library/lifecycle/testdata/postStart/multiple_poststart_commands.yaml index 01cfeb55e..db14a94bf 100644 --- a/pkg/library/lifecycle/testdata/postStart/multiple_poststart_commands.yaml +++ b/pkg/library/lifecycle/testdata/postStart/multiple_poststart_commands.yaml @@ -27,8 +27,8 @@ output: postStart: exec: command: - - "/bin/sh" - - "-c" + - /bin/sh + - -c - | { echo 'hello world 1' diff --git a/pkg/library/lifecycle/testdata/postStart/workingDir_postStart.yaml 
var (
	// reTerminatedSigterm matches the line the postStart wrapper script prints when the
	// user commands exit with 143:
	// "[postStart hook] Commands terminated by SIGTERM (likely timed out after ...s). Exit code 143."
	reTerminatedSigterm = regexp.MustCompile(`\[postStart hook\] Commands terminated by SIGTERM \(likely timed out after \d+[^\)]+?\)\. Exit code 143\.`)

	// reKilledSigkill matches the line the wrapper script prints when the user commands
	// exit with 137:
	// "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ...s expired). Exit code 137."
	reKilledSigkill = regexp.MustCompile(`\[postStart hook\] Commands forcefully killed by SIGKILL \(likely after --kill-after \d+[^\)]+?\)\. Exit code 137\.`)

	// reGenericFailedExitCode matches the line the wrapper script prints for any other
	// non-zero exit code. The script emits two variants:
	//   "[postStart hook] Commands failed with exit code N."
	//   "[postStart hook] Commands failed with exit code N (no timeout)."
	// The second one is printed when the 'timeout' utility is not available in the
	// container, so the " (no timeout)" suffix is accepted as optional.
	reGenericFailedExitCode = regexp.MustCompile(`\[postStart hook\] Commands failed with exit code \d+(?: \(no timeout\))?\.`)

	// reKubeletInternalMessage captures the content of an explicit `message: "..."`
	// field inside the Kubelet message, if there is one.
	reKubeletInternalMessage = regexp.MustCompile(`message:\s*"([^"]*)"`)

	// reKubeletExitCode captures the exit code from an "exited with N:" fragment of the
	// Kubelet message.
	reKubeletExitCode = regexp.MustCompile(`exited with (\d+):`)

	// postStartScriptPatterns lists the wrapper-script output patterns in the order in
	// which they are tried: most specific first.
	postStartScriptPatterns = []*regexp.Regexp{
		reTerminatedSigterm,
		reKilledSigkill,
		reGenericFailedExitCode,
	}
)

// matchPostStartScriptOutput returns the first wrapper-script status line found in text,
// trying the SIGTERM, SIGKILL and generic-failure patterns in that order. It returns ""
// when none of them matches.
func matchPostStartScriptOutput(text string) string {
	for _, pattern := range postStartScriptPatterns {
		if match := pattern.FindString(text); match != "" {
			return match
		}
	}
	return ""
}

// getConcisePostStartFailureMessage tries to turn the Kubelet's verbose message for a
// failed postStart hook into a short, user-friendly one.
//
// The lookup proceeds in four steps and returns at the first hit:
//  1. a wrapper-script status line inside the Kubelet's `message: "..."` field;
//  2. the exit code from an "exited with N:" fragment (143, 137, or any other non-zero code);
//  3. a wrapper-script status line anywhere in the whole Kubelet message;
//  4. a fixed fallback text.
func getConcisePostStartFailureMessage(kubeletMsg string) string {
	// 1: check Kubelet's explicit 'message: "..."' field for the specific output.
	if field := reKubeletInternalMessage.FindStringSubmatch(kubeletMsg); len(field) > 1 && field[1] != "" {
		if match := matchPostStartScriptOutput(field[1]); match != "" {
			return match
		}
	}

	// 2: parse Kubelet's reported exit code for the entire hook command.
	if reported := reKubeletExitCode.FindStringSubmatch(kubeletMsg); len(reported) > 1 {
		exitCodeStr := reported[1]
		var exitCode int
		// A code that cannot be parsed (e.g. an absurdly long digit run that overflows int)
		// is treated as "no usable exit code" and the lookup continues with step 3.
		if _, err := fmt.Sscanf(exitCodeStr, "%d", &exitCode); err == nil {
			switch {
			case exitCode == 143: // 128 + 15 (SIGTERM)
				return "[postStart hook] Commands terminated by SIGTERM due to timeout"
			case exitCode == 137: // 128 + 9 (SIGKILL)
				return "[postStart hook] Commands forcefully killed by SIGKILL due to timeout"
			case exitCode != 0: // other non-zero exit codes (e.g. 124, 127)
				return fmt.Sprintf("[postStart hook] Commands failed (Kubelet reported exit code %s)", exitCodeStr)
			}
		}
	}

	// 3: try to match specific script outputs against the *entire* Kubelet message.
	if match := matchPostStartScriptOutput(kubeletMsg); match != "" {
		return match
	}

	// 4: fallback.
	return "[postStart hook] failed with an unknown error (see pod events or container logs for more details)"
}
*corev1.ContainerStatus, ignoredEvents []string) (ok bool, reason string) { if containerStatus.State.Waiting != nil { + // Explicitly check for PostStartHookError + if containerStatus.State.Waiting.Reason == "PostStartHookError" { // Kubelet uses this reason + conciseMsg := getConcisePostStartFailureMessage(containerStatus.State.Waiting.Message) + return checkIfUnrecoverableEventIgnored("FailedPostStartHook", ignoredEvents), conciseMsg + } + // Check against other generic failure reasons for _, failureReason := range containerFailureStateReasons { if containerStatus.State.Waiting.Reason == failureReason { - return checkIfUnrecoverableEventIgnored(containerStatus.State.Waiting.Reason, ignoredEvents), containerStatus.State.Waiting.Reason + return checkIfUnrecoverableEventIgnored(containerStatus.State.Waiting.Reason, ignoredEvents), + containerStatus.State.Waiting.Reason } } } if containerStatus.State.Terminated != nil { + // Check if termination was due to a generic error, which might include postStart issues + // if the container failed to run. 
+ if containerStatus.State.Terminated.Reason == "Error" || containerStatus.State.Terminated.Reason == "ContainerCannotRun" { + return checkIfUnrecoverableEventIgnored(containerStatus.State.Terminated.Reason, ignoredEvents), + fmt.Sprintf("%s: %s", containerStatus.State.Terminated.Reason, containerStatus.State.Terminated.Message) + } + // Check against other generic failure reasons for terminated state for _, failureReason := range containerFailureStateReasons { if containerStatus.State.Terminated.Reason == failureReason { - return checkIfUnrecoverableEventIgnored(containerStatus.State.Terminated.Reason, ignoredEvents), containerStatus.State.Terminated.Reason + return checkIfUnrecoverableEventIgnored(containerStatus.State.Terminated.Reason, ignoredEvents), + containerStatus.State.Terminated.Reason } } } + return true, "" } diff --git a/pkg/library/status/check_test.go b/pkg/library/status/check_test.go new file mode 100644 index 000000000..b9eabae03 --- /dev/null +++ b/pkg/library/status/check_test.go @@ -0,0 +1,120 @@ +// Copyright (c) 2019-2025 Red Hat, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package status + +import ( + "testing" +) + +func TestGetConcisePostStartFailureMessage(t *testing.T) { + tests := []struct { + name string + kubeletMsg string + expectedMsg string + }{ + { + name: "Kubelet internal message - SIGTERM", + kubeletMsg: `PostStartHookError: rpc error: code = Unknown desc = command error: command terminated by SIGTERM, message: "[postStart hook] Commands terminated by SIGTERM (likely timed out after 30s). Exit code 143."`, + expectedMsg: "[postStart hook] Commands terminated by SIGTERM (likely timed out after 30s). Exit code 143.", + }, + { + name: "Kubelet internal message - SIGKILL", + kubeletMsg: `PostStartHookError: rpc error: code = Unknown desc = command error: command terminated by SIGKILL, message: "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after 10s expired). Exit code 137."`, + expectedMsg: "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after 10s expired). Exit code 137.", + }, + { + name: "Kubelet internal message - Generic Fail", + kubeletMsg: `PostStartHookError: rpc error: code = Unknown desc = command error: command failed, message: "[postStart hook] Commands failed with exit code 1."`, + expectedMsg: "[postStart hook] Commands failed with exit code 1.", + }, + { + name: "Kubelet internal message - No match, fall through to Kubelet exit code", + kubeletMsg: `PostStartHookError: rpc error: code = Unknown desc = command error: command terminated by signal: SIGTERM, message: "Container command \\\'sleep 60\\\' was terminated by signal SIGTERM"\nexited with 143: ...`, + expectedMsg: "[postStart hook] Commands terminated by SIGTERM due to timeout", + }, + { + name: "Kubelet reported exit code - 143 (SIGTERM)", + kubeletMsg: `PostStartHookError: command 'sh -c ...' 
exited with 143: ...`, + expectedMsg: "[postStart hook] Commands terminated by SIGTERM due to timeout", + }, + { + name: "Kubelet exit code - 137 (SIGKILL)", + kubeletMsg: `PostStartHookError: command 'sh -c ...' exited with 137: ...`, + expectedMsg: "[postStart hook] Commands forcefully killed by SIGKILL due to timeout", + }, + { + name: "Kubelet exit code - 1 (Generic)", + kubeletMsg: `PostStartHookError: command 'sh -c ...' exited with 1: ...`, + expectedMsg: "[postStart hook] Commands failed (Kubelet reported exit code 1)", + }, + { + name: "Kubelet exit code - 124 (e.g. timeout command itself)", + kubeletMsg: `PostStartHookError: command 'sh -c ...' exited with 124: ...`, + expectedMsg: "[postStart hook] Commands failed (Kubelet reported exit code 124)", + }, + { + name: "Full Kubelet message match - SIGTERM (no internal message field, no Kubelet exit code first part)", + kubeletMsg: `PostStartHookError: Error executing postStart hook: [postStart hook] Commands terminated by SIGTERM (likely timed out after 45s). Exit code 143.`, + expectedMsg: "[postStart hook] Commands terminated by SIGTERM (likely timed out after 45s). Exit code 143.", + }, + { + name: "Full Kubelet message match - SIGKILL (no internal message field, no Kubelet exit code first part)", + kubeletMsg: `PostStartHookError: Error executing postStart hook: [postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after 5s expired). Exit code 137.`, + expectedMsg: "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after 5s expired). 
Exit code 137.", + }, + { + name: "Full Kubelet message match - Generic Fail (no internal message field, no Kubelet exit code first part)", + kubeletMsg: `PostStartHookError: Error executing postStart hook: [postStart hook] Commands failed with exit code 2.`, + expectedMsg: "[postStart hook] Commands failed with exit code 2.", + }, + { + name: "Kubelet internal message with escaped quotes and script output", + kubeletMsg: `PostStartHookError: rpc error: code = Unknown desc = failed to exec in container: command /bin/sh -c export POSTSTART_TIMEOUT_DURATION="30s"; export POSTSTART_KILL_AFTER_DURATION="10s"; echo "[postStart hook] Executing user commands with timeout ${POSTSTART_TIMEOUT_DURATION}, kill after ${POSTSTART_KILL_AFTER_DURATION}..."; _script_to_run() { set -e\\necho \\\'hello\\\' >&2\\nexit 1\\n }; timeout --preserve-status --kill-after="${POSTSTART_KILL_AFTER_DURATION}" "${POSTSTART_TIMEOUT_DURATION}" /bin/sh -c "_script_to_run" 1> >(tee -a "/tmp/poststart-stdout.txt") 2> >(tee -a "/tmp/poststart-stderr.txt" >&2); exit_code=$?; if [ $exit_code -eq 143 ]; then echo "[postStart hook] Commands terminated by SIGTERM (likely timed out after ${POSTSTART_TIMEOUT_DURATION}). Exit code 143." >&2; elif [ $exit_code -eq 137 ]; then echo "[postStart hook] Commands forcefully killed by SIGKILL (likely after --kill-after ${POSTSTART_KILL_AFTER_DURATION} expired). Exit code 137." >&2; elif [ $exit_code -ne 0 ]; then echo "[postStart hook] Commands failed with exit code ${exit_code}." 
>&2; fi; exit $exit_code: exit status 1, message: "[postStart hook] Commands failed with exit code 1."`, + expectedMsg: "[postStart hook] Commands failed with exit code 1.", + }, + { + name: "Fallback - Unrecognized Kubelet message", + kubeletMsg: "PostStartHookError: An unexpected error occurred.", + expectedMsg: "[postStart hook] failed with an unknown error (see pod events or container logs for more details)", + }, + { + name: "Fallback - Empty Kubelet message", + kubeletMsg: "", + expectedMsg: "[postStart hook] failed with an unknown error (see pod events or container logs for more details)", + }, + { + name: "Kubelet internal message - SIGTERM - with leading/trailing spaces in message", + kubeletMsg: `PostStartHookError: rpc error: code = Unknown desc = command error: command terminated by SIGTERM, message: " [postStart hook] Commands terminated by SIGTERM (likely timed out after 30s). Exit code 143. "`, + expectedMsg: "[postStart hook] Commands terminated by SIGTERM (likely timed out after 30s). Exit code 143.", + }, + { + name: "Kubelet exit code - 143 - with surrounding text", + kubeletMsg: `FailedPostStartHook: container "theia-ide" postStart hook failed: command 'sh -c mycommand' exited with 143:`, + expectedMsg: "[postStart hook] Commands terminated by SIGTERM due to timeout", + }, + { + name: "Fallback - Kubelet message with exit code 0 but error text", + kubeletMsg: `PostStartHookError: command "sh -c echo hello && exit 0" exited with 0: "unexpected error"`, + expectedMsg: "[postStart hook] failed with an unknown error (see pod events or container logs for more details)", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := getConcisePostStartFailureMessage(tt.kubeletMsg); got != tt.expectedMsg { + t.Errorf("getConcisePostStartFailureMessage() = %v, want %v", got, tt.expectedMsg) + } + }) + } +}