Merge pull request #26456 from Honny1/dev/jrodak/healthcheck-timeout-termination-v5.4-rhel

[v5.4-rhel] Fix: Ensure HealthCheck exec session terminates on timeout
Authored by openshift-merge-bot[bot], committed by GitHub on 2025-06-23 14:33:35 +00:00
8 changed files with 170 additions and 39 deletions

View File

@@ -623,7 +623,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
     createFlags.StringVar(
         &cf.HealthTimeout,
         healthTimeoutFlagName, define.DefaultHealthCheckTimeout,
-        "the maximum time allowed to complete the healthcheck before an interval is considered failed",
+        "the maximum time allowed to complete the healthcheck before an interval is considered failed and SIGKILL is sent to the healthcheck process",
     )
     _ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone)

View File

@@ -6,3 +6,6 @@
 The maximum time allowed to complete the healthcheck before an interval is considered failed. Like start-period, the
 value can be expressed in a time format such as **1m22s**. The default value is **30s**.
+
+Note: A timeout marks the healthcheck as failed. If the healthcheck command itself runs longer than the specified *timeout*,
+it will be sent a `SIGKILL` signal.
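
For example, a check command that cannot finish within the window is killed and the interval is recorded as failed. A hypothetical invocation (the image name and check command are placeholders):

$ podman run -d --name web \
    --health-cmd "curl -f http://localhost/ || exit 1" \
    --health-timeout 10s \
    quay.io/example/web:latest

Any run of the check still executing after 10 seconds receives SIGKILL and counts as a failed interval.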

View File

@@ -158,34 +158,20 @@ type legacyExecSession struct {
     PID int `json:"pid"`
 }
 
-// ExecCreate creates a new exec session for the container.
-// The session is not started. The ID of the new exec session will be returned.
-func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
-    if !c.batched {
-        c.lock.Lock()
-        defer c.lock.Unlock()
-
-        if err := c.syncContainer(); err != nil {
-            return "", err
-        }
-    }
-
-    // Verify our config
+func (c *Container) verifyExecConfig(config *ExecConfig) error {
     if config == nil {
-        return "", fmt.Errorf("must provide a configuration to ExecCreate: %w", define.ErrInvalidArg)
+        return fmt.Errorf("must provide a configuration to ExecCreate: %w", define.ErrInvalidArg)
     }
     if len(config.Command) == 0 {
-        return "", fmt.Errorf("must provide a non-empty command to start an exec session: %w", define.ErrInvalidArg)
+        return fmt.Errorf("must provide a non-empty command to start an exec session: %w", define.ErrInvalidArg)
     }
     if config.ExitCommandDelay > 0 && len(config.ExitCommand) == 0 {
-        return "", fmt.Errorf("must provide a non-empty exit command if giving an exit command delay: %w", define.ErrInvalidArg)
-    }
-
-    // Verify that we are in a good state to continue
-    if !c.ensureState(define.ContainerStateRunning) {
-        return "", fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid)
+        return fmt.Errorf("must provide a non-empty exit command if giving an exit command delay: %w", define.ErrInvalidArg)
     }
+    return nil
+}
 
+func (c *Container) getUniqueExecSessionID() string {
     // Generate an ID for our new exec session
     sessionID := stringid.GenerateRandomID()
     found := true
@@ -202,20 +188,52 @@ func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
             sessionID = stringid.GenerateRandomID()
         }
     }
+    return sessionID
+}
 
 // Make our new exec session
+func (c *Container) createExecSession(config *ExecConfig) (*ExecSession, error) {
     session := new(ExecSession)
-    session.Id = sessionID
+    session.Id = c.getUniqueExecSessionID()
     session.ContainerId = c.ID()
     session.State = define.ExecStateCreated
     session.Config = new(ExecConfig)
     if err := JSONDeepCopy(config, session.Config); err != nil {
-        return "", fmt.Errorf("copying exec configuration into exec session: %w", err)
+        return nil, fmt.Errorf("copying exec configuration into exec session: %w", err)
     }
     if len(session.Config.ExitCommand) > 0 {
         session.Config.ExitCommand = append(session.Config.ExitCommand, []string{session.ID(), c.ID()}...)
     }
+    return session, nil
+}
+
+// ExecCreate creates a new exec session for the container.
+// The session is not started. The ID of the new exec session will be returned.
+func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
+    if !c.batched {
+        c.lock.Lock()
+        defer c.lock.Unlock()
+
+        if err := c.syncContainer(); err != nil {
+            return "", err
+        }
+    }
+
+    // Verify our config
+    if err := c.verifyExecConfig(config); err != nil {
+        return "", err
+    }
+
+    // Verify that we are in a good state to continue
+    if !c.ensureState(define.ContainerStateRunning) {
+        return "", fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid)
+    }
+
+    // Make our new exec session
+    session, err := c.createExecSession(config)
+    if err != nil {
+        return "", err
+    }
 
     if c.state.ExecSessions == nil {
         c.state.ExecSessions = make(map[string]*ExecSession)

@@ -232,7 +250,7 @@ func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
     logrus.Infof("Created exec session %s in container %s", session.ID(), c.ID())
 
-    return sessionID, nil
+    return session.Id, nil
 }
 
 // ExecStart starts an exec session in the container, but does not attach to it.
@@ -775,6 +793,76 @@ func (c *Container) ExecResize(sessionID string, newSize resize.TerminalSize) er
     return c.ociRuntime.ExecAttachResize(c, sessionID, newSize)
 }
 
+func (c *Container) healthCheckExec(config *ExecConfig, timeout time.Duration, streams *define.AttachStreams) (int, error) {
+    unlock := true
+    if !c.batched {
+        c.lock.Lock()
+        defer func() {
+            if unlock {
+                c.lock.Unlock()
+            }
+        }()
+
+        if err := c.syncContainer(); err != nil {
+            return -1, err
+        }
+    }
+
+    if err := c.verifyExecConfig(config); err != nil {
+        return -1, err
+    }
+
+    if !c.ensureState(define.ContainerStateRunning) {
+        return -1, fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid)
+    }
+
+    session, err := c.createExecSession(config)
+    if err != nil {
+        return -1, err
+    }
+
+    if c.state.ExecSessions == nil {
+        c.state.ExecSessions = make(map[string]*ExecSession)
+    }
+    c.state.ExecSessions[session.ID()] = session
+    defer delete(c.state.ExecSessions, session.ID())
+
+    opts, err := prepareForExec(c, session)
+    if err != nil {
+        return -1, err
+    }
+    defer func() {
+        if err := c.cleanupExecBundle(session.ID()); err != nil {
+            logrus.Errorf("Container %s light exec session cleanup error: %v", c.ID(), err)
+        }
+    }()
+
+    pid, attachErrChan, err := c.ociRuntime.ExecContainer(c, session.ID(), opts, streams, nil)
+    if err != nil {
+        return -1, err
+    }
+    session.PID = pid
+
+    if !c.batched {
+        c.lock.Unlock()
+        unlock = false
+    }
+
+    select {
+    case err = <-attachErrChan:
+        if err != nil {
+            return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
+        }
+    case <-time.After(timeout):
+        if err := c.ociRuntime.ExecStopContainer(c, session.ID(), 0); err != nil {
+            return -1, err
+        }
+        return -1, fmt.Errorf("%w of %s", define.ErrHealthCheckTimeout, c.HealthCheckConfig().Timeout.String())
+    }
+
+    return c.readExecExitCode(session.ID())
+}
+
 func (c *Container) Exec(config *ExecConfig, streams *define.AttachStreams, resize <-chan resize.TerminalSize) (int, error) {
     return c.exec(config, streams, resize, false)
 }
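
The heart of the fix is the select above: the wait on the exec session is raced against a timer, and on timeout the session is stopped before the sentinel error is returned. A distilled, self-contained sketch of that pattern (waitWithTimeout, the done channel, and the stop callback are illustrative stand-ins, not podman API):

package main

import (
	"errors"
	"fmt"
	"time"
)

var errTimeout = errors.New("healthcheck command exceeded timeout")

// waitWithTimeout mirrors healthCheckExec's select: return the session's
// result if it finishes in time, otherwise stop it and wrap the sentinel.
func waitWithTimeout(done <-chan error, timeout time.Duration, stop func() error) error {
	select {
	case err := <-done:
		return err
	case <-time.After(timeout):
		if err := stop(); err != nil {
			return err
		}
		return fmt.Errorf("%w of %s", errTimeout, timeout)
	}
}

func main() {
	done := make(chan error, 1)
	go func() {
		time.Sleep(2 * time.Second) // a "healthcheck" that runs too long
		done <- nil
	}()
	err := waitWithTimeout(done, 500*time.Millisecond, func() error { return nil })
	fmt.Println(err) // healthcheck command exceeded timeout of 500ms
}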

View File

@@ -217,4 +217,7 @@ var (
     // ErrRemovingCtrs indicates that there was an error removing all
     // containers from a pod.
     ErrRemovingCtrs = errors.New("removing pod containers")
+
+    // ErrHealthCheckTimeout indicates that a HealthCheck timed out.
+    ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout")
 )
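
Because healthCheckExec wraps this sentinel with %w, callers can classify a timeout with errors.Is, exactly as runHealthCheck does below. A minimal standalone sketch (the local errHealthCheckTimeout stands in for define.ErrHealthCheckTimeout):

package main

import (
	"errors"
	"fmt"
)

// Stand-in for libpod's define.ErrHealthCheckTimeout.
var errHealthCheckTimeout = errors.New("healthcheck command exceeded timeout")

func main() {
	// healthCheckExec returns the sentinel wrapped via %w ...
	hcErr := fmt.Errorf("%w of %s", errHealthCheckTimeout, "3s")

	// ... so errors.Is matches it through the wrapping.
	if errors.Is(hcErr, errHealthCheckTimeout) {
		fmt.Println("timed out:", hcErr)
	}
}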

View File

@@ -93,19 +93,23 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
     streams.AttachInput = true
 
     logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
-    timeStart := time.Now()
     hcResult := define.HealthCheckSuccess
     config := new(ExecConfig)
     config.Command = newCommand
-    exitCode, hcErr := c.exec(config, streams, nil, true)
+    timeStart := time.Now()
+    exitCode, hcErr := c.healthCheckExec(config, c.HealthCheckConfig().Timeout, streams)
+    timeEnd := time.Now()
     if hcErr != nil {
         hcResult = define.HealthCheckFailure
-        if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
+        switch {
+        case errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
             errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) ||
-            errors.Is(hcErr, define.ErrOCIRuntime) {
+            errors.Is(hcErr, define.ErrOCIRuntime):
             returnCode = 1
             hcErr = nil
-        } else {
+        case errors.Is(hcErr, define.ErrHealthCheckTimeout):
+            returnCode = -1
+        default:
             returnCode = 125
         }
     } else if exitCode != 0 {

@@ -124,7 +128,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
         }
     }
 
-    timeEnd := time.Now()
     if c.HealthCheckConfig().StartPeriod > 0 {
         // there is a start-period we need to honor; we add startPeriod to container start time
         startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)

@@ -140,12 +143,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
         eventLog = eventLog[:c.HealthCheckMaxLogSize()]
     }
 
-    if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
-        returnCode = -1
-        hcResult = define.HealthCheckFailure
-        hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
-    }
-
     hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
 
     healthCheckResult, err := c.updateHealthCheckLog(hcl, inStartPeriod, isStartup)

View File

@@ -250,7 +250,7 @@ func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, t
 
     // SIGTERM did not work. On to SIGKILL.
     logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, pid, ctr.ID())
-    if err := unix.Kill(pid, unix.SIGTERM); err != nil {
+    if err := unix.Kill(pid, unix.SIGKILL); err != nil {
         if err == unix.ESRCH {
             return nil
         }

View File

@@ -390,4 +390,13 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE)
         Expect(ps.OutputToStringArray()).To(HaveLen(2))
         Expect(ps.OutputToString()).To(ContainSubstring("hc"))
     })
+
+    It("podman healthcheck - health timeout", func() {
+        ctrName := "c-h-" + RandomString(6)
+        podmanTest.PodmanExitCleanly("run", "-d", "--name", ctrName, "--health-cmd", "top", "--health-timeout=3s", ALPINE, "top")
+
+        hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName})
+        hc.WaitWithTimeout(10)
+        Expect(hc).Should(ExitWithError(125, "Error: healthcheck command exceeded timeout of 3s"))
+    })
 })

View File

@@ -418,4 +418,35 @@ function _check_health_log {
     run_podman rm -t 0 -f $ctrname
 }
 
+@test "podman healthcheck - stop container when healthcheck runs" {
+    ctr="c-h-$(safename)"
+    msg="hc-msg-$(random_string)"
+
+    run_podman run -d --name $ctr \
+        --health-cmd "sleep 20; echo $msg" \
+        $IMAGE /home/podman/pause
+
+    timeout --foreground -v --kill=10 60 \
+        $PODMAN healthcheck run $ctr &
+    hc_pid=$!
+
+    run_podman inspect $ctr --format "{{.State.Status}}"
+    assert "$output" == "running" "Container is running"
+
+    run_podman stop $ctr
+
+    # Wait for background healthcheck to finish and make sure the exit status is 1
+    rc=0
+    wait -n $hc_pid || rc=$?
+    assert $rc -eq 1 "exit status check of healthcheck command"
+
+    run_podman inspect $ctr --format "{{.State.Status}}"
+    assert "$output" == "exited" "Container is stopped"
+
+    run_podman inspect $ctr --format "{{.State.Health.Log}}"
+    assert "$output" !~ "$msg" "Health log message not found"
+
+    run_podman rm -f -t0 $ctr
+}
+
 # vim: filetype=sh