Fix: Ensure HealthCheck exec session terminates on timeout

Previously, the HealthCheck exec session would not terminate on timeout, allowing the healthcheck to run indefinitely.

Fixes: https://issues.redhat.com/browse/RHEL-86096

Signed-off-by: Jan Rodák <hony.com@seznam.cz>
This commit is contained in:
Jan Rodák
2025-05-06 16:42:13 +02:00
parent c0b352f2eb
commit 499ea1168b
6 changed files with 37 additions and 20 deletions

View File

@ -627,7 +627,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
createFlags.StringVar(
&cf.HealthTimeout,
healthTimeoutFlagName, define.DefaultHealthCheckTimeout,
"the maximum time allowed to complete the healthcheck before an interval is considered failed",
"the maximum time allowed to complete the healthcheck before an interval is considered failed and SIGKILL is sent to the healthcheck process",
)
_ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone)

View File

@ -7,8 +7,7 @@
The maximum time allowed to complete the healthcheck before an interval is considered failed. Like start-period, the
value can be expressed in a time format such as **1m22s**. The default value is **30s**.
Note: A timeout marks the healthcheck as failed but does not terminate the running process.
This ensures that a slow but eventually successful healthcheck does not disrupt the container
but is still accounted for in the health status.
Note: A timeout marks the healthcheck as failed. If the healthcheck command itself runs longer than the specified *timeout*,
it will be sent a `SIGKILL` signal.
Note: This parameter will overwrite related healthcheck configuration from the image.

View File

@ -820,7 +820,7 @@ func (c *Container) ExecResize(sessionID string, newSize resize.TerminalSize) er
return c.ociRuntime.ExecAttachResize(c, sessionID, newSize)
}
func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachStreams) (int, error) {
func (c *Container) healthCheckExec(config *ExecConfig, timeout time.Duration, streams *define.AttachStreams) (int, error) {
unlock := true
if !c.batched {
c.lock.Lock()
@ -868,16 +868,25 @@ func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachSt
if err != nil {
return -1, err
}
session.PID = pid
session.PIDData = getPidData(pid)
if !c.batched {
c.lock.Unlock()
unlock = false
}
err = <-attachErrChan
select {
case err = <-attachErrChan:
if err != nil {
return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
}
case <-time.After(timeout):
if err := c.ociRuntime.ExecStopContainer(c, session.ID(), 0); err != nil {
return -1, err
}
return -1, fmt.Errorf("%v of %s", define.ErrHealthCheckTimeout, c.HealthCheckConfig().Timeout.String())
}
return c.readExecExitCode(session.ID())
}

View File

@ -217,4 +217,7 @@ var (
// ErrRemovingCtrs indicates that there was an error removing all
// containers from a pod.
ErrRemovingCtrs = errors.New("removing pod containers")
// ErrHealthCheckTimeout indicates that a HealthCheck timed out.
ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout")
)

View File

@ -93,19 +93,23 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
streams.AttachInput = true
logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
timeStart := time.Now()
hcResult := define.HealthCheckSuccess
config := new(ExecConfig)
config.Command = newCommand
exitCode, hcErr := c.healthCheckExec(config, streams)
timeStart := time.Now()
exitCode, hcErr := c.healthCheckExec(config, c.HealthCheckConfig().Timeout, streams)
timeEnd := time.Now()
if hcErr != nil {
hcResult = define.HealthCheckFailure
if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
switch {
case errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) ||
errors.Is(hcErr, define.ErrOCIRuntime) {
errors.Is(hcErr, define.ErrOCIRuntime):
returnCode = 1
hcErr = nil
} else {
case errors.Is(hcErr, define.ErrHealthCheckTimeout):
returnCode = -1
default:
returnCode = 125
}
} else if exitCode != 0 {
@ -140,7 +144,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
hcResult = define.HealthCheckContainerStopped
}
timeEnd := time.Now()
if c.HealthCheckConfig().StartPeriod > 0 {
// there is a start-period we need to honor; we add startPeriod to container start time
startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
@ -156,12 +159,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
eventLog = eventLog[:c.HealthCheckMaxLogSize()]
}
if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
returnCode = -1
hcResult = define.HealthCheckFailure
hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
}
hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
healthCheckResult, err := c.updateHealthCheckLog(hcl, hcResult, inStartPeriod, isStartup)

View File

@ -404,4 +404,13 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE)
Expect(ps.OutputToStringArray()).To(HaveLen(2))
Expect(ps.OutputToString()).To(ContainSubstring("hc"))
})
It("podman healthcheck - health timeout", func() {
ctrName := "c-h-" + RandomString(6)
podmanTest.PodmanExitCleanly("run", "-d", "--name", ctrName, "--health-cmd", "top", "--health-timeout=3s", ALPINE, "top")
hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName})
hc.WaitWithTimeout(10)
Expect(hc).Should(ExitWithError(125, "Error: healthcheck command exceeded timeout of 3s"))
})
})