Fix: Ensure HealthCheck exec session terminates on timeout

Previously, the HealthCheck exec session would not terminate on timeout, allowing the healthcheck to run indefinitely.

Fixes: https://issues.redhat.com/browse/RHEL-86096

Signed-off-by: Jan Rodák <hony.com@seznam.cz>
This commit is contained in:
Jan Rodák
2025-05-06 16:42:13 +02:00
parent c0b352f2eb
commit 499ea1168b
6 changed files with 37 additions and 20 deletions

View File

@ -627,7 +627,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
createFlags.StringVar( createFlags.StringVar(
&cf.HealthTimeout, &cf.HealthTimeout,
healthTimeoutFlagName, define.DefaultHealthCheckTimeout, healthTimeoutFlagName, define.DefaultHealthCheckTimeout,
"the maximum time allowed to complete the healthcheck before an interval is considered failed", "the maximum time allowed to complete the healthcheck before an interval is considered failed and SIGKILL is sent to the healthcheck process",
) )
_ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone) _ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone)

View File

@ -7,8 +7,7 @@
The maximum time allowed to complete the healthcheck before an interval is considered failed. Like start-period, the The maximum time allowed to complete the healthcheck before an interval is considered failed. Like start-period, the
value can be expressed in a time format such as **1m22s**. The default value is **30s**. value can be expressed in a time format such as **1m22s**. The default value is **30s**.
Note: A timeout marks the healthcheck as failed but does not terminate the running process. Note: A timeout marks the healthcheck as failed. If the healthcheck command itself runs longer than the specified *timeout*,
This ensures that a slow but eventually successful healthcheck does not disrupt the container it will be sent a `SIGKILL` signal.
but is still accounted for in the health status.
Note: This parameter will overwrite related healthcheck configuration from the image. Note: This parameter will overwrite related healthcheck configuration from the image.

View File

@ -820,7 +820,7 @@ func (c *Container) ExecResize(sessionID string, newSize resize.TerminalSize) er
return c.ociRuntime.ExecAttachResize(c, sessionID, newSize) return c.ociRuntime.ExecAttachResize(c, sessionID, newSize)
} }
func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachStreams) (int, error) { func (c *Container) healthCheckExec(config *ExecConfig, timeout time.Duration, streams *define.AttachStreams) (int, error) {
unlock := true unlock := true
if !c.batched { if !c.batched {
c.lock.Lock() c.lock.Lock()
@ -868,16 +868,25 @@ func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachSt
if err != nil { if err != nil {
return -1, err return -1, err
} }
session.PID = pid
session.PIDData = getPidData(pid)
if !c.batched { if !c.batched {
c.lock.Unlock() c.lock.Unlock()
unlock = false unlock = false
} }
err = <-attachErrChan select {
case err = <-attachErrChan:
if err != nil { if err != nil {
return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err) return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
} }
case <-time.After(timeout):
if err := c.ociRuntime.ExecStopContainer(c, session.ID(), 0); err != nil {
return -1, err
}
return -1, fmt.Errorf("%v of %s", define.ErrHealthCheckTimeout, c.HealthCheckConfig().Timeout.String())
}
return c.readExecExitCode(session.ID()) return c.readExecExitCode(session.ID())
} }

View File

@ -217,4 +217,7 @@ var (
// ErrRemovingCtrs indicates that there was an error removing all // ErrRemovingCtrs indicates that there was an error removing all
// containers from a pod. // containers from a pod.
ErrRemovingCtrs = errors.New("removing pod containers") ErrRemovingCtrs = errors.New("removing pod containers")
// ErrHealthCheckTimeout indicates that a HealthCheck timed out.
ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout")
) )

View File

@ -93,19 +93,23 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
streams.AttachInput = true streams.AttachInput = true
logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID()) logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
timeStart := time.Now()
hcResult := define.HealthCheckSuccess hcResult := define.HealthCheckSuccess
config := new(ExecConfig) config := new(ExecConfig)
config.Command = newCommand config.Command = newCommand
exitCode, hcErr := c.healthCheckExec(config, streams) timeStart := time.Now()
exitCode, hcErr := c.healthCheckExec(config, c.HealthCheckConfig().Timeout, streams)
timeEnd := time.Now()
if hcErr != nil { if hcErr != nil {
hcResult = define.HealthCheckFailure hcResult = define.HealthCheckFailure
if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) || switch {
case errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) || errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) ||
errors.Is(hcErr, define.ErrOCIRuntime) { errors.Is(hcErr, define.ErrOCIRuntime):
returnCode = 1 returnCode = 1
hcErr = nil hcErr = nil
} else { case errors.Is(hcErr, define.ErrHealthCheckTimeout):
returnCode = -1
default:
returnCode = 125 returnCode = 125
} }
} else if exitCode != 0 { } else if exitCode != 0 {
@ -140,7 +144,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
hcResult = define.HealthCheckContainerStopped hcResult = define.HealthCheckContainerStopped
} }
timeEnd := time.Now()
if c.HealthCheckConfig().StartPeriod > 0 { if c.HealthCheckConfig().StartPeriod > 0 {
// there is a start-period we need to honor; we add startPeriod to container start time // there is a start-period we need to honor; we add startPeriod to container start time
startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod) startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
@ -156,12 +159,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
eventLog = eventLog[:c.HealthCheckMaxLogSize()] eventLog = eventLog[:c.HealthCheckMaxLogSize()]
} }
if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
returnCode = -1
hcResult = define.HealthCheckFailure
hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
}
hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog) hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
healthCheckResult, err := c.updateHealthCheckLog(hcl, hcResult, inStartPeriod, isStartup) healthCheckResult, err := c.updateHealthCheckLog(hcl, hcResult, inStartPeriod, isStartup)

View File

@ -404,4 +404,13 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE)
Expect(ps.OutputToStringArray()).To(HaveLen(2)) Expect(ps.OutputToStringArray()).To(HaveLen(2))
Expect(ps.OutputToString()).To(ContainSubstring("hc")) Expect(ps.OutputToString()).To(ContainSubstring("hc"))
}) })
It("podman healthcheck - health timeout", func() {
ctrName := "c-h-" + RandomString(6)
podmanTest.PodmanExitCleanly("run", "-d", "--name", ctrName, "--health-cmd", "top", "--health-timeout=3s", ALPINE, "top")
hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName})
hc.WaitWithTimeout(10)
Expect(hc).Should(ExitWithError(125, "Error: healthcheck command exceeded timeout of 3s"))
})
}) })