[v5.4-rhel] Fix: Ensure HealthCheck exec session terminates on timeout

Previously, the HealthCheck exec session would not terminate on timeout, allowing the healthcheck to run indefinitely.

Fixes: https://issues.redhat.com/browse/RHEL-86096
Fixes: https://issues.redhat.com/browse/RHEL-96916
Fixes: https://issues.redhat.com/browse/RHEL-96917

Signed-off-by: Jan Rodák <hony.com@seznam.cz>
(cherry picked from commit 499ea1168bd52a36e98927222a987a6b959f0b50)
Signed-off-by: Jan Rodák <hony.com@seznam.cz>
This commit is contained in:
Jan Rodák
2025-05-06 16:42:13 +02:00
parent 00007bc171
commit ce22ca96ef
6 changed files with 37 additions and 17 deletions

View File

@ -623,7 +623,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
createFlags.StringVar(
&cf.HealthTimeout,
healthTimeoutFlagName, define.DefaultHealthCheckTimeout,
"the maximum time allowed to complete the healthcheck before an interval is considered failed",
"the maximum time allowed to complete the healthcheck before an interval is considered failed and SIGKILL is sent to the healthcheck process",
)
_ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone)

View File

@ -6,3 +6,6 @@
The maximum time allowed to complete the healthcheck before an interval is considered failed. Like start-period, the
value can be expressed in a time format such as **1m22s**. The default value is **30s**.
Note: A timeout marks the healthcheck as failed. If the healthcheck command itself runs longer than the specified *timeout*,
it will be sent a `SIGKILL` signal.

View File

@ -793,7 +793,7 @@ func (c *Container) ExecResize(sessionID string, newSize resize.TerminalSize) er
return c.ociRuntime.ExecAttachResize(c, sessionID, newSize)
}
func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachStreams) (int, error) {
func (c *Container) healthCheckExec(config *ExecConfig, timeout time.Duration, streams *define.AttachStreams) (int, error) {
unlock := true
if !c.batched {
c.lock.Lock()
@ -841,15 +841,23 @@ func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachSt
if err != nil {
return -1, err
}
session.PID = pid
if !c.batched {
c.lock.Unlock()
unlock = false
}
err = <-attachErrChan
if err != nil {
return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
select {
case err = <-attachErrChan:
if err != nil {
return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
}
case <-time.After(timeout):
if err := c.ociRuntime.ExecStopContainer(c, session.ID(), 0); err != nil {
return -1, err
}
return -1, fmt.Errorf("%v of %s", define.ErrHealthCheckTimeout, c.HealthCheckConfig().Timeout.String())
}
return c.readExecExitCode(session.ID())

View File

@ -217,4 +217,7 @@ var (
// ErrRemovingCtrs indicates that there was an error removing all
// containers from a pod.
ErrRemovingCtrs = errors.New("removing pod containers")
// ErrHealthCheckTimeout indicates that a HealthCheck timed out.
ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout")
)

View File

@ -93,19 +93,23 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
streams.AttachInput = true
logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
timeStart := time.Now()
hcResult := define.HealthCheckSuccess
config := new(ExecConfig)
config.Command = newCommand
exitCode, hcErr := c.healthCheckExec(config, streams)
timeStart := time.Now()
exitCode, hcErr := c.healthCheckExec(config, c.HealthCheckConfig().Timeout, streams)
timeEnd := time.Now()
if hcErr != nil {
hcResult = define.HealthCheckFailure
if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
switch {
case errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) ||
errors.Is(hcErr, define.ErrOCIRuntime) {
errors.Is(hcErr, define.ErrOCIRuntime):
returnCode = 1
hcErr = nil
} else {
case errors.Is(hcErr, define.ErrHealthCheckTimeout):
returnCode = -1
default:
returnCode = 125
}
} else if exitCode != 0 {
@ -124,7 +128,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
}
}
timeEnd := time.Now()
if c.HealthCheckConfig().StartPeriod > 0 {
// there is a start-period we need to honor; we add startPeriod to container start time
startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
@ -140,12 +143,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
eventLog = eventLog[:c.HealthCheckMaxLogSize()]
}
if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
returnCode = -1
hcResult = define.HealthCheckFailure
hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
}
hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
healthCheckResult, err := c.updateHealthCheckLog(hcl, inStartPeriod, isStartup)

View File

@ -390,4 +390,13 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE)
Expect(ps.OutputToStringArray()).To(HaveLen(2))
Expect(ps.OutputToString()).To(ContainSubstring("hc"))
})
It("podman healthcheck - health timeout", func() {
ctrName := "c-h-" + RandomString(6)
podmanTest.PodmanExitCleanly("run", "-d", "--name", ctrName, "--health-cmd", "top", "--health-timeout=3s", ALPINE, "top")
hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName})
hc.WaitWithTimeout(10)
Expect(hc).Should(ExitWithError(125, "Error: healthcheck command exceeded timeout of 3s"))
})
})