diff --git a/cmd/podman/common/create.go b/cmd/podman/common/create.go index d1d083beea..e054d1bdb4 100644 --- a/cmd/podman/common/create.go +++ b/cmd/podman/common/create.go @@ -627,7 +627,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions, createFlags.StringVar( &cf.HealthTimeout, healthTimeoutFlagName, define.DefaultHealthCheckTimeout, - "the maximum time allowed to complete the healthcheck before an interval is considered failed", + "the maximum time allowed to complete the healthcheck before an interval is considered failed and SIGKILL is sent to the healthcheck process", ) _ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone) diff --git a/docs/source/markdown/options/health-timeout.md b/docs/source/markdown/options/health-timeout.md index 4adbdb1e90..458442508b 100644 --- a/docs/source/markdown/options/health-timeout.md +++ b/docs/source/markdown/options/health-timeout.md @@ -7,8 +7,7 @@ The maximum time allowed to complete the healthcheck before an interval is considered failed. Like start-period, the value can be expressed in a time format such as **1m22s**. The default value is **30s**. -Note: A timeout marks the healthcheck as failed but does not terminate the running process. -This ensures that a slow but eventually successful healthcheck does not disrupt the container -but is still accounted for in the health status. +Note: A timeout marks the healthcheck as failed. If the healthcheck command itself runs longer than the specified *timeout*, +it will be sent a `SIGKILL` signal. Note: This parameter will overwrite related healthcheck configuration from the image. diff --git a/libpod/container_exec.go b/libpod/container_exec.go index fa5020f1c8..b7397b71b2 100644 --- a/libpod/container_exec.go +++ b/libpod/container_exec.go @@ -820,7 +820,7 @@ func (c *Container) ExecResize(sessionID string, newSize resize.TerminalSize) er return c.ociRuntime.ExecAttachResize(c, sessionID, newSize) } -func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachStreams) (int, error) { +func (c *Container) healthCheckExec(config *ExecConfig, timeout time.Duration, streams *define.AttachStreams) (int, error) { unlock := true if !c.batched { c.lock.Lock() @@ -868,15 +868,24 @@ func (c *Container) healthCheckExec(config *ExecConfig, streams *define.AttachSt if err != nil { return -1, err } + session.PID = pid + session.PIDData = getPidData(pid) if !c.batched { c.lock.Unlock() unlock = false } - err = <-attachErrChan - if err != nil { - return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err) + select { + case err = <-attachErrChan: + if err != nil { + return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err) + } + case <-time.After(timeout): + if err := c.ociRuntime.ExecStopContainer(c, session.ID(), 0); err != nil { + return -1, err + } + return -1, fmt.Errorf("%v of %s", define.ErrHealthCheckTimeout, c.HealthCheckConfig().Timeout.String()) } return c.readExecExitCode(session.ID()) diff --git a/libpod/define/errors.go b/libpod/define/errors.go index 96f4ac6224..b9f0d83fdd 100644 --- a/libpod/define/errors.go +++ b/libpod/define/errors.go @@ -217,4 +217,7 @@ var ( // ErrRemovingCtrs indicates that there was an error removing all // containers from a pod. ErrRemovingCtrs = errors.New("removing pod containers") + + // ErrHealthCheckTimeout indicates that a HealthCheck timed out. + ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout") ) diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index 012ff1f486..b9b563a420 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -93,19 +93,23 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define. streams.AttachInput = true logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID()) - timeStart := time.Now() hcResult := define.HealthCheckSuccess config := new(ExecConfig) config.Command = newCommand - exitCode, hcErr := c.healthCheckExec(config, streams) + timeStart := time.Now() + exitCode, hcErr := c.healthCheckExec(config, c.HealthCheckConfig().Timeout, streams) + timeEnd := time.Now() if hcErr != nil { hcResult = define.HealthCheckFailure - if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) || + switch { + case errors.Is(hcErr, define.ErrOCIRuntimeNotFound) || errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) || - errors.Is(hcErr, define.ErrOCIRuntime) { + errors.Is(hcErr, define.ErrOCIRuntime): returnCode = 1 hcErr = nil - } else { + case errors.Is(hcErr, define.ErrHealthCheckTimeout): + returnCode = -1 + default: returnCode = 125 } } else if exitCode != 0 { @@ -140,7 +144,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define. hcResult = define.HealthCheckContainerStopped } - timeEnd := time.Now() if c.HealthCheckConfig().StartPeriod > 0 { // there is a start-period we need to honor; we add startPeriod to container start time startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod) @@ -156,12 +159,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define. eventLog = eventLog[:c.HealthCheckMaxLogSize()] } - if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout { - returnCode = -1 - hcResult = define.HealthCheckFailure - hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String()) - } - hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog) healthCheckResult, err := c.updateHealthCheckLog(hcl, hcResult, inStartPeriod, isStartup) diff --git a/test/e2e/healthcheck_run_test.go b/test/e2e/healthcheck_run_test.go index 97b5098ef6..c3397c494b 100644 --- a/test/e2e/healthcheck_run_test.go +++ b/test/e2e/healthcheck_run_test.go @@ -404,4 +404,13 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE) Expect(ps.OutputToStringArray()).To(HaveLen(2)) Expect(ps.OutputToString()).To(ContainSubstring("hc")) }) + + It("podman healthcheck - health timeout", func() { + ctrName := "c-h-" + RandomString(6) + podmanTest.PodmanExitCleanly("run", "-d", "--name", ctrName, "--health-cmd", "top", "--health-timeout=3s", ALPINE, "top") + + hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName}) + hc.WaitWithTimeout(10) + Expect(hc).Should(ExitWithError(125, "Error: healthcheck command exceeded timeout of 3s")) + }) })