mirror of
https://github.com/containers/podman.git
synced 2025-10-17 03:04:21 +08:00
Merge pull request #26456 from Honny1/dev/jrodak/healthcheck-timeout-termination-v5.4-rhel
[v5.4-rhel] Fix: Ensure HealthCheck exec session terminates on timeout
This commit is contained in:
@ -623,7 +623,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
|
||||
createFlags.StringVar(
|
||||
&cf.HealthTimeout,
|
||||
healthTimeoutFlagName, define.DefaultHealthCheckTimeout,
|
||||
"the maximum time allowed to complete the healthcheck before an interval is considered failed",
|
||||
"the maximum time allowed to complete the healthcheck before an interval is considered failed and SIGKILL is sent to the healthcheck process",
|
||||
)
|
||||
_ = cmd.RegisterFlagCompletionFunc(healthTimeoutFlagName, completion.AutocompleteNone)
|
||||
|
||||
|
@ -6,3 +6,6 @@
|
||||
|
||||
The maximum time allowed to complete the healthcheck before an interval is considered failed. Like start-period, the
|
||||
value can be expressed in a time format such as **1m22s**. The default value is **30s**.
|
||||
|
||||
Note: A timeout marks the healthcheck as failed. If the healthcheck command itself runs longer than the specified *timeout*,
|
||||
it will be sent a `SIGKILL` signal.
|
||||
|
@ -158,34 +158,20 @@ type legacyExecSession struct {
|
||||
PID int `json:"pid"`
|
||||
}
|
||||
|
||||
// ExecCreate creates a new exec session for the container.
|
||||
// The session is not started. The ID of the new exec session will be returned.
|
||||
func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
|
||||
if !c.batched {
|
||||
c.lock.Lock()
|
||||
defer c.lock.Unlock()
|
||||
|
||||
if err := c.syncContainer(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
|
||||
// Verify our config
|
||||
func (c *Container) verifyExecConfig(config *ExecConfig) error {
|
||||
if config == nil {
|
||||
return "", fmt.Errorf("must provide a configuration to ExecCreate: %w", define.ErrInvalidArg)
|
||||
return fmt.Errorf("must provide a configuration to ExecCreate: %w", define.ErrInvalidArg)
|
||||
}
|
||||
if len(config.Command) == 0 {
|
||||
return "", fmt.Errorf("must provide a non-empty command to start an exec session: %w", define.ErrInvalidArg)
|
||||
return fmt.Errorf("must provide a non-empty command to start an exec session: %w", define.ErrInvalidArg)
|
||||
}
|
||||
if config.ExitCommandDelay > 0 && len(config.ExitCommand) == 0 {
|
||||
return "", fmt.Errorf("must provide a non-empty exit command if giving an exit command delay: %w", define.ErrInvalidArg)
|
||||
}
|
||||
|
||||
// Verify that we are in a good state to continue
|
||||
if !c.ensureState(define.ContainerStateRunning) {
|
||||
return "", fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid)
|
||||
return fmt.Errorf("must provide a non-empty exit command if giving an exit command delay: %w", define.ErrInvalidArg)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Container) getUniqueExecSessionID() string {
|
||||
// Generate an ID for our new exec session
|
||||
sessionID := stringid.GenerateRandomID()
|
||||
found := true
|
||||
@ -202,20 +188,52 @@ func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
|
||||
sessionID = stringid.GenerateRandomID()
|
||||
}
|
||||
}
|
||||
return sessionID
|
||||
}
|
||||
|
||||
// Make our new exec session
|
||||
func (c *Container) createExecSession(config *ExecConfig) (*ExecSession, error) {
|
||||
session := new(ExecSession)
|
||||
session.Id = sessionID
|
||||
session.Id = c.getUniqueExecSessionID()
|
||||
session.ContainerId = c.ID()
|
||||
session.State = define.ExecStateCreated
|
||||
session.Config = new(ExecConfig)
|
||||
if err := JSONDeepCopy(config, session.Config); err != nil {
|
||||
return "", fmt.Errorf("copying exec configuration into exec session: %w", err)
|
||||
return nil, fmt.Errorf("copying exec configuration into exec session: %w", err)
|
||||
}
|
||||
|
||||
if len(session.Config.ExitCommand) > 0 {
|
||||
session.Config.ExitCommand = append(session.Config.ExitCommand, []string{session.ID(), c.ID()}...)
|
||||
}
|
||||
return session, nil
|
||||
}
|
||||
|
||||
// ExecCreate creates a new exec session for the container.
|
||||
// The session is not started. The ID of the new exec session will be returned.
|
||||
func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
|
||||
if !c.batched {
|
||||
c.lock.Lock()
|
||||
defer c.lock.Unlock()
|
||||
|
||||
if err := c.syncContainer(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
|
||||
// Verify our config
|
||||
if err := c.verifyExecConfig(config); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Verify that we are in a good state to continue
|
||||
if !c.ensureState(define.ContainerStateRunning) {
|
||||
return "", fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid)
|
||||
}
|
||||
|
||||
// Make our new exec session
|
||||
session, err := c.createExecSession(config)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if c.state.ExecSessions == nil {
|
||||
c.state.ExecSessions = make(map[string]*ExecSession)
|
||||
@ -232,7 +250,7 @@ func (c *Container) ExecCreate(config *ExecConfig) (string, error) {
|
||||
|
||||
logrus.Infof("Created exec session %s in container %s", session.ID(), c.ID())
|
||||
|
||||
return sessionID, nil
|
||||
return session.Id, nil
|
||||
}
|
||||
|
||||
// ExecStart starts an exec session in the container, but does not attach to it.
|
||||
@ -775,6 +793,76 @@ func (c *Container) ExecResize(sessionID string, newSize resize.TerminalSize) er
|
||||
return c.ociRuntime.ExecAttachResize(c, sessionID, newSize)
|
||||
}
|
||||
|
||||
func (c *Container) healthCheckExec(config *ExecConfig, timeout time.Duration, streams *define.AttachStreams) (int, error) {
|
||||
unlock := true
|
||||
if !c.batched {
|
||||
c.lock.Lock()
|
||||
defer func() {
|
||||
if unlock {
|
||||
c.lock.Unlock()
|
||||
}
|
||||
}()
|
||||
|
||||
if err := c.syncContainer(); err != nil {
|
||||
return -1, err
|
||||
}
|
||||
}
|
||||
|
||||
if err := c.verifyExecConfig(config); err != nil {
|
||||
return -1, err
|
||||
}
|
||||
|
||||
if !c.ensureState(define.ContainerStateRunning) {
|
||||
return -1, fmt.Errorf("can only create exec sessions on running containers: %w", define.ErrCtrStateInvalid)
|
||||
}
|
||||
|
||||
session, err := c.createExecSession(config)
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
|
||||
if c.state.ExecSessions == nil {
|
||||
c.state.ExecSessions = make(map[string]*ExecSession)
|
||||
}
|
||||
c.state.ExecSessions[session.ID()] = session
|
||||
defer delete(c.state.ExecSessions, session.ID())
|
||||
|
||||
opts, err := prepareForExec(c, session)
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
defer func() {
|
||||
if err := c.cleanupExecBundle(session.ID()); err != nil {
|
||||
logrus.Errorf("Container %s light exec session cleanup error: %v", c.ID(), err)
|
||||
}
|
||||
}()
|
||||
|
||||
pid, attachErrChan, err := c.ociRuntime.ExecContainer(c, session.ID(), opts, streams, nil)
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
session.PID = pid
|
||||
|
||||
if !c.batched {
|
||||
c.lock.Unlock()
|
||||
unlock = false
|
||||
}
|
||||
|
||||
select {
|
||||
case err = <-attachErrChan:
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("container %s light exec session with pid: %d error: %v", c.ID(), pid, err)
|
||||
}
|
||||
case <-time.After(timeout):
|
||||
if err := c.ociRuntime.ExecStopContainer(c, session.ID(), 0); err != nil {
|
||||
return -1, err
|
||||
}
|
||||
return -1, fmt.Errorf("%v of %s", define.ErrHealthCheckTimeout, c.HealthCheckConfig().Timeout.String())
|
||||
}
|
||||
|
||||
return c.readExecExitCode(session.ID())
|
||||
}
|
||||
|
||||
func (c *Container) Exec(config *ExecConfig, streams *define.AttachStreams, resize <-chan resize.TerminalSize) (int, error) {
|
||||
return c.exec(config, streams, resize, false)
|
||||
}
|
||||
|
@ -217,4 +217,7 @@ var (
|
||||
// ErrRemovingCtrs indicates that there was an error removing all
|
||||
// containers from a pod.
|
||||
ErrRemovingCtrs = errors.New("removing pod containers")
|
||||
|
||||
// ErrHealthCheckTimeout indicates that a HealthCheck timed out.
|
||||
ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout")
|
||||
)
|
||||
|
@ -93,19 +93,23 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
|
||||
streams.AttachInput = true
|
||||
|
||||
logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
|
||||
timeStart := time.Now()
|
||||
hcResult := define.HealthCheckSuccess
|
||||
config := new(ExecConfig)
|
||||
config.Command = newCommand
|
||||
exitCode, hcErr := c.exec(config, streams, nil, true)
|
||||
timeStart := time.Now()
|
||||
exitCode, hcErr := c.healthCheckExec(config, c.HealthCheckConfig().Timeout, streams)
|
||||
timeEnd := time.Now()
|
||||
if hcErr != nil {
|
||||
hcResult = define.HealthCheckFailure
|
||||
if errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
|
||||
switch {
|
||||
case errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
|
||||
errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) ||
|
||||
errors.Is(hcErr, define.ErrOCIRuntime) {
|
||||
errors.Is(hcErr, define.ErrOCIRuntime):
|
||||
returnCode = 1
|
||||
hcErr = nil
|
||||
} else {
|
||||
case errors.Is(hcErr, define.ErrHealthCheckTimeout):
|
||||
returnCode = -1
|
||||
default:
|
||||
returnCode = 125
|
||||
}
|
||||
} else if exitCode != 0 {
|
||||
@ -124,7 +128,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
|
||||
}
|
||||
}
|
||||
|
||||
timeEnd := time.Now()
|
||||
if c.HealthCheckConfig().StartPeriod > 0 {
|
||||
// there is a start-period we need to honor; we add startPeriod to container start time
|
||||
startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
|
||||
@ -140,12 +143,6 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
|
||||
eventLog = eventLog[:c.HealthCheckMaxLogSize()]
|
||||
}
|
||||
|
||||
if timeEnd.Sub(timeStart) > c.HealthCheckConfig().Timeout {
|
||||
returnCode = -1
|
||||
hcResult = define.HealthCheckFailure
|
||||
hcErr = fmt.Errorf("healthcheck command exceeded timeout of %s", c.HealthCheckConfig().Timeout.String())
|
||||
}
|
||||
|
||||
hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
|
||||
|
||||
healthCheckResult, err := c.updateHealthCheckLog(hcl, inStartPeriod, isStartup)
|
||||
|
@ -250,7 +250,7 @@ func (r *ConmonOCIRuntime) ExecStopContainer(ctr *Container, sessionID string, t
|
||||
|
||||
// SIGTERM did not work. On to SIGKILL.
|
||||
logrus.Debugf("Killing exec session %s (PID %d) of container %s with SIGKILL", sessionID, pid, ctr.ID())
|
||||
if err := unix.Kill(pid, unix.SIGTERM); err != nil {
|
||||
if err := unix.Kill(pid, unix.SIGKILL); err != nil {
|
||||
if err == unix.ESRCH {
|
||||
return nil
|
||||
}
|
||||
|
@ -390,4 +390,13 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE)
|
||||
Expect(ps.OutputToStringArray()).To(HaveLen(2))
|
||||
Expect(ps.OutputToString()).To(ContainSubstring("hc"))
|
||||
})
|
||||
|
||||
It("podman healthcheck - health timeout", func() {
|
||||
ctrName := "c-h-" + RandomString(6)
|
||||
podmanTest.PodmanExitCleanly("run", "-d", "--name", ctrName, "--health-cmd", "top", "--health-timeout=3s", ALPINE, "top")
|
||||
|
||||
hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName})
|
||||
hc.WaitWithTimeout(10)
|
||||
Expect(hc).Should(ExitWithError(125, "Error: healthcheck command exceeded timeout of 3s"))
|
||||
})
|
||||
})
|
||||
|
@ -418,4 +418,35 @@ function _check_health_log {
|
||||
run_podman rm -t 0 -f $ctrname
|
||||
}
|
||||
|
||||
@test "podman healthcheck - stop container when healthcheck runs" {
|
||||
ctr="c-h-$(safename)"
|
||||
msg="hc-msg-$(random_string)"
|
||||
|
||||
run_podman run -d --name $ctr \
|
||||
--health-cmd "sleep 20; echo $msg" \
|
||||
$IMAGE /home/podman/pause
|
||||
|
||||
timeout --foreground -v --kill=10 60 \
|
||||
$PODMAN healthcheck run $ctr &
|
||||
hc_pid=$!
|
||||
|
||||
run_podman inspect $ctr --format "{{.State.Status}}"
|
||||
assert "$output" == "running" "Container is running"
|
||||
|
||||
run_podman stop $ctr
|
||||
|
||||
# Wait for background healthcheck to finish and make sure the exit status is 1
|
||||
rc=0
|
||||
wait -n $hc_pid || rc=$?
|
||||
assert $rc -eq 1 "exit status check of healthcheck command"
|
||||
|
||||
run_podman inspect $ctr --format "{{.State.Status}}"
|
||||
assert "$output" == "exited" "Container is stopped"
|
||||
|
||||
run_podman inspect $ctr --format "{{.State.Health.Log}}"
|
||||
assert "$output" !~ "$msg" "Health log message not found"
|
||||
|
||||
run_podman rm -f -t0 $ctr
|
||||
}
|
||||
|
||||
# vim: filetype=sh
|
||||
|
Reference in New Issue
Block a user