mirror of
https://github.com/containers/podman.git
synced 2025-10-17 03:04:21 +08:00
report healthcheck start errors
When starting a container consider healthcheck errors fatal. That way user know when systemd-run failed to setup the timer to run the healthcheck and we don't get into a state where the container is running but not the healthcheck. This also fixes the broken error reporting from the systemd-run exec, if the binary could not be run the output was just empty leaving the users with no idea what failed. Fixes #25034 Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
@ -1144,7 +1144,7 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error {
|
||||
timer = c.config.StartupHealthCheckConfig.Interval.String()
|
||||
}
|
||||
if err := c.createTimer(timer, c.config.StartupHealthCheckConfig != nil); err != nil {
|
||||
logrus.Error(err)
|
||||
return fmt.Errorf("create healthcheck: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
@ -1333,10 +1333,10 @@ func (c *Container) start() error {
|
||||
// to update status in such case.
|
||||
if c.config.HealthCheckConfig != nil && !(len(c.config.HealthCheckConfig.Test) == 1 && c.config.HealthCheckConfig.Test[0] == "NONE") {
|
||||
if err := c.updateHealthStatus(define.HealthCheckStarting); err != nil {
|
||||
logrus.Error(err)
|
||||
return fmt.Errorf("update healthcheck status: %w", err)
|
||||
}
|
||||
if err := c.startTimer(c.config.StartupHealthCheckConfig != nil); err != nil {
|
||||
logrus.Error(err)
|
||||
return fmt.Errorf("start healthcheck: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
@ -1627,7 +1627,7 @@ func (c *Container) unpause() error {
|
||||
timer = c.config.StartupHealthCheckConfig.Interval.String()
|
||||
}
|
||||
if err := c.createTimer(timer, isStartupHealthCheck); err != nil {
|
||||
return err
|
||||
return fmt.Errorf("create healthcheck: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
@ -2895,7 +2895,9 @@ func (c *Container) resetHealthCheckTimers(noHealthCheck bool, changedTimer bool
|
||||
|
||||
if !isStartup {
|
||||
if c.state.StartupHCPassed || c.config.StartupHealthCheckConfig == nil {
|
||||
c.recreateHealthCheckTimer(context.Background(), false, false)
|
||||
if err := c.recreateHealthCheckTimer(context.Background(), false, false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@ -2908,7 +2910,9 @@ func (c *Container) resetHealthCheckTimers(noHealthCheck bool, changedTimer bool
|
||||
return err
|
||||
}
|
||||
if wasEnabledHealthCheck {
|
||||
c.recreateHealthCheckTimer(context.Background(), true, true)
|
||||
if err := c.recreateHealthCheckTimer(context.Background(), true, true); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
@ -126,9 +126,13 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
|
||||
inStartPeriod = true
|
||||
if hcErr != nil || exitCode != 0 {
|
||||
hcResult = define.HealthCheckStartup
|
||||
c.incrementStartupHCFailureCounter(ctx)
|
||||
if err := c.incrementStartupHCFailureCounter(ctx); err != nil {
|
||||
return define.HealthCheckInternalError, "", err
|
||||
}
|
||||
} else {
|
||||
c.incrementStartupHCSuccessCounter(ctx)
|
||||
if err := c.incrementStartupHCSuccessCounter(ctx); err != nil {
|
||||
return define.HealthCheckInternalError, "", err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -229,15 +233,15 @@ func checkHealthCheckCanBeRun(c *Container) (define.HealthCheckStatus, error) {
|
||||
// Can stop the startup HC and start the regular HC if the startup HC has enough
|
||||
// consecutive successes.
|
||||
// NOTE: The caller must lock and sync the container.
|
||||
func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
|
||||
func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) error {
|
||||
// We don't have a startup HC, can't do anything
|
||||
if c.config.StartupHealthCheckConfig == nil {
|
||||
return
|
||||
return nil
|
||||
}
|
||||
|
||||
// Race: someone else got here first
|
||||
if c.state.StartupHCPassed {
|
||||
return
|
||||
return nil
|
||||
}
|
||||
|
||||
// Increment the success counter
|
||||
@ -256,27 +260,27 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
|
||||
}
|
||||
|
||||
if err := c.save(); err != nil {
|
||||
logrus.Errorf("Error saving container %s state: %v", c.ID(), err)
|
||||
return
|
||||
return err
|
||||
}
|
||||
|
||||
if recreateTimer {
|
||||
// This kills the process the healthcheck is running.
|
||||
// Which happens to be us.
|
||||
// So this has to be last - after this, systemd serves us a
|
||||
// SIGTERM and we exit.
|
||||
// Special case, via SIGTERM we exit(1) which means systemd logs a failure in the unit.
|
||||
// We do not want this as the unit will be leaked on failure states unless "reset-failed"
|
||||
// is called. Fundamentally this is expected so switch it to exit 0.
|
||||
// NOTE: This is only safe while being called from "podman healthcheck run" which we know
|
||||
// is the case here as we should not alter the exit code of another process that just
|
||||
// happened to call this.
|
||||
shutdown.SetExitCode(0)
|
||||
c.recreateHealthCheckTimer(ctx, false, true)
|
||||
if !recreateTimer {
|
||||
return nil
|
||||
}
|
||||
// This kills the process the healthcheck is running.
|
||||
// Which happens to be us.
|
||||
// So this has to be last - after this, systemd serves us a
|
||||
// SIGTERM and we exit.
|
||||
// Special case, via SIGTERM we exit(1) which means systemd logs a failure in the unit.
|
||||
// We do not want this as the unit will be leaked on failure states unless "reset-failed"
|
||||
// is called. Fundamentally this is expected so switch it to exit 0.
|
||||
// NOTE: This is only safe while being called from "podman healthcheck run" which we know
|
||||
// is the case here as we should not alter the exit code of another process that just
|
||||
// happened to call this.
|
||||
shutdown.SetExitCode(0)
|
||||
return c.recreateHealthCheckTimer(ctx, false, true)
|
||||
}
|
||||
|
||||
func (c *Container) recreateHealthCheckTimer(ctx context.Context, isStartup bool, isStartupRemoved bool) {
|
||||
func (c *Container) recreateHealthCheckTimer(ctx context.Context, isStartup bool, isStartupRemoved bool) error {
|
||||
logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID())
|
||||
|
||||
oldUnit := c.state.HCUnitName
|
||||
@ -287,31 +291,30 @@ func (c *Container) recreateHealthCheckTimer(ctx context.Context, isStartup bool
|
||||
}
|
||||
|
||||
if err := c.createTimer(interval, isStartup); err != nil {
|
||||
logrus.Errorf("Error recreating container %s (isStartup: %t) healthcheck: %v", c.ID(), isStartup, err)
|
||||
return
|
||||
return fmt.Errorf("recreating container %s (isStartup: %t) healthcheck: %v", c.ID(), isStartup, err)
|
||||
}
|
||||
if err := c.startTimer(isStartup); err != nil {
|
||||
logrus.Errorf("Error restarting container %s (isStartup: %t) healthcheck timer: %v", c.ID(), isStartup, err)
|
||||
return fmt.Errorf("restarting container %s (isStartup: %t) healthcheck timer: %v", c.ID(), isStartup, err)
|
||||
}
|
||||
|
||||
if err := c.removeTransientFiles(ctx, isStartupRemoved, oldUnit); err != nil {
|
||||
logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err)
|
||||
return
|
||||
return fmt.Errorf("removing container %s healthcheck: %v", c.ID(), err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Increment the current startup healthcheck failure counter.
|
||||
// Can restart the container if the HC fails enough times consecutively.
|
||||
// NOTE: The caller must lock and sync the container.
|
||||
func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) {
|
||||
func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) error {
|
||||
// We don't have a startup HC, can't do anything
|
||||
if c.config.StartupHealthCheckConfig == nil {
|
||||
return
|
||||
return nil
|
||||
}
|
||||
|
||||
// Race: someone else got here first
|
||||
if c.state.StartupHCPassed {
|
||||
return
|
||||
return nil
|
||||
}
|
||||
|
||||
c.state.StartupHCFailureCount++
|
||||
@ -322,14 +325,12 @@ func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) {
|
||||
logrus.Infof("Restarting container %s as startup healthcheck failed", c.ID())
|
||||
// Restart the container
|
||||
if err := c.restartWithTimeout(ctx, c.config.StopTimeout); err != nil {
|
||||
logrus.Errorf("Error restarting container %s after healthcheck failure: %v", c.ID(), err)
|
||||
return fmt.Errorf("restarting container %s after healthcheck failure: %v", c.ID(), err)
|
||||
}
|
||||
return
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := c.save(); err != nil {
|
||||
logrus.Errorf("Error saving container %s state: %v", c.ID(), err)
|
||||
}
|
||||
return c.save()
|
||||
}
|
||||
|
||||
func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.HealthCheckLog {
|
||||
|
@ -4,6 +4,7 @@ package libpod
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"os"
|
||||
@ -55,7 +56,11 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
|
||||
logrus.Debugf("creating systemd-transient files: %s %s", "systemd-run", cmd)
|
||||
systemdRun := exec.Command("systemd-run", cmd...)
|
||||
if output, err := systemdRun.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("%s", output)
|
||||
exitError := &exec.ExitError{}
|
||||
if errors.As(err, &exitError) {
|
||||
return fmt.Errorf("systemd-run failed: %w: output: %s", err, strings.TrimSpace(string(output)))
|
||||
}
|
||||
return fmt.Errorf("failed to execute systemd-run: %w", err)
|
||||
}
|
||||
|
||||
c.state.HCUnitName = hcUnitName
|
||||
|
@ -499,4 +499,34 @@ function _check_health_log {
|
||||
run_podman rm -f -t0 $ctr
|
||||
}
|
||||
|
||||
# https://github.com/containers/podman/issues/25034
|
||||
@test "podman healthcheck - start errors" {
|
||||
skip_if_remote '$PATH overwrite not working via remote'
|
||||
ctr1="c1-h-$(safename)"
|
||||
ctr2="c2-h-$(safename)"
|
||||
|
||||
local systemd_run="$PODMAN_TMPDIR/systemd-run"
|
||||
touch $systemd_run
|
||||
chmod +x $systemd_run
|
||||
|
||||
# Set custom PATH to force our stub to be called instead of the real systemd-run.
|
||||
PATH="$PODMAN_TMPDIR:$PATH" run_podman 126 run -d --name $ctr1 \
|
||||
--health-cmd "true" $IMAGE /home/podman/pause
|
||||
assert "$output" =~ "create healthcheck: failed to execute systemd-run: fork/exec $systemd_run: exec format error" "error on invalid systemd-run"
|
||||
|
||||
local systemd_run="$PODMAN_TMPDIR/systemd-run"
|
||||
cat > $systemd_run <<EOF
|
||||
#!/bin/bash
|
||||
echo stdout
|
||||
echo stderr >&2
|
||||
exit 2
|
||||
EOF
|
||||
PATH="$PODMAN_TMPDIR:$PATH" run_podman 126 run -d --name $ctr2 \
|
||||
--health-cmd "true" $IMAGE /home/podman/pause
|
||||
assert "$output" =~ "create healthcheck: systemd-run failed: exit status 2: output: stdout
|
||||
stderr" "systemd-run error message"
|
||||
|
||||
run_podman rm -f -t0 $ctr1 $ctr2
|
||||
}
|
||||
|
||||
# vim: filetype=sh
|
||||
|
Reference in New Issue
Block a user