report healthcheck start errors

When starting a container consider healthcheck errors fatal. That way
users know when systemd-run failed to set up the timer to run the
healthcheck and we don't get into a state where the container is running
but not the healthcheck.

This also fixes the broken error reporting from the systemd-run exec: if
the binary could not be run, the output was just empty, leaving users
with no idea what failed.

Fixes #25034

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
Paul Holzinger
2025-03-04 15:46:45 +01:00
parent 4ac061f383
commit 47a743bba2
4 changed files with 81 additions and 41 deletions

View File

@ -1144,7 +1144,7 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error {
timer = c.config.StartupHealthCheckConfig.Interval.String() timer = c.config.StartupHealthCheckConfig.Interval.String()
} }
if err := c.createTimer(timer, c.config.StartupHealthCheckConfig != nil); err != nil { if err := c.createTimer(timer, c.config.StartupHealthCheckConfig != nil); err != nil {
logrus.Error(err) return fmt.Errorf("create healthcheck: %w", err)
} }
} }
@ -1333,10 +1333,10 @@ func (c *Container) start() error {
// to update status in such case. // to update status in such case.
if c.config.HealthCheckConfig != nil && !(len(c.config.HealthCheckConfig.Test) == 1 && c.config.HealthCheckConfig.Test[0] == "NONE") { if c.config.HealthCheckConfig != nil && !(len(c.config.HealthCheckConfig.Test) == 1 && c.config.HealthCheckConfig.Test[0] == "NONE") {
if err := c.updateHealthStatus(define.HealthCheckStarting); err != nil { if err := c.updateHealthStatus(define.HealthCheckStarting); err != nil {
logrus.Error(err) return fmt.Errorf("update healthcheck status: %w", err)
} }
if err := c.startTimer(c.config.StartupHealthCheckConfig != nil); err != nil { if err := c.startTimer(c.config.StartupHealthCheckConfig != nil); err != nil {
logrus.Error(err) return fmt.Errorf("start healthcheck: %w", err)
} }
} }
@ -1627,7 +1627,7 @@ func (c *Container) unpause() error {
timer = c.config.StartupHealthCheckConfig.Interval.String() timer = c.config.StartupHealthCheckConfig.Interval.String()
} }
if err := c.createTimer(timer, isStartupHealthCheck); err != nil { if err := c.createTimer(timer, isStartupHealthCheck); err != nil {
return err return fmt.Errorf("create healthcheck: %w", err)
} }
} }
@ -2895,7 +2895,9 @@ func (c *Container) resetHealthCheckTimers(noHealthCheck bool, changedTimer bool
if !isStartup { if !isStartup {
if c.state.StartupHCPassed || c.config.StartupHealthCheckConfig == nil { if c.state.StartupHCPassed || c.config.StartupHealthCheckConfig == nil {
c.recreateHealthCheckTimer(context.Background(), false, false) if err := c.recreateHealthCheckTimer(context.Background(), false, false); err != nil {
return err
}
} }
return nil return nil
} }
@ -2908,7 +2910,9 @@ func (c *Container) resetHealthCheckTimers(noHealthCheck bool, changedTimer bool
return err return err
} }
if wasEnabledHealthCheck { if wasEnabledHealthCheck {
c.recreateHealthCheckTimer(context.Background(), true, true) if err := c.recreateHealthCheckTimer(context.Background(), true, true); err != nil {
return err
}
} }
return nil return nil
} }

View File

@ -126,9 +126,13 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
inStartPeriod = true inStartPeriod = true
if hcErr != nil || exitCode != 0 { if hcErr != nil || exitCode != 0 {
hcResult = define.HealthCheckStartup hcResult = define.HealthCheckStartup
c.incrementStartupHCFailureCounter(ctx) if err := c.incrementStartupHCFailureCounter(ctx); err != nil {
return define.HealthCheckInternalError, "", err
}
} else { } else {
c.incrementStartupHCSuccessCounter(ctx) if err := c.incrementStartupHCSuccessCounter(ctx); err != nil {
return define.HealthCheckInternalError, "", err
}
} }
} }
@ -229,15 +233,15 @@ func checkHealthCheckCanBeRun(c *Container) (define.HealthCheckStatus, error) {
// Can stop the startup HC and start the regular HC if the startup HC has enough // Can stop the startup HC and start the regular HC if the startup HC has enough
// consecutive successes. // consecutive successes.
// NOTE: The caller must lock and sync the container. // NOTE: The caller must lock and sync the container.
func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) { func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) error {
// We don't have a startup HC, can't do anything // We don't have a startup HC, can't do anything
if c.config.StartupHealthCheckConfig == nil { if c.config.StartupHealthCheckConfig == nil {
return return nil
} }
// Race: someone else got here first // Race: someone else got here first
if c.state.StartupHCPassed { if c.state.StartupHCPassed {
return return nil
} }
// Increment the success counter // Increment the success counter
@ -256,11 +260,12 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
} }
if err := c.save(); err != nil { if err := c.save(); err != nil {
logrus.Errorf("Error saving container %s state: %v", c.ID(), err) return err
return
} }
if recreateTimer { if !recreateTimer {
return nil
}
// This kills the process the healthcheck is running. // This kills the process the healthcheck is running.
// Which happens to be us. // Which happens to be us.
// So this has to be last - after this, systemd serves us a // So this has to be last - after this, systemd serves us a
@ -272,11 +277,10 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
// is the case here as we should not alter the exit code of another process that just // is the case here as we should not alter the exit code of another process that just
// happened to call this. // happened to call this.
shutdown.SetExitCode(0) shutdown.SetExitCode(0)
c.recreateHealthCheckTimer(ctx, false, true) return c.recreateHealthCheckTimer(ctx, false, true)
}
} }
func (c *Container) recreateHealthCheckTimer(ctx context.Context, isStartup bool, isStartupRemoved bool) { func (c *Container) recreateHealthCheckTimer(ctx context.Context, isStartup bool, isStartupRemoved bool) error {
logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID()) logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID())
oldUnit := c.state.HCUnitName oldUnit := c.state.HCUnitName
@ -287,31 +291,30 @@ func (c *Container) recreateHealthCheckTimer(ctx context.Context, isStartup bool
} }
if err := c.createTimer(interval, isStartup); err != nil { if err := c.createTimer(interval, isStartup); err != nil {
logrus.Errorf("Error recreating container %s (isStartup: %t) healthcheck: %v", c.ID(), isStartup, err) return fmt.Errorf("recreating container %s (isStartup: %t) healthcheck: %v", c.ID(), isStartup, err)
return
} }
if err := c.startTimer(isStartup); err != nil { if err := c.startTimer(isStartup); err != nil {
logrus.Errorf("Error restarting container %s (isStartup: %t) healthcheck timer: %v", c.ID(), isStartup, err) return fmt.Errorf("restarting container %s (isStartup: %t) healthcheck timer: %v", c.ID(), isStartup, err)
} }
if err := c.removeTransientFiles(ctx, isStartupRemoved, oldUnit); err != nil { if err := c.removeTransientFiles(ctx, isStartupRemoved, oldUnit); err != nil {
logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err) return fmt.Errorf("removing container %s healthcheck: %v", c.ID(), err)
return
} }
return nil
} }
// Increment the current startup healthcheck failure counter. // Increment the current startup healthcheck failure counter.
// Can restart the container if the HC fails enough times consecutively. // Can restart the container if the HC fails enough times consecutively.
// NOTE: The caller must lock and sync the container. // NOTE: The caller must lock and sync the container.
func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) { func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) error {
// We don't have a startup HC, can't do anything // We don't have a startup HC, can't do anything
if c.config.StartupHealthCheckConfig == nil { if c.config.StartupHealthCheckConfig == nil {
return return nil
} }
// Race: someone else got here first // Race: someone else got here first
if c.state.StartupHCPassed { if c.state.StartupHCPassed {
return return nil
} }
c.state.StartupHCFailureCount++ c.state.StartupHCFailureCount++
@ -322,14 +325,12 @@ func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) {
logrus.Infof("Restarting container %s as startup healthcheck failed", c.ID()) logrus.Infof("Restarting container %s as startup healthcheck failed", c.ID())
// Restart the container // Restart the container
if err := c.restartWithTimeout(ctx, c.config.StopTimeout); err != nil { if err := c.restartWithTimeout(ctx, c.config.StopTimeout); err != nil {
logrus.Errorf("Error restarting container %s after healthcheck failure: %v", c.ID(), err) return fmt.Errorf("restarting container %s after healthcheck failure: %v", c.ID(), err)
} }
return return nil
} }
if err := c.save(); err != nil { return c.save()
logrus.Errorf("Error saving container %s state: %v", c.ID(), err)
}
} }
func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.HealthCheckLog { func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.HealthCheckLog {

View File

@ -4,6 +4,7 @@ package libpod
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"math/rand" "math/rand"
"os" "os"
@ -55,7 +56,11 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
logrus.Debugf("creating systemd-transient files: %s %s", "systemd-run", cmd) logrus.Debugf("creating systemd-transient files: %s %s", "systemd-run", cmd)
systemdRun := exec.Command("systemd-run", cmd...) systemdRun := exec.Command("systemd-run", cmd...)
if output, err := systemdRun.CombinedOutput(); err != nil { if output, err := systemdRun.CombinedOutput(); err != nil {
return fmt.Errorf("%s", output) exitError := &exec.ExitError{}
if errors.As(err, &exitError) {
return fmt.Errorf("systemd-run failed: %w: output: %s", err, strings.TrimSpace(string(output)))
}
return fmt.Errorf("failed to execute systemd-run: %w", err)
} }
c.state.HCUnitName = hcUnitName c.state.HCUnitName = hcUnitName

View File

@ -499,4 +499,34 @@ function _check_health_log {
run_podman rm -f -t0 $ctr run_podman rm -f -t0 $ctr
} }
# https://github.com/containers/podman/issues/25034
@test "podman healthcheck - start errors" {
skip_if_remote '$PATH overwrite not working via remote'
ctr1="c1-h-$(safename)"
ctr2="c2-h-$(safename)"
local systemd_run="$PODMAN_TMPDIR/systemd-run"
touch $systemd_run
chmod +x $systemd_run
# Set custom PATH to force our stub to be called instead of the real systemd-run.
PATH="$PODMAN_TMPDIR:$PATH" run_podman 126 run -d --name $ctr1 \
--health-cmd "true" $IMAGE /home/podman/pause
assert "$output" =~ "create healthcheck: failed to execute systemd-run: fork/exec $systemd_run: exec format error" "error on invalid systemd-run"
local systemd_run="$PODMAN_TMPDIR/systemd-run"
cat > $systemd_run <<EOF
#!/bin/bash
echo stdout
echo stderr >&2
exit 2
EOF
PATH="$PODMAN_TMPDIR:$PATH" run_podman 126 run -d --name $ctr2 \
--health-cmd "true" $IMAGE /home/podman/pause
assert "$output" =~ "create healthcheck: systemd-run failed: exit status 2: output: stdout
stderr" "systemd-run error message"
run_podman rm -f -t0 $ctr1 $ctr2
}
# vim: filetype=sh # vim: filetype=sh