report healthcheck start errors

When starting a container consider healthcheck errors fatal. That way user know when systemd-run failed to setup the timer to run the healthcheck and we don't get into a state where the container is running but not the healthcheck. This also fixes the broken error reporting from the systemd-run exec, if the binary could not be run the output was just empty leaving the users with no idea what failed. Fixes #25034 Signed-off-by: Paul Holzinger <pholzing@redhat.com>
2025-10-17 03:04:21 +08:00 · 2025-03-04 15:46:45 +01:00
parent 4ac061f383
commit 47a743bba2
4 changed files with 81 additions and 41 deletions
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@ -1144,7 +1144,7 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error {
 			timer = c.config.StartupHealthCheckConfig.Interval.String()
 		}
 		if err := c.createTimer(timer, c.config.StartupHealthCheckConfig != nil); err != nil {
-			logrus.Error(err)
+			return fmt.Errorf("create healthcheck: %w", err)
 		}
 	}

@ -1333,10 +1333,10 @@ func (c *Container) start() error {
 	// to update status in such case.
 	if c.config.HealthCheckConfig != nil && !(len(c.config.HealthCheckConfig.Test) == 1 && c.config.HealthCheckConfig.Test[0] == "NONE") {
 		if err := c.updateHealthStatus(define.HealthCheckStarting); err != nil {
-			logrus.Error(err)
+			return fmt.Errorf("update healthcheck status: %w", err)
 		}
 		if err := c.startTimer(c.config.StartupHealthCheckConfig != nil); err != nil {
-			logrus.Error(err)
+			return fmt.Errorf("start healthcheck: %w", err)
 		}
 	}

@ -1627,7 +1627,7 @@ func (c *Container) unpause() error {
 			timer = c.config.StartupHealthCheckConfig.Interval.String()
 		}
 		if err := c.createTimer(timer, isStartupHealthCheck); err != nil {
-			return err
+			return fmt.Errorf("create healthcheck: %w", err)
 		}
 	}

@ -2895,7 +2895,9 @@ func (c *Container) resetHealthCheckTimers(noHealthCheck bool, changedTimer bool

 	if !isStartup {
 		if c.state.StartupHCPassed || c.config.StartupHealthCheckConfig == nil {
-			c.recreateHealthCheckTimer(context.Background(), false, false)
+			if err := c.recreateHealthCheckTimer(context.Background(), false, false); err != nil {
+				return err
+			}
 		}
 		return nil
 	}
@ -2908,7 +2910,9 @@ func (c *Container) resetHealthCheckTimers(noHealthCheck bool, changedTimer bool
 			return err
 		}
 		if wasEnabledHealthCheck {
-			c.recreateHealthCheckTimer(context.Background(), true, true)
+			if err := c.recreateHealthCheckTimer(context.Background(), true, true); err != nil {
+				return err
+			}
 		}
 		return nil
 	}
--- a/libpod/healthcheck.go
+++ b/libpod/healthcheck.go
@ -126,9 +126,13 @@ func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.
 		inStartPeriod = true
 		if hcErr != nil || exitCode != 0 {
 			hcResult = define.HealthCheckStartup
-			c.incrementStartupHCFailureCounter(ctx)
+			if err := c.incrementStartupHCFailureCounter(ctx); err != nil {
+				return define.HealthCheckInternalError, "", err
+			}
 		} else {
-			c.incrementStartupHCSuccessCounter(ctx)
+			if err := c.incrementStartupHCSuccessCounter(ctx); err != nil {
+				return define.HealthCheckInternalError, "", err
+			}
 		}
 	}

@ -229,15 +233,15 @@ func checkHealthCheckCanBeRun(c *Container) (define.HealthCheckStatus, error) {
 // Can stop the startup HC and start the regular HC if the startup HC has enough
 // consecutive successes.
 // NOTE: The caller must lock and sync the container.
-func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
+func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) error {
 	// We don't have a startup HC, can't do anything
 	if c.config.StartupHealthCheckConfig == nil {
-		return
+		return nil
 	}

 	// Race: someone else got here first
 	if c.state.StartupHCPassed {
-		return
+		return nil
 	}

 	// Increment the success counter
@ -256,27 +260,27 @@ func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
 	}

 	if err := c.save(); err != nil {
-		logrus.Errorf("Error saving container %s state: %v", c.ID(), err)
-		return
+		return err
 	}

-	if recreateTimer {
-		// This kills the process the healthcheck is running.
-		// Which happens to be us.
-		// So this has to be last - after this, systemd serves us a
-		// SIGTERM and we exit.
-		// Special case, via SIGTERM we exit(1) which means systemd logs a failure in the unit.
-		// We do not want this as the unit will be leaked on failure states unless "reset-failed"
-		// is called. Fundamentally this is expected so switch it to exit 0.
-		// NOTE: This is only safe while being called from "podman healthcheck run" which we know
-		// is the case here as we should not alter the exit code of another process that just
-		// happened to call this.
-		shutdown.SetExitCode(0)
-		c.recreateHealthCheckTimer(ctx, false, true)
+	if !recreateTimer {
+		return nil
 	}
+	// This kills the process the healthcheck is running.
+	// Which happens to be us.
+	// So this has to be last - after this, systemd serves us a
+	// SIGTERM and we exit.
+	// Special case, via SIGTERM we exit(1) which means systemd logs a failure in the unit.
+	// We do not want this as the unit will be leaked on failure states unless "reset-failed"
+	// is called. Fundamentally this is expected so switch it to exit 0.
+	// NOTE: This is only safe while being called from "podman healthcheck run" which we know
+	// is the case here as we should not alter the exit code of another process that just
+	// happened to call this.
+	shutdown.SetExitCode(0)
+	return c.recreateHealthCheckTimer(ctx, false, true)
 }

-func (c *Container) recreateHealthCheckTimer(ctx context.Context, isStartup bool, isStartupRemoved bool) {
+func (c *Container) recreateHealthCheckTimer(ctx context.Context, isStartup bool, isStartupRemoved bool) error {
 	logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID())

 	oldUnit := c.state.HCUnitName
@ -287,31 +291,30 @@ func (c *Container) recreateHealthCheckTimer(ctx context.Context, isStartup bool
 	}

 	if err := c.createTimer(interval, isStartup); err != nil {
-		logrus.Errorf("Error recreating container %s (isStartup: %t) healthcheck: %v", c.ID(), isStartup, err)
-		return
+		return fmt.Errorf("recreating container %s (isStartup: %t) healthcheck: %v", c.ID(), isStartup, err)
 	}
 	if err := c.startTimer(isStartup); err != nil {
-		logrus.Errorf("Error restarting container %s (isStartup: %t) healthcheck timer: %v", c.ID(), isStartup, err)
+		return fmt.Errorf("restarting container %s (isStartup: %t) healthcheck timer: %v", c.ID(), isStartup, err)
 	}

 	if err := c.removeTransientFiles(ctx, isStartupRemoved, oldUnit); err != nil {
-		logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err)
-		return
+		return fmt.Errorf("removing container %s healthcheck: %v", c.ID(), err)
 	}
+	return nil
 }

 // Increment the current startup healthcheck failure counter.
 // Can restart the container if the HC fails enough times consecutively.
 // NOTE: The caller must lock and sync the container.
-func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) {
+func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) error {
 	// We don't have a startup HC, can't do anything
 	if c.config.StartupHealthCheckConfig == nil {
-		return
+		return nil
 	}

 	// Race: someone else got here first
 	if c.state.StartupHCPassed {
-		return
+		return nil
 	}

 	c.state.StartupHCFailureCount++
@ -322,14 +325,12 @@ func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) {
 		logrus.Infof("Restarting container %s as startup healthcheck failed", c.ID())
 		// Restart the container
 		if err := c.restartWithTimeout(ctx, c.config.StopTimeout); err != nil {
-			logrus.Errorf("Error restarting container %s after healthcheck failure: %v", c.ID(), err)
+			return fmt.Errorf("restarting container %s after healthcheck failure: %v", c.ID(), err)
 		}
-		return
+		return nil
 	}

-	if err := c.save(); err != nil {
-		logrus.Errorf("Error saving container %s state: %v", c.ID(), err)
-	}
+	return c.save()
 }

 func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.HealthCheckLog {
--- a/libpod/healthcheck_linux.go
+++ b/libpod/healthcheck_linux.go
@ -4,6 +4,7 @@ package libpod

 import (
 	"context"
+	"errors"
 	"fmt"
 	"math/rand"
 	"os"
@ -55,7 +56,11 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
 	logrus.Debugf("creating systemd-transient files: %s %s", "systemd-run", cmd)
 	systemdRun := exec.Command("systemd-run", cmd...)
 	if output, err := systemdRun.CombinedOutput(); err != nil {
-		return fmt.Errorf("%s", output)
+		exitError := &exec.ExitError{}
+		if errors.As(err, &exitError) {
+			return fmt.Errorf("systemd-run failed: %w: output: %s", err, strings.TrimSpace(string(output)))
+		}
+		return fmt.Errorf("failed to execute systemd-run: %w", err)
 	}

 	c.state.HCUnitName = hcUnitName
--- a/test/system/220-healthcheck.bats
+++ b/test/system/220-healthcheck.bats
@ -499,4 +499,34 @@ function _check_health_log {
    run_podman rm -f -t0 $ctr
 }

+# https://github.com/containers/podman/issues/25034
+@test "podman healthcheck - start errors" {
+    skip_if_remote '$PATH overwrite not working via remote'
+    ctr1="c1-h-$(safename)"
+    ctr2="c2-h-$(safename)"
+
+    local systemd_run="$PODMAN_TMPDIR/systemd-run"
+    touch $systemd_run
+    chmod +x $systemd_run
+
+    # Set custom PATH to force our stub to be called instead of the real systemd-run.
+    PATH="$PODMAN_TMPDIR:$PATH" run_podman 126 run -d --name $ctr1 \
+           --health-cmd "true" $IMAGE /home/podman/pause
+    assert "$output" =~ "create healthcheck: failed to execute systemd-run: fork/exec $systemd_run: exec format error" "error on invalid systemd-run"
+
+    local systemd_run="$PODMAN_TMPDIR/systemd-run"
+    cat > $systemd_run <<EOF
+#!/bin/bash
+echo stdout
+echo stderr >&2
+exit 2
+EOF
+    PATH="$PODMAN_TMPDIR:$PATH" run_podman 126 run -d --name $ctr2 \
+           --health-cmd "true" $IMAGE /home/podman/pause
+    assert "$output" =~ "create healthcheck: systemd-run failed: exit status 2: output: stdout
+stderr" "systemd-run error message"
+
+    run_podman rm -f -t0 $ctr1 $ctr2
+}
+
 # vim: filetype=sh