mirror of
https://github.com/containers/podman.git
synced 2025-05-17 15:18:43 +08:00

Previously, the HealthCheck exec session would not terminate on timeout, allowing the healthcheck to run indefinitely. Fixes: https://issues.redhat.com/browse/RHEL-86096 Signed-off-by: Jan Rodák <hony.com@seznam.cz>
482 lines
16 KiB
Go
482 lines
16 KiB
Go
//go:build !remote
|
|
|
|
package libpod
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io/fs"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/containers/podman/v5/libpod/define"
|
|
"github.com/containers/podman/v5/libpod/shutdown"
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// HealthCheck verifies the state and validity of the healthcheck configuration
|
|
// on the container and then executes the healthcheck
|
|
func (r *Runtime) HealthCheck(ctx context.Context, name string) (define.HealthCheckStatus, error) {
|
|
container, err := r.LookupContainer(name)
|
|
if err != nil {
|
|
return define.HealthCheckContainerNotFound, fmt.Errorf("unable to look up %s to perform a health check: %w", name, err)
|
|
}
|
|
|
|
hcStatus, err := checkHealthCheckCanBeRun(container)
|
|
if err != nil {
|
|
return hcStatus, err
|
|
}
|
|
|
|
isStartupHC := false
|
|
if container.config.StartupHealthCheckConfig != nil {
|
|
passed, err := container.StartupHCPassed()
|
|
if err != nil {
|
|
return define.HealthCheckInternalError, err
|
|
}
|
|
isStartupHC = !passed
|
|
}
|
|
|
|
hcStatus, logStatus, err := container.runHealthCheck(ctx, isStartupHC)
|
|
if !isStartupHC {
|
|
if err := container.processHealthCheckStatus(logStatus); err != nil {
|
|
return hcStatus, err
|
|
}
|
|
}
|
|
return hcStatus, err
|
|
}
|
|
|
|
func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.HealthCheckStatus, string, error) {
|
|
var (
|
|
newCommand []string
|
|
returnCode int
|
|
inStartPeriod bool
|
|
)
|
|
|
|
hcCommand := c.HealthCheckConfig().Test
|
|
if isStartup {
|
|
logrus.Debugf("Running startup healthcheck for container %s", c.ID())
|
|
hcCommand = c.config.StartupHealthCheckConfig.Test
|
|
}
|
|
if len(hcCommand) < 1 {
|
|
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
|
|
}
|
|
switch hcCommand[0] {
|
|
case "", define.HealthConfigTestNone:
|
|
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
|
|
case define.HealthConfigTestCmd:
|
|
newCommand = hcCommand[1:]
|
|
case define.HealthConfigTestCmdShell:
|
|
// TODO: SHELL command from image not available in Container - use Docker default
|
|
newCommand = []string{"/bin/sh", "-c", strings.Join(hcCommand[1:], " ")}
|
|
default:
|
|
// command supplied on command line - pass as-is
|
|
newCommand = hcCommand
|
|
}
|
|
if len(newCommand) < 1 || newCommand[0] == "" {
|
|
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
|
|
}
|
|
|
|
streams := new(define.AttachStreams)
|
|
output := &bytes.Buffer{}
|
|
|
|
streams.InputStream = bufio.NewReader(os.Stdin)
|
|
streams.OutputStream = output
|
|
streams.ErrorStream = output
|
|
streams.AttachOutput = true
|
|
streams.AttachError = true
|
|
streams.AttachInput = true
|
|
|
|
logrus.Debugf("executing health check command %s for %s", strings.Join(newCommand, " "), c.ID())
|
|
hcResult := define.HealthCheckSuccess
|
|
config := new(ExecConfig)
|
|
config.Command = newCommand
|
|
timeStart := time.Now()
|
|
exitCode, hcErr := c.healthCheckExec(config, c.HealthCheckConfig().Timeout, streams)
|
|
timeEnd := time.Now()
|
|
if hcErr != nil {
|
|
hcResult = define.HealthCheckFailure
|
|
switch {
|
|
case errors.Is(hcErr, define.ErrOCIRuntimeNotFound) ||
|
|
errors.Is(hcErr, define.ErrOCIRuntimePermissionDenied) ||
|
|
errors.Is(hcErr, define.ErrOCIRuntime):
|
|
returnCode = 1
|
|
hcErr = nil
|
|
case errors.Is(hcErr, define.ErrHealthCheckTimeout):
|
|
returnCode = -1
|
|
default:
|
|
returnCode = 125
|
|
}
|
|
} else if exitCode != 0 {
|
|
hcResult = define.HealthCheckFailure
|
|
returnCode = 1
|
|
}
|
|
|
|
if !c.batched {
|
|
c.lock.Lock()
|
|
defer c.lock.Unlock()
|
|
if err := c.syncContainer(); err != nil {
|
|
return define.HealthCheckInternalError, "", err
|
|
}
|
|
}
|
|
|
|
// Handle startup HC
|
|
if isStartup {
|
|
inStartPeriod = true
|
|
if hcErr != nil || exitCode != 0 {
|
|
hcResult = define.HealthCheckStartup
|
|
if err := c.incrementStartupHCFailureCounter(ctx); err != nil {
|
|
return define.HealthCheckInternalError, "", err
|
|
}
|
|
} else {
|
|
if err := c.incrementStartupHCSuccessCounter(ctx); err != nil {
|
|
return define.HealthCheckInternalError, "", err
|
|
}
|
|
}
|
|
}
|
|
|
|
if exitCode != 0 && c.ensureState(define.ContainerStateStopped, define.ContainerStateStopping, define.ContainerStateExited) {
|
|
hcResult = define.HealthCheckContainerStopped
|
|
}
|
|
|
|
if c.HealthCheckConfig().StartPeriod > 0 {
|
|
// there is a start-period we need to honor; we add startPeriod to container start time
|
|
startPeriodTime := c.state.StartedTime.Add(c.HealthCheckConfig().StartPeriod)
|
|
if timeStart.Before(startPeriodTime) {
|
|
// we are still in the start period, flip the inStartPeriod bool
|
|
inStartPeriod = true
|
|
logrus.Debugf("healthcheck for %s being run in start-period", c.ID())
|
|
}
|
|
}
|
|
|
|
eventLog := output.String()
|
|
if c.HealthCheckMaxLogSize() != 0 && len(eventLog) > int(c.HealthCheckMaxLogSize()) {
|
|
eventLog = eventLog[:c.HealthCheckMaxLogSize()]
|
|
}
|
|
|
|
hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
|
|
|
|
healthCheckResult, err := c.updateHealthCheckLog(hcl, hcResult, inStartPeriod, isStartup)
|
|
if err != nil {
|
|
return hcResult, "", fmt.Errorf("unable to update health check log %s for %s: %w", c.getHealthCheckLogDestination(), c.ID(), err)
|
|
}
|
|
|
|
// Write HC event with appropriate status as the last thing before we
|
|
// return.
|
|
if hcResult == define.HealthCheckNotDefined || hcResult == define.HealthCheckInternalError {
|
|
return hcResult, healthCheckResult.Status, hcErr
|
|
}
|
|
if c.runtime.config.Engine.HealthcheckEvents {
|
|
c.newContainerHealthCheckEvent(healthCheckResult)
|
|
}
|
|
|
|
return hcResult, healthCheckResult.Status, hcErr
|
|
}
|
|
|
|
func (c *Container) processHealthCheckStatus(status string) error {
|
|
if status != define.HealthCheckUnhealthy {
|
|
return nil
|
|
}
|
|
|
|
switch c.config.HealthCheckOnFailureAction {
|
|
case define.HealthCheckOnFailureActionNone: // Nothing to do
|
|
|
|
case define.HealthCheckOnFailureActionKill:
|
|
if err := c.Kill(uint(unix.SIGKILL)); err != nil {
|
|
return fmt.Errorf("killing container health-check turned unhealthy: %w", err)
|
|
}
|
|
|
|
case define.HealthCheckOnFailureActionRestart:
|
|
// We let the cleanup process handle the restart. Otherwise
|
|
// the container would be restarted in the context of a
|
|
// transient systemd unit which may cause undesired side
|
|
// effects.
|
|
if err := c.Stop(); err != nil {
|
|
return fmt.Errorf("restarting/stopping container after health-check turned unhealthy: %w", err)
|
|
}
|
|
|
|
case define.HealthCheckOnFailureActionStop:
|
|
if err := c.Stop(); err != nil {
|
|
return fmt.Errorf("stopping container after health-check turned unhealthy: %w", err)
|
|
}
|
|
|
|
default: // Should not happen but better be safe than sorry
|
|
return fmt.Errorf("unsupported on-failure action %d", c.config.HealthCheckOnFailureAction)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func checkHealthCheckCanBeRun(c *Container) (define.HealthCheckStatus, error) {
|
|
cstate, err := c.State()
|
|
if err != nil {
|
|
return define.HealthCheckInternalError, err
|
|
}
|
|
if cstate != define.ContainerStateRunning {
|
|
return define.HealthCheckContainerStopped, fmt.Errorf("container %s is not running", c.ID())
|
|
}
|
|
if !c.HasHealthCheck() {
|
|
return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID())
|
|
}
|
|
return define.HealthCheckDefined, nil
|
|
}
|
|
|
|
// Increment the current startup healthcheck success counter.
|
|
// Can stop the startup HC and start the regular HC if the startup HC has enough
|
|
// consecutive successes.
|
|
// NOTE: The caller must lock and sync the container.
|
|
func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) error {
|
|
// We don't have a startup HC, can't do anything
|
|
if c.config.StartupHealthCheckConfig == nil {
|
|
return nil
|
|
}
|
|
|
|
// Race: someone else got here first
|
|
if c.state.StartupHCPassed {
|
|
return nil
|
|
}
|
|
|
|
// Increment the success counter
|
|
c.state.StartupHCSuccessCount++
|
|
|
|
logrus.Debugf("Startup healthcheck for container %s succeeded, success counter now %d", c.ID(), c.state.StartupHCSuccessCount)
|
|
|
|
// Did we exceed threshold?
|
|
recreateTimer := false
|
|
if c.config.StartupHealthCheckConfig.Successes == 0 || c.state.StartupHCSuccessCount >= c.config.StartupHealthCheckConfig.Successes {
|
|
c.state.StartupHCPassed = true
|
|
c.state.StartupHCSuccessCount = 0
|
|
c.state.StartupHCFailureCount = 0
|
|
|
|
recreateTimer = true
|
|
}
|
|
|
|
if err := c.save(); err != nil {
|
|
return err
|
|
}
|
|
|
|
if !recreateTimer {
|
|
return nil
|
|
}
|
|
// This kills the process the healthcheck is running.
|
|
// Which happens to be us.
|
|
// So this has to be last - after this, systemd serves us a
|
|
// SIGTERM and we exit.
|
|
// Special case, via SIGTERM we exit(1) which means systemd logs a failure in the unit.
|
|
// We do not want this as the unit will be leaked on failure states unless "reset-failed"
|
|
// is called. Fundamentally this is expected so switch it to exit 0.
|
|
// NOTE: This is only safe while being called from "podman healthcheck run" which we know
|
|
// is the case here as we should not alter the exit code of another process that just
|
|
// happened to call this.
|
|
shutdown.SetExitCode(0)
|
|
return c.recreateHealthCheckTimer(ctx, false, true)
|
|
}
|
|
|
|
func (c *Container) recreateHealthCheckTimer(ctx context.Context, isStartup bool, isStartupRemoved bool) error {
|
|
logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID())
|
|
|
|
oldUnit := c.state.HCUnitName
|
|
// Create the new, standard healthcheck timer first.
|
|
interval := c.HealthCheckConfig().Interval.String()
|
|
if isStartup {
|
|
interval = c.config.StartupHealthCheckConfig.StartInterval.String()
|
|
}
|
|
|
|
if err := c.createTimer(interval, isStartup); err != nil {
|
|
return fmt.Errorf("recreating container %s (isStartup: %t) healthcheck: %v", c.ID(), isStartup, err)
|
|
}
|
|
if err := c.startTimer(isStartup); err != nil {
|
|
return fmt.Errorf("restarting container %s (isStartup: %t) healthcheck timer: %v", c.ID(), isStartup, err)
|
|
}
|
|
|
|
if err := c.removeTransientFiles(ctx, isStartupRemoved, oldUnit); err != nil {
|
|
return fmt.Errorf("removing container %s healthcheck: %v", c.ID(), err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Increment the current startup healthcheck failure counter.
|
|
// Can restart the container if the HC fails enough times consecutively.
|
|
// NOTE: The caller must lock and sync the container.
|
|
func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) error {
|
|
// We don't have a startup HC, can't do anything
|
|
if c.config.StartupHealthCheckConfig == nil {
|
|
return nil
|
|
}
|
|
|
|
// Race: someone else got here first
|
|
if c.state.StartupHCPassed {
|
|
return nil
|
|
}
|
|
|
|
c.state.StartupHCFailureCount++
|
|
|
|
logrus.Debugf("Startup healthcheck for container %s failed, failure counter now %d", c.ID(), c.state.StartupHCFailureCount)
|
|
|
|
if c.config.StartupHealthCheckConfig.Retries != 0 && c.state.StartupHCFailureCount >= c.config.StartupHealthCheckConfig.Retries {
|
|
logrus.Infof("Restarting container %s as startup healthcheck failed", c.ID())
|
|
// Restart the container
|
|
if err := c.restartWithTimeout(ctx, c.config.StopTimeout); err != nil {
|
|
return fmt.Errorf("restarting container %s after healthcheck failure: %v", c.ID(), err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
return c.save()
|
|
}
|
|
|
|
func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.HealthCheckLog {
|
|
return define.HealthCheckLog{
|
|
Start: start.Format(time.RFC3339Nano),
|
|
End: end.Format(time.RFC3339Nano),
|
|
ExitCode: exitCode,
|
|
Output: log,
|
|
}
|
|
}
|
|
|
|
// updateHealthStatus updates the health status of the container
|
|
// in the healthcheck log
|
|
func (c *Container) updateHealthStatus(status string) error {
|
|
healthCheck, err := c.readHealthCheckLog()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
healthCheck.Status = status
|
|
return c.writeHealthCheckLog(healthCheck)
|
|
}
|
|
|
|
// isUnhealthy returns true if the current health check status is unhealthy.
|
|
func (c *Container) isUnhealthy() (bool, error) {
|
|
if !c.HasHealthCheck() {
|
|
return false, nil
|
|
}
|
|
healthCheck, err := c.readHealthCheckLog()
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
return healthCheck.Status == define.HealthCheckUnhealthy, nil
|
|
}
|
|
|
|
// UpdateHealthCheckLog parses the health check results and writes the log
|
|
// NOTE: The caller must lock the container.
|
|
func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, hcResult define.HealthCheckStatus, inStartPeriod, isStartup bool) (define.HealthCheckResults, error) {
|
|
// If we are playing a kube yaml then let's honor the start period time for
|
|
// both failing and succeeding cases to match kube behavior.
|
|
// So don't update the health check log till the start period is over
|
|
if _, ok := c.config.Spec.Annotations[define.KubeHealthCheckAnnotation]; ok && inStartPeriod && !isStartup {
|
|
return define.HealthCheckResults{}, nil
|
|
}
|
|
|
|
healthCheck, err := c.readHealthCheckLog()
|
|
if err != nil {
|
|
return define.HealthCheckResults{}, err
|
|
}
|
|
if hcl.ExitCode == 0 {
|
|
// set status to healthy, reset failing state to 0
|
|
healthCheck.Status = define.HealthCheckHealthy
|
|
healthCheck.FailingStreak = 0
|
|
} else {
|
|
if len(healthCheck.Status) < 1 {
|
|
healthCheck.Status = define.HealthCheckHealthy
|
|
}
|
|
if hcResult == define.HealthCheckContainerStopped {
|
|
healthCheck.Status = define.HealthCheckStopped
|
|
} else if !inStartPeriod {
|
|
// increment failing streak
|
|
healthCheck.FailingStreak++
|
|
// if failing streak > retries, then status to unhealthy
|
|
if healthCheck.FailingStreak >= c.HealthCheckConfig().Retries {
|
|
healthCheck.Status = define.HealthCheckUnhealthy
|
|
}
|
|
}
|
|
}
|
|
healthCheck.Log = append(healthCheck.Log, hcl)
|
|
if c.HealthCheckMaxLogCount() != 0 && len(healthCheck.Log) > int(c.HealthCheckMaxLogCount()) {
|
|
healthCheck.Log = healthCheck.Log[1:]
|
|
}
|
|
return healthCheck, c.writeHealthCheckLog(healthCheck)
|
|
}
|
|
|
|
func (c *Container) witeToFileHealthCheckResults(path string, result define.HealthCheckResults) error {
|
|
newResults, err := json.Marshal(result)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to marshall healthchecks for writing: %w", err)
|
|
}
|
|
return os.WriteFile(path, newResults, 0700)
|
|
}
|
|
|
|
func (c *Container) getHealthCheckLogDestination() string {
|
|
var destination string
|
|
switch c.HealthCheckLogDestination() {
|
|
case define.DefaultHealthCheckLocalDestination, define.HealthCheckEventsLoggerDestination, "":
|
|
destination = filepath.Join(filepath.Dir(c.state.RunDir), "healthcheck.log")
|
|
default:
|
|
destination = filepath.Join(c.HealthCheckLogDestination(), c.ID()+"-healthcheck.log")
|
|
}
|
|
return destination
|
|
}
|
|
|
|
func (c *Container) writeHealthCheckLog(result define.HealthCheckResults) error {
|
|
return c.witeToFileHealthCheckResults(c.getHealthCheckLogDestination(), result)
|
|
}
|
|
|
|
// readHealthCheckLog read HealthCheck logs from the path or events_logger
|
|
// The caller should lock the container before this function is called.
|
|
func (c *Container) readHealthCheckLog() (define.HealthCheckResults, error) {
|
|
return c.readFromFileHealthCheckLog(c.getHealthCheckLogDestination())
|
|
}
|
|
|
|
// readFromFileHealthCheckLog returns HealthCheck results by reading the container's
|
|
// health check log file. If the health check log file does not exist, then
|
|
// an empty healthcheck struct is returned
|
|
// The caller should lock the container before this function is called.
|
|
func (c *Container) readFromFileHealthCheckLog(path string) (define.HealthCheckResults, error) {
|
|
var healthCheck define.HealthCheckResults
|
|
b, err := os.ReadFile(path)
|
|
if err != nil {
|
|
if errors.Is(err, fs.ErrNotExist) {
|
|
// If the file does not exists just return empty healthcheck and no error.
|
|
return healthCheck, nil
|
|
}
|
|
return healthCheck, fmt.Errorf("failed to read health check log file: %w", err)
|
|
}
|
|
if err := json.Unmarshal(b, &healthCheck); err != nil {
|
|
return healthCheck, fmt.Errorf("failed to unmarshal existing healthcheck results in %s: %w", path, err)
|
|
}
|
|
return healthCheck, nil
|
|
}
|
|
|
|
// HealthCheckStatus returns the current state of a container with a healthcheck.
|
|
// Returns an empty string if no health check is defined for the container.
|
|
func (c *Container) HealthCheckStatus() (string, error) {
|
|
if !c.batched {
|
|
c.lock.Lock()
|
|
defer c.lock.Unlock()
|
|
}
|
|
return c.healthCheckStatus()
|
|
}
|
|
|
|
// Internal function to return the current state of a container with a healthcheck.
|
|
// This function does not lock the container.
|
|
func (c *Container) healthCheckStatus() (string, error) {
|
|
if !c.HasHealthCheck() {
|
|
return "", nil
|
|
}
|
|
|
|
if err := c.syncContainer(); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
results, err := c.readHealthCheckLog()
|
|
if err != nil {
|
|
return "", fmt.Errorf("unable to get healthcheck log for %s: %w", c.ID(), err)
|
|
}
|
|
|
|
return results.Status, nil
|
|
}
|