Add support for startup healthchecks

Startup healthchecks are similar to K8S startup probes: they are a
separate check that runs before the regular healthcheck begins. If
the startup healthcheck fails repeatedly, the associated container
is restarted.

Signed-off-by: Matthew Heon <matthew.heon@pm.me>
This commit is contained in:
Matthew Heon
2022-04-15 19:22:12 -04:00
parent 935c8eb5ca
commit d16129330d
24 changed files with 551 additions and 147 deletions

View File

@ -180,7 +180,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
createFlags.StringVar( createFlags.StringVar(
&cf.HealthInterval, &cf.HealthInterval,
healthIntervalFlagName, define.DefaultHealthCheckInterval, healthIntervalFlagName, define.DefaultHealthCheckInterval,
"set an interval for the healthchecks (a value of disable results in no automatic timer setup)", "set an interval for the healthcheck (a value of disable results in no automatic timer setup)",
) )
_ = cmd.RegisterFlagCompletionFunc(healthIntervalFlagName, completion.AutocompleteNone) _ = cmd.RegisterFlagCompletionFunc(healthIntervalFlagName, completion.AutocompleteNone)
@ -428,6 +428,46 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
) )
_ = cmd.RegisterFlagCompletionFunc(secretFlagName, AutocompleteSecrets) _ = cmd.RegisterFlagCompletionFunc(secretFlagName, AutocompleteSecrets)
startupHCCmdFlagName := "health-startup-cmd"
createFlags.StringVar(
&cf.StartupHCCmd,
startupHCCmdFlagName, "",
"Set a startup healthcheck command for the container",
)
_ = cmd.RegisterFlagCompletionFunc(startupHCCmdFlagName, completion.AutocompleteNone)
startupHCIntervalFlagName := "health-startup-interval"
createFlags.StringVar(
&cf.StartupHCInterval,
startupHCIntervalFlagName, define.DefaultHealthCheckInterval,
"Set an interval for the startup healthcheck",
)
_ = cmd.RegisterFlagCompletionFunc(startupHCIntervalFlagName, completion.AutocompleteNone)
startupHCRetriesFlagName := "health-startup-retries"
createFlags.UintVar(
&cf.StartupHCRetries,
startupHCRetriesFlagName, 0,
"Set the maximum number of retries before the startup healthcheck will restart the container",
)
_ = cmd.RegisterFlagCompletionFunc(startupHCRetriesFlagName, completion.AutocompleteNone)
startupHCSuccessesFlagName := "health-startup-success"
createFlags.UintVar(
&cf.StartupHCSuccesses,
startupHCSuccessesFlagName, 0,
"Set the number of consecutive successes before the startup healthcheck is marked as successful and the normal healthcheck begins (0 indicates any success will start the regular healthcheck)",
)
_ = cmd.RegisterFlagCompletionFunc(startupHCSuccessesFlagName, completion.AutocompleteNone)
startupHCTimeoutFlagName := "health-startup-timeout"
createFlags.StringVar(
&cf.StartupHCTimeout,
startupHCTimeoutFlagName, define.DefaultHealthCheckTimeout,
"Set the maximum amount of time that the startup healthcheck may take before it is considered failed",
)
_ = cmd.RegisterFlagCompletionFunc(startupHCTimeoutFlagName, completion.AutocompleteNone)
stopSignalFlagName := "stop-signal" stopSignalFlagName := "stop-signal"
createFlags.StringVar( createFlags.StringVar(
&cf.StopSignal, &cf.StopSignal,

View File

@ -35,7 +35,7 @@ func run(cmd *cobra.Command, args []string) error {
if err != nil { if err != nil {
return err return err
} }
if response.Status == define.HealthCheckUnhealthy { if response.Status == define.HealthCheckUnhealthy || response.Status == define.HealthCheckStarting {
registry.SetExitCode(1) registry.SetExitCode(1)
fmt.Println(response.Status) fmt.Println(response.Status)
} }

View File

@ -0,0 +1,11 @@
####> This option file is used in:
####> podman create, run
####> If you edit this file, make sure your changes
####> are applicable to all of those.
#### **--health-startup-cmd**=*"command"* | *'["command", "arg1", ...]'*
Set a startup healthcheck command for a container. This command will be executed inside the container and is used to gate the regular
healthcheck. When the startup command succeeds, the regular healthcheck will begin and the startup healthcheck will cease. Optionally,
if the command fails for a set number of attempts, the container will be restarted. A startup healthcheck can be used to ensure that
containers with an extended startup period are not marked as unhealthy until they are fully started. Startup healthchecks can only be
used when a regular healthcheck (from the container's image or the **--health-cmd** option) is also set.

View File

@ -0,0 +1,7 @@
####> This option file is used in:
####> podman create, run
####> If you edit this file, make sure your changes
####> are applicable to all of those.
#### **--health-startup-interval**=*interval*
Set an interval for the startup healthcheck. An _interval_ of **disable** results in no automatic timer setup. The default is **30s**.

View File

@ -0,0 +1,8 @@
####> This option file is used in:
####> podman create, run
####> If you edit this file, make sure your changes
####> are applicable to all of those.
#### **--health-startup-retries**=*retries*
The number of attempts allowed before the startup healthcheck restarts the container. If set to **0**, the container will never be
restarted. The default is **0**.

View File

@ -0,0 +1,8 @@
####> This option file is used in:
####> podman create, run
####> If you edit this file, make sure your changes
####> are applicable to all of those.
#### **--health-startup-success**=*successes*
The number of successful runs required before the startup healthcheck will succeed and the regular healthcheck will begin. A value
of **0** means that any success will begin the regular healthcheck. The default is **0**.

View File

@ -0,0 +1,8 @@
####> This option file is used in:
####> podman create, run
####> If you edit this file, make sure your changes
####> are applicable to all of those.
#### **--health-startup-timeout**=*timeout*
The maximum time a startup healthcheck command has to complete before it is marked as failed. The value can be expressed in a time
format like **2m3s**. The default value is **30s**.

View File

@ -172,6 +172,16 @@ See [**Environment**](#environment) note below for precedence and examples.
@@option health-start-period @@option health-start-period
@@option health-startup-cmd
@@option health-startup-interval
@@option health-startup-retries
@@option health-startup-success
@@option health-startup-timeout
@@option health-timeout @@option health-timeout
#### **--help** #### **--help**

View File

@ -204,6 +204,16 @@ See [**Environment**](#environment) note below for precedence and examples.
@@option health-start-period @@option health-start-period
@@option health-startup-cmd
@@option health-startup-interval
@@option health-startup-retries
@@option health-startup-success
@@option health-startup-timeout
@@option health-timeout @@option health-timeout
#### **--help** #### **--help**

View File

@ -200,6 +200,18 @@ type ContainerState struct {
// (only by restart policy). // (only by restart policy).
RestartCount uint `json:"restartCount,omitempty"` RestartCount uint `json:"restartCount,omitempty"`
// StartupHCPassed indicates that the startup healthcheck has
// succeeded and the main healthcheck can begin.
StartupHCPassed bool `json:"startupHCPassed,omitempty"`
// StartupHCSuccessCount indicates the number of successes of the
// startup healthcheck. A startup HC can require more than one success
// to be marked as passed.
StartupHCSuccessCount int `json:"startupHCSuccessCount,omitempty"`
// StartupHCFailureCount indicates the number of failures of the startup
// healthcheck. The container will be restarted if this reaches the
// retry limit set in the startup HC config.
StartupHCFailureCount int `json:"startupHCFailureCount,omitempty"`
// ExtensionStageHooks holds hooks which will be executed by libpod // ExtensionStageHooks holds hooks which will be executed by libpod
// and not delegated to the OCI runtime. // and not delegated to the OCI runtime.
ExtensionStageHooks map[string][]spec.Hook `json:"extensionStageHooks,omitempty"` ExtensionStageHooks map[string][]spec.Hook `json:"extensionStageHooks,omitempty"`
@ -929,6 +941,20 @@ func (c *Container) StoppedByUser() (bool, error) {
return c.state.StoppedByUser, nil return c.state.StoppedByUser, nil
} }
// StartupHCPassed reports whether the container's startup healthcheck has
// passed, as recorded in the container state.
//
// When the container is not batched, the container lock is taken and the
// state is synced before the flag is read; a sync failure is returned as the
// error together with false. When the container is batched, the current
// in-memory state is returned as-is without locking or syncing.
func (c *Container) StartupHCPassed() (bool, error) {
	if c.batched {
		return c.state.StartupHCPassed, nil
	}

	c.lock.Lock()
	defer c.lock.Unlock()

	if syncErr := c.syncContainer(); syncErr != nil {
		return false, syncErr
	}

	return c.state.StartupHCPassed, nil
}
// Misc Accessors // Misc Accessors
// Most will require locking // Most will require locking

View File

@ -395,6 +395,10 @@ type ContainerMiscConfig struct {
HealthCheckConfig *manifest.Schema2HealthConfig `json:"healthcheck"` HealthCheckConfig *manifest.Schema2HealthConfig `json:"healthcheck"`
// HealthCheckOnFailureAction defines an action to take once the container turns unhealthy. // HealthCheckOnFailureAction defines an action to take once the container turns unhealthy.
HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"healthcheck_on_failure_action"` HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"healthcheck_on_failure_action"`
// StartupHealthCheckConfig is the configuration of the startup
// healthcheck for the container. This will run before the regular HC
// runs, and when it passes the regular HC will be activated.
StartupHealthCheckConfig *define.StartupHealthCheck `json:"startupHealthCheck,omitempty"`
// PreserveFDs is a number of additional file descriptors (in addition // PreserveFDs is a number of additional file descriptors (in addition
// to 0, 1, 2) that will be passed to the executed process. The total FDs // to 0, 1, 2) that will be passed to the executed process. The total FDs
// passed will be 3 + PreserveFDs. // passed will be 3 + PreserveFDs.

View File

@ -622,6 +622,9 @@ func resetState(state *ContainerState) {
state.CheckpointPath = "" state.CheckpointPath = ""
state.CheckpointLog = "" state.CheckpointLog = ""
state.RestoreLog = "" state.RestoreLog = ""
state.StartupHCPassed = false
state.StartupHCSuccessCount = 0
state.StartupHCFailureCount = 0
} }
// Refresh refreshes the container's state after a restart. // Refresh refreshes the container's state after a restart.
@ -1072,6 +1075,9 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error {
c.state.State = define.ContainerStateCreated c.state.State = define.ContainerStateCreated
c.state.StoppedByUser = false c.state.StoppedByUser = false
c.state.RestartPolicyMatch = false c.state.RestartPolicyMatch = false
c.state.StartupHCFailureCount = 0
c.state.StartupHCSuccessCount = 0
c.state.StartupHCPassed = false
if !retainRetries { if !retainRetries {
c.state.RestartCount = 0 c.state.RestartCount = 0
@ -1091,7 +1097,11 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error {
} }
if c.config.HealthCheckConfig != nil { if c.config.HealthCheckConfig != nil {
if err := c.createTimer(); err != nil { timer := c.config.HealthCheckConfig.Interval.String()
if c.config.StartupHealthCheckConfig != nil {
timer = c.config.StartupHealthCheckConfig.Interval.String()
}
if err := c.createTimer(timer, c.config.StartupHealthCheckConfig != nil); err != nil {
logrus.Error(err) logrus.Error(err)
} }
} }
@ -1244,7 +1254,7 @@ func (c *Container) start() error {
if err := c.updateHealthStatus(define.HealthCheckStarting); err != nil { if err := c.updateHealthStatus(define.HealthCheckStarting); err != nil {
logrus.Error(err) logrus.Error(err)
} }
if err := c.startTimer(); err != nil { if err := c.startTimer(c.config.StartupHealthCheckConfig != nil); err != nil {
logrus.Error(err) logrus.Error(err)
} }
} }
@ -1422,7 +1432,7 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr
return err return err
} }
if c.config.HealthCheckConfig != nil { if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(context.Background()); err != nil { if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
logrus.Error(err.Error()) logrus.Error(err.Error())
} }
} }
@ -1859,7 +1869,7 @@ func (c *Container) cleanup(ctx context.Context) error {
// Remove healthcheck unit/timer file if it execs // Remove healthcheck unit/timer file if it execs
if c.config.HealthCheckConfig != nil { if c.config.HealthCheckConfig != nil {
if err := c.removeTransientFiles(ctx); err != nil { if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
logrus.Errorf("Removing timer for container %s healthcheck: %v", c.ID(), err) logrus.Errorf("Removing timer for container %s healthcheck: %v", c.ID(), err)
} }
} }

View File

@ -156,6 +156,11 @@ func (c *Container) validate() error {
} }
} }
// Cannot set startup HC without a healthcheck
if c.config.HealthCheckConfig == nil && c.config.StartupHealthCheckConfig != nil {
return fmt.Errorf("cannot set a startup healthcheck when there is no regular healthcheck: %w", define.ErrInvalidArg)
}
return nil return nil
} }

View File

@ -3,6 +3,8 @@ package define
import ( import (
"fmt" "fmt"
"strings" "strings"
"github.com/containers/image/v5/manifest"
) )
const ( const (
@ -38,6 +40,9 @@ const (
HealthCheckInternalError HealthCheckStatus = iota HealthCheckInternalError HealthCheckStatus = iota
// HealthCheckDefined means the healthcheck was found on the container // HealthCheckDefined means the healthcheck was found on the container
HealthCheckDefined HealthCheckStatus = iota HealthCheckDefined HealthCheckStatus = iota
// HealthCheckStartup means the healthcheck was unhealthy, but is still
// either within the startup HC or the startup period of the healthcheck
HealthCheckStartup HealthCheckStatus = iota
) )
// Healthcheck defaults. These are used both in the cli as well in // Healthcheck defaults. These are used both in the cli as well in
@ -131,3 +136,12 @@ func ParseHealthCheckOnFailureAction(s string) (HealthCheckOnFailureAction, erro
return HealthCheckOnFailureActionInvalid, err return HealthCheckOnFailureActionInvalid, err
} }
} }
// StartupHealthCheck is the configuration of a startup healthcheck.
// It embeds manifest.Schema2HealthConfig, the same type used for a
// container's regular healthcheck configuration, and adds a success
// threshold on top of it.
type StartupHealthCheck struct {
	manifest.Schema2HealthConfig
	// Successes is the number of successful runs required to mark the
	// startup HC as passed.
	// If set to 0, a single success will mark the HC as passed.
	// The field is left out of the JSON encoding when it is zero.
	Successes int `json:",omitempty"`
}

View File

@ -25,7 +25,7 @@ const (
// HealthCheck verifies the state and validity of the healthcheck configuration // HealthCheck verifies the state and validity of the healthcheck configuration
// on the container and then executes the healthcheck // on the container and then executes the healthcheck
func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) { func (r *Runtime) HealthCheck(ctx context.Context, name string) (define.HealthCheckStatus, error) {
container, err := r.LookupContainer(name) container, err := r.LookupContainer(name)
if err != nil { if err != nil {
return define.HealthCheckContainerNotFound, fmt.Errorf("unable to look up %s to perform a health check: %w", name, err) return define.HealthCheckContainerNotFound, fmt.Errorf("unable to look up %s to perform a health check: %w", name, err)
@ -36,21 +36,35 @@ func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) {
return hcStatus, err return hcStatus, err
} }
hcStatus, logStatus, err := container.runHealthCheck() isStartupHC := false
if err := container.processHealthCheckStatus(logStatus); err != nil { if container.config.StartupHealthCheckConfig != nil {
return hcStatus, err passed, err := container.StartupHCPassed()
if err != nil {
return define.HealthCheckInternalError, err
}
isStartupHC = !passed
}
hcStatus, logStatus, err := container.runHealthCheck(ctx, isStartupHC)
if !isStartupHC {
if err := container.processHealthCheckStatus(logStatus); err != nil {
return hcStatus, err
}
} }
return hcStatus, err return hcStatus, err
} }
// runHealthCheck runs the health check as defined by the container func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.HealthCheckStatus, string, error) {
func (c *Container) runHealthCheck() (define.HealthCheckStatus, string, error) {
var ( var (
newCommand []string newCommand []string
returnCode int returnCode int
inStartPeriod bool inStartPeriod bool
) )
hcCommand := c.HealthCheckConfig().Test hcCommand := c.HealthCheckConfig().Test
if isStartup {
logrus.Debugf("Running startup healthcheck for container %s", c.ID())
hcCommand = c.config.StartupHealthCheckConfig.Test
}
if len(hcCommand) < 1 { if len(hcCommand) < 1 {
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID()) return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
} }
@ -113,6 +127,18 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, string, error) {
hcResult = define.HealthCheckFailure hcResult = define.HealthCheckFailure
returnCode = 1 returnCode = 1
} }
// Handle startup HC
if isStartup {
inStartPeriod = true
if hcErr != nil || exitCode != 0 {
hcResult = define.HealthCheckStartup
c.incrementStartupHCFailureCounter(ctx)
} else {
c.incrementStartupHCSuccessCounter(ctx)
}
}
timeEnd := time.Now() timeEnd := time.Now()
if c.HealthCheckConfig().StartPeriod > 0 { if c.HealthCheckConfig().StartPeriod > 0 {
// there is a start-period we need to honor; we add startPeriod to container start time // there is a start-period we need to honor; we add startPeriod to container start time
@ -188,6 +214,114 @@ func checkHealthCheckCanBeRun(c *Container) (define.HealthCheckStatus, error) {
return define.HealthCheckDefined, nil return define.HealthCheckDefined, nil
} }
// incrementStartupHCSuccessCounter records one successful run of the startup
// healthcheck.
//
// Unless the container is batched, the container lock is taken and the state
// is synced first. The call is a no-op when the container has no startup
// healthcheck configured or when the startup healthcheck has already been
// marked as passed.
//
// Once the success counter reaches the configured Successes value (or
// Successes is 0, meaning any success is enough), the startup healthcheck is
// marked as passed, both startup counters are reset, and the state is saved.
// After that the regular healthcheck timer is created and started, and the
// startup healthcheck timer is removed.
//
// Nothing is returned: every failure along the way is logged.
func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) {
	if !c.batched {
		c.lock.Lock()
		defer c.lock.Unlock()

		if syncErr := c.syncContainer(); syncErr != nil {
			logrus.Errorf("Error syncing container %s state: %v", c.ID(), syncErr)
			return
		}
	}

	startupCfg := c.config.StartupHealthCheckConfig
	switch {
	case startupCfg == nil:
		// No startup HC is configured, so there is nothing to count.
		return
	case c.state.StartupHCPassed:
		// Another run already marked the startup HC as passed.
		return
	}

	c.state.StartupHCSuccessCount++
	logrus.Debugf("Startup healthcheck for container %s succeeded, success counter now %d", c.ID(), c.state.StartupHCSuccessCount)

	// A threshold of 0 means the first success is sufficient.
	thresholdMet := startupCfg.Successes == 0 || c.state.StartupHCSuccessCount >= startupCfg.Successes
	if thresholdMet {
		c.state.StartupHCPassed = true
		c.state.StartupHCSuccessCount = 0
		c.state.StartupHCFailureCount = 0
	}

	// Persist the counters (and the passed flag, if set) before touching any
	// timers.
	if saveErr := c.save(); saveErr != nil {
		logrus.Errorf("Error saving container %s state: %v", c.ID(), saveErr)
		return
	}

	if !thresholdMet {
		return
	}

	logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID())

	// Bring up the regular healthcheck timer before tearing down the startup
	// one.
	if timerErr := c.createTimer(c.HealthCheckConfig().Interval.String(), false); timerErr != nil {
		logrus.Errorf("Error recreating container %s healthcheck: %v", c.ID(), timerErr)
		return
	}
	if startErr := c.startTimer(false); startErr != nil {
		logrus.Errorf("Error restarting container %s healthcheck timer: %v", c.ID(), startErr)
	}

	// Removing the startup unit stops the process running this healthcheck,
	// which is the current process, so it must be the very last step:
	// afterwards systemd sends us SIGTERM and we exit.
	if rmErr := c.removeTransientFiles(ctx, true); rmErr != nil {
		logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), rmErr)
	}
}
// incrementStartupHCFailureCounter records one failed run of the startup
// healthcheck.
//
// Unless the container is batched, the container lock is taken and the state
// is synced first. The call is a no-op when the container has no startup
// healthcheck configured or when the startup healthcheck has already been
// marked as passed.
//
// If Retries is non-zero and the failure counter has reached it, the
// container is restarted using its configured stop timeout and the function
// returns without saving state itself. Otherwise the updated counter is
// saved. A Retries value of 0 means the container is never restarted here.
//
// Nothing is returned: every failure along the way is logged.
func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) {
	if !c.batched {
		c.lock.Lock()
		defer c.lock.Unlock()

		if syncErr := c.syncContainer(); syncErr != nil {
			logrus.Errorf("Error syncing container %s state: %v", c.ID(), syncErr)
			return
		}
	}

	startupCfg := c.config.StartupHealthCheckConfig
	switch {
	case startupCfg == nil:
		// No startup HC is configured, so there is nothing to count.
		return
	case c.state.StartupHCPassed:
		// Another run already marked the startup HC as passed.
		return
	}

	c.state.StartupHCFailureCount++
	logrus.Debugf("Startup healthcheck for container %s failed, failure counter now %d", c.ID(), c.state.StartupHCFailureCount)

	retryLimitHit := startupCfg.Retries != 0 && c.state.StartupHCFailureCount >= startupCfg.Retries
	if !retryLimitHit {
		if saveErr := c.save(); saveErr != nil {
			logrus.Errorf("Error saving container %s state: %v", c.ID(), saveErr)
		}
		return
	}

	logrus.Infof("Restarting container %s as startup healthcheck failed", c.ID())
	if restartErr := c.restartWithTimeout(ctx, c.config.StopTimeout); restartErr != nil {
		logrus.Errorf("Error restarting container %s after healthcheck failure: %v", c.ID(), restartErr)
	}
}
func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.HealthCheckLog { func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.HealthCheckLog {
return define.HealthCheckLog{ return define.HealthCheckLog{
Start: start.Format(time.RFC3339Nano), Start: start.Format(time.RFC3339Nano),
@ -299,12 +433,26 @@ func (c *Container) healthCheckStatus() (string, error) {
return results.Status, nil return results.Status, nil
} }
func (c *Container) disableHealthCheckSystemd() bool { func (c *Container) disableHealthCheckSystemd(isStartup bool) bool {
if os.Getenv("DISABLE_HC_SYSTEMD") == "true" { if os.Getenv("DISABLE_HC_SYSTEMD") == "true" {
return true return true
} }
if isStartup {
if c.config.StartupHealthCheckConfig.Interval == 0 {
return true
}
}
if c.config.HealthCheckConfig.Interval == 0 { if c.config.HealthCheckConfig.Interval == 0 {
return true return true
} }
return false return false
} }
// hcUnitName returns the base name of the systemd unit used for this
// container's healthcheck: the container ID for the regular healthcheck, or
// the container ID with a "-startup" suffix when isStartup is true. Callers
// append the ".service" or ".timer" extension themselves.
func (c *Container) hcUnitName(isStartup bool) string {
	id := c.ID()
	if !isStartup {
		return id
	}
	return id + "-startup"
}

View File

@ -14,8 +14,8 @@ import (
) )
// createTimer systemd timers for healthchecks of a container // createTimer systemd timers for healthchecks of a container
func (c *Container) createTimer() error { func (c *Container) createTimer(interval string, isStartup bool) error {
if c.disableHealthCheckSystemd() { if c.disableHealthCheckSystemd(isStartup) {
return nil return nil
} }
podman, err := os.Executable() podman, err := os.Executable()
@ -31,7 +31,14 @@ func (c *Container) createTimer() error {
if path != "" { if path != "" {
cmd = append(cmd, "--setenv=PATH="+path) cmd = append(cmd, "--setenv=PATH="+path)
} }
cmd = append(cmd, "--unit", c.ID(), fmt.Sprintf("--on-unit-inactive=%s", c.HealthCheckConfig().Interval.String()), "--timer-property=AccuracySec=1s", podman, "healthcheck", "run", c.ID())
cmd = append(cmd, "--unit", c.hcUnitName(isStartup), fmt.Sprintf("--on-unit-inactive=%s", interval), "--timer-property=AccuracySec=1s", podman)
if logrus.IsLevelEnabled(logrus.DebugLevel) {
cmd = append(cmd, "--log-level=debug", "--syslog")
}
cmd = append(cmd, "healthcheck", "run", c.ID())
conn, err := systemd.ConnectToDBUS() conn, err := systemd.ConnectToDBUS()
if err != nil { if err != nil {
@ -58,8 +65,8 @@ func systemdOpSuccessful(c chan string) error {
} }
// startTimer starts a systemd timer for the healthchecks // startTimer starts a systemd timer for the healthchecks
func (c *Container) startTimer() error { func (c *Container) startTimer(isStartup bool) error {
if c.disableHealthCheckSystemd() { if c.disableHealthCheckSystemd(isStartup) {
return nil return nil
} }
conn, err := systemd.ConnectToDBUS() conn, err := systemd.ConnectToDBUS()
@ -68,7 +75,7 @@ func (c *Container) startTimer() error {
} }
defer conn.Close() defer conn.Close()
startFile := fmt.Sprintf("%s.service", c.ID()) startFile := fmt.Sprintf("%s.service", c.hcUnitName(isStartup))
startChan := make(chan string) startChan := make(chan string)
if _, err := conn.RestartUnitContext(context.Background(), startFile, "fail", startChan); err != nil { if _, err := conn.RestartUnitContext(context.Background(), startFile, "fail", startChan); err != nil {
return err return err
@ -82,8 +89,8 @@ func (c *Container) startTimer() error {
// removeTransientFiles removes the systemd timer and unit files // removeTransientFiles removes the systemd timer and unit files
// for the container // for the container
func (c *Container) removeTransientFiles(ctx context.Context) error { func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) error {
if c.disableHealthCheckSystemd() { if c.disableHealthCheckSystemd(isStartup) {
return nil return nil
} }
conn, err := systemd.ConnectToDBUS() conn, err := systemd.ConnectToDBUS()
@ -99,7 +106,7 @@ func (c *Container) removeTransientFiles(ctx context.Context) error {
// Stop the timer before the service to make sure the timer does not // Stop the timer before the service to make sure the timer does not
// fire after the service is stopped. // fire after the service is stopped.
timerChan := make(chan string) timerChan := make(chan string)
timerFile := fmt.Sprintf("%s.timer", c.ID()) timerFile := fmt.Sprintf("%s.timer", c.hcUnitName(isStartup))
if _, err := conn.StopUnitContext(ctx, timerFile, "fail", timerChan); err != nil { if _, err := conn.StopUnitContext(ctx, timerFile, "fail", timerChan); err != nil {
if !strings.HasSuffix(err.Error(), ".timer not loaded.") { if !strings.HasSuffix(err.Error(), ".timer not loaded.") {
stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err)) stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err))
@ -111,7 +118,7 @@ func (c *Container) removeTransientFiles(ctx context.Context) error {
// Reset the service before stopping it to make sure it's being removed // Reset the service before stopping it to make sure it's being removed
// on stop. // on stop.
serviceChan := make(chan string) serviceChan := make(chan string)
serviceFile := fmt.Sprintf("%s.service", c.ID()) serviceFile := fmt.Sprintf("%s.service", c.hcUnitName(isStartup))
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil { if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
logrus.Debugf("Failed to reset unit file: %q", err) logrus.Debugf("Failed to reset unit file: %q", err)
} }

View File

@ -1898,6 +1898,21 @@ func WithInfraConfig(compatibleOptions InfraInherit) CtrCreateOption {
} }
} }
// WithStartupHealthcheck returns a container creation option that sets the
// container's startup healthcheck configuration.
//
// The given configuration is deep-copied via JSONDeepCopy, so the container
// does not share the caller's struct. The option fails with
// define.ErrCtrFinalized if the container is already valid, and with a
// wrapped error if the copy fails.
//
// A startup healthcheck is only accepted on a container that also has a
// regular healthcheck; that rule is enforced by container validation, not by
// this option.
func WithStartupHealthcheck(startupHC *define.StartupHealthCheck) CtrCreateOption {
	return func(ctr *Container) error {
		if ctr.valid {
			return define.ErrCtrFinalized
		}

		// Install the destination struct on the config first, then fill it
		// in place.
		dst := new(define.StartupHealthCheck)
		ctr.config.StartupHealthCheckConfig = dst

		copyErr := JSONDeepCopy(startupHC, dst)
		if copyErr != nil {
			return fmt.Errorf("error copying startup healthcheck into container: %w", copyErr)
		}

		return nil
	}
}
// Pod Creation Options // Pod Creation Options
// WithPodCreateCommand adds the full command plus arguments of the current // WithPodCreateCommand adds the full command plus arguments of the current

View File

@ -12,7 +12,7 @@ import (
func RunHealthCheck(w http.ResponseWriter, r *http.Request) { func RunHealthCheck(w http.ResponseWriter, r *http.Request) {
runtime := r.Context().Value(api.RuntimeKey).(*libpod.Runtime) runtime := r.Context().Value(api.RuntimeKey).(*libpod.Runtime)
name := utils.GetName(r) name := utils.GetName(r)
status, err := runtime.HealthCheck(name) status, err := runtime.HealthCheck(r.Context(), name)
if err != nil { if err != nil {
if status == define.HealthCheckContainerNotFound { if status == define.HealthCheckContainerNotFound {
utils.ContainerNotFound(w, name, err) utils.ContainerNotFound(w, name, err)
@ -32,6 +32,8 @@ func RunHealthCheck(w http.ResponseWriter, r *http.Request) {
hcStatus := define.HealthCheckUnhealthy hcStatus := define.HealthCheckUnhealthy
if status == define.HealthCheckSuccess { if status == define.HealthCheckSuccess {
hcStatus = define.HealthCheckHealthy hcStatus = define.HealthCheckHealthy
} else if status == define.HealthCheckStartup {
hcStatus = define.HealthCheckStarting
} }
report := define.HealthCheckResults{ report := define.HealthCheckResults{
Status: hcStatus, Status: hcStatus,

View File

@ -174,125 +174,129 @@ const (
) )
type ContainerCreateOptions struct { type ContainerCreateOptions struct {
Annotation []string Annotation []string
Attach []string Attach []string
Authfile string Authfile string
BlkIOWeight string BlkIOWeight string
BlkIOWeightDevice []string BlkIOWeightDevice []string
CapAdd []string CapAdd []string
CapDrop []string CapDrop []string
CgroupNS string CgroupNS string
CgroupsMode string CgroupsMode string
CgroupParent string `json:"cgroup_parent,omitempty"` CgroupParent string `json:"cgroup_parent,omitempty"`
CIDFile string CIDFile string
ConmonPIDFile string `json:"container_conmon_pidfile,omitempty"` ConmonPIDFile string `json:"container_conmon_pidfile,omitempty"`
CPUPeriod uint64 CPUPeriod uint64
CPUQuota int64 CPUQuota int64
CPURTPeriod uint64 CPURTPeriod uint64
CPURTRuntime int64 CPURTRuntime int64
CPUShares uint64 CPUShares uint64
CPUS float64 `json:"cpus,omitempty"` CPUS float64 `json:"cpus,omitempty"`
CPUSetCPUs string `json:"cpuset_cpus,omitempty"` CPUSetCPUs string `json:"cpuset_cpus,omitempty"`
CPUSetMems string CPUSetMems string
Devices []string `json:"devices,omitempty"` Devices []string `json:"devices,omitempty"`
DeviceCgroupRule []string DeviceCgroupRule []string
DeviceReadBPs []string `json:"device_read_bps,omitempty"` DeviceReadBPs []string `json:"device_read_bps,omitempty"`
DeviceReadIOPs []string DeviceReadIOPs []string
DeviceWriteBPs []string DeviceWriteBPs []string
DeviceWriteIOPs []string DeviceWriteIOPs []string
Entrypoint *string `json:"container_command,omitempty"` Entrypoint *string `json:"container_command,omitempty"`
Env []string Env []string
EnvHost bool EnvHost bool
EnvFile []string EnvFile []string
Expose []string Expose []string
GIDMap []string GIDMap []string
GroupAdd []string GroupAdd []string
HealthCmd string HealthCmd string
HealthInterval string HealthInterval string
HealthRetries uint HealthRetries uint
HealthStartPeriod string HealthStartPeriod string
HealthTimeout string HealthTimeout string
HealthOnFailure string HealthOnFailure string
Hostname string `json:"hostname,omitempty"` Hostname string `json:"hostname,omitempty"`
HTTPProxy bool HTTPProxy bool
HostUsers []string HostUsers []string
ImageVolume string ImageVolume string
Init bool Init bool
InitContainerType string InitContainerType string
InitPath string InitPath string
Interactive bool Interactive bool
IPC string IPC string
Label []string Label []string
LabelFile []string LabelFile []string
LogDriver string LogDriver string
LogOptions []string LogOptions []string
Memory string Memory string
MemoryReservation string MemoryReservation string
MemorySwap string MemorySwap string
MemorySwappiness int64 MemorySwappiness int64
Name string `json:"container_name"` Name string `json:"container_name"`
NoHealthCheck bool NoHealthCheck bool
OOMKillDisable bool OOMKillDisable bool
OOMScoreAdj *int OOMScoreAdj *int
Arch string Arch string
OS string OS string
Variant string Variant string
PID string `json:"pid,omitempty"` PID string `json:"pid,omitempty"`
PIDsLimit *int64 PIDsLimit *int64
Platform string Platform string
Pod string Pod string
PodIDFile string PodIDFile string
Personality string Personality string
PreserveFDs uint PreserveFDs uint
Privileged bool Privileged bool
PublishAll bool PublishAll bool
Pull string Pull string
Quiet bool Quiet bool
ReadOnly bool ReadOnly bool
ReadOnlyTmpFS bool ReadOnlyTmpFS bool
Restart string Restart string
Replace bool Replace bool
Requires []string Requires []string
Rm bool Rm bool
RootFS bool RootFS bool
Secrets []string Secrets []string
SecurityOpt []string `json:"security_opt,omitempty"` SecurityOpt []string `json:"security_opt,omitempty"`
SdNotifyMode string SdNotifyMode string
ShmSize string ShmSize string
SignaturePolicy string SignaturePolicy string
StopSignal string StartupHCCmd string
StopTimeout uint StartupHCInterval string
StorageOpts []string StartupHCRetries uint
SubUIDName string StartupHCSuccesses uint
SubGIDName string StartupHCTimeout string
Sysctl []string `json:"sysctl,omitempty"` StopSignal string
Systemd string StopTimeout uint
Timeout uint StorageOpts []string
TLSVerify commonFlag.OptionalBool SubUIDName string
TmpFS []string SubGIDName string
TTY bool Sysctl []string `json:"sysctl,omitempty"`
Timezone string Systemd string
Umask string Timeout uint
EnvMerge []string TLSVerify commonFlag.OptionalBool
UnsetEnv []string TmpFS []string
UnsetEnvAll bool TTY bool
UIDMap []string Timezone string
Ulimit []string Umask string
User string EnvMerge []string
UserNS string `json:"-"` UnsetEnv []string
UTS string UnsetEnvAll bool
Mount []string UIDMap []string
Volume []string `json:"volume,omitempty"` Ulimit []string
VolumesFrom []string `json:"volumes_from,omitempty"` User string
Workdir string UserNS string `json:"-"`
SeccompPolicy string UTS string
PidFile string Mount []string
ChrootDirs []string Volume []string `json:"volume,omitempty"`
IsInfra bool VolumesFrom []string `json:"volumes_from,omitempty"`
IsClone bool Workdir string
DecryptionKeys []string SeccompPolicy string
PidFile string
Net *NetOptions `json:"net,omitempty"` ChrootDirs []string
IsInfra bool
IsClone bool
DecryptionKeys []string
Net *NetOptions `json:"net,omitempty"`
CgroupConf []string CgroupConf []string

View File

@ -8,13 +8,15 @@ import (
) )
func (ic *ContainerEngine) HealthCheckRun(ctx context.Context, nameOrID string, options entities.HealthCheckOptions) (*define.HealthCheckResults, error) { func (ic *ContainerEngine) HealthCheckRun(ctx context.Context, nameOrID string, options entities.HealthCheckOptions) (*define.HealthCheckResults, error) {
status, err := ic.Libpod.HealthCheck(nameOrID) status, err := ic.Libpod.HealthCheck(ctx, nameOrID)
if err != nil { if err != nil {
return nil, err return nil, err
} }
hcStatus := define.HealthCheckUnhealthy hcStatus := define.HealthCheckUnhealthy
if status == define.HealthCheckSuccess { if status == define.HealthCheckSuccess {
hcStatus = define.HealthCheckHealthy hcStatus = define.HealthCheckHealthy
} else if status == define.HealthCheckStartup {
hcStatus = define.HealthCheckStarting
} }
report := define.HealthCheckResults{ report := define.HealthCheckResults{
Status: hcStatus, Status: hcStatus,

View File

@ -527,6 +527,9 @@ func createContainerOptions(rt *libpod.Runtime, s *specgen.SpecGenerator, pod *l
options = append(options, libpod.WithHealthCheck(s.ContainerHealthCheckConfig.HealthConfig)) options = append(options, libpod.WithHealthCheck(s.ContainerHealthCheckConfig.HealthConfig))
logrus.Debugf("New container has a health check") logrus.Debugf("New container has a health check")
} }
if s.ContainerHealthCheckConfig.StartupHealthConfig != nil {
options = append(options, libpod.WithStartupHealthcheck(s.ContainerHealthCheckConfig.StartupHealthConfig))
}
if s.ContainerHealthCheckConfig.HealthCheckOnFailureAction != define.HealthCheckOnFailureActionNone { if s.ContainerHealthCheckConfig.HealthCheckOnFailureAction != define.HealthCheckOnFailureActionNone {
options = append(options, libpod.WithHealthCheckOnFailureAction(s.ContainerHealthCheckConfig.HealthCheckOnFailureAction)) options = append(options, libpod.WithHealthCheckOnFailureAction(s.ContainerHealthCheckConfig.HealthCheckOnFailureAction))

View File

@ -536,6 +536,10 @@ type ContainerResourceConfig struct {
type ContainerHealthCheckConfig struct { type ContainerHealthCheckConfig struct {
HealthConfig *manifest.Schema2HealthConfig `json:"healthconfig,omitempty"` HealthConfig *manifest.Schema2HealthConfig `json:"healthconfig,omitempty"`
HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"health_check_on_failure_action,omitempty"` HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"health_check_on_failure_action,omitempty"`
// Startup healthcheck for a container.
// Requires that HealthConfig be set.
// Optional.
StartupHealthConfig *define.StartupHealthCheck `json:"startupHealthConfig,omitempty"`
} }
// SpecGenerator creates an OCI spec and Libpod configuration options to create // SpecGenerator creates an OCI spec and Libpod configuration options to create

View File

@ -256,7 +256,7 @@ func FillOutSpecGen(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions
if c.NoHealthCheck { if c.NoHealthCheck {
return errors.New("cannot specify both --no-healthcheck and --health-cmd") return errors.New("cannot specify both --no-healthcheck and --health-cmd")
} }
s.HealthConfig, err = makeHealthCheckFromCli(c.HealthCmd, c.HealthInterval, c.HealthRetries, c.HealthTimeout, c.HealthStartPeriod) s.HealthConfig, err = makeHealthCheckFromCli(c.HealthCmd, c.HealthInterval, c.HealthRetries, c.HealthTimeout, c.HealthStartPeriod, false)
if err != nil { if err != nil {
return err return err
} }
@ -272,6 +272,25 @@ func FillOutSpecGen(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions
} }
s.HealthCheckOnFailureAction = onFailureAction s.HealthCheckOnFailureAction = onFailureAction
if c.StartupHCCmd != "" {
if c.NoHealthCheck {
return errors.New("cannot specify both --no-healthcheck and --health-startup-cmd")
}
// The hardcoded "1s" start period will be discarded, as the startup
// healthcheck does not have a start period. So just hardcode
// something that parses correctly.
tmpHcConfig, err := makeHealthCheckFromCli(c.StartupHCCmd, c.StartupHCInterval, c.StartupHCRetries, c.StartupHCTimeout, "1s", true)
if err != nil {
return err
}
s.StartupHealthConfig = new(define.StartupHealthCheck)
s.StartupHealthConfig.Test = tmpHcConfig.Test
s.StartupHealthConfig.Interval = tmpHcConfig.Interval
s.StartupHealthConfig.Timeout = tmpHcConfig.Timeout
s.StartupHealthConfig.Retries = tmpHcConfig.Retries
s.StartupHealthConfig.Successes = int(c.StartupHCSuccesses)
}
if err := setNamespaces(s, c); err != nil { if err := setNamespaces(s, c); err != nil {
return err return err
} }
@ -838,7 +857,7 @@ func FillOutSpecGen(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions
return nil return nil
} }
func makeHealthCheckFromCli(inCmd, interval string, retries uint, timeout, startPeriod string) (*manifest.Schema2HealthConfig, error) { func makeHealthCheckFromCli(inCmd, interval string, retries uint, timeout, startPeriod string, isStartup bool) (*manifest.Schema2HealthConfig, error) {
cmdArr := []string{} cmdArr := []string{}
isArr := true isArr := true
err := json.Unmarshal([]byte(inCmd), &cmdArr) // array unmarshalling err := json.Unmarshal([]byte(inCmd), &cmdArr) // array unmarshalling
@ -886,7 +905,7 @@ func makeHealthCheckFromCli(inCmd, interval string, retries uint, timeout, start
hc.Interval = intervalDuration hc.Interval = intervalDuration
if retries < 1 { if retries < 1 && !isStartup {
return nil, errors.New("healthcheck-retries must be greater than 0") return nil, errors.New("healthcheck-retries must be greater than 0")
} }
hc.Retries = int(retries) hc.Retries = int(retries)

View File

@ -334,4 +334,43 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE)
// Check to make sure characters were not coerced to utf8 // Check to make sure characters were not coerced to utf8
Expect(inspect[0].Config.Healthcheck).To(HaveField("Test", []string{"CMD-SHELL", "ls -l / 2>&1"})) Expect(inspect[0].Config.Healthcheck).To(HaveField("Test", []string{"CMD-SHELL", "ls -l / 2>&1"}))
}) })
// Checks that a container created with both a regular and a startup
// healthcheck reports "starting" while the startup check fails, and is
// reported healthy by `podman healthcheck run` once the startup check passes.
It("Startup healthcheck success transitions to regular healthcheck", func() {
ctrName := "hcCtr"
// The startup healthcheck is `cat /test`; the assertions below expect it to
// fail at first (presumably /test is absent from the image - verify).
ctrRun := podmanTest.Podman([]string{"run", "-dt", "--name", ctrName, "--health-cmd", "echo regular", "--health-startup-cmd", "cat /test", ALPINE, "top"})
ctrRun.WaitWithDefaultTimeout()
Expect(ctrRun).Should(Exit(0))
inspect := podmanTest.InspectContainer(ctrName)
Expect(inspect[0].State.Health).To(HaveField("Status", "starting"))
// A manual healthcheck run is expected to exit 1 before /test is created.
hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName})
hc.WaitWithDefaultTimeout()
Expect(hc).Should(Exit(1))
// Create /test inside the container so that `cat /test` can succeed.
exec := podmanTest.Podman([]string{"exec", ctrName, "sh", "-c", "touch /test && echo startup > /test"})
exec.WaitWithDefaultTimeout()
Expect(exec).Should(Exit(0))
// With /test in place, the next healthcheck run must exit 0 and the
// container must be reported as healthy.
hc = podmanTest.Podman([]string{"healthcheck", "run", ctrName})
hc.WaitWithDefaultTimeout()
Expect(hc).Should(Exit(0))
inspect = podmanTest.InspectContainer(ctrName)
Expect(inspect[0].State.Health).To(HaveField("Status", define.HealthCheckHealthy))
// A further run must also succeed and leave the status at healthy.
hc = podmanTest.Podman([]string{"healthcheck", "run", ctrName})
hc.WaitWithDefaultTimeout()
Expect(hc).Should(Exit(0))
inspect = podmanTest.InspectContainer(ctrName)
Expect(inspect[0].State.Health).To(HaveField("Status", define.HealthCheckHealthy))
// Test podman ps --filter health is working (#11687)
// Two output lines are expected (presumably the header row plus this
// container - verify against the ps output format).
ps := podmanTest.Podman([]string{"ps", "--filter", "health=healthy"})
ps.WaitWithDefaultTimeout()
Expect(ps).Should(Exit(0))
Expect(ps.OutputToStringArray()).To(HaveLen(2))
Expect(ps.OutputToString()).To(ContainSubstring("hc"))
})
}) })