mirror of
				https://github.com/containers/podman.git
				synced 2025-10-25 10:16:43 +08:00 
			
		
		
		
	Add support for startup healthchecks
Startup healthchecks are similar to K8S startup probes, in that they are a separate check from the regular healthcheck that runs before it. If the startup healthcheck fails repeatedly, the associated container is restarted. Signed-off-by: Matthew Heon <matthew.heon@pm.me>
This commit is contained in:
		| @ -180,7 +180,7 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions, | ||||
| 		createFlags.StringVar( | ||||
| 			&cf.HealthInterval, | ||||
| 			healthIntervalFlagName, define.DefaultHealthCheckInterval, | ||||
| 			"set an interval for the healthchecks (a value of disable results in no automatic timer setup)", | ||||
| 			"set an interval for the healthcheck (a value of disable results in no automatic timer setup)", | ||||
| 		) | ||||
| 		_ = cmd.RegisterFlagCompletionFunc(healthIntervalFlagName, completion.AutocompleteNone) | ||||
|  | ||||
| @ -428,6 +428,46 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions, | ||||
| 		) | ||||
| 		_ = cmd.RegisterFlagCompletionFunc(secretFlagName, AutocompleteSecrets) | ||||
|  | ||||
| 		startupHCCmdFlagName := "health-startup-cmd" | ||||
| 		createFlags.StringVar( | ||||
| 			&cf.StartupHCCmd, | ||||
| 			startupHCCmdFlagName, "", | ||||
| 			"Set a startup healthcheck command for the container", | ||||
| 		) | ||||
| 		_ = cmd.RegisterFlagCompletionFunc(startupHCCmdFlagName, completion.AutocompleteNone) | ||||
|  | ||||
| 		startupHCIntervalFlagName := "health-startup-interval" | ||||
| 		createFlags.StringVar( | ||||
| 			&cf.StartupHCInterval, | ||||
| 			startupHCIntervalFlagName, define.DefaultHealthCheckInterval, | ||||
| 			"Set an interval for the startup healthcheck", | ||||
| 		) | ||||
| 		_ = cmd.RegisterFlagCompletionFunc(startupHCIntervalFlagName, completion.AutocompleteNone) | ||||
|  | ||||
| 		startupHCRetriesFlagName := "health-startup-retries" | ||||
| 		createFlags.UintVar( | ||||
| 			&cf.StartupHCRetries, | ||||
| 			startupHCRetriesFlagName, 0, | ||||
| 			"Set the maximum number of retries before the startup healthcheck will restart the container", | ||||
| 		) | ||||
| 		_ = cmd.RegisterFlagCompletionFunc(startupHCRetriesFlagName, completion.AutocompleteNone) | ||||
|  | ||||
| 		startupHCSuccessesFlagName := "health-startup-success" | ||||
| 		createFlags.UintVar( | ||||
| 			&cf.StartupHCSuccesses, | ||||
| 			startupHCSuccessesFlagName, 0, | ||||
| 			"Set the number of consecutive successes before the startup healthcheck is marked as successful and the normal healthcheck begins (0 indicates any success will start the regular healthcheck)", | ||||
| 		) | ||||
| 		_ = cmd.RegisterFlagCompletionFunc(startupHCSuccessesFlagName, completion.AutocompleteNone) | ||||
|  | ||||
| 		startupHCTimeoutFlagName := "health-startup-timeout" | ||||
| 		createFlags.StringVar( | ||||
| 			&cf.StartupHCTimeout, | ||||
| 			startupHCTimeoutFlagName, define.DefaultHealthCheckTimeout, | ||||
| 			"Set the maximum amount of time that the startup healthcheck may take before it is considered failed", | ||||
| 		) | ||||
| 		_ = cmd.RegisterFlagCompletionFunc(startupHCTimeoutFlagName, completion.AutocompleteNone) | ||||
|  | ||||
| 		stopSignalFlagName := "stop-signal" | ||||
| 		createFlags.StringVar( | ||||
| 			&cf.StopSignal, | ||||
|  | ||||
| @ -35,7 +35,7 @@ func run(cmd *cobra.Command, args []string) error { | ||||
| 	if err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 	if response.Status == define.HealthCheckUnhealthy { | ||||
| 	if response.Status == define.HealthCheckUnhealthy || response.Status == define.HealthCheckStarting { | ||||
| 		registry.SetExitCode(1) | ||||
| 		fmt.Println(response.Status) | ||||
| 	} | ||||
|  | ||||
							
								
								
									
										11
									
								
								docs/source/markdown/options/health-startup-cmd.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								docs/source/markdown/options/health-startup-cmd.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,11 @@ | ||||
| ####> This option file is used in: | ||||
| ####>   podman create, run | ||||
| ####> If you edit this file, make sure your changes | ||||
| ####> are applicable to all of those. | ||||
| #### **--health-startup-cmd**=*"command"* | *'["command", "arg1", ...]'* | ||||
|  | ||||
| Set a startup healthcheck command for a container. This command will be executed inside the container and is used to gate the regular | ||||
| healthcheck. When the startup command succeeds, the regular healthcheck will begin and the startup healthcheck will cease. Optionally, | ||||
| if the command fails for a set number of attempts, the container will be restarted. A startup healthcheck can be used to ensure that | ||||
| containers with an extended startup period are not marked as unhealthy until they are fully started. Startup healthchecks can only be | ||||
| used when a regular healthcheck (from the container's image or the **--health-cmd** option) is also set. | ||||
							
								
								
									
										7
									
								
								docs/source/markdown/options/health-startup-interval.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								docs/source/markdown/options/health-startup-interval.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,7 @@ | ||||
| ####> This option file is used in: | ||||
| ####>   podman create, run | ||||
| ####> If you edit this file, make sure your changes | ||||
| ####> are applicable to all of those. | ||||
| #### **--health-startup-interval**=*interval* | ||||
|  | ||||
| Set an interval for the startup healthcheck. An _interval_ of **disable** results in no automatic timer setup. The default is **30s**. | ||||
							
								
								
									
										8
									
								
								docs/source/markdown/options/health-startup-retries.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								docs/source/markdown/options/health-startup-retries.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,8 @@ | ||||
| ####> This option file is used in: | ||||
| ####>   podman create, run | ||||
| ####> If you edit this file, make sure your changes | ||||
| ####> are applicable to all of those. | ||||
| #### **--health-startup-retries**=*retries* | ||||
|  | ||||
| The number of attempts allowed before the startup healthcheck restarts the container. If set to **0**, the container will never be | ||||
| restarted. The default is **0**. | ||||
							
								
								
									
										8
									
								
								docs/source/markdown/options/health-startup-success.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								docs/source/markdown/options/health-startup-success.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,8 @@ | ||||
| ####> This option file is used in: | ||||
| ####>   podman create, run | ||||
| ####> If you edit this file, make sure your changes | ||||
| ####> are applicable to all of those. | ||||
| #### **--health-startup-success**=*successes* | ||||
|  | ||||
| The number of successful runs required before the startup healthcheck will succeed and the regular healthcheck will begin. A value | ||||
| of **0** means that any success will begin the regular healthcheck. The default is **0**. | ||||
							
								
								
									
										8
									
								
								docs/source/markdown/options/health-startup-timeout.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								docs/source/markdown/options/health-startup-timeout.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,8 @@ | ||||
| ####> This option file is used in: | ||||
| ####>   podman create, run | ||||
| ####> If you edit this file, make sure your changes | ||||
| ####> are applicable to all of those. | ||||
| #### **--health-startup-timeout**=*timeout* | ||||
|  | ||||
| The maximum time a startup healthcheck command has to complete before it is marked as failed. The value can be expressed in a time | ||||
| format like **2m3s**. The default value is **30s**. | ||||
| @ -172,6 +172,16 @@ See [**Environment**](#environment) note below for precedence and examples. | ||||
|  | ||||
| @@option health-start-period | ||||
|  | ||||
| @@option health-startup-cmd | ||||
|  | ||||
| @@option health-startup-interval | ||||
|  | ||||
| @@option health-startup-retries | ||||
|  | ||||
| @@option health-startup-success | ||||
|  | ||||
| @@option health-startup-timeout | ||||
|  | ||||
| @@option health-timeout | ||||
|  | ||||
| #### **--help** | ||||
|  | ||||
| @ -204,6 +204,16 @@ See [**Environment**](#environment) note below for precedence and examples. | ||||
|  | ||||
| @@option health-start-period | ||||
|  | ||||
| @@option health-startup-cmd | ||||
|  | ||||
| @@option health-startup-interval | ||||
|  | ||||
| @@option health-startup-retries | ||||
|  | ||||
| @@option health-startup-success | ||||
|  | ||||
| @@option health-startup-timeout | ||||
|  | ||||
| @@option health-timeout | ||||
|  | ||||
| #### **--help** | ||||
|  | ||||
| @ -200,6 +200,18 @@ type ContainerState struct { | ||||
| 	// (only by restart policy). | ||||
| 	RestartCount uint `json:"restartCount,omitempty"` | ||||
|  | ||||
| 	// StartupHCPassed indicates that the startup healthcheck has | ||||
| 	// succeeded and the main healthcheck can begin. | ||||
| 	StartupHCPassed bool `json:"startupHCPassed,omitempty"` | ||||
| 	// StartupHCSuccessCount indicates the number of successes of the | ||||
| 	// startup healthcheck. A startup HC can require more than one success | ||||
| 	// to be marked as passed. | ||||
| 	StartupHCSuccessCount int `json:"startupHCSuccessCount,omitempty"` | ||||
| 	// StartupHCFailureCount indicates the number of failures of the startup | ||||
| 	// healthcheck. The container will be restarted if this exceeds a set | ||||
| 	// number in the startup HC config. | ||||
| 	StartupHCFailureCount int `json:"startupHCFailureCount,omitempty"` | ||||
|  | ||||
| 	// ExtensionStageHooks holds hooks which will be executed by libpod | ||||
| 	// and not delegated to the OCI runtime. | ||||
| 	ExtensionStageHooks map[string][]spec.Hook `json:"extensionStageHooks,omitempty"` | ||||
| @ -929,6 +941,20 @@ func (c *Container) StoppedByUser() (bool, error) { | ||||
| 	return c.state.StoppedByUser, nil | ||||
| } | ||||
|  | ||||
| // StartupHCPassed returns whether the container's startup healthcheck passed. | ||||
| func (c *Container) StartupHCPassed() (bool, error) { | ||||
| 	if !c.batched { | ||||
| 		c.lock.Lock() | ||||
| 		defer c.lock.Unlock() | ||||
|  | ||||
| 		if err := c.syncContainer(); err != nil { | ||||
| 			return false, err | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	return c.state.StartupHCPassed, nil | ||||
| } | ||||
|  | ||||
| // Misc Accessors | ||||
| // Most will require locking | ||||
|  | ||||
|  | ||||
| @ -395,6 +395,10 @@ type ContainerMiscConfig struct { | ||||
| 	HealthCheckConfig *manifest.Schema2HealthConfig `json:"healthcheck"` | ||||
| 	// HealthCheckOnFailureAction defines an action to take once the container turns unhealthy. | ||||
| 	HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"healthcheck_on_failure_action"` | ||||
| 	// StartupHealthCheckConfig is the configuration of the startup | ||||
| 	// healthcheck for the container. This will run before the regular HC | ||||
| 	// runs, and when it passes the regular HC will be activated. | ||||
| 	StartupHealthCheckConfig *define.StartupHealthCheck `json:"startupHealthCheck,omitempty"` | ||||
| 	// PreserveFDs is a number of additional file descriptors (in addition | ||||
| 	// to 0, 1, 2) that will be passed to the executed process. The total FDs | ||||
| 	// passed will be 3 + PreserveFDs. | ||||
|  | ||||
| @ -622,6 +622,9 @@ func resetState(state *ContainerState) { | ||||
| 	state.CheckpointPath = "" | ||||
| 	state.CheckpointLog = "" | ||||
| 	state.RestoreLog = "" | ||||
| 	state.StartupHCPassed = false | ||||
| 	state.StartupHCSuccessCount = 0 | ||||
| 	state.StartupHCFailureCount = 0 | ||||
| } | ||||
|  | ||||
| // Refresh refreshes the container's state after a restart. | ||||
| @ -1072,6 +1075,9 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error { | ||||
| 	c.state.State = define.ContainerStateCreated | ||||
| 	c.state.StoppedByUser = false | ||||
| 	c.state.RestartPolicyMatch = false | ||||
| 	c.state.StartupHCFailureCount = 0 | ||||
| 	c.state.StartupHCSuccessCount = 0 | ||||
| 	c.state.StartupHCPassed = false | ||||
|  | ||||
| 	if !retainRetries { | ||||
| 		c.state.RestartCount = 0 | ||||
| @ -1091,7 +1097,11 @@ func (c *Container) init(ctx context.Context, retainRetries bool) error { | ||||
| 	} | ||||
|  | ||||
| 	if c.config.HealthCheckConfig != nil { | ||||
| 		if err := c.createTimer(); err != nil { | ||||
| 		timer := c.config.HealthCheckConfig.Interval.String() | ||||
| 		if c.config.StartupHealthCheckConfig != nil { | ||||
| 			timer = c.config.StartupHealthCheckConfig.Interval.String() | ||||
| 		} | ||||
| 		if err := c.createTimer(timer, c.config.StartupHealthCheckConfig != nil); err != nil { | ||||
| 			logrus.Error(err) | ||||
| 		} | ||||
| 	} | ||||
| @ -1244,7 +1254,7 @@ func (c *Container) start() error { | ||||
| 		if err := c.updateHealthStatus(define.HealthCheckStarting); err != nil { | ||||
| 			logrus.Error(err) | ||||
| 		} | ||||
| 		if err := c.startTimer(); err != nil { | ||||
| 		if err := c.startTimer(c.config.StartupHealthCheckConfig != nil); err != nil { | ||||
| 			logrus.Error(err) | ||||
| 		} | ||||
| 	} | ||||
| @ -1422,7 +1432,7 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr | ||||
| 			return err | ||||
| 		} | ||||
| 		if c.config.HealthCheckConfig != nil { | ||||
| 			if err := c.removeTransientFiles(context.Background()); err != nil { | ||||
| 			if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil { | ||||
| 				logrus.Error(err.Error()) | ||||
| 			} | ||||
| 		} | ||||
| @ -1859,7 +1869,7 @@ func (c *Container) cleanup(ctx context.Context) error { | ||||
|  | ||||
| 	// Remove healthcheck unit/timer file if it exists | ||||
| 	if c.config.HealthCheckConfig != nil { | ||||
| 		if err := c.removeTransientFiles(ctx); err != nil { | ||||
| 		if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil { | ||||
| 			logrus.Errorf("Removing timer for container %s healthcheck: %v", c.ID(), err) | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| @ -156,6 +156,11 @@ func (c *Container) validate() error { | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	// Cannot set startup HC without a healthcheck | ||||
| 	if c.config.HealthCheckConfig == nil && c.config.StartupHealthCheckConfig != nil { | ||||
| 		return fmt.Errorf("cannot set a startup healthcheck when there is no regular healthcheck: %w", define.ErrInvalidArg) | ||||
| 	} | ||||
|  | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -3,6 +3,8 @@ package define | ||||
| import ( | ||||
| 	"fmt" | ||||
| 	"strings" | ||||
|  | ||||
| 	"github.com/containers/image/v5/manifest" | ||||
| ) | ||||
|  | ||||
| const ( | ||||
| @ -38,6 +40,9 @@ const ( | ||||
| 	HealthCheckInternalError HealthCheckStatus = iota | ||||
| 	// HealthCheckDefined means the healthcheck was found on the container | ||||
| 	HealthCheckDefined HealthCheckStatus = iota | ||||
| 	// HealthCheckStartup means the healthcheck was unhealthy, but is still | ||||
| 	// either within the startup HC or the startup period of the healthcheck | ||||
| 	HealthCheckStartup HealthCheckStatus = iota | ||||
| ) | ||||
|  | ||||
| // Healthcheck defaults.  These are used both in the cli as well as in | ||||
| @ -131,3 +136,12 @@ func ParseHealthCheckOnFailureAction(s string) (HealthCheckOnFailureAction, erro | ||||
| 		return HealthCheckOnFailureActionInvalid, err | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // StartupHealthCheck is the configuration of a startup healthcheck. | ||||
| type StartupHealthCheck struct { | ||||
| 	manifest.Schema2HealthConfig | ||||
| 	// Successes are the number of successes required to mark the startup HC | ||||
| 	// as passed. | ||||
| 	// If set to 0, a single success will mark the HC as passed. | ||||
| 	Successes int `json:",omitempty"` | ||||
| } | ||||
|  | ||||
| @ -25,7 +25,7 @@ const ( | ||||
|  | ||||
| // HealthCheck verifies the state and validity of the healthcheck configuration | ||||
| // on the container and then executes the healthcheck | ||||
| func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) { | ||||
| func (r *Runtime) HealthCheck(ctx context.Context, name string) (define.HealthCheckStatus, error) { | ||||
| 	container, err := r.LookupContainer(name) | ||||
| 	if err != nil { | ||||
| 		return define.HealthCheckContainerNotFound, fmt.Errorf("unable to look up %s to perform a health check: %w", name, err) | ||||
| @ -36,21 +36,35 @@ func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) { | ||||
| 		return hcStatus, err | ||||
| 	} | ||||
|  | ||||
| 	hcStatus, logStatus, err := container.runHealthCheck() | ||||
| 	isStartupHC := false | ||||
| 	if container.config.StartupHealthCheckConfig != nil { | ||||
| 		passed, err := container.StartupHCPassed() | ||||
| 		if err != nil { | ||||
| 			return define.HealthCheckInternalError, err | ||||
| 		} | ||||
| 		isStartupHC = !passed | ||||
| 	} | ||||
|  | ||||
| 	hcStatus, logStatus, err := container.runHealthCheck(ctx, isStartupHC) | ||||
| 	if !isStartupHC { | ||||
| 		if err := container.processHealthCheckStatus(logStatus); err != nil { | ||||
| 			return hcStatus, err | ||||
| 		} | ||||
| 	} | ||||
| 	return hcStatus, err | ||||
| } | ||||
|  | ||||
| // runHealthCheck runs the health check as defined by the container | ||||
| func (c *Container) runHealthCheck() (define.HealthCheckStatus, string, error) { | ||||
| func (c *Container) runHealthCheck(ctx context.Context, isStartup bool) (define.HealthCheckStatus, string, error) { | ||||
| 	var ( | ||||
| 		newCommand    []string | ||||
| 		returnCode    int | ||||
| 		inStartPeriod bool | ||||
| 	) | ||||
| 	hcCommand := c.HealthCheckConfig().Test | ||||
| 	if isStartup { | ||||
| 		logrus.Debugf("Running startup healthcheck for container %s", c.ID()) | ||||
| 		hcCommand = c.config.StartupHealthCheckConfig.Test | ||||
| 	} | ||||
| 	if len(hcCommand) < 1 { | ||||
| 		return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID()) | ||||
| 	} | ||||
| @ -113,6 +127,18 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, string, error) { | ||||
| 		hcResult = define.HealthCheckFailure | ||||
| 		returnCode = 1 | ||||
| 	} | ||||
|  | ||||
| 	// Handle startup HC | ||||
| 	if isStartup { | ||||
| 		inStartPeriod = true | ||||
| 		if hcErr != nil || exitCode != 0 { | ||||
| 			hcResult = define.HealthCheckStartup | ||||
| 			c.incrementStartupHCFailureCounter(ctx) | ||||
| 		} else { | ||||
| 			c.incrementStartupHCSuccessCounter(ctx) | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	timeEnd := time.Now() | ||||
| 	if c.HealthCheckConfig().StartPeriod > 0 { | ||||
| 		// there is a start-period we need to honor; we add startPeriod to container start time | ||||
| @ -188,6 +214,114 @@ func checkHealthCheckCanBeRun(c *Container) (define.HealthCheckStatus, error) { | ||||
| 	return define.HealthCheckDefined, nil | ||||
| } | ||||
|  | ||||
| // Increment the current startup healthcheck success counter. | ||||
| // Can stop the startup HC and start the regular HC if the startup HC has enough | ||||
| // consecutive successes. | ||||
| func (c *Container) incrementStartupHCSuccessCounter(ctx context.Context) { | ||||
| 	if !c.batched { | ||||
| 		c.lock.Lock() | ||||
| 		defer c.lock.Unlock() | ||||
|  | ||||
| 		if err := c.syncContainer(); err != nil { | ||||
| 			logrus.Errorf("Error syncing container %s state: %v", c.ID(), err) | ||||
| 			return | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	// We don't have a startup HC, can't do anything | ||||
| 	if c.config.StartupHealthCheckConfig == nil { | ||||
| 		return | ||||
| 	} | ||||
|  | ||||
| 	// Race: someone else got here first | ||||
| 	if c.state.StartupHCPassed { | ||||
| 		return | ||||
| 	} | ||||
|  | ||||
| 	// Increment the success counter | ||||
| 	c.state.StartupHCSuccessCount++ | ||||
|  | ||||
| 	logrus.Debugf("Startup healthcheck for container %s succeeded, success counter now %d", c.ID(), c.state.StartupHCSuccessCount) | ||||
|  | ||||
| 	// Did we exceed threshold? | ||||
| 	recreateTimer := false | ||||
| 	if c.config.StartupHealthCheckConfig.Successes == 0 || c.state.StartupHCSuccessCount >= c.config.StartupHealthCheckConfig.Successes { | ||||
| 		c.state.StartupHCPassed = true | ||||
| 		c.state.StartupHCSuccessCount = 0 | ||||
| 		c.state.StartupHCFailureCount = 0 | ||||
|  | ||||
| 		recreateTimer = true | ||||
| 	} | ||||
|  | ||||
| 	if err := c.save(); err != nil { | ||||
| 		logrus.Errorf("Error saving container %s state: %v", c.ID(), err) | ||||
| 		return | ||||
| 	} | ||||
|  | ||||
| 	if recreateTimer { | ||||
| 		logrus.Infof("Startup healthcheck for container %s passed, recreating timer", c.ID()) | ||||
|  | ||||
| 		// Create the new, standard healthcheck timer first. | ||||
| 		if err := c.createTimer(c.HealthCheckConfig().Interval.String(), false); err != nil { | ||||
| 			logrus.Errorf("Error recreating container %s healthcheck: %v", c.ID(), err) | ||||
| 			return | ||||
| 		} | ||||
| 		if err := c.startTimer(false); err != nil { | ||||
| 			logrus.Errorf("Error restarting container %s healthcheck timer: %v", c.ID(), err) | ||||
| 		} | ||||
|  | ||||
| 		// This kills the process the healthcheck is running. | ||||
| 		// Which happens to be us. | ||||
| 		// So this has to be last - after this, systemd serves us a | ||||
| 		// SIGTERM and we exit. | ||||
| 		if err := c.removeTransientFiles(ctx, true); err != nil { | ||||
| 			logrus.Errorf("Error removing container %s healthcheck: %v", c.ID(), err) | ||||
| 			return | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // Increment the current startup healthcheck failure counter. | ||||
| // Can restart the container if the HC fails enough times consecutively. | ||||
| func (c *Container) incrementStartupHCFailureCounter(ctx context.Context) { | ||||
| 	if !c.batched { | ||||
| 		c.lock.Lock() | ||||
| 		defer c.lock.Unlock() | ||||
|  | ||||
| 		if err := c.syncContainer(); err != nil { | ||||
| 			logrus.Errorf("Error syncing container %s state: %v", c.ID(), err) | ||||
| 			return | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	// We don't have a startup HC, can't do anything | ||||
| 	if c.config.StartupHealthCheckConfig == nil { | ||||
| 		return | ||||
| 	} | ||||
|  | ||||
| 	// Race: someone else got here first | ||||
| 	if c.state.StartupHCPassed { | ||||
| 		return | ||||
| 	} | ||||
|  | ||||
| 	c.state.StartupHCFailureCount++ | ||||
|  | ||||
| 	logrus.Debugf("Startup healthcheck for container %s failed, failure counter now %d", c.ID(), c.state.StartupHCFailureCount) | ||||
|  | ||||
| 	if c.config.StartupHealthCheckConfig.Retries != 0 && c.state.StartupHCFailureCount >= c.config.StartupHealthCheckConfig.Retries { | ||||
| 		logrus.Infof("Restarting container %s as startup healthcheck failed", c.ID()) | ||||
| 		// Restart the container | ||||
| 		if err := c.restartWithTimeout(ctx, c.config.StopTimeout); err != nil { | ||||
| 			logrus.Errorf("Error restarting container %s after healthcheck failure: %v", c.ID(), err) | ||||
| 		} | ||||
| 		return | ||||
| 	} | ||||
|  | ||||
| 	if err := c.save(); err != nil { | ||||
| 		logrus.Errorf("Error saving container %s state: %v", c.ID(), err) | ||||
| 	} | ||||
| } | ||||
|  | ||||
| func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.HealthCheckLog { | ||||
| 	return define.HealthCheckLog{ | ||||
| 		Start:    start.Format(time.RFC3339Nano), | ||||
| @ -299,12 +433,26 @@ func (c *Container) healthCheckStatus() (string, error) { | ||||
| 	return results.Status, nil | ||||
| } | ||||
|  | ||||
| func (c *Container) disableHealthCheckSystemd() bool { | ||||
| func (c *Container) disableHealthCheckSystemd(isStartup bool) bool { | ||||
| 	if os.Getenv("DISABLE_HC_SYSTEMD") == "true" { | ||||
| 		return true | ||||
| 	} | ||||
| 	if isStartup { | ||||
| 		if c.config.StartupHealthCheckConfig.Interval == 0 { | ||||
| 			return true | ||||
| 		} | ||||
| 	} | ||||
| 	if c.config.HealthCheckConfig.Interval == 0 { | ||||
| 		return true | ||||
| 	} | ||||
| 	return false | ||||
| } | ||||
|  | ||||
| // Systemd unit name for the healthcheck systemd unit | ||||
| func (c *Container) hcUnitName(isStartup bool) string { | ||||
| 	unitName := c.ID() | ||||
| 	if isStartup { | ||||
| 		unitName += "-startup" | ||||
| 	} | ||||
| 	return unitName | ||||
| } | ||||
|  | ||||
| @ -14,8 +14,8 @@ import ( | ||||
| ) | ||||
|  | ||||
| // createTimer creates systemd timers for healthchecks of a container | ||||
| func (c *Container) createTimer() error { | ||||
| 	if c.disableHealthCheckSystemd() { | ||||
| func (c *Container) createTimer(interval string, isStartup bool) error { | ||||
| 	if c.disableHealthCheckSystemd(isStartup) { | ||||
| 		return nil | ||||
| 	} | ||||
| 	podman, err := os.Executable() | ||||
| @ -31,7 +31,14 @@ func (c *Container) createTimer() error { | ||||
| 	if path != "" { | ||||
| 		cmd = append(cmd, "--setenv=PATH="+path) | ||||
| 	} | ||||
| 	cmd = append(cmd, "--unit", c.ID(), fmt.Sprintf("--on-unit-inactive=%s", c.HealthCheckConfig().Interval.String()), "--timer-property=AccuracySec=1s", podman, "healthcheck", "run", c.ID()) | ||||
|  | ||||
| 	cmd = append(cmd, "--unit", c.hcUnitName(isStartup), fmt.Sprintf("--on-unit-inactive=%s", interval), "--timer-property=AccuracySec=1s", podman) | ||||
|  | ||||
| 	if logrus.IsLevelEnabled(logrus.DebugLevel) { | ||||
| 		cmd = append(cmd, "--log-level=debug", "--syslog") | ||||
| 	} | ||||
|  | ||||
| 	cmd = append(cmd, "healthcheck", "run", c.ID()) | ||||
|  | ||||
| 	conn, err := systemd.ConnectToDBUS() | ||||
| 	if err != nil { | ||||
| @ -58,8 +65,8 @@ func systemdOpSuccessful(c chan string) error { | ||||
| } | ||||
|  | ||||
| // startTimer starts a systemd timer for the healthchecks | ||||
| func (c *Container) startTimer() error { | ||||
| 	if c.disableHealthCheckSystemd() { | ||||
| func (c *Container) startTimer(isStartup bool) error { | ||||
| 	if c.disableHealthCheckSystemd(isStartup) { | ||||
| 		return nil | ||||
| 	} | ||||
| 	conn, err := systemd.ConnectToDBUS() | ||||
| @ -68,7 +75,7 @@ func (c *Container) startTimer() error { | ||||
| 	} | ||||
| 	defer conn.Close() | ||||
|  | ||||
| 	startFile := fmt.Sprintf("%s.service", c.ID()) | ||||
| 	startFile := fmt.Sprintf("%s.service", c.hcUnitName(isStartup)) | ||||
| 	startChan := make(chan string) | ||||
| 	if _, err := conn.RestartUnitContext(context.Background(), startFile, "fail", startChan); err != nil { | ||||
| 		return err | ||||
| @ -82,8 +89,8 @@ func (c *Container) startTimer() error { | ||||
|  | ||||
| // removeTransientFiles removes the systemd timer and unit files | ||||
| // for the container | ||||
| func (c *Container) removeTransientFiles(ctx context.Context) error { | ||||
| 	if c.disableHealthCheckSystemd() { | ||||
| func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) error { | ||||
| 	if c.disableHealthCheckSystemd(isStartup) { | ||||
| 		return nil | ||||
| 	} | ||||
| 	conn, err := systemd.ConnectToDBUS() | ||||
| @ -99,7 +106,7 @@ func (c *Container) removeTransientFiles(ctx context.Context) error { | ||||
| 	// Stop the timer before the service to make sure the timer does not | ||||
| 	// fire after the service is stopped. | ||||
| 	timerChan := make(chan string) | ||||
| 	timerFile := fmt.Sprintf("%s.timer", c.ID()) | ||||
| 	timerFile := fmt.Sprintf("%s.timer", c.hcUnitName(isStartup)) | ||||
| 	if _, err := conn.StopUnitContext(ctx, timerFile, "fail", timerChan); err != nil { | ||||
| 		if !strings.HasSuffix(err.Error(), ".timer not loaded.") { | ||||
| 			stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err)) | ||||
| @ -111,7 +118,7 @@ func (c *Container) removeTransientFiles(ctx context.Context) error { | ||||
| 	// Reset the service before stopping it to make sure it's being removed | ||||
| 	// on stop. | ||||
| 	serviceChan := make(chan string) | ||||
| 	serviceFile := fmt.Sprintf("%s.service", c.ID()) | ||||
| 	serviceFile := fmt.Sprintf("%s.service", c.hcUnitName(isStartup)) | ||||
| 	if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil { | ||||
| 		logrus.Debugf("Failed to reset unit file: %q", err) | ||||
| 	} | ||||
|  | ||||
| @ -1898,6 +1898,21 @@ func WithInfraConfig(compatibleOptions InfraInherit) CtrCreateOption { | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // WithStartupHealthcheck sets a startup healthcheck for the container. | ||||
| // Requires that a healthcheck must be set. | ||||
| func WithStartupHealthcheck(startupHC *define.StartupHealthCheck) CtrCreateOption { | ||||
| 	return func(ctr *Container) error { | ||||
| 		if ctr.valid { | ||||
| 			return define.ErrCtrFinalized | ||||
| 		} | ||||
| 		ctr.config.StartupHealthCheckConfig = new(define.StartupHealthCheck) | ||||
| 		if err := JSONDeepCopy(startupHC, ctr.config.StartupHealthCheckConfig); err != nil { | ||||
| 			return fmt.Errorf("error copying startup healthcheck into container: %w", err) | ||||
| 		} | ||||
| 		return nil | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // Pod Creation Options | ||||
|  | ||||
| // WithPodCreateCommand adds the full command plus arguments of the current | ||||
|  | ||||
| @ -12,7 +12,7 @@ import ( | ||||
| func RunHealthCheck(w http.ResponseWriter, r *http.Request) { | ||||
| 	runtime := r.Context().Value(api.RuntimeKey).(*libpod.Runtime) | ||||
| 	name := utils.GetName(r) | ||||
| 	status, err := runtime.HealthCheck(name) | ||||
| 	status, err := runtime.HealthCheck(r.Context(), name) | ||||
| 	if err != nil { | ||||
| 		if status == define.HealthCheckContainerNotFound { | ||||
| 			utils.ContainerNotFound(w, name, err) | ||||
| @ -32,6 +32,8 @@ func RunHealthCheck(w http.ResponseWriter, r *http.Request) { | ||||
| 	hcStatus := define.HealthCheckUnhealthy | ||||
| 	if status == define.HealthCheckSuccess { | ||||
| 		hcStatus = define.HealthCheckHealthy | ||||
| 	} else if status == define.HealthCheckStartup { | ||||
| 		hcStatus = define.HealthCheckStarting | ||||
| 	} | ||||
| 	report := define.HealthCheckResults{ | ||||
| 		Status: hcStatus, | ||||
|  | ||||
| @ -260,6 +260,11 @@ type ContainerCreateOptions struct { | ||||
| 	SdNotifyMode       string | ||||
| 	ShmSize            string | ||||
| 	SignaturePolicy    string | ||||
| 	StartupHCCmd       string | ||||
| 	StartupHCInterval  string | ||||
| 	StartupHCRetries   uint | ||||
| 	StartupHCSuccesses uint | ||||
| 	StartupHCTimeout   string | ||||
| 	StopSignal         string | ||||
| 	StopTimeout        uint | ||||
| 	StorageOpts        []string | ||||
| @ -291,7 +296,6 @@ type ContainerCreateOptions struct { | ||||
| 	IsInfra            bool | ||||
| 	IsClone            bool | ||||
| 	DecryptionKeys     []string | ||||
|  | ||||
| 	Net                *NetOptions `json:"net,omitempty"` | ||||
|  | ||||
| 	CgroupConf []string | ||||
|  | ||||
| @ -8,13 +8,15 @@ import ( | ||||
| ) | ||||
|  | ||||
| func (ic *ContainerEngine) HealthCheckRun(ctx context.Context, nameOrID string, options entities.HealthCheckOptions) (*define.HealthCheckResults, error) { | ||||
| 	status, err := ic.Libpod.HealthCheck(nameOrID) | ||||
| 	status, err := ic.Libpod.HealthCheck(ctx, nameOrID) | ||||
| 	if err != nil { | ||||
| 		return nil, err | ||||
| 	} | ||||
| 	hcStatus := define.HealthCheckUnhealthy | ||||
| 	if status == define.HealthCheckSuccess { | ||||
| 		hcStatus = define.HealthCheckHealthy | ||||
| 	} else if status == define.HealthCheckStartup { | ||||
| 		hcStatus = define.HealthCheckStarting | ||||
| 	} | ||||
| 	report := define.HealthCheckResults{ | ||||
| 		Status: hcStatus, | ||||
|  | ||||
| @ -527,6 +527,9 @@ func createContainerOptions(rt *libpod.Runtime, s *specgen.SpecGenerator, pod *l | ||||
| 		options = append(options, libpod.WithHealthCheck(s.ContainerHealthCheckConfig.HealthConfig)) | ||||
| 		logrus.Debugf("New container has a health check") | ||||
| 	} | ||||
| 	if s.ContainerHealthCheckConfig.StartupHealthConfig != nil { | ||||
| 		options = append(options, libpod.WithStartupHealthcheck(s.ContainerHealthCheckConfig.StartupHealthConfig)) | ||||
| 	} | ||||
|  | ||||
| 	if s.ContainerHealthCheckConfig.HealthCheckOnFailureAction != define.HealthCheckOnFailureActionNone { | ||||
| 		options = append(options, libpod.WithHealthCheckOnFailureAction(s.ContainerHealthCheckConfig.HealthCheckOnFailureAction)) | ||||
|  | ||||
| @ -536,6 +536,10 @@ type ContainerResourceConfig struct { | ||||
| // ContainerHealthCheckConfig holds the healthcheck settings of a container: | ||||
| // the regular healthcheck (HealthConfig), the on-failure action | ||||
| // (HealthCheckOnFailureAction) and the optional startup healthcheck | ||||
| // (StartupHealthConfig). Pointer fields and the action are omitted from | ||||
| // JSON when empty. | ||||
| type ContainerHealthCheckConfig struct { | ||||
| 	HealthConfig               *manifest.Schema2HealthConfig     `json:"healthconfig,omitempty"` | ||||
| 	HealthCheckOnFailureAction define.HealthCheckOnFailureAction `json:"health_check_on_failure_action,omitempty"` | ||||
| 	// StartupHealthConfig is the startup healthcheck for a container. | ||||
| 	// Requires that HealthConfig be set. | ||||
| 	// Optional. | ||||
| 	StartupHealthConfig *define.StartupHealthCheck `json:"startupHealthConfig,omitempty"` | ||||
| } | ||||
|  | ||||
| // SpecGenerator creates an OCI spec and Libpod configuration options to create | ||||
|  | ||||
| @ -256,7 +256,7 @@ func FillOutSpecGen(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions | ||||
| 		if c.NoHealthCheck { | ||||
| 			return errors.New("cannot specify both --no-healthcheck and --health-cmd") | ||||
| 		} | ||||
| 		s.HealthConfig, err = makeHealthCheckFromCli(c.HealthCmd, c.HealthInterval, c.HealthRetries, c.HealthTimeout, c.HealthStartPeriod) | ||||
| 		s.HealthConfig, err = makeHealthCheckFromCli(c.HealthCmd, c.HealthInterval, c.HealthRetries, c.HealthTimeout, c.HealthStartPeriod, false) | ||||
| 		if err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| @ -272,6 +272,25 @@ func FillOutSpecGen(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions | ||||
| 	} | ||||
| 	s.HealthCheckOnFailureAction = onFailureAction | ||||
|  | ||||
| 	if c.StartupHCCmd != "" { | ||||
| 		if c.NoHealthCheck { | ||||
| 			return errors.New("cannot specify both --no-healthcheck and --health-startup-cmd") | ||||
| 		} | ||||
| 		// The hardcoded "1s" start period will be discarded, as the | ||||
| 		// startup healthcheck does not have a start period. So just | ||||
| 		// hardcode something that parses correctly. | ||||
| 		tmpHcConfig, err := makeHealthCheckFromCli(c.StartupHCCmd, c.StartupHCInterval, c.StartupHCRetries, c.StartupHCTimeout, "1s", true) | ||||
| 		if err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 		s.StartupHealthConfig = new(define.StartupHealthCheck) | ||||
| 		s.StartupHealthConfig.Test = tmpHcConfig.Test | ||||
| 		s.StartupHealthConfig.Interval = tmpHcConfig.Interval | ||||
| 		s.StartupHealthConfig.Timeout = tmpHcConfig.Timeout | ||||
| 		s.StartupHealthConfig.Retries = tmpHcConfig.Retries | ||||
| 		s.StartupHealthConfig.Successes = int(c.StartupHCSuccesses) | ||||
| 	} | ||||
|  | ||||
| 	if err := setNamespaces(s, c); err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| @ -838,7 +857,7 @@ func FillOutSpecGen(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func makeHealthCheckFromCli(inCmd, interval string, retries uint, timeout, startPeriod string) (*manifest.Schema2HealthConfig, error) { | ||||
| func makeHealthCheckFromCli(inCmd, interval string, retries uint, timeout, startPeriod string, isStartup bool) (*manifest.Schema2HealthConfig, error) { | ||||
| 	cmdArr := []string{} | ||||
| 	isArr := true | ||||
| 	err := json.Unmarshal([]byte(inCmd), &cmdArr) // array unmarshalling | ||||
| @ -886,7 +905,7 @@ func makeHealthCheckFromCli(inCmd, interval string, retries uint, timeout, start | ||||
|  | ||||
| 	hc.Interval = intervalDuration | ||||
|  | ||||
| 	if retries < 1 { | ||||
| 	if retries < 1 && !isStartup { | ||||
| 		return nil, errors.New("healthcheck-retries must be greater than 0") | ||||
| 	} | ||||
| 	hc.Retries = int(retries) | ||||
|  | ||||
| @ -334,4 +334,43 @@ HEALTHCHECK CMD ls -l / 2>&1`, ALPINE) | ||||
| 		// Check to make sure characters were not coerced to utf8 | ||||
| 		Expect(inspect[0].Config.Healthcheck).To(HaveField("Test", []string{"CMD-SHELL", "ls -l / 2>&1"})) | ||||
| 	}) | ||||
|  | ||||
| 	It("Startup healthcheck success transitions to regular healthcheck", func() { | ||||
| 		ctrName := "hcCtr" | ||||
| 		ctrRun := podmanTest.Podman([]string{"run", "-dt", "--name", ctrName, "--health-cmd", "echo regular", "--health-startup-cmd", "cat /test", ALPINE, "top"}) | ||||
| 		ctrRun.WaitWithDefaultTimeout() | ||||
| 		Expect(ctrRun).Should(Exit(0)) | ||||
|  | ||||
| 		inspect := podmanTest.InspectContainer(ctrName) | ||||
| 		Expect(inspect[0].State.Health).To(HaveField("Status", "starting")) | ||||
|  | ||||
| 		hc := podmanTest.Podman([]string{"healthcheck", "run", ctrName}) | ||||
| 		hc.WaitWithDefaultTimeout() | ||||
| 		Expect(hc).Should(Exit(1)) | ||||
|  | ||||
| 		exec := podmanTest.Podman([]string{"exec", ctrName, "sh", "-c", "touch /test && echo startup > /test"}) | ||||
| 		exec.WaitWithDefaultTimeout() | ||||
| 		Expect(exec).Should(Exit(0)) | ||||
|  | ||||
| 		hc = podmanTest.Podman([]string{"healthcheck", "run", ctrName}) | ||||
| 		hc.WaitWithDefaultTimeout() | ||||
| 		Expect(hc).Should(Exit(0)) | ||||
|  | ||||
| 		inspect = podmanTest.InspectContainer(ctrName) | ||||
| 		Expect(inspect[0].State.Health).To(HaveField("Status", define.HealthCheckHealthy)) | ||||
|  | ||||
| 		hc = podmanTest.Podman([]string{"healthcheck", "run", ctrName}) | ||||
| 		hc.WaitWithDefaultTimeout() | ||||
| 		Expect(hc).Should(Exit(0)) | ||||
|  | ||||
| 		inspect = podmanTest.InspectContainer(ctrName) | ||||
| 		Expect(inspect[0].State.Health).To(HaveField("Status", define.HealthCheckHealthy)) | ||||
|  | ||||
| 		// Test podman ps --filter health is working (#11687) | ||||
| 		ps := podmanTest.Podman([]string{"ps", "--filter", "health=healthy"}) | ||||
| 		ps.WaitWithDefaultTimeout() | ||||
| 		Expect(ps).Should(Exit(0)) | ||||
| 		Expect(ps.OutputToStringArray()).To(HaveLen(2)) | ||||
| 		Expect(ps.OutputToString()).To(ContainSubstring("hc")) | ||||
| 	}) | ||||
| }) | ||||
|  | ||||
		Reference in New Issue
	
	Block a user
	 Matthew Heon
					Matthew Heon