Merge pull request #16084 from vrothberg/health-check-fix

health checks: make on-failure action retry aware
This commit is contained in:
OpenShift Merge Robot
2022-10-07 19:27:42 +02:00
committed by GitHub
3 changed files with 30 additions and 23 deletions

View File

@ -32,18 +32,19 @@ func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) {
} }
hcStatus, err := checkHealthCheckCanBeRun(container) hcStatus, err := checkHealthCheckCanBeRun(container)
if err == nil { if err != nil {
hcStatus, err := container.runHealthCheck() return hcStatus, err
if err := container.processHealthCheckStatus(hcStatus); err != nil { }
return hcStatus, err
} hcStatus, logStatus, err := container.runHealthCheck()
if err := container.processHealthCheckStatus(logStatus); err != nil {
return hcStatus, err return hcStatus, err
} }
return hcStatus, err return hcStatus, err
} }
// runHealthCheck runs the health check as defined by the container // runHealthCheck runs the health check as defined by the container
func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) { func (c *Container) runHealthCheck() (define.HealthCheckStatus, string, error) {
var ( var (
newCommand []string newCommand []string
returnCode int returnCode int
@ -51,11 +52,11 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) {
) )
hcCommand := c.HealthCheckConfig().Test hcCommand := c.HealthCheckConfig().Test
if len(hcCommand) < 1 { if len(hcCommand) < 1 {
return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID()) return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
} }
switch hcCommand[0] { switch hcCommand[0] {
case "", define.HealthConfigTestNone: case "", define.HealthConfigTestNone:
return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID()) return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
case define.HealthConfigTestCmd: case define.HealthConfigTestCmd:
newCommand = hcCommand[1:] newCommand = hcCommand[1:]
case define.HealthConfigTestCmdShell: case define.HealthConfigTestCmdShell:
@ -66,11 +67,11 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) {
newCommand = hcCommand newCommand = hcCommand
} }
if len(newCommand) < 1 || newCommand[0] == "" { if len(newCommand) < 1 || newCommand[0] == "" {
return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID()) return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
} }
rPipe, wPipe, err := os.Pipe() rPipe, wPipe, err := os.Pipe()
if err != nil { if err != nil {
return define.HealthCheckInternalError, fmt.Errorf("unable to create pipe for healthcheck session: %w", err) return define.HealthCheckInternalError, "", fmt.Errorf("unable to create pipe for healthcheck session: %w", err)
} }
defer wPipe.Close() defer wPipe.Close()
defer rPipe.Close() defer rPipe.Close()
@ -135,15 +136,16 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) {
} }
hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog) hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
if err := c.updateHealthCheckLog(hcl, inStartPeriod); err != nil { logStatus, err := c.updateHealthCheckLog(hcl, inStartPeriod)
return hcResult, fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err) if err != nil {
return hcResult, "", fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err)
} }
return hcResult, hcErr return hcResult, logStatus, hcErr
} }
func (c *Container) processHealthCheckStatus(status define.HealthCheckStatus) error { func (c *Container) processHealthCheckStatus(status string) error {
if status == define.HealthCheckSuccess { if status != define.HealthCheckUnhealthy {
return nil return nil
} }
@ -211,10 +213,13 @@ func (c *Container) updateHealthStatus(status string) error {
} }
// updateHealthCheckLog parses the health check results and writes the log // updateHealthCheckLog parses the health check results and writes the log
func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) error { func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) (string, error) {
c.lock.Lock()
defer c.lock.Unlock()
healthCheck, err := c.getHealthCheckLog() healthCheck, err := c.getHealthCheckLog()
if err != nil { if err != nil {
return err return "", err
} }
if hcl.ExitCode == 0 { if hcl.ExitCode == 0 {
// set status to healthy, reset failing state to 0 // set status to healthy, reset failing state to 0
@ -239,9 +244,9 @@ func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPerio
} }
newResults, err := json.Marshal(healthCheck) newResults, err := json.Marshal(healthCheck)
if err != nil { if err != nil {
return fmt.Errorf("unable to marshall healthchecks for writing: %w", err) return "", fmt.Errorf("unable to marshall healthchecks for writing: %w", err)
} }
return os.WriteFile(c.healthCheckLogPath(), newResults, 0700) return healthCheck.Status, os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
} }
// HealthCheckLogPath returns the path for where the health check log is // HealthCheckLogPath returns the path for where the health check log is

View File

@ -28,10 +28,11 @@ function _check_health {
--health-cmd /healthcheck \ --health-cmd /healthcheck \
--health-interval 1s \ --health-interval 1s \
--health-retries 3 \ --health-retries 3 \
--health-on-failure=kill \
healthcheck_i healthcheck_i
run_podman inspect healthcheck_c --format "{{.Config.HealthcheckOnFailureAction}}" run_podman inspect healthcheck_c --format "{{.Config.HealthcheckOnFailureAction}}"
is "$output" "none" "default on-failure action is none" is "$output" "kill" "on-failure action is set to kill"
# We can't check for 'starting' because a 1-second interval is too # We can't check for 'starting' because a 1-second interval is too
# short; it could run healthcheck before we get to our first check. # short; it could run healthcheck before we get to our first check.
@ -67,9 +68,8 @@ Log[-1].ExitCode | 1
Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\" Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
" "
# healthcheck should now fail, with exit status 1 and 'unhealthy' output # now the on-failure action should kick in and kill the container
run_podman 1 healthcheck run healthcheck_c podman wait healthcheck_c
is "$output" "unhealthy" "output from 'podman healthcheck run'"
# Clean up # Clean up
run_podman rm -t 0 -f healthcheck_c run_podman rm -t 0 -f healthcheck_c
@ -95,6 +95,7 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
# Run that healthcheck image. # Run that healthcheck image.
run_podman run -d --name $ctr \ run_podman run -d --name $ctr \
--health-cmd /healthcheck \ --health-cmd /healthcheck \
--health-retries=1 \
--health-on-failure=$policy \ --health-on-failure=$policy \
$img $img

View File

@ -318,6 +318,7 @@ LISTEN_FDNAMES=listen_fdnames" | sort)
run_podman create --name $cname \ run_podman create --name $cname \
--health-cmd /healthcheck \ --health-cmd /healthcheck \
--health-on-failure=kill \ --health-on-failure=kill \
--health-retries=1 \
--restart=on-failure \ --restart=on-failure \
$img $img