Add a random suffix to healthcheck unit names

Systemd dislikes it when we rapidly create and remove a transient
unit. Solution: If we change the name every time, it's different
enough that systemd is satisfied and we stop having errors trying
to restart the healthcheck.

Generate a random 32-bit integer, and add it (formatted as hex)
to the end of the unit name to do this. As a result, we now have
to store the unit name in the database, but it does make
backwards compat easy - if the unit name in the DB is empty, we
revert to the old behavior because the timer was created by old
Podman.

Should resolve RHEL-26105

Signed-off-by: Matt Heon <mheon@redhat.com>
This commit is contained in:
Matt Heon
2024-05-03 09:44:28 -04:00
parent 6ec2c0b43b
commit 4fd84190b8
3 changed files with 38 additions and 6 deletions

View File

@ -209,6 +209,9 @@ type ContainerState struct {
// healthcheck. The container will be restarted if this exceed a set // healthcheck. The container will be restarted if this exceed a set
// number in the startup HC config. // number in the startup HC config.
StartupHCFailureCount int `json:"startupHCFailureCount,omitempty"` StartupHCFailureCount int `json:"startupHCFailureCount,omitempty"`
// HCUnitName records the name of the healthcheck unit.
// Automatically generated when the healthcheck is started.
HCUnitName string `json:"hcUnitName,omitempty"`
// ExtensionStageHooks holds hooks which will be executed by libpod // ExtensionStageHooks holds hooks which will be executed by libpod
// and not delegated to the OCI runtime. // and not delegated to the OCI runtime.

View File

@ -661,6 +661,7 @@ func resetContainerState(state *ContainerState) {
state.StartupHCPassed = false state.StartupHCPassed = false
state.StartupHCSuccessCount = 0 state.StartupHCSuccessCount = 0
state.StartupHCFailureCount = 0 state.StartupHCFailureCount = 0
state.HCUnitName = ""
state.NetNS = "" state.NetNS = ""
state.NetworkStatus = nil state.NetworkStatus = nil
} }

View File

@ -5,6 +5,7 @@ package libpod
import ( import (
"context" "context"
"fmt" "fmt"
"math/rand"
"os" "os"
"os/exec" "os/exec"
"strings" "strings"
@ -21,6 +22,9 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
if c.disableHealthCheckSystemd(isStartup) { if c.disableHealthCheckSystemd(isStartup) {
return nil return nil
} }
hcUnitName := c.hcUnitName(isStartup, false)
podman, err := os.Executable() podman, err := os.Executable()
if err != nil { if err != nil {
return fmt.Errorf("failed to get path for podman for a health check timer: %w", err) return fmt.Errorf("failed to get path for podman for a health check timer: %w", err)
@ -35,7 +39,7 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
cmd = append(cmd, "--setenv=PATH="+path) cmd = append(cmd, "--setenv=PATH="+path)
} }
cmd = append(cmd, "--unit", c.hcUnitName(isStartup), fmt.Sprintf("--on-unit-inactive=%s", interval), "--timer-property=AccuracySec=1s", podman) cmd = append(cmd, "--unit", hcUnitName, fmt.Sprintf("--on-unit-inactive=%s", interval), "--timer-property=AccuracySec=1s", podman)
if logrus.IsLevelEnabled(logrus.DebugLevel) { if logrus.IsLevelEnabled(logrus.DebugLevel) {
cmd = append(cmd, "--log-level=debug", "--syslog") cmd = append(cmd, "--log-level=debug", "--syslog")
@ -53,6 +57,12 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
if output, err := systemdRun.CombinedOutput(); err != nil { if output, err := systemdRun.CombinedOutput(); err != nil {
return fmt.Errorf("%s", output) return fmt.Errorf("%s", output)
} }
c.state.HCUnitName = hcUnitName
if err := c.save(); err != nil {
return fmt.Errorf("saving container %s healthcheck unit name: %w", c.ID(), err)
}
return nil return nil
} }
@ -72,13 +82,19 @@ func (c *Container) startTimer(isStartup bool) error {
if c.disableHealthCheckSystemd(isStartup) { if c.disableHealthCheckSystemd(isStartup) {
return nil return nil
} }
hcUnitName := c.state.HCUnitName
if hcUnitName == "" {
hcUnitName = c.hcUnitName(isStartup, true)
}
conn, err := systemd.ConnectToDBUS() conn, err := systemd.ConnectToDBUS()
if err != nil { if err != nil {
return fmt.Errorf("unable to get systemd connection to start healthchecks: %w", err) return fmt.Errorf("unable to get systemd connection to start healthchecks: %w", err)
} }
defer conn.Close() defer conn.Close()
startFile := fmt.Sprintf("%s.service", c.hcUnitName(isStartup)) startFile := fmt.Sprintf("%s.service", hcUnitName)
startChan := make(chan string) startChan := make(chan string)
if _, err := conn.RestartUnitContext(context.Background(), startFile, "fail", startChan); err != nil { if _, err := conn.RestartUnitContext(context.Background(), startFile, "fail", startChan); err != nil {
return err return err
@ -106,10 +122,14 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) er
// clean up as much as possible. // clean up as much as possible.
stopErrors := []error{} stopErrors := []error{}
unitName := c.state.HCUnitName
if unitName == "" {
unitName = c.hcUnitName(isStartup, true)
}
// Stop the timer before the service to make sure the timer does not // Stop the timer before the service to make sure the timer does not
// fire after the service is stopped. // fire after the service is stopped.
timerChan := make(chan string) timerChan := make(chan string)
timerFile := fmt.Sprintf("%s.timer", c.hcUnitName(isStartup)) timerFile := fmt.Sprintf("%s.timer", unitName)
if _, err := conn.StopUnitContext(ctx, timerFile, "ignore-dependencies", timerChan); err != nil { if _, err := conn.StopUnitContext(ctx, timerFile, "ignore-dependencies", timerChan); err != nil {
if !strings.HasSuffix(err.Error(), ".timer not loaded.") { if !strings.HasSuffix(err.Error(), ".timer not loaded.") {
stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err)) stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err))
@ -121,7 +141,7 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) er
// Reset the service before stopping it to make sure it's being removed // Reset the service before stopping it to make sure it's being removed
// on stop. // on stop.
serviceChan := make(chan string) serviceChan := make(chan string)
serviceFile := fmt.Sprintf("%s.service", c.hcUnitName(isStartup)) serviceFile := fmt.Sprintf("%s.service", unitName)
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil { if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
logrus.Debugf("Failed to reset unit file: %q", err) logrus.Debugf("Failed to reset unit file: %q", err)
} }
@ -151,11 +171,19 @@ func (c *Container) disableHealthCheckSystemd(isStartup bool) bool {
return false return false
} }
// Systemd unit name for the healthcheck systemd unit // Systemd unit name for the healthcheck systemd unit.
func (c *Container) hcUnitName(isStartup bool) string { // Bare indicates that a random suffix should not be applied to the name. This
// was default behavior previously, and is used for backwards compatibility.
func (c *Container) hcUnitName(isStartup, bare bool) string {
unitName := c.ID() unitName := c.ID()
if isStartup { if isStartup {
unitName += "-startup" unitName += "-startup"
} }
if !bare {
// Ensure that unit names are unique from run to run by appending
// a random suffix.
// Ref: RH Jira RHEL-26105
unitName += fmt.Sprintf("-%x", rand.Int())
}
return unitName return unitName
} }