mirror of
https://github.com/containers/podman.git
synced 2025-05-21 00:56:36 +08:00
Add a random suffix to healthcheck unit names
Systemd dislikes it when we rapidly create and remove a transient unit. Solution: If we change the name every time, it's different enough that systemd is satisfied and we stop having errors trying to restart the healthcheck. Generate a random 32-bit integer, and add it (formatted as hex) to the end of the unit name to do this. As a result, we now have to store the unit name in the database, but it does make backwards compat easy - if the unit name in the DB is empty, we revert to the old behavior because the timer was created by old Podman. Should resolve RHEL-26105 Signed-off-by: Matt Heon <mheon@redhat.com>
This commit is contained in:
@ -209,6 +209,9 @@ type ContainerState struct {
|
|||||||
// healthcheck. The container will be restarted if this exceed a set
|
// healthcheck. The container will be restarted if this exceed a set
|
||||||
// number in the startup HC config.
|
// number in the startup HC config.
|
||||||
StartupHCFailureCount int `json:"startupHCFailureCount,omitempty"`
|
StartupHCFailureCount int `json:"startupHCFailureCount,omitempty"`
|
||||||
|
// HCUnitName records the name of the healthcheck unit.
|
||||||
|
// Automatically generated when the healthcheck is started.
|
||||||
|
HCUnitName string `json:"hcUnitName,omitempty"`
|
||||||
|
|
||||||
// ExtensionStageHooks holds hooks which will be executed by libpod
|
// ExtensionStageHooks holds hooks which will be executed by libpod
|
||||||
// and not delegated to the OCI runtime.
|
// and not delegated to the OCI runtime.
|
||||||
|
@ -661,6 +661,7 @@ func resetContainerState(state *ContainerState) {
|
|||||||
state.StartupHCPassed = false
|
state.StartupHCPassed = false
|
||||||
state.StartupHCSuccessCount = 0
|
state.StartupHCSuccessCount = 0
|
||||||
state.StartupHCFailureCount = 0
|
state.StartupHCFailureCount = 0
|
||||||
|
state.HCUnitName = ""
|
||||||
state.NetNS = ""
|
state.NetNS = ""
|
||||||
state.NetworkStatus = nil
|
state.NetworkStatus = nil
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,7 @@ package libpod
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math/rand"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
@ -21,6 +22,9 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
|
|||||||
if c.disableHealthCheckSystemd(isStartup) {
|
if c.disableHealthCheckSystemd(isStartup) {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
hcUnitName := c.hcUnitName(isStartup, false)
|
||||||
|
|
||||||
podman, err := os.Executable()
|
podman, err := os.Executable()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to get path for podman for a health check timer: %w", err)
|
return fmt.Errorf("failed to get path for podman for a health check timer: %w", err)
|
||||||
@ -35,7 +39,7 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
|
|||||||
cmd = append(cmd, "--setenv=PATH="+path)
|
cmd = append(cmd, "--setenv=PATH="+path)
|
||||||
}
|
}
|
||||||
|
|
||||||
cmd = append(cmd, "--unit", c.hcUnitName(isStartup), fmt.Sprintf("--on-unit-inactive=%s", interval), "--timer-property=AccuracySec=1s", podman)
|
cmd = append(cmd, "--unit", hcUnitName, fmt.Sprintf("--on-unit-inactive=%s", interval), "--timer-property=AccuracySec=1s", podman)
|
||||||
|
|
||||||
if logrus.IsLevelEnabled(logrus.DebugLevel) {
|
if logrus.IsLevelEnabled(logrus.DebugLevel) {
|
||||||
cmd = append(cmd, "--log-level=debug", "--syslog")
|
cmd = append(cmd, "--log-level=debug", "--syslog")
|
||||||
@ -53,6 +57,12 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
|
|||||||
if output, err := systemdRun.CombinedOutput(); err != nil {
|
if output, err := systemdRun.CombinedOutput(); err != nil {
|
||||||
return fmt.Errorf("%s", output)
|
return fmt.Errorf("%s", output)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
c.state.HCUnitName = hcUnitName
|
||||||
|
if err := c.save(); err != nil {
|
||||||
|
return fmt.Errorf("saving container %s healthcheck unit name: %w", c.ID(), err)
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,13 +82,19 @@ func (c *Container) startTimer(isStartup bool) error {
|
|||||||
if c.disableHealthCheckSystemd(isStartup) {
|
if c.disableHealthCheckSystemd(isStartup) {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
hcUnitName := c.state.HCUnitName
|
||||||
|
if hcUnitName == "" {
|
||||||
|
hcUnitName = c.hcUnitName(isStartup, true)
|
||||||
|
}
|
||||||
|
|
||||||
conn, err := systemd.ConnectToDBUS()
|
conn, err := systemd.ConnectToDBUS()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("unable to get systemd connection to start healthchecks: %w", err)
|
return fmt.Errorf("unable to get systemd connection to start healthchecks: %w", err)
|
||||||
}
|
}
|
||||||
defer conn.Close()
|
defer conn.Close()
|
||||||
|
|
||||||
startFile := fmt.Sprintf("%s.service", c.hcUnitName(isStartup))
|
startFile := fmt.Sprintf("%s.service", hcUnitName)
|
||||||
startChan := make(chan string)
|
startChan := make(chan string)
|
||||||
if _, err := conn.RestartUnitContext(context.Background(), startFile, "fail", startChan); err != nil {
|
if _, err := conn.RestartUnitContext(context.Background(), startFile, "fail", startChan); err != nil {
|
||||||
return err
|
return err
|
||||||
@ -106,10 +122,14 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) er
|
|||||||
// clean up as much as possible.
|
// clean up as much as possible.
|
||||||
stopErrors := []error{}
|
stopErrors := []error{}
|
||||||
|
|
||||||
|
unitName := c.state.HCUnitName
|
||||||
|
if unitName == "" {
|
||||||
|
unitName = c.hcUnitName(isStartup, true)
|
||||||
|
}
|
||||||
// Stop the timer before the service to make sure the timer does not
|
// Stop the timer before the service to make sure the timer does not
|
||||||
// fire after the service is stopped.
|
// fire after the service is stopped.
|
||||||
timerChan := make(chan string)
|
timerChan := make(chan string)
|
||||||
timerFile := fmt.Sprintf("%s.timer", c.hcUnitName(isStartup))
|
timerFile := fmt.Sprintf("%s.timer", unitName)
|
||||||
if _, err := conn.StopUnitContext(ctx, timerFile, "ignore-dependencies", timerChan); err != nil {
|
if _, err := conn.StopUnitContext(ctx, timerFile, "ignore-dependencies", timerChan); err != nil {
|
||||||
if !strings.HasSuffix(err.Error(), ".timer not loaded.") {
|
if !strings.HasSuffix(err.Error(), ".timer not loaded.") {
|
||||||
stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err))
|
stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err))
|
||||||
@ -121,7 +141,7 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) er
|
|||||||
// Reset the service before stopping it to make sure it's being removed
|
// Reset the service before stopping it to make sure it's being removed
|
||||||
// on stop.
|
// on stop.
|
||||||
serviceChan := make(chan string)
|
serviceChan := make(chan string)
|
||||||
serviceFile := fmt.Sprintf("%s.service", c.hcUnitName(isStartup))
|
serviceFile := fmt.Sprintf("%s.service", unitName)
|
||||||
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
|
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
|
||||||
logrus.Debugf("Failed to reset unit file: %q", err)
|
logrus.Debugf("Failed to reset unit file: %q", err)
|
||||||
}
|
}
|
||||||
@ -151,11 +171,19 @@ func (c *Container) disableHealthCheckSystemd(isStartup bool) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
// Systemd unit name for the healthcheck systemd unit
|
// Systemd unit name for the healthcheck systemd unit.
|
||||||
func (c *Container) hcUnitName(isStartup bool) string {
|
// Bare indicates that a random suffix should not be applied to the name. This
|
||||||
|
// was default behavior previously, and is used for backwards compatibility.
|
||||||
|
func (c *Container) hcUnitName(isStartup, bare bool) string {
|
||||||
unitName := c.ID()
|
unitName := c.ID()
|
||||||
if isStartup {
|
if isStartup {
|
||||||
unitName += "-startup"
|
unitName += "-startup"
|
||||||
}
|
}
|
||||||
|
if !bare {
|
||||||
|
// Ensure that unit names are unique from run to run by appending
|
||||||
|
// a random suffix.
|
||||||
|
// Ref: RH Jira RHEL-26105
|
||||||
|
unitName += fmt.Sprintf("-%x", rand.Int())
|
||||||
|
}
|
||||||
return unitName
|
return unitName
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user