mirror of
https://github.com/containers/podman.git
synced 2025-10-20 12:43:58 +08:00
[v4.4.1-rhel] fix --health-on-failure=restart in transient unit
As described in #17777, the `restart` on-failure action did not behave correctly when the health check is being run by a transient systemd unit. It ran just fine when being executed outside such a unit, for instance, manually or, as done in the system tests, in a scripted fashion. There were two issues causing the `restart` on-failure action to misbehave: 1) The transient systemd units used the default `KillMode=cgroup` which will nuke all processes in the specific cgroup including the recently restarted container/conmon once the main `podman healthcheck run` process exits. 2) Podman attempted to remove the transient systemd unit and timer during restart. That is perfectly fine when manually restarting the container but not when the restart itself is being executed inside such a transient unit. Ultimately, Podman tried to shoot itself in the foot. Fix both issues by moving the restart logic into the cleanup process. Instead of restarting the container, the `healthcheck run` will just stop the container and the cleanup process will restart the container once it has turned unhealthy. Backport of commit 95634154303f5b8c3d5c92820e2a3545c54f0bc8. Fixes: #17777 Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2180125 Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2180126 Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
This commit is contained in:
@ -228,6 +228,15 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error {
|
||||
}
|
||||
|
||||
func (c *Container) shouldRestart() bool {
|
||||
if c.config.HealthCheckOnFailureAction == define.HealthCheckOnFailureActionRestart {
|
||||
isUnhealthy, err := c.isUnhealthy()
|
||||
if err != nil {
|
||||
logrus.Errorf("Checking if container is unhealthy: %v", err)
|
||||
} else if isUnhealthy {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// If we did not get a restart policy match, return false
|
||||
// Do the same if we're not a policy that restarts.
|
||||
if !c.state.RestartPolicyMatch ||
|
||||
@ -267,6 +276,12 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err
|
||||
return false, err
|
||||
}
|
||||
|
||||
if c.config.HealthCheckConfig != nil {
|
||||
if err := c.removeTransientFiles(ctx, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
|
||||
return false, err
|
||||
}
|
||||
}
|
||||
|
||||
// Is the container running again?
|
||||
// If so, we don't have to do anything
|
||||
if c.ensureState(define.ContainerStateRunning, define.ContainerStatePaused) {
|
||||
@ -1431,6 +1446,7 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr
|
||||
if err := c.stop(timeout); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if c.config.HealthCheckConfig != nil {
|
||||
if err := c.removeTransientFiles(context.Background(), c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed); err != nil {
|
||||
logrus.Error(err.Error())
|
||||
|
Reference in New Issue
Block a user