mirror of
https://github.com/containers/podman.git
synced 2025-05-17 15:18:43 +08:00
healthcheck: do not leak service on failed stop
We reset the failed unit to not leak it, however we did so before stopping, this is wrong because when the stop fails we will again have a failed unit. The correct thing is to reset after the stop because once it is stopped it cannot create new errors. I found this using the following reproducer and this is enough to fix it: ``` while :; do cid=$(podman run -d --name foo --health-cmd /home/podman/healthcheck \ --health-startup-cmd /home/podman/healthcheck \ quay.io/libpod/testimage:20241011 /home/podman/pause) podman healthcheck run $cid podman rm -fa sleep 2 systemctl --user list-units --failed | grep $cid && break done ``` Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
@ -137,13 +137,8 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, un
|
||||
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check timer %q: %w", timerFile, err))
|
||||
}
|
||||
|
||||
// Reset the service before stopping it to make sure it's being removed
|
||||
// on stop.
|
||||
serviceChan := make(chan string)
|
||||
serviceFile := fmt.Sprintf("%s.service", unitName)
|
||||
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
|
||||
logrus.Debugf("Failed to reset unit file: %q", err)
|
||||
}
|
||||
if _, err := conn.StopUnitContext(ctx, serviceFile, "ignore-dependencies", serviceChan); err != nil {
|
||||
if !strings.HasSuffix(err.Error(), ".service not loaded.") {
|
||||
stopErrors = append(stopErrors, fmt.Errorf("removing health-check service %q: %w", serviceFile, err))
|
||||
@ -151,6 +146,12 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, un
|
||||
} else if err := systemdOpSuccessful(serviceChan); err != nil {
|
||||
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check service %q: %w", serviceFile, err))
|
||||
}
|
||||
// Reset the service after stopping it to make sure it's being removed, systemd keep failed transient services
|
||||
// around in its state. We do not care about the error and we need to ensure to reset the state so we do not
|
||||
// leak resources forever.
|
||||
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
|
||||
logrus.Debugf("Failed to reset unit file: %q", err)
|
||||
}
|
||||
|
||||
return errorhandling.JoinErrors(stopErrors)
|
||||
}
|
||||
|
Reference in New Issue
Block a user