healthcheck: do not leak service on failed stop

We reset the failed unit to not leak it, however we did so before
stopping, this is wrong because when the stop fails we will again have a
failed unit. The correct thing is to reset after the stop because once
it is stopped it cannot create new errors.

I found this using the following reproducer and this is enough to fix
it:
```
while :; do
cid=$(podman run -d --name foo --health-cmd /home/podman/healthcheck \
	--health-startup-cmd /home/podman/healthcheck \
	quay.io/libpod/testimage:20241011 /home/podman/pause)
podman healthcheck run $cid
podman rm -fa
sleep 2
systemctl --user list-units --failed | grep $cid && break
done
```

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
Paul Holzinger
2024-10-25 13:59:56 +02:00
parent 6069cdda00
commit c0f4e2ce67

View File

@ -137,13 +137,8 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, un
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check timer %q: %w", timerFile, err))
}
// Reset the service before stopping it to make sure it's being removed
// on stop.
serviceChan := make(chan string)
serviceFile := fmt.Sprintf("%s.service", unitName)
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
logrus.Debugf("Failed to reset unit file: %q", err)
}
if _, err := conn.StopUnitContext(ctx, serviceFile, "ignore-dependencies", serviceChan); err != nil {
if !strings.HasSuffix(err.Error(), ".service not loaded.") {
stopErrors = append(stopErrors, fmt.Errorf("removing health-check service %q: %w", serviceFile, err))
@ -151,6 +146,12 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, un
} else if err := systemdOpSuccessful(serviceChan); err != nil {
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check service %q: %w", serviceFile, err))
}
// Reset the service after stopping it to make sure it's being removed, systemd keep failed transient services
// around in its state. We do not care about the error and we need to ensure to reset the state so we do not
// leak resources forever.
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
logrus.Debugf("Failed to reset unit file: %q", err)
}
return errorhandling.JoinErrors(stopErrors)
}