cleanup: add new --stopped-only option

The podman container cleanup process runs asynchronous and by the time
it gets the lock it is possible another podman process already did the
cleanup and then did a new init() to start it again. If the cleanup
process gets the lock there it will cause very weird things.

This can be observed in the remote start API as CI flakes.

Fixes #23754

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
Paul Holzinger
2024-08-27 14:54:31 +02:00
parent bf74797c69
commit a89fef6e2a
6 changed files with 20 additions and 7 deletions

View File

@ -53,6 +53,11 @@ func init() {
flags.BoolVar(&cleanupOptions.Remove, "rm", false, "After cleanup, remove the container entirely") flags.BoolVar(&cleanupOptions.Remove, "rm", false, "After cleanup, remove the container entirely")
flags.BoolVar(&cleanupOptions.RemoveImage, "rmi", false, "After cleanup, remove the image entirely") flags.BoolVar(&cleanupOptions.RemoveImage, "rmi", false, "After cleanup, remove the image entirely")
stoppedOnlyFlag := "stopped-only"
flags.BoolVar(&cleanupOptions.StoppedOnly, stoppedOnlyFlag, false, "Only cleanup when the container is in the stopped state")
_ = flags.MarkHidden(stoppedOnlyFlag)
validate.AddLatestFlag(cleanupCommand, &cleanupOptions.Latest) validate.AddLatestFlag(cleanupCommand, &cleanupOptions.Latest)
} }

View File

@ -785,8 +785,10 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou
} }
// Cleanup unmounts all mount points in container and cleans up container storage // Cleanup unmounts all mount points in container and cleans up container storage
// It also cleans up the network stack // It also cleans up the network stack.
func (c *Container) Cleanup(ctx context.Context) error { // onlyStopped is set by the podman container cleanup to ensure we only cleanup a stopped container,
// all other states mean another process already called cleanup before us which is fine in such cases.
func (c *Container) Cleanup(ctx context.Context, onlyStopped bool) error {
if !c.batched { if !c.batched {
c.lock.Lock() c.lock.Lock()
defer c.lock.Unlock() defer c.lock.Unlock()
@ -808,6 +810,9 @@ func (c *Container) Cleanup(ctx context.Context) error {
if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated, define.ContainerStateStopped, define.ContainerStateStopping, define.ContainerStateExited) { if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated, define.ContainerStateStopped, define.ContainerStateStopping, define.ContainerStateExited) {
return fmt.Errorf("container %s is running or paused, refusing to clean up: %w", c.ID(), define.ErrCtrStateInvalid) return fmt.Errorf("container %s is running or paused, refusing to clean up: %w", c.ID(), define.ErrCtrStateInvalid)
} }
if onlyStopped && !c.ensureState(define.ContainerStateStopped) {
return fmt.Errorf("container %s is not stopped and only cleanup for a stopped container was requested: %w", c.ID(), define.ErrCtrStateInvalid)
}
// if the container was not created in the oci runtime or was already cleaned up, then do nothing // if the container was not created in the oci runtime or was already cleaned up, then do nothing
if c.ensureState(define.ContainerStateConfigured, define.ContainerStateExited) { if c.ensureState(define.ContainerStateConfigured, define.ContainerStateExited) {

View File

@ -180,7 +180,7 @@ func (p *Pod) stopWithTimeout(ctx context.Context, cleanup bool, timeout int) (m
} }
if cleanup { if cleanup {
err := c.Cleanup(ctx) err := c.Cleanup(ctx, false)
if err != nil && !errors.Is(err, define.ErrCtrStateInvalid) && !errors.Is(err, define.ErrCtrStopped) { if err != nil && !errors.Is(err, define.ErrCtrStateInvalid) && !errors.Is(err, define.ErrCtrStopped) {
return err return err
} }
@ -299,7 +299,7 @@ func (p *Pod) Cleanup(ctx context.Context) (map[string]error, error) {
c := ctr c := ctr
logrus.Debugf("Adding parallel job to clean up container %s", c.ID()) logrus.Debugf("Adding parallel job to clean up container %s", c.ID())
retChan := parallel.Enqueue(ctx, func() error { retChan := parallel.Enqueue(ctx, func() error {
return c.Cleanup(ctx) return c.Cleanup(ctx, false)
}) })
ctrErrChan[c.ID()] = retChan ctrErrChan[c.ID()] = retChan

View File

@ -371,6 +371,7 @@ type ContainerCleanupOptions struct {
Latest bool Latest bool
Remove bool Remove bool
RemoveImage bool RemoveImage bool
StoppedOnly bool // Only cleanup if the container is stopped, i.e. was running before
} }
// ContainerCleanupReport describes the response from a // ContainerCleanupReport describes the response from a

View File

@ -313,7 +313,7 @@ func (ic *ContainerEngine) ContainerStop(ctx context.Context, namesOrIds []strin
} }
} }
} else { } else {
if err = c.Cleanup(ctx); err != nil { if err = c.Cleanup(ctx, false); err != nil {
// The container could still have been removed, as we unlocked // The container could still have been removed, as we unlocked
// after we stopped it. // after we stopped it.
if errors.Is(err, define.ErrNoSuchCtr) || errors.Is(err, define.ErrCtrRemoved) { if errors.Is(err, define.ErrNoSuchCtr) || errors.Is(err, define.ErrCtrRemoved) {
@ -1320,7 +1320,7 @@ func (ic *ContainerEngine) ContainerCleanup(ctx context.Context, namesOrIds []st
report.RmErr = fmt.Errorf("failed to clean up and remove container %v: %w", ctr.ID(), err) report.RmErr = fmt.Errorf("failed to clean up and remove container %v: %w", ctr.ID(), err)
} }
} else { } else {
err := ctr.Cleanup(ctx) err := ctr.Cleanup(ctx, options.StoppedOnly)
// ignore error if ctr is removed or cannot be cleaned up, likely the ctr was already restarted by another process // ignore error if ctr is removed or cannot be cleaned up, likely the ctr was already restarted by another process
if err != nil && !errors.Is(err, define.ErrNoSuchCtr) && !errors.Is(err, define.ErrCtrStateInvalid) { if err != nil && !errors.Is(err, define.ErrNoSuchCtr) && !errors.Is(err, define.ErrCtrStateInvalid) {
report.CleanErr = fmt.Errorf("failed to clean up container %v: %w", ctr.ID(), err) report.CleanErr = fmt.Errorf("failed to clean up container %v: %w", ctr.ID(), err)

View File

@ -311,7 +311,9 @@ func CreateExitCommandArgs(storageConfig storageTypes.StoreOptions, config *conf
command = append(command, "--module", module) command = append(command, "--module", module)
} }
command = append(command, []string{"container", "cleanup"}...) // --stopped-only is used to ensure we only cleanup stopped containers and do not race
// against other processes that did a cleanup() + init() again before we had the chance to run
command = append(command, []string{"container", "cleanup", "--stopped-only"}...)
if rm { if rm {
command = append(command, "--rm") command = append(command, "--rm")