container removal: handle already removed containers

Since commit d54478d8eaec, a container's lock is released before
attempting to stop it via the OCI runtime.  This opened the window
for various kinds of race conditions.  One of them led to #9479 where
the removal+cleanup sequences of a `run --rm` session overlapped with
`rm -af`.  Make both execution paths more robust by handling the case of
an already removed container.

Fixes: #9479
Signed-off-by: Valentin Rothberg <rothberg@redhat.com>
This commit is contained in:
Valentin Rothberg
2021-02-23 13:02:35 +01:00
parent 96fc9d983e
commit e5ac28f3b9
3 changed files with 26 additions and 9 deletions

View File

@ -954,18 +954,22 @@ func (c *Container) removeAllExecSessions() error {
}
// Delete all exec sessions
if err := c.runtime.state.RemoveContainerExecSessions(c); err != nil {
if lastErr != nil {
logrus.Errorf("Error stopping container %s exec sessions: %v", c.ID(), lastErr)
if errors.Cause(err) != define.ErrCtrRemoved {
if lastErr != nil {
logrus.Errorf("Error stopping container %s exec sessions: %v", c.ID(), lastErr)
}
lastErr = err
}
lastErr = err
}
c.state.ExecSessions = nil
c.state.LegacyExecSessions = nil
if err := c.save(); err != nil {
if lastErr != nil {
logrus.Errorf("Error stopping container %s exec sessions: %v", c.ID(), lastErr)
if errors.Cause(err) != define.ErrCtrRemoved {
if lastErr != nil {
logrus.Errorf("Error stopping container %s exec sessions: %v", c.ID(), lastErr)
}
lastErr = err
}
lastErr = err
}
return lastErr

View File

@ -319,12 +319,18 @@ func (ic *ContainerEngine) ContainerRm(ctx context.Context, namesOrIds []string,
errMap, err := parallelctr.ContainerOp(ctx, ctrs, func(c *libpod.Container) error {
err := ic.Libpod.RemoveContainer(ctx, c, options.Force, options.Volumes)
if err != nil {
if options.Ignore && errors.Cause(err) == define.ErrNoSuchCtr {
if err == nil {
return nil
}
logrus.Debugf("Failed to remove container %s: %s", c.ID(), err.Error())
switch errors.Cause(err) {
case define.ErrNoSuchCtr:
if options.Ignore {
logrus.Debugf("Ignoring error (--allow-missing): %v", err)
return nil
}
logrus.Debugf("Failed to remove container %s: %s", c.ID(), err.Error())
case define.ErrCtrRemoved:
return nil
}
return err
})

View File

@ -51,6 +51,13 @@ load helpers
run_podman rm $rand $external_cid
}
@test "podman rm <-> run --rm race" {
# A container's lock is released before attempting to stop it. This opens
# the window for race conditions that led to #9479.
run_podman run --rm -d $IMAGE sleep infinity
run_podman rm -af
}
# I'm sorry! This test takes 13 seconds. There's not much I can do about it,
# please know that I think it's justified: podman 1.5.0 had a strange bug
# in with exit status was not preserved on some code paths with 'rm -f'