Add basic deadlock detection for container start/remove

We can easily tell if we're going to deadlock by comparing lock
IDs before actually taking the lock. Add a few checks for this in
common places where deadlocks might occur.

This does not yet cover pod operations, where detection is more
difficult (and costly) due to the number of locks being involved
being higher than 2.

Also, add some error wrapping on the Podman side, so we can tell
people to use `system renumber` when it occurs.

Signed-off-by: Matthew Heon <matthew.heon@pm.me>
This commit is contained in:
Matthew Heon
2020-02-23 13:25:12 -05:00
parent 18dcb84d64
commit 4004f646cd
7 changed files with 32 additions and 3 deletions

View File

@ -4,8 +4,10 @@ import (
"fmt" "fmt"
"github.com/containers/libpod/cmd/podman/cliconfig" "github.com/containers/libpod/cmd/podman/cliconfig"
"github.com/containers/libpod/libpod/define"
"github.com/containers/libpod/pkg/adapter" "github.com/containers/libpod/pkg/adapter"
"github.com/pkg/errors" "github.com/pkg/errors"
"github.com/sirupsen/logrus"
"github.com/spf13/cobra" "github.com/spf13/cobra"
) )
@ -77,6 +79,9 @@ func rmCmd(c *cliconfig.RmValues) error {
if len(failures) > 0 { if len(failures) > 0 {
for _, err := range failures { for _, err := range failures {
if errors.Cause(err) == define.ErrWillDeadlock {
logrus.Errorf("Potential deadlock detected - please run 'podman system renumber' to resolve")
}
exitCode = setExitCode(err) exitCode = setExitCode(err)
} }
} }

View File

@ -1401,6 +1401,9 @@ func (c *Container) mountNamedVolume(v *ContainerNamedVolume, mountpoint string)
return nil, errors.Wrapf(err, "error retrieving named volume %s for container %s", v.Name, c.ID()) return nil, errors.Wrapf(err, "error retrieving named volume %s for container %s", v.Name, c.ID())
} }
if vol.config.LockID == c.config.LockID {
return nil, errors.Wrapf(define.ErrWillDeadlock, "container %s and volume %s share lock ID %d", c.ID(), vol.Name(), c.config.LockID)
}
vol.lock.Lock() vol.lock.Lock()
defer vol.lock.Unlock() defer vol.lock.Unlock()
if vol.needsMount() { if vol.needsMount() {

View File

@ -61,6 +61,11 @@ var (
// the user. // the user.
ErrDetach = utils.ErrDetach ErrDetach = utils.ErrDetach
// ErrWillDeadlock indicates that the requested operation will cause a
// deadlock. This is usually caused by upgrade issues, and is resolved
// by renumbering the locks.
ErrWillDeadlock = errors.New("deadlock due to lock mismatch")
// ErrNoCgroups indicates that the container does not have its own // ErrNoCgroups indicates that the container does not have its own
// CGroup. // CGroup.
ErrNoCgroups = errors.New("this container does not have a cgroup") ErrNoCgroups = errors.New("this container does not have a cgroup")

View File

@ -412,6 +412,9 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force bool,
} }
// Lock the pod while we're removing container // Lock the pod while we're removing container
if pod.config.LockID == c.config.LockID {
return errors.Wrapf(define.ErrWillDeadlock, "container %s and pod %s share lock ID %d", c.ID(), pod.ID(), c.config.LockID)
}
pod.lock.Lock() pod.lock.Lock()
defer pod.lock.Unlock() defer pod.lock.Unlock()
if err := pod.updatePod(); err != nil { if err := pod.updatePod(); err != nil {

View File

@ -36,9 +36,6 @@ func (r *Runtime) RemoveVolume(ctx context.Context, v *Volume, force bool) error
} }
} }
v.lock.Lock()
defer v.lock.Unlock()
return r.removeVolume(ctx, v, force) return r.removeVolume(ctx, v, force)
} }

View File

@ -124,6 +124,9 @@ func (r *Runtime) removeVolume(ctx context.Context, v *Volume, force bool) error
return define.ErrVolumeRemoved return define.ErrVolumeRemoved
} }
v.lock.Lock()
defer v.lock.Unlock()
// Update volume status to pick up a potential removal from state // Update volume status to pick up a potential removal from state
if err := v.update(); err != nil { if err := v.update(); err != nil {
return err return err

View File

@ -469,6 +469,10 @@ func (r *LocalRuntime) Run(ctx context.Context, c *cliconfig.RunValues, exitCode
logrus.Debugf("unable to remove container %s after failing to start and attach to it", ctr.ID()) logrus.Debugf("unable to remove container %s after failing to start and attach to it", ctr.ID())
} }
} }
if errors.Cause(err) == define.ErrWillDeadlock {
logrus.Debugf("Deadlock error: %v", err)
return define.ExitCode(err), errors.Errorf("attempting to start container %s would cause a deadlock; please run 'podman system renumber' to resolve", ctr.ID())
}
return define.ExitCode(err), err return define.ExitCode(err), err
} }
@ -702,6 +706,11 @@ func (r *LocalRuntime) Start(ctx context.Context, c *cliconfig.StartValues, sigP
return exitCode, nil return exitCode, nil
} }
if errors.Cause(err) == define.ErrWillDeadlock {
logrus.Debugf("Deadlock error: %v", err)
return define.ExitCode(err), errors.Errorf("attempting to start container %s would cause a deadlock; please run 'podman system renumber' to resolve", ctr.ID())
}
if ctrRunning { if ctrRunning {
return 0, err return 0, err
} }
@ -735,6 +744,10 @@ func (r *LocalRuntime) Start(ctx context.Context, c *cliconfig.StartValues, sigP
if lastError != nil { if lastError != nil {
fmt.Fprintln(os.Stderr, lastError) fmt.Fprintln(os.Stderr, lastError)
} }
if errors.Cause(err) == define.ErrWillDeadlock {
lastError = errors.Wrapf(err, "please run 'podman system renumber' to resolve deadlocks")
continue
}
lastError = errors.Wrapf(err, "unable to start container %q", container) lastError = errors.Wrapf(err, "unable to start container %q", container)
continue continue
} }