Merge pull request #6520 from mheon/no_conmon_no_error

Ensure Conmon is alive before waiting for exit file
This commit is contained in:
OpenShift Merge Robot
2020-06-09 10:09:14 +02:00
committed by GitHub
6 changed files with 66 additions and 4 deletions

View File

@ -1209,13 +1209,35 @@ func (c *Container) stop(timeout uint) error {
} }
} }
// Check if conmon is still alive.
// If it is not, we won't be getting an exit file.
conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
if err != nil {
return err
}
if err := c.ociRuntime.StopContainer(c, timeout, all); err != nil { if err := c.ociRuntime.StopContainer(c, timeout, all); err != nil {
return err return err
} }
c.newContainerEvent(events.Stop)
c.state.PID = 0 c.state.PID = 0
c.state.ConmonPID = 0 c.state.ConmonPID = 0
c.state.StoppedByUser = true c.state.StoppedByUser = true
if !conmonAlive {
// Conmon is dead, so we can't expect an exit code.
c.state.ExitCode = -1
c.state.FinishedTime = time.Now()
c.state.State = define.ContainerStateStopped
if err := c.save(); err != nil {
logrus.Errorf("Error saving container %s status: %v", c.ID(), err)
}
return errors.Wrapf(define.ErrConmonDead, "container %s conmon process missing, cannot retrieve exit code", c.ID())
}
if err := c.save(); err != nil { if err := c.save(); err != nil {
return errors.Wrapf(err, "error saving container %s state after stopping", c.ID()) return errors.Wrapf(err, "error saving container %s state after stopping", c.ID())
} }
@ -1225,8 +1247,6 @@ func (c *Container) stop(timeout uint) error {
return err return err
} }
c.newContainerEvent(events.Stop)
return nil return nil
} }

View File

@ -141,6 +141,9 @@ var (
// ErrConmonOutdated indicates the version of conmon found (whether via the configuration or $PATH) // ErrConmonOutdated indicates the version of conmon found (whether via the configuration or $PATH)
// is out of date for the current podman version // is out of date for the current podman version
ErrConmonOutdated = errors.New("outdated conmon version") ErrConmonOutdated = errors.New("outdated conmon version")
// ErrConmonDead indicates that the container's conmon process has been
// killed, preventing normal operation.
ErrConmonDead = errors.New("conmon process killed")
// ErrImageInUse indicates the requested operation failed because the image was in use // ErrImageInUse indicates the requested operation failed because the image was in use
ErrImageInUse = errors.New("image is being used") ErrImageInUse = errors.New("image is being used")

View File

@ -107,6 +107,13 @@ type OCIRuntime interface {
// error. // error.
CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error CheckpointContainer(ctr *Container, options ContainerCheckpointOptions) error
// CheckConmonRunning verifies that the given container's Conmon
// instance is still running. Runtimes without Conmon, or systems where
// the PID of conmon is not available, should mock this as True.
// True indicates that Conmon for the instance is running, False
// indicates it is not.
CheckConmonRunning(ctr *Container) (bool, error)
// SupportsCheckpoint returns whether this OCI runtime // SupportsCheckpoint returns whether this OCI runtime
// implementation supports the CheckpointContainer() operation. // implementation supports the CheckpointContainer() operation.
SupportsCheckpoint() bool SupportsCheckpoint() bool

View File

@ -669,6 +669,31 @@ func (r *ConmonOCIRuntime) CheckpointContainer(ctr *Container, options Container
return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...) return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...)
} }
// CheckConmonRunning reports whether the Conmon process recorded for the
// given container is still alive.
//
// When a Conmon PID is recorded, it is probed with signal 0: a successful
// probe yields (true, nil), ESRCH yields (false, nil), and any other error
// is wrapped and returned alongside false.
//
// When no Conmon PID is recorded, a running or paused container is assumed
// to still have a live Conmon (a warning is logged), and a container in any
// other state is reported as having no Conmon.
func (r *ConmonOCIRuntime) CheckConmonRunning(ctr *Container) (bool, error) {
	conmonPID := ctr.state.ConmonPID

	if conmonPID != 0 {
		// Signal 0 delivers nothing to the target; only the result of
		// the call is of interest here.
		switch pingErr := unix.Kill(conmonPID, 0); pingErr {
		case nil:
			return true, nil
		case unix.ESRCH:
			// No such process: Conmon is gone.
			return false, nil
		default:
			return false, errors.Wrapf(pingErr, "error pinging container %s conmon with signal 0", ctr.ID())
		}
	}

	// No PID was recorded. Some old versions did not store the Conmon PID,
	// so a container that is still running or paused most likely has a
	// live Conmon. That is unusual enough to deserve a warning.
	active := ctr.ensureState(define.ContainerStateRunning, define.ContainerStatePaused)
	if !active {
		// The container is not running, so a missing PID is expected
		// and there is no Conmon.
		return false, nil
	}

	logrus.Warnf("Conmon PID is not set, but container is running!")
	return true, nil
}
// SupportsCheckpoint checks if the OCI runtime supports checkpointing // SupportsCheckpoint checks if the OCI runtime supports checkpointing
// containers. // containers.
func (r *ConmonOCIRuntime) SupportsCheckpoint() bool { func (r *ConmonOCIRuntime) SupportsCheckpoint() bool {

View File

@ -163,6 +163,11 @@ func (r *MissingRuntime) CheckpointContainer(ctr *Container, options ContainerCh
return r.printError() return r.printError()
} }
// CheckConmonRunning cannot be answered because the runtime is missing.
// It always reports false together with the error from printError.
func (r *MissingRuntime) CheckConmonRunning(ctr *Container) (bool, error) {
	missingErr := r.printError()
	return false, missingErr
}
// SupportsCheckpoint returns false as checkpointing requires a working runtime // SupportsCheckpoint returns false as checkpointing requires a working runtime
func (r *MissingRuntime) SupportsCheckpoint() bool { func (r *MissingRuntime) SupportsCheckpoint() bool {
return false return false

View File

@ -464,9 +464,11 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force bool,
} }
} }
// Check that the container's in a good state to be removed // Check that the container's in a good state to be removed.
if c.state.State == define.ContainerStateRunning { if c.state.State == define.ContainerStateRunning {
if err := c.stop(c.StopTimeout()); err != nil { // Ignore ErrConmonDead - we couldn't retrieve the container's
// exit code properly, but it's still stopped.
if err := c.stop(c.StopTimeout()); err != nil && errors.Cause(err) != define.ErrConmonDead {
return errors.Wrapf(err, "cannot remove container %s as it could not be stopped", c.ID()) return errors.Wrapf(err, "cannot remove container %s as it could not be stopped", c.ID())
} }
} }