diff --git a/libpod/container_internal.go b/libpod/container_internal.go
index 004f8a5578..7f909b78db 100644
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -1326,10 +1326,8 @@ func (c *Container) start(ctx context.Context) error {
 	return nil
 }
 
-// Internal, non-locking function to stop container
-func (c *Container) stop(timeout uint) error {
-	logrus.Debugf("Stopping ctr %s (timeout %d)", c.ID(), timeout)
-
+// Whether a container should use `all` when stopping
+func (c *Container) stopWithAll() (bool, error) {
 	// If the container is running in a PID Namespace, then killing the
 	// primary pid is enough to kill the container. If it is not running in
 	// a pid namespace then the OCI Runtime needs to kill ALL processes in
@@ -1345,7 +1343,7 @@ func (c *Container) stop(timeout uint) error {
 			// Only do this check if we need to
 			unified, err := cgroups.IsCgroup2UnifiedMode()
 			if err != nil {
-				return err
+				return false, err
 			}
 			if !unified {
 				all = false
@@ -1353,6 +1351,18 @@ func (c *Container) stop(timeout uint) error {
 		}
 	}
 
+	return all, nil
+}
+
+// Internal, non-locking function to stop container
+func (c *Container) stop(timeout uint) error {
+	logrus.Debugf("Stopping ctr %s (timeout %d)", c.ID(), timeout)
+
+	all, err := c.stopWithAll()
+	if err != nil {
+		return err
+	}
+
 	// OK, the following code looks a bit weird but we have to make sure we can stop
 	// containers with the restart policy always, to do this we have to set
 	// StoppedByUser even when there is nothing to stop right now. This is due to the
@@ -1442,6 +1452,58 @@ func (c *Container) waitForConmonToExitAndSave() error {
 		return err
 	}
 
+	// If we are still ContainerStateStopping, conmon exited without
+	// creating an exit file. Let's try and handle that here.
+	if c.state.State == define.ContainerStateStopping {
+		// Is container PID1 still alive?
+		if err := unix.Kill(c.state.PID, 0); err == nil {
+			// We have a runaway container, unmanaged by
+			// Conmon. Invoke OCI runtime stop.
+			// Use 0 timeout for immediate SIGKILL as things
+			// have gone seriously wrong.
+			// Ignore the error from stopWithAll, it's just
+			// a cgroup check - more important that we
+			// continue.
+			// If we wanted to be really fancy here, we
+			// could open a pidfd on container PID1 before
+			// this to get the real exit code... But I'm not
+			// that dedicated.
+			all, _ := c.stopWithAll()
+			if err := c.ociRuntime.StopContainer(c, 0, all); err != nil {
+				logrus.Errorf("Error stopping container %s after Conmon exited prematurely: %v", c.ID(), err)
+			}
+		}
+
+		// Conmon is dead. Handle it.
+		c.state.State = define.ContainerStateStopped
+		c.state.PID = 0
+		c.state.ConmonPID = 0
+		c.state.FinishedTime = time.Now()
+		c.state.ExitCode = -1
+		c.state.Exited = true
+
+		c.state.Error = "conmon died without writing exit file, container exit code could not be retrieved"
+
+		c.newContainerExitedEvent(c.state.ExitCode)
+
+		if err := c.save(); err != nil {
+			logrus.Errorf("Error saving container %s state after Conmon exited prematurely: %v", c.ID(), err)
+		}
+
+		if err := c.runtime.state.AddContainerExitCode(c.ID(), c.state.ExitCode); err != nil {
+			logrus.Errorf("Error saving container %s exit code after Conmon exited prematurely: %v", c.ID(), err)
+		}
+
+		// No Conmon alive to trigger cleanup, and the calls in
+		// regular Podman are conditional on no errors.
+		// Need to clean up manually.
+		if err := c.cleanup(context.Background()); err != nil {
+			logrus.Errorf("Error cleaning up container %s after Conmon exited prematurely: %v", c.ID(), err)
+		}
+
+		return fmt.Errorf("container %s conmon exited prematurely, exit code could not be retrieved: %w", c.ID(), define.ErrInternal)
+	}
+
 	return c.save()
 }
diff --git a/test/system/030-run.bats b/test/system/030-run.bats
index c9c288f5a5..6da1a817a8 100644
--- a/test/system/030-run.bats
+++ b/test/system/030-run.bats
@@ -1465,4 +1465,44 @@ search | $IMAGE |
     is "$output" "Error.*: $expect" "podman emits useful diagnostic when no entrypoint is set"
 }
 
+@test "podman run - stopping loop" {
+    skip_if_remote "this doesn't work with the REST service"
+
+    run_podman run -d --name testctr --stop-timeout 240 $IMAGE sh -c 'echo READY; sleep 999'
+    cid="$output"
+    wait_for_ready testctr
+
+    run_podman inspect --format '{{ .State.ConmonPid }}' testctr
+    conmon_pid=$output
+
+    ${PODMAN} stop testctr &
+    stop_pid=$!
+
+    timeout=20
+    while :; do
+        sleep 0.5
+        run_podman container inspect --format '{{.State.Status}}' testctr
+        if [[ "$output" = "stopping" ]]; then
+            break
+        fi
+        timeout=$((timeout - 1))
+        if [[ $timeout == 0 ]]; then
+            run_podman ps -a
+            die "Timed out waiting for testctr to acknowledge signal"
+        fi
+    done
+
+    kill -9 ${stop_pid}
+    kill -9 ${conmon_pid}
+
+    # Unclear why `-t0` is required here, works locally without.
+    # But it shouldn't hurt and does make the test pass...
+    PODMAN_TIMEOUT=5 run_podman 125 stop -t0 testctr
+    is "$output" "Error: container .* conmon exited prematurely, exit code could not be retrieved: internal libpod error" "correct error on missing conmon"
+
+    # This should be safe because stop is guaranteed to call cleanup?
+    run_podman inspect --format "{{ .State.Status }}" testctr
+    is "$output" "exited" "container has successfully transitioned to exited state after stop"
+}
+
 # vim: filetype=sh
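
A note on the liveness probe used above: the patch decides whether container PID1 is still alive with kill(pid, 0), which delivers no signal but still performs the kernel's existence and permission checks. Below is a minimal, self-contained Go sketch of that technique; it is illustrative only, not Podman code, and the pidIsAlive helper name is invented for this example.

    package main

    import (
    	"errors"
    	"fmt"

    	"golang.org/x/sys/unix"
    )

    // pidIsAlive reports whether a process with the given PID exists.
    // kill(pid, 0) sends no signal; it only validates the target.
    // EPERM means the process exists but we may not signal it, so a
    // general-purpose probe should count it as alive. (The patch above
    // checks err == nil only, which is sufficient because Podman has
    // full privileges over the containers it manages.)
    func pidIsAlive(pid int) bool {
    	err := unix.Kill(pid, 0)
    	return err == nil || errors.Is(err, unix.EPERM)
    }

    func main() {
    	fmt.Println(pidIsAlive(1)) // PID 1 (init) always exists on Linux
    }

One caveat worth remembering: PIDs can be recycled, so between the probe and the subsequent StopContainer call the PID could in principle belong to a different process. The pidfd idea floated in the patch comment would close that race as well as recover the real exit code.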