diff --git a/libpod/container_internal.go b/libpod/container_internal.go
index 651575052b..3f585d2ddb 100644
--- a/libpod/container_internal.go
+++ b/libpod/container_internal.go
@@ -1281,10 +1281,8 @@ func (c *Container) start() error {
 	return c.save()
 }
 
-// Internal, non-locking function to stop container
-func (c *Container) stop(timeout uint) error {
-	logrus.Debugf("Stopping ctr %s (timeout %d)", c.ID(), timeout)
-
+// Whether a container should use `all` when stopping
+func (c *Container) stopWithAll() (bool, error) {
 	// If the container is running in a PID Namespace, then killing the
 	// primary pid is enough to kill the container. If it is not running in
 	// a pid namespace then the OCI Runtime needs to kill ALL processes in
@@ -1300,7 +1298,7 @@ func (c *Container) stop(timeout uint) error {
 			// Only do this check if we need to
 			unified, err := cgroups.IsCgroup2UnifiedMode()
 			if err != nil {
-				return err
+				return false, err
 			}
 			if !unified {
 				all = false
@@ -1308,6 +1306,18 @@ func (c *Container) stop(timeout uint) error {
 		}
 	}
 
+	return all, nil
+}
+
+// Internal, non-locking function to stop container
+func (c *Container) stop(timeout uint) error {
+	logrus.Debugf("Stopping ctr %s (timeout %d)", c.ID(), timeout)
+
+	all, err := c.stopWithAll()
+	if err != nil {
+		return err
+	}
+
 	// Set the container state to "stopping" and unlock the container
 	// before handing it over to conmon to unblock other commands. #8501
 	// demonstrates nicely that a high stop timeout will block even simple
@@ -1374,6 +1384,58 @@ func (c *Container) waitForConmonToExitAndSave() error {
 		return err
 	}
 
+	// If we are still ContainerStateStopping, conmon exited without
+	// creating an exit file. Let's try and handle that here.
+	if c.state.State == define.ContainerStateStopping {
+		// Is container PID1 still alive?
+		if err := unix.Kill(c.state.PID, 0); err == nil {
+			// We have a runaway container, unmanaged by
+			// Conmon. Invoke OCI runtime stop.
+			// Use 0 timeout for immediate SIGKILL as things
+			// have gone seriously wrong.
+			// Ignore the error from stopWithAll, it's just
+			// a cgroup check - more important that we
+			// continue.
+			// If we wanted to be really fancy here, we
+			// could open a pidfd on container PID1 before
+			// this to get the real exit code... But I'm not
+			// that dedicated.
+			all, _ := c.stopWithAll()
+			if err := c.ociRuntime.StopContainer(c, 0, all); err != nil {
+				logrus.Errorf("Error stopping container %s after Conmon exited prematurely: %v", c.ID(), err)
+			}
+		}
+
+		// Conmon is dead. Handle it.
+		c.state.State = define.ContainerStateStopped
+		c.state.PID = 0
+		c.state.ConmonPID = 0
+		c.state.FinishedTime = time.Now()
+		c.state.ExitCode = -1
+		c.state.Exited = true
+
+		c.state.Error = "conmon died without writing exit file, container exit code could not be retrieved"
+
+		c.newContainerExitedEvent(c.state.ExitCode)
+
+		if err := c.save(); err != nil {
+			logrus.Errorf("Error saving container %s state after Conmon exited prematurely: %v", c.ID(), err)
+		}
+
+		if err := c.runtime.state.AddContainerExitCode(c.ID(), c.state.ExitCode); err != nil {
+			logrus.Errorf("Error saving container %s exit code after Conmon exited prematurely: %v", c.ID(), err)
+		}
+
+		// No Conmon alive to trigger cleanup, and the calls in
+		// regular Podman are conditional on no errors.
+		// Need to clean up manually.
+		if err := c.cleanup(context.Background()); err != nil {
+			logrus.Errorf("Error cleaning up container %s after Conmon exited prematurely: %v", c.ID(), err)
+		}
+
+		return fmt.Errorf("container %s conmon exited prematurely, exit code could not be retrieved: %w", c.ID(), define.ErrInternal)
+	}
+
 	return c.save()
 }
 
diff --git a/test/system/030-run.bats b/test/system/030-run.bats
index 37d36568d6..5afc60185c 100644
--- a/test/system/030-run.bats
+++ b/test/system/030-run.bats
@@ -1030,4 +1030,44 @@ EOF
     fi
 }
 
+@test "podman run - stopping loop" {
+    skip_if_remote "this doesn't work with the REST service"
+
+    run_podman run -d --name testctr --stop-timeout 240 $IMAGE sh -c 'echo READY; sleep 999'
+    cid="$output"
+    wait_for_ready testctr
+
+    run_podman inspect --format '{{ .State.ConmonPid }}' testctr
+    conmon_pid=$output
+
+    ${PODMAN} stop testctr &
+    stop_pid=$!
+
+    timeout=20
+    while :; do
+        sleep 0.5
+        run_podman container inspect --format '{{.State.Status}}' testctr
+        if [[ "$output" = "stopping" ]]; then
+            break
+        fi
+        timeout=$((timeout - 1))
+        if [[ $timeout == 0 ]]; then
+            run_podman ps -a
+            die "Timed out waiting for testctr to acknowledge signal"
+        fi
+    done
+
+    kill -9 ${stop_pid}
+    kill -9 ${conmon_pid}
+
+    # Unclear why `-t0` is required here; works locally without it.
+    # But it shouldn't hurt and does make the test pass...
+    PODMAN_TIMEOUT=5 run_podman 125 stop -t0 testctr
+    is "$output" "Error: container .* conmon exited prematurely, exit code could not be retrieved: internal libpod error" "correct error on missing conmon"
+
+    # This should be safe because stop is guaranteed to call cleanup?
+    run_podman inspect --format "{{ .State.Status }}" testctr
+    is "$output" "exited" "container has successfully transitioned to exited state after stop"
+}
+
 # vim: filetype=sh
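
For reference, the unix.Kill(c.state.PID, 0) probe in waitForConmonToExitAndSave above uses signal 0, which delivers nothing and only runs the kernel's existence and permission checks. A minimal standalone sketch of the technique, assuming Linux and golang.org/x/sys/unix (the processAlive helper is illustrative, not part of this patch):

package main

import (
	"errors"
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

// processAlive reports whether a process with the given PID appears to
// exist. kill(pid, 0) delivers no signal; the kernel only performs its
// existence and permission checks.
func processAlive(pid int) bool {
	err := unix.Kill(pid, 0)
	if err == nil {
		return true // process exists and we are allowed to signal it
	}
	// EPERM: the process exists but we may not signal it.
	// ESRCH: no such process.
	return errors.Is(err, unix.EPERM)
}

func main() {
	fmt.Println(processAlive(os.Getpid())) // true: our own PID always exists
	fmt.Println(processAlive(1 << 22))     // false: above Linux's PID ceiling
}

Note that signal 0 also succeeds for zombie processes, so strictly it answers "does this PID exist", not "is it still running"; for the runaway-container check above that distinction is harmless, since the stopped-state bookkeeping runs either way.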
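Similarly, the new error return wraps the define.ErrInternal sentinel with %w, so callers keep a human-readable message but can still branch on the error class with errors.Is. A small self-contained sketch of that pattern (errInternal here is a hypothetical stand-in for define.ErrInternal):

package main

import (
	"errors"
	"fmt"
)

// errInternal stands in for a library-level sentinel such as libpod's
// define.ErrInternal.
var errInternal = errors.New("internal libpod error")

func stop(id string) error {
	// %w wraps the sentinel so the contextual message and the
	// classifiable error travel together.
	return fmt.Errorf("container %s conmon exited prematurely, exit code could not be retrieved: %w", id, errInternal)
}

func main() {
	err := stop("deadbeef")
	fmt.Println(err)                         // full message, as matched by the system test above
	fmt.Println(errors.Is(err, errInternal)) // true: callers can detect the class
}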