mirror of
https://github.com/containers/podman.git
synced 2025-07-03 09:17:15 +08:00
Merge pull request #22641 from mheon/handle_stopping_loop
Ensure that containers do not get stuck in stopping
This commit is contained in:
@ -1326,10 +1326,8 @@ func (c *Container) start(ctx context.Context) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Internal, non-locking function to stop container
|
// Whether a container should use `all` when stopping
|
||||||
func (c *Container) stop(timeout uint) error {
|
func (c *Container) stopWithAll() (bool, error) {
|
||||||
logrus.Debugf("Stopping ctr %s (timeout %d)", c.ID(), timeout)
|
|
||||||
|
|
||||||
// If the container is running in a PID Namespace, then killing the
|
// If the container is running in a PID Namespace, then killing the
|
||||||
// primary pid is enough to kill the container. If it is not running in
|
// primary pid is enough to kill the container. If it is not running in
|
||||||
// a pid namespace then the OCI Runtime needs to kill ALL processes in
|
// a pid namespace then the OCI Runtime needs to kill ALL processes in
|
||||||
@ -1345,7 +1343,7 @@ func (c *Container) stop(timeout uint) error {
|
|||||||
// Only do this check if we need to
|
// Only do this check if we need to
|
||||||
unified, err := cgroups.IsCgroup2UnifiedMode()
|
unified, err := cgroups.IsCgroup2UnifiedMode()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return false, err
|
||||||
}
|
}
|
||||||
if !unified {
|
if !unified {
|
||||||
all = false
|
all = false
|
||||||
@ -1353,6 +1351,18 @@ func (c *Container) stop(timeout uint) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return all, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Internal, non-locking function to stop container
|
||||||
|
func (c *Container) stop(timeout uint) error {
|
||||||
|
logrus.Debugf("Stopping ctr %s (timeout %d)", c.ID(), timeout)
|
||||||
|
|
||||||
|
all, err := c.stopWithAll()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
// OK, the following code looks a bit weird but we have to make sure we can stop
|
// OK, the following code looks a bit weird but we have to make sure we can stop
|
||||||
// containers with the restart policy always, to do this we have to set
|
// containers with the restart policy always, to do this we have to set
|
||||||
// StoppedByUser even when there is nothing to stop right now. This is due to the
|
// StoppedByUser even when there is nothing to stop right now. This is due to the
|
||||||
@ -1442,6 +1452,58 @@ func (c *Container) waitForConmonToExitAndSave() error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we are still ContainerStateStopping, conmon exited without
|
||||||
|
// creating an exit file. Let's try and handle that here.
|
||||||
|
if c.state.State == define.ContainerStateStopping {
|
||||||
|
// Is container PID1 still alive?
|
||||||
|
if err := unix.Kill(c.state.PID, 0); err == nil {
|
||||||
|
// We have a runaway container, unmanaged by
|
||||||
|
// Conmon. Invoke OCI runtime stop.
|
||||||
|
// Use 0 timeout for immediate SIGKILL as things
|
||||||
|
// have gone seriously wrong.
|
||||||
|
// Ignore the error from stopWithAll, it's just
|
||||||
|
// a cgroup check - more important that we
|
||||||
|
// continue.
|
||||||
|
// If we wanted to be really fancy here, we
|
||||||
|
// could open a pidfd on container PID1 before
|
||||||
|
// this to get the real exit code... But I'm not
|
||||||
|
// that dedicated.
|
||||||
|
all, _ := c.stopWithAll()
|
||||||
|
if err := c.ociRuntime.StopContainer(c, 0, all); err != nil {
|
||||||
|
logrus.Errorf("Error stopping container %s after Conmon exited prematurely: %v", c.ID(), err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Conmon is dead. Handle it.
|
||||||
|
c.state.State = define.ContainerStateStopped
|
||||||
|
c.state.PID = 0
|
||||||
|
c.state.ConmonPID = 0
|
||||||
|
c.state.FinishedTime = time.Now()
|
||||||
|
c.state.ExitCode = -1
|
||||||
|
c.state.Exited = true
|
||||||
|
|
||||||
|
c.state.Error = "conmon died without writing exit file, container exit code could not be retrieved"
|
||||||
|
|
||||||
|
c.newContainerExitedEvent(c.state.ExitCode)
|
||||||
|
|
||||||
|
if err := c.save(); err != nil {
|
||||||
|
logrus.Errorf("Error saving container %s state after Conmon exited prematurely: %v", c.ID(), err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := c.runtime.state.AddContainerExitCode(c.ID(), c.state.ExitCode); err != nil {
|
||||||
|
logrus.Errorf("Error saving container %s exit code after Conmon exited prematurely: %v", c.ID(), err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// No Conmon alive to trigger cleanup, and the calls in
|
||||||
|
// regular Podman are conditional on no errors.
|
||||||
|
// Need to clean up manually.
|
||||||
|
if err := c.cleanup(context.Background()); err != nil {
|
||||||
|
logrus.Errorf("Error cleaning up container %s after Conmon exited prematurely: %v", c.ID(), err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Errorf("container %s conmon exited prematurely, exit code could not be retrieved: %w", c.ID(), define.ErrInternal)
|
||||||
|
}
|
||||||
|
|
||||||
return c.save()
|
return c.save()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1465,4 +1465,44 @@ search | $IMAGE |
|
|||||||
is "$output" "Error.*: $expect" "podman emits useful diagnostic when no entrypoint is set"
|
is "$output" "Error.*: $expect" "podman emits useful diagnostic when no entrypoint is set"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@test "podman run - stopping loop" {
|
||||||
|
skip_if_remote "this doesn't work with with the REST service"
|
||||||
|
|
||||||
|
run_podman run -d --name testctr --stop-timeout 240 $IMAGE sh -c 'echo READY; sleep 999'
|
||||||
|
cid="$output"
|
||||||
|
wait_for_ready testctr
|
||||||
|
|
||||||
|
run_podman inspect --format '{{ .State.ConmonPid }}' testctr
|
||||||
|
conmon_pid=$output
|
||||||
|
|
||||||
|
${PODMAN} stop testctr &
|
||||||
|
stop_pid=$!
|
||||||
|
|
||||||
|
timeout=20
|
||||||
|
while :;do
|
||||||
|
sleep 0.5
|
||||||
|
run_podman container inspect --format '{{.State.Status}}' testctr
|
||||||
|
if [[ "$output" = "stopping" ]]; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
timeout=$((timeout - 1))
|
||||||
|
if [[ $timeout == 0 ]]; then
|
||||||
|
run_podman ps -a
|
||||||
|
die "Timed out waiting for testctr to acknowledge signal"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
kill -9 ${stop_pid}
|
||||||
|
kill -9 ${conmon_pid}
|
||||||
|
|
||||||
|
# Unclear why `-t0` is required here, works locally without.
|
||||||
|
# But it shouldn't hurt and does make the test pass...
|
||||||
|
PODMAN_TIMEOUT=5 run_podman 125 stop -t0 testctr
|
||||||
|
is "$output" "Error: container .* conmon exited prematurely, exit code could not be retrieved: internal libpod error" "correct error on missing conmon"
|
||||||
|
|
||||||
|
# This should be safe because stop is guaranteed to call cleanup?
|
||||||
|
run_podman inspect --format "{{ .State.Status }}" testctr
|
||||||
|
is "$output" "exited" "container has successfully transitioned to exited state after stop"
|
||||||
|
}
|
||||||
|
|
||||||
# vim: filetype=sh
|
# vim: filetype=sh
|
||||||
|
Reference in New Issue
Block a user