Merge pull request #23089 from TomSweeneyRedHat/dev/tsweeney/accel250-v4.4.1-rhel

[v4.4.1-rhel] Ensure that containers do not get stuck in stopping
This commit is contained in:
openshift-merge-bot[bot]
2024-06-28 16:13:55 +00:00
committed by GitHub
2 changed files with 107 additions and 5 deletions

View File

@ -1281,10 +1281,8 @@ func (c *Container) start() error {
return c.save()
}
// Internal, non-locking function to stop container
func (c *Container) stop(timeout uint) error {
logrus.Debugf("Stopping ctr %s (timeout %d)", c.ID(), timeout)
// Whether a container should use `all` when stopping
func (c *Container) stopWithAll() (bool, error) {
// If the container is running in a PID Namespace, then killing the
// primary pid is enough to kill the container. If it is not running in
// a pid namespace then the OCI Runtime needs to kill ALL processes in
@ -1300,7 +1298,7 @@ func (c *Container) stop(timeout uint) error {
// Only do this check if we need to
unified, err := cgroups.IsCgroup2UnifiedMode()
if err != nil {
return err
return false, err
}
if !unified {
all = false
@ -1308,6 +1306,18 @@ func (c *Container) stop(timeout uint) error {
}
}
return all, nil
}
// Internal, non-locking function to stop container
func (c *Container) stop(timeout uint) error {
logrus.Debugf("Stopping ctr %s (timeout %d)", c.ID(), timeout)
all, err := c.stopWithAll()
if err != nil {
return err
}
// Set the container state to "stopping" and unlock the container
// before handing it over to conmon to unblock other commands. #8501
// demonstrates nicely that a high stop timeout will block even simple
@ -1374,6 +1384,58 @@ func (c *Container) waitForConmonToExitAndSave() error {
return err
}
// If we are still ContainerStateStopping, conmon exited without
// creating an exit file. Let's try and handle that here.
if c.state.State == define.ContainerStateStopping {
// Is container PID1 still alive?
if err := unix.Kill(c.state.PID, 0); err == nil {
// We have a runaway container, unmanaged by
// Conmon. Invoke OCI runtime stop.
// Use 0 timeout for immediate SIGKILL as things
// have gone seriously wrong.
// Ignore the error from stopWithAll, it's just
// a cgroup check - more important that we
// continue.
// If we wanted to be really fancy here, we
// could open a pidfd on container PID1 before
// this to get the real exit code... But I'm not
// that dedicated.
all, _ := c.stopWithAll()
if err := c.ociRuntime.StopContainer(c, 0, all); err != nil {
logrus.Errorf("Error stopping container %s after Conmon exited prematurely: %v", c.ID(), err)
}
}
// Conmon is dead. Handle it.
c.state.State = define.ContainerStateStopped
c.state.PID = 0
c.state.ConmonPID = 0
c.state.FinishedTime = time.Now()
c.state.ExitCode = -1
c.state.Exited = true
c.state.Error = "conmon died without writing exit file, container exit code could not be retrieved"
c.newContainerExitedEvent(c.state.ExitCode)
if err := c.save(); err != nil {
logrus.Errorf("Error saving container %s state after Conmon exited prematurely: %v", c.ID(), err)
}
if err := c.runtime.state.AddContainerExitCode(c.ID(), c.state.ExitCode); err != nil {
logrus.Errorf("Error saving container %s exit code after Conmon exited prematurely: %v", c.ID(), err)
}
// No Conmon alive to trigger cleanup, and the calls in
// regular Podman are conditional on no errors.
// Need to clean up manually.
if err := c.cleanup(context.Background()); err != nil {
logrus.Errorf("Error cleaning up container %s after Conmon exited prematurely: %v", c.ID(), err)
}
return fmt.Errorf("container %s conmon exited prematurely, exit code could not be retrieved: %w", c.ID(), define.ErrInternal)
}
return c.save()
}

View File

@ -1030,4 +1030,44 @@ EOF
fi
}
@test "podman run - stopping loop" {
skip_if_remote "this doesn't work with with the REST service"
run_podman run -d --name testctr --stop-timeout 240 $IMAGE sh -c 'echo READY; sleep 999'
cid="$output"
wait_for_ready testctr
run_podman inspect --format '{{ .State.ConmonPid }}' testctr
conmon_pid=$output
${PODMAN} stop testctr &
stop_pid=$!
timeout=20
while :;do
sleep 0.5
run_podman container inspect --format '{{.State.Status}}' testctr
if [[ "$output" = "stopping" ]]; then
break
fi
timeout=$((timeout - 1))
if [[ $timeout == 0 ]]; then
run_podman ps -a
die "Timed out waiting for testctr to acknowledge signal"
fi
done
kill -9 ${stop_pid}
kill -9 ${conmon_pid}
# Unclear why `-t0` is required here, works locally without.
# But it shouldn't hurt and does make the test pass...
PODMAN_TIMEOUT=5 run_podman 125 stop -t0 testctr
is "$output" "Error: container .* conmon exited prematurely, exit code could not be retrieved: internal libpod error" "correct error on missing conmon"
# This should be safe because stop is guaranteed to call cleanup?
run_podman inspect --format "{{ .State.Status }}" testctr
is "$output" "exited" "container has successfully transitioned to exited state after stop"
}
# vim: filetype=sh