wait: look for exit code in stopped state

Make sure to look up the container's exit code when it is in the stopped state. With `--restart=always`, the container seems to stay in the stopped state, which caused the wait logic to keep looping until the 20-second timeout (which gives the cleanup process time to finish) kicked in. Also defensively make sure to keep looping when the container is in the stopped state but no exit code has been written yet. Add a regression test to make sure Podman doesn't wait more than 20 seconds. Even on a CI machine under high load, I expect it to take much, much less than that, so I do not expect this test to flake in the future. Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
2025-07-02 00:30:00 +08:00 · 2023-05-16 10:48:28 +02:00
parent 44807eabf1
commit 1b9272a060
3 changed files with 38 additions and 4 deletions
--- a/docs/source/markdown/podman-wait.1.md.in
+++ b/docs/source/markdown/podman-wait.1.md.in
@ -14,6 +14,11 @@ name or ID.  In the case of multiple containers, Podman waits on each consecutiv
 After all specified containers are stopped, the containers' return codes are printed
 separated by newline in the same order as they were given to the command.

+NOTE: there is an inherent race condition when waiting for containers with a
+restart policy of `always` or `on-failure`, such as those created by `podman
+kube play`. Such containers may be repeatedly exiting and restarting, possibly
+with different exit codes, but `podman wait` can only display and detect one.
+
 ## OPTIONS

 #### **--condition**=*state*
--- a/libpod/container_api.go
+++ b/libpod/container_api.go
@ -592,13 +592,21 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration)
 			conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
 			switch {
 			case errors.Is(err, define.ErrNoSuchCtr):
+				// Container has been removed, so we assume the
+				// exit code is present in the DB.
 				containerRemoved = true
 			case err != nil:
 				return false, -1, err
 			case !conmonAlive:
+				// Give the exit code at most 20 seconds to
+				// show up in the DB.  That should largely be
+				// enough for the cleanup process.
 				timerDuration := time.Second * 20
 				conmonTimer = *time.NewTimer(timerDuration)
 				conmonTimerSet = true
+			case conmonAlive:
+				// Continue waiting if conmon's still running.
+				return false, -1, nil
 			}
 		}

@ -609,7 +617,18 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration)
 			case <-conmonTimer.C:
 				logrus.Debugf("Exceeded conmon timeout waiting for container %s to exit", id)
 			default:
-				if !c.ensureState(define.ContainerStateExited, define.ContainerStateConfigured) {
+				switch c.state.State {
+				case define.ContainerStateExited, define.ContainerStateConfigured:
+					// Container exited, so we can look up the exit code.
+				case define.ContainerStateStopped:
+					// Continue looping unless the restart policy is always.
+					// In this case, the container would never transition to
+					// the exited state, so we need to look up the exit code.
+					if c.config.RestartPolicy != define.RestartPolicyAlways {
+						return false, -1, nil
+					}
+				default:
+					// Continue looping
 					return false, -1, nil
 				}
 			}
@ -617,9 +636,11 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration)

 		exitCode, err := c.runtime.state.GetContainerExitCode(id)
 		if err != nil {
-			if errors.Is(err, define.ErrNoSuchExitCode) && c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated) {
-				// The container never ran.
-				return true, 0, nil
+			if errors.Is(err, define.ErrNoSuchExitCode) {
+				// If the container is configured or created, we must assume it never ran.
+				if c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated) {
+					return true, 0, nil
+				}
 			}
 			return true, -1, fmt.Errorf("%w (container in state %s)", err, c.state.State)
 		}
--- a/test/system/030-run.bats
+++ b/test/system/030-run.bats
@ -1104,5 +1104,13 @@ EOF
    rm -rf $romount
 }

+@test "podman run --restart=always -- wait" {
+    # regression test for #18572 to make sure Podman waits less than 20 seconds
+    ctr=$(random_string)
+    run_podman run -d --restart=always --name=$ctr $IMAGE false
+    PODMAN_TIMEOUT=20 run_podman wait $ctr
+    is "$output" "1" "container should exit 1"
+    run_podman rm -f -t0 $ctr
+}

 # vim: filetype=sh