wait: look for exit code in stopped state

Make sure to look for the container's exit code when it's in stopped
state.  With `--restart=always`, the container seems to stay in the
stopped state which led the wait logic to loop until the 20 seconds
timeout for the cleanup process to have finished kicks in.

Also defensively make sure to loop when the container is in stopped
state but no exit code has been written yet.

Add a regression test to make sure Podman doesn't wait more than 20
seconds.  Even on a CI machine under high load I expect it to take much
much much less than that, so I do not expect this test to flake in the
future.

Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
This commit is contained in:
Valentin Rothberg
2023-05-16 10:48:28 +02:00
parent 44807eabf1
commit 1b9272a060
3 changed files with 38 additions and 4 deletions

View File

@ -14,6 +14,11 @@ name or ID. In the case of multiple containers, Podman waits on each consecutiv
After all specified containers are stopped, the containers' return codes are printed
separated by newline in the same order as they were given to the command.
NOTE: there is an inherent race condition when waiting for containers with a
restart policy of `always` or `on-failure`, such as those created by `podman
kube play`. Such containers may be repeatedly exiting and restarting, possibly
with different exit codes, but `podman wait` can only display and detect one.
## OPTIONS
#### **--condition**=*state*

View File

@ -592,13 +592,21 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration)
conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
switch {
case errors.Is(err, define.ErrNoSuchCtr):
// Container has been removed, so we assume the
// exit code is present in the DB.
containerRemoved = true
case err != nil:
return false, -1, err
case !conmonAlive:
// Give the exit code at most 20 seconds to
// show up in the DB. That should largely be
// enough for the cleanup process.
timerDuration := time.Second * 20
conmonTimer = *time.NewTimer(timerDuration)
conmonTimerSet = true
case conmonAlive:
// Continue waiting if conmon's still running.
return false, -1, nil
}
}
@ -609,7 +617,18 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration)
case <-conmonTimer.C:
logrus.Debugf("Exceeded conmon timeout waiting for container %s to exit", id)
default:
if !c.ensureState(define.ContainerStateExited, define.ContainerStateConfigured) {
switch c.state.State {
case define.ContainerStateExited, define.ContainerStateConfigured:
// Container exited, so we can look up the exit code.
case define.ContainerStateStopped:
// Continue looping unless the restart policy is always.
// In this case, the container would never transition to
// the exited state, so we need to look up the exit code.
if c.config.RestartPolicy != define.RestartPolicyAlways {
return false, -1, nil
}
default:
// Continue looping
return false, -1, nil
}
}
@ -617,9 +636,11 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration)
exitCode, err := c.runtime.state.GetContainerExitCode(id)
if err != nil {
if errors.Is(err, define.ErrNoSuchExitCode) && c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated) {
// The container never ran.
return true, 0, nil
if errors.Is(err, define.ErrNoSuchExitCode) {
// If the container is configured or created, we must assume it never ran.
if c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated) {
return true, 0, nil
}
}
return true, -1, fmt.Errorf("%w (container in state %s)", err, c.state.State)
}

View File

@ -1104,5 +1104,13 @@ EOF
rm -rf $romount
}
@test "podman run --restart=always -- wait" {
# regression test for #18572 to make sure Podman waits less than 20 seconds
ctr=$(random_string)
run_podman run -d --restart=always --name=$ctr $IMAGE false
PODMAN_TIMEOUT=20 run_podman wait $ctr
is "$output" "1" "container should exit 1"
run_podman rm -f -t0 $ctr
}
# vim: filetype=sh