From c0ab293131866f80ef410e1f7b5b5ec6f47c5c1a Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Tue, 13 Jun 2023 10:57:05 +0200 Subject: [PATCH] container wait: indicate timeout in error When waiting for a container, there may be a time window where conmon has already exited but the container hasn't been fully cleaned up. In that case, we give the container at most 20 seconds to be fully cleaned up. We cannot wait forever since conmon may have been killed or something else went wrong. After the timeout, we optimistically assume the container to be cleaned up and its exit code to be present. If no exit code can be found, we return an error. Indicate in the error whether the timeout kicked in to help debug (transient) errors and flakes (e.g., #18860). [NO NEW TESTS NEEDED] Signed-off-by: Valentin Rothberg --- libpod/container_api.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libpod/container_api.go b/libpod/container_api.go index d66e9dae27..e230cc39f3 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -610,12 +610,14 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration) } } + timedout := "" if !containerRemoved { // If conmon is dead for more than $timerDuration or if the // container has exited properly, try to look up the exit code. select { case <-conmonTimer.C: logrus.Debugf("Exceeded conmon timeout waiting for container %s to exit", id) + timedout = " [exceeded conmon timeout waiting for container to exit]" default: switch c.state.State { case define.ContainerStateExited, define.ContainerStateConfigured: @@ -642,7 +644,7 @@ func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration) return true, 0, nil } } - return true, -1, fmt.Errorf("%w (container in state %s)", err, c.state.State) + return true, -1, fmt.Errorf("%w (container in state %s)%s", err, c.state.State, timedout) } return true, exitCode, nil