From fbbfd0746376c9b87b83169c6eb1450bec117038 Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Mon, 21 Oct 2024 14:33:28 -0600 Subject: [PATCH] kube SIGINT system test: fix race in timeout handling Up to now this test has been run using: PODMAN_TIMEOUT=2 run_podman kube play ... ...and this gives podman time to start the pod before getting the signal. When run in parallel, under heavy load, the above command seems to time out before podman has gotten its act together. Weird things happen, like weird exit status and (most crucially) zombie containers. Solution: wait for container to actually start before we kill it. Signed-off-by: Ed Santiago --- test/system/700-play.bats | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/test/system/700-play.bats b/test/system/700-play.bats index ce5011bd82..a4dc40cd55 100644 --- a/test/system/700-play.bats +++ b/test/system/700-play.bats @@ -660,17 +660,35 @@ spec: image: $IMAGE command: - top + - -b " > $fname - # force a timeout to happen so that the kube play command is killed - # and expect the timeout code 124 to happen so that we can clean up + # Run in background, then wait for pod to start running. + # This guarantees that when we send the signal (below) we do so + # on a running container; signaling during initialization + # results in undefined behavior. + logfile=$PODMAN_TMPDIR/kube-play.log + $PODMAN kube play --wait $fname &> $logfile & + local kidpid=$! + + for try in {1..10}; do + run_podman '?' container inspect --format '{{.State.Running}}' "$podname-$ctrname" + if [[ $status -eq 0 ]] && [[ "$output" = "true" ]]; then + break + fi + sleep 1 + done + wait_for_output "Mem:" "$podname-$ctrname" + + # Send SIGINT to container, and see how long it takes to exit. local t0=$SECONDS - PODMAN_TIMEOUT=2 run_podman 124 kube play --wait $fname + kill -2 $kidpid + wait $kidpid local t1=$SECONDS local delta_t=$((t1 - t0)) # Expectation (in seconds) of when we should time out. When running - # parallel, allow 4 more seconds due to system load + # parallel, allow longer time due to system load local expect=4 if [[ -n "$PARALLEL_JOBSLOT" ]]; then expect=$((expect + 4)) @@ -678,7 +696,8 @@ spec: assert $delta_t -le $expect \ "podman kube play did not get killed within $expect seconds" # Make sure we actually got SIGTERM and podman printed its message. - assert "$output" =~ "Cleaning up containers, pods, and volumes" "kube play printed sigterm message" + assert "$(< $logfile)" =~ "Cleaning up containers, pods, and volumes" \ + "kube play printed sigterm message" # there should be no containers running or created run_podman ps -a --noheading