From fbbfd0746376c9b87b83169c6eb1450bec117038 Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Mon, 21 Oct 2024 14:33:28 -0600
Subject: [PATCH] kube SIGINT system test: fix race in timeout handling

Up to now this test has been run using:

    PODMAN_TIMEOUT=2 run_podman kube play ...

...and this normally gives podman enough time to start the pod
before it gets the signal.

When run in parallel, under heavy load, the above command seems
to time out before podman has finished starting the pod. Unpredictable
things then happen, such as an unexpected exit status and (most
crucially) zombie containers.

Solution: wait for container to actually start before we kill it.

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 test/system/700-play.bats | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/test/system/700-play.bats b/test/system/700-play.bats
index ce5011bd82..a4dc40cd55 100644
--- a/test/system/700-play.bats
+++ b/test/system/700-play.bats
@@ -660,17 +660,35 @@ spec:
       image: $IMAGE
       command:
       - top
+      - -b
 " > $fname
 
-    # force a timeout to happen so that the kube play command is killed
-    # and expect the timeout code 124 to happen so that we can clean up
+    # Run in background, then wait for pod to start running.
+    # This guarantees that when we send the signal (below) we do so
+    # on a running container; signaling during initialization
+    # results in undefined behavior.
+    logfile=$PODMAN_TMPDIR/kube-play.log
+    $PODMAN kube play --wait $fname &> $logfile &
+    local kidpid=$!
+
+    for try in {1..10}; do
+        run_podman '?' container inspect --format '{{.State.Running}}' "$podname-$ctrname"
+        if [[ $status -eq 0 ]] && [[ "$output" = "true" ]]; then
+            break
+        fi
+        sleep 1
+    done
+    wait_for_output "Mem:" "$podname-$ctrname"
+
+    # Send SIGINT to kube play, and see how long it takes to exit.
     local t0=$SECONDS
-    PODMAN_TIMEOUT=2 run_podman 124 kube play --wait $fname
+    kill -2 $kidpid
+    wait $kidpid
     local t1=$SECONDS
     local delta_t=$((t1 - t0))
 
     # Expectation (in seconds) of when we should time out. When running
-    # parallel, allow 4 more seconds due to system load
+    # parallel, allow longer time due to system load
     local expect=4
     if [[ -n "$PARALLEL_JOBSLOT" ]]; then
         expect=$((expect + 4))
@@ -678,7 +696,8 @@ spec:
     assert $delta_t -le $expect \
            "podman kube play did not get killed within $expect seconds"
     # Make sure we actually got SIGTERM and podman printed its message.
-    assert "$output" =~ "Cleaning up containers, pods, and volumes" "kube play printed sigterm message"
+    assert "$(< $logfile)" =~ "Cleaning up containers, pods, and volumes" \
+           "kube play printed sigterm message"
 
     # there should be no containers running or created
     run_podman ps -a --noheading