test/system: speed up basic_{setup,teardown}()

While these are not really slow they still take about 100-250ms if I time this locally. Given they are run for every test this adds up quickly. Looking at CI logs I can see the timings for skipped tests are all in 600ms range. So I think it is safe to assume that these functions need to get faster. We have over 670 test cases currently so we talk about over 400s spend in these functions in CI. This allows for big gains. Now overall this is a tricky trade of, while all tests should cleanup after themselves there is no guarantee for that as such errors can be leaked into other tests making debugging much harder. To work at least a bit against this teardown checks if the test was successful and only skips the podman commands bases on that. Without it a single flake could cause all following tets to fail. As such this commit does the proper setup once one suite start then only after a test failed. In order for this to work at all we have to fix all leaks first, see previous commits. And then for the future keep a very strong eye on this during reviews. Also add a PODMAN_BATS_LEAK_CHECK option By default test must cleanup themselves and to speed up CI we no longer do any cleanup in teardown by default. However there is still many cases where we might have to debug a leak so add a new PODMAN_BATS_LEAK_CHECK env option that can be set and should cause teardown to fail if the test did not cleanup properly. Signed-off-by: Paul Holzinger <pholzing@redhat.com>
2025-07-01 16:17:06 +08:00 · 2024-06-05 13:46:51 +02:00
parent a2352fa3ea
commit 81c90f51c2
5 changed files with 150 additions and 86 deletions
--- a/hack/bats
+++ b/hack/bats
@ -125,6 +125,9 @@ fi
 # Used in 120-load test to identify rootless destination for podman image scp
 export PODMAN_ROOTLESS_USER=$(id -un)

+# Make sure to always check for leaks when running locally
+export PODMAN_BATS_LEAK_CHECK=1
+
 # Root
 if [[ "$TEST_ROOT" ]]; then
    echo "# bats ${bats_filter[*]} $TESTS"
--- a/test/system/120-load.bats
+++ b/test/system/120-load.bats
@ -15,6 +15,7 @@ function teardown() {
    # annotations and image digests may be different. See
    # https://github.com/containers/podman/discussions/17911
    run_podman rmi -a -f
+    _prefetch $IMAGE

    basic_teardown
 }
--- a/test/system/150-login.bats
+++ b/test/system/150-login.bats
@ -349,9 +349,13 @@ function _test_skopeo_credential_sharing() {
        --tls-verify=false $mid \
        $image1
    run_podman rmi $image1
+
+    run_podman images $IMAGE --format {{.ID}}
+    local podman_image_id=$output
+
    run_podman pull -q --retry 4 --retry-delay "0s" --authfile=$authfile \
        --tls-verify=false $image1
-    assert "${output:0:12}" = "$PODMAN_TEST_IMAGE_ID" "First pull (before stopping registry)"
+    assert "${output:0:12}" = "$podman_image_id" "First pull (before stopping registry)"
    run_podman rmi $image1

    # This actually STOPs the registry, so the port is unbound...
@ -361,7 +365,7 @@ function _test_skopeo_credential_sharing() {
    run_podman 0+w pull -q --retry 4 --retry-delay "5s" --authfile=$authfile \
            --tls-verify=false $image1
    assert "$output" =~ "Failed, retrying in 5s.*Error: initializing.* connection refused"
-    assert "${lines[-1]:0:12}" = "$PODMAN_TEST_IMAGE_ID" "push should succeed via retry"
+    assert "${lines[-1]:0:12}" = "$podman_image_id" "push should succeed via retry"
    unpause_registry

    run_podman rmi $image1
--- a/test/system/helpers.bash
+++ b/test/system/helpers.bash
@ -141,60 +141,6 @@ function skopeo() {

 # Setup helper: establish a test environment with exactly the images needed
 function basic_setup() {
-    # Clean up all containers
-    run_podman rm -t 0 --all --force --ignore
-
-    # ...including external (buildah) ones
-    run_podman ps --all --external --format '{{.ID}} {{.Names}}'
-    for line in "${lines[@]}"; do
-        set $line
-        echo "# setup(): removing stray external container $1 ($2)" >&3
-        run_podman '?' rm -f $1
-        if [[ $status -ne 0 ]]; then
-            echo "# [setup] $_LOG_PROMPT podman rm -f $1" >&3
-            for errline in "${lines[@]}"; do
-                echo "# $errline" >&3
-            done
-        fi
-    done
-
-    # Clean up all images except those desired.
-    # 2023-06-26 REMINDER: it is tempting to think that this is clunky,
-    # wouldn't it be safer/cleaner to just 'rmi -a' then '_prefetch $IMAGE'?
-    # Yes, but it's also tremendously slower: 29m for a CI run, to 39m.
-    # Image loads are slow.
-    found_needed_image=
-    run_podman '?' images --all --format '{{.Repository}}:{{.Tag}} {{.ID}}'
-
-    for line in "${lines[@]}"; do
-        set $line
-        if [[ "$1" == "$PODMAN_TEST_IMAGE_FQN" ]]; then
-            if [[ -z "$PODMAN_TEST_IMAGE_ID" ]]; then
-                # This will probably only trigger the 2nd time through setup
-                PODMAN_TEST_IMAGE_ID=$2
-            fi
-            found_needed_image=1
-        elif [[ "$1" == "$PODMAN_SYSTEMD_IMAGE_FQN" ]]; then
-            # This is a big image, don't force unnecessary pulls
-            :
-        else
-            # Always remove image that doesn't match by name
-            echo "# setup(): removing stray image $1" >&3
-            run_podman rmi --force "$1" >/dev/null 2>&1 || true
-
-            # Tagged image will have same IID as our test image; don't rmi it.
-            if [[ $2 != "$PODMAN_TEST_IMAGE_ID" ]]; then
-                echo "# setup(): removing stray image $2" >&3
-                run_podman rmi --force "$2" >/dev/null 2>&1 || true
-            fi
-        fi
-    done
-
-    # Make sure desired image is present
-    if [[ -z "$found_needed_image" ]]; then
-        _prefetch $PODMAN_TEST_IMAGE_FQN
-    fi
-
    # Temporary subdirectory, in which tests can write whatever they like
    # and trust that it'll be deleted on cleanup.
    # (BATS v1.3 and above provide $BATS_TEST_TMPDIR, but we still use
@ -258,38 +204,63 @@ function defer-assertion-failures() {
 # Basic teardown: remove all pods and containers
 function basic_teardown() {
    echo "# [teardown]" >&2
-    local actions=(
-        "pod rm -t 0 --all --force --ignore"
-            "rm -t 0 --all --force --ignore"
-        "network prune --force"
-        "volume rm -a -f"
-    )
-    for action in "${actions[@]}"; do
-        run_podman '?' $action

-        # The -f commands should never exit nonzero, but if they do we want
-        # to know about it.
-        #   FIXME: someday: also test for [[ -n "$output" ]] - can't do this
-        #   yet because too many tests don't clean up their containers
-        if [[ $status -ne 0 ]]; then
-            echo "# [teardown] $_LOG_PROMPT podman $action" >&3
-            for line in "${lines[*]}"; do
-                echo "# $line" >&3
-            done
-
-            # Special case for timeout: check for locks (#18514)
-            if [[ $status -eq 124 ]]; then
-                echo "# [teardown] $_LOG_PROMPT podman system locks" >&3
-                run $PODMAN system locks
-                for line in "${lines[*]}"; do
-                    echo "# $line" >&3
-                done
-            fi
-        fi
-    done
-
-    command rm -rf $PODMAN_TMPDIR
    immediate-assertion-failures
+    # Unlike normal tests teardown will not exit on first command failure
+    # but rather only uses the return code of the teardown function.
+    # This must be directly after immediate-assertion-failures to capture the error code
+    local exit_code=$?
+
+    # Only checks for leaks on a successful run (BATS_TEST_COMPLETED is set 1),
+    # immediate-assertion-failures didn't fail (exit_code -eq 0)
+    # and PODMAN_BATS_LEAK_CHECK is set.
+    # As these podman commands are slow we do not want to do this by default
+    # and only provide this as opt in option. (#22909)
+    if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [ $exit_code -eq 0 ] && [ -n "$PODMAN_BATS_LEAK_CHECK" ]; then
+        run_podman volume ls -q
+        assert "$output" == "" "Leaked volumes!!!"
+        exit_code=$((exit_code + $?))
+        run_podman network ls -q
+        # podman always exists
+        assert "$output" == "podman" "Leaked networks!!!"
+        exit_code=$((exit_code + $?))
+        run_podman pod ps -q
+        assert "$output" == "" "Leaked pods!!!"
+        exit_code=$((exit_code + $?))
+        run_podman ps -a -q
+        assert "$output" == "" "Leaked containers!!!"
+        exit_code=$((exit_code + $?))
+
+        run_podman images --all --format '{{.Repository}}:{{.Tag}} {{.ID}}'
+        for line in "${lines[@]}"; do
+            set $line
+            if [[ "$1" == "$PODMAN_TEST_IMAGE_FQN" ]]; then
+                found_needed_image=1
+            elif [[ "$1" == "$PODMAN_SYSTEMD_IMAGE_FQN" ]]; then
+                # This is a big image, don't force unnecessary pulls
+                :
+            else
+                exit_code=$((exit_code + 1))
+                echo "Leaked image $1 $2"
+            fi
+        done
+
+        # Make sure desired image is present
+        if [[ -z "$found_needed_image" ]]; then
+            exit_code=$((exit_code + 1))
+            die "$PODMAN_TEST_IMAGE_FQN was removed"
+        fi
+    fi
+
+    # Some error happened (either in teardown itself or the actual test failed)
+    # so do a full cleanup to ensure following tests start with a clean env.
+    if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then
+        clean_setup
+        exit_code=$((exit_code + $?))
+    fi
+    command rm -rf $PODMAN_TMPDIR
+    exit_code=$((exit_code + $?))
+    return $exit_code
 }


@ -323,6 +294,89 @@ function restore_image() {
    run_podman restore $archive
 }

+function clean_setup() {
+    local actions=(
+        "pod rm -t 0 --all --force --ignore"
+            "rm -t 0 --all --force --ignore"
+        "network prune --force"
+        "volume rm -a -f"
+    )
+    for action in "${actions[@]}"; do
+        run_podman '?' $action
+
+        # The -f commands should never exit nonzero, but if they do we want
+        # to know about it.
+        #   FIXME: someday: also test for [[ -n "$output" ]] - can't do this
+        #   yet because too many tests don't clean up their containers
+        if [[ $status -ne 0 ]]; then
+            echo "# [teardown] $_LOG_PROMPT podman $action" >&3
+            for line in "${lines[*]}"; do
+                echo "# $line" >&3
+            done
+
+            # Special case for timeout: check for locks (#18514)
+            if [[ $status -eq 124 ]]; then
+                echo "# [teardown] $_LOG_PROMPT podman system locks" >&3
+                run $PODMAN system locks
+                for line in "${lines[*]}"; do
+                    echo "# $line" >&3
+                done
+            fi
+        fi
+    done
+
+    # ...including external (buildah) ones
+    run_podman ps --all --external --format '{{.ID}} {{.Names}}'
+    for line in "${lines[@]}"; do
+        set $line
+        echo "# setup(): removing stray external container $1 ($2)" >&3
+        run_podman '?' rm -f $1
+        if [[ $status -ne 0 ]]; then
+            echo "# [setup] $_LOG_PROMPT podman rm -f $1" >&3
+            for errline in "${lines[@]}"; do
+                echo "# $errline" >&3
+            done
+        fi
+    done
+
+    # Clean up all images except those desired.
+    # 2023-06-26 REMINDER: it is tempting to think that this is clunky,
+    # wouldn't it be safer/cleaner to just 'rmi -a' then '_prefetch $IMAGE'?
+    # Yes, but it's also tremendously slower: 29m for a CI run, to 39m.
+    # Image loads are slow.
+    found_needed_image=
+    run_podman '?' images --all --format '{{.Repository}}:{{.Tag}} {{.ID}}'
+
+    for line in "${lines[@]}"; do
+        set $line
+        if [[ "$1" == "$PODMAN_TEST_IMAGE_FQN" ]]; then
+            if [[ -z "$PODMAN_TEST_IMAGE_ID" ]]; then
+                # This will probably only trigger the 2nd time through setup
+                PODMAN_TEST_IMAGE_ID=$2
+            fi
+            found_needed_image=1
+        elif [[ "$1" == "$PODMAN_SYSTEMD_IMAGE_FQN" ]]; then
+            # This is a big image, don't force unnecessary pulls
+            :
+        else
+            # Always remove image that doesn't match by name
+            echo "# setup(): removing stray image $1" >&3
+            run_podman rmi --force "$1" >/dev/null 2>&1 || true
+
+            # Tagged image will have same IID as our test image; don't rmi it.
+            if [[ $2 != "$PODMAN_TEST_IMAGE_ID" ]]; then
+                echo "# setup(): removing stray image $2" >&3
+                run_podman rmi --force "$2" >/dev/null 2>&1 || true
+            fi
+        fi
+    done
+
+    # Make sure desired image is present
+    if [[ -z "$found_needed_image" ]]; then
+        _prefetch $PODMAN_TEST_IMAGE_FQN
+    fi
+}
+
 # END   setup/teardown tools
 ###############################################################################
 # BEGIN podman helpers
--- a/test/system/setup_suite.bash
+++ b/test/system/setup_suite.bash
@ -30,6 +30,8 @@ function setup_suite() {
    # The above does not handle errors. Do a final confirmation.
    assert "$PODMAN_LOGIN_REGISTRY_PORT" != "" \
           "Unable to set PODMAN_LOGIN_REGISTRY_PORT"
+
+    clean_setup
 }

 # Run at the very end of all tests. Useful for cleanup of non-BATS tmpdirs.