system test parallelization: enable two-pass approach

For the past two months we've been splitting system tests into two categories: those that CAN be run in parallel, and those that CANNOT. Much work has been done to replace hardcoded names (mycontainer, mypod) with safename(). Hundreds of test runs, in CI and on Ed's laptop, have proven this approach viable. make {local,remote}system now runs in two steps: first the serial ones, then the parallel ones. hack/bats will now recognize the 'ci:parallel' tag and add --jobs (nprocs). This requires some tweaking of leak_check, because there can be umpteen tests running (affecting image/container/pod/etc state) when any given test completes. Rules for enabling parallelization in tests: * use unique container/pod/volume/network names (safename) * do not run 'podman rm -a' or 'rmi -a' * never use the -l (--latest) option * do not run 'podman ps/images' and expect precise output Signed-off-by: Ed Santiago <santiago@redhat.com>
2025-05-17 06:59:07 +08:00 · 2024-09-17 09:19:52 -06:00
parent f4a08f46b7
commit d571ca6536
4 changed files with 47 additions and 25 deletions
--- a/6
+++ b/6
@ -688,7 +688,8 @@ localmachine:
 localsystem:
 	# Wipe existing config, database, and cache: start with clean slate.
 	$(RM) -rf ${HOME}/.local/share/containers ${HOME}/.config/containers
-	if timeout -v 1 true; then PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T test/system/; else echo "Skipping $@: 'timeout -v' unavailable'"; fi
+	PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags '!ci:parallel' test/system/
+	PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags ci:parallel -j $$(nproc) test/system/

 .PHONY: remotesystem
 remotesystem:
@ -717,7 +718,8 @@ remotesystem:
 			echo "Error: ./bin/podman system service did not come up" >&2;\
 			exit 1;\
 		fi;\
-		env PODMAN="$(CURDIR)/bin/podman-remote" bats -T test/system/ ;\
+		env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags '!ci:parallel' test/system/ ;\
+		env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags ci:parallel -j $$(nproc) test/system/ ;\
 		rc=$$?;\
 		kill %1;\
 	else \
--- a/hack/bats
+++ b/hack/bats
@ -83,7 +83,10 @@ for i;do
        --rootless) TEST_ROOT= ;;
        --remote)   REMOTE=remote ;;
        --ts|-T)    bats_opts+=("-T") ;;
-        --tag=*)    bats_filter=("--filter-tags" "$value") ;;
+        --tag=*)    bats_filter=("--filter-tags" "$value")
+                    if [[ "$value" = "ci:parallel" ]]; then
+                        bats_opts+=("--jobs" $(nproc))
+                    fi;;
        */*.bats)   TESTS=$i ;;
        *)
            if [[ $i =~ : ]]; then
@ -130,7 +133,7 @@ export PODMAN_BATS_LEAK_CHECK=1

 # Root
 if [[ "$TEST_ROOT" ]]; then
-    echo "# bats ${bats_filter[*]} $TESTS"
+    echo "# bats ${bats_opts[*]} ${bats_filter[*]} $TESTS"
    sudo    --preserve-env=PODMAN \
            --preserve-env=QUADLET \
            --preserve-env=PODMAN_TEST_DEBUG \
@ -144,7 +147,7 @@ fi
 # Rootless. (Only if we're not already root)
 if [[ "$TEST_ROOTLESS" && "$(id -u)" != 0 ]]; then
    echo "--------------------------------------------------"
-    echo "\$ bats ${bats_filter[*]} $TESTS"
+    echo "\$ bats ${bats_opts[*]} ${bats_filter[*]} $TESTS"
    bats "${bats_opts[@]}" "${bats_filter[@]}" $TESTS
    rc=$((rc | $?))
 fi
--- a/test/system/helpers.bash
+++ b/test/system/helpers.bash
@ -222,22 +222,34 @@ function basic_teardown() {
    # This must be directly after immediate-assertion-failures to capture the error code
    local exit_code=$?

-    # Only checks for leaks on a successful run (BATS_TEST_COMPLETED is set 1),
-    # immediate-assertion-failures didn't fail (exit_code -eq 0)
-    # and PODMAN_BATS_LEAK_CHECK is set.
-    # As these podman commands are slow we do not want to do this by default
-    # and only provide this as opt in option. (#22909)
-    if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [ $exit_code -eq 0 ] && [ -n "$PODMAN_BATS_LEAK_CHECK" ]; then
-        leak_check
-        exit_code=$((exit_code + $?))
+    # Leak check and state reset. Only run these when running tests serially!
+    # (For parallel tests, we run a leak check only at the very end of all tests)
+    if [[ -z "$PARALLEL_JOBSLOT" ]]; then
+        # Check for leaks, but only if:
+        #  1) test was successful (BATS_TEST_COMPLETED is set 1); and
+        #  2) immediate-assertion-failures didn't fail (exit_code -eq 0); and
+        #  3) PODMAN_BATS_LEAK_CHECK is set (usually only in cron).
+        # As these podman commands are slow we do not want to do this by default
+        # and only provide this as opt-in option. (#22909)
+        if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [[ $exit_code -eq 0 ]] && [[ -n "$PODMAN_BATS_LEAK_CHECK" ]]; then
+            leak_check
+            exit_code=$((exit_code + $?))
+        fi
+
+        # Some error happened (either in teardown itself or the actual test failed)
+        # so do a full cleanup to ensure following tests start with a clean env.
+        if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then
+            clean_setup
+            exit_code=$((exit_code + $?))
+        fi
    fi

-    # Some error happened (either in teardown itself or the actual test failed)
-    # so do a full cleanup to ensure following tests start with a clean env.
-    if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then
-        clean_setup
-        exit_code=$((exit_code + $?))
+    # Status file used in teardown_suite() to decide whether or not
+    # to check for leaks
+    if [[ "$BATS_TEST_COMPLETED" -ne 1 ]]; then
+        rm -f "$BATS_SUITE_TMPDIR/all-tests-passed"
    fi
+
    command rm -rf $PODMAN_TMPDIR
    exit_code=$((exit_code + $?))
    return $exit_code
@ -426,12 +438,12 @@ function clean_setup() {
        else
            # Always remove image that doesn't match by name
            echo "# setup(): removing stray image $1" >&3
-            run_podman rmi --force "$1" >/dev/null 2>&1 || true
+            _run_podman_quiet rmi --force "$1"

            # Tagged image will have same IID as our test image; don't rmi it.
            if [[ $2 != "$PODMAN_TEST_IMAGE_ID" ]]; then
                echo "# setup(): removing stray image $2" >&3
-                run_podman rmi --force "$2" >/dev/null 2>&1 || true
+                _run_podman_quiet rmi --force "$2"
            fi
        fi
    done
--- a/test/system/setup_suite.bash
+++ b/test/system/setup_suite.bash
@ -32,6 +32,9 @@ function setup_suite() {
           "Unable to set PODMAN_LOGIN_REGISTRY_PORT"

    clean_setup
+
+    # Canary file. Will be removed if any individual test fails.
+    touch "$BATS_SUITE_TMPDIR/all-tests-passed"
 }

 # Run at the very end of all tests. Useful for cleanup of non-BATS tmpdirs.
@ -39,11 +42,13 @@ function teardown_suite() {
    stop_registry
    local exit_code=$?

-    # After all tests make sure there are no leaks and cleanup if there are
-    leak_check
-    if [ $? -gt 0 ]; then
-        exit_code=$((exit_code + 1))
-        clean_setup
+    # At end, if all tests have passed, check for leaks.
+    # Don't do this if there were errors: failing tests may not clean up.
+    if [[ -e "$BATS_SUITE_TMPDIR/all-tests-passed" ]]; then
+        leak_check
+        if [ $? -gt 0 ]; then
+            exit_code=$((exit_code + 1))
+        fi
    fi

    return $exit_code