From d571ca6536c2f49ad1d95e2c93f982e18201c45a Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Tue, 17 Sep 2024 09:19:52 -0600
Subject: [PATCH] system test parallelization: enable two-pass approach

For the past two months we've been splitting system tests
into two categories: those that CAN be run in parallel,
and those that CANNOT. Much work has been done to replace
hardcoded names (mycontainer, mypod) with safename().
Hundreds of test runs, in CI and on Ed's laptop, have
proven this approach viable.

make {local,remote}system now runs in two steps: first
the serial ones, then the parallel ones. hack/bats will
now recognize the 'ci:parallel' tag and add --jobs (nprocs).

This requires some tweaking of leak_check, because there
can be umpteen tests running (affecting image/container/pod/etc
state) when any given test completes.

Rules for enabling parallelization in tests:

   * use unique container/pod/volume/network names (safename)
   * do not run 'podman rm -a' or 'rmi -a'
   * never use the -l (--latest) option
   * do not run 'podman ps/images' and expect precise output

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 Makefile                     |  6 ++++--
 hack/bats                    |  9 +++++---
 test/system/helpers.bash     | 42 +++++++++++++++++++++++-------------
 test/system/setup_suite.bash | 15 ++++++++-----
 4 files changed, 47 insertions(+), 25 deletions(-)

diff --git a/Makefile b/Makefile
index 799ca84390..3cdae32101 100644
--- a/Makefile
+++ b/Makefile
@@ -688,7 +688,8 @@ localmachine:
 localsystem:
 	# Wipe existing config, database, and cache: start with clean slate.
 	$(RM) -rf ${HOME}/.local/share/containers ${HOME}/.config/containers
-	if timeout -v 1 true; then PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T test/system/; else echo "Skipping $@: 'timeout -v' unavailable'"; fi
+	PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags '!ci:parallel' test/system/
+	PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags ci:parallel -j $$(nproc) test/system/
 
 .PHONY: remotesystem
 remotesystem:
@@ -717,7 +718,8 @@ remotesystem:
 			echo "Error: ./bin/podman system service did not come up" >&2;\
 			exit 1;\
 		fi;\
-		env PODMAN="$(CURDIR)/bin/podman-remote" bats -T test/system/ ;\
+		env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags '!ci:parallel' test/system/ ;\
+		env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags ci:parallel -j $$(nproc) test/system/ ;\
 		rc=$$?;\
 		kill %1;\
 	else \
diff --git a/hack/bats b/hack/bats
index 3002c0199b..3184c95b76 100755
--- a/hack/bats
+++ b/hack/bats
@@ -83,7 +83,10 @@ for i;do
         --rootless) TEST_ROOT= ;;
         --remote)   REMOTE=remote ;;
         --ts|-T)    bats_opts+=("-T") ;;
-        --tag=*)    bats_filter=("--filter-tags" "$value") ;;
+        --tag=*)    bats_filter=("--filter-tags" "$value")
+                    if [[ "$value" = "ci:parallel" ]]; then
+                        bats_opts+=("--jobs" $(nproc))
+                    fi;;
         */*.bats)   TESTS=$i ;;
         *)
             if [[ $i =~ : ]]; then
@@ -130,7 +133,7 @@ export PODMAN_BATS_LEAK_CHECK=1
 
 # Root
 if [[ "$TEST_ROOT" ]]; then
-    echo "# bats ${bats_filter[*]} $TESTS"
+    echo "# bats ${bats_opts[*]} ${bats_filter[*]} $TESTS"
     sudo    --preserve-env=PODMAN \
             --preserve-env=QUADLET \
             --preserve-env=PODMAN_TEST_DEBUG \
@@ -144,7 +147,7 @@ fi
 # Rootless. (Only if we're not already root)
 if [[ "$TEST_ROOTLESS" && "$(id -u)" != 0 ]]; then
     echo "--------------------------------------------------"
-    echo "\$ bats ${bats_filter[*]} $TESTS"
+    echo "\$ bats ${bats_opts[*]} ${bats_filter[*]} $TESTS"
     bats "${bats_opts[@]}" "${bats_filter[@]}" $TESTS
     rc=$((rc | $?))
 fi
diff --git a/test/system/helpers.bash b/test/system/helpers.bash
index 665c192105..280729c5ab 100644
--- a/test/system/helpers.bash
+++ b/test/system/helpers.bash
@@ -222,22 +222,34 @@ function basic_teardown() {
     # This must be directly after immediate-assertion-failures to capture the error code
     local exit_code=$?
 
-    # Only checks for leaks on a successful run (BATS_TEST_COMPLETED is set 1),
-    # immediate-assertion-failures didn't fail (exit_code -eq 0)
-    # and PODMAN_BATS_LEAK_CHECK is set.
-    # As these podman commands are slow we do not want to do this by default
-    # and only provide this as opt in option. (#22909)
-    if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [ $exit_code -eq 0 ] && [ -n "$PODMAN_BATS_LEAK_CHECK" ]; then
-        leak_check
-        exit_code=$((exit_code + $?))
+    # Leak check and state reset. Only run these when running tests serially!
+    # (For parallel tests, we run a leak check only at the very end of all tests)
+    if [[ -z "$PARALLEL_JOBSLOT" ]]; then
+        # Check for leaks, but only if:
+        #  1) test was successful (BATS_TEST_COMPLETED is set 1); and
+        #  2) immediate-assertion-failures didn't fail (exit_code -eq 0); and
+        #  3) PODMAN_BATS_LEAK_CHECK is set (usually only in cron).
+        # As these podman commands are slow we do not want to do this by default
+        # and only provide this as opt-in option. (#22909)
+        if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [[ $exit_code -eq 0 ]] && [[ -n "$PODMAN_BATS_LEAK_CHECK" ]]; then
+            leak_check
+            exit_code=$((exit_code + $?))
+        fi
+
+        # Some error happened (either in teardown itself or the actual test failed)
+        # so do a full cleanup to ensure following tests start with a clean env.
+        if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then
+            clean_setup
+            exit_code=$((exit_code + $?))
+        fi
     fi
 
-    # Some error happened (either in teardown itself or the actual test failed)
-    # so do a full cleanup to ensure following tests start with a clean env.
-    if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then
-        clean_setup
-        exit_code=$((exit_code + $?))
+    # Status file used in teardown_suite() to decide whether or not
+    # to check for leaks
+    if [[ "$BATS_TEST_COMPLETED" -ne 1 ]]; then
+        rm -f "$BATS_SUITE_TMPDIR/all-tests-passed"
     fi
+
     command rm -rf $PODMAN_TMPDIR
     exit_code=$((exit_code + $?))
     return $exit_code
@@ -426,12 +438,12 @@ function clean_setup() {
         else
             # Always remove image that doesn't match by name
             echo "# setup(): removing stray image $1" >&3
-            run_podman rmi --force "$1" >/dev/null 2>&1 || true
+            _run_podman_quiet rmi --force "$1"
 
             # Tagged image will have same IID as our test image; don't rmi it.
             if [[ $2 != "$PODMAN_TEST_IMAGE_ID" ]]; then
                 echo "# setup(): removing stray image $2" >&3
-                run_podman rmi --force "$2" >/dev/null 2>&1 || true
+                _run_podman_quiet rmi --force "$2"
             fi
         fi
     done
diff --git a/test/system/setup_suite.bash b/test/system/setup_suite.bash
index 4dd59981db..8a3f910839 100644
--- a/test/system/setup_suite.bash
+++ b/test/system/setup_suite.bash
@@ -32,6 +32,9 @@ function setup_suite() {
            "Unable to set PODMAN_LOGIN_REGISTRY_PORT"
 
     clean_setup
+
+    # Canary file. Will be removed if any individual test fails.
+    touch "$BATS_SUITE_TMPDIR/all-tests-passed"
 }
 
 # Run at the very end of all tests. Useful for cleanup of non-BATS tmpdirs.
@@ -39,11 +42,13 @@ function teardown_suite() {
     stop_registry
     local exit_code=$?
 
-    # After all tests make sure there are no leaks and cleanup if there are
-    leak_check
-    if [ $? -gt 0 ]; then
-        exit_code=$((exit_code + 1))
-        clean_setup
+    # At end, if all tests have passed, check for leaks.
+    # Don't do this if there were errors: failing tests may not clean up.
+    if [[ -e "$BATS_SUITE_TMPDIR/all-tests-passed" ]]; then
+        leak_check
+        if [ $? -gt 0 ]; then
+            exit_code=$((exit_code + 1))
+        fi
     fi
 
     return $exit_code