From d571ca6536c2f49ad1d95e2c93f982e18201c45a Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Tue, 17 Sep 2024 09:19:52 -0600 Subject: [PATCH] system test parallelization: enable two-pass approach For the past two months we've been splitting system tests into two categories: those that CAN be run in parallel, and those that CANNOT. Much work has been done to replace hardcoded names (mycontainer, mypod) with safename(). Hundreds of test runs, in CI and on Ed's laptop, have proven this approach viable. make {local,remote}system now runs in two steps: first the serial ones, then the parallel ones. hack/bats will now recognize the 'ci:parallel' tag and add --jobs (nprocs). This requires some tweaking of leak_check, because there can be umpteen tests running (affecting image/container/pod/etc state) when any given test completes. Rules for enabling parallelization in tests: * use unique container/pod/volume/network names (safename) * do not run 'podman rm -a' or 'rmi -a' * never use the -l (--latest) option * do not run 'podman ps/images' and expect precise output Signed-off-by: Ed Santiago --- Makefile | 6 ++++-- hack/bats | 9 +++++--- test/system/helpers.bash | 42 +++++++++++++++++++++++------------- test/system/setup_suite.bash | 15 ++++++++----- 4 files changed, 47 insertions(+), 25 deletions(-) diff --git a/Makefile b/Makefile index 799ca84390..3cdae32101 100644 --- a/Makefile +++ b/Makefile @@ -688,7 +688,8 @@ localmachine: localsystem: # Wipe existing config, database, and cache: start with clean slate. $(RM) -rf ${HOME}/.local/share/containers ${HOME}/.config/containers - if timeout -v 1 true; then PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T test/system/; else echo "Skipping $@: 'timeout -v' unavailable'"; fi + PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags '!ci:parallel' test/system/ + PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags ci:parallel -j $$(nproc) test/system/ .PHONY: remotesystem remotesystem: @@ -717,7 +718,8 @@ remotesystem: echo "Error: ./bin/podman system service did not come up" >&2;\ exit 1;\ fi;\ - env PODMAN="$(CURDIR)/bin/podman-remote" bats -T test/system/ ;\ + env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags '!ci:parallel' test/system/ ;\ + env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags ci:parallel -j $$(nproc) test/system/ ;\ rc=$$?;\ kill %1;\ else \ diff --git a/hack/bats b/hack/bats index 3002c0199b..3184c95b76 100755 --- a/hack/bats +++ b/hack/bats @@ -83,7 +83,10 @@ for i;do --rootless) TEST_ROOT= ;; --remote) REMOTE=remote ;; --ts|-T) bats_opts+=("-T") ;; - --tag=*) bats_filter=("--filter-tags" "$value") ;; + --tag=*) bats_filter=("--filter-tags" "$value") + if [[ "$value" = "ci:parallel" ]]; then + bats_opts+=("--jobs" $(nproc)) + fi;; */*.bats) TESTS=$i ;; *) if [[ $i =~ : ]]; then @@ -130,7 +133,7 @@ export PODMAN_BATS_LEAK_CHECK=1 # Root if [[ "$TEST_ROOT" ]]; then - echo "# bats ${bats_filter[*]} $TESTS" + echo "# bats ${bats_opts[*]} ${bats_filter[*]} $TESTS" sudo --preserve-env=PODMAN \ --preserve-env=QUADLET \ --preserve-env=PODMAN_TEST_DEBUG \ @@ -144,7 +147,7 @@ fi # Rootless. (Only if we're not already root) if [[ "$TEST_ROOTLESS" && "$(id -u)" != 0 ]]; then echo "--------------------------------------------------" - echo "\$ bats ${bats_filter[*]} $TESTS" + echo "\$ bats ${bats_opts[*]} ${bats_filter[*]} $TESTS" bats "${bats_opts[@]}" "${bats_filter[@]}" $TESTS rc=$((rc | $?)) fi diff --git a/test/system/helpers.bash b/test/system/helpers.bash index 665c192105..280729c5ab 100644 --- a/test/system/helpers.bash +++ b/test/system/helpers.bash @@ -222,22 +222,34 @@ function basic_teardown() { # This must be directly after immediate-assertion-failures to capture the error code local exit_code=$? - # Only checks for leaks on a successful run (BATS_TEST_COMPLETED is set 1), - # immediate-assertion-failures didn't fail (exit_code -eq 0) - # and PODMAN_BATS_LEAK_CHECK is set. - # As these podman commands are slow we do not want to do this by default - # and only provide this as opt in option. (#22909) - if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [ $exit_code -eq 0 ] && [ -n "$PODMAN_BATS_LEAK_CHECK" ]; then - leak_check - exit_code=$((exit_code + $?)) + # Leak check and state reset. Only run these when running tests serially! + # (For parallel tests, we run a leak check only at the very end of all tests) + if [[ -z "$PARALLEL_JOBSLOT" ]]; then + # Check for leaks, but only if: + # 1) test was successful (BATS_TEST_COMPLETED is set 1); and + # 2) immediate-assertion-failures didn't fail (exit_code -eq 0); and + # 3) PODMAN_BATS_LEAK_CHECK is set (usually only in cron). + # As these podman commands are slow we do not want to do this by default + # and only provide this as opt-in option. (#22909) + if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [[ $exit_code -eq 0 ]] && [[ -n "$PODMAN_BATS_LEAK_CHECK" ]]; then + leak_check + exit_code=$((exit_code + $?)) + fi + + # Some error happened (either in teardown itself or the actual test failed) + # so do a full cleanup to ensure following tests start with a clean env. + if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then + clean_setup + exit_code=$((exit_code + $?)) + fi fi - # Some error happened (either in teardown itself or the actual test failed) - # so do a full cleanup to ensure following tests start with a clean env. - if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then - clean_setup - exit_code=$((exit_code + $?)) + # Status file used in teardown_suite() to decide whether or not + # to check for leaks + if [[ "$BATS_TEST_COMPLETED" -ne 1 ]]; then + rm -f "$BATS_SUITE_TMPDIR/all-tests-passed" fi + command rm -rf $PODMAN_TMPDIR exit_code=$((exit_code + $?)) return $exit_code @@ -426,12 +438,12 @@ function clean_setup() { else # Always remove image that doesn't match by name echo "# setup(): removing stray image $1" >&3 - run_podman rmi --force "$1" >/dev/null 2>&1 || true + _run_podman_quiet rmi --force "$1" # Tagged image will have same IID as our test image; don't rmi it. if [[ $2 != "$PODMAN_TEST_IMAGE_ID" ]]; then echo "# setup(): removing stray image $2" >&3 - run_podman rmi --force "$2" >/dev/null 2>&1 || true + _run_podman_quiet rmi --force "$2" fi fi done diff --git a/test/system/setup_suite.bash b/test/system/setup_suite.bash index 4dd59981db..8a3f910839 100644 --- a/test/system/setup_suite.bash +++ b/test/system/setup_suite.bash @@ -32,6 +32,9 @@ function setup_suite() { "Unable to set PODMAN_LOGIN_REGISTRY_PORT" clean_setup + + # Canary file. Will be removed if any individual test fails. + touch "$BATS_SUITE_TMPDIR/all-tests-passed" } # Run at the very end of all tests. Useful for cleanup of non-BATS tmpdirs. @@ -39,11 +42,13 @@ function teardown_suite() { stop_registry local exit_code=$? - # After all tests make sure there are no leaks and cleanup if there are - leak_check - if [ $? -gt 0 ]; then - exit_code=$((exit_code + 1)) - clean_setup + # At end, if all tests have passed, check for leaks. + # Don't do this if there were errors: failing tests may not clean up. + if [[ -e "$BATS_SUITE_TMPDIR/all-tests-passed" ]]; then + leak_check + if [ $? -gt 0 ]; then + exit_code=$((exit_code + 1)) + fi fi return $exit_code