test/system: speed up basic_{setup,teardown}()

While these are not really slow they still take about 100-250ms if I
time this locally. Given they are run for every test this adds up
quickly. Looking at CI logs I can see the timings for skipped
tests are all in 600ms range. So I think it is safe to assume that these
functions need to get faster.

We have over 670 test cases currently so we talk about over 400s spend
in these functions in CI. This allows for big gains.

Now overall this is a tricky trade of, while all tests should cleanup
after themselves there is no guarantee for that as such errors can be
leaked into other tests making debugging much harder. To work at least a
bit against this teardown checks if the test was successful and only
skips the podman commands bases on that. Without it a single flake could
cause all following tets to fail.

As such this commit does the proper setup once one suite start then only
after a test failed.

In order for this to work at all we have to fix all leaks first, see
previous commits. And then for the future keep a very strong eye on
this during reviews.

Also add a PODMAN_BATS_LEAK_CHECK option

By default test must cleanup themselves and to speed up CI we no longer
do any cleanup in teardown by default. However there is still many cases
where we might have to debug a leak so add a new PODMAN_BATS_LEAK_CHECK
env option that can be set and should cause teardown to fail if the test
did not cleanup properly.

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
Paul Holzinger
2024-06-05 13:46:51 +02:00
parent a2352fa3ea
commit 81c90f51c2
5 changed files with 150 additions and 86 deletions

View File

@ -125,6 +125,9 @@ fi
# Used in 120-load test to identify rootless destination for podman image scp
export PODMAN_ROOTLESS_USER=$(id -un)
# Make sure to always check for leaks when running locally
export PODMAN_BATS_LEAK_CHECK=1
# Root
if [[ "$TEST_ROOT" ]]; then
echo "# bats ${bats_filter[*]} $TESTS"

View File

@ -15,6 +15,7 @@ function teardown() {
# annotations and image digests may be different. See
# https://github.com/containers/podman/discussions/17911
run_podman rmi -a -f
_prefetch $IMAGE
basic_teardown
}

View File

@ -349,9 +349,13 @@ function _test_skopeo_credential_sharing() {
--tls-verify=false $mid \
$image1
run_podman rmi $image1
run_podman images $IMAGE --format {{.ID}}
local podman_image_id=$output
run_podman pull -q --retry 4 --retry-delay "0s" --authfile=$authfile \
--tls-verify=false $image1
assert "${output:0:12}" = "$PODMAN_TEST_IMAGE_ID" "First pull (before stopping registry)"
assert "${output:0:12}" = "$podman_image_id" "First pull (before stopping registry)"
run_podman rmi $image1
# This actually STOPs the registry, so the port is unbound...
@ -361,7 +365,7 @@ function _test_skopeo_credential_sharing() {
run_podman 0+w pull -q --retry 4 --retry-delay "5s" --authfile=$authfile \
--tls-verify=false $image1
assert "$output" =~ "Failed, retrying in 5s.*Error: initializing.* connection refused"
assert "${lines[-1]:0:12}" = "$PODMAN_TEST_IMAGE_ID" "push should succeed via retry"
assert "${lines[-1]:0:12}" = "$podman_image_id" "push should succeed via retry"
unpause_registry
run_podman rmi $image1

View File

@ -141,60 +141,6 @@ function skopeo() {
# Setup helper: establish a test environment with exactly the images needed
function basic_setup() {
# Clean up all containers
run_podman rm -t 0 --all --force --ignore
# ...including external (buildah) ones
run_podman ps --all --external --format '{{.ID}} {{.Names}}'
for line in "${lines[@]}"; do
set $line
echo "# setup(): removing stray external container $1 ($2)" >&3
run_podman '?' rm -f $1
if [[ $status -ne 0 ]]; then
echo "# [setup] $_LOG_PROMPT podman rm -f $1" >&3
for errline in "${lines[@]}"; do
echo "# $errline" >&3
done
fi
done
# Clean up all images except those desired.
# 2023-06-26 REMINDER: it is tempting to think that this is clunky,
# wouldn't it be safer/cleaner to just 'rmi -a' then '_prefetch $IMAGE'?
# Yes, but it's also tremendously slower: 29m for a CI run, to 39m.
# Image loads are slow.
found_needed_image=
run_podman '?' images --all --format '{{.Repository}}:{{.Tag}} {{.ID}}'
for line in "${lines[@]}"; do
set $line
if [[ "$1" == "$PODMAN_TEST_IMAGE_FQN" ]]; then
if [[ -z "$PODMAN_TEST_IMAGE_ID" ]]; then
# This will probably only trigger the 2nd time through setup
PODMAN_TEST_IMAGE_ID=$2
fi
found_needed_image=1
elif [[ "$1" == "$PODMAN_SYSTEMD_IMAGE_FQN" ]]; then
# This is a big image, don't force unnecessary pulls
:
else
# Always remove image that doesn't match by name
echo "# setup(): removing stray image $1" >&3
run_podman rmi --force "$1" >/dev/null 2>&1 || true
# Tagged image will have same IID as our test image; don't rmi it.
if [[ $2 != "$PODMAN_TEST_IMAGE_ID" ]]; then
echo "# setup(): removing stray image $2" >&3
run_podman rmi --force "$2" >/dev/null 2>&1 || true
fi
fi
done
# Make sure desired image is present
if [[ -z "$found_needed_image" ]]; then
_prefetch $PODMAN_TEST_IMAGE_FQN
fi
# Temporary subdirectory, in which tests can write whatever they like
# and trust that it'll be deleted on cleanup.
# (BATS v1.3 and above provide $BATS_TEST_TMPDIR, but we still use
@ -258,38 +204,63 @@ function defer-assertion-failures() {
# Basic teardown: remove all pods and containers
function basic_teardown() {
echo "# [teardown]" >&2
local actions=(
"pod rm -t 0 --all --force --ignore"
"rm -t 0 --all --force --ignore"
"network prune --force"
"volume rm -a -f"
)
for action in "${actions[@]}"; do
run_podman '?' $action
# The -f commands should never exit nonzero, but if they do we want
# to know about it.
# FIXME: someday: also test for [[ -n "$output" ]] - can't do this
# yet because too many tests don't clean up their containers
if [[ $status -ne 0 ]]; then
echo "# [teardown] $_LOG_PROMPT podman $action" >&3
for line in "${lines[*]}"; do
echo "# $line" >&3
done
# Special case for timeout: check for locks (#18514)
if [[ $status -eq 124 ]]; then
echo "# [teardown] $_LOG_PROMPT podman system locks" >&3
run $PODMAN system locks
for line in "${lines[*]}"; do
echo "# $line" >&3
done
fi
fi
done
command rm -rf $PODMAN_TMPDIR
immediate-assertion-failures
# Unlike normal tests teardown will not exit on first command failure
# but rather only uses the return code of the teardown function.
# This must be directly after immediate-assertion-failures to capture the error code
local exit_code=$?
# Only checks for leaks on a successful run (BATS_TEST_COMPLETED is set 1),
# immediate-assertion-failures didn't fail (exit_code -eq 0)
# and PODMAN_BATS_LEAK_CHECK is set.
# As these podman commands are slow we do not want to do this by default
# and only provide this as opt in option. (#22909)
if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [ $exit_code -eq 0 ] && [ -n "$PODMAN_BATS_LEAK_CHECK" ]; then
run_podman volume ls -q
assert "$output" == "" "Leaked volumes!!!"
exit_code=$((exit_code + $?))
run_podman network ls -q
# podman always exists
assert "$output" == "podman" "Leaked networks!!!"
exit_code=$((exit_code + $?))
run_podman pod ps -q
assert "$output" == "" "Leaked pods!!!"
exit_code=$((exit_code + $?))
run_podman ps -a -q
assert "$output" == "" "Leaked containers!!!"
exit_code=$((exit_code + $?))
run_podman images --all --format '{{.Repository}}:{{.Tag}} {{.ID}}'
for line in "${lines[@]}"; do
set $line
if [[ "$1" == "$PODMAN_TEST_IMAGE_FQN" ]]; then
found_needed_image=1
elif [[ "$1" == "$PODMAN_SYSTEMD_IMAGE_FQN" ]]; then
# This is a big image, don't force unnecessary pulls
:
else
exit_code=$((exit_code + 1))
echo "Leaked image $1 $2"
fi
done
# Make sure desired image is present
if [[ -z "$found_needed_image" ]]; then
exit_code=$((exit_code + 1))
die "$PODMAN_TEST_IMAGE_FQN was removed"
fi
fi
# Some error happened (either in teardown itself or the actual test failed)
# so do a full cleanup to ensure following tests start with a clean env.
if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then
clean_setup
exit_code=$((exit_code + $?))
fi
command rm -rf $PODMAN_TMPDIR
exit_code=$((exit_code + $?))
return $exit_code
}
@ -323,6 +294,89 @@ function restore_image() {
run_podman restore $archive
}
function clean_setup() {
local actions=(
"pod rm -t 0 --all --force --ignore"
"rm -t 0 --all --force --ignore"
"network prune --force"
"volume rm -a -f"
)
for action in "${actions[@]}"; do
run_podman '?' $action
# The -f commands should never exit nonzero, but if they do we want
# to know about it.
# FIXME: someday: also test for [[ -n "$output" ]] - can't do this
# yet because too many tests don't clean up their containers
if [[ $status -ne 0 ]]; then
echo "# [teardown] $_LOG_PROMPT podman $action" >&3
for line in "${lines[*]}"; do
echo "# $line" >&3
done
# Special case for timeout: check for locks (#18514)
if [[ $status -eq 124 ]]; then
echo "# [teardown] $_LOG_PROMPT podman system locks" >&3
run $PODMAN system locks
for line in "${lines[*]}"; do
echo "# $line" >&3
done
fi
fi
done
# ...including external (buildah) ones
run_podman ps --all --external --format '{{.ID}} {{.Names}}'
for line in "${lines[@]}"; do
set $line
echo "# setup(): removing stray external container $1 ($2)" >&3
run_podman '?' rm -f $1
if [[ $status -ne 0 ]]; then
echo "# [setup] $_LOG_PROMPT podman rm -f $1" >&3
for errline in "${lines[@]}"; do
echo "# $errline" >&3
done
fi
done
# Clean up all images except those desired.
# 2023-06-26 REMINDER: it is tempting to think that this is clunky,
# wouldn't it be safer/cleaner to just 'rmi -a' then '_prefetch $IMAGE'?
# Yes, but it's also tremendously slower: 29m for a CI run, to 39m.
# Image loads are slow.
found_needed_image=
run_podman '?' images --all --format '{{.Repository}}:{{.Tag}} {{.ID}}'
for line in "${lines[@]}"; do
set $line
if [[ "$1" == "$PODMAN_TEST_IMAGE_FQN" ]]; then
if [[ -z "$PODMAN_TEST_IMAGE_ID" ]]; then
# This will probably only trigger the 2nd time through setup
PODMAN_TEST_IMAGE_ID=$2
fi
found_needed_image=1
elif [[ "$1" == "$PODMAN_SYSTEMD_IMAGE_FQN" ]]; then
# This is a big image, don't force unnecessary pulls
:
else
# Always remove image that doesn't match by name
echo "# setup(): removing stray image $1" >&3
run_podman rmi --force "$1" >/dev/null 2>&1 || true
# Tagged image will have same IID as our test image; don't rmi it.
if [[ $2 != "$PODMAN_TEST_IMAGE_ID" ]]; then
echo "# setup(): removing stray image $2" >&3
run_podman rmi --force "$2" >/dev/null 2>&1 || true
fi
fi
done
# Make sure desired image is present
if [[ -z "$found_needed_image" ]]; then
_prefetch $PODMAN_TEST_IMAGE_FQN
fi
}
# END setup/teardown tools
###############################################################################
# BEGIN podman helpers

View File

@ -30,6 +30,8 @@ function setup_suite() {
# The above does not handle errors. Do a final confirmation.
assert "$PODMAN_LOGIN_REGISTRY_PORT" != "" \
"Unable to set PODMAN_LOGIN_REGISTRY_PORT"
clean_setup
}
# Run at the very end of all tests. Useful for cleanup of non-BATS tmpdirs.