systests: fix race in quadlet tests

The new exit-code propagation test is racy: 'podman wait' can
fail if the service container has already been cleaned up by
systemd.

Solution: make the inspect and wait checks opportunistic, i.e.,
verify their output only when the commands succeed. If they fail, confirm that they
fail with ENOSUCHCONTAINER. This may silently lose us some
coverage ... but none of it is important. The important
test, systemctl final status, remains.
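
A minimal sketch of the pattern (illustrative, not the literal patch;
it relies on the system-test helpers' convention that a leading '?'
argument to run_podman means "any exit status is acceptable", letting
the caller branch on $status):

    # Opportunistic check: '?' keeps a nonzero status from aborting the test
    run_podman '?' wait $service_container
    if [[ $status -eq 0 ]]; then
        # Won the race: container still exists; verify its exit code
        is "$output" "$exit_code" "expected exit code"
    else
        # Lost the race: accept only "no such container" as the failure mode
        assert "$output" =~ "no container with name or ID" "unexpected error"
    fi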

Also, as drive-bys:
 - add a FIXME comment documenting another race condition
   that I'm not bothering to fix right now

 - give distinct names to unit files, for readability in
   test failures
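   (each iteration's files are now named after its test parameters,
   e.g. propagate-all-true-<random>.kube, instead of the shared
   basic_<random>.kube)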

Fixes: #18732

Signed-off-by: Ed Santiago <santiago@redhat.com>
@@ -91,6 +91,7 @@ function service_setup() {
     echo "$output"
     assert $status -eq 0 "Error starting systemd unit $service"
 
+    # FIXME FIXME FIXME: this is racy with short-lived containers!
     echo "$_LOG_PROMPT systemctl status $service"
     run systemctl status "$service"
     echo "$output"
@@ -638,15 +639,16 @@ EOF
 }
 
 @test "quadlet - exit-code propagation" {
-    local quadlet_file=$PODMAN_TMPDIR/basic_$(random_string).kube
-    local yaml_file=$PODMAN_TMPDIR/$(random_string).yaml
-
     exit_tests="
 all  | true  | 0   | inactive
 all  | false | 137 | failed
 none | false | 0   | inactive
 "
     while read exit_code_prop cmd exit_code service_state; do
+        local basename=propagate-${exit_code_prop}-${cmd}-$(random_string)
+        local quadlet_file=$PODMAN_TMPDIR/$basename.kube
+        local yaml_file=$PODMAN_TMPDIR/$basename.yaml
+
         cat > $yaml_file <<EOF
 apiVersion: v1
 kind: Pod
@@ -674,19 +676,42 @@ EOF
         run_quadlet "$quadlet_file"
         run systemctl status $QUADLET_SERVICE_NAME
 
-        yaml_sha=$(sha256sum $yaml_file)
-        service_container="${yaml_sha:0:12}-service"
-
         service_setup $QUADLET_SERVICE_NAME
 
         # Ensure we have output. Output is synced via sd-notify (socat in Exec)
         run journalctl "--since=$STARTED_TIME" --unit="$QUADLET_SERVICE_NAME"
         is "$output" '.*Started.*\.service.*'
-        run_podman container inspect --format '{{.KubeExitCodePropagation}}' $service_container
-        is "$output" "$exit_code_prop" "service container has the expected policy set in its annotations"
-        run_podman wait $service_container
-        is "$output" "$exit_code" "service container reflects expected exit code $exit_code"
+
+        yaml_sha=$(sha256sum $yaml_file)
+        service_container="${yaml_sha:0:12}-service"
+
+        # Opportunistic test: confirm that the Propagation field got set.
+        # This is racy, because the container is short-lived and quadlet
+        # cleans up on exit (via kube-down in ExecStopPost). So we use '?'
+        # and only check output if the inspect succeeds.
+        run_podman '?' container inspect --format '{{.KubeExitCodePropagation}}' $service_container
+        if [[ $status -eq 0 ]]; then
+            is "$output" "$exit_code_prop" \
+               "$basename: service container has the expected policy set in its annotations"
+        else
+            assert "$output" =~ "no such container $service_container" \
+                   "$basename: unexpected error from podman container inspect"
+        fi
+
+        # Container must stop of its own accord before we call service_cleanup(),
+        # otherwise the 'systemctl stop' there may affect the unit's status.
+        # Again, use '?' to handle the abovementioned race condition.
+        run_podman '?' wait $service_container
+        if [[ $status -eq 0 ]]; then
+            assert "$output" = "$exit_code" \
+                   "$basename: service container reflects expected exit code"
+        else
+            assert "$output" =~ "no container with name or ID" \
+                   "$basename: unexpected error from podman wait"
+        fi
+
+        # This is the actual propagation check
         service_cleanup $QUADLET_SERVICE_NAME $service_state
 
         run_podman ps -aq
         is "$output" "" "all containers are cleaned up even in case of errors"