healthcheck system check: reduce raciness

When will I learn not to dismiss something as "easy"?

Anyhow, this doesn't actually change anything parallel-wise,
but it does reduce a race condition seen on heavily loaded,
slow systems, wherein a container becomes unhealthy before
we want it to. This version isn't perfect; I don't think
there's an ideal fix for this.

Signed-off-by: Ed Santiago <santiago@redhat.com>
This commit is contained in:
Ed Santiago
2024-08-08 08:33:57 -06:00
parent 30ee9c0114
commit 0d7e14fb83

View File

@ -17,22 +17,34 @@ function _check_health {
local since="$4"
local hc_status="$5"
# Loop-wait (up to a few seconds) for healthcheck event (#20342)
local timeout=5
while :; do
run_podman events --filter container=$ctrname --filter event=health_status \
--since "$since" --stream=false --format "{{.HealthStatus}}"
# Output may be empty or multiple lines.
if [[ -n "$output" ]]; then
if [[ "${lines[-1]}" = "$hc_status" ]]; then
break
fi
fi
timeout=$((timeout - 1))
if [[ $timeout -eq 0 ]]; then
die "$testname - timed out waiting for '$hc_status' in podman events"
fi
sleep 1
done
# Got the desired status. Now verify all the healthcheck fields
run_podman inspect --format "{{json .State.Healthcheck}}" $ctrname
defer-assertion-failures
parse_table "$tests" | while read field expect;do
actual=$(jq ".$field" <<<"$output")
is "$actual" "$expect" "$testname - .State.Healthcheck.$field"
done
# Make sure we can read the healthcheck event in podman events (#20342)
run_podman events --filter container=$ctrname --filter event=health_status \
--since "$since" --stream=false --format "{{.HealthStatus}}"
# Because the assert below would fail with "lines: bad array subscript" when
# there are no events, let's special-case this to provide a more meaningful error.
if [[ -z "$output" ]]; then
die "no healthcheck events"
fi
assert "${lines[-1]}" == "$hc_status" "$testname - podman events health status"
immediate-assertion-failures
}
@test "podman healthcheck" {
@ -50,7 +62,7 @@ function _check_health {
run_podman inspect $ctrname --format "{{.Config.HealthcheckOnFailureAction}}"
is "$output" "kill" "on-failure action is set to kill"
current_time=$(date --iso-8601=seconds)
current_time=$(date --iso-8601=ns)
# We can't check for 'starting' because a 1-second interval is too
# short; it could run healthcheck before we get to our first check.
#
@ -65,10 +77,9 @@ Log[-1].ExitCode | 0
Log[-1].Output | \"Life is Good on stdout\\\nLife is Good on stderr\\\n\"
" "$current_time" "healthy"
current_time=$(date --iso-8601=seconds)
current_time=$(date --iso-8601=ns)
# Force a failure
run_podman exec $ctrname touch /uh-oh
sleep 2
_check_health $ctrname "First failure" "
Status | \"healthy\"
@ -81,10 +92,9 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"
# name so that the leak check below does not turn into a NOP without noticing.
assert "$(systemctl list-units --type timer | grep $cid)" =~ "podman" "Healthcheck systemd unit exists"
current_time=$(date --iso-8601=seconds)
current_time=$(date --iso-8601=ns)
# After three successive failures, container should no longer be healthy
sleep 5
_check_health $ctrname "Three or more failures" "
_check_health $ctrname "Four or more failures" "
Status | \"unhealthy\"
FailingStreak | [3456]
Log[-1].ExitCode | 1