Files
podman/test/system/220-healthcheck.bats
Jan Rodák fff42ac232 Fix HealthCheck log destination, count, and size defaults
Go initializes unset fields to the zero value of their type. As a result, the log destination becomes an empty string and the count and size become 0. However, a count/size of 0 means unbounded, which is not the intended default behavior.

Fixes: https://github.com/containers/podman/issues/25473
Fixes: https://issues.redhat.com/browse/RHEL-83262

Signed-off-by: Jan Rodák <hony.com@seznam.cz>
2025-03-12 21:27:00 +01:00

485 lines
18 KiB
Bash

#!/usr/bin/env bats -*- bats -*-
#
# tests for podman healthcheck
#
#
load helpers
load helpers.systemd
# bats file_tags=ci:parallel
# Helper function: run 'podman inspect' and check various given fields
#
# Waits (with a timeout) for a health_status event matching $hc_status,
# then verifies fields of .State.Healthcheck against a parse_table spec.
#
# Arguments:
#   $1 - container name
#   $2 - test name, used as prefix in assertion messages
#   $3 - parse_table-format string: one "<field> | <expected>" row per line,
#        where <field> is a jq path under .State.Healthcheck
#   $4 - timestamp for 'podman events --since' (restricts to recent events)
#   $5 - expected health status in the event stream (healthy/unhealthy)
function _check_health {
    local ctrname="$1"
    local testname="$2"
    local tests="$3"
    local since="$4"
    local hc_status="$5"

    # Loop-wait (up to a few seconds) for healthcheck event (#20342)
    # Allow a margin when running parallel, because of system load
    local timeout=5
    if [[ -n "$PARALLEL_JOBSLOT" ]]; then
        timeout=$((timeout + 3))
    fi

    while :; do
        run_podman events --filter container=$ctrname --filter event=health_status \
                   --since "$since" --stream=false --format "{{.HealthStatus}}"
        # Output may be empty or multiple lines.
        if [[ -n "$output" ]]; then
            # Only the most recent event matters; earlier ones may show a
            # previous status.
            if [[ "${lines[-1]}" = "$hc_status" ]]; then
                break
            fi
        fi

        timeout=$((timeout - 1))
        if [[ $timeout -eq 0 ]]; then
            die "$testname - timed out waiting for '$hc_status' in podman events"
        fi
        sleep 1
    done

    # Got the desired status. Now verify all the healthcheck fields
    run_podman inspect --format "{{json .State.Healthcheck}}" $ctrname

    # defer/immediate: collect all field mismatches before failing, so a
    # single bad field doesn't hide the others.
    defer-assertion-failures
    parse_table "$tests" | while read field expect;do
        actual=$(jq ".$field" <<<"$output")
        is "$actual" "$expect" "$testname - .State.Healthcheck.$field"
    done
    immediate-assertion-failures
}
@test "podman healthcheck" {
    local ctrname="c-h-$(safename)"
    # Container with both a periodic and a startup healthcheck;
    # --health-on-failure=kill terminates the container once the failing
    # streak reaches --health-retries.
    run_podman run -d --name $ctrname \
               --health-cmd /home/podman/healthcheck \
               --health-interval 1s \
               --health-retries 3 \
               --health-on-failure=kill \
               --health-startup-cmd /home/podman/healthcheck \
               --health-startup-interval 1s \
               $IMAGE /home/podman/pause
    cid="$output"

    run_podman inspect $ctrname --format "{{.Config.HealthcheckOnFailureAction}}"
    is "$output" "kill" "on-failure action is set to kill"

    run_podman inspect $ctrname --format "{{.Config.StartupHealthCheck.Test}}"
    is "$output" "[CMD-SHELL /home/podman/healthcheck]" ".Config.StartupHealthCheck.Test"

    current_time=$(date --iso-8601=ns)
    # We can't check for 'starting' because a 1-second interval is too
    # short; it could run healthcheck before we get to our first check.
    #
    # So, just force a healthcheck run, then confirm that it's running.
    run_podman healthcheck run $ctrname
    is "$output" "" "output from 'podman healthcheck run'"

    _check_health $ctrname "All healthy" "
Status | \"healthy\"
FailingStreak | 0
Log[-1].ExitCode | 0
Log[-1].Output | \"Life is Good on stdout\\\nLife is Good on stderr\\\n\"
" "$current_time" "healthy"

    current_time=$(date --iso-8601=ns)
    # Force a failure
    run_podman exec $ctrname touch /uh-oh

    # Status remains "healthy" until the failing streak reaches retries (3)
    _check_health $ctrname "First failure" "
Status | \"healthy\"
FailingStreak | [123]
Log[-1].ExitCode | 1
Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"
" "$current_time" "healthy"

    # Check that we now we do have valid podman units with this
    # name so that the leak check below does not turn into a NOP without noticing.
    run -0 systemctl list-units
    cidmatch=$(grep "$cid" <<<"$output")
    echo "$cidmatch"
    assert "$cidmatch" =~ " $cid-[0-9a-f]+\.timer *.*/podman healthcheck run $cid" \
           "Healthcheck systemd unit exists"

    current_time=$(date --iso-8601=ns)
    # After three successive failures, container should no longer be healthy
    _check_health $ctrname "Four or more failures" "
Status | \"unhealthy\"
FailingStreak | [3456]
Log[-1].ExitCode | 1
Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"
" "$current_time" "unhealthy"

    # now the on-failure should kick in and kill the container
    run_podman wait $ctrname

    # Clean up
    run_podman rm -t 0 -f $ctrname

    # Important check for https://github.com/containers/podman/issues/22884
    # We never should leak the unit files, healthcheck uses the cid in name so just grep that.
    # (Ignore .scope units, those are conmon and can linger for 5 minutes)
    # (Ignore .mount, too. They are created/removed by systemd based on the actual real mounts
    # on the host and that is async and might be slow enough in CI to cause failures.)
    run -0 systemctl list-units --quiet "*$cid*"
    except_scope_mount=$(grep -vF ".scope " <<<"$output" | { grep -vF ".mount" || true; } )
    assert "$except_scope_mount" == "" "Healthcheck systemd unit cleanup: no units leaked"
}
# A container restart must reset previous healthcheck state (failing streak).
@test "podman healthcheck - restart cleans up old state" {
    ctr="c-h-$(safename)"
    # --health-interval=disable: the healthcheck only runs when invoked
    # explicitly via 'podman healthcheck run'.
    run_podman run -d --name $ctr \
               --health-cmd /home/podman/healthcheck \
               --health-retries=3 \
               --health-interval=disable \
               $IMAGE /home/podman/pause

    run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}"
    is "$output" "0" "Failing streak of fresh container should be 0"

    # Get the healthcheck to fail
    run_podman exec $ctr touch /uh-oh-only-once
    run_podman 1 healthcheck run $ctr
    is "$output" "unhealthy" "output from 'podman healthcheck run'"
    run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}"
    is "$output" "1" "Failing streak after one failed healthcheck should be 1"

    run_podman container restart $ctr
    run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}"
    is "$output" "0" "Failing streak of restarted container should be 0 again"

    run_podman rm -f -t0 $ctr
}
# 'podman wait --condition=healthy/unhealthy' must return only once the
# container actually reaches that health state.
@test "podman wait --condition={healthy,unhealthy}" {
    ctr="c-h-$(safename)"
    wait_file="$PODMAN_TMPDIR/$(random_string).wait_for_me"

    for condition in healthy unhealthy;do
        rm -f $wait_file
        run_podman run -d --name $ctr \
                   --health-cmd /home/podman/healthcheck \
                   --health-retries=1 \
                   --health-interval=disable \
                   $IMAGE /home/podman/pause
        if [[ $condition == "unhealthy" ]];then
            # create the uh-oh file to let the health check fail
            run_podman exec $ctr touch /uh-oh
        fi

        # Wait for the container in the background and create the $wait_file to
        # signal the specified wait condition was met.
        (timeout --foreground -v --kill=5 10 $PODMAN wait --condition=$condition $ctr && touch $wait_file) &

        # Sleep 1 second to make sure above commands are running
        sleep 1
        if [[ -f $wait_file ]]; then
            die "the wait file should only be created after the container turned healthy"
        fi

        # Now trigger the state change; the backgrounded wait should return.
        if [[ $condition == "healthy" ]];then
            run_podman healthcheck run $ctr
        else
            run_podman 1 healthcheck run $ctr
        fi
        wait_for_file $wait_file
        run_podman rm -f -t0 $ctr
    done
}
@test "podman healthcheck --health-on-failure" {
    # --health-on-failure is rejected without a defined health check
    run_podman 125 create --health-on-failure=kill $IMAGE
    is "$output" "Error: cannot set on-failure action to kill without a health check"

    ctr="c-h-$(safename)"

    for policy in none kill restart stop;do
        uhoh=/uh-oh
        if [[ $policy != "none" ]];then
            # only fail the first run
            uhoh=/uh-oh-only-once
        fi

        # Run healthcheck image.
        run_podman run -d --name $ctr \
                   --health-cmd /home/podman/healthcheck \
                   --health-retries=1 \
                   --health-on-failure=$policy \
                   --health-interval=disable \
                   $IMAGE /home/podman/pause

        # healthcheck should succeed
        run_podman healthcheck run $ctr

        # Now cause the healthcheck to fail
        run_podman exec $ctr touch $uhoh

        # healthcheck should now fail, with exit status 1 and 'unhealthy' output
        run_podman 1 healthcheck run $ctr
        is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)"

        if [[ $policy == "restart" ]];then
            # Make sure the container transitions back to running
            run_podman wait --condition=running $ctr
            run_podman inspect $ctr --format "{{.RestartCount}}"
            assert "${#lines[@]}" != 0 "Container has been restarted at least once"

            # restart must also reset the failing streak
            run_podman container inspect $ctr --format "{{.State.Healthcheck.FailingStreak}}"
            is "$output" "0" "Failing streak of restarted container should be 0 again"

            # uh-oh file only fails the first run, so this must succeed now
            run_podman healthcheck run $ctr
        elif [[ $policy == "none" ]];then
            run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
            # Container is still running and health check still broken
            is "$output" "running $policy" "container continued running"
            run_podman 1 healthcheck run $ctr
            is "$output" "unhealthy" "output from 'podman healthcheck run' (policy: $policy)"
        else
            run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
            # kill and stop yield the container into a non-running state
            is "$output" ".* $policy" "container was stopped/killed (policy: $policy)"
            assert "$output" != "running $policy"
            # also make sure that it's not stuck in the stopping state
            assert "$output" != "stopping $policy"
        fi
        run_podman rm -f -t0 $ctr
    done
}
@test "podman healthcheck --health-on-failure with interval" {
    ctr="c-h-$(safename)"

    for policy in stop kill restart ;do
        t0=$(date --iso-8601=seconds)  # NOTE(review): t0 appears unused here — confirm/remove upstream
        # /bin/false as health-cmd: every scheduled run (1s interval) fails,
        # so the on-failure action triggers on its own, without an explicit
        # 'podman healthcheck run'.
        run_podman run -d --name $ctr \
                   --health-cmd /bin/false \
                   --health-retries=1 \
                   --health-on-failure=$policy \
                   --health-interval=1s \
                   $IMAGE top

        if [[ $policy == "restart" ]];then
            # Sleeping for 2 seconds makes the test much faster than using
            # podman-wait which would compete with the container getting
            # restarted.
            sleep 2
            # Make sure the container transitions back to running
            run_podman wait --condition=running $ctr
            run_podman inspect $ctr --format "{{.RestartCount}}"
            assert "${#lines[@]}" != 0 "Container has been restarted at least once"
        else
            # kill and stop yield the container into a non-running state
            run_podman wait $ctr
            run_podman inspect $ctr --format "{{.State.Status}} {{.Config.HealthcheckOnFailureAction}}"
            is "$output" ".* $policy" "container was stopped/killed (policy: $policy)"
            assert "$output" != "running $policy"
            # also make sure that it's not stuck in the stopping state
            assert "$output" != "stopping $policy"
        fi
        run_podman rm -f -t0 $ctr
    done
}
# Create a detached container whose health check echoes a message, passing
# extra flag(s) to 'podman run', then assert that 'podman inspect' with the
# given format template yields the expected value.
#
# Arguments:
#   $1 - container name
#   $2 - message echoed by the health check command
#   $3 - --format template for podman inspect
#   $4 - extra option(s) for podman run; may be empty or multiple words
#   $5 - expected inspect output
#   $6 - assertion description
#
# On return, $output (and the global $cid) hold the new container's ID.
function _create_container_with_health_log_settings {
    local ctr="$1"
    local hc_msg="$2"
    local go_template="$3"
    local extra_flag="$4"
    local want="$5"
    local desc="$6"

    # $extra_flag is deliberately left unquoted: it must expand to zero or
    # more words (e.g. "--health-max-log-count 10").
    run_podman run -d --name $ctr \
               --health-cmd "echo $hc_msg" \
               $extra_flag \
               $IMAGE /home/podman/pause
    cid="$output"

    run_podman inspect $ctr --format $go_template
    is "$output" "$want" "$desc"

    # Hand the container ID back to the caller via $output
    output=$cid
}
# Assert on the number of health-log entries matching a message.
#
# Arguments:
#   $1 - container name
#   $2 - message to grep for in .State.Health.Log
#   $3 - comparison operator for assert (e.g. -eq, -ge), intentionally unquoted
#   $4 - expected match count
function _check_health_log {
    local ctr="$1"
    local want_msg="$2"
    local op=$3
    local want_count="$4"

    run_podman inspect $ctr --format "{{.State.Health.Log}}"

    # grep -c counts matching lines; on zero matches it prints 0 (and the
    # nonzero exit status is absorbed by the command substitution).
    count=$(grep -co "$want_msg" <<< "$output")
    assert "$count" $op $want_count "Number of matching health log messages"
}
@test "podman healthcheck --health-max-log-count values" {
    # Table: flag variant | expected .Config.HealthMaxLogCount | comparison op | retained log count.
    # Empty flag exercises the default (5); 0 means unbounded.
    #
    # NOTE: variable must be named 'tests' — it is read below via
    # parse_table "$tests". (It was previously assigned to 'test', leaving
    # $tests empty, so the while loop never ran and this test passed vacuously.)
    # flag | expected value | op | log count
    tests="
                              | 5  | -eq | 5
    --health-max-log-count 0  | 0  | -ge | 11
    --health-max-log-count=0  | 0  | -ge | 11
    --health-max-log-count 10 | 10 | -eq | 10
    --health-max-log-count=10 | 10 | -eq | 10
    "

    while read flag value op logs_count ; do
        local msg="healthmsg-$(random_string)"
        local ctrname="c-h-$(safename)"
        _create_container_with_health_log_settings $ctrname $msg "{{.Config.HealthMaxLogCount}}" $flag $value "HealthMaxLogCount"

        # Run more healthchecks than the configured count so trimming kicks in
        for i in $(seq 1 $((logs_count + 5)));
        do
            run_podman healthcheck run $ctrname
            is "$output" "" "unexpected output from podman healthcheck run (pass $i)"
        done

        _check_health_log $ctrname $msg $op $logs_count

        run_podman rm -t 0 -f $ctrname
    done < <(parse_table "$tests")
}
@test "podman healthcheck --health-max-log-size values" {
    # Build a message >1000 chars long so that truncation is observable
    local s=$(printf "healthmsg-%1000s")
    local long_msg=${s// /$(random_string)}

    # Table: flag variant | expected .Config.HealthMaxLogSize | expected (possibly
    # truncated) log message, anchored at end of the Log output.
    # Empty flag exercises the default (500); 0 means unlimited.
    #
    # NOTE: variable must be named 'tests' — it is read below via
    # parse_table "$tests". (It was previously assigned to 'test', leaving
    # $tests empty, so the while loop never ran and this test passed vacuously.)
    # flag | expected value | exp_msg
    tests="
                             | 500 | ${long_msg:0:500}}]\$
    --health-max-log-size 0  | 0   | $long_msg}]\$
    --health-max-log-size=0  | 0   | $long_msg}]\$
    --health-max-log-size 10 | 10  | ${long_msg:0:10}}]\$
    --health-max-log-size=10 | 10  | ${long_msg:0:10}}]\$
    "

    while read flag value exp_msg ; do
        local ctrname="c-h-$(safename)"
        _create_container_with_health_log_settings $ctrname $long_msg "{{.Config.HealthMaxLogSize}}" $flag $value "HealthMaxLogSize"

        run_podman healthcheck run $ctrname
        is "$output" "" "output from 'podman healthcheck run'"

        # Exactly one log entry, containing the (possibly truncated) message
        _check_health_log $ctrname $exp_msg -eq 1

        run_podman rm -t 0 -f $ctrname
    done < <(parse_table "$tests")
}
@test "podman healthcheck --health-log-destination file" {
    local TMP_DIR_HEALTHCHECK="$PODMAN_TMPDIR/healthcheck"
    mkdir $TMP_DIR_HEALTHCHECK
    local ctrname="c-h-$(safename)"
    local msg="healthmsg-$(random_string)"
    _create_container_with_health_log_settings $ctrname $msg "{{.Config.HealthLogDestination}}" "--health-log-destination $TMP_DIR_HEALTHCHECK" "$TMP_DIR_HEALTHCHECK" "HealthLogDestination"
    cid="$output"

    run_podman healthcheck run $ctrname
    is "$output" "" "output from 'podman healthcheck run'"

    # Log file is named <container-id>-healthcheck.log inside the destination dir
    healthcheck_log_path="${TMP_DIR_HEALTHCHECK}/${cid}-healthcheck.log"
    # The healthcheck is triggered by podman when the container is started, but its execution depends on systemd.
    # And since `run_podman healthcheck run` is also run manually, it will result in two runs.
    count=$(grep -co "$msg" $healthcheck_log_path)
    assert "$count" -ge 1 "Number of matching health log messages"

    run_podman rm -t 0 -f $ctrname
}
@test "podman healthcheck --health-log-destination journal" {
    skip_if_remote "We cannot read journalctl over remote."

    # We can't use journald on RHEL as rootless, either: rhbz#1895105
    skip_if_journald_unavailable

    local ctrname="c-h-$(safename)"
    local msg="healthmsg-$(random_string)"
    _create_container_with_health_log_settings $ctrname $msg "{{.Config.HealthLogDestination}}" "--health-log-destination events_logger" "events_logger" "HealthLogDestination"
    cid="$output"

    run_podman healthcheck run $ctrname
    is "$output" "" "output from 'podman healthcheck run'"

    # Health-log entries land in the journal, keyed by the container ID
    cmd="journalctl --output cat --output-fields=PODMAN_HEALTH_LOG PODMAN_ID=$cid"
    echo "$_LOG_PROMPT $cmd"
    run $cmd
    echo "$output"
    assert "$status" -eq 0 "exit status of journalctl"

    # The healthcheck is triggered by podman when the container is started, but its execution depends on systemd.
    # And since `run_podman healthcheck run` is also run manually, it will result in two runs.
    count=$(grep -co "$msg" <<< "$output")
    assert "$count" -ge 1 "Number of matching health log messages"

    run_podman rm -t 0 -f $ctrname
}
# Stopping a container while a healthcheck is in flight must report status
# "stopped" and must not record a failure or a log entry.
@test "podman healthcheck - stop container when healthcheck runs" {
    ctr="c-h-$(safename)"
    msg="hc-msg-$(random_string)"
    hcStatus=$PODMAN_TMPDIR/hcStatus

    # Health command sleeps 20s, giving us time to stop the container mid-check
    run_podman run -d --name $ctr \
               --health-cmd "sleep 20; echo $msg" \
               $IMAGE /home/podman/pause

    # Kick off the healthcheck in the background, capturing its output
    timeout --foreground -v --kill=10 60 \
            $PODMAN healthcheck run $ctr &> $hcStatus &
    hc_pid=$!

    run_podman inspect $ctr --format "{{.State.Status}}"
    assert "$output" == "running" "Container is running"

    run_podman stop $ctr

    # Wait for background healthcheck to finish and make sure the exit status is 1
    rc=0
    wait -n $hc_pid || rc=$?
    assert $rc -eq 1 "exit status check of healthcheck command"
    assert $(< $hcStatus) == "stopped" "Health status"

    # No failing streak, and the interrupted check left no log message
    run_podman inspect $ctr --format "{{.State.Status}}--{{.State.Health.Status}}--{{.State.Health.FailingStreak}}"
    assert "$output" == "exited--stopped--0" "Container is stopped -- Health status -- failing streak"

    run_podman inspect $ctr --format "{{.State.Health.Log}}"
    assert "$output" !~ "$msg" "Health log message not found"

    run_podman rm -f -t0 $ctr
}
# https://github.com/containers/podman/issues/25034
@test "podman healthcheck - start errors" {
    skip_if_remote '$PATH overwrite not working via remote'

    ctr1="c1-h-$(safename)"
    ctr2="c2-h-$(safename)"

    # Stub 1: an empty but executable file — exec'ing it fails with
    # "exec format error"
    local systemd_run="$PODMAN_TMPDIR/systemd-run"
    touch $systemd_run
    chmod +x $systemd_run

    # Set custom PATH to force our stub to be called instead of the real systemd-run.
    PATH="$PODMAN_TMPDIR:$PATH" run_podman 126 run -d --name $ctr1 \
        --health-cmd "true" $IMAGE /home/podman/pause
    assert "$output" =~ "create healthcheck: failed to execute systemd-run: fork/exec $systemd_run: exec format error" "error on invalid systemd-run"

    # Stub 2: a script that writes to both streams and exits 2 — podman must
    # surface the exit status and the combined output in its error message.
    local systemd_run="$PODMAN_TMPDIR/systemd-run"
    cat > $systemd_run <<EOF
#!/bin/bash
echo stdout
echo stderr >&2
exit 2
EOF

    PATH="$PODMAN_TMPDIR:$PATH" run_podman 126 run -d --name $ctr2 \
        --health-cmd "true" $IMAGE /home/podman/pause
    assert "$output" =~ "create healthcheck: systemd-run failed: exit status 2: output: stdout
stderr" "systemd-run error message"

    run_podman rm -f -t0 $ctr1 $ctr2
}
# vim: filetype=sh