healthcheck: make sure to always show health_status events

This fixes a regression caused by commit 7e6e267329, unfortunately this
was not caught during review as for some reason this works fine rootless
and only fails as root.

Because we set the systemd log level to notice in order to hide the unit
started/stopped messages to prevent spamming the journal the issue is
that this now also causes systemd to ignore the events we write to
journald as we also send them as info level.

To fix this we simply send health_status events now on notice level. I
decided against sending all events on notice as I think info is fine for
them. Whenever the notice level is right is of course debatable but
given it may contain the unhealthy message I think having this a notice
should be ok.

The main reason this made it through testing is because we do not rely
on the systemd unit to fire healthchecks in the tests as this is flaky.
There is one test were we rely on it though and I added a check there
to make sure events are displayed correctly when trigger via systemd.

Fixes #20342

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
Paul Holzinger
2023-10-12 14:46:48 +02:00
parent 8de7c48df5
commit 99a14332ef
2 changed files with 31 additions and 4 deletions

View File

@ -74,7 +74,19 @@ func (e EventJournalD) Write(ee Event) error {
case Volume:
m["PODMAN_NAME"] = ee.Name
}
return journal.Send(ee.ToHumanReadable(false), journal.PriInfo, m)
// starting with commmit 7e6e267329 we set LogLevel=notice for the systemd healthcheck unit
// This so it doesn't log the started/stopped unit messages al the time which spam the
// journal if a small interval is used. That however broke the healthcheck event as it no
// longer showed up in podman events when running as root as we only send the event on info
// level. To fix this we have to send the event on notice level.
// https://github.com/containers/podman/issues/20342
prio := journal.PriInfo
if len(ee.HealthStatus) > 0 {
prio = journal.PriNotice
}
return journal.Send(ee.ToHumanReadable(false), prio, m)
}
// Read reads events from the journal and sends qualified events to the event channel

View File

@ -11,6 +11,8 @@ load helpers
function _check_health {
local testname="$1"
local tests="$2"
local since="$3"
local hc_status="$4"
run_podman inspect --format "{{json .State.Healthcheck}}" healthcheck_c
@ -18,6 +20,16 @@ function _check_health {
actual=$(jq ".$field" <<<"$output")
is "$actual" "$expect" "$testname - .State.Healthcheck.$field"
done
# Make sure we can read the healthcheck event in podman events (#20342)
run_podman events --filter container=healthcheck_c --filter event=health_status \
--since "$since" --stream=false --format "{{.HealthStatus}}"
# Because the assert below would fail with "lines: bad array subscript" when
# there are no events lets special case this to provide a more meaningful error.
if [[ -z "$output" ]]; then
die "no healthcheck events"
fi
assert "${lines[-1]}" == "$hc_status" "$testname - podman events health status"
}
@test "podman healthcheck" {
@ -34,6 +46,7 @@ function _check_health {
run_podman inspect healthcheck_c --format "{{.Config.HealthcheckOnFailureAction}}"
is "$output" "kill" "on-failure action is set to kill"
current_time=$(date --iso-8601=seconds)
# We can't check for 'starting' because a 1-second interval is too
# short; it could run healthcheck before we get to our first check.
#
@ -46,8 +59,9 @@ Status | \"healthy\"
FailingStreak | 0
Log[-1].ExitCode | 0
Log[-1].Output | \"Life is Good on stdout\\\nLife is Good on stderr\"
"
" "$current_time" "healthy"
current_time=$(date --iso-8601=seconds)
# Force a failure
run_podman exec healthcheck_c touch /uh-oh
sleep 2
@ -57,8 +71,9 @@ Status | \"healthy\"
FailingStreak | [123]
Log[-1].ExitCode | 1
Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
"
" "$current_time" "healthy"
current_time=$(date --iso-8601=seconds)
# After three successive failures, container should no longer be healthy
sleep 5
_check_health "Three or more failures" "
@ -66,7 +81,7 @@ Status | \"unhealthy\"
FailingStreak | [3456]
Log[-1].ExitCode | 1
Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
"
" "$current_time" "unhealthy"
# now the on-failure should kick in and kill the container
run_podman wait healthcheck_c