healthcheck: make sure to always show health_status events

This fixes a regression caused by commit 7e6e267329, unfortunately this was not caught during review as for some reason this works fine rootless and only fails as root. Because we set the systemd log level to notice in order to hide the unit started/stopped messages to prevent spamming the journal the issue is that this now also causes systemd to ignore the events we write to journald as we also send them as info level. To fix this we simply send health_status events now on notice level. I decided against sending all events on notice as I think info is fine for them. Whenever the notice level is right is of course debatable but given it may contain the unhealthy message I think having this a notice should be ok. The main reason this made it through testing is because we do not rely on the systemd unit to fire healthchecks in the tests as this is flaky. There is one test were we rely on it though and I added a check there to make sure events are displayed correctly when trigger via systemd. Fixes #20342 Signed-off-by: Paul Holzinger <pholzing@redhat.com>
2025-05-17 15:18:43 +08:00 · 2023-10-12 14:46:48 +02:00
parent 8de7c48df5
commit 99a14332ef
2 changed files with 31 additions and 4 deletions
--- a/libpod/events/journal_linux.go
+++ b/libpod/events/journal_linux.go
@ -74,7 +74,19 @@ func (e EventJournalD) Write(ee Event) error {
 	case Volume:
 		m["PODMAN_NAME"] = ee.Name
 	}
-	return journal.Send(ee.ToHumanReadable(false), journal.PriInfo, m)
+
+	// starting with commmit 7e6e267329 we set LogLevel=notice for the systemd healthcheck unit
+	// This so it doesn't log the started/stopped unit messages al the time which spam the
+	// journal if a small interval is used. That however broke the healthcheck event as it no
+	// longer showed up in podman events when running as root as we only send the event on info
+	// level. To fix this we have to send the event on notice level.
+	// https://github.com/containers/podman/issues/20342
+	prio := journal.PriInfo
+	if len(ee.HealthStatus) > 0 {
+		prio = journal.PriNotice
+	}
+
+	return journal.Send(ee.ToHumanReadable(false), prio, m)
 }

 // Read reads events from the journal and sends qualified events to the event channel
--- a/test/system/220-healthcheck.bats
+++ b/test/system/220-healthcheck.bats
@ -11,6 +11,8 @@ load helpers
 function _check_health {
    local testname="$1"
    local tests="$2"
+    local since="$3"
+    local hc_status="$4"

    run_podman inspect --format "{{json .State.Healthcheck}}" healthcheck_c

@ -18,6 +20,16 @@ function _check_health {
        actual=$(jq ".$field" <<<"$output")
        is "$actual" "$expect" "$testname - .State.Healthcheck.$field"
    done
+
+    # Make sure we can read the healthcheck event in podman events (#20342)
+    run_podman events --filter container=healthcheck_c --filter event=health_status \
+        --since "$since" --stream=false --format "{{.HealthStatus}}"
+    # Because the assert below would fail with "lines: bad array subscript" when
+    # there are no events lets special case this to provide a more meaningful error.
+    if [[ -z "$output" ]]; then
+        die "no healthcheck events"
+    fi
+    assert "${lines[-1]}" == "$hc_status" "$testname - podman events health status"
 }

@test "podman healthcheck" {
@ -34,6 +46,7 @@ function _check_health {
    run_podman inspect healthcheck_c --format "{{.Config.HealthcheckOnFailureAction}}"
    is "$output" "kill" "on-failure action is set to kill"

+    current_time=$(date --iso-8601=seconds)
    # We can't check for 'starting' because a 1-second interval is too
    # short; it could run healthcheck before we get to our first check.
    #
@ -46,8 +59,9 @@ Status           | \"healthy\"
 FailingStreak    | 0
 Log[-1].ExitCode | 0
 Log[-1].Output   | \"Life is Good on stdout\\\nLife is Good on stderr\"
-"
+" "$current_time" "healthy"

+    current_time=$(date --iso-8601=seconds)
    # Force a failure
    run_podman exec healthcheck_c touch /uh-oh
    sleep 2
@ -57,8 +71,9 @@ Status           | \"healthy\"
 FailingStreak    | [123]
 Log[-1].ExitCode | 1
 Log[-1].Output   | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
-"
+" "$current_time" "healthy"

+    current_time=$(date --iso-8601=seconds)
    # After three successive failures, container should no longer be healthy
    sleep 5
    _check_health "Three or more failures" "
@ -66,7 +81,7 @@ Status           | \"unhealthy\"
 FailingStreak    | [3456]
 Log[-1].ExitCode | 1
 Log[-1].Output   | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
-"
+" "$current_time" "unhealthy"

    # now the on-failure should kick in and kill the container
    run_podman wait healthcheck_c