mirror of
https://github.com/containers/podman.git
synced 2025-05-17 06:59:07 +08:00

When starting a container consider healthcheck errors fatal. That way user know when systemd-run failed to setup the timer to run the healthcheck and we don't get into a state where the container is running but not the healthcheck. This also fixes the broken error reporting from the systemd-run exec, if the binary could not be run the output was just empty leaving the users with no idea what failed. Fixes #25034 Signed-off-by: Paul Holzinger <pholzing@redhat.com>
195 lines
6.0 KiB
Go
195 lines
6.0 KiB
Go
//go:build !remote && systemd
|
|
|
|
package libpod
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"math/rand"
|
|
"os"
|
|
"os/exec"
|
|
"strings"
|
|
|
|
systemdCommon "github.com/containers/common/pkg/systemd"
|
|
"github.com/containers/podman/v5/pkg/errorhandling"
|
|
"github.com/containers/podman/v5/pkg/rootless"
|
|
"github.com/containers/podman/v5/pkg/systemd"
|
|
"github.com/sirupsen/logrus"
|
|
)
|
|
|
|
// createTimer systemd timers for healthchecks of a container
|
|
func (c *Container) createTimer(interval string, isStartup bool) error {
|
|
if c.disableHealthCheckSystemd(isStartup) {
|
|
return nil
|
|
}
|
|
|
|
hcUnitName := c.hcUnitName(isStartup, false)
|
|
|
|
podman, err := os.Executable()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get path for podman for a health check timer: %w", err)
|
|
}
|
|
|
|
var cmd = []string{"--property", "LogLevelMax=notice"}
|
|
if rootless.IsRootless() {
|
|
cmd = append(cmd, "--user")
|
|
}
|
|
path := os.Getenv("PATH")
|
|
if path != "" {
|
|
cmd = append(cmd, "--setenv=PATH="+path)
|
|
}
|
|
|
|
cmd = append(cmd, "--unit", hcUnitName, fmt.Sprintf("--on-unit-inactive=%s", interval), "--timer-property=AccuracySec=1s", podman)
|
|
|
|
if logrus.IsLevelEnabled(logrus.DebugLevel) {
|
|
cmd = append(cmd, "--log-level=debug", "--syslog")
|
|
}
|
|
|
|
cmd = append(cmd, "healthcheck", "run", c.ID())
|
|
|
|
conn, err := systemd.ConnectToDBUS()
|
|
if err != nil {
|
|
return fmt.Errorf("unable to get systemd connection to add healthchecks: %w", err)
|
|
}
|
|
conn.Close()
|
|
logrus.Debugf("creating systemd-transient files: %s %s", "systemd-run", cmd)
|
|
systemdRun := exec.Command("systemd-run", cmd...)
|
|
if output, err := systemdRun.CombinedOutput(); err != nil {
|
|
exitError := &exec.ExitError{}
|
|
if errors.As(err, &exitError) {
|
|
return fmt.Errorf("systemd-run failed: %w: output: %s", err, strings.TrimSpace(string(output)))
|
|
}
|
|
return fmt.Errorf("failed to execute systemd-run: %w", err)
|
|
}
|
|
|
|
c.state.HCUnitName = hcUnitName
|
|
if err := c.save(); err != nil {
|
|
return fmt.Errorf("saving container %s healthcheck unit name: %w", c.ID(), err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Wait for a message on the channel. Throw an error if the message is not "done".
|
|
func systemdOpSuccessful(c chan string) error {
|
|
msg := <-c
|
|
switch msg {
|
|
case "done":
|
|
return nil
|
|
default:
|
|
return fmt.Errorf("expected %q but received %q", "done", msg)
|
|
}
|
|
}
|
|
|
|
// startTimer starts a systemd timer for the healthchecks
|
|
func (c *Container) startTimer(isStartup bool) error {
|
|
if c.disableHealthCheckSystemd(isStartup) {
|
|
return nil
|
|
}
|
|
|
|
hcUnitName := c.state.HCUnitName
|
|
if hcUnitName == "" {
|
|
hcUnitName = c.hcUnitName(isStartup, true)
|
|
}
|
|
|
|
conn, err := systemd.ConnectToDBUS()
|
|
if err != nil {
|
|
return fmt.Errorf("unable to get systemd connection to start healthchecks: %w", err)
|
|
}
|
|
defer conn.Close()
|
|
|
|
startFile := fmt.Sprintf("%s.service", hcUnitName)
|
|
startChan := make(chan string)
|
|
if _, err := conn.RestartUnitContext(context.Background(), startFile, "fail", startChan); err != nil {
|
|
return err
|
|
}
|
|
if err := systemdOpSuccessful(startChan); err != nil {
|
|
return fmt.Errorf("starting systemd health-check timer %q: %w", startFile, err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// removeTransientFiles removes the systemd timer and unit files
|
|
// for the container
|
|
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, unitName string) error {
|
|
if c.disableHealthCheckSystemd(isStartup) {
|
|
return nil
|
|
}
|
|
conn, err := systemd.ConnectToDBUS()
|
|
if err != nil {
|
|
return fmt.Errorf("unable to get systemd connection to remove healthchecks: %w", err)
|
|
}
|
|
defer conn.Close()
|
|
|
|
// Errors are returned at the very end. Let's make sure to stop and
|
|
// clean up as much as possible.
|
|
stopErrors := []error{}
|
|
|
|
if unitName == "" {
|
|
unitName = c.hcUnitName(isStartup, true)
|
|
}
|
|
// Stop the timer before the service to make sure the timer does not
|
|
// fire after the service is stopped.
|
|
timerChan := make(chan string)
|
|
timerFile := fmt.Sprintf("%s.timer", unitName)
|
|
if _, err := conn.StopUnitContext(ctx, timerFile, "ignore-dependencies", timerChan); err != nil {
|
|
if !strings.HasSuffix(err.Error(), ".timer not loaded.") {
|
|
stopErrors = append(stopErrors, fmt.Errorf("removing health-check timer %q: %w", timerFile, err))
|
|
}
|
|
} else if err := systemdOpSuccessful(timerChan); err != nil {
|
|
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check timer %q: %w", timerFile, err))
|
|
}
|
|
|
|
serviceChan := make(chan string)
|
|
serviceFile := fmt.Sprintf("%s.service", unitName)
|
|
if _, err := conn.StopUnitContext(ctx, serviceFile, "ignore-dependencies", serviceChan); err != nil {
|
|
if !strings.HasSuffix(err.Error(), ".service not loaded.") {
|
|
stopErrors = append(stopErrors, fmt.Errorf("removing health-check service %q: %w", serviceFile, err))
|
|
}
|
|
} else if err := systemdOpSuccessful(serviceChan); err != nil {
|
|
stopErrors = append(stopErrors, fmt.Errorf("stopping systemd health-check service %q: %w", serviceFile, err))
|
|
}
|
|
// Reset the service after stopping it to make sure it's being removed, systemd keep failed transient services
|
|
// around in its state. We do not care about the error and we need to ensure to reset the state so we do not
|
|
// leak resources forever.
|
|
if err := conn.ResetFailedUnitContext(ctx, serviceFile); err != nil {
|
|
logrus.Debugf("Failed to reset unit file: %q", err)
|
|
}
|
|
|
|
return errorhandling.JoinErrors(stopErrors)
|
|
}
|
|
|
|
func (c *Container) disableHealthCheckSystemd(isStartup bool) bool {
|
|
if !systemdCommon.RunsOnSystemd() || os.Getenv("DISABLE_HC_SYSTEMD") == "true" {
|
|
return true
|
|
}
|
|
if isStartup {
|
|
if c.config.StartupHealthCheckConfig.Interval == 0 {
|
|
return true
|
|
}
|
|
}
|
|
if c.config.HealthCheckConfig.Interval == 0 {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// Systemd unit name for the healthcheck systemd unit.
|
|
// Bare indicates that a random suffix should not be applied to the name. This
|
|
// was default behavior previously, and is used for backwards compatibility.
|
|
func (c *Container) hcUnitName(isStartup, bare bool) string {
|
|
unitName := c.ID()
|
|
if isStartup {
|
|
unitName += "-startup"
|
|
}
|
|
if !bare {
|
|
// Ensure that unit names are unique from run to run by appending
|
|
// a random suffix.
|
|
// Ref: RH Jira RHEL-26105
|
|
unitName += fmt.Sprintf("-%x", rand.Int())
|
|
}
|
|
return unitName
|
|
}
|