Implement SD-NOTIFY proxy in conmon

This leverages conmon's ability to proxy the SD-NOTIFY socket.
This prevents locking caused by OCI runtime blocking, waiting for
SD-NOTIFY messages, and instead passes the messages directly up
to the host.

NOTE: Also re-enable the auto-update tests which has been disabled due
to flakiness.  With this change, Podman properly integrates into
systemd.

Fixes: #7316
Signed-off-by: Joseph Gooch <mrwizard@dok.org>
Signed-off-by: Daniel J Walsh <dwalsh@redhat.com>
Signed-off-by: Valentin Rothberg <rothberg@redhat.com>
This commit is contained in:
Daniel J Walsh
2021-03-24 07:49:29 -04:00
committed by Valentin Rothberg
parent 30b036c5d3
commit c22f3e8b4e
7 changed files with 79 additions and 22 deletions

View File

@ -126,6 +126,10 @@ type Container struct {
// This is true if a container is restored from a checkpoint. // This is true if a container is restored from a checkpoint.
restoreFromCheckpoint bool restoreFromCheckpoint bool
// Used to query the NOTIFY_SOCKET once along with setting up
// mounts etc.
notifySocket string
slirp4netnsSubnet *net.IPNet slirp4netnsSubnet *net.IPNet
} }

View File

@ -352,6 +352,10 @@ func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) {
return nil, err return nil, err
} }
if err := c.mountNotifySocket(g); err != nil {
return nil, err
}
// Get host UID and GID based on the container process UID and GID. // Get host UID and GID based on the container process UID and GID.
hostUID, hostGID, err := butil.GetHostIDs(util.IDtoolsToRuntimeSpec(c.config.IDMappings.UIDMap), util.IDtoolsToRuntimeSpec(c.config.IDMappings.GIDMap), uint32(execUser.Uid), uint32(execUser.Gid)) hostUID, hostGID, err := butil.GetHostIDs(util.IDtoolsToRuntimeSpec(c.config.IDMappings.UIDMap), util.IDtoolsToRuntimeSpec(c.config.IDMappings.GIDMap), uint32(execUser.Uid), uint32(execUser.Gid))
if err != nil { if err != nil {
@ -777,6 +781,41 @@ func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) {
return g.Config, nil return g.Config, nil
} }
// mountNotifySocket mounts the NOTIFY_SOCKET into the container if it's set
// and if the sdnotify mode is set to container. It also sets c.notifySocket
// to avoid redundantly looking up the env variable.
func (c *Container) mountNotifySocket(g generate.Generator) error {
notify, ok := os.LookupEnv("NOTIFY_SOCKET")
if !ok {
return nil
}
c.notifySocket = notify
if c.config.SdNotifyMode != define.SdNotifyModeContainer {
return nil
}
notifyDir := filepath.Join(c.bundlePath(), "notify")
logrus.Debugf("checking notify %q dir", notifyDir)
if err := os.MkdirAll(notifyDir, 0755); err != nil {
if !os.IsExist(err) {
return errors.Wrapf(err, "unable to create notify %q dir", notifyDir)
}
}
if err := label.Relabel(notifyDir, c.MountLabel(), true); err != nil {
return errors.Wrapf(err, "relabel failed %q", notifyDir)
}
logrus.Debugf("add bindmount notify %q dir", notifyDir)
if _, ok := c.state.BindMounts["/run/notify"]; !ok {
c.state.BindMounts["/run/notify"] = notifyDir
}
// Set the container's notify socket to the proxy socket created by conmon
g.AddProcessEnv("NOTIFY_SOCKET", "/run/notify/notify.sock")
return nil
}
// systemd expects to have /run, /run/lock and /tmp on tmpfs // systemd expects to have /run, /run/lock and /tmp on tmpfs
// It also expects to be able to write to /sys/fs/cgroup/systemd and /var/log/journal // It also expects to be able to write to /sys/fs/cgroup/systemd and /var/log/journal
func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) error { func (c *Container) setupSystemd(mounts []spec.Mount, g generate.Generator) error {
@ -1730,6 +1769,7 @@ rootless=%d
c.state.BindMounts[dest] = src c.state.BindMounts[dest] = src
} }
} }
return nil return nil
} }

View File

@ -14,6 +14,7 @@ var initInodes = map[string]bool{
"/etc/resolv.conf": true, "/etc/resolv.conf": true,
"/proc": true, "/proc": true,
"/run": true, "/run": true,
"/run/notify": true,
"/run/.containerenv": true, "/run/.containerenv": true,
"/run/secrets": true, "/run/secrets": true,
"/sys": true, "/sys": true,

View File

@ -462,7 +462,7 @@ func (r *ConmonOCIRuntime) startExec(c *Container, sessionID string, options *Ex
Setpgid: true, Setpgid: true,
} }
err = startCommandGivenSelinux(execCmd) err = startCommandGivenSelinux(execCmd, c)
// We don't need children pipes on the parent side // We don't need children pipes on the parent side
errorhandling.CloseQuiet(childSyncPipe) errorhandling.CloseQuiet(childSyncPipe)

View File

@ -364,11 +364,6 @@ func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error {
return err return err
} }
env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)} env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
if ctr.config.SdNotifyMode == define.SdNotifyModeContainer {
if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok {
env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify))
}
}
if path, ok := os.LookupEnv("PATH"); ok { if path, ok := os.LookupEnv("PATH"); ok {
env = append(env, fmt.Sprintf("PATH=%s", path)) env = append(env, fmt.Sprintf("PATH=%s", path))
} }
@ -1014,12 +1009,6 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
} }
} }
if ctr.config.SdNotifyMode == define.SdNotifyModeIgnore {
if err := os.Unsetenv("NOTIFY_SOCKET"); err != nil {
logrus.Warnf("Error unsetting NOTIFY_SOCKET %v", err)
}
}
pidfile := ctr.config.PidFile pidfile := ctr.config.PidFile
if pidfile == "" { if pidfile == "" {
pidfile = filepath.Join(ctr.state.RunDir, "pidfile") pidfile = filepath.Join(ctr.state.RunDir, "pidfile")
@ -1027,6 +1016,10 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), pidfile, ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag) args := r.sharedConmonArgs(ctr, ctr.ID(), ctr.bundlePath(), pidfile, ctr.LogPath(), r.exitsDir, ociLog, ctr.LogDriver(), logTag)
if ctr.config.SdNotifyMode == define.SdNotifyModeContainer && ctr.notifySocket != "" {
args = append(args, fmt.Sprintf("--sdnotify-socket=%s", ctr.notifySocket))
}
if ctr.config.Spec.Process.Terminal { if ctr.config.Spec.Process.Terminal {
args = append(args, "-t") args = append(args, "-t")
} else if ctr.config.Stdin { } else if ctr.config.Stdin {
@ -1171,7 +1164,8 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
} }
} }
err = startCommandGivenSelinux(cmd) err = startCommandGivenSelinux(cmd, ctr)
// regardless of whether we errored or not, we no longer need the children pipes // regardless of whether we errored or not, we no longer need the children pipes
childSyncPipe.Close() childSyncPipe.Close()
childStartPipe.Close() childStartPipe.Close()
@ -1203,7 +1197,13 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
// conmon not having a pid file is a valid state, so don't set it if we don't have it // conmon not having a pid file is a valid state, so don't set it if we don't have it
logrus.Infof("Got Conmon PID as %d", conmonPID) logrus.Infof("Got Conmon PID as %d", conmonPID)
ctr.state.ConmonPID = conmonPID ctr.state.ConmonPID = conmonPID
if ctr.config.SdNotifyMode != define.SdNotifyModeIgnore {
// Send the MAINPID via sdnotify if needed.
switch ctr.config.SdNotifyMode {
case define.SdNotifyModeContainer, define.SdNotifyModeIgnore:
// Nothing to do or conmon takes care of it already.
default:
if sent, err := daemon.SdNotify(false, fmt.Sprintf("MAINPID=%d", conmonPID)); err != nil { if sent, err := daemon.SdNotify(false, fmt.Sprintf("MAINPID=%d", conmonPID)); err != nil {
logrus.Errorf("Error notifying systemd of Conmon PID: %v", err) logrus.Errorf("Error notifying systemd of Conmon PID: %v", err)
} else if sent { } else if sent {
@ -1239,11 +1239,6 @@ func (r *ConmonOCIRuntime) configureConmonEnv(ctr *Container, runtimeDir string)
} }
extraFiles := make([]*os.File, 0) extraFiles := make([]*os.File, 0)
if ctr.config.SdNotifyMode == define.SdNotifyModeContainer {
if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok {
env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify))
}
}
if !r.sdNotify { if !r.sdNotify {
if listenfds, ok := os.LookupEnv("LISTEN_FDS"); ok { if listenfds, ok := os.LookupEnv("LISTEN_FDS"); ok {
env = append(env, fmt.Sprintf("LISTEN_FDS=%s", listenfds), "LISTEN_PID=1") env = append(env, fmt.Sprintf("LISTEN_FDS=%s", listenfds), "LISTEN_PID=1")
@ -1335,7 +1330,23 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p
// startCommandGivenSelinux starts a container ensuring to set the labels of // startCommandGivenSelinux starts a container ensuring to set the labels of
// the process to make sure SELinux doesn't block conmon communication, if SELinux is enabled // the process to make sure SELinux doesn't block conmon communication, if SELinux is enabled
func startCommandGivenSelinux(cmd *exec.Cmd) error { func startCommandGivenSelinux(cmd *exec.Cmd, ctr *Container) error {
// Make sure to unset the NOTIFY_SOCKET and reset if afterwards if needed.
switch ctr.config.SdNotifyMode {
case define.SdNotifyModeContainer, define.SdNotifyModeIgnore:
if ctr.notifySocket != "" {
if err := os.Unsetenv("NOTIFY_SOCKET"); err != nil {
logrus.Warnf("Error unsetting NOTIFY_SOCKET %v", err)
}
defer func() {
if err := os.Setenv("NOTIFY_SOCKET", ctr.notifySocket); err != nil {
logrus.Errorf("Error resetting NOTIFY_SOCKET=%s", ctr.notifySocket)
}
}()
}
}
if !selinux.GetEnabled() { if !selinux.GetEnabled() {
return cmd.Start() return cmd.Start()
} }

View File

@ -221,7 +221,6 @@ function _confirm_update() {
} }
@test "podman auto-update - label io.containers.autoupdate=local with rollback" { @test "podman auto-update - label io.containers.autoupdate=local with rollback" {
skip "This test flakes way too often, see #11175"
# sdnotify fails with runc 1.0.0-3-dev2 on Ubuntu. Let's just # sdnotify fails with runc 1.0.0-3-dev2 on Ubuntu. Let's just
# assume that we work only with crun, nothing else. # assume that we work only with crun, nothing else.
# [copied from 260-sdnotify.bats] # [copied from 260-sdnotify.bats]

View File

@ -130,6 +130,8 @@ function _assert_mainpid_is_conmon() {
_stop_socat _stop_socat
} }
# These tests can fail in dev. environment because of SELinux.
# quick fix: chcon -t container_runtime_exec_t ./bin/podman
@test "sdnotify : container" { @test "sdnotify : container" {
# Sigh... we need to pull a humongous image because it has systemd-notify. # Sigh... we need to pull a humongous image because it has systemd-notify.
# (IMPORTANT: fedora:32 and above silently removed systemd-notify; this # (IMPORTANT: fedora:32 and above silently removed systemd-notify; this
@ -150,7 +152,7 @@ function _assert_mainpid_is_conmon() {
wait_for_ready $cid wait_for_ready $cid
run_podman logs $cid run_podman logs $cid
is "${lines[0]}" "/.*/container\.sock/notify" "NOTIFY_SOCKET is passed to container" is "${lines[0]}" "/run/notify/notify.sock" "NOTIFY_SOCKET is passed to container"
# With container, READY=1 isn't necessarily the last message received; # With container, READY=1 isn't necessarily the last message received;
# just look for it anywhere in received messages # just look for it anywhere in received messages