From a687c38860149e28c34e6ef4113a6e0f116b6ba1 Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Fri, 24 Nov 2023 18:00:24 +0100 Subject: [PATCH] use rootless netns from c/common Use the new rootlessnetns logic from c/common, drop the podman code here and make use of the new much simpler API. ref: https://github.com/containers/common/pull/1761 [NO NEW TESTS NEEDED] Signed-off-by: Paul Holzinger --- cmd/podman/system/service_abi.go | 3 +- go.mod | 2 +- go.sum | 4 +- libpod/container_internal_linux.go | 3 +- libpod/healthcheck_linux.go | 4 +- libpod/networking_common.go | 43 +- libpod/networking_linux.go | 457 --------------- libpod/oci_conmon_common.go | 3 +- libpod/oci_conmon_linux.go | 4 +- libpod/runtime.go | 4 +- libpod/runtime_pod_linux.go | 5 +- pkg/domain/infra/abi/system.go | 22 +- pkg/specgen/generate/validate.go | 3 +- utils/testdata/cgroup.empty | 0 utils/testdata/cgroup.other | 1 - utils/testdata/cgroup.root | 1 - utils/utils.go | 119 ---- utils/utils_supported.go | 205 ------- utils/utils_test.go | 26 - utils/utils_windows.go | 26 - .../containers/common/libimage/copier.go | 12 +- .../common/libnetwork/cni/cni_exec.go | 12 + .../common/libnetwork/cni/network.go | 36 +- .../containers/common/libnetwork/cni/run.go | 152 +++-- .../internal/rootlessnetns/netns.go | 8 + .../internal/rootlessnetns/netns_freebsd.go | 28 + .../internal/rootlessnetns/netns_linux.go | 545 ++++++++++++++++++ .../common/libnetwork/netavark/exec.go | 10 + .../common/libnetwork/netavark/network.go | 45 +- .../common/libnetwork/netavark/run.go | 40 +- .../common/libnetwork/network/interface.go | 28 +- .../common/libnetwork/types/define.go | 3 + .../common/libnetwork/types/network.go | 4 + .../common/pkg/cgroups/utils_linux.go | 170 ++++++ .../common/pkg/netns/netns_linux.go | 45 +- .../common/pkg/systemd/systemd_linux.go | 151 +++++ .../common/pkg/systemd/systemd_unsupported.go | 15 + vendor/modules.txt | 4 +- 38 files changed, 1171 insertions(+), 1072 deletions(-) delete mode 100644 
utils/testdata/cgroup.empty delete mode 100644 utils/testdata/cgroup.other delete mode 100644 utils/testdata/cgroup.root delete mode 100644 utils/utils_supported.go delete mode 100644 utils/utils_test.go delete mode 100644 utils/utils_windows.go create mode 100644 vendor/github.com/containers/common/libnetwork/internal/rootlessnetns/netns.go create mode 100644 vendor/github.com/containers/common/libnetwork/internal/rootlessnetns/netns_freebsd.go create mode 100644 vendor/github.com/containers/common/libnetwork/internal/rootlessnetns/netns_linux.go create mode 100644 vendor/github.com/containers/common/pkg/systemd/systemd_linux.go create mode 100644 vendor/github.com/containers/common/pkg/systemd/systemd_unsupported.go diff --git a/cmd/podman/system/service_abi.go b/cmd/podman/system/service_abi.go index 50bf809c3d..815d8062bb 100644 --- a/cmd/podman/system/service_abi.go +++ b/cmd/podman/system/service_abi.go @@ -18,7 +18,6 @@ import ( "github.com/containers/podman/v4/pkg/domain/entities" "github.com/containers/podman/v4/pkg/domain/infra" "github.com/containers/podman/v4/pkg/rootless" - "github.com/containers/podman/v4/utils" "github.com/coreos/go-systemd/v22/activation" "github.com/sirupsen/logrus" "github.com/spf13/pflag" @@ -131,7 +130,7 @@ func restService(flags *pflag.FlagSet, cfg *entities.PodmanConfig, opts entities logrus.Warnf("Running 'system service' in rootless mode without cgroup v2, containers won't survive a 'system service' restart") } - if err := utils.MaybeMoveToSubCgroup(); err != nil { + if err := cgroups.MaybeMoveToSubCgroup(); err != nil { // it is a best effort operation, so just print the // error for debugging purposes. 
logrus.Debugf("Could not move to subcgroup: %v", err) diff --git a/go.mod b/go.mod index e50aebd609..fb5330c29b 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/containernetworking/cni v1.1.2 github.com/containernetworking/plugins v1.4.0 github.com/containers/buildah v1.33.2-0.20231121195905-d1a1c53c8e1c - github.com/containers/common v0.57.1-0.20231130092720-630c929caef9 + github.com/containers/common v0.57.1-0.20231206135104-b647eb3a5eea github.com/containers/conmon v2.0.20+incompatible github.com/containers/gvisor-tap-vsock v0.7.1 github.com/containers/image/v5 v5.29.1-0.20231201205726-671ab94a09ea diff --git a/go.sum b/go.sum index 8ec0ea2075..e049facb7f 100644 --- a/go.sum +++ b/go.sum @@ -256,8 +256,8 @@ github.com/containernetworking/plugins v1.4.0 h1:+w22VPYgk7nQHw7KT92lsRmuToHvb7w github.com/containernetworking/plugins v1.4.0/go.mod h1:UYhcOyjefnrQvKvmmyEKsUA+M9Nfn7tqULPpH0Pkcj0= github.com/containers/buildah v1.33.2-0.20231121195905-d1a1c53c8e1c h1:E7nxvH3N3kpyson0waJv1X+eY9hAs+x2zQswsK+//yY= github.com/containers/buildah v1.33.2-0.20231121195905-d1a1c53c8e1c/go.mod h1:oMNfVrZGEfWVOxXTNOYPMdZzDfSo2umURK/TO0d8TRk= -github.com/containers/common v0.57.1-0.20231130092720-630c929caef9 h1:56pMgYcYyhTlmPPhRmG34NBmT5S/IwMMmOq0o4LJAMo= -github.com/containers/common v0.57.1-0.20231130092720-630c929caef9/go.mod h1:1TyelTjZvU4ZVSq6tGl0ImFlMKIbE8QkzpACQCdcs4U= +github.com/containers/common v0.57.1-0.20231206135104-b647eb3a5eea h1:PI6EWt76Df+v4KrZ6Wn1Fvz/zQvbAYO+2gAQeBGzj3s= +github.com/containers/common v0.57.1-0.20231206135104-b647eb3a5eea/go.mod h1:WbO7Tl8eLCt/+b35lsuc1NkWy7cZsdgF84EJ7VKhgOU= github.com/containers/conmon v2.0.20+incompatible h1:YbCVSFSCqFjjVwHTPINGdMX1F6JXHGTUje2ZYobNrkg= github.com/containers/conmon v2.0.20+incompatible/go.mod h1:hgwZ2mtuDrppv78a/cOBNiCm6O0UMWGx1mu7P00nu5I= github.com/containers/gvisor-tap-vsock v0.7.1 h1:+Rc+sOPplrkQb/BUXeN0ug8TxjgyrIqo/9P/eNS2A4c= diff --git a/libpod/container_internal_linux.go 
b/libpod/container_internal_linux.go index 7f6508d380..d70b9c78a6 100644 --- a/libpod/container_internal_linux.go +++ b/libpod/container_internal_linux.go @@ -21,7 +21,6 @@ import ( "github.com/containers/common/pkg/config" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/pkg/rootless" - "github.com/containers/podman/v4/utils" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/runtime-tools/generate" "github.com/opencontainers/selinux/go-selinux/label" @@ -390,7 +389,7 @@ func (c *Container) getOCICgroupPath() (string, error) { case c.config.NoCgroups: return "", nil case c.config.CgroupsMode == cgroupSplit: - selfCgroup, err := utils.GetOwnCgroupDisallowRoot() + selfCgroup, err := cgroups.GetOwnCgroupDisallowRoot() if err != nil { return "", err } diff --git a/libpod/healthcheck_linux.go b/libpod/healthcheck_linux.go index 53ec0c1dee..08a35415e4 100644 --- a/libpod/healthcheck_linux.go +++ b/libpod/healthcheck_linux.go @@ -10,10 +10,10 @@ import ( "os/exec" "strings" + systemdCommon "github.com/containers/common/pkg/systemd" "github.com/containers/podman/v4/pkg/errorhandling" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/pkg/systemd" - "github.com/containers/podman/v4/utils" "github.com/sirupsen/logrus" ) @@ -138,7 +138,7 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool) er } func (c *Container) disableHealthCheckSystemd(isStartup bool) bool { - if !utils.RunsOnSystemd() || os.Getenv("DISABLE_HC_SYSTEMD") == "true" { + if !systemdCommon.RunsOnSystemd() || os.Getenv("DISABLE_HC_SYSTEMD") == "true" { return true } if isStartup { diff --git a/libpod/networking_common.go b/libpod/networking_common.go index 4e46671fc4..d3a3981f69 100644 --- a/libpod/networking_common.go +++ b/libpod/networking_common.go @@ -65,24 +65,7 @@ func (c *Container) getNetworkOptions(networkOpts map[string]types.PerNetworkOpt // setUpNetwork will set up the networks, 
on error it will also tear down the cni // networks. If rootless it will join/create the rootless network namespace. func (r *Runtime) setUpNetwork(ns string, opts types.NetworkOptions) (map[string]types.StatusBlock, error) { - rootlessNetNS, err := r.GetRootlessNetNs(true) - if err != nil { - return nil, err - } - var results map[string]types.StatusBlock - setUpPod := func() error { - results, err = r.network.Setup(ns, types.SetupOptions{NetworkOptions: opts}) - return err - } - // rootlessNetNS is nil if we are root - if rootlessNetNS != nil { - // execute the setup in the rootless net ns - err = rootlessNetNS.Do(setUpPod) - rootlessNetNS.Lock.Unlock() - } else { - err = setUpPod() - } - return results, err + return r.network.Setup(ns, types.SetupOptions{NetworkOptions: opts}) } // getNetworkPodName return the pod name (hostname) used by dns backend. @@ -100,29 +83,7 @@ func getNetworkPodName(c *Container) string { // Tear down a container's network configuration and joins the // rootless net ns as rootless user func (r *Runtime) teardownNetworkBackend(ns string, opts types.NetworkOptions) error { - rootlessNetNS, err := r.GetRootlessNetNs(false) - if err != nil { - return err - } - tearDownPod := func() error { - if err := r.network.Teardown(ns, types.TeardownOptions{NetworkOptions: opts}); err != nil { - return fmt.Errorf("tearing down network namespace configuration for container %s: %w", opts.ContainerID, err) - } - return nil - } - - // rootlessNetNS is nil if we are root - if rootlessNetNS != nil { - // execute the network setup in the rootless net ns - err = rootlessNetNS.Do(tearDownPod) - if cerr := rootlessNetNS.Cleanup(r); cerr != nil { - logrus.WithError(cerr).Error("failed to clean up rootless netns") - } - rootlessNetNS.Lock.Unlock() - } else { - err = tearDownPod() - } - return err + return r.network.Teardown(ns, types.TeardownOptions{NetworkOptions: opts}) } // Tear down a container's network backend configuration, but do not tear down the diff 
--git a/libpod/networking_linux.go b/libpod/networking_linux.go index 6f8299a657..2c0172f64d 100644 --- a/libpod/networking_linux.go +++ b/libpod/networking_linux.go @@ -5,479 +5,22 @@ package libpod import ( "crypto/rand" - "crypto/sha256" - "errors" "fmt" "net" "os" "path/filepath" - "strconv" - "strings" - "syscall" "github.com/containernetworking/plugins/pkg/ns" - "github.com/containers/common/libnetwork/resolvconf" - "github.com/containers/common/libnetwork/slirp4netns" "github.com/containers/common/libnetwork/types" netUtil "github.com/containers/common/libnetwork/util" "github.com/containers/common/pkg/netns" - "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/pkg/rootless" - "github.com/containers/podman/v4/pkg/util" - "github.com/containers/podman/v4/utils" - "github.com/containers/storage/pkg/lockfile" "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" "golang.org/x/sys/unix" ) -const ( - // rootlessNetNsName is the file name for the rootless network namespace bind mount - rootlessNetNsName = "rootless-netns" - - // rootlessNetNsSilrp4netnsPidFile is the name of the rootless netns slirp4netns pid file - rootlessNetNsSilrp4netnsPidFile = "rootless-netns-slirp4netns.pid" - - // persistentCNIDir is the directory where the CNI files are stored - persistentCNIDir = "/var/lib/cni" -) - -type RootlessNetNS struct { - ns ns.NetNS - dir string - Lock *lockfile.LockFile -} - -// getPath will join the given path to the rootless netns dir -func (r *RootlessNetNS) getPath(path string) string { - return filepath.Join(r.dir, path) -} - -// Do - run the given function in the rootless netns. -// It does not lock the rootlessCNI lock, the caller -// should only lock when needed, e.g. for network operations. 
-func (r *RootlessNetNS) Do(toRun func() error) error { - err := r.ns.Do(func(_ ns.NetNS) error { - // Before we can run the given function, - // we have to set up all mounts correctly. - - // The order of the mounts is IMPORTANT. - // The idea of the extra mount ns is to make /run and /var/lib/cni writeable - // for the cni plugins but not affecting the podman user namespace. - // Because the plugins also need access to XDG_RUNTIME_DIR/netns some special setup is needed. - - // The following bind mounts are needed - // 1. XDG_RUNTIME_DIR -> XDG_RUNTIME_DIR/rootless-netns/XDG_RUNTIME_DIR - // 2. /run/systemd -> XDG_RUNTIME_DIR/rootless-netns/run/systemd (only if it exists) - // 3. XDG_RUNTIME_DIR/rootless-netns/resolv.conf -> /etc/resolv.conf or XDG_RUNTIME_DIR/rootless-netns/run/symlink/target - // 4. XDG_RUNTIME_DIR/rootless-netns/var/lib/cni -> /var/lib/cni (if /var/lib/cni does not exist, use the parent dir) - // 5. XDG_RUNTIME_DIR/rootless-netns/run -> /run - - // Create a new mount namespace, - // this must happen inside the netns thread. - err := unix.Unshare(unix.CLONE_NEWNS) - if err != nil { - return fmt.Errorf("cannot create a new mount namespace: %w", err) - } - - xdgRuntimeDir, err := util.GetRootlessRuntimeDir() - if err != nil { - return fmt.Errorf("could not get runtime directory: %w", err) - } - newXDGRuntimeDir := r.getPath(xdgRuntimeDir) - // 1. Mount the netns into the new run to keep them accessible. - // Otherwise cni setup will fail because it cannot access the netns files. - err = unix.Mount(xdgRuntimeDir, newXDGRuntimeDir, "none", unix.MS_BIND|unix.MS_SHARED|unix.MS_REC, "") - if err != nil { - return fmt.Errorf("failed to mount runtime directory for rootless netns: %w", err) - } - - // 2. Also keep /run/systemd if it exists. - // Many files are symlinked into this dir, for example /dev/log. 
- runSystemd := "/run/systemd" - _, err = os.Stat(runSystemd) - if err == nil { - newRunSystemd := r.getPath(runSystemd) - err = unix.Mount(runSystemd, newRunSystemd, "none", unix.MS_BIND|unix.MS_REC, "") - if err != nil { - return fmt.Errorf("failed to mount /run/systemd directory for rootless netns: %w", err) - } - } - - // 3. On some distros /etc/resolv.conf is symlinked to somewhere under /run. - // Because the kernel will follow the symlink before mounting, it is not - // possible to mount a file at /etc/resolv.conf. We have to ensure that - // the link target will be available in the mount ns. - // see: https://github.com/containers/podman/issues/10855 - resolvePath := "/etc/resolv.conf" - linkCount := 0 - for i := 1; i < len(resolvePath); i++ { - // Do not use filepath.EvalSymlinks, we only want the first symlink under /run. - // If /etc/resolv.conf has more than one symlink under /run, e.g. - // -> /run/systemd/resolve/stub-resolv.conf -> /run/systemd/resolve/resolv.conf - // we would put the netns resolv.conf file to the last path. However this will - // break dns because the second link does not exist in the mount ns. - // see https://github.com/containers/podman/issues/11222 - // - // We also need to resolve all path components not just the last file. 
- // see https://github.com/containers/podman/issues/12461 - - if resolvePath[i] != '/' { - // if we are at the last char we need to inc i by one because there is no final slash - if i == len(resolvePath)-1 { - i++ - } else { - // not the end of path, keep going - continue - } - } - path := resolvePath[:i] - - fi, err := os.Lstat(path) - if err != nil { - return fmt.Errorf("failed to stat resolv.conf path: %w", err) - } - - // no link, just continue - if fi.Mode()&os.ModeSymlink == 0 { - continue - } - - link, err := os.Readlink(path) - if err != nil { - return fmt.Errorf("failed to read resolv.conf symlink: %w", err) - } - linkCount++ - if filepath.IsAbs(link) { - // link is as an absolute path - resolvePath = filepath.Join(link, resolvePath[i:]) - } else { - // link is as a relative, join it with the previous path - base := filepath.Dir(path) - resolvePath = filepath.Join(base, link, resolvePath[i:]) - } - // set i back to zero since we now have a new base path - i = 0 - - // we have to stop at the first path under /run because we will have an empty /run and will create the path anyway - // if we would continue we would need to recreate all links under /run - if strings.HasPrefix(resolvePath, "/run/") { - break - } - // make sure wo do not loop forever - if linkCount == 255 { - return errors.New("too many symlinks while resolving /etc/resolv.conf") - } - } - logrus.Debugf("The path of /etc/resolv.conf in the mount ns is %q", resolvePath) - // When /etc/resolv.conf on the host is a symlink to /run/systemd/resolve/stub-resolv.conf, - // we have to mount an empty filesystem on /run/systemd/resolve in the child namespace, - // so as to isolate the directory from the host mount namespace. - // - // Otherwise our bind-mount for /run/systemd/resolve/stub-resolv.conf is unmounted - // when systemd-resolved unlinks and recreates /run/systemd/resolve/stub-resolv.conf on the host. 
- // see: https://github.com/containers/podman/issues/10929 - if strings.HasPrefix(resolvePath, "/run/systemd/resolve/") { - rsr := r.getPath("/run/systemd/resolve") - err = unix.Mount("", rsr, define.TypeTmpfs, unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, "") - if err != nil { - return fmt.Errorf("failed to mount tmpfs on %q for rootless netns: %w", rsr, err) - } - } - if strings.HasPrefix(resolvePath, "/run/") { - resolvePath = r.getPath(resolvePath) - err = os.MkdirAll(filepath.Dir(resolvePath), 0700) - if err != nil { - return fmt.Errorf("failed to create rootless-netns resolv.conf directory: %w", err) - } - // we want to bind mount on this file so we have to create the file first - _, err = os.OpenFile(resolvePath, os.O_CREATE|os.O_RDONLY, 0700) - if err != nil { - return fmt.Errorf("failed to create rootless-netns resolv.conf file: %w", err) - } - } - // mount resolv.conf to make use of the host dns - err = unix.Mount(r.getPath("resolv.conf"), resolvePath, "none", unix.MS_BIND, "") - if err != nil { - return fmt.Errorf("failed to mount resolv.conf for rootless netns: %w", err) - } - - // 4. CNI plugins need access to /var/lib/cni and /run - varDir := "" - varTarget := persistentCNIDir - // we can only mount to a target dir which exists, check /var/lib/cni recursively - // while we could always use /var there are cases where a user might store the cni - // configs under /var/custom and this would break - for { - if _, err := os.Stat(varTarget); err == nil { - varDir = r.getPath(varTarget) - break - } - varTarget = filepath.Dir(varTarget) - if varTarget == "/" { - break - } - } - if varDir == "" { - return errors.New("failed to stat /var directory") - } - // make sure to mount var first - err = unix.Mount(varDir, varTarget, "none", unix.MS_BIND, "") - if err != nil { - return fmt.Errorf("failed to mount %s for rootless netns: %w", varTarget, err) - } - - // 5. Mount the new prepared run dir to /run, it has to be recursive to keep the other bind mounts. 
- runDir := r.getPath("run") - err = unix.Mount(runDir, "/run", "none", unix.MS_BIND|unix.MS_REC, "") - if err != nil { - return fmt.Errorf("failed to mount /run for rootless netns: %w", err) - } - - // run the given function in the correct namespace - err = toRun() - return err - }) - return err -} - -// Clean up the rootless network namespace if needed. -// It checks if we have running containers with the bridge network mode. -// Cleanup() expects that r.Lock is locked -func (r *RootlessNetNS) Cleanup(runtime *Runtime) error { - _, err := os.Stat(r.dir) - if os.IsNotExist(err) { - // the directory does not exist, so no need for cleanup - return nil - } - activeNetns := func(c *Container) bool { - // no bridge => no need to check - if !c.config.NetMode.IsBridge() { - return false - } - - // we cannot use c.state() because it will try to lock the container - // locking is a problem because cleanup is called after net teardown - // at this stage the container is already locked. - // also do not try to lock only containers which are not currently in net - // teardown because this will result in an ABBA deadlock between the rootless - // rootless netns lock and the container lock - // because we need to get the state we have to sync otherwise this will not - // work because the state is empty by default - // I do not like this but I do not see a better way at moment - err := c.syncContainer() - if err != nil { - return false - } - - // only check for an active netns, we cannot use the container state - // because not running does not mean that the netns does not need cleanup - // only if the netns is empty we know that we do not need cleanup - return c.state.NetNS != "" - } - ctrs, err := runtime.GetContainers(false, activeNetns) - if err != nil { - return err - } - // no cleanup if we found no other containers with a netns - // we will always find one container (the container cleanup that is currently calling us) - if len(ctrs) > 1 { - return nil - } - 
logrus.Debug("Cleaning up rootless network namespace") - err = netns.UnmountNS(r.ns.Path()) - if err != nil { - return err - } - // make the following errors not fatal - err = r.ns.Close() - if err != nil { - logrus.Error(err) - } - b, err := os.ReadFile(r.getPath(rootlessNetNsSilrp4netnsPidFile)) - if err == nil { - var i int - i, err = strconv.Atoi(string(b)) - if err == nil { - // kill the slirp process so we do not leak it - err = syscall.Kill(i, syscall.SIGTERM) - } - } - if err != nil { - logrus.Errorf("Failed to kill slirp4netns process: %v", err) - } - err = os.RemoveAll(r.dir) - if err != nil { - logrus.Error(err) - } - return nil -} - -// GetRootlessNetNs returns the rootless netns object. If create is set to true -// the rootless network namespace will be created if it does not already exist. -// If called as root it returns always nil. -// On success the returned RootlessCNI lock is locked and must be unlocked by the caller. -func (r *Runtime) GetRootlessNetNs(new bool) (*RootlessNetNS, error) { - if !rootless.IsRootless() { - return nil, nil - } - var rootlessNetNS *RootlessNetNS - runDir := r.config.Engine.TmpDir - - lfile := filepath.Join(runDir, "rootless-netns.lock") - lock, err := lockfile.GetLockFile(lfile) - if err != nil { - return nil, fmt.Errorf("failed to get rootless-netns lockfile: %w", err) - } - lock.Lock() - defer func() { - // In case of an error (early exit) rootlessNetNS will be nil. - // Make sure to unlock otherwise we could deadlock. 
- if rootlessNetNS == nil { - lock.Unlock() - } - }() - - rootlessNetNsDir := filepath.Join(runDir, rootlessNetNsName) - err = os.MkdirAll(rootlessNetNsDir, 0700) - if err != nil { - return nil, fmt.Errorf("could not create rootless-netns directory: %w", err) - } - - nsDir, err := netns.GetNSRunDir() - if err != nil { - return nil, err - } - - // create a hash from the static dir - // the cleanup will check if there are running containers - // if you run a several libpod instances with different root/runroot directories this check will fail - // we want one netns for each libpod static dir so we use the hash to prevent name collisions - hash := sha256.Sum256([]byte(r.config.Engine.StaticDir)) - netnsName := fmt.Sprintf("%s-%x", rootlessNetNsName, hash[:10]) - - path := filepath.Join(nsDir, netnsName) - nsReference, err := ns.GetNS(path) - if err != nil { - if !new { - // return an error if we could not get the namespace and should no create one - return nil, fmt.Errorf("getting rootless network namespace: %w", err) - } - - // When the netns is not valid but the file exists we have to remove it first, - // https://github.com/containers/common/pull/1381 changed the behavior from - // NewNSWithName()so it will now error when the file already exists. 
- // https://github.com/containers/podman/issues/17903#issuecomment-1494329622 - if errors.As(err, &ns.NSPathNotNSErr{}) { - logrus.Infof("rootless netns is no longer valid: %v", err) - // ignore errors, if something is wrong NewNSWithName() will fail below anyway - _ = os.Remove(path) - } - - // create a new namespace - logrus.Debugf("creating rootless network namespace with name %q", netnsName) - nsReference, err = netns.NewNSWithName(netnsName) - if err != nil { - return nil, fmt.Errorf("creating rootless network namespace: %w", err) - } - res, err := slirp4netns.Setup(&slirp4netns.SetupOptions{ - Config: r.config, - ContainerID: "rootless-netns", - Netns: nsReference.Path(), - }) - if err != nil { - return nil, fmt.Errorf("failed to start rootless-netns slirp4netns: %w", err) - } - // create pid file for the slirp4netns process - // this is need to kill the process in the cleanup - pid := strconv.Itoa(res.Pid) - err = os.WriteFile(filepath.Join(rootlessNetNsDir, rootlessNetNsSilrp4netnsPidFile), []byte(pid), 0700) - if err != nil { - return nil, fmt.Errorf("unable to write rootless-netns slirp4netns pid file: %w", err) - } - - if utils.RunsOnSystemd() { - // move to systemd scope to prevent systemd from killing it - err = utils.MoveRootlessNetnsSlirpProcessToUserSlice(res.Pid) - if err != nil { - // only log this, it is not fatal but can lead to issues when running podman inside systemd units - logrus.Errorf("failed to move the rootless netns slirp4netns process to the systemd user.slice: %v", err) - } - } - - // build a new resolv.conf file which uses the slirp4netns dns server address - resolveIP, err := slirp4netns.GetDNS(res.Subnet) - if err != nil { - return nil, fmt.Errorf("failed to determine default slirp4netns DNS address: %w", err) - } - - if err := resolvconf.New(&resolvconf.Params{ - Path: filepath.Join(rootlessNetNsDir, "resolv.conf"), - // fake the netns since we want to filter localhost - Namespaces: []specs.LinuxNamespace{ - {Type: 
specs.NetworkNamespace}, - }, - IPv6Enabled: res.IPv6, - KeepHostServers: true, - Nameservers: []string{resolveIP.String()}, - }); err != nil { - return nil, fmt.Errorf("failed to create rootless netns resolv.conf: %w", err) - } - // create cni directories to store files - // they will be bind mounted to the correct location in an extra mount ns - err = os.MkdirAll(filepath.Join(rootlessNetNsDir, persistentCNIDir), 0700) - if err != nil { - return nil, fmt.Errorf("could not create rootless-netns var directory: %w", err) - } - runDir := filepath.Join(rootlessNetNsDir, "run") - err = os.MkdirAll(runDir, 0700) - if err != nil { - return nil, fmt.Errorf("could not create rootless-netns run directory: %w", err) - } - // relabel the new run directory to the iptables /run label - // this is important, otherwise the iptables command will fail - err = label.Relabel(runDir, "system_u:object_r:iptables_var_run_t:s0", false) - if err != nil { - if !errors.Is(err, unix.ENOTSUP) { - return nil, fmt.Errorf("could not create relabel rootless-netns run directory: %w", err) - } - logrus.Debugf("Labeling not supported on %q", runDir) - } - // create systemd run directory - err = os.MkdirAll(filepath.Join(runDir, "systemd"), 0700) - if err != nil { - return nil, fmt.Errorf("could not create rootless-netns systemd directory: %w", err) - } - // create the directory for the netns files at the same location - // relative to the rootless-netns location - err = os.MkdirAll(filepath.Join(rootlessNetNsDir, nsDir), 0700) - if err != nil { - return nil, fmt.Errorf("could not create rootless-netns netns directory: %w", err) - } - } - - // The CNI plugins and netavark need access to iptables in $PATH. As it turns out debian doesn't put - // /usr/sbin in $PATH for rootless users. This will break rootless networking completely. - // We might break existing users and we cannot expect everyone to change their $PATH so - // let's add /usr/sbin to $PATH ourselves. 
- path = os.Getenv("PATH") - if !strings.Contains(path, "/usr/sbin") { - path += ":/usr/sbin" - os.Setenv("PATH", path) - } - - // Important set rootlessNetNS as last step. - // Do not return any errors after this. - rootlessNetNS = &RootlessNetNS{ - ns: nsReference, - dir: rootlessNetNsDir, - Lock: lock, - } - return rootlessNetNS, nil -} - // Create and configure a new network namespace for a container func (r *Runtime) configureNetNS(ctr *Container, ctrNS string) (status map[string]types.StatusBlock, rerr error) { if err := r.exposeMachinePorts(ctr.config.PortMappings); err != nil { diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index e40c843833..c3e68cf060 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -23,6 +23,7 @@ import ( "text/template" "time" + "github.com/containers/common/pkg/cgroups" "github.com/containers/common/pkg/config" "github.com/containers/common/pkg/detach" "github.com/containers/common/pkg/resize" @@ -1099,7 +1100,7 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co } if ctr.config.CgroupsMode == cgroupSplit { - if err := utils.MoveUnderCgroupSubtree("runtime"); err != nil { + if err := cgroups.MoveUnderCgroupSubtree("runtime"); err != nil { return 0, err } } diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index d68a303331..b029b16a56 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -16,9 +16,9 @@ import ( "github.com/containers/common/pkg/cgroups" "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/systemd" "github.com/containers/podman/v4/pkg/errorhandling" "github.com/containers/podman/v4/pkg/rootless" - "github.com/containers/podman/v4/utils" pmount "github.com/containers/storage/pkg/mount" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" @@ -149,7 +149,7 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr 
*Container, cmd *exec } logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName) - if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil { + if err := systemd.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil { logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err) } } else { diff --git a/libpod/runtime.go b/libpod/runtime.go index 5130eb6fb5..2911c57ce2 100644 --- a/libpod/runtime.go +++ b/libpod/runtime.go @@ -25,6 +25,7 @@ import ( "github.com/containers/common/pkg/cgroups" "github.com/containers/common/pkg/config" "github.com/containers/common/pkg/secrets" + systemdCommon "github.com/containers/common/pkg/systemd" "github.com/containers/image/v5/pkg/sysregistriesv2" is "github.com/containers/image/v5/storage" "github.com/containers/image/v5/types" @@ -36,7 +37,6 @@ import ( "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/pkg/systemd" "github.com/containers/podman/v4/pkg/util" - "github.com/containers/podman/v4/utils" "github.com/containers/storage" "github.com/containers/storage/pkg/lockfile" "github.com/containers/storage/pkg/unshare" @@ -608,7 +608,7 @@ func makeRuntime(runtime *Runtime) (retErr error) { if became { // Check if the pause process was created. If it was created, then // move it to its own systemd scope. 
- utils.MovePauseProcessToScope(pausePid) + systemdCommon.MovePauseProcessToScope(pausePid) // gocritic complains because defer is not run on os.Exit() // However this is fine because the lock is released anyway when the process exits diff --git a/libpod/runtime_pod_linux.go b/libpod/runtime_pod_linux.go index 3a99af2a5a..5c9e6ec651 100644 --- a/libpod/runtime_pod_linux.go +++ b/libpod/runtime_pod_linux.go @@ -13,7 +13,6 @@ import ( "github.com/containers/common/pkg/config" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/pkg/rootless" - "github.com/containers/podman/v4/utils" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" ) @@ -97,7 +96,7 @@ func (p *Pod) removePodCgroup() error { } logrus.Debugf("Removing pod cgroup %s", p.state.CgroupPath) - cgroup, err := utils.GetOwnCgroup() + cgroup, err := cgroups.GetOwnCgroup() if err != nil { return err } @@ -106,7 +105,7 @@ func (p *Pod) removePodCgroup() error { // current process out of it before the cgroup is destroyed. 
if isSubDir(cgroup, string(filepath.Separator)+p.state.CgroupPath) { parent := path.Dir(p.state.CgroupPath) - if err := utils.MoveUnderCgroup(parent, "cleanup", nil); err != nil { + if err := cgroups.MoveUnderCgroup(parent, "cleanup", nil); err != nil { return err } } diff --git a/pkg/domain/infra/abi/system.go b/pkg/domain/infra/abi/system.go index 987df8dfb9..6811311619 100644 --- a/pkg/domain/infra/abi/system.go +++ b/pkg/domain/infra/abi/system.go @@ -11,12 +11,12 @@ import ( "github.com/containers/common/pkg/cgroups" "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/systemd" "github.com/containers/podman/v4/libpod/define" "github.com/containers/podman/v4/pkg/domain/entities" "github.com/containers/podman/v4/pkg/domain/entities/reports" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/pkg/util" - "github.com/containers/podman/v4/utils" "github.com/containers/storage" "github.com/containers/storage/pkg/directory" "github.com/containers/storage/pkg/unshare" @@ -67,11 +67,11 @@ func (ic *ContainerEngine) Info(ctx context.Context) (*define.Info, error) { } func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool) error { - runsUnderSystemd := utils.RunsOnSystemd() + runsUnderSystemd := systemd.RunsOnSystemd() if !runsUnderSystemd { isPid1 := os.Getpid() == 1 if _, found := os.LookupEnv("container"); isPid1 || found { - if err := utils.MaybeMoveToSubCgroup(); err != nil { + if err := cgroups.MaybeMoveToSubCgroup(); err != nil { // it is a best effort operation, so just print the // error for debugging purposes. 
logrus.Debugf("Could not move to subcgroup: %v", err) @@ -101,7 +101,7 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool) } unitName := fmt.Sprintf("podman-%d.scope", os.Getpid()) if runsUnderSystemd || conf.Engine.CgroupManager == config.SystemdCgroupsManager { - if err := utils.RunUnderSystemdScope(os.Getpid(), "user.slice", unitName); err != nil { + if err := systemd.RunUnderSystemdScope(os.Getpid(), "user.slice", unitName); err != nil { logrus.Debugf("Failed to add podman to systemd sandbox cgroup: %v", err) } } @@ -142,7 +142,7 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool) } else { became, ret, err = rootless.BecomeRootInUserNS(pausePidPath) if err == nil { - utils.MovePauseProcessToScope(pausePidPath) + systemd.MovePauseProcessToScope(pausePidPath) } } if err != nil { @@ -406,17 +406,7 @@ func (ic *ContainerEngine) Unshare(ctx context.Context, args []string, options e } if options.RootlessNetNS { - rootlessNetNS, err := ic.Libpod.GetRootlessNetNs(true) - if err != nil { - return err - } - // Make sure to unlock, unshare can run for a long time. - rootlessNetNS.Lock.Unlock() - // We do not want to clean up the netns after unshare. - // The problem is that we cannot know if we need to clean up and - // secondly unshare should allow user to set up the namespace with - // special things, e.g. potentially macvlan or something like that. 
- return rootlessNetNS.Do(unshare) + return ic.Libpod.Network().RunInRootlessNetns(unshare) } return unshare() } diff --git a/pkg/specgen/generate/validate.go b/pkg/specgen/generate/validate.go index 858fcbfc8f..ad64dd210f 100644 --- a/pkg/specgen/generate/validate.go +++ b/pkg/specgen/generate/validate.go @@ -14,7 +14,6 @@ import ( "github.com/containers/common/pkg/sysinfo" "github.com/containers/podman/v4/pkg/rootless" "github.com/containers/podman/v4/pkg/specgen" - "github.com/containers/podman/v4/utils" "github.com/opencontainers/runtime-spec/specs-go" ) @@ -179,7 +178,7 @@ func verifyContainerResourcesCgroupV2(s *specgen.SpecGenerator) ([]string, error // Memory checks if s.ResourceLimits.Memory != nil && s.ResourceLimits.Memory.Swap != nil { - own, err := utils.GetOwnCgroup() + own, err := cgroups.GetOwnCgroup() if err != nil { return warnings, err } diff --git a/utils/testdata/cgroup.empty b/utils/testdata/cgroup.empty deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/utils/testdata/cgroup.other b/utils/testdata/cgroup.other deleted file mode 100644 index 239a7cded6..0000000000 --- a/utils/testdata/cgroup.other +++ /dev/null @@ -1 +0,0 @@ -0::/other diff --git a/utils/testdata/cgroup.root b/utils/testdata/cgroup.root deleted file mode 100644 index 1e027b2a3c..0000000000 --- a/utils/testdata/cgroup.root +++ /dev/null @@ -1 +0,0 @@ -0::/ diff --git a/utils/utils.go b/utils/utils.go index 08b2fa37a0..f06c256e98 100644 --- a/utils/utils.go +++ b/utils/utils.go @@ -2,20 +2,16 @@ package utils import ( "bytes" - "crypto/rand" "fmt" "io" "os" "os/exec" "strconv" "strings" - "sync" "time" - "github.com/containers/common/pkg/cgroups" "github.com/containers/storage/pkg/archive" "github.com/containers/storage/pkg/chrootarchive" - "github.com/godbus/dbus/v5" "github.com/sirupsen/logrus" "github.com/vbauerster/mpb/v8" "github.com/vbauerster/mpb/v8/decor" @@ -133,121 +129,6 @@ func RemoveScientificNotationFromFloat(x float64) (float64, error) { return 
result, nil } -var ( - runsOnSystemdOnce sync.Once - runsOnSystemd bool -) - -// RunsOnSystemd returns whether the system is using systemd -func RunsOnSystemd() bool { - runsOnSystemdOnce.Do(func() { - // per sd_booted(3), check for this dir - fd, err := os.Stat("/run/systemd/system") - runsOnSystemd = err == nil && fd.IsDir() - }) - return runsOnSystemd -} - -func moveProcessPIDFileToScope(pidPath, slice, scope string) error { - data, err := os.ReadFile(pidPath) - if err != nil { - // do not raise an error if the file doesn't exist - if os.IsNotExist(err) { - return nil - } - return fmt.Errorf("cannot read pid file: %w", err) - } - pid, err := strconv.ParseUint(string(data), 10, 0) - if err != nil { - return fmt.Errorf("cannot parse pid file %s: %w", pidPath, err) - } - - return moveProcessToScope(int(pid), slice, scope) -} - -func moveProcessToScope(pid int, slice, scope string) error { - err := RunUnderSystemdScope(pid, slice, scope) - // If the PID is not valid anymore, do not return an error. - if dbusErr, ok := err.(dbus.Error); ok { - if dbusErr.Name == "org.freedesktop.DBus.Error.UnixProcessIdUnknown" { - return nil - } - } - return err -} - -// MoveRootlessNetnsSlirpProcessToUserSlice moves the slirp4netns process for the rootless netns -// into a different scope so that systemd does not kill it with a container. -func MoveRootlessNetnsSlirpProcessToUserSlice(pid int) error { - randBytes := make([]byte, 4) - _, err := rand.Read(randBytes) - if err != nil { - return err - } - return moveProcessToScope(pid, "user.slice", fmt.Sprintf("rootless-netns-%x.scope", randBytes)) -} - -// MovePauseProcessToScope moves the pause process used for rootless mode to keep the namespaces alive to -// a separate scope. 
-func MovePauseProcessToScope(pausePidPath string) { - var err error - - for i := 0; i < 10; i++ { - randBytes := make([]byte, 4) - _, err = rand.Read(randBytes) - if err != nil { - logrus.Errorf("failed to read random bytes: %v", err) - continue - } - err = moveProcessPIDFileToScope(pausePidPath, "user.slice", fmt.Sprintf("podman-pause-%x.scope", randBytes)) - if err == nil { - return - } - } - - if err != nil { - unified, err2 := cgroups.IsCgroup2UnifiedMode() - if err2 != nil { - logrus.Warnf("Failed to detect if running with cgroup unified: %v", err) - } - if RunsOnSystemd() && unified { - logrus.Warnf("Failed to add pause process to systemd sandbox cgroup: %v", err) - } else { - logrus.Debugf("Failed to add pause process to systemd sandbox cgroup: %v", err) - } - } -} - -var ( - maybeMoveToSubCgroupSync sync.Once - maybeMoveToSubCgroupSyncErr error -) - -// MaybeMoveToSubCgroup moves the current process in a sub cgroup when -// it is running in the root cgroup on a system that uses cgroupv2. -func MaybeMoveToSubCgroup() error { - maybeMoveToSubCgroupSync.Do(func() { - unifiedMode, err := cgroups.IsCgroup2UnifiedMode() - if err != nil { - maybeMoveToSubCgroupSyncErr = err - return - } - if !unifiedMode { - maybeMoveToSubCgroupSyncErr = nil - return - } - cgroup, err := GetOwnCgroup() - if err != nil { - maybeMoveToSubCgroupSyncErr = err - return - } - if cgroup == "/" { - maybeMoveToSubCgroupSyncErr = MoveUnderCgroupSubtree("init") - } - }) - return maybeMoveToSubCgroupSyncErr -} - // GuardedRemoveAll functions much like os.RemoveAll but // will not delete certain catastrophic paths. 
func GuardedRemoveAll(path string) error { diff --git a/utils/utils_supported.go b/utils/utils_supported.go deleted file mode 100644 index 3bbd5dbbde..0000000000 --- a/utils/utils_supported.go +++ /dev/null @@ -1,205 +0,0 @@ -//go:build linux || darwin || freebsd -// +build linux darwin freebsd - -package utils - -import ( - "bufio" - "bytes" - "context" - "fmt" - "os" - "path/filepath" - "strings" - - "github.com/containers/common/pkg/cgroups" - "github.com/containers/podman/v4/pkg/rootless" - systemdDbus "github.com/coreos/go-systemd/v22/dbus" - "github.com/godbus/dbus/v5" - "github.com/sirupsen/logrus" -) - -// RunUnderSystemdScope adds the specified pid to a systemd scope -func RunUnderSystemdScope(pid int, slice string, unitName string) error { - var properties []systemdDbus.Property - var conn *systemdDbus.Conn - var err error - - if rootless.IsRootless() { - conn, err = cgroups.UserConnection(rootless.GetRootlessUID()) - if err != nil { - return err - } - } else { - conn, err = systemdDbus.NewWithContext(context.Background()) - if err != nil { - return err - } - } - defer conn.Close() - properties = append(properties, systemdDbus.PropSlice(slice)) - properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) - properties = append(properties, newProp("Delegate", true)) - properties = append(properties, newProp("DefaultDependencies", false)) - ch := make(chan string) - _, err = conn.StartTransientUnitContext(context.Background(), unitName, "replace", properties, ch) - if err != nil { - // On errors check if the cgroup already exists, if it does move the process there - if props, err := conn.GetUnitTypePropertiesContext(context.Background(), unitName, "Scope"); err == nil { - if cgroup, ok := props["ControlGroup"].(string); ok && cgroup != "" { - if err := MoveUnderCgroup(cgroup, "", []uint32{uint32(pid)}); err == nil { - return nil - } - // On errors return the original error message we got from StartTransientUnit. 
- } - } - return err - } - - // Block until job is started - <-ch - - return nil -} - -func getCgroupProcess(procFile string, allowRoot bool) (string, error) { - f, err := os.Open(procFile) - if err != nil { - return "", err - } - defer f.Close() - - scanner := bufio.NewScanner(f) - cgroup := "" - for scanner.Scan() { - line := scanner.Text() - parts := strings.SplitN(line, ":", 3) - if len(parts) != 3 { - return "", fmt.Errorf("cannot parse cgroup line %q", line) - } - if strings.HasPrefix(line, "0::") { - cgroup = line[3:] - break - } - if len(parts[2]) > len(cgroup) { - cgroup = parts[2] - } - } - if len(cgroup) == 0 || (!allowRoot && cgroup == "/") { - return "", fmt.Errorf("could not find cgroup mount in %q", procFile) - } - return cgroup, nil -} - -// GetOwnCgroup returns the cgroup for the current process. -func GetOwnCgroup() (string, error) { - return getCgroupProcess("/proc/self/cgroup", true) -} - -func GetOwnCgroupDisallowRoot() (string, error) { - return getCgroupProcess("/proc/self/cgroup", false) -} - -// GetCgroupProcess returns the cgroup for the specified process process. -func GetCgroupProcess(pid int) (string, error) { - return getCgroupProcess(fmt.Sprintf("/proc/%d/cgroup", pid), true) -} - -// MoveUnderCgroupSubtree moves the PID under a cgroup subtree. -func MoveUnderCgroupSubtree(subtree string) error { - return MoveUnderCgroup("", subtree, nil) -} - -// MoveUnderCgroup moves a group of processes to a new cgroup. -// If cgroup is the empty string, then the current calling process cgroup is used. -// If processes is empty, then the processes from the current cgroup are moved. 
-func MoveUnderCgroup(cgroup, subtree string, processes []uint32) error { - procFile := "/proc/self/cgroup" - f, err := os.Open(procFile) - if err != nil { - return err - } - defer f.Close() - - unifiedMode, err := cgroups.IsCgroup2UnifiedMode() - if err != nil { - return err - } - - scanner := bufio.NewScanner(f) - for scanner.Scan() { - line := scanner.Text() - parts := strings.SplitN(line, ":", 3) - if len(parts) != 3 { - return fmt.Errorf("cannot parse cgroup line %q", line) - } - - // root cgroup, skip it - if parts[2] == "/" && !(unifiedMode && parts[1] == "") { - continue - } - - cgroupRoot := "/sys/fs/cgroup" - // Special case the unified mount on hybrid cgroup and named hierarchies. - // This works on Fedora 31, but we should really parse the mounts to see - // where the cgroup hierarchy is mounted. - if parts[1] == "" && !unifiedMode { - // If it is not using unified mode, the cgroup v2 hierarchy is - // usually mounted under /sys/fs/cgroup/unified - cgroupRoot = filepath.Join(cgroupRoot, "unified") - - // Ignore the unified mount if it doesn't exist - if _, err := os.Stat(cgroupRoot); err != nil && os.IsNotExist(err) { - continue - } - } else if parts[1] != "" { - // Assume the controller is mounted at /sys/fs/cgroup/$CONTROLLER. 
- controller := strings.TrimPrefix(parts[1], "name=") - cgroupRoot = filepath.Join(cgroupRoot, controller) - } - - parentCgroup := cgroup - if parentCgroup == "" { - parentCgroup = parts[2] - } - newCgroup := filepath.Join(cgroupRoot, parentCgroup, subtree) - if err := os.MkdirAll(newCgroup, 0755); err != nil && !os.IsExist(err) { - return err - } - - f, err := os.OpenFile(filepath.Join(newCgroup, "cgroup.procs"), os.O_RDWR, 0755) - if err != nil { - return err - } - defer f.Close() - - if len(processes) > 0 { - for _, pid := range processes { - if _, err := f.WriteString(fmt.Sprintf("%d\n", pid)); err != nil { - logrus.Debugf("Cannot move process %d to cgroup %q: %v", pid, newCgroup, err) - } - } - } else { - processesData, err := os.ReadFile(filepath.Join(cgroupRoot, parts[2], "cgroup.procs")) - if err != nil { - return err - } - for _, pid := range bytes.Split(processesData, []byte("\n")) { - if len(pid) == 0 { - continue - } - if _, err := f.Write(pid); err != nil { - logrus.Debugf("Cannot move process %s to cgroup %q: %v", string(pid), newCgroup, err) - } - } - } - } - return nil -} - -func newProp(name string, units interface{}) systemdDbus.Property { - return systemdDbus.Property{ - Name: name, - Value: dbus.MakeVariant(units), - } -} diff --git a/utils/utils_test.go b/utils/utils_test.go deleted file mode 100644 index 180038afc6..0000000000 --- a/utils/utils_test.go +++ /dev/null @@ -1,26 +0,0 @@ -//go:build linux || darwin || freebsd -// +build linux darwin freebsd - -package utils - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestCgroupProcess(t *testing.T) { - val, err := getCgroupProcess("testdata/cgroup.root", true) - assert.Nil(t, err) - assert.Equal(t, "/", val) - - _, err = getCgroupProcess("testdata/cgroup.root", false) - assert.NotNil(t, err) - - val, err = getCgroupProcess("testdata/cgroup.other", true) - assert.Nil(t, err) - assert.Equal(t, "/other", val) - - _, err = getCgroupProcess("testdata/cgroup.empty", true) 
- assert.NotNil(t, err) -} diff --git a/utils/utils_windows.go b/utils/utils_windows.go deleted file mode 100644 index 18f232116f..0000000000 --- a/utils/utils_windows.go +++ /dev/null @@ -1,26 +0,0 @@ -//go:build windows -// +build windows - -package utils - -import "errors" - -func RunUnderSystemdScope(pid int, slice string, unitName string) error { - return errors.New("not implemented for windows") -} - -func MoveUnderCgroupSubtree(subtree string) error { - return errors.New("not implemented for windows") -} - -func GetOwnCgroup() (string, error) { - return "", errors.New("not implemented for windows") -} - -func GetOwnCgroupDisallowRoot() (string, error) { - return "", errors.New("not implemented for windows") -} - -func GetCgroupProcess(pid int) (string, error) { - return "", errors.New("not implemented for windows") -} diff --git a/vendor/github.com/containers/common/libimage/copier.go b/vendor/github.com/containers/common/libimage/copier.go index d6acc73250..1edf7d6cb9 100644 --- a/vendor/github.com/containers/common/libimage/copier.go +++ b/vendor/github.com/containers/common/libimage/copier.go @@ -364,11 +364,13 @@ func (c *copier) copy(ctx context.Context, source, destination types.ImageRefere defer cancel() defer timer.Stop() - fmt.Fprintf(c.imageCopyOptions.ReportWriter, - "Pulling image %s inside systemd: setting pull timeout to %s\n", - source.StringWithinTransport(), - time.Duration(numExtensions)*extension, - ) + if c.imageCopyOptions.ReportWriter != nil { + fmt.Fprintf(c.imageCopyOptions.ReportWriter, + "Pulling image %s inside systemd: setting pull timeout to %s\n", + source.StringWithinTransport(), + time.Duration(numExtensions)*extension, + ) + } // From `man systemd.service(5)`: // diff --git a/vendor/github.com/containers/common/libnetwork/cni/cni_exec.go b/vendor/github.com/containers/common/libnetwork/cni/cni_exec.go index 79d7ef120c..4b7ed8c6d9 100644 --- a/vendor/github.com/containers/common/libnetwork/cni/cni_exec.go +++ 
b/vendor/github.com/containers/common/libnetwork/cni/cni_exec.go @@ -26,8 +26,10 @@ import ( "context" "encoding/json" "fmt" + "os" "os/exec" "path/filepath" + "strings" "github.com/containernetworking/cni/pkg/invoke" "github.com/containernetworking/cni/pkg/version" @@ -80,6 +82,16 @@ func (e *cniExec) ExecPlugin(ctx context.Context, pluginPath string, stdinData [ c.Env = append(c.Env, "XDG_RUNTIME_DIR=") } + // The CNI plugins need access to iptables in $PATH. As it turns out debian doesn't put + // /usr/sbin in $PATH for rootless users. This will break rootless networking completely. + // We might break existing users and we cannot expect everyone to change their $PATH so + // let's add /usr/sbin to $PATH ourselves. + path := os.Getenv("PATH") + if !strings.Contains(path, "/usr/sbin") { + path += ":/usr/sbin" + c.Env = append(c.Env, "PATH="+path) + } + err := c.Run() if err != nil { return nil, annotatePluginError(err, pluginPath, stdout.Bytes(), stderr.Bytes()) diff --git a/vendor/github.com/containers/common/libnetwork/cni/network.go b/vendor/github.com/containers/common/libnetwork/cni/network.go index 49d20b915d..7d3369af7d 100644 --- a/vendor/github.com/containers/common/libnetwork/cni/network.go +++ b/vendor/github.com/containers/common/libnetwork/cni/network.go @@ -16,6 +16,7 @@ import ( "time" "github.com/containernetworking/cni/libcni" + "github.com/containers/common/libnetwork/internal/rootlessnetns" "github.com/containers/common/libnetwork/types" "github.com/containers/common/pkg/config" "github.com/containers/common/pkg/version" @@ -53,6 +54,9 @@ type cniNetwork struct { // networks is a map with loaded networks, the key is the network name networks map[string]*network + + // rootlessNetns is used for the rootless network setup/teardown + rootlessNetns *rootlessnetns.Netns } type network struct { @@ -65,21 +69,14 @@ type network struct { type InitConfig struct { // CNIConfigDir is directory where the cni config files are stored. 
CNIConfigDir string - // CNIPluginDirs is a list of directories where cni should look for the plugins. - CNIPluginDirs []string // RunDir is a directory where temporary files can be stored. RunDir string - // DefaultNetwork is the name for the default network. - DefaultNetwork string - // DefaultSubnet is the default subnet for the default network. - DefaultSubnet string - - // DefaultsubnetPools contains the subnets which must be used to allocate a free subnet by network create - DefaultsubnetPools []config.SubnetPool - // IsMachine describes whenever podman runs in a podman machine environment. IsMachine bool + + // Config containers.conf options + Config *config.Config } // NewCNINetworkInterface creates the ContainerNetwork interface for the CNI backend. @@ -96,12 +93,12 @@ func NewCNINetworkInterface(conf *InitConfig) (types.ContainerNetwork, error) { return nil, err } - defaultNetworkName := conf.DefaultNetwork + defaultNetworkName := conf.Config.Network.DefaultNetwork if defaultNetworkName == "" { defaultNetworkName = types.DefaultNetworkName } - defaultSubnet := conf.DefaultSubnet + defaultSubnet := conf.Config.Network.DefaultSubnet if defaultSubnet == "" { defaultSubnet = types.DefaultSubnet } @@ -110,21 +107,30 @@ func NewCNINetworkInterface(conf *InitConfig) (types.ContainerNetwork, error) { return nil, fmt.Errorf("failed to parse default subnet: %w", err) } - defaultSubnetPools := conf.DefaultsubnetPools + defaultSubnetPools := conf.Config.Network.DefaultSubnetPools if defaultSubnetPools == nil { defaultSubnetPools = config.DefaultSubnetPools } - cni := libcni.NewCNIConfig(conf.CNIPluginDirs, &cniExec{}) + var netns *rootlessnetns.Netns + if unshare.IsRootless() { + netns, err = rootlessnetns.New(conf.RunDir, rootlessnetns.CNI, conf.Config) + if err != nil { + return nil, err + } + } + + cni := libcni.NewCNIConfig(conf.Config.Network.CNIPluginDirs.Values, &cniExec{}) n := &cniNetwork{ cniConfigDir: conf.CNIConfigDir, - cniPluginDirs: conf.CNIPluginDirs, 
+ cniPluginDirs: conf.Config.Network.CNIPluginDirs.Get(), cniConf: cni, defaultNetwork: defaultNetworkName, defaultSubnet: defaultNet, defaultsubnetPools: defaultSubnetPools, isMachine: conf.IsMachine, lock: lock, + rootlessNetns: netns, } return n, nil diff --git a/vendor/github.com/containers/common/libnetwork/cni/run.go b/vendor/github.com/containers/common/libnetwork/cni/run.go index 2da8da1ad0..829c127042 100644 --- a/vendor/github.com/containers/common/libnetwork/cni/run.go +++ b/vendor/github.com/containers/common/libnetwork/cni/run.go @@ -39,61 +39,71 @@ func (n *cniNetwork) Setup(namespacePath string, options types.SetupOptions) (ma return nil, fmt.Errorf("failed to set the loopback adapter up: %w", err) } - var retErr error - teardownOpts := options - teardownOpts.Networks = map[string]types.PerNetworkOptions{} - // make sure to teardown the already connected networks on error - defer func() { - if retErr != nil { - if len(teardownOpts.Networks) > 0 { - err := n.teardown(namespacePath, types.TeardownOptions(teardownOpts)) - if err != nil { - logrus.Warn(err) + results := make(map[string]types.StatusBlock, len(options.Networks)) + + setup := func() error { + var retErr error + teardownOpts := options + teardownOpts.Networks = map[string]types.PerNetworkOptions{} + // make sure to teardown the already connected networks on error + defer func() { + if retErr != nil { + if len(teardownOpts.Networks) > 0 { + err := n.teardown(namespacePath, types.TeardownOptions(teardownOpts)) + if err != nil { + logrus.Warn(err) + } } } + }() + + ports, err := convertSpecgenPortsToCNIPorts(options.PortMappings) + if err != nil { + return err } - }() - ports, err := convertSpecgenPortsToCNIPorts(options.PortMappings) - if err != nil { - return nil, err - } + for name, netOpts := range options.Networks { + netOpts := netOpts + network := n.networks[name] + rt := getRuntimeConfig(namespacePath, options.ContainerName, options.ContainerID, name, ports, &netOpts) - results := 
make(map[string]types.StatusBlock, len(options.Networks)) - for name, netOpts := range options.Networks { - netOpts := netOpts - network := n.networks[name] - rt := getRuntimeConfig(namespacePath, options.ContainerName, options.ContainerID, name, ports, &netOpts) - - // If we have more than one static ip we need parse the ips via runtime config, - // make sure to add the ips capability to the first plugin otherwise it doesn't get the ips - if len(netOpts.StaticIPs) > 0 && !network.cniNet.Plugins[0].Network.Capabilities["ips"] { - caps := make(map[string]interface{}) - caps["capabilities"] = map[string]bool{"ips": true} - network.cniNet.Plugins[0], retErr = libcni.InjectConf(network.cniNet.Plugins[0], caps) - if retErr != nil { - return nil, retErr + // If we have more than one static ip we need parse the ips via runtime config, + // make sure to add the ips capability to the first plugin otherwise it doesn't get the ips + if len(netOpts.StaticIPs) > 0 && !network.cniNet.Plugins[0].Network.Capabilities["ips"] { + caps := make(map[string]interface{}) + caps["capabilities"] = map[string]bool{"ips": true} + network.cniNet.Plugins[0], retErr = libcni.InjectConf(network.cniNet.Plugins[0], caps) + if retErr != nil { + return retErr + } } - } - var res cnitypes.Result - res, retErr = n.cniConf.AddNetworkList(context.Background(), network.cniNet, rt) - // Add this network to teardown opts since it is now connected. - // Also add this if an errors was returned since we want to call teardown on this regardless. - teardownOpts.Networks[name] = netOpts - if retErr != nil { - return nil, retErr - } + var res cnitypes.Result + res, retErr = n.cniConf.AddNetworkList(context.Background(), network.cniNet, rt) + // Add this network to teardown opts since it is now connected. + // Also add this if an errors was returned since we want to call teardown on this regardless. 
+ teardownOpts.Networks[name] = netOpts + if retErr != nil { + return retErr + } - logrus.Debugf("cni result for container %s network %s: %v", options.ContainerID, name, res) - var status types.StatusBlock - status, retErr = CNIResultToStatus(res) - if retErr != nil { - return nil, retErr + logrus.Debugf("cni result for container %s network %s: %v", options.ContainerID, name, res) + var status types.StatusBlock + status, retErr = CNIResultToStatus(res) + if retErr != nil { + return retErr + } + results[name] = status } - results[name] = status + return nil } - return results, nil + + if n.rootlessNetns != nil { + err = n.rootlessNetns.Setup(len(options.Networks), setup) + } else { + err = setup() + } + return results, err } // CNIResultToStatus convert the cni result to status block @@ -225,28 +235,39 @@ func (n *cniNetwork) teardown(namespacePath string, options types.TeardownOption } var multiErr *multierror.Error - for name, netOpts := range options.Networks { - netOpts := netOpts - rt := getRuntimeConfig(namespacePath, options.ContainerName, options.ContainerID, name, ports, &netOpts) + teardown := func() error { + for name, netOpts := range options.Networks { + netOpts := netOpts + rt := getRuntimeConfig(namespacePath, options.ContainerName, options.ContainerID, name, ports, &netOpts) - cniConfList, newRt, err := getCachedNetworkConfig(n.cniConf, name, rt) - if err == nil { - rt = newRt - } else { - logrus.Warnf("Failed to load cached network config: %v, falling back to loading network %s from disk", err, name) - network := n.networks[name] - if network == nil { - multiErr = multierror.Append(multiErr, fmt.Errorf("network %s: %w", name, types.ErrNoSuchNetwork)) - continue + cniConfList, newRt, err := getCachedNetworkConfig(n.cniConf, name, rt) + if err == nil { + rt = newRt + } else { + logrus.Warnf("Failed to load cached network config: %v, falling back to loading network %s from disk", err, name) + network := n.networks[name] + if network == nil { + multiErr 
= multierror.Append(multiErr, fmt.Errorf("network %s: %w", name, types.ErrNoSuchNetwork)) + continue + } + cniConfList = network.cniNet } - cniConfList = network.cniNet - } - err = n.cniConf.DelNetworkList(context.Background(), cniConfList, rt) - if err != nil { - multiErr = multierror.Append(multiErr, err) + err = n.cniConf.DelNetworkList(context.Background(), cniConfList, rt) + if err != nil { + multiErr = multierror.Append(multiErr, err) + } } + return nil } + + if n.rootlessNetns != nil { + err = n.rootlessNetns.Teardown(len(options.Networks), teardown) + } else { + err = teardown() + } + multiErr = multierror.Append(multiErr, err) + return multiErr.ErrorOrNil() } @@ -267,3 +288,10 @@ func getCachedNetworkConfig(cniConf *libcni.CNIConfig, name string, rt *libcni.R } return cniConfList, rt, nil } + +func (n *cniNetwork) RunInRootlessNetns(toRun func() error) error { + if n.rootlessNetns == nil { + return types.ErrNotRootlessNetns + } + return n.rootlessNetns.Run(n.lock, toRun) +} diff --git a/vendor/github.com/containers/common/libnetwork/internal/rootlessnetns/netns.go b/vendor/github.com/containers/common/libnetwork/internal/rootlessnetns/netns.go new file mode 100644 index 0000000000..edc29f66fe --- /dev/null +++ b/vendor/github.com/containers/common/libnetwork/internal/rootlessnetns/netns.go @@ -0,0 +1,8 @@ +package rootlessnetns + +type NetworkBackend int + +const ( + Netavark NetworkBackend = iota + CNI +) diff --git a/vendor/github.com/containers/common/libnetwork/internal/rootlessnetns/netns_freebsd.go b/vendor/github.com/containers/common/libnetwork/internal/rootlessnetns/netns_freebsd.go new file mode 100644 index 0000000000..a176d2d822 --- /dev/null +++ b/vendor/github.com/containers/common/libnetwork/internal/rootlessnetns/netns_freebsd.go @@ -0,0 +1,28 @@ +package rootlessnetns + +import ( + "errors" + + "github.com/containers/common/pkg/config" + "github.com/containers/storage/pkg/lockfile" +) + +var ErrNotSupported = errors.New("rootless netns 
only supported on linux") + +type Netns struct{} + +func New(dir string, backend NetworkBackend, conf *config.Config) (*Netns, error) { + return nil, ErrNotSupported +} + +func (n *Netns) Setup(nets int, toRun func() error) error { + return ErrNotSupported +} + +func (n *Netns) Teardown(nets int, toRun func() error) error { + return ErrNotSupported +} + +func (n *Netns) Run(lock *lockfile.LockFile, toRun func() error) error { + return ErrNotSupported +} diff --git a/vendor/github.com/containers/common/libnetwork/internal/rootlessnetns/netns_linux.go b/vendor/github.com/containers/common/libnetwork/internal/rootlessnetns/netns_linux.go new file mode 100644 index 0000000000..8fbb1f5900 --- /dev/null +++ b/vendor/github.com/containers/common/libnetwork/internal/rootlessnetns/netns_linux.go @@ -0,0 +1,545 @@ +package rootlessnetns + +import ( + "errors" + "fmt" + "io/fs" + "os" + "path/filepath" + "strconv" + "strings" + "syscall" + + "github.com/containernetworking/plugins/pkg/ns" + "github.com/containers/common/libnetwork/resolvconf" + "github.com/containers/common/libnetwork/slirp4netns" + "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/netns" + "github.com/containers/common/pkg/systemd" + "github.com/containers/storage/pkg/homedir" + "github.com/containers/storage/pkg/lockfile" + "github.com/hashicorp/go-multierror" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const ( + // rootlessNetnsDir is the directory name + rootlessNetnsDir = "rootless-netns" + // refCountFile file name for the ref count file + refCountFile = "ref-count" + + // rootlessNetNsSilrp4netnsPidFile is the name of the rootless netns slirp4netns pid file + rootlessNetNsSilrp4netnsPidFile = "rootless-netns-slirp4netns.pid" + + // persistentCNIDir is the directory where the CNI files are stored + persistentCNIDir = "/var/lib/cni" + + tmpfs = "tmpfs" 
+ none = "none" + resolvConfName = "resolv.conf" +) + +type Netns struct { + // dir used for the rootless netns + dir string + // backend used for the network setup/teardown + backend NetworkBackend + + // config contains containers.conf options. + config *config.Config +} + +type rootlessNetnsError struct { + msg string + err error +} + +func (e *rootlessNetnsError) Error() string { + msg := e.msg + ": " + return fmt.Sprintf("rootless netns: %s%v", msg, e.err) +} + +func (e *rootlessNetnsError) Unwrap() error { + return e.err +} + +// wrapError wraps the error with extra context +// It will always include "rootless netns:" so the msg should not mention it again, +// msg can be empty to just include the rootless netns part. +// err must be non nil. +func wrapError(msg string, err error) *rootlessNetnsError { + return &rootlessNetnsError{ + msg: msg, + err: err, + } +} + +func New(dir string, backend NetworkBackend, conf *config.Config) (*Netns, error) { + netnsDir := filepath.Join(dir, rootlessNetnsDir) + if err := os.MkdirAll(netnsDir, 0o700); err != nil { + return nil, wrapError("", err) + } + return &Netns{ + dir: netnsDir, + backend: backend, + config: conf, + }, nil +} + +// getPath is a small wrapper around filepath.Join() to have a bit less code +func (n *Netns) getPath(path string) string { + return filepath.Join(n.dir, path) +} + +// getOrCreateNetns returns the rootless netns, if it created a new one the +// returned bool is set to true. +func (n *Netns) getOrCreateNetns() (ns.NetNS, bool, error) { + nsPath := n.getPath(rootlessNetnsDir) + nsRef, err := ns.GetNS(nsPath) + if err == nil { + // TODO check if slirp4netns is alive + return nsRef, false, nil + } + logrus.Debugf("Creating rootless network namespace at %q", nsPath) + // We have to create the netns dir again here because it is possible + // that cleanup() removed it. 
+ if err := os.MkdirAll(n.dir, 0o700); err != nil { + return nil, false, wrapError("", err) + } + netns, err := netns.NewNSAtPath(nsPath) + if err != nil { + return nil, false, wrapError("create netns", err) + } + err = n.setupSlirp4netns(nsPath) + return netns, true, err +} + +func (n *Netns) cleanup() error { + if _, err := os.Stat(n.dir); err != nil { + if errors.Is(err, fs.ErrNotExist) { + // dir does not exists no need for cleanup + return nil + } + return err + } + + logrus.Debug("Cleaning up rootless network namespace") + + nsPath := n.getPath(rootlessNetnsDir) + var multiErr *multierror.Error + if err := netns.UnmountNS(nsPath); err != nil { + multiErr = multierror.Append(multiErr, err) + } + if err := n.cleanupSlirp4netns(); err != nil { + multiErr = multierror.Append(multiErr, wrapError("kill slirp4netns", err)) + } + if err := os.RemoveAll(n.dir); err != nil { + multiErr = multierror.Append(multiErr, wrapError("remove rootless netns dir", err)) + } + + return multiErr.ErrorOrNil() +} + +func (n *Netns) setupSlirp4netns(nsPath string) error { + res, err := slirp4netns.Setup(&slirp4netns.SetupOptions{ + Config: n.config, + ContainerID: "rootless-netns", + Netns: nsPath, + }) + if err != nil { + return wrapError("start slirp4netns", err) + } + // create pid file for the slirp4netns process + // this is need to kill the process in the cleanup + pid := strconv.Itoa(res.Pid) + err = os.WriteFile(n.getPath(rootlessNetNsSilrp4netnsPidFile), []byte(pid), 0o600) + if err != nil { + return wrapError("write slirp4netns pid file", err) + } + + if systemd.RunsOnSystemd() { + // move to systemd scope to prevent systemd from killing it + err = systemd.MoveRootlessNetnsSlirpProcessToUserSlice(res.Pid) + if err != nil { + // only log this, it is not fatal but can lead to issues when running podman inside systemd units + logrus.Errorf("failed to move the rootless netns slirp4netns process to the systemd user.slice: %v", err) + } + } + + // build a new resolv.conf file 
which uses the slirp4netns dns server address + resolveIP, err := slirp4netns.GetDNS(res.Subnet) + if err != nil { + return wrapError("determine default slirp4netns DNS address", err) + } + + if err := resolvconf.New(&resolvconf.Params{ + Path: n.getPath(resolvConfName), + // fake the netns since we want to filter localhost + Namespaces: []specs.LinuxNamespace{ + {Type: specs.NetworkNamespace}, + }, + IPv6Enabled: res.IPv6, + KeepHostServers: true, + Nameservers: []string{resolveIP.String()}, + }); err != nil { + return wrapError("create resolv.conf", err) + } + return nil +} + +func (n *Netns) cleanupSlirp4netns() error { + pidFile := n.getPath(rootlessNetNsSilrp4netnsPidFile) + b, err := os.ReadFile(pidFile) + if err == nil { + var i int + i, err = strconv.Atoi(string(b)) + if err == nil { + // kill the slirp process so we do not leak it + err = syscall.Kill(i, syscall.SIGTERM) + } + } + return err +} + +// mountAndMkdirDest convenience wrapper for mount and mkdir +func mountAndMkdirDest(source string, target string, fstype string, flags uintptr) error { + if err := os.MkdirAll(target, 0o700); err != nil { + return wrapError("create mount point", err) + } + if err := unix.Mount(source, target, fstype, flags, ""); err != nil { + return wrapError(fmt.Sprintf("mount %q to %q", source, target), err) + } + return nil +} + +func (n *Netns) setupMounts() error { + // Before we can run the given function, + // we have to set up all mounts correctly. + + // The order of the mounts is IMPORTANT. + // The idea of the extra mount ns is to make /run and /var/lib/cni writeable + // for the cni plugins but not affecting the podman user namespace. + // Because the plugins also need access to XDG_RUNTIME_DIR/netns some special setup is needed. + + // The following bind mounts are needed + // 1. XDG_RUNTIME_DIR -> XDG_RUNTIME_DIR/rootless-netns/XDG_RUNTIME_DIR + // 2. /run/systemd -> XDG_RUNTIME_DIR/rootless-netns/run/systemd (only if it exists) + // 3. 
XDG_RUNTIME_DIR/rootless-netns/resolv.conf -> /etc/resolv.conf or XDG_RUNTIME_DIR/rootless-netns/run/symlink/target + // 4. XDG_RUNTIME_DIR/rootless-netns/var/lib/cni -> /var/lib/cni (if /var/lib/cni does not exist, use the parent dir) + // 5. XDG_RUNTIME_DIR/rootless-netns/run -> /run + + // Create a new mount namespace, + // this must happen inside the netns thread. + err := unix.Unshare(unix.CLONE_NEWNS) + if err != nil { + return wrapError("create new mount namespace", err) + } + + xdgRuntimeDir, err := homedir.GetRuntimeDir() + if err != nil { + return fmt.Errorf("could not get runtime directory: %w", err) + } + newXDGRuntimeDir := n.getPath(xdgRuntimeDir) + // 1. Mount the netns into the new run to keep them accessible. + // Otherwise cni setup will fail because it cannot access the netns files. + err = mountAndMkdirDest(xdgRuntimeDir, newXDGRuntimeDir, none, unix.MS_BIND|unix.MS_SHARED|unix.MS_REC) + if err != nil { + return err + } + + // 2. Also keep /run/systemd if it exists. + // Many files are symlinked into this dir, for example /dev/log. + runSystemd := "/run/systemd" + _, err = os.Stat(runSystemd) + if err == nil { + newRunSystemd := n.getPath(runSystemd) + err = mountAndMkdirDest(runSystemd, newRunSystemd, none, unix.MS_BIND|unix.MS_REC) + if err != nil { + return err + } + } + + // 3. On some distros /etc/resolv.conf is symlinked to somewhere under /run. + // Because the kernel will follow the symlink before mounting, it is not + // possible to mount a file at /etc/resolv.conf. We have to ensure that + // the link target will be available in the mount ns. + // see: https://github.com/containers/podman/issues/10855 + resolvePath := resolvconf.DefaultResolvConf + linkCount := 0 + for i := 1; i < len(resolvePath); i++ { + // Do not use filepath.EvalSymlinks, we only want the first symlink under /run. + // If /etc/resolv.conf has more than one symlink under /run, e.g. 
+ // -> /run/systemd/resolve/stub-resolv.conf -> /run/systemd/resolve/resolv.conf + // we would put the netns resolv.conf file to the last path. However this will + // break dns because the second link does not exist in the mount ns. + // see https://github.com/containers/podman/issues/11222 + // + // We also need to resolve all path components not just the last file. + // see https://github.com/containers/podman/issues/12461 + + if resolvePath[i] != '/' { + // if we are at the last char we need to inc i by one because there is no final slash + if i == len(resolvePath)-1 { + i++ + } else { + // not the end of path, keep going + continue + } + } + path := resolvePath[:i] + + fi, err := os.Lstat(path) + if err != nil { + return fmt.Errorf("failed to stat resolv.conf path: %w", err) + } + + // no link, just continue + if fi.Mode()&os.ModeSymlink == 0 { + continue + } + + link, err := os.Readlink(path) + if err != nil { + return fmt.Errorf("failed to read resolv.conf symlink: %w", err) + } + linkCount++ + if filepath.IsAbs(link) { + // link is as an absolute path + resolvePath = filepath.Join(link, resolvePath[i:]) + } else { + // link is as a relative, join it with the previous path + base := filepath.Dir(path) + resolvePath = filepath.Join(base, link, resolvePath[i:]) + } + // set i back to zero since we now have a new base path + i = 0 + + // we have to stop at the first path under /run because we will have an empty /run and will create the path anyway + // if we would continue we would need to recreate all links under /run + if strings.HasPrefix(resolvePath, "/run/") { + break + } + // make sure wo do not loop forever + if linkCount == 255 { + return errors.New("too many symlinks while resolving /etc/resolv.conf") + } + } + logrus.Debugf("The path of /etc/resolv.conf in the mount ns is %q", resolvePath) + // When /etc/resolv.conf on the host is a symlink to /run/systemd/resolve/stub-resolv.conf, + // we have to mount an empty filesystem on /run/systemd/resolve in 
the child namespace, + // so as to isolate the directory from the host mount namespace. + // + // Otherwise our bind-mount for /run/systemd/resolve/stub-resolv.conf is unmounted + // when systemd-resolved unlinks and recreates /run/systemd/resolve/stub-resolv.conf on the host. + // see: https://github.com/containers/podman/issues/10929 + if strings.HasPrefix(resolvePath, "/run/systemd/resolve/") { + rsr := n.getPath("/run/systemd/resolve") + err = mountAndMkdirDest("", rsr, tmpfs, unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV) + if err != nil { + return err + } + } + if strings.HasPrefix(resolvePath, "/run/") { + resolvePath = n.getPath(resolvePath) + err = os.MkdirAll(filepath.Dir(resolvePath), 0o700) + if err != nil { + return wrapError("create resolv.conf directory", err) + } + // we want to bind mount on this file so we have to create the file first + _, err = os.OpenFile(resolvePath, os.O_CREATE|os.O_RDONLY, 0o600) + if err != nil { + return wrapError("create resolv.conf file: %w", err) + } + } + // mount resolv.conf to make use of the host dns + err = unix.Mount(n.getPath(resolvConfName), resolvePath, none, unix.MS_BIND, "") + if err != nil { + return wrapError(fmt.Sprintf("mount resolv.conf to %q", resolvePath), err) + } + + // 4. CNI plugins need access to /var/lib/cni + if n.backend == CNI { + if err := n.mountCNIVarDir(); err != nil { + return err + } + } + + // 5. Mount the new prepared run dir to /run, it has to be recursive to keep the other bind mounts. 
+ runDir := n.getPath("run") + // relabel the new run directory to the iptables /run label + // this is important, otherwise the iptables command will fail + err = label.Relabel(runDir, "system_u:object_r:iptables_var_run_t:s0", false) + if err != nil { + if !errors.Is(err, unix.ENOTSUP) { + return wrapError("relabel iptables_var_run_t", err) + } + logrus.Debugf("Labeling not supported on %q", runDir) + } + err = mountAndMkdirDest(runDir, "/run", none, unix.MS_BIND|unix.MS_REC) + if err != nil { + return err + } + return nil +} + +func (n *Netns) mountCNIVarDir() error { + varDir := "" + varTarget := persistentCNIDir + // we can only mount to a target dir which exists, check /var/lib/cni recursively + // while we could always use /var there are cases where a user might store the cni + // configs under /var/custom and this would break + for { + if _, err := os.Stat(varTarget); err == nil { + varDir = n.getPath(varTarget) + break + } + varTarget = filepath.Dir(varTarget) + if varTarget == "/" { + break + } + } + if varDir == "" { + return errors.New("failed to stat /var directory") + } + if err := os.MkdirAll(varDir, 0o700); err != nil { + return wrapError("create var dir", err) + } + // make sure to mount var first + err := unix.Mount(varDir, varTarget, none, unix.MS_BIND, "") + if err != nil { + return wrapError(fmt.Sprintf("mount %q to %q", varDir, varTarget), err) + } + return nil +} + +func (n *Netns) runInner(toRun func() error) (err error) { + nsRef, newNs, err := n.getOrCreateNetns() + if err != nil { + return err + } + defer nsRef.Close() + // If a new netns was created make sure to clean it up again on an error to not leak it. 
+ if newNs { + defer func() { + if err != nil { + if err := n.cleanup(); err != nil { + logrus.Errorf("Rootless netns cleanup error after failed setup: %v", err) + } + } + }() + } + + return nsRef.Do(func(_ ns.NetNS) error { + if err := n.setupMounts(); err != nil { + return err + } + return toRun() + }) +} + +func (n *Netns) Setup(nets int, toRun func() error) error { + err := n.runInner(toRun) + if err != nil { + return err + } + _, err = refCount(n.dir, nets) + return err +} + +func (n *Netns) Teardown(nets int, toRun func() error) error { + var multiErr *multierror.Error + count, countErr := refCount(n.dir, -nets) + if countErr != nil { + multiErr = multierror.Append(multiErr, countErr) + } + err := n.runInner(toRun) + if err != nil { + multiErr = multierror.Append(multiErr, err) + } + + // only cleanup if the ref count did not throw an error + if count == 0 && countErr == nil { + err = n.cleanup() + if err != nil { + multiErr = multierror.Append(multiErr, wrapError("cleanup", err)) + } + } + + return multiErr.ErrorOrNil() +} + +// Run any long running function in the userns. +// We need to ensure that during setup/cleanup we are locked to avoid races. +// However because the given function could be running a long time we must +// unlock in between, i.e. this is used by podman unshare --rootless-nets +// and we do not want to keep it locked for the lifetime of the given command. 
+func (n *Netns) Run(lock *lockfile.LockFile, toRun func() error) error { + lock.Lock() + defer lock.Unlock() + _, err := refCount(n.dir, 1) + if err != nil { + return err + } + inner := func() error { + lock.Unlock() + err = toRun() + lock.Lock() + return err + } + + inErr := n.runInner(inner) + // make sure to always reset the ref counter afterwards + count, err := refCount(n.dir, -1) + if err != nil { + if inErr == nil { + return err + } + logrus.Errorf("Failed to decrement ref count: %v", err) + return inErr + } + if count == 0 { + err = n.cleanup() + if err != nil { + err = wrapError("cleanup", err) + if inErr == nil { + return err + } + logrus.Errorf("Failed to cleanup rootless netns: %v", err) + return inErr + } + } + + return inErr +} + +func refCount(dir string, inc int) (int, error) { + file := filepath.Join(dir, refCountFile) + content, err := os.ReadFile(file) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + return -1, wrapError("read ref counter", err) + } + + currentCount := 0 + if len(content) > 0 { + currentCount, err = strconv.Atoi(string(content)) + if err != nil { + return -1, wrapError("parse ref counter", err) + } + } + + currentCount += inc + if currentCount < 0 { + logrus.Errorf("rootless netns ref counter out of sync, counter is at %d, resetting it back to 0", currentCount) + currentCount = 0 + } + + newNum := strconv.Itoa(currentCount) + if err = os.WriteFile(file, []byte(newNum), 0o600); err != nil { + return -1, wrapError("write ref counter", err) + } + + return currentCount, nil +} diff --git a/vendor/github.com/containers/common/libnetwork/netavark/exec.go b/vendor/github.com/containers/common/libnetwork/netavark/exec.go index f2c82359ad..e3f9047667 100644 --- a/vendor/github.com/containers/common/libnetwork/netavark/exec.go +++ b/vendor/github.com/containers/common/libnetwork/netavark/exec.go @@ -10,6 +10,7 @@ import ( "os" "os/exec" "strconv" + "strings" "github.com/sirupsen/logrus" ) @@ -79,6 +80,15 @@ func getRustLogEnv() 
string { func (n *netavarkNetwork) execNetavark(args []string, needPlugin bool, stdin, result interface{}) error { // set the netavark log level to the same as the podman env := append(os.Environ(), getRustLogEnv()) + // Netavark need access to iptables in $PATH. As it turns out debian doesn't put + // /usr/sbin in $PATH for rootless users. This will break rootless networking completely. + // We might break existing users and we cannot expect everyone to change their $PATH so + // let's add /usr/sbin to $PATH ourselves. + path := os.Getenv("PATH") + if !strings.Contains(path, "/usr/sbin") { + path += ":/usr/sbin" + env = append(env, "PATH="+path) + } // if we run with debug log level lets also set RUST_BACKTRACE=1 so we can get the full stack trace in case of panics if logrus.IsLevelEnabled(logrus.DebugLevel) { env = append(env, "RUST_BACKTRACE=1") diff --git a/vendor/github.com/containers/common/libnetwork/netavark/network.go b/vendor/github.com/containers/common/libnetwork/netavark/network.go index 5921167491..aad3cc7bd4 100644 --- a/vendor/github.com/containers/common/libnetwork/netavark/network.go +++ b/vendor/github.com/containers/common/libnetwork/netavark/network.go @@ -12,6 +12,7 @@ import ( "strings" "time" + "github.com/containers/common/libnetwork/internal/rootlessnetns" "github.com/containers/common/libnetwork/internal/util" "github.com/containers/common/libnetwork/types" "github.com/containers/common/pkg/config" @@ -68,6 +69,9 @@ type netavarkNetwork struct { // networks is a map with loaded networks, the key is the network name networks map[string]*types.Network + + // rootlessNetns is used for the rootless network setup/teardown + rootlessNetns *rootlessnetns.Netns } type InitConfig struct { @@ -82,26 +86,12 @@ type InitConfig struct { // NetworkRunDir is where temporary files are stored, i.e.the ipam db, aardvark config NetworkRunDir string - // FirewallDriver sets the firewall driver to use - FirewallDriver string - - // DefaultNetwork is the name 
for the default network. - DefaultNetwork string - // DefaultSubnet is the default subnet for the default network. - DefaultSubnet string - - // DefaultsubnetPools contains the subnets which must be used to allocate a free subnet by network create - DefaultsubnetPools []config.SubnetPool - - // DNSBindPort is set the port to pass to netavark for aardvark - DNSBindPort uint16 - - // PluginDirs list of directories were netavark plugins are located - PluginDirs []string - // Syslog describes whenever the netavark debug output should be log to the syslog as well. // This will use logrus to do so, make sure logrus is set up to log to the syslog. Syslog bool + + // Config containers.conf options + Config *config.Config } // NewNetworkInterface creates the ContainerNetwork interface for the netavark backend. @@ -118,12 +108,12 @@ func NewNetworkInterface(conf *InitConfig) (types.ContainerNetwork, error) { return nil, err } - defaultNetworkName := conf.DefaultNetwork + defaultNetworkName := conf.Config.Network.DefaultNetwork if defaultNetworkName == "" { defaultNetworkName = types.DefaultNetworkName } - defaultSubnet := conf.DefaultSubnet + defaultSubnet := conf.Config.Network.DefaultSubnet if defaultSubnet == "" { defaultSubnet = types.DefaultSubnet } @@ -140,11 +130,19 @@ func NewNetworkInterface(conf *InitConfig) (types.ContainerNetwork, error) { return nil, err } - defaultSubnetPools := conf.DefaultsubnetPools + defaultSubnetPools := conf.Config.Network.DefaultSubnetPools if defaultSubnetPools == nil { defaultSubnetPools = config.DefaultSubnetPools } + var netns *rootlessnetns.Netns + if unshare.IsRootless() { + netns, err = rootlessnetns.New(conf.NetworkRunDir, rootlessnetns.Netavark, conf.Config) + if err != nil { + return nil, err + } + } + n := &netavarkNetwork{ networkConfigDir: conf.NetworkConfigDir, networkRunDir: conf.NetworkRunDir, @@ -152,14 +150,15 @@ func NewNetworkInterface(conf *InitConfig) (types.ContainerNetwork, error) { aardvarkBinary: 
conf.AardvarkBinary, networkRootless: unshare.IsRootless(), ipamDBPath: filepath.Join(conf.NetworkRunDir, "ipam.db"), - firewallDriver: conf.FirewallDriver, + firewallDriver: conf.Config.Network.FirewallDriver, defaultNetwork: defaultNetworkName, defaultSubnet: defaultNet, defaultsubnetPools: defaultSubnetPools, - dnsBindPort: conf.DNSBindPort, - pluginDirs: conf.PluginDirs, + dnsBindPort: conf.Config.Network.DNSBindPort, + pluginDirs: conf.Config.Network.NetavarkPluginDirs.Get(), lock: lock, syslog: conf.Syslog, + rootlessNetns: netns, } return n, nil diff --git a/vendor/github.com/containers/common/libnetwork/netavark/run.go b/vendor/github.com/containers/common/libnetwork/netavark/run.go index 3df5ced052..42c76690cb 100644 --- a/vendor/github.com/containers/common/libnetwork/netavark/run.go +++ b/vendor/github.com/containers/common/libnetwork/netavark/run.go @@ -72,12 +72,24 @@ func (n *netavarkNetwork) Setup(namespacePath string, options types.SetupOptions } result := map[string]types.StatusBlock{} - err = n.execNetavark([]string{"setup", namespacePath}, needPlugin, netavarkOpts, &result) - if err != nil { - // lets dealloc ips to prevent leaking - if err := n.deallocIPs(&options.NetworkOptions); err != nil { - logrus.Error(err) + setup := func() error { + err := n.execNetavark([]string{"setup", namespacePath}, needPlugin, netavarkOpts, &result) + if err != nil { + // lets dealloc ips to prevent leaking + if err := n.deallocIPs(&options.NetworkOptions); err != nil { + logrus.Error(err) + } + return err } + return nil + } + + if n.rootlessNetns != nil { + err = n.rootlessNetns.Setup(len(options.Networks), setup) + } else { + err = setup() + } + if err != nil { return nil, err } @@ -112,7 +124,16 @@ func (n *netavarkNetwork) Teardown(namespacePath string, options types.TeardownO return fmt.Errorf("failed to convert net opts: %w", err) } - retErr := n.execNetavark([]string{"teardown", namespacePath}, needPlugin, netavarkOpts, nil) + var retErr error + teardown := 
func() error { + return n.execNetavark([]string{"teardown", namespacePath}, needPlugin, netavarkOpts, nil) + } + + if n.rootlessNetns != nil { + retErr = n.rootlessNetns.Teardown(len(options.Networks), teardown) + } else { + retErr = teardown() + } // when netavark returned an error we still free the used ips // otherwise we could end up in a state where block the ips forever @@ -160,3 +181,10 @@ func (n *netavarkNetwork) convertNetOpts(opts types.NetworkOptions) (*netavarkOp } return &netavarkOptions, needsPlugin, nil } + +func (n *netavarkNetwork) RunInRootlessNetns(toRun func() error) error { + if n.rootlessNetns == nil { + return types.ErrNotRootlessNetns + } + return n.rootlessNetns.Run(n.lock, toRun) +} diff --git a/vendor/github.com/containers/common/libnetwork/network/interface.go b/vendor/github.com/containers/common/libnetwork/network/interface.go index b3a5f2aec3..4a8290ba74 100644 --- a/vendor/github.com/containers/common/libnetwork/network/interface.go +++ b/vendor/github.com/containers/common/libnetwork/network/interface.go @@ -77,17 +77,12 @@ func NetworkBackend(store storage.Store, conf *config.Config, syslog bool) (type } netInt, err := netavark.NewNetworkInterface(&netavark.InitConfig{ - NetworkConfigDir: confDir, - NetworkRunDir: runDir, - NetavarkBinary: netavarkBin, - AardvarkBinary: aardvarkBin, - PluginDirs: conf.Network.NetavarkPluginDirs.Get(), - FirewallDriver: conf.Network.FirewallDriver, - DefaultNetwork: conf.Network.DefaultNetwork, - DefaultSubnet: conf.Network.DefaultSubnet, - DefaultsubnetPools: conf.Network.DefaultSubnetPools, - DNSBindPort: conf.Network.DNSBindPort, - Syslog: syslog, + Config: conf, + NetworkConfigDir: confDir, + NetworkRunDir: runDir, + NetavarkBinary: netavarkBin, + AardvarkBinary: aardvarkBin, + Syslog: syslog, }) return types.Netavark, netInt, err case types.CNI: @@ -181,13 +176,10 @@ func getCniInterface(conf *config.Config) (types.ContainerNetwork, error) { } } return 
cni.NewCNINetworkInterface(&cni.InitConfig{ - CNIConfigDir: confDir, - CNIPluginDirs: conf.Network.CNIPluginDirs.Get(), - RunDir: conf.Engine.TmpDir, - DefaultNetwork: conf.Network.DefaultNetwork, - DefaultSubnet: conf.Network.DefaultSubnet, - DefaultsubnetPools: conf.Network.DefaultSubnetPools, - IsMachine: machine.IsGvProxyBased(), + Config: conf, + CNIConfigDir: confDir, + RunDir: conf.Engine.TmpDir, + IsMachine: machine.IsGvProxyBased(), }) } diff --git a/vendor/github.com/containers/common/libnetwork/types/define.go b/vendor/github.com/containers/common/libnetwork/types/define.go index 6e91ccda96..193377b1a2 100644 --- a/vendor/github.com/containers/common/libnetwork/types/define.go +++ b/vendor/github.com/containers/common/libnetwork/types/define.go @@ -18,6 +18,9 @@ var ( // exists. ErrNetworkExists = errors.New("network already exists") + // ErrNotRootlessNetns indicates the rootless netns can only be used as rootless, not as root + ErrNotRootlessNetns = errors.New("rootless netns cannot be used as root") + // NameRegex is a regular expression to validate names. // This must NOT be changed. NameRegex = regexp.Delayed("^[a-zA-Z0-9][a-zA-Z0-9_.-]*$") diff --git a/vendor/github.com/containers/common/libnetwork/types/network.go b/vendor/github.com/containers/common/libnetwork/types/network.go index 94087fd375..9e30975cb0 100644 --- a/vendor/github.com/containers/common/libnetwork/types/network.go +++ b/vendor/github.com/containers/common/libnetwork/types/network.go @@ -27,6 +27,10 @@ type ContainerNetwork interface { // Teardown will teardown the container network namespace. Teardown(namespacePath string, options TeardownOptions) error + // RunInRootlessNetns is used to run the given function in the rootless netns. + // Only used as rootless and should return an error as root. + RunInRootlessNetns(toRun func() error) error + // Drivers will return the list of supported network drivers // for this interface.
// getCgroupProcess parses procFile, which must consist of lines in the
// "<id>:<controllers>:<path>" form, and returns a cgroup path from it.
// A line starting with "0::" wins immediately and its remainder is returned,
// otherwise the longest path seen in any line is used.
// An error is returned when the file cannot be opened or read, when a line does
// not have three ":" separated fields, when no path was found, or when the
// path is "/" and allowRoot is false.
func getCgroupProcess(procFile string, allowRoot bool) (string, error) {
	f, err := os.Open(procFile)
	if err != nil {
		return "", err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	cgroup := ""
	for scanner.Scan() {
		line := scanner.Text()
		parts := strings.SplitN(line, ":", 3)
		if len(parts) != 3 {
			return "", fmt.Errorf("cannot parse cgroup line %q", line)
		}
		if strings.HasPrefix(line, "0::") {
			cgroup = line[3:]
			break
		}
		if len(parts[2]) > len(cgroup) {
			cgroup = parts[2]
		}
	}
	// Scan() also returns false on a read error or an over-long line, do not
	// mistake a truncated read for the complete file.
	if err := scanner.Err(); err != nil {
		return "", fmt.Errorf("reading %q: %w", procFile, err)
	}
	if len(cgroup) == 0 || (!allowRoot && cgroup == "/") {
		return "", fmt.Errorf("could not find cgroup mount in %q", procFile)
	}
	return cgroup, nil
}
+func GetCgroupProcess(pid int) (string, error) { + return getCgroupProcess(fmt.Sprintf("/proc/%d/cgroup", pid), true) +} + +// MoveUnderCgroupSubtree moves the PID under a cgroup subtree. +func MoveUnderCgroupSubtree(subtree string) error { + return MoveUnderCgroup("", subtree, nil) +} + +// MoveUnderCgroup moves a group of processes to a new cgroup. +// If cgroup is the empty string, then the current calling process cgroup is used. +// If processes is empty, then the processes from the current cgroup are moved. +func MoveUnderCgroup(cgroup, subtree string, processes []uint32) error { + procFile := "/proc/self/cgroup" + f, err := os.Open(procFile) + if err != nil { + return err + } + defer f.Close() + + unifiedMode, err := IsCgroup2UnifiedMode() + if err != nil { + return err + } + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.SplitN(line, ":", 3) + if len(parts) != 3 { + return fmt.Errorf("cannot parse cgroup line %q", line) + } + + // root cgroup, skip it + if parts[2] == "/" && !(unifiedMode && parts[1] == "") { + continue + } + + cgroupRoot := "/sys/fs/cgroup" + // Special case the unified mount on hybrid cgroup and named hierarchies. + // This works on Fedora 31, but we should really parse the mounts to see + // where the cgroup hierarchy is mounted. + if parts[1] == "" && !unifiedMode { + // If it is not using unified mode, the cgroup v2 hierarchy is + // usually mounted under /sys/fs/cgroup/unified + cgroupRoot = filepath.Join(cgroupRoot, "unified") + + // Ignore the unified mount if it doesn't exist + if _, err := os.Stat(cgroupRoot); err != nil && os.IsNotExist(err) { + continue + } + } else if parts[1] != "" { + // Assume the controller is mounted at /sys/fs/cgroup/$CONTROLLER. 
+ controller := strings.TrimPrefix(parts[1], "name=") + cgroupRoot = filepath.Join(cgroupRoot, controller) + } + + parentCgroup := cgroup + if parentCgroup == "" { + parentCgroup = parts[2] + } + newCgroup := filepath.Join(cgroupRoot, parentCgroup, subtree) + if err := os.MkdirAll(newCgroup, 0o755); err != nil && !os.IsExist(err) { + return err + } + + f, err := os.OpenFile(filepath.Join(newCgroup, "cgroup.procs"), os.O_RDWR, 0o755) + if err != nil { + return err + } + defer f.Close() + + if len(processes) > 0 { + for _, pid := range processes { + if _, err := f.WriteString(fmt.Sprintf("%d\n", pid)); err != nil { + logrus.Debugf("Cannot move process %d to cgroup %q: %v", pid, newCgroup, err) + } + } + } else { + processesData, err := os.ReadFile(filepath.Join(cgroupRoot, parts[2], "cgroup.procs")) + if err != nil { + return err + } + for _, pid := range bytes.Split(processesData, []byte("\n")) { + if len(pid) == 0 { + continue + } + if _, err := f.Write(pid); err != nil { + logrus.Debugf("Cannot move process %s to cgroup %q: %v", string(pid), newCgroup, err) + } + } + } + } + return nil +} + +var ( + maybeMoveToSubCgroupSync sync.Once + maybeMoveToSubCgroupSyncErr error +) + +// MaybeMoveToSubCgroup moves the current process in a sub cgroup when +// it is running in the root cgroup on a system that uses cgroupv2. 
+func MaybeMoveToSubCgroup() error { + maybeMoveToSubCgroupSync.Do(func() { + unifiedMode, err := IsCgroup2UnifiedMode() + if err != nil { + maybeMoveToSubCgroupSyncErr = err + return + } + if !unifiedMode { + maybeMoveToSubCgroupSyncErr = nil + return + } + cgroup, err := GetOwnCgroup() + if err != nil { + maybeMoveToSubCgroupSyncErr = err + return + } + if cgroup == "/" { + maybeMoveToSubCgroupSyncErr = MoveUnderCgroupSubtree("init") + } + }) + return maybeMoveToSubCgroupSyncErr +} diff --git a/vendor/github.com/containers/common/pkg/netns/netns_linux.go b/vendor/github.com/containers/common/pkg/netns/netns_linux.go index 9f0336bc0f..5b5e0daebb 100644 --- a/vendor/github.com/containers/common/pkg/netns/netns_linux.go +++ b/vendor/github.com/containers/common/pkg/netns/netns_linux.go @@ -32,10 +32,12 @@ import ( "github.com/containernetworking/plugins/pkg/ns" "github.com/containers/storage/pkg/homedir" "github.com/containers/storage/pkg/unshare" - "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) +// threadNsPath is the /proc path to the current netns handle for the current thread +const threadNsPath = "/proc/thread-self/ns/net" + // GetNSRunDir returns the dir of where to create the netNS. When running // rootless, it needs to be at a location writable by user. func GetNSRunDir() (string, error) { @@ -49,6 +51,10 @@ func GetNSRunDir() (string, error) { return "/run/netns", nil } +func NewNSAtPath(nsPath string) (ns.NetNS, error) { + return newNSPath(nsPath) +} + // NewNS creates a new persistent (bind-mounted) network namespace and returns // an object representing that namespace, without switching to it. 
func NewNS() (ns.NetNS, error) { @@ -111,8 +117,12 @@ func NewNSWithName(name string) (ns.NetNS, error) { } } - // create an empty file at the mount point nsPath := path.Join(nsRunDir, name) + return newNSPath(nsPath) +} + +func newNSPath(nsPath string) (ns.NetNS, error) { + // create an empty file at the mount point mountPointFd, err := os.OpenFile(nsPath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0o600) if err != nil { return nil, err @@ -140,24 +150,10 @@ func NewNSWithName(name string) (ns.NetNS, error) { // Don't unlock. By not unlocking, golang will kill the OS thread when the // goroutine is done (for go1.10+) - threadNsPath := getCurrentThreadNetNSPath() - - var origNS ns.NetNS - origNS, err = ns.GetNS(threadNsPath) - if err != nil { - logrus.Warnf("Cannot open current network namespace %s: %q", threadNsPath, err) - return - } - defer func() { - if err := origNS.Close(); err != nil { - logrus.Errorf("Unable to close namespace: %q", err) - } - }() - // create a new netns on the current thread err = unix.Unshare(unix.CLONE_NEWNET) if err != nil { - logrus.Warnf("Cannot create a new network namespace: %q", err) + err = fmt.Errorf("unshare network namespace: %w", err) return } @@ -181,13 +177,8 @@ func NewNSWithName(name string) (ns.NetNS, error) { // UnmountNS unmounts the given netns path func UnmountNS(nsPath string) error { - nsRunDir, err := GetNSRunDir() - if err != nil { - return err - } - // Only unmount if it's been bind-mounted (don't touch namespaces in /proc...) - if strings.HasPrefix(nsPath, nsRunDir) { + if !strings.HasPrefix(nsPath, "/proc/") { if err := unix.Unmount(nsPath, unix.MNT_DETACH); err != nil { return fmt.Errorf("failed to unmount NS: at %s: %v", nsPath, err) } @@ -199,11 +190,3 @@ func UnmountNS(nsPath string) error { return nil } - -// getCurrentThreadNetNSPath copied from pkg/ns -func getCurrentThreadNetNSPath() string { - // /proc/self/ns/net returns the namespace of the main thread, not - // of whatever thread this goroutine is running on. 
Make sure we - // use the thread's net namespace since the thread is switching around - return fmt.Sprintf("/proc/%d/task/%d/ns/net", os.Getpid(), unix.Gettid()) -} diff --git a/vendor/github.com/containers/common/pkg/systemd/systemd_linux.go b/vendor/github.com/containers/common/pkg/systemd/systemd_linux.go new file mode 100644 index 0000000000..02503618f4 --- /dev/null +++ b/vendor/github.com/containers/common/pkg/systemd/systemd_linux.go @@ -0,0 +1,151 @@ +package systemd + +import ( + "context" + "crypto/rand" + "fmt" + "os" + "strconv" + "sync" + + "github.com/containers/common/pkg/cgroups" + "github.com/containers/storage/pkg/unshare" + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/godbus/dbus/v5" + "github.com/sirupsen/logrus" +) + +var ( + runsOnSystemdOnce sync.Once + runsOnSystemd bool +) + +// RunsOnSystemd returns whether the system is using systemd +func RunsOnSystemd() bool { + runsOnSystemdOnce.Do(func() { + // per sd_booted(3), check for this dir + fd, err := os.Stat("/run/systemd/system") + runsOnSystemd = err == nil && fd.IsDir() + }) + return runsOnSystemd +} + +func moveProcessPIDFileToScope(pidPath, slice, scope string) error { + data, err := os.ReadFile(pidPath) + if err != nil { + // do not raise an error if the file doesn't exist + if os.IsNotExist(err) { + return nil + } + return fmt.Errorf("cannot read pid file: %w", err) + } + pid, err := strconv.ParseUint(string(data), 10, 0) + if err != nil { + return fmt.Errorf("cannot parse pid file %s: %w", pidPath, err) + } + + return moveProcessToScope(int(pid), slice, scope) +} + +func moveProcessToScope(pid int, slice, scope string) error { + err := RunUnderSystemdScope(pid, slice, scope) + // If the PID is not valid anymore, do not return an error. 
+ if dbusErr, ok := err.(dbus.Error); ok { + if dbusErr.Name == "org.freedesktop.DBus.Error.UnixProcessIdUnknown" { + return nil + } + } + return err +} + +// MoveRootlessNetnsSlirpProcessToUserSlice moves the slirp4netns process for the rootless netns +// into a different scope so that systemd does not kill it with a container. +func MoveRootlessNetnsSlirpProcessToUserSlice(pid int) error { + randBytes := make([]byte, 4) + _, err := rand.Read(randBytes) + if err != nil { + return err + } + return moveProcessToScope(pid, "user.slice", fmt.Sprintf("rootless-netns-%x.scope", randBytes)) +} + +// MovePauseProcessToScope moves the pause process used for rootless mode to keep the namespaces alive to +// a separate scope. Up to 10 attempts are made, each with a new random scope name; failures are only logged. +func MovePauseProcessToScope(pausePidPath string) { + var err error + + for i := 0; i < 10; i++ { + randBytes := make([]byte, 4) + _, err = rand.Read(randBytes) + if err != nil { + logrus.Errorf("failed to read random bytes: %v", err) + continue + } + err = moveProcessPIDFileToScope(pausePidPath, "user.slice", fmt.Sprintf("podman-pause-%x.scope", randBytes)) + if err == nil { + return + } + } + + if err != nil { + unified, err2 := cgroups.IsCgroup2UnifiedMode() + if err2 != nil { + logrus.Warnf("Failed to detect if running with cgroup unified: %v", err2) + } + if RunsOnSystemd() && unified { + logrus.Warnf("Failed to add pause process to systemd sandbox cgroup: %v", err) + } else { + logrus.Debugf("Failed to add pause process to systemd sandbox cgroup: %v", err) + } + } +} + +// RunUnderSystemdScope adds the specified pid to a systemd scope +func RunUnderSystemdScope(pid int, slice string, unitName string) error { + var properties []systemdDbus.Property + var conn *systemdDbus.Conn + var err error + + if unshare.GetRootlessUID() != 0 { + conn, err = cgroups.UserConnection(unshare.GetRootlessUID()) + if err != nil { + return err + } + } else { + conn, err = systemdDbus.NewWithContext(context.Background()) + if err != nil { + return err + } + } + defer
conn.Close() + properties = append(properties, systemdDbus.PropSlice(slice)) + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + properties = append(properties, newProp("Delegate", true)) + properties = append(properties, newProp("DefaultDependencies", false)) + ch := make(chan string) + _, err = conn.StartTransientUnitContext(context.Background(), unitName, "replace", properties, ch) + if err != nil { + // On errors check if the cgroup already exists, if it does move the process there + if props, err := conn.GetUnitTypePropertiesContext(context.Background(), unitName, "Scope"); err == nil { + if cgroup, ok := props["ControlGroup"].(string); ok && cgroup != "" { + if err := cgroups.MoveUnderCgroup(cgroup, "", []uint32{uint32(pid)}); err == nil { + return nil + } + // On errors return the original error message we got from StartTransientUnit. + } + } + return err + } + + // Block until job is started + <-ch + + return nil +} + +func newProp(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} diff --git a/vendor/github.com/containers/common/pkg/systemd/systemd_unsupported.go b/vendor/github.com/containers/common/pkg/systemd/systemd_unsupported.go new file mode 100644 index 0000000000..e4a6285279 --- /dev/null +++ b/vendor/github.com/containers/common/pkg/systemd/systemd_unsupported.go @@ -0,0 +1,15 @@ +//go:build !linux + +package systemd + +import "errors" + +func RunsOnSystemd() bool { + return false +} + +func MovePauseProcessToScope(pausePidPath string) {} + +func RunUnderSystemdScope(pid int, slice string, unitName string) error { + return errors.New("RunUnderSystemdScope not supported on this OS") +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 615eda3880..1f61cef73d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -167,7 +167,7 @@ github.com/containers/buildah/pkg/sshagent github.com/containers/buildah/pkg/util 
github.com/containers/buildah/pkg/volumes github.com/containers/buildah/util -# github.com/containers/common v0.57.1-0.20231130092720-630c929caef9 +# github.com/containers/common v0.57.1-0.20231206135104-b647eb3a5eea ## explicit; go 1.18 github.com/containers/common/internal/attributedstring github.com/containers/common/libimage @@ -177,6 +177,7 @@ github.com/containers/common/libimage/manifests github.com/containers/common/libimage/platform github.com/containers/common/libnetwork/cni github.com/containers/common/libnetwork/etchosts +github.com/containers/common/libnetwork/internal/rootlessnetns github.com/containers/common/libnetwork/internal/util github.com/containers/common/libnetwork/netavark github.com/containers/common/libnetwork/network @@ -223,6 +224,7 @@ github.com/containers/common/pkg/ssh github.com/containers/common/pkg/subscriptions github.com/containers/common/pkg/supplemented github.com/containers/common/pkg/sysinfo +github.com/containers/common/pkg/systemd github.com/containers/common/pkg/timetype github.com/containers/common/pkg/umask github.com/containers/common/pkg/util