use rootless netns from c/common

Use the new rootlessnetns logic from c/common, drop the podman code
here and make use of the new much simpler API.

ref: https://github.com/containers/common/pull/1761

[NO NEW TESTS NEEDED]

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
Paul Holzinger
2023-11-24 18:00:24 +01:00
parent 605a29a714
commit a687c38860
38 changed files with 1171 additions and 1072 deletions

View File

@@ -4,6 +4,7 @@
package cgroups
import (
"bufio"
"bytes"
"errors"
"fmt"
@@ -11,6 +12,7 @@ import (
"path"
"path/filepath"
"strings"
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
@@ -143,3 +145,171 @@ func SetBlkioThrottle(res *configs.Resources, cgroupPath string) error {
}
return nil
}
// Code below was moved from podman/utils/utils_supported.go and should probably be better
// integrated here as some parts may be redundant.
// getCgroupProcess parses a /proc/<pid>/cgroup style file and returns the
// cgroup path it describes.
//
// Every line must have the form "hierarchy-ID:controller-list:path". The
// first line starting with "0::" (the cgroup v2 entry) wins immediately;
// otherwise the longest path among all lines is returned.
//
// An error is returned when the file cannot be opened or fully read, when a
// line is malformed, when no cgroup path was found, or when the result is the
// root cgroup "/" and allowRoot is false.
func getCgroupProcess(procFile string, allowRoot bool) (string, error) {
	f, err := os.Open(procFile)
	if err != nil {
		return "", err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	cgroup := ""
	for scanner.Scan() {
		line := scanner.Text()
		parts := strings.SplitN(line, ":", 3)
		if len(parts) != 3 {
			return "", fmt.Errorf("cannot parse cgroup line %q", line)
		}
		if strings.HasPrefix(line, "0::") {
			cgroup = line[3:]
			break
		}
		if len(parts[2]) > len(cgroup) {
			cgroup = parts[2]
		}
	}
	// Scan also returns false on read errors; without this check a partially
	// read file would be reported as a valid (but possibly wrong) cgroup.
	if err := scanner.Err(); err != nil {
		return "", fmt.Errorf("reading %q: %w", procFile, err)
	}
	if len(cgroup) == 0 || (!allowRoot && cgroup == "/") {
		return "", fmt.Errorf("could not find cgroup mount in %q", procFile)
	}
	return cgroup, nil
}
// GetOwnCgroup returns the cgroup for the current process, as listed in
// /proc/self/cgroup. The root cgroup "/" is accepted as a valid result.
func GetOwnCgroup() (string, error) {
return getCgroupProcess("/proc/self/cgroup", true)
}
// GetOwnCgroupDisallowRoot returns the cgroup for the current process, as
// listed in /proc/self/cgroup, but returns an error when that cgroup is the
// root cgroup "/".
func GetOwnCgroupDisallowRoot() (string, error) {
return getCgroupProcess("/proc/self/cgroup", false)
}
// GetCgroupProcess returns the cgroup for the specified process, as listed in
// /proc/<pid>/cgroup. The root cgroup "/" is accepted as a valid result.
func GetCgroupProcess(pid int) (string, error) {
return getCgroupProcess(fmt.Sprintf("/proc/%d/cgroup", pid), true)
}
// MoveUnderCgroupSubtree moves the processes of the calling process's current
// cgroup into the sub-cgroup subtree below it. It is shorthand for
// MoveUnderCgroup with an empty cgroup and no explicit process list.
func MoveUnderCgroupSubtree(subtree string) error {
return MoveUnderCgroup("", subtree, nil)
}
// MoveUnderCgroup moves a group of processes to a new cgroup.
// If cgroup is the empty string, then the current calling process cgroup is used.
// If processes is empty, then the processes from the current cgroup are moved.
//
// The target cgroup <root>/<cgroup>/<subtree> is created if needed, once for
// every hierarchy listed in /proc/self/cgroup. Failures to move an individual
// process are only logged at debug level; failures to read /proc/self/cgroup,
// to create the target cgroup or to open its cgroup.procs file are returned.
func MoveUnderCgroup(cgroup, subtree string, processes []uint32) error {
	procFile := "/proc/self/cgroup"
	f, err := os.Open(procFile)
	if err != nil {
		return err
	}
	defer f.Close()

	unifiedMode, err := IsCgroup2UnifiedMode()
	if err != nil {
		return err
	}

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := scanner.Text()
		parts := strings.SplitN(line, ":", 3)
		if len(parts) != 3 {
			return fmt.Errorf("cannot parse cgroup line %q", line)
		}

		// root cgroup, skip it
		if parts[2] == "/" && !(unifiedMode && parts[1] == "") {
			continue
		}

		cgroupRoot := "/sys/fs/cgroup"
		// Special case the unified mount on hybrid cgroup and named hierarchies.
		// This works on Fedora 31, but we should really parse the mounts to see
		// where the cgroup hierarchy is mounted.
		if parts[1] == "" && !unifiedMode {
			// If it is not using unified mode, the cgroup v2 hierarchy is
			// usually mounted under /sys/fs/cgroup/unified
			cgroupRoot = filepath.Join(cgroupRoot, "unified")

			// Ignore the unified mount if it doesn't exist
			if _, err := os.Stat(cgroupRoot); err != nil && os.IsNotExist(err) {
				continue
			}
		} else if parts[1] != "" {
			// Assume the controller is mounted at /sys/fs/cgroup/$CONTROLLER.
			controller := strings.TrimPrefix(parts[1], "name=")
			cgroupRoot = filepath.Join(cgroupRoot, controller)
		}

		parentCgroup := cgroup
		if parentCgroup == "" {
			parentCgroup = parts[2]
		}
		newCgroup := filepath.Join(cgroupRoot, parentCgroup, subtree)
		if err := os.MkdirAll(newCgroup, 0o755); err != nil && !os.IsExist(err) {
			return err
		}

		currentProcs := filepath.Join(cgroupRoot, parts[2], "cgroup.procs")
		if err := moveProcsIntoCgroup(newCgroup, currentProcs, processes); err != nil {
			return err
		}
	}
	// Scan stops silently on read errors; report them instead of pretending
	// that every hierarchy was handled.
	return scanner.Err()
}

// moveProcsIntoCgroup writes PIDs into newCgroup/cgroup.procs. When processes
// is empty the PIDs are taken from the currentProcs file instead. The target
// file is closed before returning, so callers looping over several
// hierarchies do not accumulate open descriptors.
func moveProcsIntoCgroup(newCgroup, currentProcs string, processes []uint32) error {
	f, err := os.OpenFile(filepath.Join(newCgroup, "cgroup.procs"), os.O_RDWR, 0o755)
	if err != nil {
		return err
	}
	defer f.Close()

	if len(processes) > 0 {
		for _, pid := range processes {
			if _, err := fmt.Fprintf(f, "%d\n", pid); err != nil {
				logrus.Debugf("Cannot move process %d to cgroup %q: %v", pid, newCgroup, err)
			}
		}
		return nil
	}

	processesData, err := os.ReadFile(currentProcs)
	if err != nil {
		return err
	}
	for _, pid := range bytes.Split(processesData, []byte("\n")) {
		if len(pid) == 0 {
			continue
		}
		if _, err := f.Write(pid); err != nil {
			logrus.Debugf("Cannot move process %s to cgroup %q: %v", string(pid), newCgroup, err)
		}
	}
	return nil
}
var (
	maybeMoveToSubCgroupSync    sync.Once
	maybeMoveToSubCgroupSyncErr error
)

// MaybeMoveToSubCgroup moves the current process in a sub cgroup when
// it is running in the root cgroup on a system that uses cgroupv2.
// The check and the move happen only on the first call; every call returns
// the outcome recorded by that first attempt.
func MaybeMoveToSubCgroup() error {
	maybeMoveToSubCgroupSync.Do(func() {
		maybeMoveToSubCgroupSyncErr = func() error {
			unified, err := IsCgroup2UnifiedMode()
			if err != nil || !unified {
				// Either detection failed (err is set) or there is
				// nothing to do on a non-unified system (err is nil).
				return err
			}
			own, err := GetOwnCgroup()
			if err != nil {
				return err
			}
			if own != "/" {
				return nil
			}
			return MoveUnderCgroupSubtree("init")
		}()
	})
	return maybeMoveToSubCgroupSyncErr
}

View File

@@ -32,10 +32,12 @@ import (
"github.com/containernetworking/plugins/pkg/ns"
"github.com/containers/storage/pkg/homedir"
"github.com/containers/storage/pkg/unshare"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// threadNsPath is the /proc path to the current netns handle for the current thread
const threadNsPath = "/proc/thread-self/ns/net"
// GetNSRunDir returns the dir of where to create the netNS. When running
// rootless, it needs to be at a location writable by user.
func GetNSRunDir() (string, error) {
@@ -49,6 +51,10 @@ func GetNSRunDir() (string, error) {
return "/run/netns", nil
}
// NewNSAtPath creates a network namespace at the given nsPath by delegating
// to newNSPath, which creates the mount point file at that path.
// NOTE(review): newNSPath is only partially visible here; presumably the
// result is a persistent (bind-mounted) netns like the one NewNS returns —
// confirm against the full implementation.
func NewNSAtPath(nsPath string) (ns.NetNS, error) {
return newNSPath(nsPath)
}
// NewNS creates a new persistent (bind-mounted) network namespace and returns
// an object representing that namespace, without switching to it.
func NewNS() (ns.NetNS, error) {
@@ -111,8 +117,12 @@ func NewNSWithName(name string) (ns.NetNS, error) {
}
}
// create an empty file at the mount point
nsPath := path.Join(nsRunDir, name)
return newNSPath(nsPath)
}
func newNSPath(nsPath string) (ns.NetNS, error) {
// create an empty file at the mount point
mountPointFd, err := os.OpenFile(nsPath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0o600)
if err != nil {
return nil, err
@@ -140,24 +150,10 @@ func NewNSWithName(name string) (ns.NetNS, error) {
// Don't unlock. By not unlocking, golang will kill the OS thread when the
// goroutine is done (for go1.10+)
threadNsPath := getCurrentThreadNetNSPath()
var origNS ns.NetNS
origNS, err = ns.GetNS(threadNsPath)
if err != nil {
logrus.Warnf("Cannot open current network namespace %s: %q", threadNsPath, err)
return
}
defer func() {
if err := origNS.Close(); err != nil {
logrus.Errorf("Unable to close namespace: %q", err)
}
}()
// create a new netns on the current thread
err = unix.Unshare(unix.CLONE_NEWNET)
if err != nil {
logrus.Warnf("Cannot create a new network namespace: %q", err)
err = fmt.Errorf("unshare network namespace: %w", err)
return
}
@@ -181,13 +177,8 @@ func NewNSWithName(name string) (ns.NetNS, error) {
// UnmountNS unmounts the given netns path
func UnmountNS(nsPath string) error {
nsRunDir, err := GetNSRunDir()
if err != nil {
return err
}
// Only unmount if it's been bind-mounted (don't touch namespaces in /proc...)
if strings.HasPrefix(nsPath, nsRunDir) {
if !strings.HasPrefix(nsPath, "/proc/") {
if err := unix.Unmount(nsPath, unix.MNT_DETACH); err != nil {
return fmt.Errorf("failed to unmount NS: at %s: %v", nsPath, err)
}
@@ -199,11 +190,3 @@ func UnmountNS(nsPath string) error {
return nil
}
// getCurrentThreadNetNSPath copied from pkg/ns
//
// It builds the /proc path of the network namespace of the calling thread.
func getCurrentThreadNetNSPath() string {
	// /proc/self/ns/net would name the main thread's namespace rather than
	// the one of the thread this goroutine runs on, and that thread may be
	// switching namespaces, so address it explicitly by pid and tid.
	pid := os.Getpid()
	tid := unix.Gettid()
	return fmt.Sprintf("/proc/%d/task/%d/ns/net", pid, tid)
}

View File

@@ -0,0 +1,151 @@
package systemd
import (
	"context"
	"crypto/rand"
	"fmt"
	"os"
	"strconv"
	"strings"
	"sync"

	"github.com/containers/common/pkg/cgroups"
	"github.com/containers/storage/pkg/unshare"
	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
	"github.com/godbus/dbus/v5"
	"github.com/sirupsen/logrus"
)
var (
	runsOnSystemdOnce sync.Once
	runsOnSystemd     bool
)

// RunsOnSystemd returns whether the system is using systemd.
// The check is performed once; later calls return the cached answer.
func RunsOnSystemd() bool {
	runsOnSystemdOnce.Do(func() {
		// per sd_booted(3), check for this dir
		info, statErr := os.Stat("/run/systemd/system")
		if statErr != nil {
			runsOnSystemd = false
			return
		}
		runsOnSystemd = info.IsDir()
	})
	return runsOnSystemd
}
// moveProcessPIDFileToScope reads a PID from the file at pidPath and moves
// that process into the systemd scope named scope inside slice.
//
// A missing pid file is not an error and results in a nil return. Whitespace
// around the number (for example a trailing newline) is ignored. Any other
// read or parse failure is returned wrapped.
func moveProcessPIDFileToScope(pidPath, slice, scope string) error {
	data, err := os.ReadFile(pidPath)
	if err != nil {
		// do not raise an error if the file doesn't exist
		if os.IsNotExist(err) {
			return nil
		}
		return fmt.Errorf("cannot read pid file: %w", err)
	}
	// pid files are commonly written with a trailing newline, which
	// ParseUint would reject.
	pid, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 0)
	if err != nil {
		return fmt.Errorf("cannot parse pid file %s: %w", pidPath, err)
	}

	return moveProcessToScope(int(pid), slice, scope)
}
// moveProcessToScope places pid into the systemd scope named scope inside
// slice via RunUnderSystemdScope. A D-Bus "UnixProcessIdUnknown" error is
// treated as success, since it means the PID is no longer valid.
func moveProcessToScope(pid int, slice, scope string) error {
	err := RunUnderSystemdScope(pid, slice, scope)
	dbusErr, isDbusErr := err.(dbus.Error)
	// If the PID is not valid anymore, do not return an error.
	if isDbusErr && dbusErr.Name == "org.freedesktop.DBus.Error.UnixProcessIdUnknown" {
		return nil
	}
	return err
}
// MoveRootlessNetnsSlirpProcessToUserSlice moves the slirp4netns process for the rootless netns
// into a different scope so that systemd does not kill it with a container.
// The scope lives in user.slice and gets a random 4-byte hex suffix in its name.
func MoveRootlessNetnsSlirpProcessToUserSlice(pid int) error {
	var suffix [4]byte
	if _, err := rand.Read(suffix[:]); err != nil {
		return err
	}
	scope := fmt.Sprintf("rootless-netns-%x.scope", suffix[:])
	return moveProcessToScope(pid, "user.slice", scope)
}
// MovePauseProcessToScope moves the pause process used for rootless mode to keep the namespaces alive to
// a separate scope.
//
// The PID is read from pausePidPath. Up to ten attempts are made, each with a
// freshly generated random scope name in user.slice. The function never
// returns an error: if all attempts fail, the last error is logged as a
// warning when running on systemd with unified cgroups, and at debug level
// otherwise.
func MovePauseProcessToScope(pausePidPath string) {
	var err error

	for i := 0; i < 10; i++ {
		randBytes := make([]byte, 4)
		_, err = rand.Read(randBytes)
		if err != nil {
			logrus.Errorf("failed to read random bytes: %v", err)
			continue
		}
		err = moveProcessPIDFileToScope(pausePidPath, "user.slice", fmt.Sprintf("podman-pause-%x.scope", randBytes))
		if err == nil {
			return
		}
	}

	if err != nil {
		unified, err2 := cgroups.IsCgroup2UnifiedMode()
		if err2 != nil {
			// Report the detection failure itself (err2), not the
			// earlier scope error held in err.
			logrus.Warnf("Failed to detect if running with cgroup unified: %v", err2)
		}
		if RunsOnSystemd() && unified {
			logrus.Warnf("Failed to add pause process to systemd sandbox cgroup: %v", err)
		} else {
			logrus.Debugf("Failed to add pause process to systemd sandbox cgroup: %v", err)
		}
	}
}
// RunUnderSystemdScope adds the specified pid to a systemd scope.
//
// A transient unit named unitName is started in slice with the pid attached.
// If starting it fails, the unit's existing control group is looked up and
// the pid is moved there; when that fallback also fails, the original start
// error is returned.
func RunUnderSystemdScope(pid int, slice string, unitName string) error {
	var (
		conn *systemdDbus.Conn
		err  error
	)
	ctx := context.Background()

	// Rootless callers use the user connection, root uses the system one.
	if uid := unshare.GetRootlessUID(); uid != 0 {
		conn, err = cgroups.UserConnection(uid)
	} else {
		conn, err = systemdDbus.NewWithContext(ctx)
	}
	if err != nil {
		return err
	}
	defer conn.Close()

	pids := []uint32{uint32(pid)}
	properties := []systemdDbus.Property{
		systemdDbus.PropSlice(slice),
		newProp("PIDs", pids),
		newProp("Delegate", true),
		newProp("DefaultDependencies", false),
	}

	done := make(chan string)
	if _, startErr := conn.StartTransientUnitContext(ctx, unitName, "replace", properties, done); startErr != nil {
		// On errors check if the cgroup already exists, if it does move the process there.
		// Every failure on this fallback path reports the original start error.
		props, propErr := conn.GetUnitTypePropertiesContext(ctx, unitName, "Scope")
		if propErr != nil {
			return startErr
		}
		cgroup, ok := props["ControlGroup"].(string)
		if !ok || cgroup == "" {
			return startErr
		}
		if moveErr := cgroups.MoveUnderCgroup(cgroup, "", []uint32{uint32(pid)}); moveErr != nil {
			return startErr
		}
		return nil
	}

	// Block until job is started
	<-done
	return nil
}
// newProp builds a systemd D-Bus property called name whose value is units
// wrapped in a D-Bus variant.
func newProp(name string, units interface{}) systemdDbus.Property {
	var prop systemdDbus.Property
	prop.Name = name
	prop.Value = dbus.MakeVariant(units)
	return prop
}

View File

@@ -0,0 +1,15 @@
//go:build !linux
package systemd
import "errors"
// RunsOnSystemd always reports false in builds for operating systems other
// than Linux.
func RunsOnSystemd() bool {
return false
}
// MovePauseProcessToScope is a no-op in builds for operating systems other
// than Linux.
func MovePauseProcessToScope(pausePidPath string) {}
// RunUnderSystemdScope always returns an error in builds for operating
// systems other than Linux, where it is not supported.
func RunUnderSystemdScope(pid int, slice string, unitName string) error {
return errors.New("RunUnderSystemdScope not supported on this OS")
}