Remove runc edit to lock to specific version

Fixes: https://github.com/containers/podman/issues/19795

Signed-off-by: Daniel J Walsh <dwalsh@redhat.com>
This commit is contained in:
Daniel J Walsh
2024-08-02 16:04:10 -04:00
parent 43a077db87
commit 37226e2e93
26 changed files with 133 additions and 494 deletions

View File

@ -1,5 +1,4 @@
//go:build !linux
// +build !linux
package apparmor

View File

@ -12,7 +12,8 @@ var (
ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules")
// DevicesSetV1 and DevicesSetV2 are functions to set devices for
// cgroup v1 and v2, respectively. Unless libcontainer/cgroups/devices
// cgroup v1 and v2, respectively. Unless
// [github.com/opencontainers/runc/libcontainer/cgroups/devices]
// package is imported, it is set to nil, so cgroup managers can't
// manage devices.
DevicesSetV1 func(path string, r *configs.Resources) error

View File

@ -57,6 +57,40 @@ func WriteFile(dir, file, data string) error {
return nil
}
// WriteFileByLine is the same as WriteFile, except if data contains newlines,
// it is written line by line.
func WriteFileByLine(dir, file, data string) error {
i := strings.Index(data, "\n")
if i == -1 {
return WriteFile(dir, file, data)
}
fd, err := OpenFile(dir, file, unix.O_WRONLY)
if err != nil {
return err
}
defer fd.Close()
start := 0
for {
var line string
if i == -1 {
line = data[start:]
} else {
line = data[start : start+i+1]
}
_, err := fd.WriteString(line)
if err != nil {
return fmt.Errorf("failed to write %q: %w", line, err)
}
if i == -1 {
break
}
start += i + 1
i = strings.Index(data[start:], "\n")
}
return nil
}
const (
cgroupfsDir = "/sys/fs/cgroup"
cgroupfsPrefix = cgroupfsDir + "/"

View File

@ -35,15 +35,31 @@ func (s *CpuGroup) Apply(path string, r *configs.Resources, pid int) error {
}
func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error {
var period string
if r.CpuRtPeriod != 0 {
if err := cgroups.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil {
return err
period = strconv.FormatUint(r.CpuRtPeriod, 10)
if err := cgroups.WriteFile(path, "cpu.rt_period_us", period); err != nil {
// The values of cpu.rt_period_us and cpu.rt_runtime_us
// are inter-dependent and need to be set in a proper order.
// If the kernel rejects the new period value with EINVAL
// and the new runtime value is also being set, let's
// ignore the error for now and retry later.
if !errors.Is(err, unix.EINVAL) || r.CpuRtRuntime == 0 {
return err
}
} else {
period = ""
}
}
if r.CpuRtRuntime != 0 {
if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil {
return err
}
if period != "" {
if err := cgroups.WriteFile(path, "cpu.rt_period_us", period); err != nil {
return err
}
}
}
return nil
}
@ -89,9 +105,11 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
if r.CpuBurst != nil {
burst = strconv.FormatUint(*r.CpuBurst, 10)
if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil {
// this is a special trick for burst feature, the current systemd and low version of kernel will not support it.
// So, an `no such file or directory` error would be raised, and we can ignore it .
if !errors.Is(err, unix.ENOENT) {
if errors.Is(err, unix.ENOENT) {
// If CPU burst knob is not available (e.g.
// older kernel), ignore it.
burst = ""
} else {
// Sometimes when the burst to be set is larger
// than the current one, it is rejected by the kernel
// (EINVAL) as old_quota/new_burst exceeds the parent
@ -117,9 +135,7 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
}
if burst != "" {
if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil {
if !errors.Is(err, unix.ENOENT) {
return err
}
return err
}
}
}

View File

@ -233,7 +233,7 @@ func (m *Manager) setUnified(res map[string]string) error {
if strings.Contains(k, "/") {
return fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
}
if err := cgroups.WriteFile(m.dirPath, k, v); err != nil {
if err := cgroups.WriteFileByLine(m.dirPath, k, v); err != nil {
// Check for both EPERM and ENOENT since O_CREAT is used by WriteFile.
if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) {
// Check if a controller is available,

View File

@ -57,7 +57,10 @@ func setMemory(dirPath string, r *configs.Resources) error {
// never write empty string to `memory.swap.max`, it means set to 0.
if swapStr != "" {
if err := cgroups.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil {
return err
// If swap is not enabled, silently ignore setting to max or disabling it.
if !(errors.Is(err, os.ErrNotExist) && (swapStr == "max" || swapStr == "0")) {
return err
}
}
}

View File

@ -136,18 +136,18 @@ func GetAllSubsystems() ([]string, error) {
return subsystems, nil
}
func readProcsFile(dir string) ([]int, error) {
f, err := OpenFile(dir, CgroupProcesses, os.O_RDONLY)
func readProcsFile(dir string) (out []int, _ error) {
file := CgroupProcesses
retry := true
again:
f, err := OpenFile(dir, file, os.O_RDONLY)
if err != nil {
return nil, err
}
defer f.Close()
var (
s = bufio.NewScanner(f)
out = []int{}
)
s := bufio.NewScanner(f)
for s.Scan() {
if t := s.Text(); t != "" {
pid, err := strconv.Atoi(t)
@ -157,6 +157,13 @@ func readProcsFile(dir string) ([]int, error) {
out = append(out, pid)
}
}
if errors.Is(s.Err(), unix.ENOTSUP) && retry {
// For a threaded cgroup, read returns ENOTSUP, and we should
// read from cgroup.threads instead.
file = "cgroup.threads"
retry = false
goto again
}
return out, s.Err()
}
@ -275,9 +282,7 @@ func RemovePaths(paths map[string]string) (err error) {
}
}
if len(paths) == 0 {
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
// TODO: switch to clear once Go < 1.21 is not supported.
paths = make(map[string]string)
clear(paths)
return nil
}
return fmt.Errorf("Failed to remove paths: %v", paths)

View File

@ -1,5 +1,4 @@
//go:build !linux
// +build !linux
package configs

View File

@ -222,6 +222,9 @@ type Config struct {
// Personality contains configuration for the Linux personality syscall.
Personality *LinuxPersonality `json:"personality,omitempty"`
// IOPriority is the container's I/O priority.
IOPriority *IOPriority `json:"io_priority,omitempty"`
}
// Scheduler is based on the Linux sched_setattr(2) syscall.
@ -283,6 +286,14 @@ func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) {
}, nil
}
var IOPrioClassMapping = map[specs.IOPriorityClass]int{
specs.IOPRIO_CLASS_RT: 1,
specs.IOPRIO_CLASS_BE: 2,
specs.IOPRIO_CLASS_IDLE: 3,
}
type IOPriority = specs.LinuxIOPriority
type (
HookName string
HookList []Hook

View File

@ -1,5 +1,4 @@
//go:build gofuzz
// +build gofuzz
package configs

View File

@ -1,5 +1,4 @@
//go:build !linux
// +build !linux
package configs

View File

@ -1,5 +1,4 @@
//go:build linux
// +build linux
package configs

View File

@ -1,5 +1,4 @@
//go:build !linux && !windows
// +build !linux,!windows
package configs

View File

@ -1,5 +1,4 @@
//go:build !linux
// +build !linux
package configs

View File

@ -1,5 +1,4 @@
//go:build !windows
// +build !windows
package devices

View File

@ -1,4 +1,8 @@
package userns
// RunningInUserNS detects whether we are currently running in a user namespace.
var RunningInUserNS = runningInUserNS
// RunningInUserNS detects whether we are currently running in a Linux
// user namespace and memoizes the result. It returns false on non-Linux
// platforms.
func RunningInUserNS() bool {
return inUserNS()
}

View File

@ -7,32 +7,26 @@ import (
"sync"
)
var (
inUserNS bool
nsOnce sync.Once
)
var inUserNS = sync.OnceValue(runningInUserNS)
// runningInUserNS detects whether we are currently running in a user namespace.
//
// Originally copied from https://github.com/lxc/incus/blob/e45085dd42f826b3c8c3228e9733c0b6f998eafe/shared/util.go#L678-L700.
func runningInUserNS() bool {
nsOnce.Do(func() {
file, err := os.Open("/proc/self/uid_map")
if err != nil {
// This kernel-provided file only exists if user namespaces are supported.
return
}
defer file.Close()
file, err := os.Open("/proc/self/uid_map")
if err != nil {
// This kernel-provided file only exists if user namespaces are supported.
return false
}
defer file.Close()
buf := bufio.NewReader(file)
l, _, err := buf.ReadLine()
if err != nil {
return
}
buf := bufio.NewReader(file)
l, _, err := buf.ReadLine()
if err != nil {
return false
}
inUserNS = uidMapInUserNS(string(l))
})
return inUserNS
return uidMapInUserNS(string(l))
}
func uidMapInUserNS(uidMap string) bool {

View File

@ -1,5 +1,4 @@
//go:build gofuzz
// +build gofuzz
//go:build linux && gofuzz
package userns

View File

@ -1,79 +0,0 @@
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <stdarg.h>
#include <stdlib.h>
/*
* All of the code here is run inside an aync-signal-safe context, so we need
* to be careful to not call any functions that could cause issues. In theory,
* since we are a Go program, there are fewer restrictions in practice, it's
* better to be safe than sorry.
*
* The only exception is exit, which we need to call to make sure we don't
* return into runc.
*/
void bail(int pipefd, const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
vdprintf(pipefd, fmt, args);
va_end(args);
exit(1);
}
int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd)
{
char buffer[4096] = { 0 };
pid_t child = fork();
if (child != 0)
return child;
/* in child */
/* Join the target userns. */
int nsfd = open(userns_path, O_RDONLY);
if (nsfd < 0)
bail(errfd, "open userns path %s failed: %m", userns_path);
int err = setns(nsfd, CLONE_NEWUSER);
if (err < 0)
bail(errfd, "setns %s failed: %m", userns_path);
close(nsfd);
/* Pipe the requested file contents. */
int fd = open(path, O_RDONLY);
if (fd < 0)
bail(errfd, "open %s in userns %s failed: %m", path, userns_path);
int nread, ntotal = 0;
while ((nread = read(fd, buffer, sizeof(buffer))) != 0) {
if (nread < 0)
bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal);
ntotal += nread;
int nwritten = 0;
while (nwritten < nread) {
int n = write(outfd, buffer, nread - nwritten);
if (n < 0)
bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m",
nread - nwritten, path, nwritten);
nwritten += n;
}
if (nread != nwritten)
bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten);
}
close(fd);
close(outfd);
close(errfd);
/* We must exit here, otherwise we would return into a forked runc. */
exit(0);
}

View File

@ -1,186 +0,0 @@
//go:build linux
package userns
import (
"bufio"
"bytes"
"fmt"
"io"
"os"
"unsafe"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
)
/*
#include <stdlib.h>
extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd);
*/
import "C"
func parseIdmapData(data []byte) (ms []configs.IDMap, err error) {
scanner := bufio.NewScanner(bytes.NewReader(data))
for scanner.Scan() {
var m configs.IDMap
line := scanner.Text()
if _, err := fmt.Sscanf(line, "%d %d %d", &m.ContainerID, &m.HostID, &m.Size); err != nil {
return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", line, err)
}
ms = append(ms, m)
}
if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("parsing id map failed: %w", err)
}
return ms, nil
}
// Do something equivalent to nsenter --user=<nsPath> cat <path>, but more
// efficiently. Returns the contents of the requested file from within the user
// namespace.
func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) {
rdr, wtr, err := os.Pipe()
if err != nil {
return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err)
}
defer rdr.Close()
defer wtr.Close()
errRdr, errWtr, err := os.Pipe()
if err != nil {
return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err)
}
defer errRdr.Close()
defer errWtr.Close()
cNsPath := C.CString(nsPath)
defer C.free(unsafe.Pointer(cNsPath))
cPath := C.CString(path)
defer C.free(unsafe.Pointer(cPath))
childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd()))
if childPid < 0 {
return nil, fmt.Errorf("failed to spawn fork for userns")
} else if childPid == 0 {
// this should never happen
panic("runc executing inside fork child -- unsafe state!")
}
// We are in the parent -- close the write end of the pipe before reading.
wtr.Close()
output, err := io.ReadAll(rdr)
rdr.Close()
if err != nil {
return nil, fmt.Errorf("reading from userns spawn failed: %w", err)
}
// Ditto for the error pipe.
errWtr.Close()
errOutput, err := io.ReadAll(errRdr)
errRdr.Close()
if err != nil {
return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err)
}
errOutput = bytes.TrimSpace(errOutput)
// Clean up the child.
child, err := os.FindProcess(int(childPid))
if err != nil {
return nil, fmt.Errorf("could not find userns spawn process: %w", err)
}
state, err := child.Wait()
if err != nil {
return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err)
}
if !state.Success() {
errStr := string(errOutput)
if errStr == "" {
errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode())
}
return nil, fmt.Errorf("userns spawn: %s", errStr)
} else if len(errOutput) > 0 {
// We can just ignore weird output in the error pipe if the process
// didn't bail(), but for completeness output for debugging.
logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput))
}
// The subprocess succeeded, return whatever it wrote to the pipe.
return output, nil
}
func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) {
var (
pid int
extra rune
tryFastPath bool
)
// nsPath is usually of the form /proc/<pid>/ns/user, which means that we
// already have a pid that is part of the user namespace and thus we can
// just use the pid to read from /proc/<pid>/*id_map.
//
// Note that Sscanf doesn't consume the whole input, so we check for any
// trailing data with %c. That way, we can be sure the pattern matched
// /proc/$pid/ns/user _exactly_ iff n === 1.
if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 {
tryFastPath = pid > 0
}
for _, mapType := range []struct {
name string
idMap *[]configs.IDMap
}{
{"uid_map", &uidMap},
{"gid_map", &gidMap},
} {
var mapData []byte
if tryFastPath {
path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name)
data, err := os.ReadFile(path)
if err != nil {
// Do not error out here -- we need to try the slow path if the
// fast path failed.
logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err)
} else {
mapData = data
}
} else {
logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath)
}
if mapData == nil {
// We have to actually join the namespace if we cannot take the
// fast path. The path is resolved with respect to the child
// process, so just use /proc/self.
data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name)
if err != nil {
return nil, nil, err
}
mapData = data
}
idMap, err := parseIdmapData(mapData)
if err != nil {
return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err)
}
*mapType.idMap = idMap
}
return uidMap, gidMap, nil
}
// IsSameMapping returns whether or not the two id mappings are the same. Note
// that if the order of the mappings is different, or a mapping has been split,
// the mappings will be considered different.
func IsSameMapping(a, b []configs.IDMap) bool {
if len(a) != len(b) {
return false
}
for idx := range a {
if a[idx] != b[idx] {
return false
}
}
return true
}

View File

@ -1,16 +1,6 @@
//go:build !linux
// +build !linux
package userns
// runningInUserNS is a stub for non-Linux systems
// Always returns false
func runningInUserNS() bool {
return false
}
// uidMapInUserNS is a stub for non-Linux systems
// Always returns false
func uidMapInUserNS(uidMap string) bool {
return false
}
// inUserNS is a stub for non-Linux systems. Always returns false.
func inUserNS() bool { return false }

View File

@ -1,156 +0,0 @@
package userns
import (
"fmt"
"os"
"sort"
"strings"
"sync"
"syscall"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/configs"
)
type Mapping struct {
UIDMappings []configs.IDMap
GIDMappings []configs.IDMap
}
func (m Mapping) toSys() (uids, gids []syscall.SysProcIDMap) {
for _, uid := range m.UIDMappings {
uids = append(uids, syscall.SysProcIDMap{
ContainerID: int(uid.ContainerID),
HostID: int(uid.HostID),
Size: int(uid.Size),
})
}
for _, gid := range m.GIDMappings {
gids = append(gids, syscall.SysProcIDMap{
ContainerID: int(gid.ContainerID),
HostID: int(gid.HostID),
Size: int(gid.Size),
})
}
return
}
// id returns a unique identifier for this mapping, agnostic of the order of
// the uid and gid mappings (because the order doesn't matter to the kernel).
// The set of userns handles is indexed using this ID.
func (m Mapping) id() string {
var uids, gids []string
for _, idmap := range m.UIDMappings {
uids = append(uids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size))
}
for _, idmap := range m.GIDMappings {
gids = append(gids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size))
}
// We don't care about the sort order -- just sort them.
sort.Strings(uids)
sort.Strings(gids)
return "uid=" + strings.Join(uids, ",") + ";gid=" + strings.Join(gids, ",")
}
type Handles struct {
m sync.Mutex
maps map[string]*os.File
}
// Release all resources associated with this Handle. All existing files
// returned from Get() will continue to work even after calling Release(). The
// same Handles can be re-used after calling Release().
func (hs *Handles) Release() {
hs.m.Lock()
defer hs.m.Unlock()
// Close the files for good measure, though GC will do that for us anyway.
for _, file := range hs.maps {
_ = file.Close()
}
hs.maps = nil
}
func spawnProc(req Mapping) (*os.Process, error) {
// We need to spawn a subprocess with the requested mappings, which is
// unfortunately quite expensive. The "safe" way of doing this is natively
// with Go (and then spawning something like "sleep infinity"), but
// execve() is a waste of cycles because we just need some process to have
// the right mapping, we don't care what it's executing. The "unsafe"
// option of doing a clone() behind the back of Go is probably okay in
// theory as long as we just do kill(getpid(), SIGSTOP). However, if we
// tell Go to put the new process into PTRACE_TRACEME mode, we can avoid
// the exec and not have to faff around with the mappings.
//
// Note that Go's stdlib does not support newuidmap, but in the case of
// id-mapped mounts, it seems incredibly unlikely that the user will be
// requesting us to do a remapping as an unprivileged user with mappings
// they have privileges over.
logrus.Debugf("spawning dummy process for id-mapping %s", req.id())
uidMappings, gidMappings := req.toSys()
// We don't need to use /proc/thread-self here because the exe mm of a
// thread-group is guaranteed to be the same for all threads by definition.
// This lets us avoid having to do runtime.LockOSThread.
return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, &os.ProcAttr{
Sys: &syscall.SysProcAttr{
Cloneflags: unix.CLONE_NEWUSER,
UidMappings: uidMappings,
GidMappings: gidMappings,
GidMappingsEnableSetgroups: false,
// Put the process into PTRACE_TRACEME mode to allow us to get the
// userns without having a proper execve() target.
Ptrace: true,
},
})
}
func dupFile(f *os.File) (*os.File, error) {
newFd, err := unix.FcntlInt(f.Fd(), unix.F_DUPFD_CLOEXEC, 0)
if err != nil {
return nil, os.NewSyscallError("fcntl(F_DUPFD_CLOEXEC)", err)
}
return os.NewFile(uintptr(newFd), f.Name()), nil
}
// Get returns a handle to a /proc/$pid/ns/user nsfs file with the requested
// mapping. The processes spawned to produce userns nsfds are cached, so if
// equivalent user namespace mappings are requested, the same user namespace
// will be returned. The caller is responsible for closing the returned file
// descriptor.
func (hs *Handles) Get(req Mapping) (file *os.File, err error) {
hs.m.Lock()
defer hs.m.Unlock()
if hs.maps == nil {
hs.maps = make(map[string]*os.File)
}
file, ok := hs.maps[req.id()]
if !ok {
proc, err := spawnProc(req)
if err != nil {
return nil, fmt.Errorf("failed to spawn dummy process for map %s: %w", req.id(), err)
}
// Make sure we kill the helper process. We ignore errors because
// there's not much we can do about them anyway, and ultimately
defer func() {
_ = proc.Kill()
_, _ = proc.Wait()
}()
// Stash away a handle to the userns file. This is neater than keeping
// the process alive, because Go's GC can handle files much better than
// leaked processes, and having long-living useless processes seems
// less than ideal.
file, err = os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid))
if err != nil {
return nil, err
}
hs.maps[req.id()] = file
}
// Duplicate the file, to make sure the lifecycle of each *os.File we
// return is independent.
return dupFile(file)
}

View File

@ -1,5 +1,4 @@
//go:build !windows
// +build !windows
package utils
@ -10,6 +9,7 @@ import (
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
_ "unsafe" // for go:linkname
@ -261,3 +261,17 @@ func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
}
// IsLexicallyInRoot is shorthand for strings.HasPrefix(path+"/", root+"/"),
// but properly handling the case where path or root are "/".
//
// NOTE: The return value only make sense if the path doesn't contain "..".
func IsLexicallyInRoot(root, path string) bool {
if root != "/" {
root += "/"
}
if path != "/" {
path += "/"
}
return strings.HasPrefix(path, root)
}