diff --git a/go.mod b/go.mod index f4d05a138a..b2903d51b5 100644 --- a/go.mod +++ b/go.mod @@ -55,7 +55,7 @@ require ( github.com/onsi/gomega v1.34.2 github.com/opencontainers/go-digest v1.0.0 github.com/opencontainers/image-spec v1.1.0 - github.com/opencontainers/runc v1.2.0-rc.2.0.20240801140032-ad5b481dace5 + github.com/opencontainers/runc v1.2.0-rc.3 github.com/opencontainers/runtime-spec v1.2.0 github.com/opencontainers/runtime-tools v0.9.1-0.20230914150019-408c51e934dc github.com/opencontainers/selinux v1.11.0 diff --git a/go.sum b/go.sum index 34dcb18044..b0586f4053 100644 --- a/go.sum +++ b/go.sum @@ -394,8 +394,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= -github.com/opencontainers/runc v1.2.0-rc.2.0.20240801140032-ad5b481dace5 h1:VqTLG6pS4DlCwEAiwoYoQ3kXnhYCEeHB85vsYeM5ico= -github.com/opencontainers/runc v1.2.0-rc.2.0.20240801140032-ad5b481dace5/go.mod h1:H8njh/SD+WY9bYMmVsEEWDJgJdviOSDjNeXMjeNbYCE= +github.com/opencontainers/runc v1.2.0-rc.3 h1:5vQhejBp4S5w1DwFZ7L3CSOQX9cmcc8JKFy/mOBTJlo= +github.com/opencontainers/runc v1.2.0-rc.3/go.mod h1:HADgqJU4nqAmOpe+uYBTJ4ZRvjks3ptCjKXp1pHqmCc= github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk= github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-tools v0.9.1-0.20230914150019-408c51e934dc h1:d2hUh5O6MRBvStV55MQ8we08t42zSTqBbscoQccWmMc= diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go new file mode 100644 index 0000000000..27e89d635c --- /dev/null +++ 
b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go @@ -0,0 +1,257 @@ +//go:build linux + +package system + +import ( + "fmt" + "io" + "os" + "runtime" + "strconv" + "strings" + "syscall" + "unsafe" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +type ParentDeathSignal int + +func (p ParentDeathSignal) Restore() error { + if p == 0 { + return nil + } + current, err := GetParentDeathSignal() + if err != nil { + return err + } + if p == current { + return nil + } + return p.Set() +} + +func (p ParentDeathSignal) Set() error { + return SetParentDeathSignal(uintptr(p)) +} + +func Exec(cmd string, args []string, env []string) error { + for { + err := unix.Exec(cmd, args, env) + if err != unix.EINTR { + return &os.PathError{Op: "exec", Path: cmd, Err: err} + } + } +} + +func execveat(fd uintptr, pathname string, args []string, env []string, flags int) error { + pathnamep, err := syscall.BytePtrFromString(pathname) + if err != nil { + return err + } + + argvp, err := syscall.SlicePtrFromStrings(args) + if err != nil { + return err + } + + envp, err := syscall.SlicePtrFromStrings(env) + if err != nil { + return err + } + + _, _, errno := syscall.Syscall6( + unix.SYS_EXECVEAT, + fd, + uintptr(unsafe.Pointer(pathnamep)), + uintptr(unsafe.Pointer(&argvp[0])), + uintptr(unsafe.Pointer(&envp[0])), + uintptr(flags), + 0, + ) + return errno +} + +func Fexecve(fd uintptr, args []string, env []string) error { + var err error + for { + err = execveat(fd, "", args, env, unix.AT_EMPTY_PATH) + if err != unix.EINTR { // nolint:errorlint // unix errors are bare + break + } + } + if err == unix.ENOSYS { // nolint:errorlint // unix errors are bare + // Fallback to classic /proc/self/fd/... exec. 
+ return Exec("/proc/self/fd/"+strconv.Itoa(int(fd)), args, env) + } + return os.NewSyscallError("execveat", err) +} + +func SetParentDeathSignal(sig uintptr) error { + if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil { + return err + } + return nil +} + +func GetParentDeathSignal() (ParentDeathSignal, error) { + var sig int + if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil { + return -1, err + } + return ParentDeathSignal(sig), nil +} + +func SetKeepCaps() error { + if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil { + return err + } + + return nil +} + +func ClearKeepCaps() error { + if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil { + return err + } + + return nil +} + +func Setctty() error { + if err := unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil { + return err + } + return nil +} + +// SetSubreaper sets the value i as the subreaper setting for the calling process +func SetSubreaper(i int) error { + return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) +} + +// GetSubreaper returns the subreaper setting for the calling process +func GetSubreaper() (int, error) { + var i uintptr + + if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil { + return -1, err + } + + return int(i), nil +} + +func ExecutableMemfd(comment string, flags int) (*os.File, error) { + // Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this + // flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an + // executable memfd. For vm.memfd_noexec=2 this is a bit more complicated. + // The original vm.memfd_noexec=2 implementation incorrectly silently + // allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer + // kernels, we will get -EACCES if we try to use MFD_EXEC with + // vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value). 
+ // + // The upshot is we only need to retry without MFD_EXEC on -EINVAL because + // it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on + // kernels where -EINVAL is actually a security denial. + memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC) + if err == unix.EINVAL { + memfd, err = unix.MemfdCreate(comment, flags) + } + if err != nil { + if err == unix.EACCES { + logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE") + } + err := os.NewSyscallError("memfd_create", err) + return nil, fmt.Errorf("failed to create executable memfd: %w", err) + } + return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil +} + +// Copy is like io.Copy except it uses sendfile(2) if the source and sink are +// both (*os.File) as an optimisation to make copies faster. +func Copy(dst io.Writer, src io.Reader) (copied int64, err error) { + dstFile, _ := dst.(*os.File) + srcFile, _ := src.(*os.File) + + if dstFile != nil && srcFile != nil { + fi, err := srcFile.Stat() + if err != nil { + goto fallback + } + size := fi.Size() + for size > 0 { + n, err := unix.Sendfile(int(dstFile.Fd()), int(srcFile.Fd()), nil, int(size)) + if n > 0 { + size -= int64(n) + copied += int64(n) + } + if err == unix.EINTR { + continue + } + if err != nil { + if copied == 0 { + // If we haven't copied anything so far, we can safely just + // fallback to io.Copy. We could always do the fallback but + // it's safer to error out in the case of a partial copy + // followed by an error (which should never happen). + goto fallback + } + return copied, fmt.Errorf("partial sendfile copy: %w", err) + } + } + return copied, nil + } + +fallback: + return io.Copy(dst, src) +} + +// SetLinuxPersonality sets the Linux execution personality. For more information see the personality syscall documentation. +// checkout getLinuxPersonalityFromStr() from libcontainer/specconv/spec_linux.go for type conversion. 
+func SetLinuxPersonality(personality int) error { + _, _, errno := unix.Syscall(unix.SYS_PERSONALITY, uintptr(personality), 0, 0) + if errno != 0 { + return &os.SyscallError{Syscall: "set_personality", Err: errno} + } + return nil +} + +func prepareAt(dir *os.File, path string) (int, string) { + if dir == nil { + return unix.AT_FDCWD, path + } + + // Rather than just filepath.Join-ing path here, do it manually so the + // error and handle correctly indicate cases like path=".." as being + // relative to the correct directory. The handle.Name() might end up being + // wrong but because this is (currently) only used in MkdirAllInRoot, that + // isn't a problem. + dirName := dir.Name() + if !strings.HasSuffix(dirName, "/") { + dirName += "/" + } + fullPath := dirName + path + + return int(dir.Fd()), fullPath +} + +func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) { + dirFd, fullPath := prepareAt(dir, path) + fd, err := unix.Openat(dirFd, path, flags, mode) + if err != nil { + return nil, &os.PathError{Op: "openat", Path: fullPath, Err: err} + } + runtime.KeepAlive(dir) + return os.NewFile(uintptr(fd), fullPath), nil +} + +func Mkdirat(dir *os.File, path string, mode uint32) error { + dirFd, fullPath := prepareAt(dir, path) + err := unix.Mkdirat(dirFd, path, mode) + if err != nil { + err = &os.PathError{Op: "mkdirat", Path: fullPath, Err: err} + } + runtime.KeepAlive(dir) + return err +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go new file mode 100644 index 0000000000..774443ec9d --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go @@ -0,0 +1,127 @@ +package system + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" +) + +// State is the status of a process. 
+type State rune + +const ( // Only values for Linux 3.14 and later are listed here + Dead State = 'X' + DiskSleep State = 'D' + Running State = 'R' + Sleeping State = 'S' + Stopped State = 'T' + TracingStop State = 't' + Zombie State = 'Z' + Parked State = 'P' + Idle State = 'I' +) + +// String forms of the state from proc(5)'s documentation for +// /proc/[pid]/status' "State" field. +func (s State) String() string { + switch s { + case Dead: + return "dead" + case DiskSleep: + return "disk sleep" + case Running: + return "running" + case Sleeping: + return "sleeping" + case Stopped: + return "stopped" + case TracingStop: + return "tracing stop" + case Zombie: + return "zombie" + case Parked: + return "parked" + case Idle: + return "idle" // kernel thread + default: + return fmt.Sprintf("unknown (%c)", s) + } +} + +// Stat_t represents the information from /proc/[pid]/stat, as +// described in proc(5) with names based on the /proc/[pid]/status +// fields. +type Stat_t struct { + // Name is the command run by the process. + Name string + + // State is the state of the process. + State State + + // StartTime is the number of clock ticks after system boot (since + // Linux 2.6). + StartTime uint64 +} + +// Stat returns a Stat_t instance for the specified process. +func Stat(pid int) (stat Stat_t, err error) { + bytes, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat")) + if err != nil { + return stat, err + } + return parseStat(string(bytes)) +} + +func parseStat(data string) (stat Stat_t, err error) { + // Example: + // 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0 + // The fields are space-separated, see full description in proc(5). + // + // We are only interested in: + // * field 2: process name. 
It is the only field enclosed into + // parenthesis, as it can contain spaces (and parenthesis) inside. + // * field 3: process state, a single character (%c) + // * field 22: process start time, a long unsigned integer (%llu). + + // 1. Look for the first '(' and the last ')' first, what's in between is Name. + // We expect at least 20 fields and a space after the last one. + + const minAfterName = 20*2 + 1 // the min field is '0 '. + + first := strings.IndexByte(data, '(') + if first < 0 || first+minAfterName >= len(data) { + return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data) + } + + last := strings.LastIndexByte(data, ')') + if last <= first || last+minAfterName >= len(data) { + return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data) + } + + stat.Name = data[first+1 : last] + + // 2. Remove fields 1 and 2 and a space after. State is right after. + data = data[last+2:] + stat.State = State(data[0]) + + // 3. StartTime is field 22, data is at field 3 now, so we need to skip 19 spaces. + skipSpaces := 22 - 3 + for first = 0; skipSpaces > 0 && first < len(data); first++ { + if data[first] == ' ' { + skipSpaces-- + } + } + // Now first points to StartTime; look for space right after. 
+ i := strings.IndexByte(data[first:], ' ') + if i < 0 { + return stat, fmt.Errorf("invalid stat data (too short): %q", data) + } + stat.StartTime, err = strconv.ParseUint(data[first:first+i], 10, 64) + if err != nil { + return stat, fmt.Errorf("invalid stat data (bad start time): %w", err) + } + + return stat, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go new file mode 100644 index 0000000000..4595fa82aa --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go @@ -0,0 +1,15 @@ +//go:build go1.23 + +package system + +import ( + "syscall" +) + +// ClearRlimitNofileCache clears go runtime's nofile rlimit cache. The argument +// is process RLIMIT_NOFILE values. Relies on go.dev/cl/588076. +func ClearRlimitNofileCache(lim *syscall.Rlimit) { + // Ignore the return values since we only need to clean the cache, + // the limit is going to be set via unix.Prlimit elsewhere. + _ = syscall.Setrlimit(syscall.RLIMIT_NOFILE, lim) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go new file mode 100644 index 0000000000..865d180221 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go @@ -0,0 +1,27 @@ +//go:build !go1.23 + +// TODO: remove this file once go 1.22 is no longer supported. + +package system + +import ( + "sync/atomic" + "syscall" + _ "unsafe" // Needed for go:linkname to work. +) + +//go:linkname syscallOrigRlimitNofile syscall.origRlimitNofile +var syscallOrigRlimitNofile atomic.Pointer[syscall.Rlimit] + +// ClearRlimitNofileCache clears go runtime's nofile rlimit cache. +// The argument is process RLIMIT_NOFILE values. 
+func ClearRlimitNofileCache(_ *syscall.Rlimit) { + // As reported in issue #4195, the new version of go runtime(since 1.19) + // will cache rlimit-nofile. Before executing execve, the rlimit-nofile + // of the process will be restored with the cache. In runc, this will + // cause the rlimit-nofile setting by the parent process for the container + // to become invalid. It can be solved by clearing this cache. But + // unfortunately, go stdlib doesn't provide such function, so we need to + // link to the private var `origRlimitNofile` in package syscall to hack. + syscallOrigRlimitNofile.Store(nil) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go index 6bf9102f41..1f3439b78f 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go @@ -3,6 +3,7 @@ package utils import ( + "errors" "fmt" "math" "os" @@ -13,6 +14,8 @@ import ( "sync" _ "unsafe" // for go:linkname + "github.com/opencontainers/runc/libcontainer/system" + securejoin "github.com/cyphar/filepath-securejoin" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" @@ -275,3 +278,112 @@ func IsLexicallyInRoot(root, path string) bool { } return strings.HasPrefix(path, root) } + +// MkdirAllInRootOpen attempts to make +// +// path, _ := securejoin.SecureJoin(root, unsafePath) +// os.MkdirAll(path, mode) +// os.Open(path) +// +// safer against attacks where components in the path are changed between +// SecureJoin returning and MkdirAll (or Open) being called. In particular, we +// try to detect any symlink components in the path while we are doing the +// MkdirAll. +// +// NOTE: Unlike os.MkdirAll, mode is not Go's os.FileMode, it is the unix mode +// (the suid/sgid/sticky bits are not the same as for os.FileMode). 
+// +// NOTE: If unsafePath is a subpath of root, we assume that you have already +// called SecureJoin and so we use the provided path verbatim without resolving +// any symlinks (this is done in a way that avoids symlink-exchange races). +// This means that the path also must not contain ".." elements, otherwise an +// error will occur. +// +// This is a somewhat less safe alternative to +// <https://github.com/cyphar/filepath-securejoin/pull/13>, but it should +// detect attempts to trick us into creating directories outside of the root. +// We should migrate to securejoin.MkdirAll once it is merged. +func MkdirAllInRootOpen(root, unsafePath string, mode uint32) (_ *os.File, Err error) { + // If the path is already "within" the root, use it verbatim. + fullPath := unsafePath + if !IsLexicallyInRoot(root, unsafePath) { + var err error + fullPath, err = securejoin.SecureJoin(root, unsafePath) + if err != nil { + return nil, err + } + } + subPath, err := filepath.Rel(root, fullPath) + if err != nil { + return nil, err + } + + // Check for any silly mode bits. + if mode&^0o7777 != 0 { + return nil, fmt.Errorf("tried to include non-mode bits in MkdirAll mode: 0o%.3o", mode) + } + + currentDir, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC, 0) + if err != nil { + return nil, fmt.Errorf("open root handle: %w", err) + } + defer func() { + if Err != nil { + currentDir.Close() + } + }() + + for _, part := range strings.Split(subPath, string(filepath.Separator)) { + switch part { + case "", ".": + // Skip over no-op components. + continue + case "..": + return nil, fmt.Errorf("possible breakout detected: found %q component in SecureJoin subpath %s", part, subPath) + } + + nextDir, err := system.Openat(currentDir, part, unix.O_DIRECTORY|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + switch { + case err == nil: + // Update the currentDir. + _ = currentDir.Close() + currentDir = nextDir + + case errors.Is(err, unix.ENOTDIR): + // This might be a symlink or some other random file. Either way, + // error out. 
+ return nil, fmt.Errorf("cannot mkdir in %s/%s: %w", currentDir.Name(), part, unix.ENOTDIR) + + case errors.Is(err, os.ErrNotExist): + // Luckily, mkdirat will not follow trailing symlinks, so this is + // safe to do as-is. + if err := system.Mkdirat(currentDir, part, mode); err != nil { + return nil, err + } + // Open the new directory. There is a race here where an attacker + // could swap the directory with a different directory, but + // MkdirAll's fuzzy semantics mean we don't care about that. + nextDir, err := system.Openat(currentDir, part, unix.O_DIRECTORY|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + return nil, fmt.Errorf("open newly created directory: %w", err) + } + // Update the currentDir. + _ = currentDir.Close() + currentDir = nextDir + + default: + return nil, err + } + } + return currentDir, nil +} + +// MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the +// returned handle, for callers that don't need to use it. +func MkdirAllInRoot(root, unsafePath string, mode uint32) error { + f, err := MkdirAllInRootOpen(root, unsafePath, mode) + if err == nil { + _ = f.Close() + } + return err +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 35dc69b680..905d73c6e1 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -886,8 +886,8 @@ github.com/opencontainers/go-digest ## explicit; go 1.18 github.com/opencontainers/image-spec/specs-go github.com/opencontainers/image-spec/specs-go/v1 -# github.com/opencontainers/runc v1.2.0-rc.2.0.20240801140032-ad5b481dace5 -## explicit; go 1.21 +# github.com/opencontainers/runc v1.2.0-rc.3 +## explicit; go 1.22 github.com/opencontainers/runc/libcontainer/apparmor github.com/opencontainers/runc/libcontainer/cgroups github.com/opencontainers/runc/libcontainer/cgroups/fs @@ -895,6 +895,7 @@ github.com/opencontainers/runc/libcontainer/cgroups/fs2 github.com/opencontainers/runc/libcontainer/cgroups/fscommon github.com/opencontainers/runc/libcontainer/configs 
github.com/opencontainers/runc/libcontainer/devices +github.com/opencontainers/runc/libcontainer/system github.com/opencontainers/runc/libcontainer/user github.com/opencontainers/runc/libcontainer/userns github.com/opencontainers/runc/libcontainer/utils