Bump to runc main

By using main instead of the v1.1 branch, we drop an unnecessary dependency on cilium/ebpf, saving ~1mb of binary size. Signed-off-by: Matt Heon <mheon@redhat.com>
2025-10-27 11:15:38 +08:00 · 2024-02-02 10:06:20 -05:00
parent 6a727fdbee
commit 91b8f208a2
172 changed files with 4363 additions and 22273 deletions
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
@ -5,10 +5,16 @@ package utils

 import (
 	"fmt"
+	"math"
 	"os"
+	"path/filepath"
+	"runtime"
 	"strconv"
+	"sync"
 	_ "unsafe" // for go:linkname

+	securejoin "github.com/cyphar/filepath-securejoin"
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )

@ -24,12 +30,39 @@ func EnsureProcHandle(fh *os.File) error {
 	return nil
 }

+var (
+	haveCloseRangeCloexecBool bool
+	haveCloseRangeCloexecOnce sync.Once
+)
+
+func haveCloseRangeCloexec() bool {
+	haveCloseRangeCloexecOnce.Do(func() {
+		// Make sure we're not closing a random file descriptor.
+		tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
+		if err != nil {
+			return
+		}
+		defer unix.Close(tmpFd)
+
+		err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
+		// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
+		// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
+		// other potential error would imply that even the most basic close
+		// operation wouldn't work.
+		haveCloseRangeCloexecBool = err == nil
+	})
+	return haveCloseRangeCloexecBool
+}
+
 type fdFunc func(fd int)

 // fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
 // the current process.
 func fdRangeFrom(minFd int, fn fdFunc) error {
-	fdDir, err := os.Open("/proc/self/fd")
+	procSelfFd, closer := ProcThreadSelf("fd")
+	defer closer()
+
+	fdDir, err := os.Open(procSelfFd)
 	if err != nil {
 		return err
 	}
@ -67,6 +100,12 @@ func fdRangeFrom(minFd int, fn fdFunc) error {
 // CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
 // equal to minFd in the current process.
 func CloseExecFrom(minFd int) error {
+	// Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
+	if haveCloseRangeCloexec() {
+		err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
+		return os.NewSyscallError("close_range", err)
+	}
+	// Otherwise, fall back to the standard loop.
 	return fdRangeFrom(minFd, unix.CloseOnExec)
 }

@ -89,7 +128,8 @@ func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
 // *os.File operations would apply to the wrong file). This function is only
 // intended to be called from the last stage of runc init.
 func UnsafeCloseFrom(minFd int) error {
-	// We must not close some file descriptors.
+	// We cannot use close_range(2) even if it is available, because we must
+	// not close some file descriptors.
 	return fdRangeFrom(minFd, func(fd int) {
 		if runtime_IsPollDescriptor(uintptr(fd)) {
 			// These are the Go runtimes internal netpoll file descriptors.
@ -107,11 +147,117 @@ func UnsafeCloseFrom(minFd int) error {
 	})
 }

-// NewSockPair returns a new unix socket pair
-func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
+// NewSockPair returns a new SOCK_STREAM unix socket pair.
+func NewSockPair(name string) (parent, child *os.File, err error) {
 	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
 	if err != nil {
 		return nil, nil, err
 	}
 	return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
 }
+
+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
+// corresponding to the unsafePath resolved within the root. Before passing the
+// fd, this path is verified to have been inside the root -- so operating on it
+// through the passed fdpath should be safe. Do not access this path through
+// the original path strings, and do not attempt to use the pathname outside of
+// the passed closure (the file handle will be freed once the closure returns).
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
+	// Remove the root then forcefully resolve inside the root.
+	unsafePath = stripRoot(root, unsafePath)
+	path, err := securejoin.SecureJoin(root, unsafePath)
+	if err != nil {
+		return fmt.Errorf("resolving path inside rootfs failed: %w", err)
+	}
+
+	procSelfFd, closer := ProcThreadSelf("fd/")
+	defer closer()
+
+	// Open the target path.
+	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("open o_path procfd: %w", err)
+	}
+	defer fh.Close()
+
+	procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
+	// Double-check the path is the one we expected.
+	if realpath, err := os.Readlink(procfd); err != nil {
+		return fmt.Errorf("procfd verification failed: %w", err)
+	} else if realpath != path {
+		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
+	}
+
+	return fn(procfd)
+}
+
+type ProcThreadSelfCloser func()
+
+var (
+	haveProcThreadSelf     bool
+	haveProcThreadSelfOnce sync.Once
+)
+
+// ProcThreadSelf returns a string that is equivalent to
+// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
+// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
+// meaning that the passed string needs to be trusted. The caller _must_ call
+// the returned procThreadSelfCloser function (which is runtime.UnlockOSThread)
+// *only once* after it has finished using the returned path string.
+func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
+	haveProcThreadSelfOnce.Do(func() {
+		if _, err := os.Stat("/proc/thread-self/"); err == nil {
+			haveProcThreadSelf = true
+		} else {
+			logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
+		}
+	})
+
+	// We need to lock our thread until the caller is done with the path string
+	// because any non-atomic operation on the path (such as opening a file,
+	// then reading it) could be interrupted by the Go runtime where the
+	// underlying thread is swapped out and the original thread is killed,
+	// resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
+	// addition, the pre-3.17 fallback makes everything non-atomic because the
+	// same thing could happen between unix.Gettid() and the path operations.
+	//
+	// In theory, we don't need to lock in the atomic user case when using
+	// /proc/thread-self/, but it's better to be safe than sorry (and there are
+	// only one or two truly atomic users of /proc/thread-self/).
+	runtime.LockOSThread()
+
+	threadSelf := "/proc/thread-self/"
+	if !haveProcThreadSelf {
+		// Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
+		threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
+		if _, err := os.Stat(threadSelf); err != nil {
+			// Unfortunately, this code is called from rootfs_linux.go where we
+			// are running inside the pid namespace of the container but /proc
+			// is the host's procfs. Unfortunately there is no real way to get
+			// the correct tid to use here (the kernel age means we cannot do
+			// things like set up a private fsopen("proc") -- even scanning
+			// NSpid in all of the tasks in /proc/self/task/*/status requires
+			// Linux 4.1).
+			//
+			// So, we just have to assume that /proc/self is acceptable in this
+			// one specific case.
+			if os.Getpid() == 1 {
+				logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
+			} else {
+				// This should never happen, but the fallback should work in most cases...
+				logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
+			}
+			threadSelf = "/proc/self/"
+		}
+	}
+	return threadSelf + subpath, runtime.UnlockOSThread
+}
+
+// ProcThreadSelfFd is small wrapper around ProcThreadSelf to make it easier to
+// create a /proc/thread-self handle for given file descriptor.
+//
+// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
+// without using fmt.Sprintf to avoid unneeded overhead.
+func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
+	return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
+}