vendor: update github.com/opencontainers/runc to main

It is not yet possible to drop the override, since the latest released
version has an indirect dependency on github.com/cilium/ebpf that
causes the binary to grow by ~1MB.  Once there is a new runc release,
we can drop the override.

For now, just bump to the latest version on main, since we are using a
year-old version at this point.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
This commit is contained in:
Giuseppe Scrivano
2023-09-04 15:58:23 +02:00
parent 9bcab496e7
commit 9730c03735
32 changed files with 524 additions and 228 deletions

View File

@ -8,9 +8,9 @@ The following is courtesy of our legal counsel:
Use and transfer of Docker may be subject to certain restrictions by the
United States and other governments.
United States and other governments.
It is your responsibility to ensure that your use and/or transfer does not
violate applicable laws.
violate applicable laws.
For more information, please see http://www.bis.doc.gov

View File

@ -89,7 +89,7 @@ func prepareOpenat2() error {
})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare
if err != unix.ENOSYS {
logrus.Warnf("falling back to securejoin: %s", prepErr)
} else {
logrus.Debug("openat2 not available, falling back to securejoin")

View File

@ -94,6 +94,14 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
}
}
}
if r.CPUIdle != nil {
idle := strconv.FormatInt(*r.CPUIdle, 10)
if err := cgroups.WriteFile(path, "cpu.idle", idle); err != nil {
return err
}
}
return s.SetRtSched(path, r)
}

View File

@ -195,7 +195,7 @@ func cpusetEnsureParent(current string) error {
}
// Treat non-existing directory as cgroupfs as it will be created,
// and the root cpuset directory obviously exists.
if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare
if err != nil && err != unix.ENOENT {
return &os.PathError{Op: "statfs", Path: parent, Err: err}
}

View File

@ -28,6 +28,7 @@ var subsystems = []subsystem{
&FreezerGroup{},
&RdmaGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
&NameGroup{GroupName: "misc", Join: true},
}
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
@ -53,13 +54,13 @@ type subsystem interface {
Set(path string, r *configs.Resources) error
}
type manager struct {
type Manager struct {
mu sync.Mutex
cgroups *configs.Cgroup
paths map[string]string
}
func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
func NewManager(cg *configs.Cgroup, paths map[string]string) (*Manager, error) {
// Some v1 controllers (cpu, cpuset, and devices) expect
// cgroups.Resources to not be nil in Apply.
if cg.Resources == nil {
@ -77,7 +78,7 @@ func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, e
}
}
return &manager{
return &Manager{
cgroups: cg,
paths: paths,
}, nil
@ -104,7 +105,7 @@ func isIgnorableError(rootless bool, err error) bool {
return false
}
func (m *manager) Apply(pid int) (err error) {
func (m *Manager) Apply(pid int) (err error) {
m.mu.Lock()
defer m.mu.Unlock()
@ -138,19 +139,19 @@ func (m *manager) Apply(pid int) (err error) {
return nil
}
func (m *manager) Destroy() error {
func (m *Manager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
return cgroups.RemovePaths(m.paths)
}
func (m *manager) Path(subsys string) string {
func (m *Manager) Path(subsys string) string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths[subsys]
}
func (m *manager) GetStats() (*cgroups.Stats, error) {
func (m *Manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
@ -166,7 +167,7 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
return stats, nil
}
func (m *manager) Set(r *configs.Resources) error {
func (m *Manager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
@ -201,7 +202,7 @@ func (m *manager) Set(r *configs.Resources) error {
// Freeze toggles the container's freezer cgroup depending on the state
// provided
func (m *manager) Freeze(state configs.FreezerState) error {
func (m *Manager) Freeze(state configs.FreezerState) error {
path := m.Path("freezer")
if path == "" {
return errors.New("cannot toggle freezer: cgroups not configured for container")
@ -217,25 +218,25 @@ func (m *manager) Freeze(state configs.FreezerState) error {
return nil
}
func (m *manager) GetPids() ([]int, error) {
func (m *Manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.Path("devices"))
}
func (m *manager) GetAllPids() ([]int, error) {
func (m *Manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.Path("devices"))
}
func (m *manager) GetPaths() map[string]string {
func (m *Manager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths
}
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
return m.cgroups, nil
}
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
func (m *Manager) GetFreezerState() (configs.FreezerState, error) {
dir := m.Path("freezer")
// If the container doesn't have the freezer cgroup, say it's undefined.
if dir == "" {
@ -245,7 +246,7 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) {
return freezer.GetState(dir)
}
func (m *manager) Exists() bool {
func (m *Manager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
@ -253,7 +254,7 @@ func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
func (m *Manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.Path("memory"))
// Ignore ENOENT when rootless as it couldn't create cgroup.
if err != nil && m.cgroups.Rootless && os.IsNotExist(err) {

View File

@ -83,6 +83,7 @@ func tryDefaultCgroupRoot() string {
if err != nil {
return ""
}
defer dir.Close()
names, err := dir.Readdirnames(1)
if err != nil {
return ""
@ -164,9 +165,8 @@ func subsysPath(root, inner, subsystem string) (string, error) {
return filepath.Join(root, filepath.Base(mnt), inner), nil
}
// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
// process could in container and shared pid namespace with host, and
// /proc/1/cgroup could point to whole other world of cgroups.
// Use GetOwnCgroupPath for dind-like cases, when cgroupns is not
// available. This is ugly.
parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
if err != nil {
return "", err

View File

@ -11,7 +11,7 @@ import (
)
func isCpuSet(r *configs.Resources) bool {
return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0
return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 || r.CPUIdle != nil
}
func setCpu(dirPath string, r *configs.Resources) error {
@ -19,6 +19,12 @@ func setCpu(dirPath string, r *configs.Resources) error {
return nil
}
if r.CPUIdle != nil {
if err := cgroups.WriteFile(dirPath, "cpu.idle", strconv.FormatInt(*r.CPUIdle, 10)); err != nil {
return err
}
}
// NOTE: .CpuShares is not used here. Conversion is the caller's responsibility.
if r.CpuWeight != 0 {
if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil {

View File

@ -13,7 +13,7 @@ import (
type parseError = fscommon.ParseError
type manager struct {
type Manager struct {
config *configs.Cgroup
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
dirPath string
@ -25,7 +25,7 @@ type manager struct {
// NewManager creates a manager for cgroup v2 unified hierarchy.
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope".
// If dirPath is empty, it is automatically set using config.
func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error) {
func NewManager(config *configs.Cgroup, dirPath string) (*Manager, error) {
if dirPath == "" {
var err error
dirPath, err = defaultDirPath(config)
@ -34,14 +34,14 @@ func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error)
}
}
m := &manager{
m := &Manager{
config: config,
dirPath: dirPath,
}
return m, nil
}
func (m *manager) getControllers() error {
func (m *Manager) getControllers() error {
if m.controllers != nil {
return nil
}
@ -62,7 +62,7 @@ func (m *manager) getControllers() error {
return nil
}
func (m *manager) Apply(pid int) error {
func (m *Manager) Apply(pid int) error {
if err := CreateCgroupPath(m.dirPath, m.config); err != nil {
// Related tests:
// - "runc create (no limits + no cgrouppath + no permission) succeeds"
@ -84,15 +84,15 @@ func (m *manager) Apply(pid int) error {
return nil
}
func (m *manager) GetPids() ([]int, error) {
func (m *Manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.dirPath)
}
func (m *manager) GetAllPids() ([]int, error) {
func (m *Manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.dirPath)
}
func (m *manager) GetStats() (*cgroups.Stats, error) {
func (m *Manager) GetStats() (*cgroups.Stats, error) {
var errs []error
st := cgroups.NewStats()
@ -114,6 +114,17 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// PSI (since kernel 4.20).
var err error
if st.CpuStats.PSI, err = statPSI(m.dirPath, "cpu.pressure"); err != nil {
errs = append(errs, err)
}
if st.MemoryStats.PSI, err = statPSI(m.dirPath, "memory.pressure"); err != nil {
errs = append(errs, err)
}
if st.BlkioStats.PSI, err = statPSI(m.dirPath, "io.pressure"); err != nil {
errs = append(errs, err)
}
// hugetlb (since kernel 5.6)
if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
@ -128,7 +139,7 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
return st, nil
}
func (m *manager) Freeze(state configs.FreezerState) error {
func (m *Manager) Freeze(state configs.FreezerState) error {
if m.config.Resources == nil {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
@ -139,15 +150,15 @@ func (m *manager) Freeze(state configs.FreezerState) error {
return nil
}
func (m *manager) Destroy() error {
func (m *Manager) Destroy() error {
return cgroups.RemovePath(m.dirPath)
}
func (m *manager) Path(_ string) string {
func (m *Manager) Path(_ string) string {
return m.dirPath
}
func (m *manager) Set(r *configs.Resources) error {
func (m *Manager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
@ -213,7 +224,7 @@ func setDevices(dirPath string, r *configs.Resources) error {
return cgroups.DevicesSetV2(dirPath, r)
}
func (m *manager) setUnified(res map[string]string) error {
func (m *Manager) setUnified(res map[string]string) error {
for k, v := range res {
if strings.Contains(k, "/") {
return fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
@ -239,21 +250,21 @@ func (m *manager) setUnified(res map[string]string) error {
return nil
}
func (m *manager) GetPaths() map[string]string {
func (m *Manager) GetPaths() map[string]string {
paths := make(map[string]string, 1)
paths[""] = m.dirPath
return paths
}
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
return m.config, nil
}
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
func (m *Manager) GetFreezerState() (configs.FreezerState, error) {
return getFreezer(m.dirPath)
}
func (m *manager) Exists() bool {
func (m *Manager) Exists() bool {
return cgroups.PathExists(m.dirPath)
}
@ -261,7 +272,7 @@ func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.events", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
func (m *Manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.dirPath)
if err != nil && m.config.Rootless && os.IsNotExist(err) {
err = nil
@ -269,3 +280,35 @@ func (m *manager) OOMKillCount() (uint64, error) {
return c, err
}
// CheckMemoryUsage rejects a memory or memory+swap limit that is not
// strictly greater than the cgroup's current usage, as read from
// "memory.current" under dirPath.
//
// The check only runs when r.MemoryCheckBeforeUpdate is set and at least one
// of r.Memory or r.MemorySwap is positive. It is best-effort: if the current
// usage cannot be read, the limits are accepted and nil is returned.
func CheckMemoryUsage(dirPath string, r *configs.Resources) error {
	// Skip when the check is disabled or no positive limit is being set.
	if !r.MemoryCheckBeforeUpdate || (r.Memory <= 0 && r.MemorySwap <= 0) {
		return nil
	}

	current, err := fscommon.GetCgroupParamUint(dirPath, "memory.current")
	if err != nil {
		// This check is on best-effort basis, so if we can't read the
		// current usage (cgroup not yet created, or any other error),
		// we should not fail.
		return nil
	}

	// The memory+swap limit is validated first, then the memory limit.
	if r.MemorySwap > 0 && uint64(r.MemorySwap) <= current {
		return fmt.Errorf("rejecting memory+swap limit %d <= usage %d", r.MemorySwap, current)
	}
	if r.Memory > 0 && uint64(r.Memory) <= current {
		return fmt.Errorf("rejecting memory limit %d <= usage %d", r.Memory, current)
	}

	return nil
}

View File

@ -40,6 +40,11 @@ func setMemory(dirPath string, r *configs.Resources) error {
if !isMemorySet(r) {
return nil
}
if err := CheckMemoryUsage(dirPath, r); err != nil {
return err
}
swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
if err != nil {
return err
@ -101,8 +106,9 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
if err != nil {
if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint {
// The root cgroup does not have memory.{current,max}
// so emulate those using data from /proc/meminfo.
return statsFromMeminfo(stats)
// so emulate those using data from /proc/meminfo and
// /sys/fs/cgroup/memory.stat
return rootStatsFromMeminfo(stats)
}
return err
}
@ -154,7 +160,7 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
return memoryData, nil
}
func statsFromMeminfo(stats *cgroups.Stats) error {
func rootStatsFromMeminfo(stats *cgroups.Stats) error {
const file = "/proc/meminfo"
f, err := os.Open(file)
if err != nil {
@ -166,14 +172,10 @@ func statsFromMeminfo(stats *cgroups.Stats) error {
var (
swap_free uint64
swap_total uint64
main_total uint64
main_free uint64
)
mem := map[string]*uint64{
"SwapFree": &swap_free,
"SwapTotal": &swap_total,
"MemTotal": &main_total,
"MemFree": &main_free,
}
found := 0
@ -206,11 +208,18 @@ func statsFromMeminfo(stats *cgroups.Stats) error {
return &parseError{Path: "", File: file, Err: err}
}
// cgroup v1 `usage_in_bytes` reports memory usage as the sum of
// - rss (NR_ANON_MAPPED)
// - cache (NR_FILE_PAGES)
// cgroup v1 reports SwapUsage values as mem+swap combined
// cgroup v2 reports rss and cache as anon and file.
// sum `anon` + `file` to report the same value as `usage_in_bytes` in v1.
// sum swap usage as combined mem+swap usage for consistency as well.
stats.MemoryStats.Usage.Usage = stats.MemoryStats.Stats["anon"] + stats.MemoryStats.Stats["file"]
stats.MemoryStats.Usage.Limit = math.MaxUint64
stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024
stats.MemoryStats.SwapUsage.Limit = math.MaxUint64
stats.MemoryStats.Usage.Usage = (main_total - main_free) * 1024
stats.MemoryStats.Usage.Limit = math.MaxUint64
stats.MemoryStats.SwapUsage.Usage += stats.MemoryStats.Usage.Usage
return nil
}

View File

@ -0,0 +1,89 @@
package fs2
import (
"bufio"
"errors"
"fmt"
"os"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
// statPSI reads the pressure file named file (for example "cpu.pressure")
// from the cgroup directory dirPath and returns the values found on its
// "some" and "full" lines.
//
// It returns (nil, nil) when the file does not exist or when reading it
// fails with ENOTSUP, so that callers can treat unavailable PSI data as
// "no stats" rather than as an error. Any other open failure is returned
// as-is; read failures and malformed "some"/"full" lines are returned as a
// parseError. Lines starting with any other keyword, and blank lines, are
// ignored.
func statPSI(dirPath string, file string) (*cgroups.PSIStats, error) {
	f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
	if err != nil {
		if errors.Is(err, os.ErrNotExist) {
			// Kernel < 4.20, or CONFIG_PSI is not set,
			// or PSI stats are turned off for the cgroup
			// ("echo 0 > cgroup.pressure", kernel >= 6.1).
			return nil, nil
		}
		return nil, err
	}
	defer f.Close()

	var psistats cgroups.PSIStats
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		parts := strings.Fields(sc.Text())
		if len(parts) == 0 {
			// Blank or whitespace-only line: there is nothing to
			// parse, and indexing parts[0] below would panic.
			continue
		}
		var pv *cgroups.PSIData
		switch parts[0] {
		case "some":
			pv = &psistats.Some
		case "full":
			pv = &psistats.Full
		}
		if pv != nil {
			*pv, err = parsePSIData(parts[1:])
			if err != nil {
				return nil, &parseError{Path: dirPath, File: file, Err: err}
			}
		}
	}
	if err := sc.Err(); err != nil {
		if errors.Is(err, unix.ENOTSUP) {
			// Some kernels (e.g. CS9) may return ENOTSUP on read
			// if psi=1 kernel cmdline parameter is required.
			return nil, nil
		}
		return nil, &parseError{Path: dirPath, File: file, Err: err}
	}
	return &psistats, nil
}
// parsePSIData converts the "key=value" fields of one pressure-file line
// (everything after the leading "some"/"full" keyword) into a PSIData.
//
// The avg10, avg60 and avg300 keys are parsed as floats and total as an
// unsigned integer; unknown keys are ignored. A field without "=" or with an
// unparsable value yields an error, together with whatever data had been
// filled in up to that point.
func parsePSIData(psi []string) (cgroups.PSIData, error) {
	var data cgroups.PSIData
	for _, field := range psi {
		pair := strings.SplitN(field, "=", 2)
		if len(pair) != 2 {
			return data, fmt.Errorf("invalid psi data: %q", field)
		}
		key, val := pair[0], pair[1]

		// Pick the float destination for the averages; "total" is an
		// integer and is handled on its own.
		var dst *float64
		switch key {
		case "avg10":
			dst = &data.Avg10
		case "avg60":
			dst = &data.Avg60
		case "avg300":
			dst = &data.Avg300
		case "total":
			total, err := strconv.ParseUint(val, 10, 64)
			if err != nil {
				return data, fmt.Errorf("invalid %s PSI value: %w", key, err)
			}
			data.Total = total
			continue
		default:
			// Unknown key: skip it.
			continue
		}

		avg, err := strconv.ParseFloat(val, 64)
		if err != nil {
			return data, fmt.Errorf("invalid %s PSI value: %w", key, err)
		}
		*dst = avg
	}
	return data, nil
}

View File

@ -32,9 +32,22 @@ type CpuUsage struct {
UsageInUsermode uint64 `json:"usage_in_usermode"`
}
// PSIData holds the values of one line of a cgroup pressure file. The field
// names mirror the avg10, avg60, avg300 and total keys found on that line.
// NOTE(review): units are not visible here; presumably the averages are
// percentages and Total is a cumulative time counter, as in the kernel PSI
// documentation — confirm.
type PSIData struct {
Avg10 float64 `json:"avg10"`
Avg60 float64 `json:"avg60"`
Avg300 float64 `json:"avg300"`
Total uint64 `json:"total"`
}
// PSIStats groups the "some" and "full" lines of a cgroup pressure file.
// NOTE(review): encoding/json's omitempty does not omit struct values, so
// the omitempty options below appear to have no effect — confirm before
// relying on these fields being dropped from the JSON output.
type PSIStats struct {
Some PSIData `json:"some,omitempty"`
Full PSIData `json:"full,omitempty"`
}
type CpuStats struct {
CpuUsage CpuUsage `json:"cpu_usage,omitempty"`
ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type CPUSetStats struct {
@ -89,6 +102,7 @@ type MemoryStats struct {
UseHierarchy bool `json:"use_hierarchy"`
Stats map[string]uint64 `json:"stats,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type PageUsageByNUMA struct {
@ -133,6 +147,7 @@ type BlkioStats struct {
IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"`
SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type HugetlbStats struct {

View File

@ -36,13 +36,13 @@ func IsCgroup2UnifiedMode() bool {
var st unix.Statfs_t
err := unix.Statfs(unifiedMountpoint, &st)
if err != nil {
level := logrus.WarnLevel
if os.IsNotExist(err) && userns.RunningInUserNS() {
// ignore the "not found" error if running in userns
logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
isUnified = false
return
// For rootless containers, sweep it under the rug.
level = logrus.DebugLevel
}
panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
logrus.StandardLogger().Logf(level,
"statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err)
}
isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
})
@ -162,8 +162,10 @@ func readProcsFile(dir string) ([]int, error) {
// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
// or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
// "cpu": "/user.slice/user-1000.slice"
// "pids": "/user.slice/user-1000.slice"
//
// "cpu": "/user.slice/user-1000.slice"
// "pids": "/user.slice/user-1000.slice"
//
// etc.
//
// Note that for cgroup v2 unified hierarchy, there are no per-controller
@ -215,20 +217,9 @@ func PathExists(path string) bool {
return true
}
func EnterPid(cgroupPaths map[string]string, pid int) error {
for _, path := range cgroupPaths {
if PathExists(path) {
if err := WriteCgroupProc(path, pid); err != nil {
return err
}
}
}
return nil
}
func rmdir(path string) error {
err := unix.Rmdir(path)
if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare
if err == nil || err == unix.ENOENT {
return nil
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}

View File

@ -236,27 +236,6 @@ func GetOwnCgroupPath(subsystem string) (string, error) {
return getCgroupPathHelper(subsystem, cgroup)
}
func GetInitCgroup(subsystem string) (string, error) {
if IsCgroup2UnifiedMode() {
return "", errUnified
}
cgroups, err := ParseCgroupFile("/proc/1/cgroup")
if err != nil {
return "", err
}
return getControllerPath(subsystem, cgroups)
}
func GetInitCgroupPath(subsystem string) (string, error) {
cgroup, err := GetInitCgroup(subsystem)
if err != nil {
return "", err
}
return getCgroupPathHelper(subsystem, cgroup)
}
func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
if err != nil {

View File

@ -84,6 +84,9 @@ type Resources struct {
// MEM to use
CpusetMems string `json:"cpuset_mems"`
// cgroup SCHED_IDLE
CPUIdle *int64 `json:"cpu_idle,omitempty"`
// Process limit; set <= `0' to disable limit.
PidsLimit int64 `json:"pids_limit"`
@ -155,4 +158,9 @@ type Resources struct {
// during Set() to figure out whether the freeze is required. Those
// methods may be relatively slow, thus this flag.
SkipFreezeOnSet bool `json:"-"`
// MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check
// if the new memory limits (Memory and MemorySwap) being set are lower
// than the current memory usage, and reject if so.
MemoryCheckBeforeUpdate bool `json:"memory_check_before_update"`
}

View File

@ -31,12 +31,13 @@ type IDMap struct {
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
type Seccomp struct {
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Syscalls []*Syscall `json:"syscalls"`
DefaultErrnoRet *uint `json:"default_errno_ret"`
ListenerPath string `json:"listener_path,omitempty"`
ListenerMetadata string `json:"listener_metadata,omitempty"`
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Flags []specs.LinuxSeccompFlag `json:"flags"`
Syscalls []*Syscall `json:"syscalls"`
DefaultErrnoRet *uint `json:"default_errno_ret"`
ListenerPath string `json:"listener_path,omitempty"`
ListenerMetadata string `json:"listener_metadata,omitempty"`
}
// Action is taken upon rule match in Seccomp
@ -118,6 +119,9 @@ type Config struct {
// Hostname optionally sets the container's hostname if provided
Hostname string `json:"hostname"`
// Domainname optionally sets the container's domainname if provided
Domainname string `json:"domainname"`
// Namespaces specifies the container's namespaces that it should setup when cloning the init process
// If a namespace is not provided that namespace is shared from the container's parent process
Namespaces Namespaces `json:"namespaces"`
@ -155,11 +159,11 @@ type Config struct {
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj *int `json:"oom_score_adj,omitempty"`
// UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"`
// UIDMappings is an array of User ID mappings for User Namespaces
UIDMappings []IDMap `json:"uid_mappings"`
// GidMappings is an array of Group ID mappings for User Namespaces
GidMappings []IDMap `json:"gid_mappings"`
// GIDMappings is an array of Group ID mappings for User Namespaces
GIDMappings []IDMap `json:"gid_mappings"`
// MaskPaths specifies paths within the container's rootfs to mask over with a bind
// mount pointing to /dev/null as to prevent reads of the file.
@ -208,6 +212,13 @@ type Config struct {
// RootlessCgroups is set when unlikely to have the full access to cgroups.
// When RootlessCgroups is set, cgroups errors are ignored.
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
// Do not try to remount a bind mount again after the first attempt failed on source
// filesystems that have nodev, noexec, nosuid, noatime, relatime, strictatime, nodiratime set
NoMountFallback bool `json:"no_mount_fallback,omitempty"`
// TimeOffsets specifies the offset for supporting time namespaces.
TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"`
}
type (
@ -274,6 +285,7 @@ type Capabilities struct {
Ambient []string
}
// Deprecated: use (Hooks).Run instead.
func (hooks HookList) RunHooks(state *specs.State) error {
for i, h := range hooks {
if err := h.Run(state); err != nil {
@ -330,6 +342,18 @@ func (hooks *Hooks) MarshalJSON() ([]byte, error) {
})
}
// Run executes every hook registered under name, in list order, passing
// state to each one. It stops at the first hook that fails and returns that
// error wrapped with the hook name and the hook's index in the list.
func (hooks Hooks) Run(name HookName, state *specs.State) error {
	for idx, hook := range hooks[name] {
		err := hook.Run(state)
		if err != nil {
			return fmt.Errorf("error running %s hook #%d: %w", name, idx, err)
		}
	}
	return nil
}
type Hook interface {
// Run executes the hook with the provided state.
Run(*specs.State) error
@ -390,7 +414,7 @@ func (c Command) Run(s *specs.State) error {
go func() {
err := cmd.Wait()
if err != nil {
err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
err = fmt.Errorf("%w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
}
errC <- err
}()

View File

@ -13,10 +13,10 @@ var (
// different when user namespaces are enabled.
func (c Config) HostUID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.UidMappings == nil {
if len(c.UIDMappings) == 0 {
return -1, errNoUIDMap
}
id, found := c.hostIDFromMapping(containerId, c.UidMappings)
id, found := c.hostIDFromMapping(containerId, c.UIDMappings)
if !found {
return -1, errNoUserMap
}
@ -36,10 +36,10 @@ func (c Config) HostRootUID() (int, error) {
// different when user namespaces are enabled.
func (c Config) HostGID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.GidMappings == nil {
if len(c.GIDMappings) == 0 {
return -1, errNoGIDMap
}
id, found := c.hostIDFromMapping(containerId, c.GidMappings)
id, found := c.hostIDFromMapping(containerId, c.GIDMappings)
if !found {
return -1, errNoGroupMap
}

View File

@ -1,42 +1,7 @@
package configs
import "golang.org/x/sys/unix"
const (
// EXT_COPYUP is a directive to copy up the contents of a directory when
// a tmpfs is mounted over it.
EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning
)
type Mount struct {
// Source path for the mount.
Source string `json:"source"`
// Destination path for the mount inside the container.
Destination string `json:"destination"`
// Device the mount is for.
Device string `json:"device"`
// Mount flags.
Flags int `json:"flags"`
// Propagation Flags
PropagationFlags []int `json:"propagation_flags"`
// Mount data applied to the mount.
Data string `json:"data"`
// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"`
// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
RecAttr *unix.MountAttr `json:"rec_attr"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
}
func (m *Mount) IsBind() bool {
return m.Flags&unix.MS_BIND != 0
}

View File

@ -0,0 +1,52 @@
package configs
import "golang.org/x/sys/unix"
// Mount describes a single mount to be set up for a container.
type Mount struct {
// Source path for the mount.
Source string `json:"source"`
// Destination path for the mount inside the container.
Destination string `json:"destination"`
// Device the mount is for.
Device string `json:"device"`
// Mount flags.
Flags int `json:"flags"`
// Propagation Flags
PropagationFlags []int `json:"propagation_flags"`
// Mount data applied to the mount.
Data string `json:"data"`
// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"`
// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
RecAttr *unix.MountAttr `json:"rec_attr"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
// UIDMappings is used to change the user ownership of files without
// calling chown. Note that the underlying filesystem must support this
// feature for it to be used.
// Every mount point can have its own mapping.
UIDMappings []IDMap `json:"uid_mappings,omitempty"`
// GIDMappings is used to change the group ownership of files without
// calling chown. Note that the underlying filesystem must support this
// feature for it to be used.
// Every mount point can have its own mapping.
GIDMappings []IDMap `json:"gid_mappings,omitempty"`
}
// IsBind reports whether the MS_BIND bit is set in the mount's Flags.
func (m *Mount) IsBind() bool {
	bindBit := m.Flags & unix.MS_BIND
	return bindBit != 0
}
// IsIDMapped reports whether the mount has at least one UID mapping or at
// least one GID mapping configured.
func (m *Mount) IsIDMapped() bool {
	hasUIDMap := len(m.UIDMappings) > 0
	hasGIDMap := len(m.GIDMappings) > 0
	return hasUIDMap || hasGIDMap
}

View File

@ -0,0 +1,10 @@
//go:build !linux
// +build !linux
package configs
// Mount is an empty placeholder type used on non-Linux builds (this file is
// compiled only when the build constraint !linux holds).
type Mount struct{}
// IsBind always reports false in this non-Linux stub.
func (m *Mount) IsBind() bool {
return false
}

View File

@ -14,6 +14,7 @@ const (
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
NEWCGROUP NamespaceType = "NEWCGROUP"
NEWTIME NamespaceType = "NEWTIME"
)
var (
@ -38,6 +39,8 @@ func NsName(ns NamespaceType) string {
return "uts"
case NEWCGROUP:
return "cgroup"
case NEWTIME:
return "time"
}
return ""
}
@ -72,6 +75,7 @@ func NamespaceTypes() []NamespaceType {
NEWPID,
NEWNS,
NEWCGROUP,
NEWTIME,
}
}

View File

@ -17,6 +17,7 @@ var namespaceInfo = map[NamespaceType]int{
NEWUTS: unix.CLONE_NEWUTS,
NEWPID: unix.CLONE_NEWPID,
NEWCGROUP: unix.CLONE_NEWCGROUP,
NEWTIME: unix.CLONE_NEWTIME,
}
// CloneFlags parses the container's Namespaces options to set the correct
@ -31,3 +32,15 @@ func (n *Namespaces) CloneFlags() uintptr {
}
return uintptr(flag)
}
// IsPrivate reports whether the namespace of type t is configured as private,
// meaning an entry of that type exists and has no Path set. The first entry
// of type t decides the result; if no entry matches, the namespace is
// implicitly shared with the parent and false is returned.
func (n Namespaces) IsPrivate(t NamespaceType) bool {
	for _, ns := range n {
		if ns.Type != t {
			continue
		}
		// An empty Path means the namespace is not shared.
		return ns.Path == ""
	}
	// Not found, so implicitly sharing a parent namespace.
	return false
}

View File

@ -201,7 +201,7 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
if err != nil {
// We should return no error if EOF is reached
// without a match.
if err == io.EOF { //nolint:errorlint // comparison with io.EOF is legit, https://github.com/polyfloyd/go-errorlint/pull/12
if err == io.EOF {
err = nil
}
return out, err
@ -280,13 +280,13 @@ func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath
// found in any entry in passwd and group respectively.
//
// Examples of valid user specifications are:
// * ""
// * "user"
// * "uid"
// * "user:group"
// * "uid:gid
// * "user:gid"
// * "uid:group"
// - ""
// - "user"
// - "uid"
// - "user:group"
// - "uid:gid
// - "user:gid"
// - "uid:group"
//
// It should be noted that if you specify a numeric user or group id, they will
// not be evaluated as usernames (only the metadata will be filled). So attempting

View File

@ -1,5 +1,4 @@
package userns
// RunningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
var RunningInUserNS = runningInUserNS

View File

@ -3,14 +3,7 @@
package userns
import (
"strings"
"github.com/opencontainers/runc/libcontainer/user"
)
func FuzzUIDMap(data []byte) int {
uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
_ = uidMapInUserNS(uidmap)
func FuzzUIDMap(uidmap []byte) int {
_ = uidMapInUserNS(string(uidmap))
return 1
}

View File

@ -1,9 +1,10 @@
package userns
import (
"bufio"
"fmt"
"os"
"sync"
"github.com/opencontainers/runc/libcontainer/user"
)
var (
@ -12,26 +13,43 @@ var (
)
// runningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
//
// Originally copied from https://github.com/lxc/incus/blob/e45085dd42f826b3c8c3228e9733c0b6f998eafe/shared/util.go#L678-L700.
func runningInUserNS() bool {
nsOnce.Do(func() {
uidmap, err := user.CurrentProcessUIDMap()
file, err := os.Open("/proc/self/uid_map")
if err != nil {
// This kernel-provided file only exists if user namespaces are supported
// This kernel-provided file only exists if user namespaces are supported.
return
}
inUserNS = uidMapInUserNS(uidmap)
defer file.Close()
buf := bufio.NewReader(file)
l, _, err := buf.ReadLine()
if err != nil {
return
}
inUserNS = uidMapInUserNS(string(l))
})
return inUserNS
}
func uidMapInUserNS(uidmap []user.IDMap) bool {
/*
* We assume we are in the initial user namespace if we have a full
* range - 4294967295 uids starting at uid 0.
*/
if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
func uidMapInUserNS(uidMap string) bool {
if uidMap == "" {
// File exists but is empty (the initial state when userns is created,
// see user_namespaces(7)).
return true
}
var a, b, c int64
if _, err := fmt.Sscanf(uidMap, "%d %d %d", &a, &b, &c); err != nil {
// Assume we are in a regular, non user namespace.
return false
}
return true
// As per user_namespaces(7), /proc/self/uid_map of
// the initial user namespace shows 0 0 4294967295.
initNS := a == 0 && b == 0 && c == 4294967295
return !initNS
}

View File

@ -3,8 +3,6 @@
package userns
import "github.com/opencontainers/runc/libcontainer/user"
// runningInUserNS is a stub for non-Linux systems
// Always returns false
func runningInUserNS() bool {
@ -13,6 +11,6 @@ func runningInUserNS() bool {
// uidMapInUserNS is a stub for non-Linux systems
// Always returns false
func uidMapInUserNS(uidmap []user.IDMap) bool {
func uidMapInUserNS(uidMap string) bool {
return false
}

View File

@ -19,13 +19,14 @@ package utils
import (
"fmt"
"os"
"runtime"
"golang.org/x/sys/unix"
)
// MaxSendfdLen is the maximum length of the name of a file descriptor being
// sent using SendFd. The name of the file handle returned by RecvFd will never
// be larger than this value.
// MaxNameLen is the maximum length of the name of a file descriptor being sent
// using SendFile. The name of the file handle returned by RecvFile will never be
// larger than this value.
const MaxNameLen = 4096
// oobSpace is the size of the oob slice required to store a single FD. Note
@ -33,26 +34,21 @@ const MaxNameLen = 4096
// so sizeof(fd) = 4.
var oobSpace = unix.CmsgSpace(4)
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
// RecvFile waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFd(socket *os.File) (*os.File, error) {
// For some reason, unix.Recvmsg uses the length rather than the capacity
// when passing the msg_controllen and other attributes to recvmsg. So we
// have to actually set the length.
func RecvFile(socket *os.File) (_ *os.File, Err error) {
name := make([]byte, MaxNameLen)
oob := make([]byte, oobSpace)
sockfd := socket.Fd()
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
if err != nil {
return nil, err
}
if n >= MaxNameLen || oobn != oobSpace {
return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
}
// Truncate.
name = name[:n]
oob = oob[:oobn]
@ -61,36 +57,63 @@ func RecvFd(socket *os.File) (*os.File, error) {
if err != nil {
return nil, err
}
// We cannot control how many SCM_RIGHTS we receive, and upon receiving
// them all of the descriptors are installed in our fd table, so we need to
// parse all of the SCM_RIGHTS we received in order to close all of the
// descriptors on error.
var fds []int
defer func() {
for i, fd := range fds {
if i == 0 && Err == nil {
// Only close the first one on error.
continue
}
// Always close extra ones.
_ = unix.Close(fd)
}
}()
var lastErr error
for _, scm := range scms {
if scm.Header.Type == unix.SCM_RIGHTS {
scmFds, err := unix.ParseUnixRights(&scm)
if err != nil {
lastErr = err
} else {
fds = append(fds, scmFds...)
}
}
}
if lastErr != nil {
return nil, lastErr
}
// We do this after collecting the fds to make sure we close them all when
// returning an error here.
if len(scms) != 1 {
return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
}
scm := scms[0]
fds, err := unix.ParseUnixRights(&scm)
if err != nil {
return nil, err
}
if len(fds) != 1 {
return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
}
fd := uintptr(fds[0])
return os.NewFile(fd, string(name)), nil
return os.NewFile(uintptr(fds[0]), string(name)), nil
}
// SendFd sends a file descriptor over the given AF_UNIX socket. In
// addition, the file.Name() of the given file will also be sent as
// non-auxiliary data in the same payload (allowing to send contextual
// information for a file descriptor).
func SendFd(socket *os.File, name string, fd uintptr) error {
// SendFile sends a file over the given AF_UNIX socket. file.Name() is also
// included so that if the other end uses RecvFile, the file will have the same
// name information.
func SendFile(socket *os.File, file *os.File) error {
name := file.Name()
if len(name) >= MaxNameLen {
return fmt.Errorf("sendfd: filename too long: %s", name)
}
return SendFds(socket, []byte(name), int(fd))
err := SendRawFd(socket, name, file.Fd())
runtime.KeepAlive(file)
return err
}
// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket.
func SendFds(socket *os.File, msg []byte, fds ...int) error {
oob := unix.UnixRights(fds...)
return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0)
// SendRawFd sends a specific file descriptor over the given AF_UNIX socket.
func SendRawFd(socket *os.File, msg string, fd uintptr) error {
oob := unix.UnixRights(int(fd))
return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0)
}

View File

@ -5,8 +5,10 @@ package utils
import (
"fmt"
"math"
"os"
"strconv"
"sync"
"golang.org/x/sys/unix"
)
@ -23,9 +25,38 @@ func EnsureProcHandle(fh *os.File) error {
return nil
}
// haveCloseRangeCloexecBool holds the cached answer returned by
// haveCloseRangeCloexec, and haveCloseRangeCloexecOnce ensures the probe that
// fills it in runs at most once. The bool keeps its zero value (false) unless
// the probe succeeds.
var (
haveCloseRangeCloexecBool bool
haveCloseRangeCloexecOnce sync.Once
)
func haveCloseRangeCloexec() bool {
haveCloseRangeCloexecOnce.Do(func() {
// Make sure we're not closing a random file descriptor.
tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
if err != nil {
return
}
defer unix.Close(tmpFd)
err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
// other potential error would imply that even the most basic close
// operation wouldn't work.
haveCloseRangeCloexecBool = err == nil
})
return haveCloseRangeCloexecBool
}
// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for
// the process (except for those below the given fd value).
func CloseExecFrom(minFd int) error {
if haveCloseRangeCloexec() {
err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
return os.NewSyscallError("close_range", err)
}
fdDir, err := os.Open("/proc/self/fd")
if err != nil {
return err
@ -60,7 +91,7 @@ func CloseExecFrom(minFd int) error {
}
// NewSockPair returns a new unix socket pair
func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
func NewSockPair(name string) (parent, child *os.File, err error) {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err

View File

@ -11,7 +11,7 @@ const (
VersionPatch = 0
// VersionDev indicates development branch. Releases will be empty string.
VersionDev = ""
VersionDev = "+dev"
)
// Version is the specification version that the package types support.