Update to runc main, removing pin to an older version

We were pinned to a specific commit to ensure that tests kept passing. Hopefully they pass now, as we need to grab latest runc for CVE fixes. Also grab Buildah main to fix a build issue on FreeBSD. After a botched manual vendor, I used Ed's treadmill script and squashed it into this commit to make Git happy. Thanks bunches Ed. Signed-off-by: Matt Heon <mheon@redhat.com>
2025-07-01 00:01:02 +08:00 · 2024-02-01 15:17:45 -05:00
parent 5e64d4f021
commit 2818abf849
174 changed files with 22580 additions and 922 deletions
--- a/vendor/github.com/opencontainers/runc/NOTICE
+++ b/vendor/github.com/opencontainers/runc/NOTICE
@ -8,9 +8,9 @@ The following is courtesy of our legal counsel:


 Use and transfer of Docker may be subject to certain restrictions by the
-United States and other governments.
+United States and other governments.  
 It is your responsibility to ensure that your use and/or transfer does not
-violate applicable laws.
+violate applicable laws. 

 For more information, please see http://www.bis.doc.gov

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go
@ -1,24 +1,9 @@
 package cgroups

 import (
-	"errors"
-
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

-var (
-	// ErrDevicesUnsupported is an error returned when a cgroup manager
-	// is not configured to set device rules.
-	ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules")
-
-	// DevicesSetV1 and DevicesSetV2 are functions to set devices for
-	// cgroup v1 and v2, respectively. Unless libcontainer/cgroups/devices
-	// package is imported, it is set to nil, so cgroup managers can't
-	// manage devices.
-	DevicesSetV1 func(path string, r *configs.Resources) error
-	DevicesSetV2 func(path string, r *configs.Resources) error
-)
-
 type Manager interface {
 	// Apply creates a cgroup, if not yet created, and adds a process
 	// with the specified pid into that cgroup.  A special value of -1
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go
@ -0,0 +1,386 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * Copyright (C) 2020 Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2020 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package devices
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"sort"
+	"strconv"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/devices"
+)
+
+// deviceMeta is a Rule without the Allow or Permissions fields, and no
+// wildcard-type support. It's effectively the "match" portion of a metadata
+// rule, for the purposes of our emulation.
+type deviceMeta struct {
+	node  devices.Type
+	major int64
+	minor int64
+}
+
+// deviceRule is effectively the tuple (deviceMeta, Permissions).
+type deviceRule struct {
+	meta  deviceMeta
+	perms devices.Permissions
+}
+
+// deviceRules is a mapping of device metadata rules to the associated
+// permissions in the ruleset.
+type deviceRules map[deviceMeta]devices.Permissions
+
+func (r deviceRules) orderedEntries() []deviceRule {
+	var rules []deviceRule
+	for meta, perms := range r {
+		rules = append(rules, deviceRule{meta: meta, perms: perms})
+	}
+	sort.Slice(rules, func(i, j int) bool {
+		// Sort by (major, minor, type).
+		a, b := rules[i].meta, rules[j].meta
+		return a.major < b.major ||
+			(a.major == b.major && a.minor < b.minor) ||
+			(a.major == b.major && a.minor == b.minor && a.node < b.node)
+	})
+	return rules
+}
+
+type Emulator struct {
+	defaultAllow bool
+	rules        deviceRules
+}
+
+func (e *Emulator) IsBlacklist() bool {
+	return e.defaultAllow
+}
+
+func (e *Emulator) IsAllowAll() bool {
+	return e.IsBlacklist() && len(e.rules) == 0
+}
+
+func parseLine(line string) (*deviceRule, error) {
+	// Input: node major:minor perms.
+	fields := strings.FieldsFunc(line, func(r rune) bool {
+		return r == ' ' || r == ':'
+	})
+	if len(fields) != 4 {
+		return nil, fmt.Errorf("malformed devices.list rule %s", line)
+	}
+
+	var (
+		rule  deviceRule
+		node  = fields[0]
+		major = fields[1]
+		minor = fields[2]
+		perms = fields[3]
+	)
+
+	// Parse the node type.
+	switch node {
+	case "a":
+		// Super-special case -- "a" always means every device with every
+		// access mode. In fact, for devices.list this actually indicates that
+		// the cgroup is in black-list mode.
+		// TODO: Double-check that the entire file is "a *:* rwm".
+		return nil, nil
+	case "b":
+		rule.meta.node = devices.BlockDevice
+	case "c":
+		rule.meta.node = devices.CharDevice
+	default:
+		return nil, fmt.Errorf("unknown device type %q", node)
+	}
+
+	// Parse the major number.
+	if major == "*" {
+		rule.meta.major = devices.Wildcard
+	} else {
+		val, err := strconv.ParseUint(major, 10, 32)
+		if err != nil {
+			return nil, fmt.Errorf("invalid major number: %w", err)
+		}
+		rule.meta.major = int64(val)
+	}
+
+	// Parse the minor number.
+	if minor == "*" {
+		rule.meta.minor = devices.Wildcard
+	} else {
+		val, err := strconv.ParseUint(minor, 10, 32)
+		if err != nil {
+			return nil, fmt.Errorf("invalid minor number: %w", err)
+		}
+		rule.meta.minor = int64(val)
+	}
+
+	// Parse the access permissions.
+	rule.perms = devices.Permissions(perms)
+	if !rule.perms.IsValid() || rule.perms.IsEmpty() {
+		return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
+	}
+	return &rule, nil
+}
+
+func (e *Emulator) addRule(rule deviceRule) error { //nolint:unparam
+	if e.rules == nil {
+		e.rules = make(map[deviceMeta]devices.Permissions)
+	}
+
+	// Merge with any pre-existing permissions.
+	oldPerms := e.rules[rule.meta]
+	newPerms := rule.perms.Union(oldPerms)
+	e.rules[rule.meta] = newPerms
+	return nil
+}
+
+func (e *Emulator) rmRule(rule deviceRule) error {
+	// Give an error if any of the permissions requested to be removed are
+	// present in a partially-matching wildcard rule, because such rules will
+	// be ignored by cgroupv1.
+	//
+	// This is a deviation from cgroupv1, but is necessary to avoid leading
+	// users into a false sense of security. cgroupv1 will silently(!) ignore
+	// requests to remove partial exceptions, but we really shouldn't do that.
+	//
+	// It may seem like we could just "split" wildcard rules which hit this
+	// issue, but unfortunately there are 2^32 possible major and minor
+	// numbers, which would exhaust kernel memory quickly if we did this. Not
+	// to mention it'd be really slow (the kernel side is implemented as a
+	// linked-list of exceptions).
+	for _, partialMeta := range []deviceMeta{
+		{node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor},
+		{node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard},
+		{node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard},
+	} {
+		// This wildcard rule is equivalent to the requested rule, so skip it.
+		if rule.meta == partialMeta {
+			continue
+		}
+		// Only give an error if the set of permissions overlap.
+		partialPerms := e.rules[partialMeta]
+		if !partialPerms.Intersection(rule.perms).IsEmpty() {
+			return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
+		}
+	}
+
+	// Subtract all of the permissions listed from the full match rule. If the
+	// rule didn't exist, all of this is a no-op.
+	newPerms := e.rules[rule.meta].Difference(rule.perms)
+	if newPerms.IsEmpty() {
+		delete(e.rules, rule.meta)
+	} else {
+		e.rules[rule.meta] = newPerms
+	}
+	// TODO: The actual cgroup code doesn't care if an exception didn't exist
+	//       during removal, so not erroring out here is /accurate/ but quite
+	//       worrying. Maybe we should do additional validation, but again we
+	//       have to worry about backwards-compatibility.
+	return nil
+}
+
+func (e *Emulator) allow(rule *deviceRule) error {
+	// This cgroup is configured as a black-list. Reset the entire emulator,
+	// and put it into black-list mode.
+	if rule == nil || rule.meta.node == devices.WildcardDevice {
+		*e = Emulator{
+			defaultAllow: true,
+			rules:        nil,
+		}
+		return nil
+	}
+
+	var err error
+	if e.defaultAllow {
+		err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception")
+	} else {
+		err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception")
+	}
+	return err
+}
+
+func (e *Emulator) deny(rule *deviceRule) error {
+	// This cgroup is configured as a white-list. Reset the entire emulator,
+	// and put it into white-list mode.
+	if rule == nil || rule.meta.node == devices.WildcardDevice {
+		*e = Emulator{
+			defaultAllow: false,
+			rules:        nil,
+		}
+		return nil
+	}
+
+	var err error
+	if e.defaultAllow {
+		err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception")
+	} else {
+		err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception")
+	}
+	return err
+}
+
+func (e *Emulator) Apply(rule devices.Rule) error {
+	if !rule.Type.CanCgroup() {
+		return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
+	}
+
+	innerRule := &deviceRule{
+		meta: deviceMeta{
+			node:  rule.Type,
+			major: rule.Major,
+			minor: rule.Minor,
+		},
+		perms: rule.Permissions,
+	}
+	if innerRule.meta.node == devices.WildcardDevice {
+		innerRule = nil
+	}
+
+	if rule.Allow {
+		return e.allow(innerRule)
+	}
+
+	return e.deny(innerRule)
+}
+
+// EmulatorFromList takes a reader to a "devices.list"-like source, and returns
+// a new Emulator that represents the state of the devices cgroup. Note that
+// black-list devices cgroups cannot be fully reconstructed, due to limitations
+// in the devices cgroup API. Instead, such cgroups are always treated as
+// "allow all" cgroups.
+func EmulatorFromList(list io.Reader) (*Emulator, error) {
+	// Normally cgroups are in black-list mode by default, but the way we
+	// figure out the current mode is whether or not devices.list has an
+	// allow-all rule. So we default to a white-list, and the existence of an
+	// "a *:* rwm" entry will tell us otherwise.
+	e := &Emulator{
+		defaultAllow: false,
+	}
+
+	// Parse the "devices.list".
+	s := bufio.NewScanner(list)
+	for s.Scan() {
+		line := s.Text()
+		deviceRule, err := parseLine(line)
+		if err != nil {
+			return nil, fmt.Errorf("error parsing line %q: %w", line, err)
+		}
+		// "devices.list" is an allow list. Note that this means that in
+		// black-list mode, we have no idea what rules are in play. As a
+		// result, we need to be very careful in Transition().
+		if err := e.allow(deviceRule); err != nil {
+			return nil, fmt.Errorf("error adding devices.list rule: %w", err)
+		}
+	}
+	if err := s.Err(); err != nil {
+		return nil, fmt.Errorf("error reading devices.list lines: %w", err)
+	}
+	return e, nil
+}
+
+// Transition calculates the minimally-disruptive set of rules that need to
+// be applied to a devices cgroup in order to transition to the given target.
+// This means that any already-existing rules will not be applied, and
+// disruptive rules (like denying all device access) will only be applied if
+// necessary.
+//
+// This function is the sole reason for all of Emulator -- to allow us
+// to figure out how to update a container's cgroups without causing spurious
+// device errors (if possible).
+func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
+	var transitionRules []*devices.Rule
+	oldRules := source.rules
+
+	// If the default policy doesn't match, we need to include a "disruptive"
+	// rule (either allow-all or deny-all) in order to switch the cgroup to the
+	// correct default policy.
+	//
+	// However, due to a limitation in "devices.list" we cannot be sure what
+	// deny rules are in place in a black-list cgroup. Thus if the source is a
+	// black-list we also have to include a disruptive rule.
+	if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
+		transitionRules = append(transitionRules, &devices.Rule{
+			Type:        'a',
+			Major:       -1,
+			Minor:       -1,
+			Permissions: devices.Permissions("rwm"),
+			Allow:       target.defaultAllow,
+		})
+		// The old rules are only relevant if we aren't starting out with a
+		// disruptive rule.
+		oldRules = nil
+	}
+
+	// NOTE: We traverse through the rules in a sorted order so we always write
+	//       the same set of rules (this is to aid testing).
+
+	// First, we create inverse rules for any old rules not in the new set.
+	// This includes partial-inverse rules for specific permissions. This is a
+	// no-op if we added a disruptive rule, since oldRules will be empty.
+	for _, rule := range oldRules.orderedEntries() {
+		meta, oldPerms := rule.meta, rule.perms
+		newPerms := target.rules[meta]
+		droppedPerms := oldPerms.Difference(newPerms)
+		if !droppedPerms.IsEmpty() {
+			transitionRules = append(transitionRules, &devices.Rule{
+				Type:        meta.node,
+				Major:       meta.major,
+				Minor:       meta.minor,
+				Permissions: droppedPerms,
+				Allow:       target.defaultAllow,
+			})
+		}
+	}
+
+	// Add any additional rules which weren't in the old set. We happen to
+	// filter out rules which are present in both sets, though this isn't
+	// strictly necessary.
+	for _, rule := range target.rules.orderedEntries() {
+		meta, newPerms := rule.meta, rule.perms
+		oldPerms := oldRules[meta]
+		gainedPerms := newPerms.Difference(oldPerms)
+		if !gainedPerms.IsEmpty() {
+			transitionRules = append(transitionRules, &devices.Rule{
+				Type:        meta.node,
+				Major:       meta.major,
+				Minor:       meta.minor,
+				Permissions: gainedPerms,
+				Allow:       !target.defaultAllow,
+			})
+		}
+	}
+	return transitionRules, nil
+}
+
+// Rules returns the minimum set of rules necessary to convert a *deny-all*
+// cgroup to the emulated filter state (note that this is not the same as a
+// default cgroupv1 cgroup -- which is allow-all). This is effectively just a
+// wrapper around Transition() with the source emulator being an empty cgroup.
+func (e *Emulator) Rules() ([]*devices.Rule, error) {
+	defaultCgroup := &Emulator{defaultAllow: false}
+	return defaultCgroup.Transition(e)
+}
+
+func wrapErr(err error, text string) error {
+	if err == nil {
+		return nil
+	}
+	return fmt.Errorf(text+": %w", err)
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
@ -0,0 +1,208 @@
+// Package devicefilter contains eBPF device filter program
+//
+// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
+//
+// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
+// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
+package devicefilter
+
+import (
+	"errors"
+	"fmt"
+	"math"
+	"strconv"
+
+	"github.com/cilium/ebpf/asm"
+	devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
+	"github.com/opencontainers/runc/libcontainer/devices"
+	"golang.org/x/sys/unix"
+)
+
+const (
+	// license string format is same as kernel MODULE_LICENSE macro
+	license = "Apache"
+)
+
+// DeviceFilter returns eBPF device filter program and its license string
+func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
+	// Generate the minimum ruleset for the device rules we are given. While we
+	// don't care about minimum transitions in cgroupv2, using the emulator
+	// gives us a guarantee that the behaviour of devices filtering is the same
+	// as cgroupv1, including security hardenings to avoid misconfiguration
+	// (such as punching holes in wildcard rules).
+	emu := new(devicesemulator.Emulator)
+	for _, rule := range rules {
+		if err := emu.Apply(*rule); err != nil {
+			return nil, "", err
+		}
+	}
+	cleanRules, err := emu.Rules()
+	if err != nil {
+		return nil, "", err
+	}
+
+	p := &program{
+		defaultAllow: emu.IsBlacklist(),
+	}
+	p.init()
+
+	for idx, rule := range cleanRules {
+		if rule.Type == devices.WildcardDevice {
+			// We can safely skip over wildcard entries because there should
+			// only be one (at most) at the very start to instruct cgroupv1 to
+			// go into allow-list mode. However we do double-check this here.
+			if idx != 0 || rule.Allow != emu.IsBlacklist() {
+				return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
+			}
+			continue
+		}
+		if rule.Allow == p.defaultAllow {
+			// There should be no rules which have an action equal to the
+			// default action, the emulator removes those.
+			return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
+		}
+		if err := p.appendRule(rule); err != nil {
+			return nil, "", err
+		}
+	}
+	return p.finalize(), license, nil
+}
+
+type program struct {
+	insts        asm.Instructions
+	defaultAllow bool
+	blockID      int
+}
+
+func (p *program) init() {
+	// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
+	/*
+		u32 access_type
+		u32 major
+		u32 minor
+	*/
+	// R2 <- type (lower 16 bit of u32 access_type at R1[0])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
+		asm.And.Imm32(asm.R2, 0xFFFF))
+
+	// R3 <- access (upper 16 bit of u32 access_type at R1[0])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
+		// RSh: bitwise shift right
+		asm.RSh.Imm32(asm.R3, 16))
+
+	// R4 <- major (u32 major at R1[4])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
+
+	// R5 <- minor (u32 minor at R1[8])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
+}
+
+// appendRule converts an OCI rule to the relevant eBPF block and adds it
+// to the in-progress filter program. In order to operate properly, it must be
+// called with a "clean" rule list (generated by devices.Emulator.Rules() --
+// with any "a" rules removed).
+func (p *program) appendRule(rule *devices.Rule) error {
+	if p.blockID < 0 {
+		return errors.New("the program is finalized")
+	}
+
+	var bpfType int32
+	switch rule.Type {
+	case devices.CharDevice:
+		bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
+	case devices.BlockDevice:
+		bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
+	default:
+		// We do not permit 'a', nor any other types we don't know about.
+		return fmt.Errorf("invalid type %q", string(rule.Type))
+	}
+	if rule.Major > math.MaxUint32 {
+		return fmt.Errorf("invalid major %d", rule.Major)
+	}
+	if rule.Minor > math.MaxUint32 {
+		return fmt.Errorf("invalid minor %d", rule.Minor)
+	}
+	hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
+	hasMinor := rule.Minor >= 0
+	bpfAccess := int32(0)
+	for _, r := range rule.Permissions {
+		switch r {
+		case 'r':
+			bpfAccess |= unix.BPF_DEVCG_ACC_READ
+		case 'w':
+			bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
+		case 'm':
+			bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
+		default:
+			return fmt.Errorf("unknown device access %v", r)
+		}
+	}
+	// If the access is rwm, skip the check.
+	hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
+
+	var (
+		blockSym         = "block-" + strconv.Itoa(p.blockID)
+		nextBlockSym     = "block-" + strconv.Itoa(p.blockID+1)
+		prevBlockLastIdx = len(p.insts) - 1
+	)
+	p.insts = append(p.insts,
+		// if (R2 != bpfType) goto next
+		asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
+	)
+	if hasAccess {
+		p.insts = append(p.insts,
+			// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
+			asm.Mov.Reg32(asm.R1, asm.R3),
+			asm.And.Imm32(asm.R1, bpfAccess),
+			asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
+		)
+	}
+	if hasMajor {
+		p.insts = append(p.insts,
+			// if (R4 != major) goto next
+			asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
+		)
+	}
+	if hasMinor {
+		p.insts = append(p.insts,
+			// if (R5 != minor) goto next
+			asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
+		)
+	}
+	p.insts = append(p.insts, acceptBlock(rule.Allow)...)
+	// set blockSym to the first instruction we added in this iteration
+	p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
+	p.blockID++
+	return nil
+}
+
+func (p *program) finalize() asm.Instructions {
+	var v int32
+	if p.defaultAllow {
+		v = 1
+	}
+	blockSym := "block-" + strconv.Itoa(p.blockID)
+	p.insts = append(p.insts,
+		// R0 <- v
+		asm.Mov.Imm32(asm.R0, v).Sym(blockSym),
+		asm.Return(),
+	)
+	p.blockID = -1
+	return p.insts
+}
+
+func acceptBlock(accept bool) asm.Instructions {
+	var v int32
+	if accept {
+		v = 1
+	}
+	return []asm.Instruction{
+		// R0 <- v
+		asm.Mov.Imm32(asm.R0, v),
+		asm.Return(),
+	}
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go
@ -0,0 +1,253 @@
+package ebpf
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"runtime"
+	"sync"
+	"unsafe"
+
+	"github.com/cilium/ebpf"
+	"github.com/cilium/ebpf/asm"
+	"github.com/cilium/ebpf/link"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+func nilCloser() error {
+	return nil
+}
+
+func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
+	type bpfAttrQuery struct {
+		TargetFd    uint32
+		AttachType  uint32
+		QueryType   uint32
+		AttachFlags uint32
+		ProgIds     uint64 // __aligned_u64
+		ProgCnt     uint32
+	}
+
+	// Currently you can only have 64 eBPF programs attached to a cgroup.
+	size := 64
+	retries := 0
+	for retries < 10 {
+		progIds := make([]uint32, size)
+		query := bpfAttrQuery{
+			TargetFd:   uint32(dirFd),
+			AttachType: uint32(unix.BPF_CGROUP_DEVICE),
+			ProgIds:    uint64(uintptr(unsafe.Pointer(&progIds[0]))),
+			ProgCnt:    uint32(len(progIds)),
+		}
+
+		// Fetch the list of program ids.
+		_, _, errno := unix.Syscall(unix.SYS_BPF,
+			uintptr(unix.BPF_PROG_QUERY),
+			uintptr(unsafe.Pointer(&query)),
+			unsafe.Sizeof(query))
+		size = int(query.ProgCnt)
+		runtime.KeepAlive(query)
+		if errno != 0 {
+			// On ENOSPC we get the correct number of programs.
+			if errno == unix.ENOSPC {
+				retries++
+				continue
+			}
+			return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
+		}
+
+		// Convert the ids to program handles.
+		progIds = progIds[:size]
+		programs := make([]*ebpf.Program, 0, len(progIds))
+		for _, progId := range progIds {
+			program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
+			if err != nil {
+				// We skip over programs that give us -EACCES or -EPERM. This
+				// is necessary because there may be BPF programs that have
+				// been attached (such as with --systemd-cgroup) which have an
+				// LSM label that blocks us from interacting with the program.
+				//
+				// Because additional BPF_CGROUP_DEVICE programs only can add
+				// restrictions, there's no real issue with just ignoring these
+				// programs (and stops runc from breaking on distributions with
+				// very strict SELinux policies).
+				if errors.Is(err, os.ErrPermission) {
+					logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err)
+					continue
+				}
+				return nil, fmt.Errorf("cannot fetch program from id: %w", err)
+			}
+			programs = append(programs, program)
+		}
+		runtime.KeepAlive(progIds)
+		return programs, nil
+	}
+
+	return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
+}
+
+var (
+	haveBpfProgReplaceBool bool
+	haveBpfProgReplaceOnce sync.Once
+)
+
+// Loosely based on the BPF_F_REPLACE support check in
+// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
+//
+// TODO: move this logic to cilium/ebpf
+func haveBpfProgReplace() bool {
+	haveBpfProgReplaceOnce.Do(func() {
+		prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
+			Type:    ebpf.CGroupDevice,
+			License: "MIT",
+			Instructions: asm.Instructions{
+				asm.Mov.Imm(asm.R0, 0),
+				asm.Return(),
+			},
+		})
+		if err != nil {
+			logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
+			return
+		}
+		defer prog.Close()
+
+		devnull, err := os.Open("/dev/null")
+		if err != nil {
+			logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err)
+			return
+		}
+		defer devnull.Close()
+
+		// We know that we have BPF_PROG_ATTACH since we can load
+		// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
+		// we know that the feature isn't present.
+		err = link.RawAttachProgram(link.RawAttachProgramOptions{
+			// We rely on this fd being checked after attachFlags.
+			Target: int(devnull.Fd()),
+			// Attempt to "replace" bad fds with this program.
+			Program: prog,
+			Attach:  ebpf.AttachCGroupDevice,
+			Flags:   unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE,
+		})
+		if errors.Is(err, unix.EINVAL) {
+			// not supported
+			return
+		}
+		// attach_flags test succeeded.
+		if !errors.Is(err, unix.EBADF) {
+			logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
+		}
+		haveBpfProgReplaceBool = true
+	})
+	return haveBpfProgReplaceBool
+}
+
+// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
+//
+// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15.
+//
+// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
+func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
+	// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
+	// This limit is not inherited into the container.
+	memlockLimit := &unix.Rlimit{
+		Cur: unix.RLIM_INFINITY,
+		Max: unix.RLIM_INFINITY,
+	}
+	_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
+
+	// Get the list of existing programs.
+	oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
+	if err != nil {
+		return nilCloser, err
+	}
+	useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
+
+	// Generate new program.
+	spec := &ebpf.ProgramSpec{
+		Type:         ebpf.CGroupDevice,
+		Instructions: insts,
+		License:      license,
+	}
+	prog, err := ebpf.NewProgram(spec)
+	if err != nil {
+		return nilCloser, err
+	}
+
+	// If there is only one old program, we can just replace it directly.
+	var (
+		replaceProg *ebpf.Program
+		attachFlags uint32 = unix.BPF_F_ALLOW_MULTI
+	)
+	if useReplaceProg {
+		replaceProg = oldProgs[0]
+		attachFlags |= unix.BPF_F_REPLACE
+	}
+	err = link.RawAttachProgram(link.RawAttachProgramOptions{
+		Target:  dirFd,
+		Program: prog,
+		Replace: replaceProg,
+		Attach:  ebpf.AttachCGroupDevice,
+		Flags:   attachFlags,
+	})
+	if err != nil {
+		return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
+	}
+	closer := func() error {
+		err = link.RawDetachProgram(link.RawDetachProgramOptions{
+			Target:  dirFd,
+			Program: prog,
+			Attach:  ebpf.AttachCGroupDevice,
+		})
+		if err != nil {
+			return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
+		}
+		// TODO: Should we attach the old filters back in this case? Otherwise
+		//       we fail-open on a security feature, which is a bit scary.
+		return nil
+	}
+	if !useReplaceProg {
+		logLevel := logrus.DebugLevel
+		// If there was more than one old program, give a warning (since this
+		// really shouldn't happen with runc-managed cgroups) and then detach
+		// all the old programs.
+		if len(oldProgs) > 1 {
+			// NOTE: Ideally this should be a warning but it turns out that
+			//       systemd-managed cgroups trigger this warning (apparently
+			//       systemd doesn't delete old non-systemd programs when
+			//       setting properties).
+			logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
+			logLevel = logrus.InfoLevel
+		}
+		for idx, oldProg := range oldProgs {
+			// Output some extra debug info.
+			if info, err := oldProg.Info(); err == nil {
+				fields := logrus.Fields{
+					"type": info.Type.String(),
+					"tag":  info.Tag,
+					"name": info.Name,
+				}
+				if id, ok := info.ID(); ok {
+					fields["id"] = id
+				}
+				if runCount, ok := info.RunCount(); ok {
+					fields["run_count"] = runCount
+				}
+				if runtime, ok := info.Runtime(); ok {
+					fields["runtime"] = runtime.String()
+				}
+				logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
+			}
+			err = link.RawDetachProgram(link.RawDetachProgramOptions{
+				Target:  dirFd,
+				Program: oldProg,
+				Attach:  ebpf.AttachCGroupDevice,
+			})
+			if err != nil {
+				return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
+			}
+		}
+	}
+	return closer, nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go
@ -10,6 +10,7 @@ import (
 	"strings"
 	"sync"

+	"github.com/opencontainers/runc/libcontainer/utils"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
@ -76,35 +77,36 @@ var (
 	// TestMode is set to true by unit tests that need "fake" cgroupfs.
 	TestMode bool

-	cgroupFd     int = -1
-	prepOnce     sync.Once
-	prepErr      error
-	resolveFlags uint64
+	cgroupRootHandle *os.File
+	prepOnce         sync.Once
+	prepErr          error
+	resolveFlags     uint64
 )

 func prepareOpenat2() error {
 	prepOnce.Do(func() {
 		fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
-			Flags: unix.O_DIRECTORY | unix.O_PATH,
+			Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC,
 		})
 		if err != nil {
 			prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
-			if err != unix.ENOSYS {
+			if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare
 				logrus.Warnf("falling back to securejoin: %s", prepErr)
 			} else {
 				logrus.Debug("openat2 not available, falling back to securejoin")
 			}
 			return
 		}
+		file := os.NewFile(uintptr(fd), cgroupfsDir)
+
 		var st unix.Statfs_t
-		if err = unix.Fstatfs(fd, &st); err != nil {
+		if err := unix.Fstatfs(int(file.Fd()), &st); err != nil {
 			prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
 			logrus.Warnf("falling back to securejoin: %s", prepErr)
 			return
 		}

-		cgroupFd = fd
-
+		cgroupRootHandle = file
 		resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
 		if st.Type == unix.CGROUP2_SUPER_MAGIC {
 			// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
@ -122,7 +124,7 @@ func openFile(dir, file string, flags int) (*os.File, error) {
 		flags |= os.O_TRUNC | os.O_CREATE
 		mode = 0o600
 	}
-	path := path.Join(dir, file)
+	path := path.Join(dir, utils.CleanPath(file))
 	if prepareOpenat2() != nil {
 		return openFallback(path, flags, mode)
 	}
@ -131,7 +133,7 @@ func openFile(dir, file string, flags int) (*os.File, error) {
 		return openFallback(path, flags, mode)
 	}

-	fd, err := unix.Openat2(cgroupFd, relPath,
+	fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath,
 		&unix.OpenHow{
 			Resolve: resolveFlags,
 			Flags:   uint64(flags) | unix.O_CLOEXEC,
@ -139,20 +141,20 @@ func openFile(dir, file string, flags int) (*os.File, error) {
 		})
 	if err != nil {
 		err = &os.PathError{Op: "openat2", Path: path, Err: err}
-		// Check if cgroupFd is still opened to cgroupfsDir
+		// Check if cgroupRootHandle is still opened to cgroupfsDir
 		// (happens when this package is incorrectly used
 		// across the chroot/pivot_root/mntns boundary, or
 		// when /sys/fs/cgroup is remounted).
 		//
 		// TODO: if such usage will ever be common, amend this
-		// to reopen cgroupFd and retry openat2.
-		fdStr := strconv.Itoa(cgroupFd)
+		// to reopen cgroupRootHandle and retry openat2.
+		fdStr := strconv.Itoa(int(cgroupRootHandle.Fd()))
 		fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
 		if fdDest != cgroupfsDir {
-			// Wrap the error so it is clear that cgroupFd
+			// Wrap the error so it is clear that cgroupRootHandle
 			// is opened to an unexpected/wrong directory.
-			err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w",
-				fdStr, fdDest, cgroupfsDir, err)
+			err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w",
+				cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err)
 		}
 		return nil, err
 	}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go
@ -94,14 +94,6 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
 			}
 		}
 	}
-
-	if r.CPUIdle != nil {
-		idle := strconv.FormatInt(*r.CPUIdle, 10)
-		if err := cgroups.WriteFile(path, "cpu.idle", idle); err != nil {
-			return err
-		}
-	}
-
 	return s.SetRtSched(path, r)
 }

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go
@ -195,7 +195,7 @@ func cpusetEnsureParent(current string) error {
 	}
 	// Treat non-existing directory as cgroupfs as it will be created,
 	// and the root cpuset directory obviously exists.
-	if err != nil && err != unix.ENOENT {
+	if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare
 		return &os.PathError{Op: "statfs", Path: parent, Err: err}
 	}

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go
@ -1,11 +1,20 @@
 package fs

 import (
+	"bytes"
+	"errors"
+	"reflect"
+
 	"github.com/opencontainers/runc/libcontainer/cgroups"
+	cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/devices"
+	"github.com/opencontainers/runc/libcontainer/userns"
 )

-type DevicesGroup struct{}
+type DevicesGroup struct {
+	TestingSkipFinalCheck bool
+}

 func (s *DevicesGroup) Name() string {
 	return "devices"
@ -24,14 +33,75 @@ func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error {
 	return apply(path, pid)
 }

-func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
-	if cgroups.DevicesSetV1 == nil {
-		if len(r.Devices) == 0 {
-			return nil
-		}
-		return cgroups.ErrDevicesUnsupported
+func loadEmulator(path string) (*cgroupdevices.Emulator, error) {
+	list, err := cgroups.ReadFile(path, "devices.list")
+	if err != nil {
+		return nil, err
 	}
-	return cgroups.DevicesSetV1(path, r)
+	return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list))
+}
+
+// buildEmulator returns the emulator state obtained by applying rules, in
+// order, to a zero-value Emulator. The first rule that fails to apply aborts
+// the build and its error is returned.
+func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) {
+	// The zero value starts out as a white-list, which is the desired default.
+	emulator := new(cgroupdevices.Emulator)
+	for i := range rules {
+		if err := emulator.Apply(*rules[i]); err != nil {
+			return nil, err
+		}
+	}
+	return emulator, nil
+}
+
+// Set brings the devices cgroup at path to the state described by r.Devices.
+//
+// It is a no-op (returns nil) when running inside a user namespace or when
+// r.SkipDevices is set. Otherwise it reads the cgroup's current devices.list,
+// computes the rules needed to move from that state to the requested one,
+// writes each rule to devices.allow or devices.deny, and finally (unless
+// s.TestingSkipFinalCheck is set) re-reads devices.list to verify the result.
+// Any read, transition or write error is returned as-is.
+func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
+	if userns.RunningInUserNS() || r.SkipDevices {
+		return nil
+	}
+
+	// Generate two emulators, one for the current state of the cgroup and one
+	// for the requested state by the user.
+	current, err := loadEmulator(path)
+	if err != nil {
+		return err
+	}
+	target, err := buildEmulator(r.Devices)
+	if err != nil {
+		return err
+	}
+
+	// Compute the minimal set of transition rules needed to achieve the
+	// requested state.
+	transitionRules, err := current.Transition(target)
+	if err != nil {
+		return err
+	}
+	for _, rule := range transitionRules {
+		// Allow rules go to devices.allow, everything else to devices.deny.
+		file := "devices.deny"
+		if rule.Allow {
+			file = "devices.allow"
+		}
+		if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil {
+			return err
+		}
+	}
+
+	// Final safety check -- ensure that the resulting state is what was
+	// requested. This is only really correct for white-lists, but for
+	// black-lists we can at least check that the cgroup is in the right mode.
+	//
+	// This safety-check is skipped for the unit tests because we cannot
+	// currently mock devices.list correctly.
+	if !s.TestingSkipFinalCheck {
+		currentAfter, err := loadEmulator(path)
+		if err != nil {
+			return err
+		}
+		// White-list target: require an exact match. Otherwise only the
+		// white-list/black-list mode of the two states is compared.
+		if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) {
+			return errors.New("resulting devices cgroup doesn't precisely match target")
+		} else if target.IsBlacklist() != currentAfter.IsBlacklist() {
+			return errors.New("resulting devices cgroup doesn't match target mode")
+		}
+	}
+	return nil
 }

 func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go
@ -54,13 +54,13 @@ type subsystem interface {
 	Set(path string, r *configs.Resources) error
 }

-type Manager struct {
+type manager struct {
 	mu      sync.Mutex
 	cgroups *configs.Cgroup
 	paths   map[string]string
 }

-func NewManager(cg *configs.Cgroup, paths map[string]string) (*Manager, error) {
+func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
 	// Some v1 controllers (cpu, cpuset, and devices) expect
 	// cgroups.Resources to not be nil in Apply.
 	if cg.Resources == nil {
@ -78,7 +78,7 @@ func NewManager(cg *configs.Cgroup, paths map[string]string) (*Manager, error) {
 		}
 	}

-	return &Manager{
+	return &manager{
 		cgroups: cg,
 		paths:   paths,
 	}, nil
@ -105,7 +105,7 @@ func isIgnorableError(rootless bool, err error) bool {
 	return false
 }

-func (m *Manager) Apply(pid int) (err error) {
+func (m *manager) Apply(pid int) (err error) {
 	m.mu.Lock()
 	defer m.mu.Unlock()

@ -139,19 +139,19 @@ func (m *Manager) Apply(pid int) (err error) {
 	return nil
 }

-func (m *Manager) Destroy() error {
+func (m *manager) Destroy() error {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	return cgroups.RemovePaths(m.paths)
 }

-func (m *Manager) Path(subsys string) string {
+func (m *manager) Path(subsys string) string {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	return m.paths[subsys]
 }

-func (m *Manager) GetStats() (*cgroups.Stats, error) {
+func (m *manager) GetStats() (*cgroups.Stats, error) {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	stats := cgroups.NewStats()
@ -167,7 +167,7 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
 	return stats, nil
 }

-func (m *Manager) Set(r *configs.Resources) error {
+func (m *manager) Set(r *configs.Resources) error {
 	if r == nil {
 		return nil
 	}
@ -183,7 +183,7 @@ func (m *Manager) Set(r *configs.Resources) error {
 		if err := sys.Set(path, r); err != nil {
 			// When rootless is true, errors from the device subsystem
 			// are ignored, as it is really not expected to work.
-			if m.cgroups.Rootless && sys.Name() == "devices" && !errors.Is(err, cgroups.ErrDevicesUnsupported) {
+			if m.cgroups.Rootless && sys.Name() == "devices" {
 				continue
 			}
 			// However, errors from other subsystems are not ignored.
@ -202,7 +202,7 @@ func (m *Manager) Set(r *configs.Resources) error {

 // Freeze toggles the container's freezer cgroup depending on the state
 // provided
-func (m *Manager) Freeze(state configs.FreezerState) error {
+func (m *manager) Freeze(state configs.FreezerState) error {
 	path := m.Path("freezer")
 	if path == "" {
 		return errors.New("cannot toggle freezer: cgroups not configured for container")
@ -218,25 +218,25 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
 	return nil
 }

-func (m *Manager) GetPids() ([]int, error) {
+func (m *manager) GetPids() ([]int, error) {
 	return cgroups.GetPids(m.Path("devices"))
 }

-func (m *Manager) GetAllPids() ([]int, error) {
+func (m *manager) GetAllPids() ([]int, error) {
 	return cgroups.GetAllPids(m.Path("devices"))
 }

-func (m *Manager) GetPaths() map[string]string {
+func (m *manager) GetPaths() map[string]string {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	return m.paths
 }

-func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
+func (m *manager) GetCgroups() (*configs.Cgroup, error) {
 	return m.cgroups, nil
 }

-func (m *Manager) GetFreezerState() (configs.FreezerState, error) {
+func (m *manager) GetFreezerState() (configs.FreezerState, error) {
 	dir := m.Path("freezer")
 	// If the container doesn't have the freezer cgroup, say it's undefined.
 	if dir == "" {
@ -246,7 +246,7 @@ func (m *Manager) GetFreezerState() (configs.FreezerState, error) {
 	return freezer.GetState(dir)
 }

-func (m *Manager) Exists() bool {
+func (m *manager) Exists() bool {
 	return cgroups.PathExists(m.Path("devices"))
 }

@ -254,7 +254,7 @@ func OOMKillCount(path string) (uint64, error) {
 	return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
 }

-func (m *Manager) OOMKillCount() (uint64, error) {
+func (m *manager) OOMKillCount() (uint64, error) {
 	c, err := OOMKillCount(m.Path("memory"))
 	// Ignore ENOENT when rootless as it couldn't create cgroup.
 	if err != nil && m.cgroups.Rootless && os.IsNotExist(err) {
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go
@ -1,6 +1,8 @@
 package fs

 import (
+	"errors"
+	"os"
 	"strconv"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
@ -19,8 +21,23 @@ func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error {
 }

 func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
+	const suffix = ".limit_in_bytes"
+	skipRsvd := false
+
 	for _, hugetlb := range r.HugetlbLimit {
-		if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
+		prefix := "hugetlb." + hugetlb.Pagesize
+		val := strconv.FormatUint(hugetlb.Limit, 10)
+		if err := cgroups.WriteFile(path, prefix+suffix, val); err != nil {
+			return err
+		}
+		if skipRsvd {
+			continue
+		}
+		if err := cgroups.WriteFile(path, prefix+".rsvd"+suffix, val); err != nil {
+			if errors.Is(err, os.ErrNotExist) {
+				skipRsvd = true
+				continue
+			}
 			return err
 		}
 	}
@ -32,24 +49,29 @@ func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
 	if !cgroups.PathExists(path) {
 		return nil
 	}
+	rsvd := ".rsvd"
 	hugetlbStats := cgroups.HugetlbStats{}
 	for _, pageSize := range cgroups.HugePageSizes() {
-		usage := "hugetlb." + pageSize + ".usage_in_bytes"
-		value, err := fscommon.GetCgroupParamUint(path, usage)
+	again:
+		prefix := "hugetlb." + pageSize + rsvd
+
+		value, err := fscommon.GetCgroupParamUint(path, prefix+".usage_in_bytes")
 		if err != nil {
+			if rsvd != "" && errors.Is(err, os.ErrNotExist) {
+				rsvd = ""
+				goto again
+			}
 			return err
 		}
 		hugetlbStats.Usage = value

-		maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes"
-		value, err = fscommon.GetCgroupParamUint(path, maxUsage)
+		value, err = fscommon.GetCgroupParamUint(path, prefix+".max_usage_in_bytes")
 		if err != nil {
 			return err
 		}
 		hugetlbStats.MaxUsage = value

-		failcnt := "hugetlb." + pageSize + ".failcnt"
-		value, err = fscommon.GetCgroupParamUint(path, failcnt)
+		value, err = fscommon.GetCgroupParamUint(path, prefix+".failcnt")
 		if err != nil {
 			return err
 		}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go
@ -170,6 +170,10 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
 		return err
 	}
 	stats.MemoryStats.SwapUsage = swapUsage
+	stats.MemoryStats.SwapOnlyUsage = cgroups.MemoryData{
+		Usage:   swapUsage.Usage - memoryUsage.Usage,
+		Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt,
+	}
 	kernelUsage, err := getMemoryData(path, "kmem")
 	if err != nil {
 		return err
@ -234,6 +238,12 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
 	memoryData.Failcnt = value
 	value, err = fscommon.GetCgroupParamUint(path, limit)
 	if err != nil {
+		if name == "kmem" && os.IsNotExist(err) {
+			// Ignore ENOENT as kmem.limit_in_bytes has
+			// been removed in newer kernels.
+			return memoryData, nil
+		}
+
 		return cgroups.MemoryData{}, err
 	}
 	memoryData.Limit = value
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go
@ -165,8 +165,9 @@ func subsysPath(root, inner, subsystem string) (string, error) {
 		return filepath.Join(root, filepath.Base(mnt), inner), nil
 	}

-	// Use GetOwnCgroupPath for dind-like cases, when cgroupns is not
-	// available. This is ugly.
+	// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
+	// process could be in a container that shares the pid namespace with the
+	// host, and /proc/1/cgroup could point to a whole other world of cgroups.
 	parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
 	if err != nil {
 		return "", err
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go
@ -11,7 +11,7 @@ import (
 )

 func isCpuSet(r *configs.Resources) bool {
-	return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 || r.CPUIdle != nil
+	return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0
 }

 func setCpu(dirPath string, r *configs.Resources) error {
@ -19,12 +19,6 @@ func setCpu(dirPath string, r *configs.Resources) error {
 		return nil
 	}

-	if r.CPUIdle != nil {
-		if err := cgroups.WriteFile(dirPath, "cpu.idle", strconv.FormatInt(*r.CPUIdle, 10)); err != nil {
-			return err
-		}
-	}
-
 	// NOTE: .CpuShares is not used here. Conversion is the caller's responsibility.
 	if r.CpuWeight != 0 {
 		if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil {
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/devices.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/devices.go
@ -0,0 +1,75 @@
+package fs2
+
+import (
+	"fmt"
+
+	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups/ebpf"
+	"github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/devices"
+	"github.com/opencontainers/runc/libcontainer/userns"
+)
+
+// isRWM reports whether perms contains all three of the 'r', 'w' and 'm'
+// permission characters. Any other characters in perms are ignored.
+func isRWM(perms devices.Permissions) bool {
+	var hasR, hasW, hasM bool
+	for _, c := range perms {
+		hasR = hasR || c == 'r'
+		hasW = hasW || c == 'w'
+		hasM = hasM || c == 'm'
+	}
+	return hasR && hasW && hasM
+}
+
+// canSkipEBPFError reports whether a failure to load or attach the eBPF
+// device filter for r may be ignored: either we are running in a user
+// namespace, or every rule in r.Devices is an allow rule granting r, w and m.
+//
+// This is similar to the logic applied in crun for handling errors from bpf(2)
+// <https://github.com/containers/crun/blob/0.17/src/libcrun/cgroup.c#L2438-L2470>.
+func canSkipEBPFError(r *configs.Resources) bool {
+	// If we're running in a user namespace we can ignore eBPF rules because we
+	// usually cannot use bpf(2), as well as rootless containers usually don't
+	// have the necessary privileges to mknod(2) device inodes or access
+	// host-level instances (though ideally we would be blocking device access
+	// for rootless containers anyway).
+	if userns.RunningInUserNS() {
+		return true
+	}
+
+	// We cannot ignore an eBPF load error if any rule is a block rule or it
+	// doesn't permit all access modes.
+	//
+	// NOTE: This will sometimes trigger in cases where access modes are split
+	//       between different rules but to handle this correctly would require
+	//       using ".../libcontainer/cgroups/devices".Emulator.
+	for _, dev := range r.Devices {
+		if !dev.Allow || !isRWM(dev.Permissions) {
+			return false
+		}
+	}
+	return true
+}
+
+// setDevices builds a device filter program from r.Devices and attaches it to
+// the cgroup directory dirPath.
+//
+// It returns nil without doing anything when r.SkipDevices is set. Errors
+// from building the filter or opening dirPath are always returned; an error
+// from loading/attaching the filter is returned only when canSkipEBPFError
+// reports that it must not be ignored.
+func setDevices(dirPath string, r *configs.Resources) error {
+	if r.SkipDevices {
+		return nil
+	}
+	insts, license, err := devicefilter.DeviceFilter(r.Devices)
+	if err != nil {
+		return err
+	}
+	dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600)
+	if err != nil {
+		// Wrap the underlying errno so callers can see why the open failed
+		// and can still match it with errors.Is.
+		return fmt.Errorf("cannot get dir FD for %s: %w", dirPath, err)
+	}
+	defer unix.Close(dirFD)
+	if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
+		// Best effort: the failure is dropped when the rule set (or a user
+		// namespace) makes the filter unnecessary.
+		if !canSkipEBPFError(r) {
+			return err
+		}
+	}
+	return nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go
@ -13,7 +13,7 @@ import (

 type parseError = fscommon.ParseError

-type Manager struct {
+type manager struct {
 	config *configs.Cgroup
 	// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
 	dirPath string
@ -25,7 +25,7 @@ type Manager struct {
 // NewManager creates a manager for cgroup v2 unified hierarchy.
 // dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope".
 // If dirPath is empty, it is automatically set using config.
-func NewManager(config *configs.Cgroup, dirPath string) (*Manager, error) {
+func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error) {
 	if dirPath == "" {
 		var err error
 		dirPath, err = defaultDirPath(config)
@ -34,14 +34,14 @@ func NewManager(config *configs.Cgroup, dirPath string) (*Manager, error) {
 		}
 	}

-	m := &Manager{
+	m := &manager{
 		config:  config,
 		dirPath: dirPath,
 	}
 	return m, nil
 }

-func (m *Manager) getControllers() error {
+func (m *manager) getControllers() error {
 	if m.controllers != nil {
 		return nil
 	}
@ -62,7 +62,7 @@ func (m *Manager) getControllers() error {
 	return nil
 }

-func (m *Manager) Apply(pid int) error {
+func (m *manager) Apply(pid int) error {
 	if err := CreateCgroupPath(m.dirPath, m.config); err != nil {
 		// Related tests:
 		// - "runc create (no limits + no cgrouppath + no permission) succeeds"
@ -84,15 +84,15 @@ func (m *Manager) Apply(pid int) error {
 	return nil
 }

-func (m *Manager) GetPids() ([]int, error) {
+func (m *manager) GetPids() ([]int, error) {
 	return cgroups.GetPids(m.dirPath)
 }

-func (m *Manager) GetAllPids() ([]int, error) {
+func (m *manager) GetAllPids() ([]int, error) {
 	return cgroups.GetAllPids(m.dirPath)
 }

-func (m *Manager) GetStats() (*cgroups.Stats, error) {
+func (m *manager) GetStats() (*cgroups.Stats, error) {
 	var errs []error

 	st := cgroups.NewStats()
@ -114,17 +114,6 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
 	if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) {
 		errs = append(errs, err)
 	}
-	// PSI (since kernel 4.20).
-	var err error
-	if st.CpuStats.PSI, err = statPSI(m.dirPath, "cpu.pressure"); err != nil {
-		errs = append(errs, err)
-	}
-	if st.MemoryStats.PSI, err = statPSI(m.dirPath, "memory.pressure"); err != nil {
-		errs = append(errs, err)
-	}
-	if st.BlkioStats.PSI, err = statPSI(m.dirPath, "io.pressure"); err != nil {
-		errs = append(errs, err)
-	}
 	// hugetlb (since kernel 5.6)
 	if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) {
 		errs = append(errs, err)
@ -139,7 +128,7 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
 	return st, nil
 }

-func (m *Manager) Freeze(state configs.FreezerState) error {
+func (m *manager) Freeze(state configs.FreezerState) error {
 	if m.config.Resources == nil {
 		return errors.New("cannot toggle freezer: cgroups not configured for container")
 	}
@ -150,15 +139,15 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
 	return nil
 }

-func (m *Manager) Destroy() error {
+func (m *manager) Destroy() error {
 	return cgroups.RemovePath(m.dirPath)
 }

-func (m *Manager) Path(_ string) string {
+func (m *manager) Path(_ string) string {
 	return m.dirPath
 }

-func (m *Manager) Set(r *configs.Resources) error {
+func (m *manager) Set(r *configs.Resources) error {
 	if r == nil {
 		return nil
 	}
@ -186,10 +175,8 @@ func (m *Manager) Set(r *configs.Resources) error {
 	// When rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
 	// However, errors from other subsystems are not ignored.
 	// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
-	if err := setDevices(m.dirPath, r); err != nil {
-		if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) {
-			return err
-		}
+	if err := setDevices(m.dirPath, r); err != nil && !m.config.Rootless {
+		return err
 	}
 	// cpuset (since kernel 5.0)
 	if err := setCpuset(m.dirPath, r); err != nil {
@ -214,17 +201,7 @@ func (m *Manager) Set(r *configs.Resources) error {
 	return nil
 }

-func setDevices(dirPath string, r *configs.Resources) error {
-	if cgroups.DevicesSetV2 == nil {
-		if len(r.Devices) > 0 {
-			return cgroups.ErrDevicesUnsupported
-		}
-		return nil
-	}
-	return cgroups.DevicesSetV2(dirPath, r)
-}
-
-func (m *Manager) setUnified(res map[string]string) error {
+func (m *manager) setUnified(res map[string]string) error {
 	for k, v := range res {
 		if strings.Contains(k, "/") {
 			return fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
@ -250,21 +227,21 @@ func (m *Manager) setUnified(res map[string]string) error {
 	return nil
 }

-func (m *Manager) GetPaths() map[string]string {
+func (m *manager) GetPaths() map[string]string {
 	paths := make(map[string]string, 1)
 	paths[""] = m.dirPath
 	return paths
 }

-func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
+func (m *manager) GetCgroups() (*configs.Cgroup, error) {
 	return m.config, nil
 }

-func (m *Manager) GetFreezerState() (configs.FreezerState, error) {
+func (m *manager) GetFreezerState() (configs.FreezerState, error) {
 	return getFreezer(m.dirPath)
 }

-func (m *Manager) Exists() bool {
+func (m *manager) Exists() bool {
 	return cgroups.PathExists(m.dirPath)
 }

@ -272,7 +249,7 @@ func OOMKillCount(path string) (uint64, error) {
 	return fscommon.GetValueByKey(path, "memory.events", "oom_kill")
 }

-func (m *Manager) OOMKillCount() (uint64, error) {
+func (m *manager) OOMKillCount() (uint64, error) {
 	c, err := OOMKillCount(m.dirPath)
 	if err != nil && m.config.Rootless && os.IsNotExist(err) {
 		err = nil
@ -280,35 +257,3 @@ func (m *Manager) OOMKillCount() (uint64, error) {

 	return c, err
 }
-
-func CheckMemoryUsage(dirPath string, r *configs.Resources) error {
-	if !r.MemoryCheckBeforeUpdate {
-		return nil
-	}
-
-	if r.Memory <= 0 && r.MemorySwap <= 0 {
-		return nil
-	}
-
-	usage, err := fscommon.GetCgroupParamUint(dirPath, "memory.current")
-	if err != nil {
-		// This check is on best-effort basis, so if we can't read the
-		// current usage (cgroup not yet created, or any other error),
-		// we should not fail.
-		return nil
-	}
-
-	if r.MemorySwap > 0 {
-		if uint64(r.MemorySwap) <= usage {
-			return fmt.Errorf("rejecting memory+swap limit %d <= usage %d", r.MemorySwap, usage)
-		}
-	}
-
-	if r.Memory > 0 {
-		if uint64(r.Memory) <= usage {
-			return fmt.Errorf("rejecting memory limit %d <= usage %d", r.Memory, usage)
-		}
-	}
-
-	return nil
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go
@ -1,6 +1,8 @@
 package fs2

 import (
+	"errors"
+	"os"
 	"strconv"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
@ -16,8 +18,22 @@ func setHugeTlb(dirPath string, r *configs.Resources) error {
 	if !isHugeTlbSet(r) {
 		return nil
 	}
+	const suffix = ".max"
+	skipRsvd := false
 	for _, hugetlb := range r.HugetlbLimit {
-		if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
+		prefix := "hugetlb." + hugetlb.Pagesize
+		val := strconv.FormatUint(hugetlb.Limit, 10)
+		if err := cgroups.WriteFile(dirPath, prefix+suffix, val); err != nil {
+			return err
+		}
+		if skipRsvd {
+			continue
+		}
+		if err := cgroups.WriteFile(dirPath, prefix+".rsvd"+suffix, val); err != nil {
+			if errors.Is(err, os.ErrNotExist) {
+				skipRsvd = true
+				continue
+			}
 			return err
 		}
 	}
@ -27,15 +43,21 @@ func setHugeTlb(dirPath string, r *configs.Resources) error {

 func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
 	hugetlbStats := cgroups.HugetlbStats{}
+	rsvd := ".rsvd"
 	for _, pagesize := range cgroups.HugePageSizes() {
-		value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current")
+	again:
+		prefix := "hugetlb." + pagesize + rsvd
+		value, err := fscommon.GetCgroupParamUint(dirPath, prefix+".current")
 		if err != nil {
+			if rsvd != "" && errors.Is(err, os.ErrNotExist) {
+				rsvd = ""
+				goto again
+			}
 			return err
 		}
 		hugetlbStats.Usage = value

-		fileName := "hugetlb." + pagesize + ".events"
-		value, err = fscommon.GetValueByKey(dirPath, fileName, "max")
+		value, err = fscommon.GetValueByKey(dirPath, prefix+".events", "max")
 		if err != nil {
 			return err
 		}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go
@ -40,11 +40,6 @@ func setMemory(dirPath string, r *configs.Resources) error {
 	if !isMemorySet(r) {
 		return nil
 	}
-
-	if err := CheckMemoryUsage(dirPath, r); err != nil {
-		return err
-	}
-
 	swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
 	if err != nil {
 		return err
@ -105,7 +100,7 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
 	memoryUsage, err := getMemoryDataV2(dirPath, "")
 	if err != nil {
 		if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint {
-			// The root cgroup does not have memory.{current,max}
+			// The root cgroup does not have memory.{current,max,peak}
 			// so emulate those using data from /proc/meminfo and
 			// /sys/fs/cgroup/memory.stat
 			return rootStatsFromMeminfo(stats)
@ -113,10 +108,12 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
 		return err
 	}
 	stats.MemoryStats.Usage = memoryUsage
-	swapUsage, err := getMemoryDataV2(dirPath, "swap")
+	swapOnlyUsage, err := getMemoryDataV2(dirPath, "swap")
 	if err != nil {
 		return err
 	}
+	stats.MemoryStats.SwapOnlyUsage = swapOnlyUsage
+	swapUsage := swapOnlyUsage
 	// As cgroup v1 reports SwapUsage values as mem+swap combined,
 	// while in cgroup v2 swap values do not include memory,
 	// report combined mem+swap for v1 compatibility.
@ -124,6 +121,9 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
 	if swapUsage.Limit != math.MaxUint64 {
 		swapUsage.Limit += memoryUsage.Limit
 	}
+	// The `MaxUsage` of mem+swap cannot simply combine mem with
+	// swap. So set it to 0 for v1 compatibility.
+	swapUsage.MaxUsage = 0
 	stats.MemoryStats.SwapUsage = swapUsage

 	return nil
@ -138,6 +138,7 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
 	}
 	usage := moduleName + ".current"
 	limit := moduleName + ".max"
+	maxUsage := moduleName + ".peak"

 	value, err := fscommon.GetCgroupParamUint(path, usage)
 	if err != nil {
@ -157,6 +158,14 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
 	}
 	memoryData.Limit = value

+	// `memory.peak` since kernel 5.19
+	// `memory.swap.peak` since kernel 6.5
+	value, err = fscommon.GetCgroupParamUint(path, maxUsage)
+	if err != nil && !os.IsNotExist(err) {
+		return cgroups.MemoryData{}, err
+	}
+	memoryData.MaxUsage = value
+
 	return memoryData, nil
 }

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/psi.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/psi.go
@ -1,89 +0,0 @@
-package fs2
-
-import (
-	"bufio"
-	"errors"
-	"fmt"
-	"os"
-	"strconv"
-	"strings"
-
-	"golang.org/x/sys/unix"
-
-	"github.com/opencontainers/runc/libcontainer/cgroups"
-)
-
-func statPSI(dirPath string, file string) (*cgroups.PSIStats, error) {
-	f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
-	if err != nil {
-		if errors.Is(err, os.ErrNotExist) {
-			// Kernel < 4.20, or CONFIG_PSI is not set,
-			// or PSI stats are turned off for the cgroup
-			// ("echo 0 > cgroup.pressure", kernel >= 6.1).
-			return nil, nil
-		}
-		return nil, err
-	}
-	defer f.Close()
-
-	var psistats cgroups.PSIStats
-	sc := bufio.NewScanner(f)
-	for sc.Scan() {
-		parts := strings.Fields(sc.Text())
-		var pv *cgroups.PSIData
-		switch parts[0] {
-		case "some":
-			pv = &psistats.Some
-		case "full":
-			pv = &psistats.Full
-		}
-		if pv != nil {
-			*pv, err = parsePSIData(parts[1:])
-			if err != nil {
-				return nil, &parseError{Path: dirPath, File: file, Err: err}
-			}
-		}
-	}
-	if err := sc.Err(); err != nil {
-		if errors.Is(err, unix.ENOTSUP) {
-			// Some kernels (e.g. CS9) may return ENOTSUP on read
-			// if psi=1 kernel cmdline parameter is required.
-			return nil, nil
-		}
-		return nil, &parseError{Path: dirPath, File: file, Err: err}
-	}
-	return &psistats, nil
-}
-
-func parsePSIData(psi []string) (cgroups.PSIData, error) {
-	data := cgroups.PSIData{}
-	for _, f := range psi {
-		kv := strings.SplitN(f, "=", 2)
-		if len(kv) != 2 {
-			return data, fmt.Errorf("invalid psi data: %q", f)
-		}
-		var pv *float64
-		switch kv[0] {
-		case "avg10":
-			pv = &data.Avg10
-		case "avg60":
-			pv = &data.Avg60
-		case "avg300":
-			pv = &data.Avg300
-		case "total":
-			v, err := strconv.ParseUint(kv[1], 10, 64)
-			if err != nil {
-				return data, fmt.Errorf("invalid %s PSI value: %w", kv[0], err)
-			}
-			data.Total = v
-		}
-		if pv != nil {
-			v, err := strconv.ParseFloat(kv[1], 64)
-			if err != nil {
-				return data, fmt.Errorf("invalid %s PSI value: %w", kv[0], err)
-			}
-			*pv = v
-		}
-	}
-	return data, nil
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go
@ -32,22 +32,9 @@ type CpuUsage struct {
 	UsageInUsermode uint64 `json:"usage_in_usermode"`
 }

-type PSIData struct {
-	Avg10  float64 `json:"avg10"`
-	Avg60  float64 `json:"avg60"`
-	Avg300 float64 `json:"avg300"`
-	Total  uint64  `json:"total"`
-}
-
-type PSIStats struct {
-	Some PSIData `json:"some,omitempty"`
-	Full PSIData `json:"full,omitempty"`
-}
-
 type CpuStats struct {
 	CpuUsage       CpuUsage       `json:"cpu_usage,omitempty"`
 	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
-	PSI            *PSIStats      `json:"psi,omitempty"`
 }

 type CPUSetStats struct {
@ -91,6 +78,8 @@ type MemoryStats struct {
 	Usage MemoryData `json:"usage,omitempty"`
 	// usage of memory + swap
 	SwapUsage MemoryData `json:"swap_usage,omitempty"`
+	// usage of swap only
+	SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"`
 	// usage of kernel memory
 	KernelUsage MemoryData `json:"kernel_usage,omitempty"`
 	// usage of kernel TCP memory
@ -102,7 +91,6 @@ type MemoryStats struct {
 	UseHierarchy bool `json:"use_hierarchy"`

 	Stats map[string]uint64 `json:"stats,omitempty"`
-	PSI   *PSIStats         `json:"psi,omitempty"`
 }

 type PageUsageByNUMA struct {
@ -147,7 +135,6 @@ type BlkioStats struct {
 	IoMergedRecursive       []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
 	IoTimeRecursive         []BlkioStatEntry `json:"io_time_recursive,omitempty"`
 	SectorsRecursive        []BlkioStatEntry `json:"sectors_recursive,omitempty"`
-	PSI                     *PSIStats        `json:"psi,omitempty"`
 }

 type HugetlbStats struct {
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go
@ -36,13 +36,13 @@ func IsCgroup2UnifiedMode() bool {
 		var st unix.Statfs_t
 		err := unix.Statfs(unifiedMountpoint, &st)
 		if err != nil {
-			level := logrus.WarnLevel
 			if os.IsNotExist(err) && userns.RunningInUserNS() {
-				// For rootless containers, sweep it under the rug.
-				level = logrus.DebugLevel
+				// ignore the "not found" error if running in userns
+				logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
+				isUnified = false
+				return
 			}
-			logrus.StandardLogger().Logf(level,
-				"statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err)
+			panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
 		}
 		isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
 	})
@ -217,9 +217,20 @@ func PathExists(path string) bool {
 	return true
 }

+func EnterPid(cgroupPaths map[string]string, pid int) error {
+	for _, path := range cgroupPaths {
+		if PathExists(path) {
+			if err := WriteCgroupProc(path, pid); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
 func rmdir(path string) error {
 	err := unix.Rmdir(path)
-	if err == nil || err == unix.ENOENT {
+	if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare
 		return nil
 	}
 	return &os.PathError{Op: "rmdir", Path: path, Err: err}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go
@ -236,6 +236,27 @@ func GetOwnCgroupPath(subsystem string) (string, error) {
 	return getCgroupPathHelper(subsystem, cgroup)
 }

+func GetInitCgroup(subsystem string) (string, error) {
+	if IsCgroup2UnifiedMode() {
+		return "", errUnified
+	}
+	cgroups, err := ParseCgroupFile("/proc/1/cgroup")
+	if err != nil {
+		return "", err
+	}
+
+	return getControllerPath(subsystem, cgroups)
+}
+
+func GetInitCgroupPath(subsystem string) (string, error) {
+	cgroup, err := GetInitCgroup(subsystem)
+	if err != nil {
+		return "", err
+	}
+
+	return getCgroupPathHelper(subsystem, cgroup)
+}
+
 func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
 	mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
 	if err != nil {
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go
@ -2,8 +2,8 @@ package configs

 import "fmt"

-// BlockIODevice holds major:minor format supported in blkio cgroup.
-type BlockIODevice struct {
+// blockIODevice holds major:minor format supported in blkio cgroup
+type blockIODevice struct {
 	// Major is the device's major number
 	Major int64 `json:"major"`
 	// Minor is the device's minor number
@ -12,7 +12,7 @@ type BlockIODevice struct {

 // WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
 type WeightDevice struct {
-	BlockIODevice
+	blockIODevice
 	// Weight is the bandwidth rate for the device, range is from 10 to 1000
 	Weight uint16 `json:"weight"`
 	// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
@ -41,7 +41,7 @@ func (wd *WeightDevice) LeafWeightString() string {

 // ThrottleDevice struct holds a `major:minor rate_per_second` pair
 type ThrottleDevice struct {
-	BlockIODevice
+	blockIODevice
 	// Rate is the IO rate limit per cgroup per device
 	Rate uint64 `json:"rate"`
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go
@ -84,9 +84,6 @@ type Resources struct {
 	// MEM to use
 	CpusetMems string `json:"cpuset_mems"`

-	// cgroup SCHED_IDLE
-	CPUIdle *int64 `json:"cpu_idle,omitempty"`
-
 	// Process limit; set <= `0' to disable limit.
 	PidsLimit int64 `json:"pids_limit"`

@ -158,9 +155,4 @@ type Resources struct {
 	// during Set() to figure out whether the freeze is required. Those
 	// methods may be relatively slow, thus this flag.
 	SkipFreezeOnSet bool `json:"-"`
-
-	// MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check
-	// if the new memory limits (Memory and MemorySwap) being set are lower
-	// than the current memory usage, and reject if so.
-	MemoryCheckBeforeUpdate bool `json:"memory_check_before_update"`
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go
@ -21,9 +21,9 @@ type Rlimit struct {

 // IDMap represents UID/GID Mappings for User Namespaces.
 type IDMap struct {
-	ContainerID int `json:"container_id"`
-	HostID      int `json:"host_id"`
-	Size        int `json:"size"`
+	ContainerID int64 `json:"container_id"`
+	HostID      int64 `json:"host_id"`
+	Size        int64 `json:"size"`
 }

 // Seccomp represents syscall restrictions
@ -31,13 +31,12 @@ type IDMap struct {
 // for syscalls. Additional architectures can be added by specifying them in
 // Architectures.
 type Seccomp struct {
-	DefaultAction    Action                   `json:"default_action"`
-	Architectures    []string                 `json:"architectures"`
-	Flags            []specs.LinuxSeccompFlag `json:"flags"`
-	Syscalls         []*Syscall               `json:"syscalls"`
-	DefaultErrnoRet  *uint                    `json:"default_errno_ret"`
-	ListenerPath     string                   `json:"listener_path,omitempty"`
-	ListenerMetadata string                   `json:"listener_metadata,omitempty"`
+	DefaultAction    Action     `json:"default_action"`
+	Architectures    []string   `json:"architectures"`
+	Syscalls         []*Syscall `json:"syscalls"`
+	DefaultErrnoRet  *uint      `json:"default_errno_ret"`
+	ListenerPath     string     `json:"listener_path,omitempty"`
+	ListenerMetadata string     `json:"listener_metadata,omitempty"`
 }

 // Action is taken upon rule match in Seccomp
@ -84,6 +83,9 @@ type Syscall struct {
 	Args     []*Arg `json:"args"`
 }

+// TODO Windows. Many of these fields should be factored out into those parts
+// which are common across platforms, and those which are platform specific.
+
 // Config defines configuration options for executing a process inside a contained environment.
 type Config struct {
 	// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
@ -119,9 +121,6 @@ type Config struct {
 	// Hostname optionally sets the container's hostname if provided
 	Hostname string `json:"hostname"`

-	// Domainname optionally sets the container's domainname if provided
-	Domainname string `json:"domainname"`
-
 	// Namespaces specifies the container's namespaces that it should setup when cloning the init process
 	// If a namespace is not provided that namespace is shared from the container's parent process
 	Namespaces Namespaces `json:"namespaces"`
@ -159,11 +158,11 @@ type Config struct {
 	// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
 	OomScoreAdj *int `json:"oom_score_adj,omitempty"`

-	// UIDMappings is an array of User ID mappings for User Namespaces
-	UIDMappings []IDMap `json:"uid_mappings"`
+	// UidMappings is an array of User ID mappings for User Namespaces
+	UidMappings []IDMap `json:"uid_mappings"`

-	// GIDMappings is an array of Group ID mappings for User Namespaces
-	GIDMappings []IDMap `json:"gid_mappings"`
+	// GidMappings is an array of Group ID mappings for User Namespaces
+	GidMappings []IDMap `json:"gid_mappings"`

 	// MaskPaths specifies paths within the container's rootfs to mask over with a bind
 	// mount pointing to /dev/null as to prevent reads of the file.
@ -212,13 +211,6 @@ type Config struct {
 	// RootlessCgroups is set when unlikely to have the full access to cgroups.
 	// When RootlessCgroups is set, cgroups errors are ignored.
 	RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
-
-	// Do not try to remount a bind mount again after the first attempt failed on source
-	// filesystems that have nodev, noexec, nosuid, noatime, relatime, strictatime, nodiratime set
-	NoMountFallback bool `json:"no_mount_fallback,omitempty"`
-
-	// TimeOffsets specifies the offset for supporting time namespaces.
-	TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"`
 }

 type (
@ -285,7 +277,6 @@ type Capabilities struct {
 	Ambient []string
 }

-// Deprecated: use (Hooks).Run instead.
 func (hooks HookList) RunHooks(state *specs.State) error {
 	for i, h := range hooks {
 		if err := h.Run(state); err != nil {
@ -342,18 +333,6 @@ func (hooks *Hooks) MarshalJSON() ([]byte, error) {
 	})
 }

-// Run executes all hooks for the given hook name.
-func (hooks Hooks) Run(name HookName, state *specs.State) error {
-	list := hooks[name]
-	for i, h := range list {
-		if err := h.Run(state); err != nil {
-			return fmt.Errorf("error running %s hook #%d: %w", name, i, err)
-		}
-	}
-
-	return nil
-}
-
 type Hook interface {
 	// Run executes the hook with the provided state.
 	Run(*specs.State) error
@ -414,7 +393,7 @@ func (c Command) Run(s *specs.State) error {
 	go func() {
 		err := cmd.Wait()
 		if err != nil {
-			err = fmt.Errorf("%w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
+			err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
 		}
 		errC <- err
 	}()
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go
@ -1,6 +1,10 @@
 package configs

-import "errors"
+import (
+	"errors"
+	"fmt"
+	"math"
+)

 var (
 	errNoUIDMap   = errors.New("User namespaces enabled, but no uid mappings found.")
@ -13,14 +17,21 @@ var (
 // different when user namespaces are enabled.
 func (c Config) HostUID(containerId int) (int, error) {
 	if c.Namespaces.Contains(NEWUSER) {
-		if len(c.UIDMappings) == 0 {
+		if c.UidMappings == nil {
 			return -1, errNoUIDMap
 		}
-		id, found := c.hostIDFromMapping(containerId, c.UIDMappings)
+		id, found := c.hostIDFromMapping(int64(containerId), c.UidMappings)
 		if !found {
 			return -1, errNoUserMap
 		}
-		return id, nil
+		// If we are a 32-bit binary running on a 64-bit system, it's possible
+		// the mapped user is too large to store in an int, which means we
+		// cannot do the mapping. We can't just return an int64, because
+		// os.Setuid() takes an int.
+		if id > math.MaxInt {
+			return -1, fmt.Errorf("mapping for uid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
+		}
+		return int(id), nil
 	}
 	// Return unchanged id.
 	return containerId, nil
@ -36,14 +47,21 @@ func (c Config) HostRootUID() (int, error) {
 // different when user namespaces are enabled.
 func (c Config) HostGID(containerId int) (int, error) {
 	if c.Namespaces.Contains(NEWUSER) {
-		if len(c.GIDMappings) == 0 {
+		if c.GidMappings == nil {
 			return -1, errNoGIDMap
 		}
-		id, found := c.hostIDFromMapping(containerId, c.GIDMappings)
+		id, found := c.hostIDFromMapping(int64(containerId), c.GidMappings)
 		if !found {
 			return -1, errNoGroupMap
 		}
-		return id, nil
+		// If we are a 32-bit binary running on a 64-bit system, it's possible
+		// the mapped group is too large to store in an int, which means we
+		// cannot do the mapping. We can't just return an int64, because
+		// os.Setgid() takes an int.
+		if id > math.MaxInt {
+			return -1, fmt.Errorf("mapping for gid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
+		}
+		return int(id), nil
 	}
 	// Return unchanged id.
 	return containerId, nil
@ -57,7 +75,7 @@ func (c Config) HostRootGID() (int, error) {

 // Utility function that gets a host ID for a container ID from user namespace map
 // if that ID is present in the map.
-func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
+func (c Config) hostIDFromMapping(containerID int64, uMap []IDMap) (int64, bool) {
 	for _, m := range uMap {
 		if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
 			hostID := m.HostID + (containerID - m.ContainerID)
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go
@ -1,7 +1,48 @@
 package configs

+import "golang.org/x/sys/unix"
+
 const (
 	// EXT_COPYUP is a directive to copy up the contents of a directory when
 	// a tmpfs is mounted over it.
 	EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning
 )
+
+type Mount struct {
+	// Source path for the mount.
+	Source string `json:"source"`
+
+	// Destination path for the mount inside the container.
+	Destination string `json:"destination"`
+
+	// Device the mount is for.
+	Device string `json:"device"`
+
+	// Mount flags.
+	Flags int `json:"flags"`
+
+	// Propagation Flags
+	PropagationFlags []int `json:"propagation_flags"`
+
+	// Mount data applied to the mount.
+	Data string `json:"data"`
+
+	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
+	Relabel string `json:"relabel"`
+
+	// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
+	RecAttr *unix.MountAttr `json:"rec_attr"`
+
+	// Extensions are additional flags that are specific to runc.
+	Extensions int `json:"extensions"`
+
+	// Optional Command to be run before Source is mounted.
+	PremountCmds []Command `json:"premount_cmds"`
+
+	// Optional Command to be run after Source is mounted.
+	PostmountCmds []Command `json:"postmount_cmds"`
+}
+
+func (m *Mount) IsBind() bool {
+	return m.Flags&unix.MS_BIND != 0
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_linux.go
@ -1,52 +0,0 @@
-package configs
-
-import "golang.org/x/sys/unix"
-
-type Mount struct {
-	// Source path for the mount.
-	Source string `json:"source"`
-
-	// Destination path for the mount inside the container.
-	Destination string `json:"destination"`
-
-	// Device the mount is for.
-	Device string `json:"device"`
-
-	// Mount flags.
-	Flags int `json:"flags"`
-
-	// Propagation Flags
-	PropagationFlags []int `json:"propagation_flags"`
-
-	// Mount data applied to the mount.
-	Data string `json:"data"`
-
-	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
-	Relabel string `json:"relabel"`
-
-	// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
-	RecAttr *unix.MountAttr `json:"rec_attr"`
-
-	// Extensions are additional flags that are specific to runc.
-	Extensions int `json:"extensions"`
-
-	// UIDMappings is used to changing file user owners w/o calling chown.
-	// Note that, the underlying filesystem should support this feature to be
-	// used.
-	// Every mount point could have its own mapping.
-	UIDMappings []IDMap `json:"uid_mappings,omitempty"`
-
-	// GIDMappings is used to changing file group owners w/o calling chown.
-	// Note that, the underlying filesystem should support this feature to be
-	// used.
-	// Every mount point could have its own mapping.
-	GIDMappings []IDMap `json:"gid_mappings,omitempty"`
-}
-
-func (m *Mount) IsBind() bool {
-	return m.Flags&unix.MS_BIND != 0
-}
-
-func (m *Mount) IsIDMapped() bool {
-	return len(m.UIDMappings) > 0 || len(m.GIDMappings) > 0
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_unsupported.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_unsupported.go
@ -1,10 +0,0 @@
-//go:build !linux
-// +build !linux
-
-package configs
-
-type Mount struct{}
-
-func (m *Mount) IsBind() bool {
-	return false
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go
@ -14,7 +14,6 @@ const (
 	NEWIPC    NamespaceType = "NEWIPC"
 	NEWUSER   NamespaceType = "NEWUSER"
 	NEWCGROUP NamespaceType = "NEWCGROUP"
-	NEWTIME   NamespaceType = "NEWTIME"
 )

 var (
@ -39,8 +38,6 @@ func NsName(ns NamespaceType) string {
 		return "uts"
 	case NEWCGROUP:
 		return "cgroup"
-	case NEWTIME:
-		return "time"
 	}
 	return ""
 }
@ -75,7 +72,6 @@ func NamespaceTypes() []NamespaceType {
 		NEWPID,
 		NEWNS,
 		NEWCGROUP,
-		NEWTIME,
 	}
 }

--- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go
@ -17,7 +17,6 @@ var namespaceInfo = map[NamespaceType]int{
 	NEWUTS:    unix.CLONE_NEWUTS,
 	NEWPID:    unix.CLONE_NEWPID,
 	NEWCGROUP: unix.CLONE_NEWCGROUP,
-	NEWTIME:   unix.CLONE_NEWTIME,
 }

 // CloneFlags parses the container's Namespaces options to set the correct
@ -32,15 +31,3 @@ func (n *Namespaces) CloneFlags() uintptr {
 	}
 	return uintptr(flag)
 }
-
-// IsPrivate tells whether the namespace of type t is configured as private
-// (i.e. it exists and is not shared).
-func (n Namespaces) IsPrivate(t NamespaceType) bool {
-	for _, v := range n {
-		if v.Type == t {
-			return v.Path == ""
-		}
-	}
-	// Not found, so implicitly sharing a parent namespace.
-	return false
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go
@ -1,4 +1,5 @@
 package userns

 // RunningInUserNS detects whether we are currently running in a user namespace.
+// Originally copied from github.com/lxc/lxd/shared/util.go
 var RunningInUserNS = runningInUserNS
--- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go
@ -3,7 +3,14 @@

 package userns

-func FuzzUIDMap(uidmap []byte) int {
-	_ = uidMapInUserNS(string(uidmap))
+import (
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/user"
+)
+
+func FuzzUIDMap(data []byte) int {
+	uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
+	_ = uidMapInUserNS(uidmap)
 	return 1
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go
@ -1,10 +1,9 @@
 package userns

 import (
-	"bufio"
-	"fmt"
-	"os"
 	"sync"
+
+	"github.com/opencontainers/runc/libcontainer/user"
 )

 var (
@ -13,43 +12,26 @@ var (
 )

 // runningInUserNS detects whether we are currently running in a user namespace.
-//
-// Originally copied from https://github.com/lxc/incus/blob/e45085dd42f826b3c8c3228e9733c0b6f998eafe/shared/util.go#L678-L700.
+// Originally copied from github.com/lxc/lxd/shared/util.go
 func runningInUserNS() bool {
 	nsOnce.Do(func() {
-		file, err := os.Open("/proc/self/uid_map")
+		uidmap, err := user.CurrentProcessUIDMap()
 		if err != nil {
-			// This kernel-provided file only exists if user namespaces are supported.
+			// This kernel-provided file only exists if user namespaces are supported
 			return
 		}
-		defer file.Close()
-
-		buf := bufio.NewReader(file)
-		l, _, err := buf.ReadLine()
-		if err != nil {
-			return
-		}
-
-		inUserNS = uidMapInUserNS(string(l))
+		inUserNS = uidMapInUserNS(uidmap)
 	})
 	return inUserNS
 }

-func uidMapInUserNS(uidMap string) bool {
-	if uidMap == "" {
-		// File exist but empty (the initial state when userns is created,
-		// see user_namespaces(7)).
-		return true
-	}
-
-	var a, b, c int64
-	if _, err := fmt.Sscanf(uidMap, "%d %d %d", &a, &b, &c); err != nil {
-		// Assume we are in a regular, non user namespace.
+func uidMapInUserNS(uidmap []user.IDMap) bool {
+	/*
+	 * We assume we are in the initial user namespace if we have a full
+	 * range - 4294967295 uids starting at uid 0.
+	 */
+	if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
 		return false
 	}
-
-	// As per user_namespaces(7), /proc/self/uid_map of
-	// the initial user namespace shows 0 0 4294967295.
-	initNS := a == 0 && b == 0 && c == 4294967295
-	return !initNS
+	return true
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps.c
+++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps.c
@ -0,0 +1,79 @@
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <stdlib.h>
+
+/*
+ * All of the code here is run inside an async-signal-safe context, so we need
+ * to be careful to not call any functions that could cause issues. In theory,
+ * since we are a Go program, there are fewer restrictions, but in practice it's
+ * better to be safe than sorry.
+ *
+ * The only exception is exit, which we need to call to make sure we don't
+ * return into runc.
+ */
+
+void bail(int pipefd, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vdprintf(pipefd, fmt, args);
+	va_end(args);
+
+	exit(1);
+}
+
+int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd)
+{
+	char buffer[4096] = { 0 };
+
+	pid_t child = fork();
+	if (child != 0)
+		return child;
+	/* In the child: the parent already returned with our pid (or -1 if fork failed). */
+
+	/* Join the target userns; any failure is reported on errfd and exits via bail(). */
+	int nsfd = open(userns_path, O_RDONLY);
+	if (nsfd < 0)
+		bail(errfd, "open userns path %s failed: %m", userns_path);
+
+	int err = setns(nsfd, CLONE_NEWUSER);
+	if (err < 0)
+		bail(errfd, "setns %s failed: %m", userns_path);
+
+	close(nsfd);
+
+	/* Pipe the requested file contents to outfd, one buffer-sized chunk at a time. */
+	int fd = open(path, O_RDONLY);
+	if (fd < 0)
+		bail(errfd, "open %s in userns %s failed: %m", path, userns_path);
+
+	int nread, ntotal = 0;
+	while ((nread = read(fd, buffer, sizeof(buffer))) != 0) {
+		if (nread < 0)
+			bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal);
+		ntotal += nread;
+		/* write() may be short: resume after the bytes already written, not from the start. */
+		int nwritten = 0;
+		while (nwritten < nread) {
+			int n = write(outfd, buffer + nwritten, nread - nwritten);
+			if (n < 0)
+				bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m",
+				     nread - nwritten, path, nwritten);
+			nwritten += n;
+		}
+		if (nread != nwritten)
+			bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten);
+	}
+
+	close(fd);
+	close(outfd);
+	close(errfd);
+
+	/* We must exit here, otherwise we would return into a forked runc. */
+	exit(0);
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps_linux.go
@ -0,0 +1,186 @@
+//go:build linux
+
+package userns
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io"
+	"os"
+	"unsafe"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/sirupsen/logrus"
+)
+
+/*
+#include <stdlib.h>
+extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd);
+*/
+import "C"
+
+func parseIdmapData(data []byte) (ms []configs.IDMap, err error) {
+	scanner := bufio.NewScanner(bytes.NewReader(data))
+	for scanner.Scan() {
+		var m configs.IDMap
+		line := scanner.Text()
+		if _, err := fmt.Sscanf(line, "%d %d %d", &m.ContainerID, &m.HostID, &m.Size); err != nil {
+			return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", line, err)
+		}
+		ms = append(ms, m)
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, fmt.Errorf("parsing id map failed: %w", err)
+	}
+	return ms, nil
+}
+
+// Do something equivalent to nsenter --user=<nsPath> cat <path>, but more
+// efficiently. Returns the contents of the requested file from within the user
+// namespace.
+func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) {
+	rdr, wtr, err := os.Pipe()
+	if err != nil {
+		return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err)
+	}
+	defer rdr.Close()
+	defer wtr.Close()
+
+	errRdr, errWtr, err := os.Pipe()
+	if err != nil {
+		return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err)
+	}
+	defer errRdr.Close()
+	defer errWtr.Close()
+
+	cNsPath := C.CString(nsPath)
+	defer C.free(unsafe.Pointer(cNsPath))
+	cPath := C.CString(path)
+	defer C.free(unsafe.Pointer(cPath))
+
+	childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd()))
+
+	if childPid < 0 {
+		return nil, fmt.Errorf("failed to spawn fork for userns")
+	} else if childPid == 0 {
+		// this should never happen
+		panic("runc executing inside fork child -- unsafe state!")
+	}
+
+	// We are in the parent -- close the write end of the pipe before reading.
+	wtr.Close()
+	output, err := io.ReadAll(rdr)
+	rdr.Close()
+	if err != nil {
+		return nil, fmt.Errorf("reading from userns spawn failed: %w", err)
+	}
+
+	// Ditto for the error pipe.
+	errWtr.Close()
+	errOutput, err := io.ReadAll(errRdr)
+	errRdr.Close()
+	if err != nil {
+		return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err)
+	}
+	errOutput = bytes.TrimSpace(errOutput)
+
+	// Clean up the child.
+	child, err := os.FindProcess(int(childPid))
+	if err != nil {
+		return nil, fmt.Errorf("could not find userns spawn process: %w", err)
+	}
+	state, err := child.Wait()
+	if err != nil {
+		return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err)
+	}
+	if !state.Success() {
+		errStr := string(errOutput)
+		if errStr == "" {
+			errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode())
+		}
+		return nil, fmt.Errorf("userns spawn: %s", errStr)
+	} else if len(errOutput) > 0 {
+		// We can just ignore weird output in the error pipe if the process
+		// didn't bail(), but for completeness output for debugging.
+		logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput))
+	}
+	// The subprocess succeeded, return whatever it wrote to the pipe.
+	return output, nil
+}
+
+func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) {
+	var (
+		pid         int
+		extra       rune
+		tryFastPath bool
+	)
+
+	// nsPath is usually of the form /proc/<pid>/ns/user, which means that we
+	// already have a pid that is part of the user namespace and thus we can
+	// just use the pid to read from /proc/<pid>/*id_map.
+	//
+	// Note that Sscanf doesn't consume the whole input, so we check for any
+	// trailing data with %c. That way, we can be sure the pattern matched
+	// /proc/$pid/ns/user _exactly_ iff n == 1.
+	if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 {
+		tryFastPath = pid > 0
+	}
+
+	for _, mapType := range []struct {
+		name  string
+		idMap *[]configs.IDMap
+	}{
+		{"uid_map", &uidMap},
+		{"gid_map", &gidMap},
+	} {
+		var mapData []byte
+
+		if tryFastPath {
+			path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name)
+			data, err := os.ReadFile(path)
+			if err != nil {
+				// Do not error out here -- we need to try the slow path if the
+				// fast path failed.
+				logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err)
+			} else {
+				mapData = data
+			}
+		} else {
+			logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath)
+		}
+
+		if mapData == nil {
+			// We have to actually join the namespace if we cannot take the
+			// fast path. The path is resolved with respect to the child
+			// process, so just use /proc/self.
+			data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name)
+			if err != nil {
+				return nil, nil, err
+			}
+			mapData = data
+		}
+		idMap, err := parseIdmapData(mapData)
+		if err != nil {
+			return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err)
+		}
+		*mapType.idMap = idMap
+	}
+
+	return uidMap, gidMap, nil
+}
+
+// IsSameMapping returns whether or not the two id mappings are the same. Note
+// that if the order of the mappings is different, or a mapping has been split,
+// the mappings will be considered different.
+func IsSameMapping(a, b []configs.IDMap) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for idx := range a {
+		if a[idx] != b[idx] {
+			return false
+		}
+	}
+	return true
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go
@ -3,6 +3,8 @@

 package userns

+import "github.com/opencontainers/runc/libcontainer/user"
+
 // runningInUserNS is a stub for non-Linux systems
 // Always returns false
 func runningInUserNS() bool {
@ -11,6 +13,6 @@ func runningInUserNS() bool {

 // uidMapInUserNS is a stub for non-Linux systems
 // Always returns false
-func uidMapInUserNS(uidMap string) bool {
+func uidMapInUserNS(uidmap []user.IDMap) bool {
 	return false
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
@ -19,14 +19,13 @@ package utils
 import (
 	"fmt"
 	"os"
-	"runtime"

 	"golang.org/x/sys/unix"
 )

-// MaxNameLen is the maximum length of the name of a file descriptor being sent
-// using SendFile. The name of the file handle returned by RecvFile will never be
-// larger than this value.
+// MaxNameLen is the maximum length of the name of a file descriptor being
+// sent using SendFd. The name of the file handle returned by RecvFd will never
+// be larger than this value.
 const MaxNameLen = 4096

 // oobSpace is the size of the oob slice required to store a single FD. Note
@ -34,21 +33,26 @@ const MaxNameLen = 4096
 // so sizeof(fd) = 4.
 var oobSpace = unix.CmsgSpace(4)

-// RecvFile waits for a file descriptor to be sent over the given AF_UNIX
+// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
 // socket. The file name of the remote file descriptor will be recreated
 // locally (it is sent as non-auxiliary data in the same payload).
-func RecvFile(socket *os.File) (_ *os.File, Err error) {
+func RecvFd(socket *os.File) (*os.File, error) {
+	// For some reason, unix.Recvmsg uses the length rather than the capacity
+	// when passing the msg_controllen and other attributes to recvmsg.  So we
+	// have to actually set the length.
 	name := make([]byte, MaxNameLen)
 	oob := make([]byte, oobSpace)

 	sockfd := socket.Fd()
-	n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
+	n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
 	if err != nil {
 		return nil, err
 	}
+
 	if n >= MaxNameLen || oobn != oobSpace {
-		return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
+		return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
 	}
+
 	// Truncate.
 	name = name[:n]
 	oob = oob[:oobn]
@ -57,63 +61,36 @@ func RecvFile(socket *os.File) (_ *os.File, Err error) {
 	if err != nil {
 		return nil, err
 	}
-
-	// We cannot control how many SCM_RIGHTS we receive, and upon receiving
-	// them all of the descriptors are installed in our fd table, so we need to
-	// parse all of the SCM_RIGHTS we received in order to close all of the
-	// descriptors on error.
-	var fds []int
-	defer func() {
-		for i, fd := range fds {
-			if i == 0 && Err == nil {
-				// Only close the first one on error.
-				continue
-			}
-			// Always close extra ones.
-			_ = unix.Close(fd)
-		}
-	}()
-	var lastErr error
-	for _, scm := range scms {
-		if scm.Header.Type == unix.SCM_RIGHTS {
-			scmFds, err := unix.ParseUnixRights(&scm)
-			if err != nil {
-				lastErr = err
-			} else {
-				fds = append(fds, scmFds...)
-			}
-		}
-	}
-	if lastErr != nil {
-		return nil, lastErr
-	}
-
-	// We do this after collecting the fds to make sure we close them all when
-	// returning an error here.
 	if len(scms) != 1 {
 		return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
 	}
+	scm := scms[0]
+
+	fds, err := unix.ParseUnixRights(&scm)
+	if err != nil {
+		return nil, err
+	}
 	if len(fds) != 1 {
 		return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
 	}
-	return os.NewFile(uintptr(fds[0]), string(name)), nil
+	fd := uintptr(fds[0])
+
+	return os.NewFile(fd, string(name)), nil
 }

-// SendFile sends a file over the given AF_UNIX socket. file.Name() is also
-// included so that if the other end uses RecvFile, the file will have the same
-// name information.
-func SendFile(socket *os.File, file *os.File) error {
-	name := file.Name()
+// SendFd sends a file descriptor over the given AF_UNIX socket. In
+// addition, the given name will also be sent as non-auxiliary data in
+// the same payload (allowing contextual information to be sent along
+// with the file descriptor).
+func SendFd(socket *os.File, name string, fd uintptr) error {
 	if len(name) >= MaxNameLen {
 		return fmt.Errorf("sendfd: filename too long: %s", name)
 	}
-	err := SendRawFd(socket, name, file.Fd())
-	runtime.KeepAlive(file)
-	return err
+	return SendFds(socket, []byte(name), int(fd))
 }

-// SendRawFd sends a specific file descriptor over the given AF_UNIX socket.
-func SendRawFd(socket *os.File, msg string, fd uintptr) error {
-	oob := unix.UnixRights(int(fd))
-	return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0)
+// SendFds sends a list of file descriptors and msg over the given AF_UNIX socket.
+func SendFds(socket *os.File, msg []byte, fds ...int) error {
+	oob := unix.UnixRights(fds...)
+	return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0)
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
@ -132,16 +132,19 @@ func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
 	return fn(procfd)
 }

-// SearchLabels searches through a list of key=value pairs for a given key,
-// returning its value, and the binary flag telling whether the key exist.
-func SearchLabels(labels []string, key string) (string, bool) {
-	key += "="
-	for _, s := range labels {
-		if strings.HasPrefix(s, key) {
-			return s[len(key):], true
+// SearchLabels searches a list of "key=value" pairs for the provided key and
+// returns the corresponding value, or an empty string if the key is not found.
+func SearchLabels(labels []string, query string) string {
+	for _, l := range labels {
+		parts := strings.SplitN(l, "=", 2)
+		if len(parts) < 2 {
+			continue
+		}
+		if parts[0] == query {
+			return parts[1]
 		}
 	}
-	return "", false
+	return ""
 }

 // Annotations returns the bundle path and user defined annotations from the
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
@ -5,10 +5,9 @@ package utils

 import (
 	"fmt"
-	"math"
 	"os"
 	"strconv"
-	"sync"
+	_ "unsafe" // for go:linkname

 	"golang.org/x/sys/unix"
 )
@ -25,38 +24,11 @@ func EnsureProcHandle(fh *os.File) error {
 	return nil
 }

-var (
-	haveCloseRangeCloexecBool bool
-	haveCloseRangeCloexecOnce sync.Once
-)
-
-func haveCloseRangeCloexec() bool {
-	haveCloseRangeCloexecOnce.Do(func() {
-		// Make sure we're not closing a random file descriptor.
-		tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
-		if err != nil {
-			return
-		}
-		defer unix.Close(tmpFd)
-
-		err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
-		// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
-		// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
-		// other potential error would imply that even the most basic close
-		// operation wouldn't work.
-		haveCloseRangeCloexecBool = err == nil
-	})
-	return haveCloseRangeCloexecBool
-}
-
-// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for
-// the process (except for those below the given fd value).
-func CloseExecFrom(minFd int) error {
-	if haveCloseRangeCloexec() {
-		err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
-		return os.NewSyscallError("close_range", err)
-	}
+type fdFunc func(fd int)

+// fdRangeFrom calls the passed fdFunc for each file descriptor greater or
+// equal to minFd that is open in the current process.
+func fdRangeFrom(minFd int, fn fdFunc) error {
 	fdDir, err := os.Open("/proc/self/fd")
 	if err != nil {
 		return err
@ -81,17 +53,62 @@ func CloseExecFrom(minFd int) error {
 		if fd < minFd {
 			continue
 		}
-		// Intentionally ignore errors from unix.CloseOnExec -- the cases where
-		// this might fail are basically file descriptors that have already
-		// been closed (including and especially the one that was created when
-		// os.ReadDir did the "opendir" syscall).
-		unix.CloseOnExec(fd)
+		// Ignore the file descriptor we used for readdir, as it will be closed
+		// when we return.
+		if uintptr(fd) == fdDir.Fd() {
+			continue
+		}
+		// Run the closure.
+		fn(fd)
 	}
 	return nil
 }

+// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
+// equal to minFd in the current process.
+func CloseExecFrom(minFd int) error {
+	return fdRangeFrom(minFd, unix.CloseOnExec)
+}
+
+//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
+
+// In order to make sure we do not close the internal epoll descriptors the Go
+// runtime uses, we need to ensure that we skip descriptors that match
+// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
+// unfortunately there's no other way to be sure we're only keeping the file
+// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
+func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
+
+// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the
+// current process, except for those critical to Go's runtime (such as the
+// netpoll management descriptors).
+//
+// NOTE: This function is incredibly dangerous to use in most Go code, as
+// closing file descriptors from underneath *os.File handles can lead to very
+// bad behaviour (the closed file descriptor can be re-used and then any
+// *os.File operations would apply to the wrong file). This function is only
+// intended to be called from the last stage of runc init.
+func UnsafeCloseFrom(minFd int) error {
+	// We must not close some file descriptors.
+	return fdRangeFrom(minFd, func(fd int) {
+		if runtime_IsPollDescriptor(uintptr(fd)) {
+			// These are the Go runtime's internal netpoll file descriptors.
+			// These file descriptors are operated on deep in the Go scheduler,
+			// and closing those files from underneath Go can result in panics.
+			// There is no issue with keeping them because they are not
+			// executable and are not useful to an attacker anyway. Also we
+			// don't have any choice.
+			return
+		}
+		// There's nothing we can do about errors from close(2), and the
+		// only likely error to be seen is EBADF which indicates the fd was
+		// already closed (in which case, we got what we wanted).
+		_ = unix.Close(fd)
+	})
+}
+
 // NewSockPair returns a new unix socket pair
-func NewSockPair(name string) (parent, child *os.File, err error) {
+func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
 	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
 	if err != nil {
 		return nil, nil, err