Update to runc main, removing pin to an older version

We were pinned to a specific commit to ensure that tests kept
passing. Hopefully they pass now, as we need to grab latest runc
for CVE fixes.

Also grab Buildah main to fix a build issue on FreeBSD. After a
botched manual vendor, I used Ed's treadmill script and squashed
it into this commit to make Git happy. Thanks bunches Ed.

Signed-off-by: Matt Heon <mheon@redhat.com>
This commit is contained in:
Matt Heon
2024-02-01 15:17:45 -05:00
parent 5e64d4f021
commit 2818abf849
174 changed files with 22580 additions and 922 deletions

View File

@ -8,9 +8,9 @@ The following is courtesy of our legal counsel:
Use and transfer of Docker may be subject to certain restrictions by the
United States and other governments.
United States and other governments.
It is your responsibility to ensure that your use and/or transfer does not
violate applicable laws.
violate applicable laws.
For more information, please see http://www.bis.doc.gov

View File

@ -1,24 +1,9 @@
package cgroups
import (
"errors"
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
// ErrDevicesUnsupported is an error returned when a cgroup manager
// is not configured to set device rules.
ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules")
// DevicesSetV1 and DevicesSetV2 are functions to set devices for
// cgroup v1 and v2, respectively. Unless libcontainer/cgroups/devices
// package is imported, it is set to nil, so cgroup managers can't
// manage devices.
DevicesSetV1 func(path string, r *configs.Resources) error
DevicesSetV2 func(path string, r *configs.Resources) error
)
type Manager interface {
// Apply creates a cgroup, if not yet created, and adds a process
// with the specified pid into that cgroup. A special value of -1

View File

@ -0,0 +1,386 @@
// SPDX-License-Identifier: Apache-2.0
/*
* Copyright (C) 2020 Aleksa Sarai <cyphar@cyphar.com>
* Copyright (C) 2020 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package devices
import (
"bufio"
"fmt"
"io"
"sort"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/devices"
)
// deviceMeta is a Rule without the Allow or Permissions fields, and no
// wildcard-type support. It's effectively the "match" portion of a metadata
// rule, for the purposes of our emulation.
//
// The struct contains only comparable fields, which lets it serve as the key
// type of deviceRules.
type deviceMeta struct {
// node is the device type the rule matches.
node devices.Type
// major is the device major number, or devices.Wildcard.
major int64
// minor is the device minor number, or devices.Wildcard.
minor int64
}
// deviceRule is effectively the tuple (deviceMeta, Permissions).
type deviceRule struct {
// meta identifies which device(s) the rule matches.
meta deviceMeta
// perms is the set of access modes attached to the match.
perms devices.Permissions
}
// deviceRules is a mapping of device metadata rules to the associated
// permissions in the ruleset.
type deviceRules map[deviceMeta]devices.Permissions
// orderedEntries flattens the rule map into a slice sorted by
// (major, minor, type), so callers get a stable iteration order instead of
// Go's randomized map traversal order.
func (r deviceRules) orderedEntries() []deviceRule {
	var entries []deviceRule
	for m, p := range r {
		entries = append(entries, deviceRule{meta: m, perms: p})
	}
	sort.Slice(entries, func(i, j int) bool {
		lhs, rhs := entries[i].meta, entries[j].meta
		// Compare field by field; the first differing field decides.
		switch {
		case lhs.major != rhs.major:
			return lhs.major < rhs.major
		case lhs.minor != rhs.minor:
			return lhs.minor < rhs.minor
		default:
			return lhs.node < rhs.node
		}
	})
	return entries
}
// Emulator models the state of a devices cgroup as a default policy plus a
// set of exception rules. The zero value is a deny-by-default emulator with
// no exceptions.
type Emulator struct {
// defaultAllow is the default policy: when true, rules holds the denied
// exceptions; when false, rules holds the allowed exceptions.
defaultAllow bool
// rules maps each exception's match to its permissions. It is created
// lazily by addRule and may be nil.
rules deviceRules
}
// IsBlacklist reports whether the emulator's default policy is to allow
// access, i.e. whether its rules are deny exceptions.
func (e *Emulator) IsBlacklist() bool {
return e.defaultAllow
}
// IsAllowAll reports whether the emulator allows everything: the default
// policy is allow and there are no exception rules.
func (e *Emulator) IsAllowAll() bool {
	if !e.IsBlacklist() {
		return false
	}
	return len(e.rules) == 0
}
// parseLine parses one "devices.list"-style entry of the form
// "<type> <major>:<minor> <perms>".
//
// It returns (nil, nil) for an entry of type "a", a populated rule for "b"
// and "c" entries, and an error when the entry does not have exactly four
// fields, names an unknown type, has a non-numeric (and non-"*") major or
// minor, or has empty or invalid permissions.
func parseLine(line string) (*deviceRule, error) {
	// Splitting on both ' ' and ':' turns "c 1:3 rwm" into four tokens.
	isSep := func(r rune) bool { return r == ' ' || r == ':' }
	tokens := strings.FieldsFunc(line, isSep)
	if len(tokens) != 4 {
		return nil, fmt.Errorf("malformed devices.list rule %s", line)
	}
	kind, majorStr, minorStr, access := tokens[0], tokens[1], tokens[2], tokens[3]

	var parsed deviceRule

	// Device type.
	switch kind {
	case "b":
		parsed.meta.node = devices.BlockDevice
	case "c":
		parsed.meta.node = devices.CharDevice
	case "a":
		// Super-special case -- "a" always means every device with every
		// access mode. In fact, for devices.list this actually indicates that
		// the cgroup is in black-list mode.
		// TODO: Double-check that the entire file is "a *:* rwm".
		return nil, nil
	default:
		return nil, fmt.Errorf("unknown device type %q", kind)
	}

	// parseNumber converts a major/minor field, mapping "*" to the wildcard.
	parseNumber := func(field, label string) (int64, error) {
		if field == "*" {
			return devices.Wildcard, nil
		}
		n, err := strconv.ParseUint(field, 10, 32)
		if err != nil {
			return 0, fmt.Errorf("invalid %s number: %w", label, err)
		}
		return int64(n), nil
	}

	// Major first, then minor, so the first bad field is the one reported.
	var err error
	if parsed.meta.major, err = parseNumber(majorStr, "major"); err != nil {
		return nil, err
	}
	if parsed.meta.minor, err = parseNumber(minorStr, "minor"); err != nil {
		return nil, err
	}

	// Access permissions.
	parsed.perms = devices.Permissions(access)
	if !parsed.perms.IsValid() || parsed.perms.IsEmpty() {
		return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", access)
	}
	return &parsed, nil
}
// addRule records an exception, merging its permissions with any permissions
// already stored for the same match. It always returns nil.
func (e *Emulator) addRule(rule deviceRule) error { //nolint:unparam
	// The rule map is allocated on first use.
	if e.rules == nil {
		e.rules = make(map[deviceMeta]devices.Permissions)
	}
	// Looking up a missing key yields empty permissions, so the union below
	// covers both the "new rule" and the "extend existing rule" cases.
	e.rules[rule.meta] = rule.perms.Union(e.rules[rule.meta])
	return nil
}
// rmRule removes the given permissions from the exception that exactly
// matches rule.meta, deleting the exception once no permissions remain.
// Removing permissions from an exception that does not exist is a no-op.
//
// It returns an error if any of the permissions to remove are granted by a
// *different* wildcard rule that also covers this device. cgroupv1 silently
// ignores such "hole punching" requests, which would give users a false sense
// of security, so we deliberately diverge from it and refuse instead.
// Splitting the wildcard rule is not an option: there are 2^32 possible major
// and minor numbers, which would exhaust kernel memory quickly (and be slow,
// since the kernel keeps exceptions in a linked list).
func (e *Emulator) rmRule(rule deviceRule) error {
	target := rule.meta

	// Every wildcard form that could also cover the target device.
	wildcards := [...]deviceMeta{
		{node: target.node, major: devices.Wildcard, minor: target.minor},
		{node: target.node, major: target.major, minor: devices.Wildcard},
		{node: target.node, major: devices.Wildcard, minor: devices.Wildcard},
	}
	for _, wc := range wildcards {
		// A wildcard identical to the requested rule is the exact match
		// handled below, not a partial one.
		if wc == target {
			continue
		}
		// Only overlapping permission sets are a problem.
		wcPerms := e.rules[wc]
		if wcPerms.Intersection(rule.perms).IsEmpty() {
			continue
		}
		return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, wc, wcPerms)
	}

	// Subtract the requested permissions from the exact-match rule.
	remaining := e.rules[target].Difference(rule.perms)
	if !remaining.IsEmpty() {
		e.rules[target] = remaining
		return nil
	}
	delete(e.rules, target)
	// TODO: The actual cgroup code doesn't care if an exception didn't exist
	//       during removal, so not erroring out here is /accurate/ but quite
	//       worrying. Maybe we should do additional validation, but again we
	//       have to worry about backwards-compatibility.
	return nil
}
// allow applies an "allow" request to the emulator.
//
// A nil rule or a wildcard-type rule resets the emulator to black-list mode
// (allow by default, no exceptions). Otherwise the rule either removes a deny
// exception (black-list mode) or adds an allow exception (white-list mode).
func (e *Emulator) allow(rule *deviceRule) error {
	if rule == nil || rule.meta.node == devices.WildcardDevice {
		// Allow-all: drop every exception and switch to black-list mode.
		*e = Emulator{defaultAllow: true, rules: nil}
		return nil
	}
	if e.defaultAllow {
		return wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception")
	}
	return wrapErr(e.addRule(*rule), "unable to add 'allow' exception")
}
// deny applies a "deny" request to the emulator.
//
// A nil rule or a wildcard-type rule resets the emulator to white-list mode
// (deny by default, no exceptions). Otherwise the rule either adds a deny
// exception (black-list mode) or removes an allow exception (white-list mode).
func (e *Emulator) deny(rule *deviceRule) error {
	if rule == nil || rule.meta.node == devices.WildcardDevice {
		// Deny-all: drop every exception and switch to white-list mode.
		*e = Emulator{defaultAllow: false, rules: nil}
		return nil
	}
	if e.defaultAllow {
		return wrapErr(e.addRule(*rule), "unable to add 'deny' exception")
	}
	return wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception")
}
// Apply updates the emulator state with a single device rule, dispatching to
// the allow or deny handling according to rule.Allow. It returns an error if
// the rule's type cannot be used with cgroups or if the emulator rejects the
// rule.
func (e *Emulator) Apply(rule devices.Rule) error {
	if !rule.Type.CanCgroup() {
		return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
	}

	// Wildcard-type rules are passed on as nil, which allow/deny treat as a
	// request to reset the whole emulator.
	var inner *deviceRule
	if rule.Type != devices.WildcardDevice {
		inner = &deviceRule{
			meta: deviceMeta{
				node:  rule.Type,
				major: rule.Major,
				minor: rule.Minor,
			},
			perms: rule.Permissions,
		}
	}

	if !rule.Allow {
		return e.deny(inner)
	}
	return e.allow(inner)
}
// EmulatorFromList reads a "devices.list"-like source line by line and
// returns an Emulator representing the state of that devices cgroup.
//
// Black-list cgroups cannot be fully reconstructed because of limitations in
// the devices cgroup API, so such cgroups are always treated as "allow all".
// An error is returned if a line cannot be parsed, a rule cannot be applied,
// or reading from list fails.
func EmulatorFromList(list io.Reader) (*Emulator, error) {
	// Normally cgroups are in black-list mode by default, but the only way to
	// detect the current mode is whether devices.list has an allow-all rule.
	// So start as a white-list and let an "a *:* rwm" entry flip the mode.
	emu := &Emulator{defaultAllow: false}

	scanner := bufio.NewScanner(list)
	for scanner.Scan() {
		text := scanner.Text()
		parsed, err := parseLine(text)
		if err != nil {
			return nil, fmt.Errorf("error parsing line %q: %w", text, err)
		}
		// "devices.list" is an allow list. In black-list mode this means we
		// have no idea what rules are in play, so Transition() has to be very
		// careful.
		if err := emu.allow(parsed); err != nil {
			return nil, fmt.Errorf("error adding devices.list rule: %w", err)
		}
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading devices.list lines: %w", err)
	}
	return emu, nil
}
// Transition computes the minimally-disruptive list of rules that must be
// applied to a devices cgroup in state source to bring it to state target.
// Rules that are already in effect are not re-applied, and a disruptive rule
// (allow-all or deny-all) is emitted only when necessary.
//
// This function is the sole reason for all of Emulator -- it lets us update a
// container's cgroups without causing spurious device errors (if possible).
//
// The returned rules are emitted in a deterministic order (this is to aid
// testing). The error result is always nil.
func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
	var (
		out      []*devices.Rule
		previous = source.rules
	)

	// A differing default policy requires a "disruptive" rule to switch the
	// cgroup over. A black-list source requires one as well: "devices.list"
	// cannot tell us which deny rules are in place, so we must start fresh.
	if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
		out = append(out, &devices.Rule{
			Type:        'a',
			Major:       -1,
			Minor:       -1,
			Permissions: devices.Permissions("rwm"),
			Allow:       target.defaultAllow,
		})
		// After a reset none of the old rules exist any more.
		previous = nil
	}

	// emit appends a rule for meta unless there are no permissions to change.
	emit := func(meta deviceMeta, perms devices.Permissions, allow bool) {
		if perms.IsEmpty() {
			return
		}
		out = append(out, &devices.Rule{
			Type:        meta.node,
			Major:       meta.major,
			Minor:       meta.minor,
			Permissions: perms,
			Allow:       allow,
		})
	}

	// First, undo old exceptions (or the parts of them) that the target does
	// not have. This loop does nothing after a disruptive rule, because
	// previous is then empty.
	for _, old := range previous.orderedEntries() {
		dropped := old.perms.Difference(target.rules[old.meta])
		emit(old.meta, dropped, target.defaultAllow)
	}

	// Then add the exceptions (or the parts of them) that are new in the
	// target. Permissions present on both sides are filtered out, though this
	// isn't strictly necessary.
	for _, want := range target.rules.orderedEntries() {
		gained := want.perms.Difference(previous[want.meta])
		emit(want.meta, gained, !target.defaultAllow)
	}

	return out, nil
}
// Rules returns the minimum set of rules needed to turn a *deny-all* cgroup
// into the emulated filter state. Note that deny-all is not the state of a
// default cgroupv1 cgroup, which is allow-all. This is just Transition() with
// an empty deny-by-default emulator as the source.
func (e *Emulator) Rules() ([]*devices.Rule, error) {
	return (&Emulator{defaultAllow: false}).Transition(e)
}
// wrapErr prefixes err with text, keeping err reachable through
// errors.Is/errors.As/errors.Unwrap. A nil err is returned as nil so callers
// can wrap the result of a call unconditionally.
//
// text is inserted as plain data and never interpreted as a format string,
// so '%' characters in it appear verbatim in the message.
func wrapErr(err error, text string) error {
	if err == nil {
		return nil
	}
	return fmt.Errorf("%s: %w", text, err)
}

View File

@ -0,0 +1,208 @@
// Package devicefilter contains eBPF device filter program
//
// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
//
// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
package devicefilter
import (
"errors"
"fmt"
"math"
"strconv"
"github.com/cilium/ebpf/asm"
devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/devices"
"golang.org/x/sys/unix"
)
const (
// license is the license string returned by DeviceFilter alongside the
// generated program.
// license string format is same as kernel MODULE_LICENSE macro
license = "Apache"
)
// DeviceFilter returns the eBPF device filter program for the given device
// rules, together with its license string.
//
// The rules are first run through the devices emulator to obtain a minimal
// ruleset. We don't care about minimal transitions in cgroupv2, but going
// through the emulator guarantees the same filtering behaviour as cgroupv1,
// including the hardening against misconfiguration (such as punching holes in
// wildcard rules). An error is returned if the emulator rejects a rule, if
// the minimal ruleset is internally inconsistent, or if a rule cannot be
// translated to eBPF.
func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
	emu := &devicesemulator.Emulator{}
	for _, r := range rules {
		if err := emu.Apply(*r); err != nil {
			return nil, "", err
		}
	}
	minimal, err := emu.Rules()
	if err != nil {
		return nil, "", err
	}

	prog := &program{defaultAllow: emu.IsBlacklist()}
	prog.init()

	for i, r := range minimal {
		switch {
		case r.Type == devices.WildcardDevice:
			// Wildcard entries carry no information for the program and are
			// skipped: there should be at most one, at the very start, used
			// to set the cgroupv1 mode. Double-check that here.
			if i != 0 || r.Allow != emu.IsBlacklist() {
				return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", i, r.CgroupString())
			}
		case r.Allow == prog.defaultAllow:
			// The emulator removes rules whose action equals the default
			// action, so none should reach this point.
			return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", i, r.CgroupString())
		default:
			if err := prog.appendRule(r); err != nil {
				return nil, "", err
			}
		}
	}
	return prog.finalize(), license, nil
}
// program is an eBPF device filter under construction: a preamble written by
// init, one block per rule added by appendRule, and a default verdict added
// by finalize.
type program struct {
// insts holds the instructions generated so far.
insts asm.Instructions
// defaultAllow selects the verdict returned when no rule block matches.
defaultAllow bool
// blockID is the number of the next rule block, used to build the
// "block-N" jump labels. finalize sets it to -1 to mark the program as
// finished.
blockID int
}
// init emits the program preamble, which unpacks the context struct pointed
// to by R1 into registers for the rule blocks to compare against.
//
// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
//
//	u32 access_type
//	u32 major
//	u32 minor
func (p *program) init() {
	p.insts = append(p.insts,
		// R2 <- type (lower 16 bit of u32 access_type at R1[0])
		asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
		asm.And.Imm32(asm.R2, 0xFFFF),

		// R3 <- access (upper 16 bit of u32 access_type at R1[0]),
		// obtained with a bitwise right shift.
		asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
		asm.RSh.Imm32(asm.R3, 16),

		// R4 <- major (u32 major at R1[4])
		asm.LoadMem(asm.R4, asm.R1, 4, asm.Word),

		// R5 <- minor (u32 minor at R1[8])
		asm.LoadMem(asm.R5, asm.R1, 8, asm.Word),
	)
}
// appendRule converts an OCI rule to the relevant eBPF block and adds it to
// the in-progress filter program. In order to operate properly, it must be
// called with a "clean" rule list (generated by devices.Emulator.Rules() --
// with any "a" rules removed).
//
// Each block is labelled "block-N". It compares the request's device type,
// access bits, major and minor against the rule, jumps to "block-N+1" on the
// first mismatch, and otherwise returns the rule's verdict.
//
// An error is returned if the program has already been finalized, if the
// rule type is neither char nor block, if major or minor exceed
// math.MaxUint32, or if the permissions contain an unknown access mode.
func (p *program) appendRule(rule *devices.Rule) error {
	if p.blockID < 0 {
		return errors.New("the program is finalized")
	}

	var bpfType int32
	switch rule.Type {
	case devices.CharDevice:
		bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
	case devices.BlockDevice:
		bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
	default:
		// We do not permit 'a', nor any other types we don't know about.
		return fmt.Errorf("invalid type %q", string(rule.Type))
	}
	if rule.Major > math.MaxUint32 {
		return fmt.Errorf("invalid major %d", rule.Major)
	}
	if rule.Minor > math.MaxUint32 {
		// Report the offending minor number (this used to print the major).
		return fmt.Errorf("invalid minor %d", rule.Minor)
	}
	hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
	hasMinor := rule.Minor >= 0

	bpfAccess := int32(0)
	for _, r := range rule.Permissions {
		switch r {
		case 'r':
			bpfAccess |= unix.BPF_DEVCG_ACC_READ
		case 'w':
			bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
		case 'm':
			bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
		default:
			return fmt.Errorf("unknown device access %v", r)
		}
	}
	// If the access is rwm, skip the check.
	hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)

	var (
		blockSym     = "block-" + strconv.Itoa(p.blockID)
		nextBlockSym = "block-" + strconv.Itoa(p.blockID+1)
		// Index the first instruction of this block will occupy.
		blockStart = len(p.insts)
	)
	p.insts = append(p.insts,
		// if (R2 != bpfType) goto next
		asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
	)
	if hasAccess {
		p.insts = append(p.insts,
			// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
			// i.e. skip this block unless every requested access bit is
			// covered by the rule.
			asm.Mov.Reg32(asm.R1, asm.R3),
			asm.And.Imm32(asm.R1, bpfAccess),
			asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
		)
	}
	if hasMajor {
		p.insts = append(p.insts,
			// if (R4 != major) goto next
			asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
		)
	}
	if hasMinor {
		p.insts = append(p.insts,
			// if (R5 != minor) goto next
			asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
		)
	}
	p.insts = append(p.insts, acceptBlock(rule.Allow)...)

	// Label the first instruction added above so the previous block can
	// jump here on a mismatch.
	p.insts[blockStart] = p.insts[blockStart].Sym(blockSym)
	p.blockID++
	return nil
}
// finalize appends the closing block, which returns the default verdict
// (1 when defaultAllow is set, 0 otherwise), and returns the complete
// instruction list. The closing block carries the label the last rule block
// jumps to on a mismatch. Afterwards the program is marked as finalized, so
// further appendRule calls fail.
func (p *program) finalize() asm.Instructions {
	verdict := int32(0)
	if p.defaultAllow {
		verdict = 1
	}
	label := "block-" + strconv.Itoa(p.blockID)
	p.insts = append(p.insts,
		// R0 <- verdict
		asm.Mov.Imm32(asm.R0, verdict).Sym(label),
		asm.Return(),
	)
	p.blockID = -1
	return p.insts
}
// acceptBlock returns the two instructions that end a rule block: load the
// verdict into R0 (1 if accept is true, 0 otherwise) and return.
func acceptBlock(accept bool) asm.Instructions {
	verdict := int32(0)
	if accept {
		verdict = 1
	}
	return asm.Instructions{
		// R0 <- verdict
		asm.Mov.Imm32(asm.R0, verdict),
		asm.Return(),
	}
}

View File

@ -0,0 +1,253 @@
package ebpf
import (
"errors"
"fmt"
"os"
"runtime"
"sync"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// nilCloser is a no-op closer that always returns nil. It is returned as the
// closer on LoadAttachCgroupDeviceFilter's early error paths.
func nilCloser() error {
return nil
}
// findAttachedCgroupDeviceFilters returns handles to the BPF_CGROUP_DEVICE
// programs attached to the cgroup directory referred to by dirFd.
//
// The program ids are fetched with a raw BPF_PROG_QUERY syscall. If the id
// buffer is too small (ENOSPC) the query is retried with the size reported by
// the kernel, up to 10 times. Programs that cannot be opened because of a
// permission error are skipped.
//
// On success the caller owns the returned program handles. On error no
// handles are returned, and any that were already opened are closed.
func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
	type bpfAttrQuery struct {
		TargetFd    uint32
		AttachType  uint32
		QueryType   uint32
		AttachFlags uint32
		ProgIds     uint64 // __aligned_u64
		ProgCnt     uint32
	}

	// Currently you can only have 64 eBPF programs attached to a cgroup.
	size := 64
	retries := 0
	for retries < 10 {
		progIds := make([]uint32, size)
		query := bpfAttrQuery{
			TargetFd:   uint32(dirFd),
			AttachType: uint32(unix.BPF_CGROUP_DEVICE),
			ProgIds:    uint64(uintptr(unsafe.Pointer(&progIds[0]))),
			ProgCnt:    uint32(len(progIds)),
		}

		// Fetch the list of program ids.
		_, _, errno := unix.Syscall(unix.SYS_BPF,
			uintptr(unix.BPF_PROG_QUERY),
			uintptr(unsafe.Pointer(&query)),
			unsafe.Sizeof(query))
		// ProgCnt is read back before checking errno because the retry
		// below needs it as well.
		size = int(query.ProgCnt)
		runtime.KeepAlive(query)
		if errno != 0 {
			// On ENOSPC we get the correct number of programs.
			if errno == unix.ENOSPC {
				retries++
				continue
			}
			return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
		}

		// Convert the ids to program handles.
		progIds = progIds[:size]
		programs := make([]*ebpf.Program, 0, len(progIds))
		for _, progID := range progIds {
			program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progID))
			if err != nil {
				// We skip over programs that give us -EACCES or -EPERM. This
				// is necessary because there may be BPF programs that have
				// been attached (such as with --systemd-cgroup) which have an
				// LSM label that blocks us from interacting with the program.
				//
				// Because additional BPF_CGROUP_DEVICE programs only can add
				// restrictions, there's no real issue with just ignoring these
				// programs (and stops runc from breaking on distributions with
				// very strict SELinux policies).
				if errors.Is(err, os.ErrPermission) {
					logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progID, err)
					continue
				}
				// The caller gets no handles on error, so release the ones
				// opened so far instead of leaking them. Close errors are
				// ignored: the open failure is the error worth reporting.
				for _, opened := range programs {
					_ = opened.Close()
				}
				return nil, fmt.Errorf("cannot fetch program from id: %w", err)
			}
			programs = append(programs, program)
		}
		runtime.KeepAlive(progIds)
		return programs, nil
	}

	return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
}
var (
// haveBpfProgReplaceBool caches the result of the BPF_F_REPLACE probe; it
// is written inside haveBpfProgReplaceOnce.Do by haveBpfProgReplace.
haveBpfProgReplaceBool bool
// haveBpfProgReplaceOnce ensures the probe runs at most once.
haveBpfProgReplaceOnce sync.Once
)
// haveBpfProgReplace reports whether attaching a program with BPF_F_REPLACE
// appears to be supported. The probe runs once; later calls return the cached
// result. If the probe cannot be carried out (loading the dummy program or
// opening /dev/null fails), the result is false.
//
// Loosely based on the BPF_F_REPLACE support check in
// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
//
// TODO: move this logic to cilium/ebpf
func haveBpfProgReplace() bool {
haveBpfProgReplaceOnce.Do(func() {
// Load a minimal two-instruction device program to attach in the probe.
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
License: "MIT",
Instructions: asm.Instructions{
asm.Mov.Imm(asm.R0, 0),
asm.Return(),
},
})
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
return
}
defer prog.Close()
// /dev/null serves as a dummy attach target; the attach is expected to
// fail on it, and the probe only looks at which error comes back.
devnull, err := os.Open("/dev/null")
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err)
return
}
defer devnull.Close()
// We know that we have BPF_PROG_ATTACH since we can load
// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
// we know that the feature isn't present.
err = link.RawAttachProgram(link.RawAttachProgramOptions{
// We rely on this fd being checked after attachFlags.
Target: int(devnull.Fd()),
// Attempt to "replace" bad fds with this program.
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE,
})
if errors.Is(err, unix.EINVAL) {
// not supported
return
}
// attach_flags test succeeded.
// Any result other than EINVAL is treated as "supported"; results other
// than the expected EBADF are only logged.
if !errors.Is(err, unix.EBADF) {
logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
}
haveBpfProgReplaceBool = true
})
return haveBpfProgReplaceBool
}
// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
//
// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
//
// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
//
// insts and license describe the program to load, and dirFd is the file
// descriptor of the cgroup directory to attach it to. Filters that were
// already attached are either replaced in place (exactly one old filter and
// BPF_F_REPLACE available) or detached one by one after the new program has
// been attached.
//
// The returned function detaches the newly attached program. When listing the
// existing filters, loading the program or attaching it fails, a no-op closer
// is returned together with the error. When detaching an old filter fails,
// the real closer is returned together with the error.
func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
// This limit is not inherited into the container.
memlockLimit := &unix.Rlimit{
Cur: unix.RLIM_INFINITY,
Max: unix.RLIM_INFINITY,
}
// Best effort: the result of Setrlimit is deliberately ignored.
_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
// Get the list of existing programs.
// NOTE(review): the handles in oldProgs are never closed in this function
// -- confirm whether that is intentional.
oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
if err != nil {
return nilCloser, err
}
// In-place replacement is only used when there is exactly one old program.
useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
// Generate new program.
spec := &ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
Instructions: insts,
License: license,
}
prog, err := ebpf.NewProgram(spec)
if err != nil {
return nilCloser, err
}
// If there is only one old program, we can just replace it directly.
var (
replaceProg *ebpf.Program
attachFlags uint32 = unix.BPF_F_ALLOW_MULTI
)
if useReplaceProg {
replaceProg = oldProgs[0]
attachFlags |= unix.BPF_F_REPLACE
}
// Attach the new program first; old programs are only detached further
// down, after this attach has succeeded.
err = link.RawAttachProgram(link.RawAttachProgramOptions{
Target: dirFd,
Program: prog,
Replace: replaceProg,
Attach: ebpf.AttachCGroupDevice,
Flags: attachFlags,
})
if err != nil {
return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
}
// closer detaches the program attached above.
// NOTE(review): it assigns to the err variable captured from the enclosing
// function rather than declaring its own.
closer := func() error {
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
}
// TODO: Should we attach the old filters back in this case? Otherwise
// we fail-open on a security feature, which is a bit scary.
return nil
}
// Without in-place replacement, every old program is detached explicitly.
if !useReplaceProg {
logLevel := logrus.DebugLevel
// If there was more than one old program, give a warning (since this
// really shouldn't happen with runc-managed cgroups) and then detach
// all the old programs.
if len(oldProgs) > 1 {
// NOTE: Ideally this should be a warning but it turns out that
// systemd-managed cgroups trigger this warning (apparently
// systemd doesn't delete old non-systemd programs when
// setting properties).
logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
logLevel = logrus.InfoLevel
}
for idx, oldProg := range oldProgs {
// Output some extra debug info.
// If Info() fails the log line is skipped, but the program is still
// detached below.
if info, err := oldProg.Info(); err == nil {
fields := logrus.Fields{
"type": info.Type.String(),
"tag": info.Tag,
"name": info.Name,
}
if id, ok := info.ID(); ok {
fields["id"] = id
}
if runCount, ok := info.RunCount(); ok {
fields["run_count"] = runCount
}
if runtime, ok := info.Runtime(); ok {
fields["runtime"] = runtime.String()
}
logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
}
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: oldProg,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
// The new program is already attached at this point, so the real
// closer is returned along with the error.
return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
}
}
}
return closer, nil
}

View File

@ -10,6 +10,7 @@ import (
"strings"
"sync"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@ -76,35 +77,36 @@ var (
// TestMode is set to true by unit tests that need "fake" cgroupfs.
TestMode bool
cgroupFd int = -1
prepOnce sync.Once
prepErr error
resolveFlags uint64
cgroupRootHandle *os.File
prepOnce sync.Once
prepErr error
resolveFlags uint64
)
func prepareOpenat2() error {
prepOnce.Do(func() {
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
Flags: unix.O_DIRECTORY | unix.O_PATH,
Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC,
})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
if err != unix.ENOSYS {
if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare
logrus.Warnf("falling back to securejoin: %s", prepErr)
} else {
logrus.Debug("openat2 not available, falling back to securejoin")
}
return
}
file := os.NewFile(uintptr(fd), cgroupfsDir)
var st unix.Statfs_t
if err = unix.Fstatfs(fd, &st); err != nil {
if err := unix.Fstatfs(int(file.Fd()), &st); err != nil {
prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
logrus.Warnf("falling back to securejoin: %s", prepErr)
return
}
cgroupFd = fd
cgroupRootHandle = file
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
if st.Type == unix.CGROUP2_SUPER_MAGIC {
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
@ -122,7 +124,7 @@ func openFile(dir, file string, flags int) (*os.File, error) {
flags |= os.O_TRUNC | os.O_CREATE
mode = 0o600
}
path := path.Join(dir, file)
path := path.Join(dir, utils.CleanPath(file))
if prepareOpenat2() != nil {
return openFallback(path, flags, mode)
}
@ -131,7 +133,7 @@ func openFile(dir, file string, flags int) (*os.File, error) {
return openFallback(path, flags, mode)
}
fd, err := unix.Openat2(cgroupFd, relPath,
fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath,
&unix.OpenHow{
Resolve: resolveFlags,
Flags: uint64(flags) | unix.O_CLOEXEC,
@ -139,20 +141,20 @@ func openFile(dir, file string, flags int) (*os.File, error) {
})
if err != nil {
err = &os.PathError{Op: "openat2", Path: path, Err: err}
// Check if cgroupFd is still opened to cgroupfsDir
// Check if cgroupRootHandle is still opened to cgroupfsDir
// (happens when this package is incorrectly used
// across the chroot/pivot_root/mntns boundary, or
// when /sys/fs/cgroup is remounted).
//
// TODO: if such usage will ever be common, amend this
// to reopen cgroupFd and retry openat2.
fdStr := strconv.Itoa(cgroupFd)
// to reopen cgroupRootHandle and retry openat2.
fdStr := strconv.Itoa(int(cgroupRootHandle.Fd()))
fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
if fdDest != cgroupfsDir {
// Wrap the error so it is clear that cgroupFd
// Wrap the error so it is clear that cgroupRootHandle
// is opened to an unexpected/wrong directory.
err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w",
fdStr, fdDest, cgroupfsDir, err)
err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w",
cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err)
}
return nil, err
}

View File

@ -94,14 +94,6 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
}
}
}
if r.CPUIdle != nil {
idle := strconv.FormatInt(*r.CPUIdle, 10)
if err := cgroups.WriteFile(path, "cpu.idle", idle); err != nil {
return err
}
}
return s.SetRtSched(path, r)
}

View File

@ -195,7 +195,7 @@ func cpusetEnsureParent(current string) error {
}
// Treat non-existing directory as cgroupfs as it will be created,
// and the root cpuset directory obviously exists.
if err != nil && err != unix.ENOENT {
if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare
return &os.PathError{Op: "statfs", Path: parent, Err: err}
}

View File

@ -1,11 +1,20 @@
package fs
import (
"bytes"
"errors"
"reflect"
"github.com/opencontainers/runc/libcontainer/cgroups"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
)
type DevicesGroup struct{}
type DevicesGroup struct {
TestingSkipFinalCheck bool
}
func (s *DevicesGroup) Name() string {
return "devices"
@ -24,14 +33,75 @@ func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
if cgroups.DevicesSetV1 == nil {
if len(r.Devices) == 0 {
return nil
}
return cgroups.ErrDevicesUnsupported
func loadEmulator(path string) (*cgroupdevices.Emulator, error) {
list, err := cgroups.ReadFile(path, "devices.list")
if err != nil {
return nil, err
}
return cgroups.DevicesSetV1(path, r)
return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list))
}
// buildEmulator returns the emulator state obtained by applying rules, in
// order, to a fresh emulator. It returns the first error reported by Apply.
func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) {
	// A zero-value emulator is a white-list -- which is what we want!
	emu := new(cgroupdevices.Emulator)
	for i := range rules {
		if err := emu.Apply(*rules[i]); err != nil {
			return nil, err
		}
	}
	return emu, nil
}
// Set brings the devices cgroup at path to the state described by r.Devices.
//
// It loads the current state from the cgroup, builds the requested state from
// r.Devices, and writes the transition rules between the two to
// "devices.allow" / "devices.deny". Unless TestingSkipFinalCheck is set, it
// then re-reads the cgroup and returns an error if the result does not match
// the request.
//
// Set does nothing and returns nil when running in a user namespace or when
// r.SkipDevices is set.
func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
if userns.RunningInUserNS() || r.SkipDevices {
return nil
}
// Generate two emulators, one for the current state of the cgroup and one
// for the requested state by the user.
current, err := loadEmulator(path)
if err != nil {
return err
}
target, err := buildEmulator(r.Devices)
if err != nil {
return err
}
// Compute the minimal set of transition rules needed to achieve the
// requested state.
transitionRules, err := current.Transition(target)
if err != nil {
return err
}
// Write each rule to the control file matching its action. The first
// failed write aborts the update, leaving earlier writes in place.
for _, rule := range transitionRules {
file := "devices.deny"
if rule.Allow {
file = "devices.allow"
}
if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil {
return err
}
}
// Final safety check -- ensure that the resulting state is what was
// requested. This is only really correct for white-lists, but for
// black-lists we can at least check that the cgroup is in the right mode.
//
// This safety-check is skipped for the unit tests because we cannot
// currently mock devices.list correctly.
if !s.TestingSkipFinalCheck {
currentAfter, err := loadEmulator(path)
if err != nil {
return err
}
// White-list targets must match exactly; black-list targets only need to
// end up in the same mode.
if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) {
return errors.New("resulting devices cgroup doesn't precisely match target")
} else if target.IsBlacklist() != currentAfter.IsBlacklist() {
return errors.New("resulting devices cgroup doesn't match target mode")
}
}
return nil
}
func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {

View File

@ -54,13 +54,13 @@ type subsystem interface {
Set(path string, r *configs.Resources) error
}
type Manager struct {
type manager struct {
mu sync.Mutex
cgroups *configs.Cgroup
paths map[string]string
}
func NewManager(cg *configs.Cgroup, paths map[string]string) (*Manager, error) {
func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
// Some v1 controllers (cpu, cpuset, and devices) expect
// cgroups.Resources to not be nil in Apply.
if cg.Resources == nil {
@ -78,7 +78,7 @@ func NewManager(cg *configs.Cgroup, paths map[string]string) (*Manager, error) {
}
}
return &Manager{
return &manager{
cgroups: cg,
paths: paths,
}, nil
@ -105,7 +105,7 @@ func isIgnorableError(rootless bool, err error) bool {
return false
}
func (m *Manager) Apply(pid int) (err error) {
func (m *manager) Apply(pid int) (err error) {
m.mu.Lock()
defer m.mu.Unlock()
@ -139,19 +139,19 @@ func (m *Manager) Apply(pid int) (err error) {
return nil
}
func (m *Manager) Destroy() error {
func (m *manager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
return cgroups.RemovePaths(m.paths)
}
func (m *Manager) Path(subsys string) string {
func (m *manager) Path(subsys string) string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths[subsys]
}
func (m *Manager) GetStats() (*cgroups.Stats, error) {
func (m *manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
@ -167,7 +167,7 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
return stats, nil
}
func (m *Manager) Set(r *configs.Resources) error {
func (m *manager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
@ -183,7 +183,7 @@ func (m *Manager) Set(r *configs.Resources) error {
if err := sys.Set(path, r); err != nil {
// When rootless is true, errors from the device subsystem
// are ignored, as it is really not expected to work.
if m.cgroups.Rootless && sys.Name() == "devices" && !errors.Is(err, cgroups.ErrDevicesUnsupported) {
if m.cgroups.Rootless && sys.Name() == "devices" {
continue
}
// However, errors from other subsystems are not ignored.
@ -202,7 +202,7 @@ func (m *Manager) Set(r *configs.Resources) error {
// Freeze toggles the container's freezer cgroup depending on the state
// provided
func (m *Manager) Freeze(state configs.FreezerState) error {
func (m *manager) Freeze(state configs.FreezerState) error {
path := m.Path("freezer")
if path == "" {
return errors.New("cannot toggle freezer: cgroups not configured for container")
@ -218,25 +218,25 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
return nil
}
func (m *Manager) GetPids() ([]int, error) {
func (m *manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.Path("devices"))
}
func (m *Manager) GetAllPids() ([]int, error) {
func (m *manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.Path("devices"))
}
func (m *Manager) GetPaths() map[string]string {
func (m *manager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths
}
func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
return m.cgroups, nil
}
func (m *Manager) GetFreezerState() (configs.FreezerState, error) {
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
dir := m.Path("freezer")
// If the container doesn't have the freezer cgroup, say it's undefined.
if dir == "" {
@ -246,7 +246,7 @@ func (m *Manager) GetFreezerState() (configs.FreezerState, error) {
return freezer.GetState(dir)
}
func (m *Manager) Exists() bool {
func (m *manager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
@ -254,7 +254,7 @@ func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
}
func (m *Manager) OOMKillCount() (uint64, error) {
func (m *manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.Path("memory"))
// Ignore ENOENT when rootless as it couldn't create cgroup.
if err != nil && m.cgroups.Rootless && os.IsNotExist(err) {

View File

@ -1,6 +1,8 @@
package fs
import (
"errors"
"os"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@ -19,8 +21,23 @@ func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error {
}
func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
const suffix = ".limit_in_bytes"
skipRsvd := false
for _, hugetlb := range r.HugetlbLimit {
if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
prefix := "hugetlb." + hugetlb.Pagesize
val := strconv.FormatUint(hugetlb.Limit, 10)
if err := cgroups.WriteFile(path, prefix+suffix, val); err != nil {
return err
}
if skipRsvd {
continue
}
if err := cgroups.WriteFile(path, prefix+".rsvd"+suffix, val); err != nil {
if errors.Is(err, os.ErrNotExist) {
skipRsvd = true
continue
}
return err
}
}
@ -32,24 +49,29 @@ func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
rsvd := ".rsvd"
hugetlbStats := cgroups.HugetlbStats{}
for _, pageSize := range cgroups.HugePageSizes() {
usage := "hugetlb." + pageSize + ".usage_in_bytes"
value, err := fscommon.GetCgroupParamUint(path, usage)
again:
prefix := "hugetlb." + pageSize + rsvd
value, err := fscommon.GetCgroupParamUint(path, prefix+".usage_in_bytes")
if err != nil {
if rsvd != "" && errors.Is(err, os.ErrNotExist) {
rsvd = ""
goto again
}
return err
}
hugetlbStats.Usage = value
maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes"
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
value, err = fscommon.GetCgroupParamUint(path, prefix+".max_usage_in_bytes")
if err != nil {
return err
}
hugetlbStats.MaxUsage = value
failcnt := "hugetlb." + pageSize + ".failcnt"
value, err = fscommon.GetCgroupParamUint(path, failcnt)
value, err = fscommon.GetCgroupParamUint(path, prefix+".failcnt")
if err != nil {
return err
}

View File

@ -170,6 +170,10 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
return err
}
stats.MemoryStats.SwapUsage = swapUsage
stats.MemoryStats.SwapOnlyUsage = cgroups.MemoryData{
Usage: swapUsage.Usage - memoryUsage.Usage,
Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt,
}
kernelUsage, err := getMemoryData(path, "kmem")
if err != nil {
return err
@ -234,6 +238,12 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
memoryData.Failcnt = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
if name == "kmem" && os.IsNotExist(err) {
// Ignore ENOENT as kmem.limit_in_bytes has
// been removed in newer kernels.
return memoryData, nil
}
return cgroups.MemoryData{}, err
}
memoryData.Limit = value

View File

@ -165,8 +165,9 @@ func subsysPath(root, inner, subsystem string) (string, error) {
return filepath.Join(root, filepath.Base(mnt), inner), nil
}
// Use GetOwnCgroupPath for dind-like cases, when cgroupns is not
// available. This is ugly.
// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
// process could be in a container that shares the pid namespace with the host,
// and /proc/1/cgroup could point to a whole other world of cgroups.
parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
if err != nil {
return "", err

View File

@ -11,7 +11,7 @@ import (
)
func isCpuSet(r *configs.Resources) bool {
return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 || r.CPUIdle != nil
return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0
}
func setCpu(dirPath string, r *configs.Resources) error {
@ -19,12 +19,6 @@ func setCpu(dirPath string, r *configs.Resources) error {
return nil
}
if r.CPUIdle != nil {
if err := cgroups.WriteFile(dirPath, "cpu.idle", strconv.FormatInt(*r.CPUIdle, 10)); err != nil {
return err
}
}
// NOTE: .CpuShares is not used here. Conversion is the caller's responsibility.
if r.CpuWeight != 0 {
if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil {

View File

@ -0,0 +1,75 @@
package fs2
import (
"fmt"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf"
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
)
// isRWM reports whether perms contains all three of the characters 'r', 'w'
// and 'm'. Any other characters in perms are ignored, and repeats are
// harmless.
func isRWM(perms devices.Permissions) bool {
	const (
		bitR = 1 << iota
		bitW
		bitM
		bitAll = bitR | bitW | bitM
	)
	var seen uint8
	for _, p := range perms {
		switch p {
		case 'r':
			seen |= bitR
		case 'w':
			seen |= bitW
		case 'm':
			seen |= bitM
		}
	}
	return seen == bitAll
}
// canSkipEBPFError reports whether a failure to load the eBPF device filter
// for r may be ignored.
//
// This is similar to the logic applied in crun for handling errors from bpf(2)
// <https://github.com/containers/crun/blob/0.17/src/libcrun/cgroup.c#L2438-L2470>.
func canSkipEBPFError(r *configs.Resources) bool {
	// If we're running in a user namespace we can ignore eBPF rules because we
	// usually cannot use bpf(2), and rootless containers usually don't have
	// the necessary privileges to mknod(2) device inodes or access host-level
	// instances (though ideally we would be blocking device access for
	// rootless containers anyway).
	if userns.RunningInUserNS() {
		return true
	}

	// We cannot ignore an eBPF load error if any rule is a block rule or
	// doesn't permit all access modes.
	//
	// NOTE: This will sometimes trigger in cases where access modes are split
	//       between different rules but to handle this correctly would require
	//       using ".../libcontainer/cgroup/devices".Emulator.
	for i := range r.Devices {
		rule := r.Devices[i]
		if !(rule.Allow && isRWM(rule.Permissions)) {
			return false
		}
	}
	return true
}
// setDevices builds a device filter program from r.Devices and attaches it
// to the cgroup directory dirPath.
//
// It does nothing when r.SkipDevices is set. A failure to load or attach the
// filter is ignored when canSkipEBPFError(r) says it is safe to do so;
// all other errors are returned to the caller.
func setDevices(dirPath string, r *configs.Resources) error {
	if r.SkipDevices {
		return nil
	}
	insts, license, err := devicefilter.DeviceFilter(r.Devices)
	if err != nil {
		return err
	}
	dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600)
	if err != nil {
		// Wrap the underlying errno so callers can see why the open failed
		// and can still match it with errors.Is.
		return fmt.Errorf("cannot get dir FD for %s: %w", dirPath, err)
	}
	defer unix.Close(dirFD)
	if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
		if !canSkipEBPFError(r) {
			return err
		}
	}
	return nil
}

View File

@ -13,7 +13,7 @@ import (
type parseError = fscommon.ParseError
type Manager struct {
type manager struct {
config *configs.Cgroup
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
dirPath string
@ -25,7 +25,7 @@ type Manager struct {
// NewManager creates a manager for cgroup v2 unified hierarchy.
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope".
// If dirPath is empty, it is automatically set using config.
func NewManager(config *configs.Cgroup, dirPath string) (*Manager, error) {
func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error) {
if dirPath == "" {
var err error
dirPath, err = defaultDirPath(config)
@ -34,14 +34,14 @@ func NewManager(config *configs.Cgroup, dirPath string) (*Manager, error) {
}
}
m := &Manager{
m := &manager{
config: config,
dirPath: dirPath,
}
return m, nil
}
func (m *Manager) getControllers() error {
func (m *manager) getControllers() error {
if m.controllers != nil {
return nil
}
@ -62,7 +62,7 @@ func (m *Manager) getControllers() error {
return nil
}
func (m *Manager) Apply(pid int) error {
func (m *manager) Apply(pid int) error {
if err := CreateCgroupPath(m.dirPath, m.config); err != nil {
// Related tests:
// - "runc create (no limits + no cgrouppath + no permission) succeeds"
@ -84,15 +84,15 @@ func (m *Manager) Apply(pid int) error {
return nil
}
func (m *Manager) GetPids() ([]int, error) {
func (m *manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.dirPath)
}
func (m *Manager) GetAllPids() ([]int, error) {
func (m *manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.dirPath)
}
func (m *Manager) GetStats() (*cgroups.Stats, error) {
func (m *manager) GetStats() (*cgroups.Stats, error) {
var errs []error
st := cgroups.NewStats()
@ -114,17 +114,6 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// PSI (since kernel 4.20).
var err error
if st.CpuStats.PSI, err = statPSI(m.dirPath, "cpu.pressure"); err != nil {
errs = append(errs, err)
}
if st.MemoryStats.PSI, err = statPSI(m.dirPath, "memory.pressure"); err != nil {
errs = append(errs, err)
}
if st.BlkioStats.PSI, err = statPSI(m.dirPath, "io.pressure"); err != nil {
errs = append(errs, err)
}
// hugetlb (since kernel 5.6)
if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
@ -139,7 +128,7 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
return st, nil
}
func (m *Manager) Freeze(state configs.FreezerState) error {
func (m *manager) Freeze(state configs.FreezerState) error {
if m.config.Resources == nil {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
@ -150,15 +139,15 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
return nil
}
func (m *Manager) Destroy() error {
func (m *manager) Destroy() error {
return cgroups.RemovePath(m.dirPath)
}
func (m *Manager) Path(_ string) string {
func (m *manager) Path(_ string) string {
return m.dirPath
}
func (m *Manager) Set(r *configs.Resources) error {
func (m *manager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
@ -186,10 +175,8 @@ func (m *Manager) Set(r *configs.Resources) error {
// When rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if err := setDevices(m.dirPath, r); err != nil {
if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) {
return err
}
if err := setDevices(m.dirPath, r); err != nil && !m.config.Rootless {
return err
}
// cpuset (since kernel 5.0)
if err := setCpuset(m.dirPath, r); err != nil {
@ -214,17 +201,7 @@ func (m *Manager) Set(r *configs.Resources) error {
return nil
}
func setDevices(dirPath string, r *configs.Resources) error {
if cgroups.DevicesSetV2 == nil {
if len(r.Devices) > 0 {
return cgroups.ErrDevicesUnsupported
}
return nil
}
return cgroups.DevicesSetV2(dirPath, r)
}
func (m *Manager) setUnified(res map[string]string) error {
func (m *manager) setUnified(res map[string]string) error {
for k, v := range res {
if strings.Contains(k, "/") {
return fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
@ -250,21 +227,21 @@ func (m *Manager) setUnified(res map[string]string) error {
return nil
}
func (m *Manager) GetPaths() map[string]string {
func (m *manager) GetPaths() map[string]string {
paths := make(map[string]string, 1)
paths[""] = m.dirPath
return paths
}
func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
return m.config, nil
}
func (m *Manager) GetFreezerState() (configs.FreezerState, error) {
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
return getFreezer(m.dirPath)
}
func (m *Manager) Exists() bool {
func (m *manager) Exists() bool {
return cgroups.PathExists(m.dirPath)
}
@ -272,7 +249,7 @@ func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.events", "oom_kill")
}
func (m *Manager) OOMKillCount() (uint64, error) {
func (m *manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.dirPath)
if err != nil && m.config.Rootless && os.IsNotExist(err) {
err = nil
@ -280,35 +257,3 @@ func (m *Manager) OOMKillCount() (uint64, error) {
return c, err
}
// CheckMemoryUsage rejects new memory limits that are not above the cgroup's
// current usage, as read from memory.current in dirPath.
//
// The check only runs when r.MemoryCheckBeforeUpdate is set and at least one
// of r.Memory / r.MemorySwap is positive. It returns an error when a positive
// limit is less than or equal to the current usage, and nil otherwise.
func CheckMemoryUsage(dirPath string, r *configs.Resources) error {
	if !r.MemoryCheckBeforeUpdate || (r.Memory <= 0 && r.MemorySwap <= 0) {
		return nil
	}

	current, err := fscommon.GetCgroupParamUint(dirPath, "memory.current")
	if err != nil {
		// This check is on best-effort basis, so if we can't read the
		// current usage (cgroup not yet created, or any other error),
		// we should not fail.
		return nil
	}

	if swap := r.MemorySwap; swap > 0 && uint64(swap) <= current {
		return fmt.Errorf("rejecting memory+swap limit %d <= usage %d", swap, current)
	}
	if mem := r.Memory; mem > 0 && uint64(mem) <= current {
		return fmt.Errorf("rejecting memory limit %d <= usage %d", mem, current)
	}
	return nil
}

View File

@ -1,6 +1,8 @@
package fs2
import (
"errors"
"os"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@ -16,8 +18,22 @@ func setHugeTlb(dirPath string, r *configs.Resources) error {
if !isHugeTlbSet(r) {
return nil
}
const suffix = ".max"
skipRsvd := false
for _, hugetlb := range r.HugetlbLimit {
if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
prefix := "hugetlb." + hugetlb.Pagesize
val := strconv.FormatUint(hugetlb.Limit, 10)
if err := cgroups.WriteFile(dirPath, prefix+suffix, val); err != nil {
return err
}
if skipRsvd {
continue
}
if err := cgroups.WriteFile(dirPath, prefix+".rsvd"+suffix, val); err != nil {
if errors.Is(err, os.ErrNotExist) {
skipRsvd = true
continue
}
return err
}
}
@ -27,15 +43,21 @@ func setHugeTlb(dirPath string, r *configs.Resources) error {
func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
hugetlbStats := cgroups.HugetlbStats{}
rsvd := ".rsvd"
for _, pagesize := range cgroups.HugePageSizes() {
value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current")
again:
prefix := "hugetlb." + pagesize + rsvd
value, err := fscommon.GetCgroupParamUint(dirPath, prefix+".current")
if err != nil {
if rsvd != "" && errors.Is(err, os.ErrNotExist) {
rsvd = ""
goto again
}
return err
}
hugetlbStats.Usage = value
fileName := "hugetlb." + pagesize + ".events"
value, err = fscommon.GetValueByKey(dirPath, fileName, "max")
value, err = fscommon.GetValueByKey(dirPath, prefix+".events", "max")
if err != nil {
return err
}

View File

@ -40,11 +40,6 @@ func setMemory(dirPath string, r *configs.Resources) error {
if !isMemorySet(r) {
return nil
}
if err := CheckMemoryUsage(dirPath, r); err != nil {
return err
}
swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
if err != nil {
return err
@ -105,7 +100,7 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
memoryUsage, err := getMemoryDataV2(dirPath, "")
if err != nil {
if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint {
// The root cgroup does not have memory.{current,max}
// The root cgroup does not have memory.{current,max,peak}
// so emulate those using data from /proc/meminfo and
// /sys/fs/cgroup/memory.stat
return rootStatsFromMeminfo(stats)
@ -113,10 +108,12 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
return err
}
stats.MemoryStats.Usage = memoryUsage
swapUsage, err := getMemoryDataV2(dirPath, "swap")
swapOnlyUsage, err := getMemoryDataV2(dirPath, "swap")
if err != nil {
return err
}
stats.MemoryStats.SwapOnlyUsage = swapOnlyUsage
swapUsage := swapOnlyUsage
// As cgroup v1 reports SwapUsage values as mem+swap combined,
// while in cgroup v2 swap values do not include memory,
// report combined mem+swap for v1 compatibility.
@ -124,6 +121,9 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
if swapUsage.Limit != math.MaxUint64 {
swapUsage.Limit += memoryUsage.Limit
}
// The `MaxUsage` of mem+swap cannot simply combine mem with
// swap. So set it to 0 for v1 compatibility.
swapUsage.MaxUsage = 0
stats.MemoryStats.SwapUsage = swapUsage
return nil
@ -138,6 +138,7 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
}
usage := moduleName + ".current"
limit := moduleName + ".max"
maxUsage := moduleName + ".peak"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
@ -157,6 +158,14 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
}
memoryData.Limit = value
// `memory.peak` since kernel 5.19
// `memory.swap.peak` since kernel 6.5
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil && !os.IsNotExist(err) {
return cgroups.MemoryData{}, err
}
memoryData.MaxUsage = value
return memoryData, nil
}

View File

@ -1,89 +0,0 @@
package fs2
import (
"bufio"
"errors"
"fmt"
"os"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
// statPSI reads the pressure file named file inside the cgroup directory
// dirPath and parses its "some" and "full" lines into a PSIStats value.
//
// It returns (nil, nil) when the file does not exist or when reading it
// fails with ENOTSUP. Lines starting with any other keyword, and blank
// lines, are ignored. Malformed data is reported as a parseError.
func statPSI(dirPath string, file string) (*cgroups.PSIStats, error) {
	f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
	if err != nil {
		if errors.Is(err, os.ErrNotExist) {
			// Kernel < 4.20, or CONFIG_PSI is not set,
			// or PSI stats are turned off for the cgroup
			// ("echo 0 > cgroup.pressure", kernel >= 6.1).
			return nil, nil
		}
		return nil, err
	}
	defer f.Close()

	var psistats cgroups.PSIStats
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		parts := strings.Fields(sc.Text())
		if len(parts) == 0 {
			// Blank or whitespace-only line: nothing to parse, and
			// indexing parts[0] below would panic.
			continue
		}
		var pv *cgroups.PSIData
		switch parts[0] {
		case "some":
			pv = &psistats.Some
		case "full":
			pv = &psistats.Full
		}
		if pv != nil {
			*pv, err = parsePSIData(parts[1:])
			if err != nil {
				return nil, &parseError{Path: dirPath, File: file, Err: err}
			}
		}
	}
	if err := sc.Err(); err != nil {
		if errors.Is(err, unix.ENOTSUP) {
			// Some kernels (e.g. CS9) may return ENOTSUP on read
			// if psi=1 kernel cmdline parameter is required.
			return nil, nil
		}
		return nil, &parseError{Path: dirPath, File: file, Err: err}
	}
	return &psistats, nil
}
// parsePSIData converts a list of "key=value" fields into a PSIData value.
//
// The keys avg10, avg60 and avg300 are parsed as floats and total as an
// unsigned integer; unknown keys are skipped. A field without "=" or a value
// that fails to parse yields an error, returned together with whatever was
// parsed up to that point.
func parsePSIData(psi []string) (cgroups.PSIData, error) {
	var out cgroups.PSIData
	for _, field := range psi {
		pair := strings.SplitN(field, "=", 2)
		if len(pair) != 2 {
			return out, fmt.Errorf("invalid psi data: %q", field)
		}
		key, raw := pair[0], pair[1]

		// total is the only integer-valued key; handle it on its own.
		if key == "total" {
			n, err := strconv.ParseUint(raw, 10, 64)
			if err != nil {
				return out, fmt.Errorf("invalid %s PSI value: %w", key, err)
			}
			out.Total = n
			continue
		}

		var dst *float64
		switch key {
		case "avg10":
			dst = &out.Avg10
		case "avg60":
			dst = &out.Avg60
		case "avg300":
			dst = &out.Avg300
		default:
			// Unknown key: ignore it.
			continue
		}
		x, err := strconv.ParseFloat(raw, 64)
		if err != nil {
			return out, fmt.Errorf("invalid %s PSI value: %w", key, err)
		}
		*dst = x
	}
	return out, nil
}

View File

@ -32,22 +32,9 @@ type CpuUsage struct {
UsageInUsermode uint64 `json:"usage_in_usermode"`
}
type PSIData struct {
Avg10 float64 `json:"avg10"`
Avg60 float64 `json:"avg60"`
Avg300 float64 `json:"avg300"`
Total uint64 `json:"total"`
}
type PSIStats struct {
Some PSIData `json:"some,omitempty"`
Full PSIData `json:"full,omitempty"`
}
type CpuStats struct {
CpuUsage CpuUsage `json:"cpu_usage,omitempty"`
ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type CPUSetStats struct {
@ -91,6 +78,8 @@ type MemoryStats struct {
Usage MemoryData `json:"usage,omitempty"`
// usage of memory + swap
SwapUsage MemoryData `json:"swap_usage,omitempty"`
// usage of swap only
SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"`
// usage of kernel memory
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
// usage of kernel TCP memory
@ -102,7 +91,6 @@ type MemoryStats struct {
UseHierarchy bool `json:"use_hierarchy"`
Stats map[string]uint64 `json:"stats,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type PageUsageByNUMA struct {
@ -147,7 +135,6 @@ type BlkioStats struct {
IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"`
SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type HugetlbStats struct {

View File

@ -36,13 +36,13 @@ func IsCgroup2UnifiedMode() bool {
var st unix.Statfs_t
err := unix.Statfs(unifiedMountpoint, &st)
if err != nil {
level := logrus.WarnLevel
if os.IsNotExist(err) && userns.RunningInUserNS() {
// For rootless containers, sweep it under the rug.
level = logrus.DebugLevel
// ignore the "not found" error if running in userns
logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
isUnified = false
return
}
logrus.StandardLogger().Logf(level,
"statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err)
panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
}
isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
})
@ -217,9 +217,20 @@ func PathExists(path string) bool {
return true
}
// EnterPid passes pid to WriteCgroupProc for every path in cgroupPaths for
// which PathExists reports true. Other paths are skipped. The first failure
// stops the iteration and is returned.
func EnterPid(cgroupPaths map[string]string, pid int) error {
	for _, dir := range cgroupPaths {
		if !PathExists(dir) {
			continue
		}
		if err := WriteCgroupProc(dir, pid); err != nil {
			return err
		}
	}
	return nil
}
func rmdir(path string) error {
err := unix.Rmdir(path)
if err == nil || err == unix.ENOENT {
if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare
return nil
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}

View File

@ -236,6 +236,27 @@ func GetOwnCgroupPath(subsystem string) (string, error) {
return getCgroupPathHelper(subsystem, cgroup)
}
// GetInitCgroup returns the controller path for subsystem as listed in
// /proc/1/cgroup. It returns errUnified when cgroup v2 unified mode is in
// use.
func GetInitCgroup(subsystem string) (string, error) {
	if IsCgroup2UnifiedMode() {
		return "", errUnified
	}

	entries, err := ParseCgroupFile("/proc/1/cgroup")
	if err != nil {
		return "", err
	}
	return getControllerPath(subsystem, entries)
}
// GetInitCgroupPath looks up the cgroup that GetInitCgroup reports for
// subsystem and resolves it to a path via getCgroupPathHelper.
func GetInitCgroupPath(subsystem string) (string, error) {
	initCgroup, err := GetInitCgroup(subsystem)
	if err == nil {
		return getCgroupPathHelper(subsystem, initCgroup)
	}
	return "", err
}
func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
if err != nil {

View File

@ -2,8 +2,8 @@ package configs
import "fmt"
// BlockIODevice holds major:minor format supported in blkio cgroup.
type BlockIODevice struct {
// blockIODevice holds major:minor format supported in blkio cgroup
type blockIODevice struct {
// Major is the device's major number
Major int64 `json:"major"`
// Minor is the device's minor number
@ -12,7 +12,7 @@ type BlockIODevice struct {
// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
type WeightDevice struct {
BlockIODevice
blockIODevice
// Weight is the bandwidth rate for the device, range is from 10 to 1000
Weight uint16 `json:"weight"`
// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
@ -41,7 +41,7 @@ func (wd *WeightDevice) LeafWeightString() string {
// ThrottleDevice struct holds a `major:minor rate_per_second` pair
type ThrottleDevice struct {
BlockIODevice
blockIODevice
// Rate is the IO rate limit per cgroup per device
Rate uint64 `json:"rate"`
}

View File

@ -84,9 +84,6 @@ type Resources struct {
// MEM to use
CpusetMems string `json:"cpuset_mems"`
// cgroup SCHED_IDLE
CPUIdle *int64 `json:"cpu_idle,omitempty"`
// Process limit; set <= `0' to disable limit.
PidsLimit int64 `json:"pids_limit"`
@ -158,9 +155,4 @@ type Resources struct {
// during Set() to figure out whether the freeze is required. Those
// methods may be relatively slow, thus this flag.
SkipFreezeOnSet bool `json:"-"`
// MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check
// if the new memory limits (Memory and MemorySwap) being set are lower
// than the current memory usage, and reject if so.
MemoryCheckBeforeUpdate bool `json:"memory_check_before_update"`
}

View File

@ -21,9 +21,9 @@ type Rlimit struct {
// IDMap represents UID/GID Mappings for User Namespaces.
type IDMap struct {
ContainerID int `json:"container_id"`
HostID int `json:"host_id"`
Size int `json:"size"`
ContainerID int64 `json:"container_id"`
HostID int64 `json:"host_id"`
Size int64 `json:"size"`
}
// Seccomp represents syscall restrictions
@ -31,13 +31,12 @@ type IDMap struct {
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
type Seccomp struct {
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Flags []specs.LinuxSeccompFlag `json:"flags"`
Syscalls []*Syscall `json:"syscalls"`
DefaultErrnoRet *uint `json:"default_errno_ret"`
ListenerPath string `json:"listener_path,omitempty"`
ListenerMetadata string `json:"listener_metadata,omitempty"`
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Syscalls []*Syscall `json:"syscalls"`
DefaultErrnoRet *uint `json:"default_errno_ret"`
ListenerPath string `json:"listener_path,omitempty"`
ListenerMetadata string `json:"listener_metadata,omitempty"`
}
// Action is taken upon rule match in Seccomp
@ -84,6 +83,9 @@ type Syscall struct {
Args []*Arg `json:"args"`
}
// TODO Windows. Many of these fields should be factored out into those parts
// which are common across platforms, and those which are platform specific.
// Config defines configuration options for executing a process inside a contained environment.
type Config struct {
// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
@ -119,9 +121,6 @@ type Config struct {
// Hostname optionally sets the container's hostname if provided
Hostname string `json:"hostname"`
// Domainname optionally sets the container's domainname if provided
Domainname string `json:"domainname"`
// Namespaces specifies the container's namespaces that it should setup when cloning the init process
// If a namespace is not provided that namespace is shared from the container's parent process
Namespaces Namespaces `json:"namespaces"`
@ -159,11 +158,11 @@ type Config struct {
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj *int `json:"oom_score_adj,omitempty"`
// UIDMappings is an array of User ID mappings for User Namespaces
UIDMappings []IDMap `json:"uid_mappings"`
// UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"`
// GIDMappings is an array of Group ID mappings for User Namespaces
GIDMappings []IDMap `json:"gid_mappings"`
// GidMappings is an array of Group ID mappings for User Namespaces
GidMappings []IDMap `json:"gid_mappings"`
// MaskPaths specifies paths within the container's rootfs to mask over with a bind
// mount pointing to /dev/null as to prevent reads of the file.
@ -212,13 +211,6 @@ type Config struct {
// RootlessCgroups is set when unlikely to have the full access to cgroups.
// When RootlessCgroups is set, cgroups errors are ignored.
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
// Do not try to remount a bind mount again after the first attempt failed on source
// filesystems that have nodev, noexec, nosuid, noatime, relatime, strictatime, nodiratime set
NoMountFallback bool `json:"no_mount_fallback,omitempty"`
// TimeOffsets specifies the offset for supporting time namespaces.
TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"`
}
type (
@ -285,7 +277,6 @@ type Capabilities struct {
Ambient []string
}
// Deprecated: use (Hooks).Run instead.
func (hooks HookList) RunHooks(state *specs.State) error {
for i, h := range hooks {
if err := h.Run(state); err != nil {
@ -342,18 +333,6 @@ func (hooks *Hooks) MarshalJSON() ([]byte, error) {
})
}
// Run invokes, in order, every hook stored under name, passing each one the
// given state. Execution stops at the first hook that fails; that error is
// returned wrapped with the hook name and the hook's index in the list. If
// all hooks succeed, Run returns nil.
func (hooks Hooks) Run(name HookName, state *specs.State) error {
	for idx, hook := range hooks[name] {
		err := hook.Run(state)
		if err == nil {
			continue
		}
		return fmt.Errorf("error running %s hook #%d: %w", name, idx, err)
	}
	return nil
}
type Hook interface {
// Run executes the hook with the provided state.
Run(*specs.State) error
@ -414,7 +393,7 @@ func (c Command) Run(s *specs.State) error {
go func() {
err := cmd.Wait()
if err != nil {
err = fmt.Errorf("%w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
}
errC <- err
}()

View File

@ -1,6 +1,10 @@
package configs
import "errors"
import (
"errors"
"fmt"
"math"
)
var (
errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.")
@ -13,14 +17,21 @@ var (
// different when user namespaces are enabled.
func (c Config) HostUID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if len(c.UIDMappings) == 0 {
if c.UidMappings == nil {
return -1, errNoUIDMap
}
id, found := c.hostIDFromMapping(containerId, c.UIDMappings)
id, found := c.hostIDFromMapping(int64(containerId), c.UidMappings)
if !found {
return -1, errNoUserMap
}
return id, nil
// If we are a 32-bit binary running on a 64-bit system, it's possible
// the mapped user is too large to store in an int, which means we
// cannot do the mapping. We can't just return an int64, because
// os.Setuid() takes an int.
if id > math.MaxInt {
return -1, fmt.Errorf("mapping for uid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
}
return int(id), nil
}
// Return unchanged id.
return containerId, nil
@ -36,14 +47,21 @@ func (c Config) HostRootUID() (int, error) {
// different when user namespaces are enabled.
func (c Config) HostGID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if len(c.GIDMappings) == 0 {
if c.GidMappings == nil {
return -1, errNoGIDMap
}
id, found := c.hostIDFromMapping(containerId, c.GIDMappings)
id, found := c.hostIDFromMapping(int64(containerId), c.GidMappings)
if !found {
return -1, errNoGroupMap
}
return id, nil
// If we are a 32-bit binary running on a 64-bit system, it's possible
// the mapped group is too large to store in an int, which means we
// cannot do the mapping. We can't just return an int64, because
// os.Setgid() takes an int.
if id > math.MaxInt {
return -1, fmt.Errorf("mapping for gid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
}
return int(id), nil
}
// Return unchanged id.
return containerId, nil
@ -57,7 +75,7 @@ func (c Config) HostRootGID() (int, error) {
// Utility function that gets a host ID for a container ID from user namespace map
// if that ID is present in the map.
func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
func (c Config) hostIDFromMapping(containerID int64, uMap []IDMap) (int64, bool) {
for _, m := range uMap {
if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
hostID := m.HostID + (containerID - m.ContainerID)

View File

@ -1,7 +1,48 @@
package configs
import "golang.org/x/sys/unix"
const (
	// EXT_COPYUP is a directive to copy up the contents of a directory when
	// a tmpfs is mounted over it.
	//
	// It is the first constant in this block, so 1 << iota evaluates to 1.
	// NOTE(review): presumably meant to be stored in Mount.Extensions --
	// confirm against the callers.
	EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning
)
// Mount describes a single mount for a container. Every field is serialized
// to JSON under the key given in its struct tag.
type Mount struct {
	// Source path for the mount.
	Source string `json:"source"`

	// Destination path for the mount inside the container.
	Destination string `json:"destination"`

	// Device the mount is for.
	Device string `json:"device"`

	// Mount flags. IsBind tests this field for the unix.MS_BIND bit.
	Flags int `json:"flags"`

	// Propagation flags.
	PropagationFlags []int `json:"propagation_flags"`

	// Mount data applied to the mount.
	Data string `json:"data"`

	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
	Relabel string `json:"relabel"`

	// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
	RecAttr *unix.MountAttr `json:"rec_attr"`

	// Extensions are additional flags that are specific to runc.
	Extensions int `json:"extensions"`

	// Optional commands to be run before Source is mounted.
	PremountCmds []Command `json:"premount_cmds"`

	// Optional commands to be run after Source is mounted.
	PostmountCmds []Command `json:"postmount_cmds"`
}
// IsBind reports whether the unix.MS_BIND bit is set in the mount's Flags.
func (m *Mount) IsBind() bool {
	bindBits := m.Flags & unix.MS_BIND
	return bindBits != 0
}

View File

@ -1,52 +0,0 @@
package configs
import "golang.org/x/sys/unix"
// Mount describes a single mount for a container. Every field is serialized
// to JSON under the key given in its struct tag.
type Mount struct {
	// Source path for the mount.
	Source string `json:"source"`

	// Destination path for the mount inside the container.
	Destination string `json:"destination"`

	// Device the mount is for.
	Device string `json:"device"`

	// Mount flags. IsBind tests this field for the unix.MS_BIND bit.
	Flags int `json:"flags"`

	// Propagation flags.
	PropagationFlags []int `json:"propagation_flags"`

	// Mount data applied to the mount.
	Data string `json:"data"`

	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
	Relabel string `json:"relabel"`

	// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
	RecAttr *unix.MountAttr `json:"rec_attr"`

	// Extensions are additional flags that are specific to runc.
	Extensions int `json:"extensions"`

	// UIDMappings is used to change file user owners without calling chown.
	// Note that the underlying filesystem must support this feature for it
	// to be used.
	// Every mount point can have its own mapping.
	// The field is omitted from the JSON output when empty.
	UIDMappings []IDMap `json:"uid_mappings,omitempty"`

	// GIDMappings is used to change file group owners without calling chown.
	// Note that the underlying filesystem must support this feature for it
	// to be used.
	// Every mount point can have its own mapping.
	// The field is omitted from the JSON output when empty.
	GIDMappings []IDMap `json:"gid_mappings,omitempty"`
}
// IsBind reports whether the unix.MS_BIND bit is set in the mount's Flags.
func (m *Mount) IsBind() bool {
	bindBits := m.Flags & unix.MS_BIND
	return bindBits != 0
}
// IsIDMapped reports whether the mount carries at least one UID mapping or
// at least one GID mapping.
func (m *Mount) IsIDMapped() bool {
	if len(m.UIDMappings) > 0 {
		return true
	}
	return len(m.GIDMappings) > 0
}

View File

@ -1,10 +0,0 @@
//go:build !linux
// +build !linux
package configs
// Mount is an empty placeholder type; this file is only built on non-Linux
// platforms (see the !linux build constraint at the top of the file).
type Mount struct{}

// IsBind always reports false in this non-Linux stub.
func (m *Mount) IsBind() bool {
	return false
}

View File

@ -14,7 +14,6 @@ const (
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
NEWCGROUP NamespaceType = "NEWCGROUP"
NEWTIME NamespaceType = "NEWTIME"
)
var (
@ -39,8 +38,6 @@ func NsName(ns NamespaceType) string {
return "uts"
case NEWCGROUP:
return "cgroup"
case NEWTIME:
return "time"
}
return ""
}
@ -75,7 +72,6 @@ func NamespaceTypes() []NamespaceType {
NEWPID,
NEWNS,
NEWCGROUP,
NEWTIME,
}
}

View File

@ -17,7 +17,6 @@ var namespaceInfo = map[NamespaceType]int{
NEWUTS: unix.CLONE_NEWUTS,
NEWPID: unix.CLONE_NEWPID,
NEWCGROUP: unix.CLONE_NEWCGROUP,
NEWTIME: unix.CLONE_NEWTIME,
}
// CloneFlags parses the container's Namespaces options to set the correct
@ -32,15 +31,3 @@ func (n *Namespaces) CloneFlags() uintptr {
}
return uintptr(flag)
}
// IsPrivate reports whether the namespace of type t is configured as private,
// meaning an entry of that type exists and has an empty Path. Only the first
// entry of type t is consulted.
func (n Namespaces) IsPrivate(t NamespaceType) bool {
	for i := range n {
		if n[i].Type != t {
			continue
		}
		return n[i].Path == ""
	}
	// No entry of this type: the namespace is implicitly shared with the
	// parent, so it is not private.
	return false
}

View File

@ -1,4 +1,5 @@
package userns
// RunningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
var RunningInUserNS = runningInUserNS

View File

@ -3,7 +3,14 @@
package userns
func FuzzUIDMap(uidmap []byte) int {
_ = uidMapInUserNS(string(uidmap))
import (
"strings"
"github.com/opencontainers/runc/libcontainer/user"
)
func FuzzUIDMap(data []byte) int {
uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
_ = uidMapInUserNS(uidmap)
return 1
}

View File

@ -1,10 +1,9 @@
package userns
import (
"bufio"
"fmt"
"os"
"sync"
"github.com/opencontainers/runc/libcontainer/user"
)
var (
@ -13,43 +12,26 @@ var (
)
// runningInUserNS detects whether we are currently running in a user namespace.
//
// Originally copied from https://github.com/lxc/incus/blob/e45085dd42f826b3c8c3228e9733c0b6f998eafe/shared/util.go#L678-L700.
// Originally copied from github.com/lxc/lxd/shared/util.go
func runningInUserNS() bool {
nsOnce.Do(func() {
file, err := os.Open("/proc/self/uid_map")
uidmap, err := user.CurrentProcessUIDMap()
if err != nil {
// This kernel-provided file only exists if user namespaces are supported.
// This kernel-provided file only exists if user namespaces are supported
return
}
defer file.Close()
buf := bufio.NewReader(file)
l, _, err := buf.ReadLine()
if err != nil {
return
}
inUserNS = uidMapInUserNS(string(l))
inUserNS = uidMapInUserNS(uidmap)
})
return inUserNS
}
func uidMapInUserNS(uidMap string) bool {
if uidMap == "" {
// File exist but empty (the initial state when userns is created,
// see user_namespaces(7)).
return true
}
var a, b, c int64
if _, err := fmt.Sscanf(uidMap, "%d %d %d", &a, &b, &c); err != nil {
// Assume we are in a regular, non user namespace.
func uidMapInUserNS(uidmap []user.IDMap) bool {
/*
* We assume we are in the initial user namespace if we have a full
* range - 4294967295 uids starting at uid 0.
*/
if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
return false
}
// As per user_namespaces(7), /proc/self/uid_map of
// the initial user namespace shows 0 0 4294967295.
initNS := a == 0 && b == 0 && c == 4294967295
return !initNS
return true
}

View File

@ -0,0 +1,79 @@
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <stdarg.h>
#include <stdlib.h>
/*
 * All of the code here is run inside an async-signal-safe context, so we need
 * to be careful to not call any functions that could cause issues. In theory,
 * since we are a Go program, there are fewer restrictions in practice, but
 * it's better to be safe than sorry.
 *
 * The only exception is exit, which we need to call to make sure we don't
 * return into runc.
 */

/*
 * bail writes a printf-style formatted message to the file descriptor pipefd
 * and then terminates the calling process with exit status 1. It never
 * returns to its caller.
 */
void bail(int pipefd, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vdprintf(pipefd, fmt, args);
	va_end(args);

	exit(1);
}
/*
 * spawn_userns_cat forks a child process which joins the user namespace
 * referenced by userns_path, then copies the full contents of path to outfd.
 *
 * In the parent, the return value of fork() is returned unchanged: the
 * child's pid on success, or -1 if the fork failed. The child never returns
 * from this function -- it exits with status 0 after copying the file, or
 * reports the failure on errfd through bail() and exits with status 1.
 */
int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd)
{
	char buffer[4096] = { 0 };

	pid_t child = fork();
	if (child != 0)
		return child;
	/* in child */

	/* Join the target userns. */
	int nsfd = open(userns_path, O_RDONLY);
	if (nsfd < 0)
		bail(errfd, "open userns path %s failed: %m", userns_path);

	int err = setns(nsfd, CLONE_NEWUSER);
	if (err < 0)
		bail(errfd, "setns %s failed: %m", userns_path);

	close(nsfd);

	/* Pipe the requested file contents. */
	int fd = open(path, O_RDONLY);
	if (fd < 0)
		bail(errfd, "open %s in userns %s failed: %m", path, userns_path);

	int nread, ntotal = 0;
	while ((nread = read(fd, buffer, sizeof(buffer))) != 0) {
		if (nread < 0)
			bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal);
		ntotal += nread;

		int nwritten = 0;
		while (nwritten < nread) {
			/*
			 * write(2) may accept fewer bytes than requested, so resume
			 * from the first byte that has not been written yet rather
			 * than from the start of the buffer.
			 */
			int n = write(outfd, buffer + nwritten, nread - nwritten);
			if (n < 0)
				bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m",
				     nread - nwritten, path, nwritten);
			nwritten += n;
		}
		if (nread != nwritten)
			bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten);
	}

	close(fd);
	close(outfd);
	close(errfd);

	/* We must exit here, otherwise we would return into a forked runc. */
	exit(0);
}

View File

@ -0,0 +1,186 @@
//go:build linux
package userns
import (
"bufio"
"bytes"
"fmt"
"io"
"os"
"unsafe"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
)
/*
#include <stdlib.h>
extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd);
*/
import "C"
// parseIdmapData parses data line by line, reading three integers from each
// line into the ContainerID, HostID and Size fields of a configs.IDMap, and
// returns the entries in input order. A line that does not parse, or a
// scanner failure, yields a nil slice and a wrapped error.
func parseIdmapData(data []byte) (ms []configs.IDMap, err error) {
	lines := bufio.NewScanner(bytes.NewReader(data))
	for lines.Scan() {
		text := lines.Text()

		var entry configs.IDMap
		_, scanErr := fmt.Sscanf(text, "%d %d %d", &entry.ContainerID, &entry.HostID, &entry.Size)
		if scanErr != nil {
			return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", text, scanErr)
		}
		ms = append(ms, entry)
	}
	if scanErr := lines.Err(); scanErr != nil {
		return nil, fmt.Errorf("parsing id map failed: %w", scanErr)
	}
	return ms, nil
}
// spawnUserNamespaceCat does something equivalent to
// nsenter --user=<nsPath> cat <path>, but more efficiently. It returns the
// contents of the requested file as read from within the user namespace.
//
// Two pipes are created: one carries the file contents and the other carries
// any error message written by the child. The C helper spawn_userns_cat is
// given the write ends of both pipes. Once both pipes have been drained the
// child is waited for; a non-successful exit is turned into an error that
// carries the child's message (or its exit code if it wrote nothing).
func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) {
	rdr, wtr, err := os.Pipe()
	if err != nil {
		return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err)
	}
	// The deferred closes cover the early-return paths; on the main path the
	// explicit Close calls further down run first.
	defer rdr.Close()
	defer wtr.Close()

	errRdr, errWtr, err := os.Pipe()
	if err != nil {
		return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err)
	}
	defer errRdr.Close()
	defer errWtr.Close()

	// C.CString allocates copies of the strings that must be released with
	// C.free.
	cNsPath := C.CString(nsPath)
	defer C.free(unsafe.Pointer(cNsPath))
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))

	childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd()))

	if childPid < 0 {
		return nil, fmt.Errorf("failed to spawn fork for userns")
	} else if childPid == 0 {
		// this should never happen
		panic("runc executing inside fork child -- unsafe state!")
	}

	// We are in the parent -- close the write end of the pipe before reading.
	wtr.Close()
	output, err := io.ReadAll(rdr)
	rdr.Close()
	if err != nil {
		return nil, fmt.Errorf("reading from userns spawn failed: %w", err)
	}

	// Ditto for the error pipe.
	errWtr.Close()
	errOutput, err := io.ReadAll(errRdr)
	errRdr.Close()
	if err != nil {
		return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err)
	}
	errOutput = bytes.TrimSpace(errOutput)

	// Clean up the child.
	child, err := os.FindProcess(int(childPid))
	if err != nil {
		return nil, fmt.Errorf("could not find userns spawn process: %w", err)
	}
	state, err := child.Wait()
	if err != nil {
		return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err)
	}
	if !state.Success() {
		// Prefer the message the child wrote to the error pipe; fall back to
		// its exit code when it wrote nothing.
		errStr := string(errOutput)
		if errStr == "" {
			errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode())
		}
		return nil, fmt.Errorf("userns spawn: %s", errStr)
	} else if len(errOutput) > 0 {
		// We can just ignore weird output in the error pipe if the process
		// didn't bail(), but for completeness output for debugging.
		logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput))
	}
	// The subprocess succeeded, return whatever it wrote to the pipe.
	return output, nil
}
// GetUserNamespaceMappings returns the uid and gid mappings of the user
// namespace referenced by nsPath.
//
// When nsPath is exactly of the form /proc/<pid>/ns/user, the mappings are
// read directly from /proc/<pid>/uid_map and /proc/<pid>/gid_map (the fast
// path). Otherwise, or if the direct read fails, a helper process joins the
// namespace and reads /proc/self/{uid,gid}_map from inside it (the slow
// path). Any failure of the slow path or of parsing returns nil maps and a
// non-nil error.
func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) {
	var (
		pid         int
		extra       rune
		tryFastPath bool
	)

	// nsPath is usually of the form /proc/<pid>/ns/user, which means that we
	// already have a pid that is part of the user namespace and thus we can
	// just use the pid to read from /proc/<pid>/*id_map.
	//
	// Note that Sscanf doesn't consume the whole input, so we check for any
	// trailing data with %c (n == 2 means there was trailing data).
	//
	// However, n == 1 on its own is not proof of an exact match: Sscanf
	// reports the number of items parsed before it stopped, so inputs such as
	// /proc/<pid> or /proc/<pid>/ns/net also yield n == 1. To make sure the
	// fast path is only taken for a real userns path, additionally require
	// that the canonical path rebuilt from the parsed pid equals nsPath.
	if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 {
		tryFastPath = pid > 0 && nsPath == fmt.Sprintf("/proc/%d/ns/user", pid)
	}

	for _, mapType := range []struct {
		name  string
		idMap *[]configs.IDMap
	}{
		{"uid_map", &uidMap},
		{"gid_map", &gidMap},
	} {
		var mapData []byte

		if tryFastPath {
			path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name)
			data, err := os.ReadFile(path)
			if err != nil {
				// Do not error out here -- we need to try the slow path if the
				// fast path failed.
				logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err)
			} else {
				mapData = data
			}
		} else {
			logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath)
		}

		if mapData == nil {
			// We have to actually join the namespace if we cannot take the
			// fast path. The path is resolved with respect to the child
			// process, so just use /proc/self.
			data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name)
			if err != nil {
				return nil, nil, err
			}
			mapData = data
		}
		idMap, err := parseIdmapData(mapData)
		if err != nil {
			return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err)
		}
		*mapType.idMap = idMap
	}

	return uidMap, gidMap, nil
}
// IsSameMapping reports whether the two id mappings are identical: they must
// have the same length and compare equal entry by entry, in order. Mappings
// that cover the same ids but are ordered differently, or where one entry has
// been split into several, are therefore reported as different.
func IsSameMapping(a, b []configs.IDMap) bool {
	same := len(a) == len(b)
	for i := 0; same && i < len(a); i++ {
		same = a[i] == b[i]
	}
	return same
}

View File

@ -3,6 +3,8 @@
package userns
import "github.com/opencontainers/runc/libcontainer/user"
// runningInUserNS is a stub for non-Linux systems
// Always returns false
func runningInUserNS() bool {
@ -11,6 +13,6 @@ func runningInUserNS() bool {
// uidMapInUserNS is a stub for non-Linux systems
// Always returns false
func uidMapInUserNS(uidMap string) bool {
func uidMapInUserNS(uidmap []user.IDMap) bool {
return false
}

View File

@ -19,14 +19,13 @@ package utils
import (
"fmt"
"os"
"runtime"
"golang.org/x/sys/unix"
)
// MaxNameLen is the maximum length of the name of a file descriptor being sent
// using SendFile. The name of the file handle returned by RecvFile will never be
// larger than this value.
// MaxNameLen is the maximum length of the name of a file descriptor being
// sent using SendFd. The name of the file handle returned by RecvFd will never
// be larger than this value.
const MaxNameLen = 4096
// oobSpace is the size of the oob slice required to store a single FD. Note
@ -34,21 +33,26 @@ const MaxNameLen = 4096
// so sizeof(fd) = 4.
var oobSpace = unix.CmsgSpace(4)
// RecvFile waits for a file descriptor to be sent over the given AF_UNIX
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFile(socket *os.File) (_ *os.File, Err error) {
func RecvFd(socket *os.File) (*os.File, error) {
// For some reason, unix.Recvmsg uses the length rather than the capacity
// when passing the msg_controllen and other attributes to recvmsg. So we
// have to actually set the length.
name := make([]byte, MaxNameLen)
oob := make([]byte, oobSpace)
sockfd := socket.Fd()
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
if err != nil {
return nil, err
}
if n >= MaxNameLen || oobn != oobSpace {
return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
}
// Truncate.
name = name[:n]
oob = oob[:oobn]
@ -57,63 +61,36 @@ func RecvFile(socket *os.File) (_ *os.File, Err error) {
if err != nil {
return nil, err
}
// We cannot control how many SCM_RIGHTS we receive, and upon receiving
// them all of the descriptors are installed in our fd table, so we need to
// parse all of the SCM_RIGHTS we received in order to close all of the
// descriptors on error.
var fds []int
defer func() {
for i, fd := range fds {
if i == 0 && Err == nil {
// Only close the first one on error.
continue
}
// Always close extra ones.
_ = unix.Close(fd)
}
}()
var lastErr error
for _, scm := range scms {
if scm.Header.Type == unix.SCM_RIGHTS {
scmFds, err := unix.ParseUnixRights(&scm)
if err != nil {
lastErr = err
} else {
fds = append(fds, scmFds...)
}
}
}
if lastErr != nil {
return nil, lastErr
}
// We do this after collecting the fds to make sure we close them all when
// returning an error here.
if len(scms) != 1 {
return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
}
scm := scms[0]
fds, err := unix.ParseUnixRights(&scm)
if err != nil {
return nil, err
}
if len(fds) != 1 {
return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
}
return os.NewFile(uintptr(fds[0]), string(name)), nil
fd := uintptr(fds[0])
return os.NewFile(fd, string(name)), nil
}
// SendFile sends a file over the given AF_UNIX socket. file.Name() is also
// included so that if the other end uses RecvFile, the file will have the same
// name information.
func SendFile(socket *os.File, file *os.File) error {
name := file.Name()
// SendFd sends a file descriptor over the given AF_UNIX socket. In
// addition, the given name will also be sent as non-auxiliary data in the
// same payload (allowing to send contextual information for a file
// descriptor).
func SendFd(socket *os.File, name string, fd uintptr) error {
if len(name) >= MaxNameLen {
return fmt.Errorf("sendfd: filename too long: %s", name)
}
err := SendRawFd(socket, name, file.Fd())
runtime.KeepAlive(file)
return err
return SendFds(socket, []byte(name), int(fd))
}
// SendRawFd sends a specific file descriptor over the given AF_UNIX socket.
func SendRawFd(socket *os.File, msg string, fd uintptr) error {
oob := unix.UnixRights(int(fd))
return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0)
// SendFds sends a list of file descriptors and msg over the given AF_UNIX socket.
func SendFds(socket *os.File, msg []byte, fds ...int) error {
oob := unix.UnixRights(fds...)
return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0)
}

View File

@ -132,16 +132,19 @@ func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
return fn(procfd)
}
// SearchLabels searches through a list of key=value pairs for a given key,
// returning its value, and the binary flag telling whether the key exist.
func SearchLabels(labels []string, key string) (string, bool) {
key += "="
for _, s := range labels {
if strings.HasPrefix(s, key) {
return s[len(key):], true
// SearchLabels searches a list of key-value pairs for the provided key and
// returns the corresponding value. The pairs must be separated with '='.
func SearchLabels(labels []string, query string) string {
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == query {
return parts[1]
}
}
return "", false
return ""
}
// Annotations returns the bundle path and user defined annotations from the

View File

@ -5,10 +5,9 @@ package utils
import (
"fmt"
"math"
"os"
"strconv"
"sync"
_ "unsafe" // for go:linkname
"golang.org/x/sys/unix"
)
@ -25,38 +24,11 @@ func EnsureProcHandle(fh *os.File) error {
return nil
}
var (
haveCloseRangeCloexecBool bool
haveCloseRangeCloexecOnce sync.Once
)
func haveCloseRangeCloexec() bool {
haveCloseRangeCloexecOnce.Do(func() {
// Make sure we're not closing a random file descriptor.
tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
if err != nil {
return
}
defer unix.Close(tmpFd)
err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
// other potential error would imply that even the most basic close
// operation wouldn't work.
haveCloseRangeCloexecBool = err == nil
})
return haveCloseRangeCloexecBool
}
// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for
// the process (except for those below the given fd value).
func CloseExecFrom(minFd int) error {
if haveCloseRangeCloexec() {
err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
return os.NewSyscallError("close_range", err)
}
type fdFunc func(fd int)
// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
// the current process.
func fdRangeFrom(minFd int, fn fdFunc) error {
fdDir, err := os.Open("/proc/self/fd")
if err != nil {
return err
@ -81,17 +53,62 @@ func CloseExecFrom(minFd int) error {
if fd < minFd {
continue
}
// Intentionally ignore errors from unix.CloseOnExec -- the cases where
// this might fail are basically file descriptors that have already
// been closed (including and especially the one that was created when
// os.ReadDir did the "opendir" syscall).
unix.CloseOnExec(fd)
// Ignore the file descriptor we used for readdir, as it will be closed
// when we return.
if uintptr(fd) == fdDir.Fd() {
continue
}
// Run the closure.
fn(fd)
}
return nil
}
// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
// equal to minFd in the current process.
func CloseExecFrom(minFd int) error {
return fdRangeFrom(minFd, unix.CloseOnExec)
}
//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
// In order to make sure we do not close the internal epoll descriptors the Go
// runtime uses, we need to ensure that we skip descriptors that match
// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
// unfortunately there's no other way to be sure we're only keeping the file
// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the
// current process, except for those critical to Go's runtime (such as the
// netpoll management descriptors).
//
// NOTE: That this function is incredibly dangerous to use in most Go code, as
// closing file descriptors from underneath *os.File handles can lead to very
// bad behaviour (the closed file descriptor can be re-used and then any
// *os.File operations would apply to the wrong file). This function is only
// intended to be called from the last stage of runc init.
func UnsafeCloseFrom(minFd int) error {
// We must not close some file descriptors.
return fdRangeFrom(minFd, func(fd int) {
if runtime_IsPollDescriptor(uintptr(fd)) {
// These are the Go runtimes internal netpoll file descriptors.
// These file descriptors are operated on deep in the Go scheduler,
// and closing those files from underneath Go can result in panics.
// There is no issue with keeping them because they are not
// executable and are not useful to an attacker anyway. Also we
// don't have any choice.
return
}
// There's nothing we can do about errors from close(2), and the
// only likely error to be seen is EBADF which indicates the fd was
// already closed (in which case, we got what we wanted).
_ = unix.Close(fd)
})
}
// NewSockPair returns a new unix socket pair
func NewSockPair(name string) (parent, child *os.File, err error) {
func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err