Bump to runc main

By using main instead of the v1.1 branch, we drop an unnecessary
dependency on cilium/ebpf, saving ~1 MB of binary size.

Signed-off-by: Matt Heon <mheon@redhat.com>
Matt Heon
2024-02-02 10:06:20 -05:00
parent 6a727fdbee
commit 91b8f208a2
172 changed files with 4363 additions and 22273 deletions

View File

@ -8,9 +8,9 @@ The following is courtesy of our legal counsel:
Use and transfer of Docker may be subject to certain restrictions by the
United States and other governments.
It is your responsibility to ensure that your use and/or transfer does not
violate applicable laws.
For more information, please see http://www.bis.doc.gov

View File

@ -26,14 +26,19 @@ func isEnabled() bool {
}
func setProcAttr(attr, value string) error {
// Under AppArmor you can only change your own attr, so use /proc/self/
// instead of /proc/<tid>/ like libapparmor does
attrPath := "/proc/self/attr/apparmor/" + attr
if _, err := os.Stat(attrPath); errors.Is(err, os.ErrNotExist) {
attr = utils.CleanPath(attr)
attrSubPath := "attr/apparmor/" + attr
if _, err := os.Stat("/proc/self/" + attrSubPath); errors.Is(err, os.ErrNotExist) {
// fall back to the old convention
attrPath = "/proc/self/attr/" + attr
attrSubPath = "attr/" + attr
}
// Under AppArmor you can only change your own attr, so there's no reason
// to not use /proc/thread-self/ (instead of /proc/<tid>/, like libapparmor
// does).
attrPath, closer := utils.ProcThreadSelf(attrSubPath)
defer closer()
f, err := os.OpenFile(attrPath, os.O_WRONLY, 0)
if err != nil {
return err

View File

@ -1,9 +1,24 @@
package cgroups
import (
"errors"
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
// ErrDevicesUnsupported is an error returned when a cgroup manager
// is not configured to set device rules.
ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules")
// DevicesSetV1 and DevicesSetV2 are functions to set devices for
// cgroup v1 and v2, respectively. Unless the libcontainer/cgroups/devices
// package is imported, they are nil, so cgroup managers can't
// manage devices.
DevicesSetV1 func(path string, r *configs.Resources) error
DevicesSetV2 func(path string, r *configs.Resources) error
)
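A minimal sketch (not part of this diff) of how a consumer opts back in to device-rule management: blank-importing the cgroups/devices subpackage populates these hooks, per the comment above; the exact wiring inside that package (presumably an init()) is an assumption here.

package main

import (
	"fmt"

	// Blank import for side effects: fills in cgroups.DevicesSetV1/DevicesSetV2.
	_ "github.com/opencontainers/runc/libcontainer/cgroups/devices"

	"github.com/opencontainers/runc/libcontainer/cgroups"
)

func main() {
	// Without the blank import above, both hooks stay nil and the fs/fs2
	// managers return ErrDevicesUnsupported whenever device rules are set.
	fmt.Println(cgroups.DevicesSetV1 != nil, cgroups.DevicesSetV2 != nil)
}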
type Manager interface {
// Apply creates a cgroup, if not yet created, and adds a process
// with the specified pid into that cgroup. A special value of -1

View File

@ -1,386 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
/*
* Copyright (C) 2020 Aleksa Sarai <cyphar@cyphar.com>
* Copyright (C) 2020 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package devices
import (
"bufio"
"fmt"
"io"
"sort"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/devices"
)
// deviceMeta is a Rule without the Allow or Permissions fields, and no
// wildcard-type support. It's effectively the "match" portion of a metadata
// rule, for the purposes of our emulation.
type deviceMeta struct {
node devices.Type
major int64
minor int64
}
// deviceRule is effectively the tuple (deviceMeta, Permissions).
type deviceRule struct {
meta deviceMeta
perms devices.Permissions
}
// deviceRules is a mapping of device metadata rules to the associated
// permissions in the ruleset.
type deviceRules map[deviceMeta]devices.Permissions
func (r deviceRules) orderedEntries() []deviceRule {
var rules []deviceRule
for meta, perms := range r {
rules = append(rules, deviceRule{meta: meta, perms: perms})
}
sort.Slice(rules, func(i, j int) bool {
// Sort by (major, minor, type).
a, b := rules[i].meta, rules[j].meta
return a.major < b.major ||
(a.major == b.major && a.minor < b.minor) ||
(a.major == b.major && a.minor == b.minor && a.node < b.node)
})
return rules
}
type Emulator struct {
defaultAllow bool
rules deviceRules
}
func (e *Emulator) IsBlacklist() bool {
return e.defaultAllow
}
func (e *Emulator) IsAllowAll() bool {
return e.IsBlacklist() && len(e.rules) == 0
}
func parseLine(line string) (*deviceRule, error) {
// Input: node major:minor perms.
fields := strings.FieldsFunc(line, func(r rune) bool {
return r == ' ' || r == ':'
})
if len(fields) != 4 {
return nil, fmt.Errorf("malformed devices.list rule %s", line)
}
var (
rule deviceRule
node = fields[0]
major = fields[1]
minor = fields[2]
perms = fields[3]
)
// Parse the node type.
switch node {
case "a":
// Super-special case -- "a" always means every device with every
// access mode. In fact, for devices.list this actually indicates that
// the cgroup is in black-list mode.
// TODO: Double-check that the entire file is "a *:* rwm".
return nil, nil
case "b":
rule.meta.node = devices.BlockDevice
case "c":
rule.meta.node = devices.CharDevice
default:
return nil, fmt.Errorf("unknown device type %q", node)
}
// Parse the major number.
if major == "*" {
rule.meta.major = devices.Wildcard
} else {
val, err := strconv.ParseUint(major, 10, 32)
if err != nil {
return nil, fmt.Errorf("invalid major number: %w", err)
}
rule.meta.major = int64(val)
}
// Parse the minor number.
if minor == "*" {
rule.meta.minor = devices.Wildcard
} else {
val, err := strconv.ParseUint(minor, 10, 32)
if err != nil {
return nil, fmt.Errorf("invalid minor number: %w", err)
}
rule.meta.minor = int64(val)
}
// Parse the access permissions.
rule.perms = devices.Permissions(perms)
if !rule.perms.IsValid() || rule.perms.IsEmpty() {
return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
}
return &rule, nil
}
func (e *Emulator) addRule(rule deviceRule) error { //nolint:unparam
if e.rules == nil {
e.rules = make(map[deviceMeta]devices.Permissions)
}
// Merge with any pre-existing permissions.
oldPerms := e.rules[rule.meta]
newPerms := rule.perms.Union(oldPerms)
e.rules[rule.meta] = newPerms
return nil
}
func (e *Emulator) rmRule(rule deviceRule) error {
// Give an error if any of the permissions requested to be removed are
// present in a partially-matching wildcard rule, because such rules will
// be ignored by cgroupv1.
//
// This is a diversion from cgroupv1, but is necessary to avoid leading
// users into a false sense of security. cgroupv1 will silently(!) ignore
// requests to remove partial exceptions, but we really shouldn't do that.
//
// It may seem like we could just "split" wildcard rules which hit this
// issue, but unfortunately there are 2^32 possible major and minor
// numbers, which would exhaust kernel memory quickly if we did this. Not
// to mention it'd be really slow (the kernel side is implemented as a
// linked-list of exceptions).
for _, partialMeta := range []deviceMeta{
{node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor},
{node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard},
{node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard},
} {
// This wildcard rule is equivalent to the requested rule, so skip it.
if rule.meta == partialMeta {
continue
}
// Only give an error if the set of permissions overlap.
partialPerms := e.rules[partialMeta]
if !partialPerms.Intersection(rule.perms).IsEmpty() {
return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
}
}
// Subtract all of the permissions listed from the full match rule. If the
// rule didn't exist, all of this is a no-op.
newPerms := e.rules[rule.meta].Difference(rule.perms)
if newPerms.IsEmpty() {
delete(e.rules, rule.meta)
} else {
e.rules[rule.meta] = newPerms
}
// TODO: The actual cgroup code doesn't care if an exception didn't exist
// during removal, so not erroring out here is /accurate/ but quite
// worrying. Maybe we should do additional validation, but again we
// have to worry about backwards-compatibility.
return nil
}
func (e *Emulator) allow(rule *deviceRule) error {
// This cgroup is configured as a black-list. Reset the entire emulator,
// and put it into black-list mode.
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = Emulator{
defaultAllow: true,
rules: nil,
}
return nil
}
var err error
if e.defaultAllow {
err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception")
} else {
err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception")
}
return err
}
func (e *Emulator) deny(rule *deviceRule) error {
// This cgroup is configured as a white-list. Reset the entire emulator,
// and put it into white-list mode.
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = Emulator{
defaultAllow: false,
rules: nil,
}
return nil
}
var err error
if e.defaultAllow {
err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception")
} else {
err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception")
}
return err
}
func (e *Emulator) Apply(rule devices.Rule) error {
if !rule.Type.CanCgroup() {
return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
}
innerRule := &deviceRule{
meta: deviceMeta{
node: rule.Type,
major: rule.Major,
minor: rule.Minor,
},
perms: rule.Permissions,
}
if innerRule.meta.node == devices.WildcardDevice {
innerRule = nil
}
if rule.Allow {
return e.allow(innerRule)
}
return e.deny(innerRule)
}
// EmulatorFromList takes a reader to a "devices.list"-like source, and returns
// a new Emulator that represents the state of the devices cgroup. Note that
// black-list devices cgroups cannot be fully reconstructed, due to limitations
// in the devices cgroup API. Instead, such cgroups are always treated as
// "allow all" cgroups.
func EmulatorFromList(list io.Reader) (*Emulator, error) {
// Normally cgroups are in black-list mode by default, but the way we
// figure out the current mode is whether or not devices.list has an
// allow-all rule. So we default to a white-list, and the existence of an
// "a *:* rwm" entry will tell us otherwise.
e := &Emulator{
defaultAllow: false,
}
// Parse the "devices.list".
s := bufio.NewScanner(list)
for s.Scan() {
line := s.Text()
deviceRule, err := parseLine(line)
if err != nil {
return nil, fmt.Errorf("error parsing line %q: %w", line, err)
}
// "devices.list" is an allow list. Note that this means that in
// black-list mode, we have no idea what rules are in play. As a
// result, we need to be very careful in Transition().
if err := e.allow(deviceRule); err != nil {
return nil, fmt.Errorf("error adding devices.list rule: %w", err)
}
}
if err := s.Err(); err != nil {
return nil, fmt.Errorf("error reading devices.list lines: %w", err)
}
return e, nil
}
// Transition calculates the minimally disruptive set of rules that need to
// be applied to a devices cgroup in order to transition to the given target.
// This means that any already-existing rules will not be applied, and
// disruptive rules (like denying all device access) will only be applied if
// necessary.
//
// This function is the sole reason for all of Emulator -- to allow us
// to figure out how to update a container's cgroups without causing spurious
// device errors (if possible).
func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
var transitionRules []*devices.Rule
oldRules := source.rules
// If the default policy doesn't match, we need to include a "disruptive"
// rule (either allow-all or deny-all) in order to switch the cgroup to the
// correct default policy.
//
// However, due to a limitation in "devices.list" we cannot be sure what
// deny rules are in place in a black-list cgroup. Thus if the source is a
// black-list we also have to include a disruptive rule.
if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
transitionRules = append(transitionRules, &devices.Rule{
Type: 'a',
Major: -1,
Minor: -1,
Permissions: devices.Permissions("rwm"),
Allow: target.defaultAllow,
})
// The old rules are only relevant if we aren't starting out with a
// disruptive rule.
oldRules = nil
}
// NOTE: We traverse through the rules in a sorted order so we always write
// the same set of rules (this is to aid testing).
// First, we create inverse rules for any old rules not in the new set.
// This includes partial-inverse rules for specific permissions. This is a
// no-op if we added a disruptive rule, since oldRules will be empty.
for _, rule := range oldRules.orderedEntries() {
meta, oldPerms := rule.meta, rule.perms
newPerms := target.rules[meta]
droppedPerms := oldPerms.Difference(newPerms)
if !droppedPerms.IsEmpty() {
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
Permissions: droppedPerms,
Allow: target.defaultAllow,
})
}
}
// Add any additional rules which weren't in the old set. We happen to
// filter out rules which are present in both sets, though this isn't
// strictly necessary.
for _, rule := range target.rules.orderedEntries() {
meta, newPerms := rule.meta, rule.perms
oldPerms := oldRules[meta]
gainedPerms := newPerms.Difference(oldPerms)
if !gainedPerms.IsEmpty() {
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
Permissions: gainedPerms,
Allow: !target.defaultAllow,
})
}
}
return transitionRules, nil
}
// Rules returns the minimum set of rules necessary to convert a *deny-all*
// cgroup to the emulated filter state (note that this is not the same as a
// default cgroupv1 cgroup -- which is allow-all). This is effectively just a
// wrapper around Transition() with the source emulator being an empty cgroup.
func (e *Emulator) Rules() ([]*devices.Rule, error) {
defaultCgroup := &Emulator{defaultAllow: false}
return defaultCgroup.Transition(e)
}
func wrapErr(err error, text string) error {
if err == nil {
return nil
}
return fmt.Errorf(text+": %w", err)
}
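A minimal sketch (not part of this diff) of how this emulator is typically driven: load the current state from devices.list, build the desired state, and ask Transition for the smallest rule set. The devices.list content, device numbers, and expected output are illustrative assumptions.

package main

import (
	"fmt"
	"strings"

	emulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
	"github.com/opencontainers/runc/libcontainer/devices"
)

func main() {
	// Current cgroup state, as read from devices.list: rwm on c 1:3 (/dev/null).
	cur, err := emulator.EmulatorFromList(strings.NewReader("c 1:3 rwm\n"))
	if err != nil {
		panic(err)
	}
	// Desired state: read/write only (drop mknod) on the same device.
	want := &emulator.Emulator{}
	err = want.Apply(devices.Rule{
		Type:        devices.CharDevice,
		Major:       1,
		Minor:       3,
		Permissions: devices.Permissions("rw"),
		Allow:       true,
	})
	if err != nil {
		panic(err)
	}
	rules, err := cur.Transition(want)
	if err != nil {
		panic(err)
	}
	for _, r := range rules {
		// Expect a single deny rule dropping the "m" permission.
		fmt.Println(r.Allow, r.CgroupString())
	}
}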

View File

@ -1,208 +0,0 @@
// Package devicefilter contains an eBPF device filter program generator
//
// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
//
// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
package devicefilter
import (
"errors"
"fmt"
"math"
"strconv"
"github.com/cilium/ebpf/asm"
devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/devices"
"golang.org/x/sys/unix"
)
const (
// license string format is same as kernel MODULE_LICENSE macro
license = "Apache"
)
// DeviceFilter returns eBPF device filter program and its license string
func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
// Generate the minimum ruleset for the device rules we are given. While we
// don't care about minimum transitions in cgroupv2, using the emulator
// gives us a guarantee that the behaviour of devices filtering is the same
// as cgroupv1, including security hardenings to avoid misconfiguration
// (such as punching holes in wildcard rules).
emu := new(devicesemulator.Emulator)
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, "", err
}
}
cleanRules, err := emu.Rules()
if err != nil {
return nil, "", err
}
p := &program{
defaultAllow: emu.IsBlacklist(),
}
p.init()
for idx, rule := range cleanRules {
if rule.Type == devices.WildcardDevice {
// We can safely skip over wildcard entries because there should
// only be one (at most) at the very start to instruct cgroupv1 to
// go into allow-list mode. However we do double-check this here.
if idx != 0 || rule.Allow != emu.IsBlacklist() {
return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
}
continue
}
if rule.Allow == p.defaultAllow {
// There should be no rules which have an action equal to the
// default action; the emulator removes those.
return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
}
if err := p.appendRule(rule); err != nil {
return nil, "", err
}
}
return p.finalize(), license, nil
}
type program struct {
insts asm.Instructions
defaultAllow bool
blockID int
}
func (p *program) init() {
// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
/*
u32 access_type
u32 major
u32 minor
*/
// R2 <- type (lower 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
asm.And.Imm32(asm.R2, 0xFFFF))
// R3 <- access (upper 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
// RSh: bitwise shift right
asm.RSh.Imm32(asm.R3, 16))
// R4 <- major (u32 major at R1[4])
p.insts = append(p.insts,
asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
// R5 <- minor (u32 minor at R1[8])
p.insts = append(p.insts,
asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
}
// appendRule converts an OCI rule to the relevant eBPF block and adds it
// to the in-progress filter program. In order to operate properly, it must be
// called with a "clean" rule list (generated by devices.Emulator.Rules() --
// with any "a" rules removed).
func (p *program) appendRule(rule *devices.Rule) error {
if p.blockID < 0 {
return errors.New("the program is finalized")
}
var bpfType int32
switch rule.Type {
case devices.CharDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
case devices.BlockDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
default:
// We do not permit 'a', nor any other types we don't know about.
return fmt.Errorf("invalid type %q", string(rule.Type))
}
if rule.Major > math.MaxUint32 {
return fmt.Errorf("invalid major %d", rule.Major)
}
if rule.Minor > math.MaxUint32 {
return fmt.Errorf("invalid minor %d", rule.Major)
}
hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
hasMinor := rule.Minor >= 0
bpfAccess := int32(0)
for _, r := range rule.Permissions {
switch r {
case 'r':
bpfAccess |= unix.BPF_DEVCG_ACC_READ
case 'w':
bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
case 'm':
bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
default:
return fmt.Errorf("unknown device access %v", r)
}
}
// If the access is rwm, skip the check.
hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
var (
blockSym = "block-" + strconv.Itoa(p.blockID)
nextBlockSym = "block-" + strconv.Itoa(p.blockID+1)
prevBlockLastIdx = len(p.insts) - 1
)
p.insts = append(p.insts,
// if (R2 != bpfType) goto next
asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
)
if hasAccess {
p.insts = append(p.insts,
// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
asm.Mov.Reg32(asm.R1, asm.R3),
asm.And.Imm32(asm.R1, bpfAccess),
asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
)
}
if hasMajor {
p.insts = append(p.insts,
// if (R4 != major) goto next
asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
)
}
if hasMinor {
p.insts = append(p.insts,
// if (R5 != minor) goto next
asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
)
}
p.insts = append(p.insts, acceptBlock(rule.Allow)...)
// set blockSym to the first instruction we added in this iteration
p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
p.blockID++
return nil
}
func (p *program) finalize() asm.Instructions {
var v int32
if p.defaultAllow {
v = 1
}
blockSym := "block-" + strconv.Itoa(p.blockID)
p.insts = append(p.insts,
// R0 <- v
asm.Mov.Imm32(asm.R0, v).Sym(blockSym),
asm.Return(),
)
p.blockID = -1
return p.insts
}
func acceptBlock(accept bool) asm.Instructions {
var v int32
if accept {
v = 1
}
return []asm.Instruction{
// R0 <- v
asm.Mov.Imm32(asm.R0, v),
asm.Return(),
}
}

View File

@ -1,253 +0,0 @@
package ebpf
import (
"errors"
"fmt"
"os"
"runtime"
"sync"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func nilCloser() error {
return nil
}
func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
type bpfAttrQuery struct {
TargetFd uint32
AttachType uint32
QueryType uint32
AttachFlags uint32
ProgIds uint64 // __aligned_u64
ProgCnt uint32
}
// Currently you can only have 64 eBPF programs attached to a cgroup.
size := 64
retries := 0
for retries < 10 {
progIds := make([]uint32, size)
query := bpfAttrQuery{
TargetFd: uint32(dirFd),
AttachType: uint32(unix.BPF_CGROUP_DEVICE),
ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))),
ProgCnt: uint32(len(progIds)),
}
// Fetch the list of program ids.
_, _, errno := unix.Syscall(unix.SYS_BPF,
uintptr(unix.BPF_PROG_QUERY),
uintptr(unsafe.Pointer(&query)),
unsafe.Sizeof(query))
size = int(query.ProgCnt)
runtime.KeepAlive(query)
if errno != 0 {
// On ENOSPC we get the correct number of programs.
if errno == unix.ENOSPC {
retries++
continue
}
return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
}
// Convert the ids to program handles.
progIds = progIds[:size]
programs := make([]*ebpf.Program, 0, len(progIds))
for _, progId := range progIds {
program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
if err != nil {
// We skip over programs that give us -EACCES or -EPERM. This
// is necessary because there may be BPF programs that have
// been attached (such as with --systemd-cgroup) which have an
// LSM label that blocks us from interacting with the program.
//
// Because additional BPF_CGROUP_DEVICE programs can only add
// restrictions, there's no real issue with just ignoring these
// programs (and this stops runc from breaking on distributions with
// very strict SELinux policies).
if errors.Is(err, os.ErrPermission) {
logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err)
continue
}
return nil, fmt.Errorf("cannot fetch program from id: %w", err)
}
programs = append(programs, program)
}
runtime.KeepAlive(progIds)
return programs, nil
}
return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
}
var (
haveBpfProgReplaceBool bool
haveBpfProgReplaceOnce sync.Once
)
// Loosely based on the BPF_F_REPLACE support check in
// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
//
// TODO: move this logic to cilium/ebpf
func haveBpfProgReplace() bool {
haveBpfProgReplaceOnce.Do(func() {
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
License: "MIT",
Instructions: asm.Instructions{
asm.Mov.Imm(asm.R0, 0),
asm.Return(),
},
})
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
return
}
defer prog.Close()
devnull, err := os.Open("/dev/null")
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err)
return
}
defer devnull.Close()
// We know that we have BPF_PROG_ATTACH since we can load
// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
// we know that the feature isn't present.
err = link.RawAttachProgram(link.RawAttachProgramOptions{
// We rely on this fd being checked after attachFlags.
Target: int(devnull.Fd()),
// Attempt to "replace" bad fds with this program.
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE,
})
if errors.Is(err, unix.EINVAL) {
// not supported
return
}
// attach_flags test succeeded.
if !errors.Is(err, unix.EBADF) {
logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
}
haveBpfProgReplaceBool = true
})
return haveBpfProgReplaceBool
}
// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
//
// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15.
//
// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
// This limit is not inherited into the container.
memlockLimit := &unix.Rlimit{
Cur: unix.RLIM_INFINITY,
Max: unix.RLIM_INFINITY,
}
_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
// Get the list of existing programs.
oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
if err != nil {
return nilCloser, err
}
useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
// Generate new program.
spec := &ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
Instructions: insts,
License: license,
}
prog, err := ebpf.NewProgram(spec)
if err != nil {
return nilCloser, err
}
// If there is only one old program, we can just replace it directly.
var (
replaceProg *ebpf.Program
attachFlags uint32 = unix.BPF_F_ALLOW_MULTI
)
if useReplaceProg {
replaceProg = oldProgs[0]
attachFlags |= unix.BPF_F_REPLACE
}
err = link.RawAttachProgram(link.RawAttachProgramOptions{
Target: dirFd,
Program: prog,
Replace: replaceProg,
Attach: ebpf.AttachCGroupDevice,
Flags: attachFlags,
})
if err != nil {
return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
}
closer := func() error {
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
}
// TODO: Should we attach the old filters back in this case? Otherwise
// we fail-open on a security feature, which is a bit scary.
return nil
}
if !useReplaceProg {
logLevel := logrus.DebugLevel
// If there was more than one old program, give a warning (since this
// really shouldn't happen with runc-managed cgroups) and then detach
// all the old programs.
if len(oldProgs) > 1 {
// NOTE: Ideally this should be a warning but it turns out that
// systemd-managed cgroups trigger this warning (apparently
// systemd doesn't delete old non-systemd programs when
// setting properties).
logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
logLevel = logrus.InfoLevel
}
for idx, oldProg := range oldProgs {
// Output some extra debug info.
if info, err := oldProg.Info(); err == nil {
fields := logrus.Fields{
"type": info.Type.String(),
"tag": info.Tag,
"name": info.Name,
}
if id, ok := info.ID(); ok {
fields["id"] = id
}
if runCount, ok := info.RunCount(); ok {
fields["run_count"] = runCount
}
if runtime, ok := info.Runtime(); ok {
fields["runtime"] = runtime.String()
}
logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
}
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: oldProg,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
}
}
}
return closer, nil
}

View File

@ -50,24 +50,13 @@ func WriteFile(dir, file, data string) error {
return err
}
defer fd.Close()
if err := retryingWriteFile(fd, data); err != nil {
if _, err := fd.WriteString(data); err != nil {
// Having data in the error message helps in debugging.
return fmt.Errorf("failed to write %q: %w", data, err)
}
return nil
}
func retryingWriteFile(fd *os.File, data string) error {
for {
_, err := fd.Write([]byte(data))
if errors.Is(err, unix.EINTR) {
logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
continue
}
return err
}
}
const (
cgroupfsDir = "/sys/fs/cgroup"
cgroupfsPrefix = cgroupfsDir + "/"
@ -90,7 +79,7 @@ func prepareOpenat2() error {
})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare
if err != unix.ENOSYS {
logrus.Warnf("falling back to securejoin: %s", prepErr)
} else {
logrus.Debug("openat2 not available, falling back to securejoin")
@ -148,8 +137,9 @@ func openFile(dir, file string, flags int) (*os.File, error) {
//
// TODO: if such usage will ever be common, amend this
// to reopen cgroupRootHandle and retry openat2.
fdStr := strconv.Itoa(int(cgroupRootHandle.Fd()))
fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
fdPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(int(cgroupRootHandle.Fd())))
defer closer()
fdDest, _ := os.Readlink(fdPath)
if fdDest != cgroupfsDir {
// Wrap the error so it is clear that cgroupRootHandle
// is opened to an unexpected/wrong directory.

View File

@ -84,6 +84,28 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
period = ""
}
}
var burst string
if r.CpuBurst != nil {
burst = strconv.FormatUint(*r.CpuBurst, 10)
if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil {
// This is a special trick for the burst feature: current systemd and older
// kernels do not support it, so a `no such file or directory` error may be
// raised and can be ignored.
if !errors.Is(err, unix.ENOENT) {
// Sometimes when the burst to be set is larger
// than the current one, it is rejected by the kernel
// (EINVAL) as old_quota/new_burst exceeds the parent
// cgroup quota limit. If this happens and the quota is
// going to be set, ignore the error for now and retry
// after setting the quota.
if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 {
return err
}
}
} else {
burst = ""
}
}
if r.CpuQuota != 0 {
if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil {
return err
@ -93,7 +115,22 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
return err
}
}
if burst != "" {
if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil {
if !errors.Is(err, unix.ENOENT) {
return err
}
}
}
}
if r.CPUIdle != nil {
idle := strconv.FormatInt(*r.CPUIdle, 10)
if err := cgroups.WriteFile(path, "cpu.idle", idle); err != nil {
return err
}
}
return s.SetRtSched(path, r)
}

View File

@ -195,7 +195,7 @@ func cpusetEnsureParent(current string) error {
}
// Treat non-existing directory as cgroupfs as it will be created,
// and the root cpuset directory obviously exists.
if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare
if err != nil && err != unix.ENOENT {
return &os.PathError{Op: "statfs", Path: parent, Err: err}
}

View File

@ -1,20 +1,11 @@
package fs
import (
"bytes"
"errors"
"reflect"
"github.com/opencontainers/runc/libcontainer/cgroups"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
)
type DevicesGroup struct {
TestingSkipFinalCheck bool
}
type DevicesGroup struct{}
func (s *DevicesGroup) Name() string {
return "devices"
@ -33,75 +24,14 @@ func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error {
return apply(path, pid)
}
func loadEmulator(path string) (*cgroupdevices.Emulator, error) {
list, err := cgroups.ReadFile(path, "devices.list")
if err != nil {
return nil, err
}
return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list))
}
func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) {
// This defaults to a white-list -- which is what we want!
emu := &cgroupdevices.Emulator{}
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, err
}
}
return emu, nil
}
func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
if userns.RunningInUserNS() || r.SkipDevices {
return nil
}
// Generate two emulators, one for the current state of the cgroup and one
// for the requested state by the user.
current, err := loadEmulator(path)
if err != nil {
return err
}
target, err := buildEmulator(r.Devices)
if err != nil {
return err
}
// Compute the minimal set of transition rules needed to achieve the
// requested state.
transitionRules, err := current.Transition(target)
if err != nil {
return err
}
for _, rule := range transitionRules {
file := "devices.deny"
if rule.Allow {
file = "devices.allow"
}
if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil {
return err
if cgroups.DevicesSetV1 == nil {
if len(r.Devices) == 0 {
return nil
}
return cgroups.ErrDevicesUnsupported
}
// Final safety check -- ensure that the resulting state is what was
// requested. This is only really correct for white-lists, but for
// black-lists we can at least check that the cgroup is in the right mode.
//
// This safety-check is skipped for the unit tests because we cannot
// currently mock devices.list correctly.
if !s.TestingSkipFinalCheck {
currentAfter, err := loadEmulator(path)
if err != nil {
return err
}
if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) {
return errors.New("resulting devices cgroup doesn't precisely match target")
} else if target.IsBlacklist() != currentAfter.IsBlacklist() {
return errors.New("resulting devices cgroup doesn't match target mode")
}
}
return nil
return cgroups.DevicesSetV1(path, r)
}
func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {

View File

@ -54,13 +54,13 @@ type subsystem interface {
Set(path string, r *configs.Resources) error
}
type manager struct {
type Manager struct {
mu sync.Mutex
cgroups *configs.Cgroup
paths map[string]string
}
func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
func NewManager(cg *configs.Cgroup, paths map[string]string) (*Manager, error) {
// Some v1 controllers (cpu, cpuset, and devices) expect
// cgroups.Resources to not be nil in Apply.
if cg.Resources == nil {
@ -78,7 +78,7 @@ func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, e
}
}
return &manager{
return &Manager{
cgroups: cg,
paths: paths,
}, nil
@ -105,7 +105,7 @@ func isIgnorableError(rootless bool, err error) bool {
return false
}
func (m *manager) Apply(pid int) (err error) {
func (m *Manager) Apply(pid int) (err error) {
m.mu.Lock()
defer m.mu.Unlock()
@ -139,19 +139,19 @@ func (m *manager) Apply(pid int) (err error) {
return nil
}
func (m *manager) Destroy() error {
func (m *Manager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
return cgroups.RemovePaths(m.paths)
}
func (m *manager) Path(subsys string) string {
func (m *Manager) Path(subsys string) string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths[subsys]
}
func (m *manager) GetStats() (*cgroups.Stats, error) {
func (m *Manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
@ -167,7 +167,7 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
return stats, nil
}
func (m *manager) Set(r *configs.Resources) error {
func (m *Manager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
@ -183,7 +183,7 @@ func (m *manager) Set(r *configs.Resources) error {
if err := sys.Set(path, r); err != nil {
// When rootless is true, errors from the device subsystem
// are ignored, as it is really not expected to work.
if m.cgroups.Rootless && sys.Name() == "devices" {
if m.cgroups.Rootless && sys.Name() == "devices" && !errors.Is(err, cgroups.ErrDevicesUnsupported) {
continue
}
// However, errors from other subsystems are not ignored.
@ -202,7 +202,7 @@ func (m *manager) Set(r *configs.Resources) error {
// Freeze toggles the container's freezer cgroup depending on the state
// provided
func (m *manager) Freeze(state configs.FreezerState) error {
func (m *Manager) Freeze(state configs.FreezerState) error {
path := m.Path("freezer")
if path == "" {
return errors.New("cannot toggle freezer: cgroups not configured for container")
@ -218,25 +218,25 @@ func (m *manager) Freeze(state configs.FreezerState) error {
return nil
}
func (m *manager) GetPids() ([]int, error) {
func (m *Manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.Path("devices"))
}
func (m *manager) GetAllPids() ([]int, error) {
func (m *Manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.Path("devices"))
}
func (m *manager) GetPaths() map[string]string {
func (m *Manager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths
}
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
return m.cgroups, nil
}
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
func (m *Manager) GetFreezerState() (configs.FreezerState, error) {
dir := m.Path("freezer")
// If the container doesn't have the freezer cgroup, say it's undefined.
if dir == "" {
@ -246,7 +246,7 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) {
return freezer.GetState(dir)
}
func (m *manager) Exists() bool {
func (m *Manager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
@ -254,7 +254,7 @@ func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
func (m *Manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.Path("memory"))
// Ignore ENOENT when rootless as it couldn't create cgroup.
if err != nil && m.cgroups.Rootless && os.IsNotExist(err) {

View File

@ -165,9 +165,8 @@ func subsysPath(root, inner, subsystem string) (string, error) {
return filepath.Join(root, filepath.Base(mnt), inner), nil
}
// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
// process could be in a container sharing a pid namespace with the host, and
// /proc/1/cgroup could point to a whole other world of cgroups.
// Use GetOwnCgroupPath for dind-like cases, when cgroupns is not
// available. This is ugly.
parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
if err != nil {
return "", err

View File

@ -2,16 +2,19 @@ package fs2
import (
"bufio"
"errors"
"os"
"strconv"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
func isCpuSet(r *configs.Resources) bool {
return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0
return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 || r.CPUIdle != nil || r.CpuBurst != nil
}
func setCpu(dirPath string, r *configs.Resources) error {
@ -19,6 +22,12 @@ func setCpu(dirPath string, r *configs.Resources) error {
return nil
}
if r.CPUIdle != nil {
if err := cgroups.WriteFile(dirPath, "cpu.idle", strconv.FormatInt(*r.CPUIdle, 10)); err != nil {
return err
}
}
// NOTE: .CpuShares is not used here. Conversion is the caller's responsibility.
if r.CpuWeight != 0 {
if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil {
@ -26,6 +35,23 @@ func setCpu(dirPath string, r *configs.Resources) error {
}
}
var burst string
if r.CpuBurst != nil {
burst = strconv.FormatUint(*r.CpuBurst, 10)
if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil {
// Sometimes when the burst to be set is larger
// than the current one, it is rejected by the kernel
// (EINVAL) as old_quota/new_burst exceeds the parent
// cgroup quota limit. If this happens and the quota is
// going to be set, ignore the error for now and retry
// after setting the quota.
if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 {
return err
}
} else {
burst = ""
}
}
if r.CpuQuota != 0 || r.CpuPeriod != 0 {
str := "max"
if r.CpuQuota > 0 {
@ -41,6 +67,11 @@ func setCpu(dirPath string, r *configs.Resources) error {
if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil {
return err
}
if burst != "" {
if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil {
return err
}
}
}
return nil
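A minimal sketch (not part of this diff) of feeding the new CpuBurst knob through the cgroup v2 manager; the cgroup path and the numbers are illustrative assumptions.

package main

import (
	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
	"github.com/opencontainers/runc/libcontainer/configs"
)

func main() {
	burst := uint64(20000) // extra 20ms of quota the cgroup may accumulate
	r := &configs.Resources{
		CpuPeriod: 100000, // 100ms period, written to cpu.max
		CpuQuota:  50000,  // 50ms quota per period, written to cpu.max
		CpuBurst:  &burst, // new: written to cpu.max.burst (cpu.cfs_burst_us on v1)
	}
	m, err := fs2.NewManager(&configs.Cgroup{Resources: r}, "/sys/fs/cgroup/mycontainer")
	if err != nil {
		panic(err)
	}
	// Per the ordering logic above, the burst is written before the quota and,
	// if the kernel rejects it with EINVAL, retried after the quota is set.
	if err := m.Set(r); err != nil {
		panic(err)
	}
}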

View File

@ -55,6 +55,9 @@ func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) {
return filepath.Join(root, innerPath), nil
}
// we don't need to use /proc/thread-self here because runc always runs
// with every thread in the same cgroup. This lets us avoid having to do
// runtime.LockOSThread.
ownCgroup, err := parseCgroupFile("/proc/self/cgroup")
if err != nil {
return "", err

View File

@ -1,75 +0,0 @@
package fs2
import (
"fmt"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf"
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
)
func isRWM(perms devices.Permissions) bool {
var r, w, m bool
for _, perm := range perms {
switch perm {
case 'r':
r = true
case 'w':
w = true
case 'm':
m = true
}
}
return r && w && m
}
// This is similar to the logic applied in crun for handling errors from bpf(2)
// <https://github.com/containers/crun/blob/0.17/src/libcrun/cgroup.c#L2438-L2470>.
func canSkipEBPFError(r *configs.Resources) bool {
// If we're running in a user namespace we can ignore eBPF rules because we
// usually cannot use bpf(2); moreover, rootless containers usually don't
// have the necessary privileges to mknod(2) device inodes or access
// host-level instances (though ideally we would be blocking device access
// for rootless containers anyway).
if userns.RunningInUserNS() {
return true
}
// We cannot ignore an eBPF load error if any rule is a block rule or it
// doesn't permit all access modes.
//
// NOTE: This will sometimes trigger in cases where access modes are split
// between different rules but to handle this correctly would require
// using ".../libcontainer/cgroup/devices".Emulator.
for _, dev := range r.Devices {
if !dev.Allow || !isRWM(dev.Permissions) {
return false
}
}
return true
}
func setDevices(dirPath string, r *configs.Resources) error {
if r.SkipDevices {
return nil
}
insts, license, err := devicefilter.DeviceFilter(r.Devices)
if err != nil {
return err
}
dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600)
if err != nil {
return fmt.Errorf("cannot get dir FD for %s", dirPath)
}
defer unix.Close(dirFD)
if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
if !canSkipEBPFError(r) {
return err
}
}
return nil
}

View File

@ -13,7 +13,7 @@ import (
type parseError = fscommon.ParseError
type manager struct {
type Manager struct {
config *configs.Cgroup
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
dirPath string
@ -25,7 +25,7 @@ type manager struct {
// NewManager creates a manager for cgroup v2 unified hierarchy.
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope".
// If dirPath is empty, it is automatically set using config.
func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error) {
func NewManager(config *configs.Cgroup, dirPath string) (*Manager, error) {
if dirPath == "" {
var err error
dirPath, err = defaultDirPath(config)
@ -34,14 +34,14 @@ func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error)
}
}
m := &manager{
m := &Manager{
config: config,
dirPath: dirPath,
}
return m, nil
}
func (m *manager) getControllers() error {
func (m *Manager) getControllers() error {
if m.controllers != nil {
return nil
}
@ -62,7 +62,7 @@ func (m *manager) getControllers() error {
return nil
}
func (m *manager) Apply(pid int) error {
func (m *Manager) Apply(pid int) error {
if err := CreateCgroupPath(m.dirPath, m.config); err != nil {
// Related tests:
// - "runc create (no limits + no cgrouppath + no permission) succeeds"
@ -84,15 +84,15 @@ func (m *manager) Apply(pid int) error {
return nil
}
func (m *manager) GetPids() ([]int, error) {
func (m *Manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.dirPath)
}
func (m *manager) GetAllPids() ([]int, error) {
func (m *Manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.dirPath)
}
func (m *manager) GetStats() (*cgroups.Stats, error) {
func (m *Manager) GetStats() (*cgroups.Stats, error) {
var errs []error
st := cgroups.NewStats()
@ -114,6 +114,17 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// PSI (since kernel 4.20).
var err error
if st.CpuStats.PSI, err = statPSI(m.dirPath, "cpu.pressure"); err != nil {
errs = append(errs, err)
}
if st.MemoryStats.PSI, err = statPSI(m.dirPath, "memory.pressure"); err != nil {
errs = append(errs, err)
}
if st.BlkioStats.PSI, err = statPSI(m.dirPath, "io.pressure"); err != nil {
errs = append(errs, err)
}
// hugetlb (since kernel 5.6)
if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
@ -122,13 +133,17 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// misc (since kernel 5.13)
if err := statMisc(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
if len(errs) > 0 && !m.config.Rootless {
return st, fmt.Errorf("error while statting cgroup v2: %+v", errs)
}
return st, nil
}
func (m *manager) Freeze(state configs.FreezerState) error {
func (m *Manager) Freeze(state configs.FreezerState) error {
if m.config.Resources == nil {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
@ -139,15 +154,15 @@ func (m *manager) Freeze(state configs.FreezerState) error {
return nil
}
func (m *manager) Destroy() error {
func (m *Manager) Destroy() error {
return cgroups.RemovePath(m.dirPath)
}
func (m *manager) Path(_ string) string {
func (m *Manager) Path(_ string) string {
return m.dirPath
}
func (m *manager) Set(r *configs.Resources) error {
func (m *Manager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
@ -175,8 +190,10 @@ func (m *manager) Set(r *configs.Resources) error {
// When rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if err := setDevices(m.dirPath, r); err != nil && !m.config.Rootless {
return err
if err := setDevices(m.dirPath, r); err != nil {
if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) {
return err
}
}
// cpuset (since kernel 5.0)
if err := setCpuset(m.dirPath, r); err != nil {
@ -201,7 +218,17 @@ func (m *manager) Set(r *configs.Resources) error {
return nil
}
func (m *manager) setUnified(res map[string]string) error {
func setDevices(dirPath string, r *configs.Resources) error {
if cgroups.DevicesSetV2 == nil {
if len(r.Devices) > 0 {
return cgroups.ErrDevicesUnsupported
}
return nil
}
return cgroups.DevicesSetV2(dirPath, r)
}
func (m *Manager) setUnified(res map[string]string) error {
for k, v := range res {
if strings.Contains(k, "/") {
return fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
@ -227,21 +254,21 @@ func (m *manager) setUnified(res map[string]string) error {
return nil
}
func (m *manager) GetPaths() map[string]string {
func (m *Manager) GetPaths() map[string]string {
paths := make(map[string]string, 1)
paths[""] = m.dirPath
return paths
}
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
return m.config, nil
}
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
func (m *Manager) GetFreezerState() (configs.FreezerState, error) {
return getFreezer(m.dirPath)
}
func (m *manager) Exists() bool {
func (m *Manager) Exists() bool {
return cgroups.PathExists(m.dirPath)
}
@ -249,7 +276,7 @@ func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.events", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
func (m *Manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.dirPath)
if err != nil && m.config.Rootless && os.IsNotExist(err) {
err = nil
@ -257,3 +284,35 @@ func (m *manager) OOMKillCount() (uint64, error) {
return c, err
}
func CheckMemoryUsage(dirPath string, r *configs.Resources) error {
if !r.MemoryCheckBeforeUpdate {
return nil
}
if r.Memory <= 0 && r.MemorySwap <= 0 {
return nil
}
usage, err := fscommon.GetCgroupParamUint(dirPath, "memory.current")
if err != nil {
// This check is on a best-effort basis, so if we can't read the
// current usage (cgroup not yet created, or any other error),
// we should not fail.
return nil
}
if r.MemorySwap > 0 {
if uint64(r.MemorySwap) <= usage {
return fmt.Errorf("rejecting memory+swap limit %d <= usage %d", r.MemorySwap, usage)
}
}
if r.Memory > 0 {
if uint64(r.Memory) <= usage {
return fmt.Errorf("rejecting memory limit %d <= usage %d", r.Memory, usage)
}
}
return nil
}
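A minimal sketch (not part of this diff) of what the guard above buys callers: with MemoryCheckBeforeUpdate set, shrinking the limit below current usage is refused up front instead of risking OOM kills. The path and sizes are illustrative assumptions.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
	"github.com/opencontainers/runc/libcontainer/configs"
)

func main() {
	r := &configs.Resources{
		Memory:                  256 << 20, // proposed new limit: 256 MiB
		MemoryCheckBeforeUpdate: true,      // opt in to the pre-flight check
	}
	// If memory.current is already at or above 256 MiB, this returns an error
	// of the form "rejecting memory limit ... <= usage ..."; otherwise nil.
	if err := fs2.CheckMemoryUsage("/sys/fs/cgroup/mycontainer", r); err != nil {
		fmt.Println("refusing to shrink limit:", err)
		return
	}
	// ... proceed with Manager.Set(r) as usual.
}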

View File

@ -40,6 +40,11 @@ func setMemory(dirPath string, r *configs.Resources) error {
if !isMemorySet(r) {
return nil
}
if err := CheckMemoryUsage(dirPath, r); err != nil {
return err
}
swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
if err != nil {
return err

View File

@ -0,0 +1,52 @@
package fs2
import (
"bufio"
"os"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
func statMisc(dirPath string, stats *cgroups.Stats) error {
for _, file := range []string{"current", "events"} {
fd, err := cgroups.OpenFile(dirPath, "misc."+file, os.O_RDONLY)
if err != nil {
return err
}
s := bufio.NewScanner(fd)
for s.Scan() {
key, value, err := fscommon.ParseKeyValue(s.Text())
if err != nil {
fd.Close()
return err
}
key = strings.TrimSuffix(key, ".max")
if _, ok := stats.MiscStats[key]; !ok {
stats.MiscStats[key] = cgroups.MiscStats{}
}
tmp := stats.MiscStats[key]
switch file {
case "current":
tmp.Usage = value
case "events":
tmp.Events = value
}
stats.MiscStats[key] = tmp
}
fd.Close()
if err := s.Err(); err != nil {
return err
}
}
return nil
}

View File

@ -0,0 +1,89 @@
package fs2
import (
"bufio"
"errors"
"fmt"
"os"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
func statPSI(dirPath string, file string) (*cgroups.PSIStats, error) {
f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
if err != nil {
if errors.Is(err, os.ErrNotExist) {
// Kernel < 4.20, or CONFIG_PSI is not set,
// or PSI stats are turned off for the cgroup
// ("echo 0 > cgroup.pressure", kernel >= 6.1).
return nil, nil
}
return nil, err
}
defer f.Close()
var psistats cgroups.PSIStats
sc := bufio.NewScanner(f)
for sc.Scan() {
parts := strings.Fields(sc.Text())
var pv *cgroups.PSIData
switch parts[0] {
case "some":
pv = &psistats.Some
case "full":
pv = &psistats.Full
}
if pv != nil {
*pv, err = parsePSIData(parts[1:])
if err != nil {
return nil, &parseError{Path: dirPath, File: file, Err: err}
}
}
}
if err := sc.Err(); err != nil {
if errors.Is(err, unix.ENOTSUP) {
// Some kernels (e.g. CS9) may return ENOTSUP on read
// if the psi=1 kernel cmdline parameter is required but not set.
return nil, nil
}
return nil, &parseError{Path: dirPath, File: file, Err: err}
}
return &psistats, nil
}
func parsePSIData(psi []string) (cgroups.PSIData, error) {
data := cgroups.PSIData{}
for _, f := range psi {
kv := strings.SplitN(f, "=", 2)
if len(kv) != 2 {
return data, fmt.Errorf("invalid psi data: %q", f)
}
var pv *float64
switch kv[0] {
case "avg10":
pv = &data.Avg10
case "avg60":
pv = &data.Avg60
case "avg300":
pv = &data.Avg300
case "total":
v, err := strconv.ParseUint(kv[1], 10, 64)
if err != nil {
return data, fmt.Errorf("invalid %s PSI value: %w", kv[0], err)
}
data.Total = v
}
if pv != nil {
v, err := strconv.ParseFloat(kv[1], 64)
if err != nil {
return data, fmt.Errorf("invalid %s PSI value: %w", kv[0], err)
}
*pv = v
}
}
return data, nil
}
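The PSI numbers parsed above surface through the manager's GetStats; a minimal sketch (not part of this diff) of reading them, with the cgroup path and config as illustrative assumptions.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
	"github.com/opencontainers/runc/libcontainer/configs"
)

func main() {
	m, err := fs2.NewManager(
		&configs.Cgroup{Resources: &configs.Resources{}},
		"/sys/fs/cgroup/system.slice/mycontainer.scope",
	)
	if err != nil {
		panic(err)
	}
	st, err := m.GetStats()
	if err != nil {
		panic(err)
	}
	// The PSI pointers stay nil on kernels without PSI support (< 4.20,
	// CONFIG_PSI=n, or cgroup.pressure set to 0), so check before use.
	if psi := st.CpuStats.PSI; psi != nil {
		fmt.Printf("cpu pressure: some avg10=%.2f full avg10=%.2f total=%d\n",
			psi.Some.Avg10, psi.Full.Avg10, psi.Some.Total)
	}
}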

View File

@ -32,9 +32,22 @@ type CpuUsage struct {
UsageInUsermode uint64 `json:"usage_in_usermode"`
}
type PSIData struct {
Avg10 float64 `json:"avg10"`
Avg60 float64 `json:"avg60"`
Avg300 float64 `json:"avg300"`
Total uint64 `json:"total"`
}
type PSIStats struct {
Some PSIData `json:"some,omitempty"`
Full PSIData `json:"full,omitempty"`
}
type CpuStats struct {
CpuUsage CpuUsage `json:"cpu_usage,omitempty"`
ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type CPUSetStats struct {
@ -91,6 +104,7 @@ type MemoryStats struct {
UseHierarchy bool `json:"use_hierarchy"`
Stats map[string]uint64 `json:"stats,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type PageUsageByNUMA struct {
@ -135,6 +149,7 @@ type BlkioStats struct {
IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"`
SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type HugetlbStats struct {
@ -157,6 +172,13 @@ type RdmaStats struct {
RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"`
}
type MiscStats struct {
// current resource usage for a key in misc
Usage uint64 `json:"usage,omitempty"`
// number of times the resource usage was about to go over the max boundary
Events uint64 `json:"events,omitempty"`
}
type Stats struct {
CpuStats CpuStats `json:"cpu_stats,omitempty"`
CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
@ -166,10 +188,13 @@ type Stats struct {
// the map is in the format "size of hugepage: stats of the hugepage"
HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
RdmaStats RdmaStats `json:"rdma_stats,omitempty"`
// the map is in the format "misc resource name: stats of the key"
MiscStats map[string]MiscStats `json:"misc_stats,omitempty"`
}
func NewStats() *Stats {
memoryStats := MemoryStats{Stats: make(map[string]uint64)}
hugetlbStats := make(map[string]HugetlbStats)
return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats}
miscStats := make(map[string]MiscStats)
return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats, MiscStats: miscStats}
}

View File

@ -36,13 +36,13 @@ func IsCgroup2UnifiedMode() bool {
var st unix.Statfs_t
err := unix.Statfs(unifiedMountpoint, &st)
if err != nil {
level := logrus.WarnLevel
if os.IsNotExist(err) && userns.RunningInUserNS() {
// ignore the "not found" error if running in userns
logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
isUnified = false
return
// For rootless containers, sweep it under the rug.
level = logrus.DebugLevel
}
panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
logrus.StandardLogger().Logf(level,
"statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err)
}
isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
})
@ -217,21 +217,26 @@ func PathExists(path string) bool {
return true
}
func EnterPid(cgroupPaths map[string]string, pid int) error {
for _, path := range cgroupPaths {
if PathExists(path) {
if err := WriteCgroupProc(path, pid); err != nil {
return err
}
}
}
return nil
}
// rmdir tries to remove a directory, optionally retrying on EBUSY.
func rmdir(path string, retry bool) error {
delay := time.Millisecond
tries := 10
func rmdir(path string) error {
again:
err := unix.Rmdir(path)
if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare
switch err { // nolint:errorlint // unix errors are bare
case nil, unix.ENOENT:
return nil
case unix.EINTR:
goto again
case unix.EBUSY:
if retry && tries > 0 {
time.Sleep(delay)
delay *= 2
tries--
goto again
}
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}
}
@ -239,68 +244,42 @@ func rmdir(path string) error {
// RemovePath aims to remove cgroup path. It does so recursively,
// by removing any subdirectories (sub-cgroups) first.
func RemovePath(path string) error {
// try the fast path first
if err := rmdir(path); err == nil {
// Try the fast path first.
if err := rmdir(path, false); err == nil {
return nil
}
infos, err := os.ReadDir(path)
if err != nil {
if os.IsNotExist(err) {
err = nil
}
if err != nil && !os.IsNotExist(err) {
return err
}
for _, info := range infos {
if info.IsDir() {
// We should remove subcgroups dir first
// We should remove subcgroup first.
if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
break
}
}
}
if err == nil {
err = rmdir(path)
err = rmdir(path, true)
}
return err
}
// RemovePaths iterates over the provided paths removing them.
// We try to remove all paths five times with increasing delay between tries.
// If some cgroups are still not removed after all retries, an appropriate
// error is returned.
func RemovePaths(paths map[string]string) (err error) {
const retries = 5
delay := 10 * time.Millisecond
for i := 0; i < retries; i++ {
if i != 0 {
time.Sleep(delay)
delay *= 2
}
for s, p := range paths {
if err := RemovePath(p); err != nil {
// do not log intermediate iterations
switch i {
case 0:
logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
case retries - 1:
logrus.WithError(err).Error("Failed to remove cgroup")
}
}
_, err := os.Stat(p)
// We need this strange way of checking cgroup existence because
// RemoveAll almost always returns an error, even on already removed
// cgroups.
if os.IsNotExist(err) {
delete(paths, s)
}
}
if len(paths) == 0 {
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
paths = make(map[string]string)
return nil
for s, p := range paths {
if err := RemovePath(p); err == nil {
delete(paths, s)
}
}
if len(paths) == 0 {
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
// TODO: switch to clear once Go < 1.21 is not supported.
paths = make(map[string]string)
return nil
}
return fmt.Errorf("Failed to remove paths: %v", paths)
}
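
Illustrative sketch (not part of the vendored code): with the retry loop now
inside rmdir, a caller does a single RemovePaths pass; entries that were
removed successfully are deleted from the map, so anything left over failed.
The cgroup paths below are hypothetical.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups"
)

func main() {
	paths := map[string]string{
		"memory": "/sys/fs/cgroup/memory/mycontainer",
		"pids":   "/sys/fs/cgroup/pids/mycontainer",
	}
	if err := cgroups.RemovePaths(paths); err != nil {
		// Any entries still present in paths could not be removed.
		fmt.Println("failed to remove cgroups:", err)
	}
}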

View File

@ -99,11 +99,12 @@ func tryDefaultPath(cgroupPath, subsystem string) string {
// expensive), so it is assumed that cgroup mounts are not being changed.
func readCgroupMountinfo() ([]*mountinfo.Info, error) {
readMountinfoOnce.Do(func() {
// mountinfo.GetMounts uses /proc/thread-self, so we can use it without
// issues.
cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
mountinfo.FSTypeFilter("cgroup"),
)
})
return cgroupMountinfo, readMountinfoErr
}
@ -196,6 +197,9 @@ func getCgroupMountsV1(all bool) ([]Mount, error) {
return nil, err
}
// We don't need to use /proc/thread-self here because runc always runs
// with every thread in the same cgroup. This lets us avoid having to do
// runtime.LockOSThread.
allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return nil, err
@ -214,6 +218,10 @@ func GetOwnCgroup(subsystem string) (string, error) {
if IsCgroup2UnifiedMode() {
return "", errUnified
}
// We don't need to use /proc/thread-self here because runc always runs
// with every thread in the same cgroup. This lets us avoid having to do
// runtime.LockOSThread.
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return "", err
@ -236,27 +244,6 @@ func GetOwnCgroupPath(subsystem string) (string, error) {
return getCgroupPathHelper(subsystem, cgroup)
}
func GetInitCgroup(subsystem string) (string, error) {
if IsCgroup2UnifiedMode() {
return "", errUnified
}
cgroups, err := ParseCgroupFile("/proc/1/cgroup")
if err != nil {
return "", err
}
return getControllerPath(subsystem, cgroups)
}
func GetInitCgroupPath(subsystem string) (string, error) {
cgroup, err := GetInitCgroup(subsystem)
if err != nil {
return "", err
}
return getCgroupPathHelper(subsystem, cgroup)
}
func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
if err != nil {

View File

@ -2,8 +2,8 @@ package configs
import "fmt"
// blockIODevice holds major:minor format supported in blkio cgroup
type blockIODevice struct {
// BlockIODevice holds major:minor format supported in blkio cgroup.
type BlockIODevice struct {
// Major is the device's major number
Major int64 `json:"major"`
// Minor is the device's minor number
@ -12,7 +12,7 @@ type blockIODevice struct {
// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
type WeightDevice struct {
blockIODevice
BlockIODevice
// Weight is the bandwidth rate for the device, range is from 10 to 1000
Weight uint16 `json:"weight"`
// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
@ -41,7 +41,7 @@ func (wd *WeightDevice) LeafWeightString() string {
// ThrottleDevice struct holds a `major:minor rate_per_second` pair
type ThrottleDevice struct {
blockIODevice
BlockIODevice
// Rate is the IO rate limit per cgroup per device
Rate uint64 `json:"rate"`
}

View File

@ -69,6 +69,9 @@ type Resources struct {
// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
CpuQuota int64 `json:"cpu_quota"`
// CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a given period.
CpuBurst *uint64 `json:"cpu_burst"` //nolint:revive
// CPU period to be used for hardcapping (in usecs). 0 to use system default.
CpuPeriod uint64 `json:"cpu_period"`
@ -84,6 +87,9 @@ type Resources struct {
// MEM to use
CpusetMems string `json:"cpuset_mems"`
// cgroup SCHED_IDLE
CPUIdle *int64 `json:"cpu_idle,omitempty"`
// Process limit; set <= `0' to disable limit.
PidsLimit int64 `json:"pids_limit"`
@ -155,4 +161,9 @@ type Resources struct {
// during Set() to figure out whether the freeze is required. Those
// methods may be relatively slow, thus this flag.
SkipFreezeOnSet bool `json:"-"`
// MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check
// if the new memory limits (Memory and MemorySwap) being set are lower
// than the current memory usage, and reject if so.
MemoryCheckBeforeUpdate bool `json:"memory_check_before_update"`
}
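
A minimal sketch (not from the runc tree) of how the newly added knobs are
expressed on configs.Resources: CpuBurst and CPUIdle are pointers so that
"unset" stays distinguishable from an explicit zero.

package main

import "github.com/opencontainers/runc/libcontainer/configs"

func main() {
	burst := uint64(200000) // extra 200ms of burst budget per period
	idle := int64(1)        // put the cgroup into SCHED_IDLE

	r := &configs.Resources{
		CpuQuota:                100000, // 100ms of CPU time per period
		CpuPeriod:               100000, // 100ms period
		CpuBurst:                &burst,
		CPUIdle:                 &idle,
		MemoryCheckBeforeUpdate: true, // cgroup v2: reject limits below current usage
	}
	_ = r
}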

View File

@ -8,6 +8,7 @@ import (
"time"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runtime-spec/specs-go"
@ -31,12 +32,13 @@ type IDMap struct {
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
type Seccomp struct {
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Syscalls []*Syscall `json:"syscalls"`
DefaultErrnoRet *uint `json:"default_errno_ret"`
ListenerPath string `json:"listener_path,omitempty"`
ListenerMetadata string `json:"listener_metadata,omitempty"`
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Flags []specs.LinuxSeccompFlag `json:"flags"`
Syscalls []*Syscall `json:"syscalls"`
DefaultErrnoRet *uint `json:"default_errno_ret"`
ListenerPath string `json:"listener_path,omitempty"`
ListenerMetadata string `json:"listener_metadata,omitempty"`
}
// Action is taken upon rule match in Seccomp
@ -83,9 +85,6 @@ type Syscall struct {
Args []*Arg `json:"args"`
}
// TODO Windows. Many of these fields should be factored out into those parts
// which are common across platforms, and those which are platform specific.
// Config defines configuration options for executing a process inside a contained environment.
type Config struct {
// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
@ -121,6 +120,9 @@ type Config struct {
// Hostname optionally sets the container's hostname if provided
Hostname string `json:"hostname"`
// Domainname optionally sets the container's domainname if provided
Domainname string `json:"domainname"`
// Namespaces specifies the container's namespaces that it should setup when cloning the init process
// If a namespace is not provided that namespace is shared from the container's parent process
Namespaces Namespaces `json:"namespaces"`
@ -158,11 +160,11 @@ type Config struct {
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj *int `json:"oom_score_adj,omitempty"`
// UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"`
// UIDMappings is an array of User ID mappings for User Namespaces
UIDMappings []IDMap `json:"uid_mappings"`
// GidMappings is an array of Group ID mappings for User Namespaces
GidMappings []IDMap `json:"gid_mappings"`
// GIDMappings is an array of Group ID mappings for User Namespaces
GIDMappings []IDMap `json:"gid_mappings"`
// MaskPaths specifies paths within the container's rootfs to mask over with a bind
// mount pointing to /dev/null as to prevent reads of the file.
@ -211,6 +213,74 @@ type Config struct {
// RootlessCgroups is set when unlikely to have the full access to cgroups.
// When RootlessCgroups is set, cgroups errors are ignored.
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
// TimeOffsets specifies the offset for supporting time namespaces.
TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"`
// Scheduler represents the scheduling attributes for a process.
Scheduler *Scheduler `json:"scheduler,omitempty"`
// Personality contains configuration for the Linux personality syscall.
Personality *LinuxPersonality `json:"personality,omitempty"`
}
// Scheduler is based on the Linux sched_setattr(2) syscall.
type Scheduler = specs.Scheduler
// ToSchedAttr is to convert *configs.Scheduler to *unix.SchedAttr.
func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) {
var policy uint32
switch scheduler.Policy {
case specs.SchedOther:
policy = 0
case specs.SchedFIFO:
policy = 1
case specs.SchedRR:
policy = 2
case specs.SchedBatch:
policy = 3
case specs.SchedISO:
policy = 4
case specs.SchedIdle:
policy = 5
case specs.SchedDeadline:
policy = 6
default:
return nil, fmt.Errorf("invalid scheduler policy: %s", scheduler.Policy)
}
var flags uint64
for _, flag := range scheduler.Flags {
switch flag {
case specs.SchedFlagResetOnFork:
flags |= 0x01
case specs.SchedFlagReclaim:
flags |= 0x02
case specs.SchedFlagDLOverrun:
flags |= 0x04
case specs.SchedFlagKeepPolicy:
flags |= 0x08
case specs.SchedFlagKeepParams:
flags |= 0x10
case specs.SchedFlagUtilClampMin:
flags |= 0x20
case specs.SchedFlagUtilClampMax:
flags |= 0x40
default:
return nil, fmt.Errorf("invalid scheduler flag: %s", flag)
}
}
return &unix.SchedAttr{
Size: unix.SizeofSchedAttr,
Policy: policy,
Flags: flags,
Nice: scheduler.Nice,
Priority: uint32(scheduler.Priority),
Runtime: scheduler.Runtime,
Deadline: scheduler.Deadline,
Period: scheduler.Period,
}, nil
}
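
A sketch (not from the runc tree) of the conversion: the spec-level policy and
flag names become the numeric values expected by sched_setattr(2). Applying
the resulting unix.SchedAttr to a process is assumed to happen elsewhere.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runtime-spec/specs-go"
)

func main() {
	sched := &configs.Scheduler{
		Policy:   specs.SchedFIFO,
		Priority: 42,
	}
	sched.Flags = append(sched.Flags, specs.SchedFlagResetOnFork)

	attr, err := configs.ToSchedAttr(sched)
	if err != nil {
		panic(err)
	}
	// SCHED_FIFO maps to policy 1, SCHED_FLAG_RESET_ON_FORK to bit 0x01.
	fmt.Printf("policy=%d flags=%#x priority=%d\n", attr.Policy, attr.Flags, attr.Priority)
}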
type (
@ -277,6 +347,7 @@ type Capabilities struct {
Ambient []string
}
// Deprecated: use (Hooks).Run instead.
func (hooks HookList) RunHooks(state *specs.State) error {
for i, h := range hooks {
if err := h.Run(state); err != nil {
@ -333,6 +404,18 @@ func (hooks *Hooks) MarshalJSON() ([]byte, error) {
})
}
// Run executes all hooks for the given hook name.
func (hooks Hooks) Run(name HookName, state *specs.State) error {
list := hooks[name]
for i, h := range list {
if err := h.Run(state); err != nil {
return fmt.Errorf("error running %s hook #%d: %w", name, i, err)
}
}
return nil
}
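
A sketch of registering and running a hook. It assumes configs.NewFunctionHook
and the CreateRuntime hook name, which exist in libcontainer but are not part
of this diff.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runtime-spec/specs-go"
)

func main() {
	hooks := configs.Hooks{
		configs.CreateRuntime: configs.HookList{
			configs.NewFunctionHook(func(s *specs.State) error {
				fmt.Println("container", s.ID, "is", s.Status)
				return nil
			}),
		},
	}

	state := &specs.State{ID: "demo", Status: specs.StateCreating}
	if err := hooks.Run(configs.CreateRuntime, state); err != nil {
		panic(err)
	}
}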
type Hook interface {
// Run executes the hook with the provided state.
Run(*specs.State) error
@ -393,7 +476,7 @@ func (c Command) Run(s *specs.State) error {
go func() {
err := cmd.Wait()
if err != nil {
err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
err = fmt.Errorf("%w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
}
errC <- err
}()

View File

@ -7,22 +7,33 @@ import (
)
var (
errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.")
errNoUserMap = errors.New("User namespaces enabled, but no user mapping found.")
errNoGIDMap = errors.New("User namespaces enabled, but no gid mappings found.")
errNoGroupMap = errors.New("User namespaces enabled, but no group mapping found.")
errNoUIDMap = errors.New("user namespaces enabled, but no uid mappings found")
errNoGIDMap = errors.New("user namespaces enabled, but no gid mappings found")
)
// Please check https://man7.org/linux/man-pages/man2/personality.2.html for const details.
// https://raw.githubusercontent.com/torvalds/linux/master/include/uapi/linux/personality.h
const (
PerLinux = 0x0000
PerLinux32 = 0x0008
)
type LinuxPersonality struct {
// Domain for the personality
// can only contain values "LINUX" and "LINUX32"
Domain int `json:"domain"`
}
// HostUID gets the translated uid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostUID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.UidMappings == nil {
if len(c.UIDMappings) == 0 {
return -1, errNoUIDMap
}
id, found := c.hostIDFromMapping(int64(containerId), c.UidMappings)
id, found := c.hostIDFromMapping(int64(containerId), c.UIDMappings)
if !found {
return -1, errNoUserMap
return -1, fmt.Errorf("user namespaces enabled, but no mapping found for uid %d", containerId)
}
// If we are a 32-bit binary running on a 64-bit system, it's possible
// the mapped user is too large to store in an int, which means we
@ -47,12 +58,12 @@ func (c Config) HostRootUID() (int, error) {
// different when user namespaces are enabled.
func (c Config) HostGID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.GidMappings == nil {
if len(c.GIDMappings) == 0 {
return -1, errNoGIDMap
}
id, found := c.hostIDFromMapping(int64(containerId), c.GidMappings)
id, found := c.hostIDFromMapping(int64(containerId), c.GIDMappings)
if !found {
return -1, errNoGroupMap
return -1, fmt.Errorf("user namespaces enabled, but no mapping found for gid %d", containerId)
}
// If we are a 32-bit binary running on a 64-bit system, it's possible
// the mapped user is too large to store in an int, which means we

View File

@ -1,48 +1,7 @@
package configs
import "golang.org/x/sys/unix"
const (
// EXT_COPYUP is a directive to copy up the contents of a directory when
// a tmpfs is mounted over it.
EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning
EXT_COPYUP = 1 << iota //nolint:golint,revive // ignore "don't use ALL_CAPS" warning
)
type Mount struct {
// Source path for the mount.
Source string `json:"source"`
// Destination path for the mount inside the container.
Destination string `json:"destination"`
// Device the mount is for.
Device string `json:"device"`
// Mount flags.
Flags int `json:"flags"`
// Propagation Flags
PropagationFlags []int `json:"propagation_flags"`
// Mount data applied to the mount.
Data string `json:"data"`
// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"`
// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
RecAttr *unix.MountAttr `json:"rec_attr"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
// Optional Command to be run before Source is mounted.
PremountCmds []Command `json:"premount_cmds"`
// Optional Command to be run after Source is mounted.
PostmountCmds []Command `json:"postmount_cmds"`
}
func (m *Mount) IsBind() bool {
return m.Flags&unix.MS_BIND != 0
}

View File

@ -0,0 +1,66 @@
package configs
import "golang.org/x/sys/unix"
type MountIDMapping struct {
// Recursive indicates if the mapping needs to be recursive.
Recursive bool `json:"recursive"`
// UserNSPath is a path to a user namespace that indicates the necessary
// id-mappings for MOUNT_ATTR_IDMAP. If set to non-"", UIDMappings and
// GIDMappings must be set to nil.
UserNSPath string `json:"userns_path,omitempty"`
// UIDMappings is the uid mapping set for this mount, to be used with
// MOUNT_ATTR_IDMAP.
UIDMappings []IDMap `json:"uid_mappings,omitempty"`
// GIDMappings is the gid mapping set for this mount, to be used with
// MOUNT_ATTR_IDMAP.
GIDMappings []IDMap `json:"gid_mappings,omitempty"`
}
type Mount struct {
// Source path for the mount.
Source string `json:"source"`
// Destination path for the mount inside the container.
Destination string `json:"destination"`
// Device the mount is for.
Device string `json:"device"`
// Mount flags.
Flags int `json:"flags"`
// Mount flags that were explicitly cleared in the configuration (meaning
// the user explicitly requested that these flags *not* be set).
ClearedFlags int `json:"cleared_flags"`
// Propagation Flags
PropagationFlags []int `json:"propagation_flags"`
// Mount data applied to the mount.
Data string `json:"data"`
// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"`
// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
RecAttr *unix.MountAttr `json:"rec_attr"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
// Mapping is the MOUNT_ATTR_IDMAP configuration for the mount. If non-nil,
// the mount is configured to use MOUNT_ATTR_IDMAP-style id mappings.
IDMapping *MountIDMapping `json:"id_mapping,omitempty"`
}
func (m *Mount) IsBind() bool {
return m.Flags&unix.MS_BIND != 0
}
func (m *Mount) IsIDMapped() bool {
return m.IDMapping != nil
}
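
For illustration (not from the runc tree): describing an id-mapped bind mount
with the new IDMapping field; the host path and the 100000-based mapping are
hypothetical.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/configs"
	"golang.org/x/sys/unix"
)

func main() {
	m := &configs.Mount{
		Source:      "/var/lib/appdata",
		Destination: "/data",
		Device:      "bind",
		Flags:       unix.MS_BIND,
		IDMapping: &configs.MountIDMapping{
			UIDMappings: []configs.IDMap{{ContainerID: 0, HostID: 100000, Size: 65536}},
			GIDMappings: []configs.IDMap{{ContainerID: 0, HostID: 100000, Size: 65536}},
		},
	}
	fmt.Println("bind:", m.IsBind(), "id-mapped:", m.IsIDMapped())
}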

View File

@ -0,0 +1,10 @@
//go:build !linux
// +build !linux
package configs
type Mount struct{}
func (m *Mount) IsBind() bool {
return false
}

View File

@ -14,6 +14,7 @@ const (
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
NEWCGROUP NamespaceType = "NEWCGROUP"
NEWTIME NamespaceType = "NEWTIME"
)
var (
@ -38,6 +39,8 @@ func NsName(ns NamespaceType) string {
return "uts"
case NEWCGROUP:
return "cgroup"
case NEWTIME:
return "time"
}
return ""
}
@ -56,6 +59,9 @@ func IsNamespaceSupported(ns NamespaceType) bool {
if nsFile == "" {
return false
}
// We don't need to use /proc/thread-self here because the list of
// namespace types is unrelated to the thread. This lets us avoid having to
// do runtime.LockOSThread.
_, err := os.Stat("/proc/self/ns/" + nsFile)
// a namespace is supported if it exists and we have permissions to read it
supported = err == nil
@ -72,6 +78,7 @@ func NamespaceTypes() []NamespaceType {
NEWPID,
NEWNS,
NEWCGROUP,
NEWTIME,
}
}

View File

@ -17,6 +17,7 @@ var namespaceInfo = map[NamespaceType]int{
NEWUTS: unix.CLONE_NEWUTS,
NEWPID: unix.CLONE_NEWPID,
NEWCGROUP: unix.CLONE_NEWCGROUP,
NEWTIME: unix.CLONE_NEWTIME,
}
// CloneFlags parses the container's Namespaces options to set the correct
@ -31,3 +32,15 @@ func (n *Namespaces) CloneFlags() uintptr {
}
return uintptr(flag)
}
// IsPrivate tells whether the namespace of type t is configured as private
// (i.e. it exists and is not shared).
func (n Namespaces) IsPrivate(t NamespaceType) bool {
for _, v := range n {
if v.Type == t {
return v.Path == ""
}
}
// Not found, so implicitly sharing a parent namespace.
return false
}
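
A short sketch (not from the runc tree): a namespace entry with an empty Path
is private (it will be created), while one with a Path joins an existing
namespace. NEWNET is assumed to be defined alongside the other namespace types.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/configs"
)

func main() {
	ns := configs.Namespaces{
		{Type: configs.NEWUSER},                           // to be created
		{Type: configs.NEWNET, Path: "/proc/1234/ns/net"}, // joined from a path
	}

	fmt.Printf("clone flags: %#x\n", ns.CloneFlags())
	fmt.Println("user ns private:", ns.IsPrivate(configs.NEWUSER)) // true
	fmt.Println("net ns private:", ns.IsPrivate(configs.NEWNET))   // false
	fmt.Println("time ns private:", ns.IsPrivate(configs.NEWTIME)) // false: not configured
}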

View File

@ -0,0 +1,81 @@
package user
import (
"io"
"github.com/moby/sys/user"
)
// LookupUser looks up a user by their username in /etc/passwd. If the user
// cannot be found (or there is no /etc/passwd file on the filesystem), then
// LookupUser returns an error.
func LookupUser(username string) (user.User, error) {
return user.LookupUser(username)
}
// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
// be found (or there is no /etc/passwd file on the filesystem), then LookupId
// returns an error.
func LookupUid(uid int) (user.User, error) { //nolint:revive // ignore var-naming: func LookupUid should be LookupUID
return user.LookupUid(uid)
}
// LookupGroup looks up a group by its name in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGroup
// returns an error.
func LookupGroup(groupname string) (user.Group, error) {
return user.LookupGroup(groupname)
}
// LookupGid looks up a group by its group id in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGid
// returns an error.
func LookupGid(gid int) (user.Group, error) {
return user.LookupGid(gid)
}
func GetPasswdPath() (string, error) {
return user.GetPasswdPath()
}
func GetPasswd() (io.ReadCloser, error) {
return user.GetPasswd()
}
func GetGroupPath() (string, error) {
return user.GetGroupPath()
}
func GetGroup() (io.ReadCloser, error) {
return user.GetGroup()
}
// CurrentUser looks up the current user by their user id in /etc/passwd. If the
// user cannot be found (or there is no /etc/passwd file on the filesystem),
// then CurrentUser returns an error.
func CurrentUser() (user.User, error) {
return user.CurrentUser()
}
// CurrentGroup looks up the current user's group by their primary group id's
// entry in /etc/passwd. If the group cannot be found (or there is no
// /etc/group file on the filesystem), then CurrentGroup returns an error.
func CurrentGroup() (user.Group, error) {
return user.CurrentGroup()
}
func CurrentUserSubUIDs() ([]user.SubID, error) {
return user.CurrentUserSubUIDs()
}
func CurrentUserSubGIDs() ([]user.SubID, error) {
return user.CurrentUserSubGIDs()
}
func CurrentProcessUIDMap() ([]user.IDMap, error) {
return user.CurrentProcessUIDMap()
}
func CurrentProcessGIDMap() ([]user.IDMap, error) {
return user.CurrentProcessGIDMap()
}

View File

@ -1,157 +0,0 @@
//go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
// +build darwin dragonfly freebsd linux netbsd openbsd solaris
package user
import (
"io"
"os"
"strconv"
"golang.org/x/sys/unix"
)
// Unix-specific path to the passwd and group formatted files.
const (
unixPasswdPath = "/etc/passwd"
unixGroupPath = "/etc/group"
)
// LookupUser looks up a user by their username in /etc/passwd. If the user
// cannot be found (or there is no /etc/passwd file on the filesystem), then
// LookupUser returns an error.
func LookupUser(username string) (User, error) {
return lookupUserFunc(func(u User) bool {
return u.Name == username
})
}
// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
// be found (or there is no /etc/passwd file on the filesystem), then LookupId
// returns an error.
func LookupUid(uid int) (User, error) {
return lookupUserFunc(func(u User) bool {
return u.Uid == uid
})
}
func lookupUserFunc(filter func(u User) bool) (User, error) {
// Get operating system-specific passwd reader-closer.
passwd, err := GetPasswd()
if err != nil {
return User{}, err
}
defer passwd.Close()
// Get the users.
users, err := ParsePasswdFilter(passwd, filter)
if err != nil {
return User{}, err
}
// No user entries found.
if len(users) == 0 {
return User{}, ErrNoPasswdEntries
}
// Assume the first entry is the "correct" one.
return users[0], nil
}
// LookupGroup looks up a group by its name in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGroup
// returns an error.
func LookupGroup(groupname string) (Group, error) {
return lookupGroupFunc(func(g Group) bool {
return g.Name == groupname
})
}
// LookupGid looks up a group by its group id in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGid
// returns an error.
func LookupGid(gid int) (Group, error) {
return lookupGroupFunc(func(g Group) bool {
return g.Gid == gid
})
}
func lookupGroupFunc(filter func(g Group) bool) (Group, error) {
// Get operating system-specific group reader-closer.
group, err := GetGroup()
if err != nil {
return Group{}, err
}
defer group.Close()
// Get the users.
groups, err := ParseGroupFilter(group, filter)
if err != nil {
return Group{}, err
}
// No user entries found.
if len(groups) == 0 {
return Group{}, ErrNoGroupEntries
}
// Assume the first entry is the "correct" one.
return groups[0], nil
}
func GetPasswdPath() (string, error) {
return unixPasswdPath, nil
}
func GetPasswd() (io.ReadCloser, error) {
return os.Open(unixPasswdPath)
}
func GetGroupPath() (string, error) {
return unixGroupPath, nil
}
func GetGroup() (io.ReadCloser, error) {
return os.Open(unixGroupPath)
}
// CurrentUser looks up the current user by their user id in /etc/passwd. If the
// user cannot be found (or there is no /etc/passwd file on the filesystem),
// then CurrentUser returns an error.
func CurrentUser() (User, error) {
return LookupUid(unix.Getuid())
}
// CurrentGroup looks up the current user's group by their primary group id's
// entry in /etc/passwd. If the group cannot be found (or there is no
// /etc/group file on the filesystem), then CurrentGroup returns an error.
func CurrentGroup() (Group, error) {
return LookupGid(unix.Getgid())
}
func currentUserSubIDs(fileName string) ([]SubID, error) {
u, err := CurrentUser()
if err != nil {
return nil, err
}
filter := func(entry SubID) bool {
return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid)
}
return ParseSubIDFileFilter(fileName, filter)
}
func CurrentUserSubUIDs() ([]SubID, error) {
return currentUserSubIDs("/etc/subuid")
}
func CurrentUserSubGIDs() ([]SubID, error) {
return currentUserSubIDs("/etc/subgid")
}
func CurrentProcessUIDMap() ([]IDMap, error) {
return ParseIDMapFile("/proc/self/uid_map")
}
func CurrentProcessGIDMap() ([]IDMap, error) {
return ParseIDMapFile("/proc/self/gid_map")
}

View File

@ -1,605 +0,0 @@
package user
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"os"
"strconv"
"strings"
)
const (
minID = 0
maxID = 1<<31 - 1 // for 32-bit systems compatibility
)
var (
// ErrNoPasswdEntries is returned if no matching entries were found in /etc/passwd.
ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
// ErrNoGroupEntries is returned if no matching entries were found in /etc/group.
ErrNoGroupEntries = errors.New("no matching entries in group file")
// ErrRange is returned if a UID or GID is outside of the valid range.
ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minID, maxID)
)
type User struct {
Name string
Pass string
Uid int
Gid int
Gecos string
Home string
Shell string
}
type Group struct {
Name string
Pass string
Gid int
List []string
}
// SubID represents an entry in /etc/sub{u,g}id
type SubID struct {
Name string
SubID int64
Count int64
}
// IDMap represents an entry in /proc/PID/{u,g}id_map
type IDMap struct {
ID int64
ParentID int64
Count int64
}
func parseLine(line []byte, v ...interface{}) {
parseParts(bytes.Split(line, []byte(":")), v...)
}
func parseParts(parts [][]byte, v ...interface{}) {
if len(parts) == 0 {
return
}
for i, p := range parts {
// Ignore cases where we don't have enough fields to populate the arguments.
// Some configuration files like to misbehave.
if len(v) <= i {
break
}
// Use the type of the argument to figure out how to parse it, scanf() style.
// This is legit.
switch e := v[i].(type) {
case *string:
*e = string(p)
case *int:
// "numbers", with conversion errors ignored because of some misbehaving configuration files.
*e, _ = strconv.Atoi(string(p))
case *int64:
*e, _ = strconv.ParseInt(string(p), 10, 64)
case *[]string:
// Comma-separated lists.
if len(p) != 0 {
*e = strings.Split(string(p), ",")
} else {
*e = []string{}
}
default:
// Someone goof'd when writing code using this function. Scream so they can hear us.
panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e))
}
}
}
func ParsePasswdFile(path string) ([]User, error) {
passwd, err := os.Open(path)
if err != nil {
return nil, err
}
defer passwd.Close()
return ParsePasswd(passwd)
}
func ParsePasswd(passwd io.Reader) ([]User, error) {
return ParsePasswdFilter(passwd, nil)
}
func ParsePasswdFileFilter(path string, filter func(User) bool) ([]User, error) {
passwd, err := os.Open(path)
if err != nil {
return nil, err
}
defer passwd.Close()
return ParsePasswdFilter(passwd, filter)
}
func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
if r == nil {
return nil, errors.New("nil source for passwd-formatted data")
}
var (
s = bufio.NewScanner(r)
out = []User{}
)
for s.Scan() {
line := bytes.TrimSpace(s.Bytes())
if len(line) == 0 {
continue
}
// see: man 5 passwd
// name:password:UID:GID:GECOS:directory:shell
// Name:Pass:Uid:Gid:Gecos:Home:Shell
// root:x:0:0:root:/root:/bin/bash
// adm:x:3:4:adm:/var/adm:/bin/false
p := User{}
parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell)
if filter == nil || filter(p) {
out = append(out, p)
}
}
if err := s.Err(); err != nil {
return nil, err
}
return out, nil
}
func ParseGroupFile(path string) ([]Group, error) {
group, err := os.Open(path)
if err != nil {
return nil, err
}
defer group.Close()
return ParseGroup(group)
}
func ParseGroup(group io.Reader) ([]Group, error) {
return ParseGroupFilter(group, nil)
}
func ParseGroupFileFilter(path string, filter func(Group) bool) ([]Group, error) {
group, err := os.Open(path)
if err != nil {
return nil, err
}
defer group.Close()
return ParseGroupFilter(group, filter)
}
func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
if r == nil {
return nil, errors.New("nil source for group-formatted data")
}
rd := bufio.NewReader(r)
out := []Group{}
// Read the file line-by-line.
for {
var (
isPrefix bool
wholeLine []byte
err error
)
// Read the next line. We do so in chunks (as much as reader's
// buffer is able to keep), check if we read enough columns
// already on each step and store final result in wholeLine.
for {
var line []byte
line, isPrefix, err = rd.ReadLine()
if err != nil {
// We should return no error if EOF is reached
// without a match.
if err == io.EOF {
err = nil
}
return out, err
}
// Simple common case: line is short enough to fit in a
// single reader's buffer.
if !isPrefix && len(wholeLine) == 0 {
wholeLine = line
break
}
wholeLine = append(wholeLine, line...)
// Check if we read the whole line already.
if !isPrefix {
break
}
}
// There's no spec for /etc/passwd or /etc/group, but we try to follow
// the same rules as the glibc parser, which allows comments and blank
// space at the beginning of a line.
wholeLine = bytes.TrimSpace(wholeLine)
if len(wholeLine) == 0 || wholeLine[0] == '#' {
continue
}
// see: man 5 group
// group_name:password:GID:user_list
// Name:Pass:Gid:List
// root:x:0:root
// adm:x:4:root,adm,daemon
p := Group{}
parseLine(wholeLine, &p.Name, &p.Pass, &p.Gid, &p.List)
if filter == nil || filter(p) {
out = append(out, p)
}
}
}
type ExecUser struct {
Uid int
Gid int
Sgids []int
Home string
}
// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the
// given file paths and uses that data as the arguments to GetExecUser. If the
// files cannot be opened for any reason, the error is ignored and a nil
// io.Reader is passed instead.
func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) {
var passwd, group io.Reader
if passwdFile, err := os.Open(passwdPath); err == nil {
passwd = passwdFile
defer passwdFile.Close()
}
if groupFile, err := os.Open(groupPath); err == nil {
group = groupFile
defer groupFile.Close()
}
return GetExecUser(userSpec, defaults, passwd, group)
}
// GetExecUser parses a user specification string (using the passwd and group
// readers as sources for /etc/passwd and /etc/group data, respectively). In
// the case of blank fields or missing data from the sources, the values in
// defaults are used.
//
// GetExecUser will return an error if a user or group literal could not be
// found in any entry in passwd and group respectively.
//
// Examples of valid user specifications are:
// - ""
// - "user"
// - "uid"
// - "user:group"
// - "uid:gid"
// - "user:gid"
// - "uid:group"
//
// It should be noted that if you specify a numeric user or group id, they will
// not be evaluated as usernames (only the metadata will be filled). So attempting
// to parse a user with user.Name = "1337" will produce the user with a UID of
// 1337.
func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) {
if defaults == nil {
defaults = new(ExecUser)
}
// Copy over defaults.
user := &ExecUser{
Uid: defaults.Uid,
Gid: defaults.Gid,
Sgids: defaults.Sgids,
Home: defaults.Home,
}
// Sgids slice *cannot* be nil.
if user.Sgids == nil {
user.Sgids = []int{}
}
// Allow for userArg to have either "user" syntax, or optionally "user:group" syntax
var userArg, groupArg string
parseLine([]byte(userSpec), &userArg, &groupArg)
// Convert userArg and groupArg to be numeric, so we don't have to execute
// Atoi *twice* for each iteration over lines.
uidArg, uidErr := strconv.Atoi(userArg)
gidArg, gidErr := strconv.Atoi(groupArg)
// Find the matching user.
users, err := ParsePasswdFilter(passwd, func(u User) bool {
if userArg == "" {
// Default to current state of the user.
return u.Uid == user.Uid
}
if uidErr == nil {
// If the userArg is numeric, always treat it as a UID.
return uidArg == u.Uid
}
return u.Name == userArg
})
// If we can't find the user, we have to bail.
if err != nil && passwd != nil {
if userArg == "" {
userArg = strconv.Itoa(user.Uid)
}
return nil, fmt.Errorf("unable to find user %s: %w", userArg, err)
}
var matchedUserName string
if len(users) > 0 {
// First match wins, even if there's more than one matching entry.
matchedUserName = users[0].Name
user.Uid = users[0].Uid
user.Gid = users[0].Gid
user.Home = users[0].Home
} else if userArg != "" {
// If we can't find a user with the given username, the only other valid
// option is if it's a numeric username with no associated entry in passwd.
if uidErr != nil {
// Not numeric.
return nil, fmt.Errorf("unable to find user %s: %w", userArg, ErrNoPasswdEntries)
}
user.Uid = uidArg
// Must be inside valid uid range.
if user.Uid < minID || user.Uid > maxID {
return nil, ErrRange
}
// Okay, so it's numeric. We can just roll with this.
}
// On to the groups. If we matched a username, we need to do this because of
// the supplementary group IDs.
if groupArg != "" || matchedUserName != "" {
groups, err := ParseGroupFilter(group, func(g Group) bool {
// If the group argument isn't explicit, we'll just search for it.
if groupArg == "" {
// Check if user is a member of this group.
for _, u := range g.List {
if u == matchedUserName {
return true
}
}
return false
}
if gidErr == nil {
// If the groupArg is numeric, always treat it as a GID.
return gidArg == g.Gid
}
return g.Name == groupArg
})
if err != nil && group != nil {
return nil, fmt.Errorf("unable to find groups for spec %v: %w", matchedUserName, err)
}
// Only start modifying user.Gid if it is in explicit form.
if groupArg != "" {
if len(groups) > 0 {
// First match wins, even if there's more than one matching entry.
user.Gid = groups[0].Gid
} else {
// If we can't find a group with the given name, the only other valid
// option is if it's a numeric group name with no associated entry in group.
if gidErr != nil {
// Not numeric.
return nil, fmt.Errorf("unable to find group %s: %w", groupArg, ErrNoGroupEntries)
}
user.Gid = gidArg
// Must be inside valid gid range.
if user.Gid < minID || user.Gid > maxID {
return nil, ErrRange
}
// Okay, so it's numeric. We can just roll with this.
}
} else if len(groups) > 0 {
// Supplementary group ids only make sense if in the implicit form.
user.Sgids = make([]int, len(groups))
for i, group := range groups {
user.Sgids[i] = group.Gid
}
}
}
return user, nil
}
// GetAdditionalGroups looks up a list of groups by name or group id
// against the given /etc/group formatted data. If a group name cannot
// be found, an error will be returned. If a group id cannot be found,
// or the given group data is nil, the id will be returned as-is
// provided it is in the legal range.
func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) {
groups := []Group{}
if group != nil {
var err error
groups, err = ParseGroupFilter(group, func(g Group) bool {
for _, ag := range additionalGroups {
if g.Name == ag || strconv.Itoa(g.Gid) == ag {
return true
}
}
return false
})
if err != nil {
return nil, fmt.Errorf("Unable to find additional groups %v: %w", additionalGroups, err)
}
}
gidMap := make(map[int]struct{})
for _, ag := range additionalGroups {
var found bool
for _, g := range groups {
// if we found a matched group either by name or gid, take the
// first matched as correct
if g.Name == ag || strconv.Itoa(g.Gid) == ag {
if _, ok := gidMap[g.Gid]; !ok {
gidMap[g.Gid] = struct{}{}
found = true
break
}
}
}
// we asked for a group but didn't find it. let's check to see
// if we wanted a numeric group
if !found {
gid, err := strconv.ParseInt(ag, 10, 64)
if err != nil {
// Not a numeric ID either.
return nil, fmt.Errorf("Unable to find group %s: %w", ag, ErrNoGroupEntries)
}
// Ensure gid is inside gid range.
if gid < minID || gid > maxID {
return nil, ErrRange
}
gidMap[int(gid)] = struct{}{}
}
}
gids := []int{}
for gid := range gidMap {
gids = append(gids, gid)
}
return gids, nil
}
// GetAdditionalGroupsPath is a wrapper around GetAdditionalGroups
// that opens the groupPath given and gives it as an argument to
// GetAdditionalGroups.
func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) {
var group io.Reader
if groupFile, err := os.Open(groupPath); err == nil {
group = groupFile
defer groupFile.Close()
}
return GetAdditionalGroups(additionalGroups, group)
}
func ParseSubIDFile(path string) ([]SubID, error) {
subid, err := os.Open(path)
if err != nil {
return nil, err
}
defer subid.Close()
return ParseSubID(subid)
}
func ParseSubID(subid io.Reader) ([]SubID, error) {
return ParseSubIDFilter(subid, nil)
}
func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) {
subid, err := os.Open(path)
if err != nil {
return nil, err
}
defer subid.Close()
return ParseSubIDFilter(subid, filter)
}
func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) {
if r == nil {
return nil, errors.New("nil source for subid-formatted data")
}
var (
s = bufio.NewScanner(r)
out = []SubID{}
)
for s.Scan() {
line := bytes.TrimSpace(s.Bytes())
if len(line) == 0 {
continue
}
// see: man 5 subuid
p := SubID{}
parseLine(line, &p.Name, &p.SubID, &p.Count)
if filter == nil || filter(p) {
out = append(out, p)
}
}
if err := s.Err(); err != nil {
return nil, err
}
return out, nil
}
func ParseIDMapFile(path string) ([]IDMap, error) {
r, err := os.Open(path)
if err != nil {
return nil, err
}
defer r.Close()
return ParseIDMap(r)
}
func ParseIDMap(r io.Reader) ([]IDMap, error) {
return ParseIDMapFilter(r, nil)
}
func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) {
r, err := os.Open(path)
if err != nil {
return nil, err
}
defer r.Close()
return ParseIDMapFilter(r, filter)
}
func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) {
if r == nil {
return nil, errors.New("nil source for idmap-formatted data")
}
var (
s = bufio.NewScanner(r)
out = []IDMap{}
)
for s.Scan() {
line := bytes.TrimSpace(s.Bytes())
if len(line) == 0 {
continue
}
// see: man 7 user_namespaces
p := IDMap{}
parseParts(bytes.Fields(line), &p.ID, &p.ParentID, &p.Count)
if filter == nil || filter(p) {
out = append(out, p)
}
}
if err := s.Err(); err != nil {
return nil, err
}
return out, nil
}

View File

@ -0,0 +1,146 @@
// Package user is an alias for [github.com/moby/sys/user].
//
// Deprecated: use [github.com/moby/sys/user].
package user
import (
"io"
"github.com/moby/sys/user"
)
var (
// ErrNoPasswdEntries is returned if no matching entries were found in /etc/passwd.
ErrNoPasswdEntries = user.ErrNoPasswdEntries
// ErrNoGroupEntries is returned if no matching entries were found in /etc/group.
ErrNoGroupEntries = user.ErrNoGroupEntries
// ErrRange is returned if a UID or GID is outside of the valid range.
ErrRange = user.ErrRange
)
type (
User = user.User
Group = user.Group
// SubID represents an entry in /etc/sub{u,g}id.
SubID = user.SubID
// IDMap represents an entry in /proc/PID/{u,g}id_map.
IDMap = user.IDMap
ExecUser = user.ExecUser
)
func ParsePasswdFile(path string) ([]user.User, error) {
return user.ParsePasswdFile(path)
}
func ParsePasswd(passwd io.Reader) ([]user.User, error) {
return user.ParsePasswd(passwd)
}
func ParsePasswdFileFilter(path string, filter func(user.User) bool) ([]user.User, error) {
return user.ParsePasswdFileFilter(path, filter)
}
func ParsePasswdFilter(r io.Reader, filter func(user.User) bool) ([]user.User, error) {
return user.ParsePasswdFilter(r, filter)
}
func ParseGroupFile(path string) ([]user.Group, error) {
return user.ParseGroupFile(path)
}
func ParseGroup(group io.Reader) ([]user.Group, error) {
return user.ParseGroup(group)
}
func ParseGroupFileFilter(path string, filter func(user.Group) bool) ([]user.Group, error) {
return user.ParseGroupFileFilter(path, filter)
}
func ParseGroupFilter(r io.Reader, filter func(user.Group) bool) ([]user.Group, error) {
return user.ParseGroupFilter(r, filter)
}
// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the
// given file paths and uses that data as the arguments to GetExecUser. If the
// files cannot be opened for any reason, the error is ignored and a nil
// io.Reader is passed instead.
func GetExecUserPath(userSpec string, defaults *user.ExecUser, passwdPath, groupPath string) (*user.ExecUser, error) {
return user.GetExecUserPath(userSpec, defaults, passwdPath, groupPath)
}
// GetExecUser parses a user specification string (using the passwd and group
// readers as sources for /etc/passwd and /etc/group data, respectively). In
// the case of blank fields or missing data from the sources, the values in
// defaults are used.
//
// GetExecUser will return an error if a user or group literal could not be
// found in any entry in passwd and group respectively.
//
// Examples of valid user specifications are:
// - ""
// - "user"
// - "uid"
// - "user:group"
// - "uid:gid"
// - "user:gid"
// - "uid:group"
//
// It should be noted that if you specify a numeric user or group id, they will
// not be evaluated as usernames (only the metadata will be filled). So attempting
// to parse a user with user.Name = "1337" will produce the user with a UID of
// 1337.
func GetExecUser(userSpec string, defaults *user.ExecUser, passwd, group io.Reader) (*user.ExecUser, error) {
return user.GetExecUser(userSpec, defaults, passwd, group)
}
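
A sketch (not from the runc tree) of resolving a "user:group" spec against
passwd/group data, matching the examples listed in the comment above.

package main

import (
	"fmt"
	"strings"

	"github.com/opencontainers/runc/libcontainer/user"
)

func main() {
	passwd := strings.NewReader("root:x:0:0:root:/root:/bin/bash\nwww:x:33:33:www:/var/www:/bin/false\n")
	group := strings.NewReader("root:x:0:root\nwww:x:33:www\n")

	execUser, err := user.GetExecUser("www:root", nil, passwd, group)
	if err != nil {
		panic(err)
	}
	// The user part resolves via passwd data, the group part via group data.
	fmt.Printf("uid=%d gid=%d home=%s\n", execUser.Uid, execUser.Gid, execUser.Home) // uid=33 gid=0 home=/var/www
}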
// GetAdditionalGroups looks up a list of groups by name or group id
// against the given /etc/group formatted data. If a group name cannot
// be found, an error will be returned. If a group id cannot be found,
// or the given group data is nil, the id will be returned as-is
// provided it is in the legal range.
func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) {
return user.GetAdditionalGroups(additionalGroups, group)
}
// GetAdditionalGroupsPath is a wrapper around GetAdditionalGroups
// that opens the groupPath given and gives it as an argument to
// GetAdditionalGroups.
func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) {
return user.GetAdditionalGroupsPath(additionalGroups, groupPath)
}
func ParseSubIDFile(path string) ([]user.SubID, error) {
return user.ParseSubIDFile(path)
}
func ParseSubID(subid io.Reader) ([]user.SubID, error) {
return user.ParseSubID(subid)
}
func ParseSubIDFileFilter(path string, filter func(user.SubID) bool) ([]user.SubID, error) {
return user.ParseSubIDFileFilter(path, filter)
}
func ParseSubIDFilter(r io.Reader, filter func(user.SubID) bool) ([]user.SubID, error) {
return user.ParseSubIDFilter(r, filter)
}
func ParseIDMapFile(path string) ([]user.IDMap, error) {
return user.ParseIDMapFile(path)
}
func ParseIDMap(r io.Reader) ([]user.IDMap, error) {
return user.ParseIDMap(r)
}
func ParseIDMapFileFilter(path string, filter func(user.IDMap) bool) ([]user.IDMap, error) {
return user.ParseIDMapFileFilter(path, filter)
}
func ParseIDMapFilter(r io.Reader, filter func(user.IDMap) bool) ([]user.IDMap, error) {
return user.ParseIDMapFilter(r, filter)
}

View File

@ -1,43 +0,0 @@
//go:build gofuzz
// +build gofuzz
package user
import (
"io"
"strings"
)
func IsDivisbleBy(n int, divisibleby int) bool {
return (n % divisibleby) == 0
}
func FuzzUser(data []byte) int {
if len(data) == 0 {
return -1
}
if !IsDivisbleBy(len(data), 5) {
return -1
}
var divided [][]byte
chunkSize := len(data) / 5
for i := 0; i < len(data); i += chunkSize {
end := i + chunkSize
divided = append(divided, data[i:end])
}
_, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil)
var passwd, group io.Reader
group = strings.NewReader(string(divided[1]))
_, _ = GetAdditionalGroups([]string{string(divided[2])}, group)
passwd = strings.NewReader(string(divided[3]))
_, _ = GetExecUser(string(divided[4]), nil, passwd, group)
return 1
}

View File

@ -1,5 +1,4 @@
package userns
// RunningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
var RunningInUserNS = runningInUserNS

View File

@ -3,14 +3,7 @@
package userns
import (
"strings"
"github.com/opencontainers/runc/libcontainer/user"
)
func FuzzUIDMap(data []byte) int {
uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
_ = uidMapInUserNS(uidmap)
func FuzzUIDMap(uidmap []byte) int {
_ = uidMapInUserNS(string(uidmap))
return 1
}

View File

@ -1,9 +1,10 @@
package userns
import (
"bufio"
"fmt"
"os"
"sync"
"github.com/opencontainers/runc/libcontainer/user"
)
var (
@ -12,26 +13,43 @@ var (
)
// runningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
//
// Originally copied from https://github.com/lxc/incus/blob/e45085dd42f826b3c8c3228e9733c0b6f998eafe/shared/util.go#L678-L700.
func runningInUserNS() bool {
nsOnce.Do(func() {
uidmap, err := user.CurrentProcessUIDMap()
file, err := os.Open("/proc/self/uid_map")
if err != nil {
// This kernel-provided file only exists if user namespaces are supported
// This kernel-provided file only exists if user namespaces are supported.
return
}
inUserNS = uidMapInUserNS(uidmap)
defer file.Close()
buf := bufio.NewReader(file)
l, _, err := buf.ReadLine()
if err != nil {
return
}
inUserNS = uidMapInUserNS(string(l))
})
return inUserNS
}
func uidMapInUserNS(uidmap []user.IDMap) bool {
/*
* We assume we are in the initial user namespace if we have a full
* range - 4294967295 uids starting at uid 0.
*/
if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
func uidMapInUserNS(uidMap string) bool {
if uidMap == "" {
// File exist but empty (the initial state when userns is created,
// see user_namespaces(7)).
return true
}
var a, b, c int64
if _, err := fmt.Sscanf(uidMap, "%d %d %d", &a, &b, &c); err != nil {
// Assume we are in a regular, non user namespace.
return false
}
return true
// As per user_namespaces(7), /proc/self/uid_map of
// the initial user namespace shows 0 0 4294967295.
initNS := a == 0 && b == 0 && c == 4294967295
return !initNS
}
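
A standalone sketch (not from the runc tree) of the detection rule used above:
the initial user namespace maps the full uid range as "0 0 4294967295", and
anything else, including an empty uid_map, means we are inside a user
namespace.

package main

import "fmt"

func inUserNS(uidMap string) bool {
	if uidMap == "" {
		// Empty uid_map: a freshly created userns, see user_namespaces(7).
		return true
	}
	var a, b, c int64
	if _, err := fmt.Sscanf(uidMap, "%d %d %d", &a, &b, &c); err != nil {
		return false
	}
	return !(a == 0 && b == 0 && c == 4294967295)
}

func main() {
	fmt.Println(inUserNS("         0          0 4294967295")) // false: initial namespace
	fmt.Println(inUserNS("         0     100000      65536")) // true: remapped range
	fmt.Println(inUserNS(""))                                 // true: empty map
}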

View File

@ -3,8 +3,6 @@
package userns
import "github.com/opencontainers/runc/libcontainer/user"
// runningInUserNS is a stub for non-Linux systems
// Always returns false
func runningInUserNS() bool {
@ -13,6 +11,6 @@ func runningInUserNS() bool {
// uidMapInUserNS is a stub for non-Linux systems
// Always returns false
func uidMapInUserNS(uidmap []user.IDMap) bool {
func uidMapInUserNS(uidMap string) bool {
return false
}

View File

@ -0,0 +1,156 @@
package userns
import (
"fmt"
"os"
"sort"
"strings"
"sync"
"syscall"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/configs"
)
type Mapping struct {
UIDMappings []configs.IDMap
GIDMappings []configs.IDMap
}
func (m Mapping) toSys() (uids, gids []syscall.SysProcIDMap) {
for _, uid := range m.UIDMappings {
uids = append(uids, syscall.SysProcIDMap{
ContainerID: int(uid.ContainerID),
HostID: int(uid.HostID),
Size: int(uid.Size),
})
}
for _, gid := range m.GIDMappings {
gids = append(gids, syscall.SysProcIDMap{
ContainerID: int(gid.ContainerID),
HostID: int(gid.HostID),
Size: int(gid.Size),
})
}
return
}
// id returns a unique identifier for this mapping, agnostic of the order of
// the uid and gid mappings (because the order doesn't matter to the kernel).
// The set of userns handles is indexed using this ID.
func (m Mapping) id() string {
var uids, gids []string
for _, idmap := range m.UIDMappings {
uids = append(uids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size))
}
for _, idmap := range m.GIDMappings {
gids = append(gids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size))
}
// We don't care about the sort order -- just sort them.
sort.Strings(uids)
sort.Strings(gids)
return "uid=" + strings.Join(uids, ",") + ";gid=" + strings.Join(gids, ",")
}
type Handles struct {
m sync.Mutex
maps map[string]*os.File
}
// Release all resources associated with this Handle. All existing files
// returned from Get() will continue to work even after calling Release(). The
// same Handles can be re-used after calling Release().
func (hs *Handles) Release() {
hs.m.Lock()
defer hs.m.Unlock()
// Close the files for good measure, though GC will do that for us anyway.
for _, file := range hs.maps {
_ = file.Close()
}
hs.maps = nil
}
func spawnProc(req Mapping) (*os.Process, error) {
// We need to spawn a subprocess with the requested mappings, which is
// unfortunately quite expensive. The "safe" way of doing this is natively
// with Go (and then spawning something like "sleep infinity"), but
// execve() is a waste of cycles because we just need some process to have
// the right mapping, we don't care what it's executing. The "unsafe"
// option of doing a clone() behind the back of Go is probably okay in
// theory as long as we just do kill(getpid(), SIGSTOP). However, if we
// tell Go to put the new process into PTRACE_TRACEME mode, we can avoid
// the exec and not have to faff around with the mappings.
//
// Note that Go's stdlib does not support newuidmap, but in the case of
// id-mapped mounts, it seems incredibly unlikely that the user will be
// requesting us to do a remapping as an unprivileged user with mappings
// they have privileges over.
logrus.Debugf("spawning dummy process for id-mapping %s", req.id())
uidMappings, gidMappings := req.toSys()
// We don't need to use /proc/thread-self here because the exe mm of a
// thread-group is guaranteed to be the same for all threads by definition.
// This lets us avoid having to do runtime.LockOSThread.
return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, &os.ProcAttr{
Sys: &syscall.SysProcAttr{
Cloneflags: unix.CLONE_NEWUSER,
UidMappings: uidMappings,
GidMappings: gidMappings,
GidMappingsEnableSetgroups: false,
// Put the process into PTRACE_TRACEME mode to allow us to get the
// userns without having a proper execve() target.
Ptrace: true,
},
})
}
func dupFile(f *os.File) (*os.File, error) {
newFd, err := unix.FcntlInt(f.Fd(), unix.F_DUPFD_CLOEXEC, 0)
if err != nil {
return nil, os.NewSyscallError("fcntl(F_DUPFD_CLOEXEC)", err)
}
return os.NewFile(uintptr(newFd), f.Name()), nil
}
// Get returns a handle to a /proc/$pid/ns/user nsfs file with the requested
// mapping. The processes spawned to produce userns nsfds are cached, so if
// equivalent user namespace mappings are requested, the same user namespace
// will be returned. The caller is responsible for closing the returned file
// descriptor.
func (hs *Handles) Get(req Mapping) (file *os.File, err error) {
hs.m.Lock()
defer hs.m.Unlock()
if hs.maps == nil {
hs.maps = make(map[string]*os.File)
}
file, ok := hs.maps[req.id()]
if !ok {
proc, err := spawnProc(req)
if err != nil {
return nil, fmt.Errorf("failed to spawn dummy process for map %s: %w", req.id(), err)
}
// Make sure we kill the helper process. We ignore errors because there's
// not much we can do about them anyway; ultimately the stashed userns
// handle below is what we need, not the helper process itself.
defer func() {
_ = proc.Kill()
_, _ = proc.Wait()
}()
// Stash away a handle to the userns file. This is neater than keeping
// the process alive, because Go's GC can handle files much better than
// leaked processes, and having long-living useless processes seems
// less than ideal.
file, err = os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid))
if err != nil {
return nil, err
}
hs.maps[req.id()] = file
}
// Duplicate the file, to make sure the lifecycle of each *os.File we
// return is independent.
return dupFile(file)
}
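
A sketch (not from the runc tree) of obtaining a cached userns handle for a
mapping, e.g. to pass to MOUNT_ATTR_IDMAP later. Spawning the helper needs
privileges over the requested mapping, so this is illustrative only.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/userns"
)

func main() {
	var hs userns.Handles
	defer hs.Release()

	usernsFile, err := hs.Get(userns.Mapping{
		UIDMappings: []configs.IDMap{{ContainerID: 0, HostID: 100000, Size: 65536}},
		GIDMappings: []configs.IDMap{{ContainerID: 0, HostID: 100000, Size: 65536}},
	})
	if err != nil {
		panic(err)
	}
	defer usernsFile.Close()
	// The returned fd refers to /proc/<pid>/ns/user of the (already reaped)
	// helper; a second Get with the same mapping reuses the cached namespace.
	fmt.Println("userns handle:", usernsFile.Name())
}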

View File

@ -19,13 +19,14 @@ package utils
import (
"fmt"
"os"
"runtime"
"golang.org/x/sys/unix"
)
// MaxSendfdLen is the maximum length of the name of a file descriptor being
// sent using SendFd. The name of the file handle returned by RecvFd will never
// be larger than this value.
// MaxNameLen is the maximum length of the name of a file descriptor being sent
// using SendFile. The name of the file handle returned by RecvFile will never be
// larger than this value.
const MaxNameLen = 4096
// oobSpace is the size of the oob slice required to store a single FD. Note
@ -33,26 +34,21 @@ const MaxNameLen = 4096
// so sizeof(fd) = 4.
var oobSpace = unix.CmsgSpace(4)
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
// RecvFile waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFd(socket *os.File) (*os.File, error) {
// For some reason, unix.Recvmsg uses the length rather than the capacity
// when passing the msg_controllen and other attributes to recvmsg. So we
// have to actually set the length.
func RecvFile(socket *os.File) (_ *os.File, Err error) {
name := make([]byte, MaxNameLen)
oob := make([]byte, oobSpace)
sockfd := socket.Fd()
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
if err != nil {
return nil, err
}
if n >= MaxNameLen || oobn != oobSpace {
return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
}
// Truncate.
name = name[:n]
oob = oob[:oobn]
@ -61,36 +57,63 @@ func RecvFd(socket *os.File) (*os.File, error) {
if err != nil {
return nil, err
}
// We cannot control how many SCM_RIGHTS we receive, and upon receiving
// them all of the descriptors are installed in our fd table, so we need to
// parse all of the SCM_RIGHTS we received in order to close all of the
// descriptors on error.
var fds []int
defer func() {
for i, fd := range fds {
if i == 0 && Err == nil {
// Only close the first one on error.
continue
}
// Always close extra ones.
_ = unix.Close(fd)
}
}()
var lastErr error
for _, scm := range scms {
if scm.Header.Type == unix.SCM_RIGHTS {
scmFds, err := unix.ParseUnixRights(&scm)
if err != nil {
lastErr = err
} else {
fds = append(fds, scmFds...)
}
}
}
if lastErr != nil {
return nil, lastErr
}
// We do this after collecting the fds to make sure we close them all when
// returning an error here.
if len(scms) != 1 {
return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
}
scm := scms[0]
fds, err := unix.ParseUnixRights(&scm)
if err != nil {
return nil, err
}
if len(fds) != 1 {
return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
}
fd := uintptr(fds[0])
return os.NewFile(fd, string(name)), nil
return os.NewFile(uintptr(fds[0]), string(name)), nil
}
// SendFd sends a file descriptor over the given AF_UNIX socket. In
// addition, the file.Name() of the given file will also be sent as
// non-auxiliary data in the same payload (allowing to send contextual
// information for a file descriptor).
func SendFd(socket *os.File, name string, fd uintptr) error {
// SendFile sends a file over the given AF_UNIX socket. file.Name() is also
// included so that if the other end uses RecvFile, the file will have the same
// name information.
func SendFile(socket *os.File, file *os.File) error {
name := file.Name()
if len(name) >= MaxNameLen {
return fmt.Errorf("sendfd: filename too long: %s", name)
}
return SendFds(socket, []byte(name), int(fd))
err := SendRawFd(socket, name, file.Fd())
runtime.KeepAlive(file)
return err
}
// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket.
func SendFds(socket *os.File, msg []byte, fds ...int) error {
oob := unix.UnixRights(fds...)
return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0)
// SendRawFd sends a specific file descriptor over the given AF_UNIX socket.
func SendRawFd(socket *os.File, msg string, fd uintptr) error {
oob := unix.UnixRights(int(fd))
return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0)
}
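
A sketch (not from the runc tree) of the renamed helpers: SendFile transmits
an fd plus the file's name over one end of a socketpair, and RecvFile
reconstructs an *os.File carrying that name on the other end. The file opened
here is arbitrary.

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/utils"
)

func main() {
	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
	if err != nil {
		panic(err)
	}
	parent := os.NewFile(uintptr(fds[0]), "parent")
	child := os.NewFile(uintptr(fds[1]), "child")
	defer parent.Close()
	defer child.Close()

	f, err := os.Open("/etc/hostname")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	if err := utils.SendFile(parent, f); err != nil {
		panic(err)
	}
	received, err := utils.RecvFile(child)
	if err != nil {
		panic(err)
	}
	defer received.Close()
	fmt.Println("received fd named:", received.Name()) // /etc/hostname
}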

View File

@ -3,15 +3,12 @@ package utils
import (
"encoding/binary"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"strconv"
"strings"
"unsafe"
securejoin "github.com/cyphar/filepath-securejoin"
"golang.org/x/sys/unix"
)
@ -43,6 +40,9 @@ func ExitStatus(status unix.WaitStatus) int {
}
// WriteJSON writes the provided struct v to w using standard json marshaling
// without a trailing newline. This is used instead of json.Encoder because
// there might be a problem in the json decoder in some cases, see:
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
func WriteJSON(w io.Writer, v interface{}) error {
data, err := json.Marshal(v)
if err != nil {
@ -99,52 +99,16 @@ func stripRoot(root, path string) string {
return CleanPath("/" + path)
}
// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
// corresponding to the unsafePath resolved within the root. Before passing the
// fd, this path is verified to have been inside the root -- so operating on it
// through the passed fdpath should be safe. Do not access this path through
// the original path strings, and do not attempt to use the pathname outside of
// the passed closure (the file handle will be freed once the closure returns).
func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
// Remove the root then forcefully resolve inside the root.
unsafePath = stripRoot(root, unsafePath)
path, err := securejoin.SecureJoin(root, unsafePath)
if err != nil {
return fmt.Errorf("resolving path inside rootfs failed: %w", err)
}
// Open the target path.
fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
if err != nil {
return fmt.Errorf("open o_path procfd: %w", err)
}
defer fh.Close()
// Double-check the path is the one we expected.
procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
if realpath, err := os.Readlink(procfd); err != nil {
return fmt.Errorf("procfd verification failed: %w", err)
} else if realpath != path {
return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
}
// Run the closure.
return fn(procfd)
}
// SearchLabels searches a list of key-value pairs for the provided key and
// returns the corresponding value. The pairs must be separated with '='.
func SearchLabels(labels []string, query string) string {
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == query {
return parts[1]
// SearchLabels searches through a list of key=value pairs for a given key,
// returning its value and a boolean flag telling whether the key exists.
func SearchLabels(labels []string, key string) (string, bool) {
key += "="
for _, s := range labels {
if strings.HasPrefix(s, key) {
return s[len(key):], true
}
}
return ""
return "", false
}
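
A short usage sketch of the new two-value form (the label strings are made up for illustration); the added boolean lets callers tell a missing key apart from a key whose value is empty:

package main

import (
    "fmt"

    "github.com/opencontainers/runc/libcontainer/utils"
)

func main() {
    labels := []string{"bundle=/run/containers/web", "owner="}

    if v, ok := utils.SearchLabels(labels, "bundle"); ok {
        fmt.Println("bundle:", v) // bundle: /run/containers/web
    }
    if v, ok := utils.SearchLabels(labels, "owner"); ok {
        fmt.Printf("owner is set, value %q\n", v) // value ""
    }
    if _, ok := utils.SearchLabels(labels, "missing"); !ok {
        fmt.Println("missing is not present")
    }
}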
// Annotations returns the bundle path and user defined annotations from the

View File

@ -5,10 +5,16 @@ package utils
import (
"fmt"
"math"
"os"
"path/filepath"
"runtime"
"strconv"
"sync"
_ "unsafe" // for go:linkname
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@ -24,12 +30,39 @@ func EnsureProcHandle(fh *os.File) error {
return nil
}
var (
haveCloseRangeCloexecBool bool
haveCloseRangeCloexecOnce sync.Once
)
func haveCloseRangeCloexec() bool {
haveCloseRangeCloexecOnce.Do(func() {
// Make sure we're not closing a random file descriptor.
tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
if err != nil {
return
}
defer unix.Close(tmpFd)
err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
// other potential error would imply that even the most basic close
// operation wouldn't work.
haveCloseRangeCloexecBool = err == nil
})
return haveCloseRangeCloexecBool
}
type fdFunc func(fd int)
// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
// the current process.
func fdRangeFrom(minFd int, fn fdFunc) error {
fdDir, err := os.Open("/proc/self/fd")
procSelfFd, closer := ProcThreadSelf("fd")
defer closer()
fdDir, err := os.Open(procSelfFd)
if err != nil {
return err
}
@ -67,6 +100,12 @@ func fdRangeFrom(minFd int, fn fdFunc) error {
// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
// equal to minFd in the current process.
func CloseExecFrom(minFd int) error {
// Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
if haveCloseRangeCloexec() {
err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
return os.NewSyscallError("close_range", err)
}
// Otherwise, fall back to the standard loop.
return fdRangeFrom(minFd, unix.CloseOnExec)
}
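
As a hedged usage sketch (the fd cutoff and exec target are illustrative): a caller that is about to exec an external binary can flag every inherited descriptor above stderr as close-on-exec in one call:

package main

import (
    "os"

    "github.com/opencontainers/runc/libcontainer/utils"
    "golang.org/x/sys/unix"
)

func main() {
    // On new enough kernels this is a single close_range(2) call with
    // CLOSE_RANGE_CLOEXEC; otherwise it falls back to walking the fd dir.
    if err := utils.CloseExecFrom(3); err != nil {
        panic(err)
    }
    // Only stdin, stdout and stderr survive the exec below.
    if err := unix.Exec("/bin/true", []string{"true"}, os.Environ()); err != nil {
        panic(err)
    }
}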
@ -89,7 +128,8 @@ func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
// *os.File operations would apply to the wrong file). This function is only
// intended to be called from the last stage of runc init.
func UnsafeCloseFrom(minFd int) error {
// We must not close some file descriptors.
// We cannot use close_range(2) even if it is available, because we must
// not close some file descriptors.
return fdRangeFrom(minFd, func(fd int) {
if runtime_IsPollDescriptor(uintptr(fd)) {
// These are the Go runtimes internal netpoll file descriptors.
@ -107,11 +147,117 @@ func UnsafeCloseFrom(minFd int) error {
})
}
// NewSockPair returns a new unix socket pair
func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
// NewSockPair returns a new SOCK_STREAM unix socket pair.
func NewSockPair(name string) (parent, child *os.File, err error) {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err
}
return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
}
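
A sketch of the typical parent/child pattern NewSockPair is meant for; the re-exec command and environment variable name here are hypothetical, not runc's actual bootstrap protocol:

package example

import (
    "os"
    "os/exec"

    "github.com/opencontainers/runc/libcontainer/utils"
)

// startWithSocket re-execs the current binary and hands it the child end of
// a socket pair via ExtraFiles, returning the parent end to the caller.
func startWithSocket() (*os.File, error) {
    parent, child, err := utils.NewSockPair("init")
    if err != nil {
        return nil, err
    }
    cmd := exec.Command("/proc/self/exe", "init")
    // The child end is inherited as fd 3+i, where i is its index in ExtraFiles.
    cmd.ExtraFiles = append(cmd.ExtraFiles, child)
    cmd.Env = append(os.Environ(), "_EXAMPLE_SOCKPAIR_FD=3") // hypothetical name
    if err := cmd.Start(); err != nil {
        parent.Close()
        child.Close()
        return nil, err
    }
    // Our copy of the child end is no longer needed once the child holds it.
    child.Close()
    return parent, nil
}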
// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
// corresponding to the unsafePath resolved within the root. Before passing the
// fd, this path is verified to have been inside the root -- so operating on it
// through the passed fdpath should be safe. Do not access this path through
// the original path strings, and do not attempt to use the pathname outside of
// the passed closure (the file handle will be freed once the closure returns).
func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
// Remove the root then forcefully resolve inside the root.
unsafePath = stripRoot(root, unsafePath)
path, err := securejoin.SecureJoin(root, unsafePath)
if err != nil {
return fmt.Errorf("resolving path inside rootfs failed: %w", err)
}
procSelfFd, closer := ProcThreadSelf("fd/")
defer closer()
// Open the target path.
fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
if err != nil {
return fmt.Errorf("open o_path procfd: %w", err)
}
defer fh.Close()
procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
// Double-check the path is the one we expected.
if realpath, err := os.Readlink(procfd); err != nil {
return fmt.Errorf("procfd verification failed: %w", err)
} else if realpath != path {
return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
}
return fn(procfd)
}
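
For illustration, a hedged sketch of using WithProcfd to bind-mount onto a path inside a container rootfs without following symlinks out of it (the rootfs path is hypothetical, and the target is assumed to already exist inside it):

package main

import (
    "github.com/opencontainers/runc/libcontainer/utils"
    "golang.org/x/sys/unix"
)

func main() {
    rootfs := "/run/containers/web/rootfs" // hypothetical rootfs

    // The target is resolved with SecureJoin and verified to still be inside
    // rootfs; the closure only ever sees the verified procfd path.
    err := utils.WithProcfd(rootfs, "/etc/resolv.conf", func(procfd string) error {
        return unix.Mount("/etc/resolv.conf", procfd, "", unix.MS_BIND, "")
    })
    if err != nil {
        panic(err)
    }
}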
type ProcThreadSelfCloser func()
var (
haveProcThreadSelf bool
haveProcThreadSelfOnce sync.Once
)
// ProcThreadSelf returns a string that is equivalent to
// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
// meaning that the passed string needs to be trusted. The caller _must_ call
// the returned ProcThreadSelfCloser function (which is runtime.UnlockOSThread)
// *only once* after it has finished using the returned path string.
func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
haveProcThreadSelfOnce.Do(func() {
if _, err := os.Stat("/proc/thread-self/"); err == nil {
haveProcThreadSelf = true
} else {
logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
}
})
// We need to lock our thread until the caller is done with the path string
// because any non-atomic operation on the path (such as opening a file,
// then reading it) could be interrupted by the Go runtime where the
// underlying thread is swapped out and the original thread is killed,
// resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
// addition, the pre-3.17 fallback makes everything non-atomic because the
// same thing could happen between unix.Gettid() and the path operations.
//
// In theory, we don't need to lock in the atomic user case when using
// /proc/thread-self/, but it's better to be safe than sorry (and there are
// only one or two truly atomic users of /proc/thread-self/).
runtime.LockOSThread()
threadSelf := "/proc/thread-self/"
if !haveProcThreadSelf {
// Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
if _, err := os.Stat(threadSelf); err != nil {
// Unfortunately, this code is called from rootfs_linux.go where we
// are running inside the pid namespace of the container but /proc
// is the host's procfs, and there is no real way to get
// the correct tid to use here (the kernel age means we cannot do
// things like set up a private fsopen("proc") -- even scanning
// NSpid in all of the tasks in /proc/self/task/*/status requires
// Linux 4.1).
//
// So, we just have to assume that /proc/self is acceptable in this
// one specific case.
if os.Getpid() == 1 {
logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
} else {
// This should never happen, but the fallback should work in most cases...
logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
}
threadSelf = "/proc/self/"
}
}
return threadSelf + subpath, runtime.UnlockOSThread
}
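
A short usage sketch; the contract is simply that the returned closer is called exactly once, after the caller is done with the path:

package main

import (
    "fmt"
    "os"

    "github.com/opencontainers/runc/libcontainer/utils"
)

func main() {
    // The calling goroutine stays locked to its OS thread until closer() is
    // called, so the path keeps referring to the same task throughout.
    statusPath, closer := utils.ProcThreadSelf("status")
    defer closer()

    data, err := os.ReadFile(statusPath)
    if err != nil {
        panic(err)
    }
    fmt.Printf("%s", data)
}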
// ProcThreadSelfFd is a small wrapper around ProcThreadSelf to make it easier
// to create a /proc/thread-self handle for a given file descriptor.
//
// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
// without using fmt.Sprintf to avoid unneeded overhead.
func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
}
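
And a sketch of the fd variant, which is convenient for re-opening an already-open descriptor through the /proc/thread-self/fd/<n> magic link (the file path and flags are illustrative):

package main

import (
    "fmt"
    "os"
    "runtime"

    "github.com/opencontainers/runc/libcontainer/utils"
)

func main() {
    f, err := os.Open("/etc/hostname")
    if err != nil {
        panic(err)
    }
    defer f.Close()

    fdPath, closer := utils.ProcThreadSelfFd(f.Fd())
    defer closer()

    // Re-open the same inode via procfs; keep f alive until the fd number
    // has actually been used.
    reopened, err := os.OpenFile(fdPath, os.O_RDONLY, 0)
    runtime.KeepAlive(f)
    if err != nil {
        panic(err)
    }
    defer reopened.Close()
    fmt.Println(reopened.Name())
}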