Merge pull request #2082 from rhatdan/runc

Update vendor of runc
This commit is contained in:
OpenShift Merge Robot
2019-01-06 17:27:50 -08:00
committed by GitHub
42 changed files with 9250 additions and 101 deletions

View File

@@ -0,0 +1,97 @@
// +build ignore
// Simple tool to create an archive stream from an old and new directory
//
// By default it will stream the comparison of two temporary directories with junk files
package main
import (
"flag"
"fmt"
"io"
"io/ioutil"
"os"
"path"
"github.com/docker/docker/pkg/archive"
"github.com/sirupsen/logrus"
)
var (
flDebug = flag.Bool("D", false, "debugging output")
flNewDir = flag.String("newdir", "", "")
flOldDir = flag.String("olddir", "", "")
log = logrus.New()
)
func main() {
flag.Usage = func() {
fmt.Println("Produce a tar from comparing two directory paths. By default a demo tar is created of around 200 files (including hardlinks)")
fmt.Printf("%s [OPTIONS]\n", os.Args[0])
flag.PrintDefaults()
}
flag.Parse()
log.Out = os.Stderr
if (len(os.Getenv("DEBUG")) > 0) || *flDebug {
logrus.SetLevel(logrus.DebugLevel)
}
var newDir, oldDir string
if len(*flNewDir) == 0 {
var err error
newDir, err = ioutil.TempDir("", "docker-test-newDir")
if err != nil {
log.Fatal(err)
}
defer os.RemoveAll(newDir)
if _, err := prepareUntarSourceDirectory(100, newDir, true); err != nil {
log.Fatal(err)
}
} else {
newDir = *flNewDir
}
if len(*flOldDir) == 0 {
oldDir, err := ioutil.TempDir("", "docker-test-oldDir")
if err != nil {
log.Fatal(err)
}
defer os.RemoveAll(oldDir)
} else {
oldDir = *flOldDir
}
changes, err := archive.ChangesDirs(newDir, oldDir)
if err != nil {
log.Fatal(err)
}
a, err := archive.ExportChanges(newDir, changes)
if err != nil {
log.Fatal(err)
}
defer a.Close()
i, err := io.Copy(os.Stdout, a)
if err != nil && err != io.EOF {
log.Fatal(err)
}
fmt.Fprintf(os.Stderr, "wrote archive of %d bytes", i)
}
func prepareUntarSourceDirectory(numberOfFiles int, targetPath string, makeLinks bool) (int, error) {
fileData := []byte("fooo")
for n := 0; n < numberOfFiles; n++ {
fileName := fmt.Sprintf("file-%d", n)
if err := ioutil.WriteFile(path.Join(targetPath, fileName), fileData, 0700); err != nil {
return 0, err
}
if makeLinks {
if err := os.Link(path.Join(targetPath, fileName), path.Join(targetPath, fileName+"-link")); err != nil {
return 0, err
}
}
}
totalSize := numberOfFiles * len(fileData)
return totalSize, nil
}

View File

@@ -1 +0,0 @@
SysInfo stores information about which features a kernel supports.

View File

@@ -1,12 +0,0 @@
// +build !linux,!windows
package sysinfo
import (
"runtime"
)
// NumCPU returns the number of CPUs
func NumCPU() int {
return runtime.NumCPU()
}

View File

@@ -1,44 +0,0 @@
// +build linux
package sysinfo
import (
"runtime"
"unsafe"
"golang.org/x/sys/unix"
)
// numCPU queries the system for the count of threads available
// for use to this process.
//
// Issues two syscalls.
// Returns 0 on errors. Use |runtime.NumCPU| in that case.
func numCPU() int {
// Gets the affinity mask for a process: The very one invoking this function.
pid, _, _ := unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0)
var mask [1024 / 64]uintptr
_, _, err := unix.RawSyscall(unix.SYS_SCHED_GETAFFINITY, pid, uintptr(len(mask)*8), uintptr(unsafe.Pointer(&mask[0])))
if err != 0 {
return 0
}
// For every available thread a bit is set in the mask.
ncpu := 0
for _, e := range mask {
if e == 0 {
continue
}
ncpu += int(popcnt(uint64(e)))
}
return ncpu
}
// NumCPU returns the number of CPUs which are currently online
func NumCPU() int {
if ncpu := numCPU(); ncpu > 0 {
return ncpu
}
return runtime.NumCPU()
}

View File

@@ -1,37 +0,0 @@
// +build windows
package sysinfo
import (
"runtime"
"unsafe"
"golang.org/x/sys/windows"
)
var (
kernel32 = windows.NewLazySystemDLL("kernel32.dll")
getCurrentProcess = kernel32.NewProc("GetCurrentProcess")
getProcessAffinityMask = kernel32.NewProc("GetProcessAffinityMask")
)
func numCPU() int {
// Gets the affinity mask for a process
var mask, sysmask uintptr
currentProcess, _, _ := getCurrentProcess.Call()
ret, _, _ := getProcessAffinityMask.Call(currentProcess, uintptr(unsafe.Pointer(&mask)), uintptr(unsafe.Pointer(&sysmask)))
if ret == 0 {
return 0
}
// For every available thread a bit is set in the mask.
ncpu := int(popcnt(uint64(mask)))
return ncpu
}
// NumCPU returns the number of CPUs which are currently online
func NumCPU() int {
if ncpu := numCPU(); ncpu > 0 {
return ncpu
}
return runtime.NumCPU()
}

View File

@@ -1,144 +0,0 @@
package sysinfo
import "github.com/docker/docker/pkg/parsers"
// SysInfo stores information about which features a kernel supports.
// TODO Windows: Factor out platform specific capabilities.
type SysInfo struct {
// Whether the kernel supports AppArmor or not
AppArmor bool
// Whether the kernel supports Seccomp or not
Seccomp bool
cgroupMemInfo
cgroupCPUInfo
cgroupBlkioInfo
cgroupCpusetInfo
cgroupPids
// Whether IPv4 forwarding is supported or not, if this was disabled, networking will not work
IPv4ForwardingDisabled bool
// Whether bridge-nf-call-iptables is supported or not
BridgeNFCallIPTablesDisabled bool
// Whether bridge-nf-call-ip6tables is supported or not
BridgeNFCallIP6TablesDisabled bool
// Whether the cgroup has the mountpoint of "devices" or not
CgroupDevicesEnabled bool
}
type cgroupMemInfo struct {
// Whether memory limit is supported or not
MemoryLimit bool
// Whether swap limit is supported or not
SwapLimit bool
// Whether soft limit is supported or not
MemoryReservation bool
// Whether OOM killer disable is supported or not
OomKillDisable bool
// Whether memory swappiness is supported or not
MemorySwappiness bool
// Whether kernel memory limit is supported or not
KernelMemory bool
}
type cgroupCPUInfo struct {
// Whether CPU shares is supported or not
CPUShares bool
// Whether CPU CFS(Completely Fair Scheduler) period is supported or not
CPUCfsPeriod bool
// Whether CPU CFS(Completely Fair Scheduler) quota is supported or not
CPUCfsQuota bool
// Whether CPU real-time period is supported or not
CPURealtimePeriod bool
// Whether CPU real-time runtime is supported or not
CPURealtimeRuntime bool
}
type cgroupBlkioInfo struct {
// Whether Block IO weight is supported or not
BlkioWeight bool
// Whether Block IO weight_device is supported or not
BlkioWeightDevice bool
// Whether Block IO read limit in bytes per second is supported or not
BlkioReadBpsDevice bool
// Whether Block IO write limit in bytes per second is supported or not
BlkioWriteBpsDevice bool
// Whether Block IO read limit in IO per second is supported or not
BlkioReadIOpsDevice bool
// Whether Block IO write limit in IO per second is supported or not
BlkioWriteIOpsDevice bool
}
type cgroupCpusetInfo struct {
// Whether Cpuset is supported or not
Cpuset bool
// Available Cpuset's cpus
Cpus string
// Available Cpuset's memory nodes
Mems string
}
type cgroupPids struct {
// Whether Pids Limit is supported or not
PidsLimit bool
}
// IsCpusetCpusAvailable returns `true` if the provided string set is contained
// in cgroup's cpuset.cpus set, `false` otherwise.
// If error is not nil a parsing error occurred.
func (c cgroupCpusetInfo) IsCpusetCpusAvailable(provided string) (bool, error) {
return isCpusetListAvailable(provided, c.Cpus)
}
// IsCpusetMemsAvailable returns `true` if the provided string set is contained
// in cgroup's cpuset.mems set, `false` otherwise.
// If error is not nil a parsing error occurred.
func (c cgroupCpusetInfo) IsCpusetMemsAvailable(provided string) (bool, error) {
return isCpusetListAvailable(provided, c.Mems)
}
func isCpusetListAvailable(provided, available string) (bool, error) {
parsedProvided, err := parsers.ParseUintList(provided)
if err != nil {
return false, err
}
parsedAvailable, err := parsers.ParseUintList(available)
if err != nil {
return false, err
}
for k := range parsedProvided {
if !parsedAvailable[k] {
return false, nil
}
}
return true, nil
}
// Returns bit count of 1, used by NumCPU
func popcnt(x uint64) (n byte) {
x -= (x >> 1) & 0x5555555555555555
x = (x>>2)&0x3333333333333333 + x&0x3333333333333333
x += x >> 4
x &= 0x0f0f0f0f0f0f0f0f
x *= 0x0101010101010101
return byte(x >> 56)
}

View File

@@ -1,254 +0,0 @@
package sysinfo
import (
"fmt"
"io/ioutil"
"os"
"path"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func findCgroupMountpoints() (map[string]string, error) {
cgMounts, err := cgroups.GetCgroupMounts(false)
if err != nil {
return nil, fmt.Errorf("Failed to parse cgroup information: %v", err)
}
mps := make(map[string]string)
for _, m := range cgMounts {
for _, ss := range m.Subsystems {
mps[ss] = m.Mountpoint
}
}
return mps, nil
}
// New returns a new SysInfo, using the filesystem to detect which features
// the kernel supports. If `quiet` is `false` warnings are printed in logs
// whenever an error occurs or misconfigurations are present.
func New(quiet bool) *SysInfo {
sysInfo := &SysInfo{}
cgMounts, err := findCgroupMountpoints()
if err != nil {
logrus.Warnf("Failed to parse cgroup information: %v", err)
} else {
sysInfo.cgroupMemInfo = checkCgroupMem(cgMounts, quiet)
sysInfo.cgroupCPUInfo = checkCgroupCPU(cgMounts, quiet)
sysInfo.cgroupBlkioInfo = checkCgroupBlkioInfo(cgMounts, quiet)
sysInfo.cgroupCpusetInfo = checkCgroupCpusetInfo(cgMounts, quiet)
sysInfo.cgroupPids = checkCgroupPids(quiet)
}
_, ok := cgMounts["devices"]
sysInfo.CgroupDevicesEnabled = ok
sysInfo.IPv4ForwardingDisabled = !readProcBool("/proc/sys/net/ipv4/ip_forward")
sysInfo.BridgeNFCallIPTablesDisabled = !readProcBool("/proc/sys/net/bridge/bridge-nf-call-iptables")
sysInfo.BridgeNFCallIP6TablesDisabled = !readProcBool("/proc/sys/net/bridge/bridge-nf-call-ip6tables")
// Check if AppArmor is supported.
if _, err := os.Stat("/sys/kernel/security/apparmor"); !os.IsNotExist(err) {
sysInfo.AppArmor = true
}
// Check if Seccomp is supported, via CONFIG_SECCOMP.
if err := unix.Prctl(unix.PR_GET_SECCOMP, 0, 0, 0, 0); err != unix.EINVAL {
// Make sure the kernel has CONFIG_SECCOMP_FILTER.
if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0); err != unix.EINVAL {
sysInfo.Seccomp = true
}
}
return sysInfo
}
// checkCgroupMem reads the memory information from the memory cgroup mount point.
func checkCgroupMem(cgMounts map[string]string, quiet bool) cgroupMemInfo {
mountPoint, ok := cgMounts["memory"]
if !ok {
if !quiet {
logrus.Warn("Your kernel does not support cgroup memory limit")
}
return cgroupMemInfo{}
}
swapLimit := cgroupEnabled(mountPoint, "memory.memsw.limit_in_bytes")
if !quiet && !swapLimit {
logrus.Warn("Your kernel does not support swap memory limit")
}
memoryReservation := cgroupEnabled(mountPoint, "memory.soft_limit_in_bytes")
if !quiet && !memoryReservation {
logrus.Warn("Your kernel does not support memory reservation")
}
oomKillDisable := cgroupEnabled(mountPoint, "memory.oom_control")
if !quiet && !oomKillDisable {
logrus.Warn("Your kernel does not support oom control")
}
memorySwappiness := cgroupEnabled(mountPoint, "memory.swappiness")
if !quiet && !memorySwappiness {
logrus.Warn("Your kernel does not support memory swappiness")
}
kernelMemory := cgroupEnabled(mountPoint, "memory.kmem.limit_in_bytes")
if !quiet && !kernelMemory {
logrus.Warn("Your kernel does not support kernel memory limit")
}
return cgroupMemInfo{
MemoryLimit: true,
SwapLimit: swapLimit,
MemoryReservation: memoryReservation,
OomKillDisable: oomKillDisable,
MemorySwappiness: memorySwappiness,
KernelMemory: kernelMemory,
}
}
// checkCgroupCPU reads the cpu information from the cpu cgroup mount point.
func checkCgroupCPU(cgMounts map[string]string, quiet bool) cgroupCPUInfo {
mountPoint, ok := cgMounts["cpu"]
if !ok {
if !quiet {
logrus.Warn("Unable to find cpu cgroup in mounts")
}
return cgroupCPUInfo{}
}
cpuShares := cgroupEnabled(mountPoint, "cpu.shares")
if !quiet && !cpuShares {
logrus.Warn("Your kernel does not support cgroup cpu shares")
}
cpuCfsPeriod := cgroupEnabled(mountPoint, "cpu.cfs_period_us")
if !quiet && !cpuCfsPeriod {
logrus.Warn("Your kernel does not support cgroup cfs period")
}
cpuCfsQuota := cgroupEnabled(mountPoint, "cpu.cfs_quota_us")
if !quiet && !cpuCfsQuota {
logrus.Warn("Your kernel does not support cgroup cfs quotas")
}
cpuRealtimePeriod := cgroupEnabled(mountPoint, "cpu.rt_period_us")
if !quiet && !cpuRealtimePeriod {
logrus.Warn("Your kernel does not support cgroup rt period")
}
cpuRealtimeRuntime := cgroupEnabled(mountPoint, "cpu.rt_runtime_us")
if !quiet && !cpuRealtimeRuntime {
logrus.Warn("Your kernel does not support cgroup rt runtime")
}
return cgroupCPUInfo{
CPUShares: cpuShares,
CPUCfsPeriod: cpuCfsPeriod,
CPUCfsQuota: cpuCfsQuota,
CPURealtimePeriod: cpuRealtimePeriod,
CPURealtimeRuntime: cpuRealtimeRuntime,
}
}
// checkCgroupBlkioInfo reads the blkio information from the blkio cgroup mount point.
func checkCgroupBlkioInfo(cgMounts map[string]string, quiet bool) cgroupBlkioInfo {
mountPoint, ok := cgMounts["blkio"]
if !ok {
if !quiet {
logrus.Warn("Unable to find blkio cgroup in mounts")
}
return cgroupBlkioInfo{}
}
weight := cgroupEnabled(mountPoint, "blkio.weight")
if !quiet && !weight {
logrus.Warn("Your kernel does not support cgroup blkio weight")
}
weightDevice := cgroupEnabled(mountPoint, "blkio.weight_device")
if !quiet && !weightDevice {
logrus.Warn("Your kernel does not support cgroup blkio weight_device")
}
readBpsDevice := cgroupEnabled(mountPoint, "blkio.throttle.read_bps_device")
if !quiet && !readBpsDevice {
logrus.Warn("Your kernel does not support cgroup blkio throttle.read_bps_device")
}
writeBpsDevice := cgroupEnabled(mountPoint, "blkio.throttle.write_bps_device")
if !quiet && !writeBpsDevice {
logrus.Warn("Your kernel does not support cgroup blkio throttle.write_bps_device")
}
readIOpsDevice := cgroupEnabled(mountPoint, "blkio.throttle.read_iops_device")
if !quiet && !readIOpsDevice {
logrus.Warn("Your kernel does not support cgroup blkio throttle.read_iops_device")
}
writeIOpsDevice := cgroupEnabled(mountPoint, "blkio.throttle.write_iops_device")
if !quiet && !writeIOpsDevice {
logrus.Warn("Your kernel does not support cgroup blkio throttle.write_iops_device")
}
return cgroupBlkioInfo{
BlkioWeight: weight,
BlkioWeightDevice: weightDevice,
BlkioReadBpsDevice: readBpsDevice,
BlkioWriteBpsDevice: writeBpsDevice,
BlkioReadIOpsDevice: readIOpsDevice,
BlkioWriteIOpsDevice: writeIOpsDevice,
}
}
// checkCgroupCpusetInfo reads the cpuset information from the cpuset cgroup mount point.
func checkCgroupCpusetInfo(cgMounts map[string]string, quiet bool) cgroupCpusetInfo {
mountPoint, ok := cgMounts["cpuset"]
if !ok {
if !quiet {
logrus.Warn("Unable to find cpuset cgroup in mounts")
}
return cgroupCpusetInfo{}
}
cpus, err := ioutil.ReadFile(path.Join(mountPoint, "cpuset.cpus"))
if err != nil {
return cgroupCpusetInfo{}
}
mems, err := ioutil.ReadFile(path.Join(mountPoint, "cpuset.mems"))
if err != nil {
return cgroupCpusetInfo{}
}
return cgroupCpusetInfo{
Cpuset: true,
Cpus: strings.TrimSpace(string(cpus)),
Mems: strings.TrimSpace(string(mems)),
}
}
// checkCgroupPids reads the pids information from the pids cgroup mount point.
func checkCgroupPids(quiet bool) cgroupPids {
_, err := cgroups.FindCgroupMountpoint("pids")
if err != nil {
if !quiet {
logrus.Warn(err)
}
return cgroupPids{}
}
return cgroupPids{
PidsLimit: true,
}
}
func cgroupEnabled(mountPoint, name string) bool {
_, err := os.Stat(path.Join(mountPoint, name))
return err == nil
}
func readProcBool(path string) bool {
val, err := ioutil.ReadFile(path)
if err != nil {
return false
}
return strings.TrimSpace(string(val)) == "1"
}

View File

@@ -1,121 +0,0 @@
// +build solaris,cgo
package sysinfo
import (
"bytes"
"os/exec"
"strconv"
"strings"
)
/*
#cgo LDFLAGS: -llgrp
#include <unistd.h>
#include <stdlib.h>
#include <sys/lgrp_user.h>
int getLgrpCount() {
lgrp_cookie_t lgrpcookie = LGRP_COOKIE_NONE;
uint_t nlgrps;
if ((lgrpcookie = lgrp_init(LGRP_VIEW_OS)) == LGRP_COOKIE_NONE) {
return -1;
}
nlgrps = lgrp_nlgrps(lgrpcookie);
return nlgrps;
}
*/
import "C"
// IsCPUSharesAvailable returns whether CPUShares setting is supported.
// We need FSS to be set as default scheduling class to support CPU Shares
func IsCPUSharesAvailable() bool {
cmd := exec.Command("/usr/sbin/dispadmin", "-d")
outBuf := new(bytes.Buffer)
errBuf := new(bytes.Buffer)
cmd.Stderr = errBuf
cmd.Stdout = outBuf
if err := cmd.Run(); err != nil {
return false
}
return (strings.Contains(outBuf.String(), "FSS"))
}
// New returns a new SysInfo, using the filesystem to detect which features
// the kernel supports.
//NOTE Solaris: If we change the below capabilities be sure
// to update verifyPlatformContainerSettings() in daemon_solaris.go
func New(quiet bool) *SysInfo {
sysInfo := &SysInfo{}
sysInfo.cgroupMemInfo = setCgroupMem(quiet)
sysInfo.cgroupCPUInfo = setCgroupCPU(quiet)
sysInfo.cgroupBlkioInfo = setCgroupBlkioInfo(quiet)
sysInfo.cgroupCpusetInfo = setCgroupCPUsetInfo(quiet)
sysInfo.IPv4ForwardingDisabled = false
sysInfo.AppArmor = false
return sysInfo
}
// setCgroupMem reads the memory information for Solaris.
func setCgroupMem(quiet bool) cgroupMemInfo {
return cgroupMemInfo{
MemoryLimit: true,
SwapLimit: true,
MemoryReservation: false,
OomKillDisable: false,
MemorySwappiness: false,
KernelMemory: false,
}
}
// setCgroupCPU reads the cpu information for Solaris.
func setCgroupCPU(quiet bool) cgroupCPUInfo {
return cgroupCPUInfo{
CPUShares: true,
CPUCfsPeriod: false,
CPUCfsQuota: true,
CPURealtimePeriod: false,
CPURealtimeRuntime: false,
}
}
// blkio switches are not supported in Solaris.
func setCgroupBlkioInfo(quiet bool) cgroupBlkioInfo {
return cgroupBlkioInfo{
BlkioWeight: false,
BlkioWeightDevice: false,
}
}
// setCgroupCPUsetInfo reads the cpuset information for Solaris.
func setCgroupCPUsetInfo(quiet bool) cgroupCpusetInfo {
return cgroupCpusetInfo{
Cpuset: true,
Cpus: getCPUCount(),
Mems: getLgrpCount(),
}
}
func getCPUCount() string {
ncpus := C.sysconf(C._SC_NPROCESSORS_ONLN)
if ncpus <= 0 {
return ""
}
return strconv.FormatInt(int64(ncpus), 16)
}
func getLgrpCount() string {
nlgrps := C.getLgrpCount()
if nlgrps <= 0 {
return ""
}
return strconv.FormatInt(int64(nlgrps), 16)
}

View File

@@ -1,9 +0,0 @@
// +build !linux,!solaris,!windows
package sysinfo
// New returns an empty SysInfo for non linux nor solaris for now.
func New(quiet bool) *SysInfo {
sysInfo := &SysInfo{}
return sysInfo
}

View File

@@ -1,9 +0,0 @@
// +build windows
package sysinfo
// New returns an empty SysInfo for windows for now.
func New(quiet bool) *SysInfo {
sysInfo := &SysInfo{}
return sysInfo
}

View File

@@ -0,0 +1,32 @@
// +build ignore
package main
import (
"encoding/json"
"io/ioutil"
"os"
"path/filepath"
"github.com/docker/docker/profiles/seccomp"
)
// saves the default seccomp profile as a json file so people can use it as a
// base for their own custom profiles
func main() {
wd, err := os.Getwd()
if err != nil {
panic(err)
}
f := filepath.Join(wd, "default.json")
// write the default profile to the file
b, err := json.MarshalIndent(seccomp.DefaultProfile(), "", "\t")
if err != nil {
panic(err)
}
if err := ioutil.WriteFile(f, b, 0644); err != nil {
panic(err)
}
}

View File

@@ -68,6 +68,7 @@ make BUILDTAGS='seccomp apparmor'
| selinux | selinux process and mount labeling | <none> |
| apparmor | apparmor profile support | <none> |
| ambient | ambient capability support | kernel 4.3 |
| nokmem | disable kernel memory account | <none> |
### Running the test suite
@@ -87,6 +88,18 @@ You can run a specific test case by setting the `TESTFLAGS` variable.
# make test TESTFLAGS="-run=SomeTestFunction"
```
You can run a specific integration test by setting the `TESTPATH` variable.
```bash
# make test TESTPATH="/checkpoint.bats"
```
You can run a test in your proxy environment by setting `DOCKER_BUILD_PROXY` and `DOCKER_RUN_PROXY` variables.
```bash
# make test DOCKER_BUILD_PROXY="--build-arg HTTP_PROXY=http://yourproxy/" DOCKER_RUN_PROXY="-e HTTP_PROXY=http://yourproxy/"
```
### Dependencies Management
`runc` uses [vndr](https://github.com/LK4D4/vndr) for dependencies management.
@@ -251,3 +264,7 @@ PIDFile=/run/mycontainerid.pid
[Install]
WantedBy=multi-user.target
```
## License
The code and docs are released under the [Apache 2.0 license](LICENSE).

View File

@@ -148,6 +148,7 @@ config := &configs.Config{
{Type: configs.NEWPID},
{Type: configs.NEWUSER},
{Type: configs.NEWNET},
{Type: configs.NEWCGROUP},
}),
Cgroups: &configs.Cgroup{
Name: "test-container",
@@ -323,6 +324,7 @@ generated when building libcontainer with docker.
## Copyright and license
Code and documentation copyright 2014 Docker, inc. Code released under the Apache 2.0 license.
Docs released under Creative commons.
Code and documentation copyright 2014 Docker, inc.
The code and documentation are released under the [Apache 2.0 license](../LICENSE).
The documentation is also released under Creative Commons Attribution 4.0 International License.
You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/.

View File

@@ -13,40 +13,50 @@ import (
"strings"
"time"
"github.com/docker/go-units"
units "github.com/docker/go-units"
)
const (
cgroupNamePrefix = "name="
CgroupNamePrefix = "name="
CgroupProcesses = "cgroup.procs"
)
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(subsystem string) (string, error) {
mnt, _, err := FindCgroupMountpointAndRoot(subsystem)
func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem)
return mnt, err
}
func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) {
// We are not using mount.GetMounts() because it's super-inefficient,
// parsing it directly sped up x10 times because of not using Sscanf.
// It was one of two major performance drawbacks in container start.
if !isSubsystemAvailable(subsystem) {
return "", "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", "", err
}
defer f.Close()
scanner := bufio.NewScanner(f)
return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
}
func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
scanner := bufio.NewScanner(reader)
for scanner.Scan() {
txt := scanner.Text()
fields := strings.Split(txt, " ")
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if opt == subsystem {
return fields[4], fields[3], nil
fields := strings.Fields(txt)
if len(fields) < 5 {
continue
}
if strings.HasPrefix(fields[4], cgroupPath) {
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if opt == subsystem {
return fields[4], fields[3], nil
}
}
}
}
@@ -103,7 +113,7 @@ func FindCgroupMountpointDir() (string, error) {
}
if postSeparatorFields[0] == "cgroup" {
// Check that the mount is properly formated.
// Check that the mount is properly formatted.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}
@@ -151,19 +161,20 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
Root: fields[3],
}
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if !ss[opt] {
seen, known := ss[opt]
if !known || (!all && seen) {
continue
}
if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
} else {
m.Subsystems = append(m.Subsystems, opt)
}
if !all {
numFound++
ss[opt] = true
if strings.HasPrefix(opt, CgroupNamePrefix) {
opt = opt[len(CgroupNamePrefix):]
}
m.Subsystems = append(m.Subsystems, opt)
numFound++
}
if len(m.Subsystems) > 0 || all {
res = append(res, m)
}
res = append(res, m)
}
if err := scanner.Err(); err != nil {
return nil, err
@@ -187,7 +198,7 @@ func GetCgroupMounts(all bool) ([]Mount, error) {
allMap := make(map[string]bool)
for s := range allSubsystems {
allMap[s] = true
allMap[s] = false
}
return getCgroupMountsHelper(allMap, f, all)
}
@@ -256,13 +267,13 @@ func GetInitCgroupPath(subsystem string) (string, error) {
}
func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
mnt, root, err := FindCgroupMountpointAndRoot(subsystem)
mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
if err != nil {
return "", err
}
// This is needed for nested containers, because in /proc/self/cgroup we
// see pathes from host, which don't exist in container.
// see paths from host, which don't exist in container.
relCgroup, err := filepath.Rel(root, cgroup)
if err != nil {
return "", err
@@ -342,7 +353,7 @@ func getControllerPath(subsystem string, cgroups map[string]string) (string, err
return p, nil
}
if p, ok := cgroups[cgroupNamePrefix+subsystem]; ok {
if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
return p, nil
}
@@ -452,7 +463,7 @@ func WriteCgroupProc(dir string, pid int) error {
return fmt.Errorf("no such directory for %s", CgroupProcesses)
}
// Dont attach any pid to the cgroup if -1 is specified as a pid
// Don't attach any pid to the cgroup if -1 is specified as a pid
if pid != -1 {
if err := ioutil.WriteFile(filepath.Join(dir, CgroupProcesses), []byte(strconv.Itoa(pid)), 0700); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)

View File

@@ -186,12 +186,19 @@ type Config struct {
// callers keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"`
// Rootless specifies whether the container is a rootless container.
Rootless bool `json:"rootless"`
// IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into
// to limit the resources (e.g., L3 cache) the container has available
// IntelRdt specifies settings for Intel RDT group that the container is placed into
// to limit the resources (e.g., L3 cache, memory bandwidth) the container has available
IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
// RootlessEUID is set when the runc was launched with non-zero EUID.
// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
// When RootlessEUID is set, runc creates a new userns for the container.
// (config.json needs to contain userns settings)
RootlessEUID bool `json:"rootless_euid,omitempty"`
// RootlessCgroups is set when unlikely to have the full access to cgroups.
// When RootlessCgroups is set, cgroups errors are ignored.
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
}
type Hooks struct {
@@ -265,26 +272,23 @@ func (hooks Hooks) MarshalJSON() ([]byte, error) {
})
}
// HookState is the payload provided to a hook on execution.
type HookState specs.State
type Hook interface {
// Run executes the hook with the provided state.
Run(HookState) error
Run(*specs.State) error
}
// NewFunctionHook will call the provided function when the hook is run.
func NewFunctionHook(f func(HookState) error) FuncHook {
func NewFunctionHook(f func(*specs.State) error) FuncHook {
return FuncHook{
run: f,
}
}
type FuncHook struct {
run func(HookState) error
run func(*specs.State) error
}
func (f FuncHook) Run(s HookState) error {
func (f FuncHook) Run(s *specs.State) error {
return f.run(s)
}
@@ -307,7 +311,7 @@ type CommandHook struct {
Command
}
func (c Command) Run(s HookState) error {
func (c Command) Run(s *specs.State) error {
b, err := json.Marshal(s)
if err != nil {
return err

View File

@@ -4,4 +4,10 @@ type IntelRdt struct {
// The schema for L3 cache id and capacity bitmask (CBM)
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
L3CacheSchema string `json:"l3_cache_schema,omitempty"`
// The schema of memory bandwidth per L3 cache id
// Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
// The unit of memory bandwidth is specified in "percentages" by
// default, and in "MBps" if MBA Software Controller is enabled.
MemBwSchema string `json:"memBwSchema,omitempty"`
}

View File

@@ -7,12 +7,13 @@ import (
)
const (
NEWNET NamespaceType = "NEWNET"
NEWPID NamespaceType = "NEWPID"
NEWNS NamespaceType = "NEWNS"
NEWUTS NamespaceType = "NEWUTS"
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
NEWNET NamespaceType = "NEWNET"
NEWPID NamespaceType = "NEWPID"
NEWNS NamespaceType = "NEWNS"
NEWUTS NamespaceType = "NEWUTS"
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
NEWCGROUP NamespaceType = "NEWCGROUP"
)
var (
@@ -35,6 +36,8 @@ func NsName(ns NamespaceType) string {
return "user"
case NEWUTS:
return "uts"
case NEWCGROUP:
return "cgroup"
}
return ""
}
@@ -68,6 +71,7 @@ func NamespaceTypes() []NamespaceType {
NEWNET,
NEWPID,
NEWNS,
NEWCGROUP,
}
}

View File

@@ -9,12 +9,13 @@ func (n *Namespace) Syscall() int {
}
var namespaceInfo = map[NamespaceType]int{
NEWNET: unix.CLONE_NEWNET,
NEWNS: unix.CLONE_NEWNS,
NEWUSER: unix.CLONE_NEWUSER,
NEWIPC: unix.CLONE_NEWIPC,
NEWUTS: unix.CLONE_NEWUTS,
NEWPID: unix.CLONE_NEWPID,
NEWNET: unix.CLONE_NEWNET,
NEWNS: unix.CLONE_NEWNS,
NEWUSER: unix.CLONE_NEWUSER,
NEWIPC: unix.CLONE_NEWIPC,
NEWUTS: unix.CLONE_NEWUTS,
NEWPID: unix.CLONE_NEWPID,
NEWCGROUP: unix.CLONE_NEWCGROUP,
}
// CloneFlags parses the container's Namespaces options to set the correct

View File

@@ -10,8 +10,8 @@ The `nsenter` package will `import "C"` and it uses [cgo](https://golang.org/cmd
package. In cgo, if the import of "C" is immediately preceded by a comment, that comment,
called the preamble, is used as a header when compiling the C parts of the package.
So every time we import package `nsenter`, the C code function `nsexec()` would be
called. And package `nsenter` is now only imported in `main_unix.go`, so every time
before we call `cmd.Start` on linux, that C code would run.
called. And package `nsenter` is only imported in `init.go`, so every time the runc
`init` command is invoked, that C code is run.
Because `nsexec()` must be run before the Go runtime in order to use the
Linux kernel namespace, you must `import` this library into a package if
@@ -37,7 +37,7 @@ the parent `nsexec()` will exit and the child `nsexec()` process will
return to allow the Go runtime take over.
NOTE: We do both `setns(2)` and `clone(2)` even if we don't have any
CLONE_NEW* clone flags because we must fork a new process in order to
`CLONE_NEW*` clone flags because we must fork a new process in order to
enter the PID namespace.

View File

@@ -42,6 +42,12 @@ enum sync_t {
SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
};
/*
* Synchronisation value for cgroup namespace setup.
* The same constant is defined in process_linux.go as "createCgroupns".
*/
#define CREATECGROUPNS 0x80
/* longjmp() arguments. */
#define JUMP_PARENT 0x00
#define JUMP_CHILD 0xA0
@@ -82,7 +88,7 @@ struct nlconfig_t {
uint8_t is_setgroup;
/* Rootless container settings. */
uint8_t is_rootless;
uint8_t is_rootless_euid; /* boolean */
char *uidmappath;
size_t uidmappath_len;
char *gidmappath;
@@ -100,7 +106,7 @@ struct nlconfig_t {
#define GIDMAP_ATTR 27284
#define SETGROUP_ATTR 27285
#define OOM_SCORE_ADJ_ATTR 27286
#define ROOTLESS_ATTR 27287
#define ROOTLESS_EUID_ATTR 27287
#define UIDMAPPATH_ATTR 27288
#define GIDMAPPATH_ATTR 27289
@@ -211,7 +217,7 @@ static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len)
/*
* If @app is NULL, execve will segfault. Just check it here and bail (if
* we're in this path, the caller is already getting desparate and there
* we're in this path, the caller is already getting desperate and there
* isn't a backup to this failing). This usually would be a configuration
* or programming issue.
*/
@@ -419,8 +425,8 @@ static void nl_parse(int fd, struct nlconfig_t *config)
case CLONE_FLAGS_ATTR:
config->cloneflags = readint32(current);
break;
case ROOTLESS_ATTR:
config->is_rootless = readint8(current);
case ROOTLESS_EUID_ATTR:
config->is_rootless_euid = readint8(current); /* boolean */
break;
case OOM_SCORE_ADJ_ATTR:
config->oom_score_adj = current;
@@ -640,7 +646,6 @@ void nsexec(void)
case JUMP_PARENT:{
int len;
pid_t child, first_child = -1;
char buf[JSON_MAX];
bool ready = false;
/* For debugging. */
@@ -687,7 +692,7 @@ void nsexec(void)
* newuidmap/newgidmap shall be used.
*/
if (config.is_rootless && !config.is_setgroup)
if (config.is_rootless_euid && !config.is_setgroup)
update_setgroups(child, SETGROUPS_DENY);
/* Set up mappings. */
@@ -716,6 +721,18 @@ void nsexec(void)
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
}
/* Send the init_func pid back to our parent.
*
* Send the init_func pid and the pid of the first child back to our parent.
* We need to send both back because we can't reap the first child we created (CLONE_PARENT).
* It becomes the responsibility of our parent to reap the first child.
*/
len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
if (len < 0) {
kill(child, SIGKILL);
bail("unable to generate JSON for child pid");
}
}
break;
case SYNC_CHILD_READY:
@@ -759,23 +776,6 @@ void nsexec(void)
bail("unexpected sync value: %u", s);
}
}
/*
* Send the init_func pid and the pid of the first child back to our parent.
*
* We need to send both back because we can't reap the first child we created (CLONE_PARENT).
* It becomes the responsibility of our parent to reap the first child.
*/
len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
if (len < 0) {
kill(child, SIGKILL);
bail("unable to generate JSON for child pid");
}
if (write(pipenum, buf, len) != len) {
kill(child, SIGKILL);
bail("unable to send child pid to bootstrapper");
}
exit(0);
}
@@ -862,14 +862,17 @@ void nsexec(void)
if (setresuid(0, 0, 0) < 0)
bail("failed to become root in user namespace");
}
/*
* Unshare all of the namespaces. Note that we don't merge this
* with clone() because there were some old kernel versions where
* clone(CLONE_PARENT | CLONE_NEWPID) was broken, so we'll just do
* it the long way.
* Unshare all of the namespaces. Now, it should be noted that this
* ordering might break in the future (especially with rootless
* containers). But for now, it's not possible to split this into
* CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
*
* Note that we don't merge this with clone() because there were
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway.
*/
if (unshare(config.cloneflags) < 0)
if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
bail("failed to unshare namespaces");
/*
@@ -953,11 +956,23 @@ void nsexec(void)
if (setgid(0) < 0)
bail("setgid failed");
if (!config.is_rootless && config.is_setgroup) {
if (!config.is_rootless_euid && config.is_setgroup) {
if (setgroups(0, NULL) < 0)
bail("setgroups failed");
}
/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
if (config.cloneflags & CLONE_NEWCGROUP) {
uint8_t value;
if (read(pipenum, &value, sizeof(value)) != sizeof(value))
bail("read synchronisation value failed");
if (value == CREATECGROUPNS) {
if (unshare(CLONE_NEWCGROUP) < 0)
bail("failed to unshare cgroup namespace");
} else
bail("received unknown synchronisation value");
}
s = SYNC_CHILD_READY;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with patent: write(SYNC_CHILD_READY)");

View File

@@ -5,6 +5,7 @@ package user
import (
"io"
"os"
"strconv"
"golang.org/x/sys/unix"
)
@@ -115,22 +116,23 @@ func CurrentGroup() (Group, error) {
return LookupGid(unix.Getgid())
}
func CurrentUserSubUIDs() ([]SubID, error) {
func currentUserSubIDs(fileName string) ([]SubID, error) {
u, err := CurrentUser()
if err != nil {
return nil, err
}
return ParseSubIDFileFilter("/etc/subuid",
func(entry SubID) bool { return entry.Name == u.Name })
filter := func(entry SubID) bool {
return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid)
}
return ParseSubIDFileFilter(fileName, filter)
}
func CurrentGroupSubGIDs() ([]SubID, error) {
g, err := CurrentGroup()
if err != nil {
return nil, err
}
return ParseSubIDFileFilter("/etc/subgid",
func(entry SubID) bool { return entry.Name == g.Name })
func CurrentUserSubUIDs() ([]SubID, error) {
return currentUserSubIDs("/etc/subuid")
}
func CurrentUserSubGIDs() ([]SubID, error) {
return currentUserSubIDs("/etc/subgid")
}
func CurrentProcessUIDMap() ([]IDMap, error) {

View File

@@ -1,7 +1,7 @@
# OCI runtime-spec. When updating this, make sure you use a version tag rather
# than a commit ID so it's much more obvious what version of the spec we are
# using.
github.com/opencontainers/runtime-spec v1.0.0
github.com/opencontainers/runtime-spec 5684b8af48c1ac3b1451fa499724e30e3c20a294
# Core libcontainer functionality.
github.com/mrunalp/fileutils ed869b029674c0e9ce4c0dfa781405c2d9946d08
github.com/opencontainers/selinux v1.0.0-rc1