Files
podman/pkg/spec/spec.go
Daniel J Walsh e62d081770 Update podman to use containers.conf
Add more default options parsing

Switch to using --time as opposed to --timeout to better match Docker.

Signed-off-by: Daniel J Walsh <dwalsh@redhat.com>
2020-04-20 16:11:36 -04:00

545 lines
15 KiB
Go

package createconfig
import (
"strings"
"github.com/containers/common/pkg/capabilities"
cconfig "github.com/containers/common/pkg/config"
"github.com/containers/common/pkg/sysinfo"
"github.com/containers/libpod/libpod"
"github.com/containers/libpod/pkg/cgroups"
"github.com/containers/libpod/pkg/env"
"github.com/containers/libpod/pkg/rootless"
"github.com/containers/libpod/pkg/util"
"github.com/docker/go-units"
"github.com/opencontainers/runc/libcontainer/user"
spec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/runtime-tools/generate"
"github.com/pkg/errors"
)
const CpuPeriod = 100000
func GetAvailableGids() (int64, error) {
idMap, err := user.ParseIDMapFile("/proc/self/gid_map")
if err != nil {
return 0, err
}
count := int64(0)
for _, r := range idMap {
count += r.Count
}
return count, nil
}
// CreateConfigToOCISpec parses information needed to create a container into an OCI runtime spec
func (config *CreateConfig) createConfigToOCISpec(runtime *libpod.Runtime, userMounts []spec.Mount) (*spec.Spec, error) {
cgroupPerm := "ro"
g, err := generate.New("linux")
if err != nil {
return nil, err
}
// Remove the default /dev/shm mount to ensure we overwrite it
g.RemoveMount("/dev/shm")
g.HostSpecific = true
addCgroup := true
canMountSys := true
isRootless := rootless.IsRootless()
inUserNS := config.User.InNS(isRootless)
if inUserNS && config.Network.NetMode.IsHost() {
canMountSys = false
}
if config.Security.Privileged && canMountSys {
cgroupPerm = "rw"
g.RemoveMount("/sys")
sysMnt := spec.Mount{
Destination: "/sys",
Type: "sysfs",
Source: "sysfs",
Options: []string{"rprivate", "nosuid", "noexec", "nodev", "rw"},
}
g.AddMount(sysMnt)
} else if !canMountSys {
addCgroup = false
g.RemoveMount("/sys")
r := "ro"
if config.Security.Privileged {
r = "rw"
}
sysMnt := spec.Mount{
Destination: "/sys",
Type: TypeBind,
Source: "/sys",
Options: []string{"rprivate", "nosuid", "noexec", "nodev", r, "rbind"},
}
g.AddMount(sysMnt)
if !config.Security.Privileged && isRootless {
g.AddLinuxMaskedPaths("/sys/kernel")
}
}
var runtimeConfig *cconfig.Config
if runtime != nil {
runtimeConfig, err = runtime.GetConfig()
if err != nil {
return nil, err
}
g.Config.Process.Capabilities.Bounding = runtimeConfig.Containers.DefaultCapabilities
sysctls, err := util.ValidateSysctls(runtimeConfig.Containers.DefaultSysctls)
if err != nil {
return nil, err
}
for name, val := range config.Security.Sysctl {
sysctls[name] = val
}
config.Security.Sysctl = sysctls
if !util.StringInSlice("host", config.Resources.Ulimit) {
config.Resources.Ulimit = append(runtimeConfig.Containers.DefaultUlimits, config.Resources.Ulimit...)
}
if config.Resources.PidsLimit < 0 && !config.cgroupDisabled() {
config.Resources.PidsLimit = runtimeConfig.Containers.PidsLimit
}
} else {
g.Config.Process.Capabilities.Bounding = cconfig.DefaultCapabilities
if config.Resources.PidsLimit < 0 && !config.cgroupDisabled() {
config.Resources.PidsLimit = cconfig.DefaultPidsLimit
}
}
gid5Available := true
if isRootless {
nGids, err := GetAvailableGids()
if err != nil {
return nil, err
}
gid5Available = nGids >= 5
}
// When using a different user namespace, check that the GID 5 is mapped inside
// the container.
if gid5Available && len(config.User.IDMappings.GIDMap) > 0 {
mappingFound := false
for _, r := range config.User.IDMappings.GIDMap {
if r.ContainerID <= 5 && 5 < r.ContainerID+r.Size {
mappingFound = true
break
}
}
if !mappingFound {
gid5Available = false
}
}
if !gid5Available {
// If we have no GID mappings, the gid=5 default option would fail, so drop it.
g.RemoveMount("/dev/pts")
devPts := spec.Mount{
Destination: "/dev/pts",
Type: "devpts",
Source: "devpts",
Options: []string{"rprivate", "nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620"},
}
g.AddMount(devPts)
}
if inUserNS && config.Ipc.IpcMode.IsHost() {
g.RemoveMount("/dev/mqueue")
devMqueue := spec.Mount{
Destination: "/dev/mqueue",
Type: TypeBind,
Source: "/dev/mqueue",
Options: []string{"bind", "nosuid", "noexec", "nodev"},
}
g.AddMount(devMqueue)
}
if inUserNS && config.Pid.PidMode.IsHost() {
g.RemoveMount("/proc")
procMount := spec.Mount{
Destination: "/proc",
Type: TypeBind,
Source: "/proc",
Options: []string{"rbind", "nosuid", "noexec", "nodev"},
}
g.AddMount(procMount)
}
if addCgroup {
cgroupMnt := spec.Mount{
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Source: "cgroup",
Options: []string{"rprivate", "nosuid", "noexec", "nodev", "relatime", cgroupPerm},
}
g.AddMount(cgroupMnt)
}
g.SetProcessCwd(config.WorkDir)
g.SetProcessArgs(config.Command)
g.SetProcessTerminal(config.Tty)
for key, val := range config.Annotations {
g.AddAnnotation(key, val)
}
addedResources := false
// RESOURCES - MEMORY
if config.Resources.Memory != 0 {
g.SetLinuxResourcesMemoryLimit(config.Resources.Memory)
// If a swap limit is not explicitly set, also set a swap limit
// Default to double the memory limit
if config.Resources.MemorySwap == 0 {
g.SetLinuxResourcesMemorySwap(2 * config.Resources.Memory)
}
addedResources = true
}
if config.Resources.MemoryReservation != 0 {
g.SetLinuxResourcesMemoryReservation(config.Resources.MemoryReservation)
addedResources = true
}
if config.Resources.MemorySwap != 0 {
g.SetLinuxResourcesMemorySwap(config.Resources.MemorySwap)
addedResources = true
}
if config.Resources.KernelMemory != 0 {
g.SetLinuxResourcesMemoryKernel(config.Resources.KernelMemory)
addedResources = true
}
if config.Resources.MemorySwappiness != -1 {
g.SetLinuxResourcesMemorySwappiness(uint64(config.Resources.MemorySwappiness))
addedResources = true
}
g.SetLinuxResourcesMemoryDisableOOMKiller(config.Resources.DisableOomKiller)
g.SetProcessOOMScoreAdj(config.Resources.OomScoreAdj)
// RESOURCES - CPU
if config.Resources.CPUShares != 0 {
g.SetLinuxResourcesCPUShares(config.Resources.CPUShares)
addedResources = true
}
if config.Resources.CPUQuota != 0 {
g.SetLinuxResourcesCPUQuota(config.Resources.CPUQuota)
addedResources = true
}
if config.Resources.CPUPeriod != 0 {
g.SetLinuxResourcesCPUPeriod(config.Resources.CPUPeriod)
addedResources = true
}
if config.Resources.CPUs != 0 {
g.SetLinuxResourcesCPUPeriod(CpuPeriod)
g.SetLinuxResourcesCPUQuota(int64(config.Resources.CPUs * CpuPeriod))
addedResources = true
}
if config.Resources.CPURtRuntime != 0 {
g.SetLinuxResourcesCPURealtimeRuntime(config.Resources.CPURtRuntime)
addedResources = true
}
if config.Resources.CPURtPeriod != 0 {
g.SetLinuxResourcesCPURealtimePeriod(config.Resources.CPURtPeriod)
addedResources = true
}
if config.Resources.CPUsetCPUs != "" {
g.SetLinuxResourcesCPUCpus(config.Resources.CPUsetCPUs)
addedResources = true
}
if config.Resources.CPUsetMems != "" {
g.SetLinuxResourcesCPUMems(config.Resources.CPUsetMems)
addedResources = true
}
// Devices
if config.Security.Privileged {
// If privileged, we need to add all the host devices to the
// spec. We do not add the user provided ones because we are
// already adding them all.
if err := AddPrivilegedDevices(&g); err != nil {
return nil, err
}
} else {
for _, devicePath := range config.Devices {
if err := DevicesFromPath(&g, devicePath); err != nil {
return nil, err
}
}
if len(config.Resources.DeviceCgroupRules) != 0 {
if err := deviceCgroupRules(&g, config.Resources.DeviceCgroupRules); err != nil {
return nil, err
}
addedResources = true
}
}
g.SetProcessNoNewPrivileges(config.Security.NoNewPrivs)
if !config.Security.Privileged {
g.SetProcessApparmorProfile(config.Security.ApparmorProfile)
}
// Unless already set via the CLI, check if we need to disable process
// labels or set the defaults.
if len(config.Security.LabelOpts) == 0 && runtimeConfig != nil {
if !runtimeConfig.Containers.EnableLabeling {
// Disabled in the config.
config.Security.LabelOpts = append(config.Security.LabelOpts, "disable")
} else if err := config.Security.SetLabelOpts(runtime, &config.Pid, &config.Ipc); err != nil {
// Defaults!
return nil, err
}
}
BlockAccessToKernelFilesystems(config.Security.Privileged, config.Pid.PidMode.IsHost(), &g)
// RESOURCES - PIDS
if config.Resources.PidsLimit > 0 {
// if running on rootless on a cgroupv1 machine or using the cgroupfs manager, pids
// limit is not supported. If the value is still the default
// then ignore the settings. If the caller asked for a
// non-default, then try to use it.
setPidLimit := true
if rootless.IsRootless() {
cgroup2, err := cgroups.IsCgroup2UnifiedMode()
if err != nil {
return nil, err
}
if (!cgroup2 || (runtimeConfig != nil && runtimeConfig.Engine.CgroupManager != cconfig.SystemdCgroupsManager)) && config.Resources.PidsLimit == sysinfo.GetDefaultPidsLimit() {
setPidLimit = false
}
}
if setPidLimit {
g.SetLinuxResourcesPidsLimit(config.Resources.PidsLimit)
addedResources = true
}
}
// Make sure to always set the default variables unless overridden in the
// config.
var defaultEnv map[string]string
if runtimeConfig == nil {
defaultEnv = env.DefaultEnvVariables
} else {
defaultEnv, err = env.ParseSlice(runtimeConfig.Containers.Env)
if err != nil {
return nil, errors.Wrap(err, "Env fields in containers.conf failed ot parse")
}
defaultEnv = env.Join(env.DefaultEnvVariables, defaultEnv)
}
config.Env = env.Join(defaultEnv, config.Env)
for name, val := range config.Env {
g.AddProcessEnv(name, val)
}
if err := addRlimits(config, &g); err != nil {
return nil, err
}
// NAMESPACES
if err := config.Pid.ConfigureGenerator(&g); err != nil {
return nil, err
}
if err := config.User.ConfigureGenerator(&g); err != nil {
return nil, err
}
if err := config.Network.ConfigureGenerator(&g); err != nil {
return nil, err
}
if err := config.Uts.ConfigureGenerator(&g, &config.Network, runtime); err != nil {
return nil, err
}
if err := config.Ipc.ConfigureGenerator(&g); err != nil {
return nil, err
}
if err := config.Cgroup.ConfigureGenerator(&g); err != nil {
return nil, err
}
configSpec := g.Config
// If the container image specifies an label with a
// capabilities.ContainerImageLabel then split the comma separated list
// of capabilities and record them. This list indicates the only
// capabilities, required to run the container.
var capRequired []string
for key, val := range config.Labels {
if util.StringInSlice(key, capabilities.ContainerImageLabels) {
capRequired = strings.Split(val, ",")
}
}
config.Security.CapRequired = capRequired
if err := config.Security.ConfigureGenerator(&g, &config.User); err != nil {
return nil, err
}
// BIND MOUNTS
configSpec.Mounts = SupercedeUserMounts(userMounts, configSpec.Mounts)
// Process mounts to ensure correct options
if err := InitFSMounts(configSpec.Mounts); err != nil {
return nil, err
}
// BLOCK IO
blkio, err := config.CreateBlockIO()
if err != nil {
return nil, errors.Wrapf(err, "error creating block io")
}
if blkio != nil {
configSpec.Linux.Resources.BlockIO = blkio
addedResources = true
}
if rootless.IsRootless() {
cgroup2, err := cgroups.IsCgroup2UnifiedMode()
if err != nil {
return nil, err
}
if !addedResources {
configSpec.Linux.Resources = &spec.LinuxResources{}
}
canUseResources := cgroup2 && runtimeConfig != nil && (runtimeConfig.Engine.CgroupManager == cconfig.SystemdCgroupsManager)
if addedResources && !canUseResources {
return nil, errors.New("invalid configuration, cannot specify resource limits without cgroups v2 and --cgroup-manager=systemd")
}
if !canUseResources {
// Force the resources block to be empty instead of having default values.
configSpec.Linux.Resources = &spec.LinuxResources{}
}
}
switch config.Cgroup.Cgroups {
case "disabled":
if addedResources {
return nil, errors.New("cannot specify resource limits when cgroups are disabled is specified")
}
configSpec.Linux.Resources = &spec.LinuxResources{}
case "enabled", "no-conmon", "":
// Do nothing
default:
return nil, errors.New("unrecognized option for cgroups; supported are 'default', 'disabled', 'no-conmon'")
}
// Add annotations
if configSpec.Annotations == nil {
configSpec.Annotations = make(map[string]string)
}
if config.CidFile != "" {
configSpec.Annotations[libpod.InspectAnnotationCIDFile] = config.CidFile
}
if config.Rm {
configSpec.Annotations[libpod.InspectAnnotationAutoremove] = libpod.InspectResponseTrue
} else {
configSpec.Annotations[libpod.InspectAnnotationAutoremove] = libpod.InspectResponseFalse
}
if len(config.VolumesFrom) > 0 {
configSpec.Annotations[libpod.InspectAnnotationVolumesFrom] = strings.Join(config.VolumesFrom, ",")
}
if config.Security.Privileged {
configSpec.Annotations[libpod.InspectAnnotationPrivileged] = libpod.InspectResponseTrue
} else {
configSpec.Annotations[libpod.InspectAnnotationPrivileged] = libpod.InspectResponseFalse
}
if config.Init {
configSpec.Annotations[libpod.InspectAnnotationInit] = libpod.InspectResponseTrue
} else {
configSpec.Annotations[libpod.InspectAnnotationInit] = libpod.InspectResponseFalse
}
return configSpec, nil
}
func (config *CreateConfig) cgroupDisabled() bool {
return config.Cgroup.Cgroups == "disabled"
}
func BlockAccessToKernelFilesystems(privileged, pidModeIsHost bool, g *generate.Generator) {
if !privileged {
for _, mp := range []string{
"/proc/acpi",
"/proc/kcore",
"/proc/keys",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/proc/scsi",
"/sys/firmware",
"/sys/fs/selinux",
} {
g.AddLinuxMaskedPaths(mp)
}
if pidModeIsHost && rootless.IsRootless() {
return
}
for _, rp := range []string{
"/proc/asound",
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger",
} {
g.AddLinuxReadonlyPaths(rp)
}
}
}
func addRlimits(config *CreateConfig, g *generate.Generator) error {
var (
kernelMax uint64 = 1048576
isRootless = rootless.IsRootless()
nofileSet = false
nprocSet = false
)
for _, u := range config.Resources.Ulimit {
if u == "host" {
if len(config.Resources.Ulimit) != 1 {
return errors.New("ulimit can use host only once")
}
g.Config.Process.Rlimits = nil
break
}
ul, err := units.ParseUlimit(u)
if err != nil {
return errors.Wrapf(err, "ulimit option %q requires name=SOFT:HARD, failed to be parsed", u)
}
if ul.Name == "nofile" {
nofileSet = true
} else if ul.Name == "nproc" {
nprocSet = true
}
g.AddProcessRlimits("RLIMIT_"+strings.ToUpper(ul.Name), uint64(ul.Hard), uint64(ul.Soft))
}
// If not explicitly overridden by the user, default number of open
// files and number of processes to the maximum they can be set to
// (without overriding a sysctl)
if !nofileSet && !isRootless {
g.AddProcessRlimits("RLIMIT_NOFILE", kernelMax, kernelMax)
}
if !nprocSet && !isRootless {
g.AddProcessRlimits("RLIMIT_NPROC", kernelMax, kernelMax)
}
return nil
}