Files
podman/pkg/specgen/generate/oci_linux.go
Paul Holzinger 83863a6863 specgen: parse devices even with privileged set
When a users asks for specific devices we should still add them and not
ignore them just because privileged adds all of them.

Most notably if you set --device /dev/null:/dev/test you expect
/dev/test in the container, however as we ignored them this was not the
case. Another side effect is that the input was not validated at at all.
This leads to confusion as descriped in the issue.

Fixes #23132

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
2024-07-01 11:46:34 +02:00

402 lines
10 KiB
Go

//go:build !remote
package generate
import (
"context"
"encoding/json"
"fmt"
"path"
"strings"
"github.com/containers/common/libimage"
"github.com/containers/common/pkg/cgroups"
"github.com/containers/common/pkg/config"
"github.com/containers/podman/v5/libpod"
"github.com/containers/podman/v5/libpod/define"
"github.com/containers/podman/v5/pkg/rootless"
"github.com/containers/podman/v5/pkg/specgen"
"github.com/docker/go-units"
spec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/runtime-tools/generate"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func setProcOpts(s *specgen.SpecGenerator, g *generate.Generator) {
if s.ProcOpts == nil {
return
}
for i := range g.Config.Mounts {
if g.Config.Mounts[i].Destination == "/proc" {
g.Config.Mounts[i].Options = s.ProcOpts
return
}
}
}
func setDevOptsReadOnly(g *generate.Generator) {
for i := range g.Config.Mounts {
if g.Config.Mounts[i].Destination == "/dev" {
g.Config.Mounts[i].Options = append(g.Config.Mounts[i].Options, "ro")
return
}
}
}
// canMountSys is a best-effort heuristic to detect whether mounting a new sysfs is permitted in the container
func canMountSys(isRootless, isNewUserns bool, s *specgen.SpecGenerator) bool {
if s.NetNS.IsHost() && (isRootless || isNewUserns) {
return false
}
if isNewUserns {
switch s.NetNS.NSMode {
case specgen.Slirp, specgen.Pasta, specgen.Private, specgen.NoNetwork, specgen.Bridge:
return true
default:
return false
}
}
return true
}
func getCgroupPermissions(unmask []string) string {
ro := "ro"
rw := "rw"
cgroup := "/sys/fs/cgroup"
cgroupv2, _ := cgroups.IsCgroup2UnifiedMode()
if !cgroupv2 {
return ro
}
if len(unmask) != 0 && unmask[0] == "ALL" {
return rw
}
for _, p := range unmask {
if path.Clean(p) == cgroup {
return rw
}
}
return ro
}
// SpecGenToOCI returns the base configuration for the container.
func SpecGenToOCI(ctx context.Context, s *specgen.SpecGenerator, rt *libpod.Runtime, rtc *config.Config, newImage *libimage.Image, mounts []spec.Mount, pod *libpod.Pod, finalCmd []string, compatibleOptions *libpod.InfraInherit) (*spec.Spec, error) {
cgroupPerm := getCgroupPermissions(s.Unmask)
g, err := generate.New("linux")
if err != nil {
return nil, err
}
// Remove the default /dev/shm mount to ensure we overwrite it
g.RemoveMount("/dev/shm")
g.HostSpecific = true
addCgroup := true
isRootless := rootless.IsRootless()
isNewUserns := s.UserNS.IsContainer() || s.UserNS.IsPath() || s.UserNS.IsPrivate() || s.UserNS.IsPod() || s.UserNS.IsAuto()
canMountSys := canMountSys(isRootless, isNewUserns, s)
if s.IsPrivileged() && canMountSys {
cgroupPerm = "rw"
g.RemoveMount("/sys")
sysMnt := spec.Mount{
Destination: "/sys",
Type: "sysfs",
Source: "sysfs",
Options: []string{"rprivate", "nosuid", "noexec", "nodev", "rw"},
}
g.AddMount(sysMnt)
}
if !canMountSys {
addCgroup = false
g.RemoveMount("/sys")
r := "ro"
if s.IsPrivileged() {
r = "rw"
}
sysMnt := spec.Mount{
Destination: "/sys",
Type: define.TypeBind,
Source: "/sys",
Options: []string{"rprivate", "nosuid", "noexec", "nodev", r, "rbind"},
}
g.AddMount(sysMnt)
g.RemoveMount("/sys/fs/cgroup")
sysFsCgroupMnt := spec.Mount{
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Source: "/sys/fs/cgroup",
Options: []string{"rprivate", "nosuid", "noexec", "nodev", r},
}
g.AddMount(sysFsCgroupMnt)
if !s.IsPrivileged() && isRootless {
g.AddLinuxMaskedPaths("/sys/kernel")
}
}
gid5Available := true
if isRootless {
nGids, err := rootless.GetAvailableGids()
if err != nil {
return nil, err
}
gid5Available = nGids >= 5
}
// When using a different user namespace, check that the GID 5 is mapped inside
// the container.
if gid5Available && (s.IDMappings != nil && len(s.IDMappings.GIDMap) > 0) {
mappingFound := false
for _, r := range s.IDMappings.GIDMap {
if r.ContainerID <= 5 && 5 < r.ContainerID+r.Size {
mappingFound = true
break
}
}
if !mappingFound {
gid5Available = false
}
}
if !gid5Available {
// If we have no GID mappings, the gid=5 default option would fail, so drop it.
g.RemoveMount("/dev/pts")
devPts := spec.Mount{
Destination: "/dev/pts",
Type: define.TypeDevpts,
Source: define.TypeDevpts,
Options: []string{"rprivate", "nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620"},
}
g.AddMount(devPts)
}
inUserNS := isRootless || isNewUserns
if inUserNS && s.IpcNS.IsHost() {
g.RemoveMount("/dev/mqueue")
devMqueue := spec.Mount{
Destination: "/dev/mqueue",
Type: define.TypeBind, // constant ?
Source: "/dev/mqueue",
Options: []string{define.TypeBind, "nosuid", "noexec", "nodev"},
}
g.AddMount(devMqueue)
}
if inUserNS && s.PidNS.IsHost() {
g.RemoveMount("/proc")
procMount := spec.Mount{
Destination: "/proc",
Type: define.TypeBind,
Source: "/proc",
Options: []string{"rbind", "nosuid", "noexec", "nodev"},
}
g.AddMount(procMount)
}
if addCgroup {
cgroupMnt := spec.Mount{
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Source: "cgroup",
Options: []string{"rprivate", "nosuid", "noexec", "nodev", "relatime", cgroupPerm},
}
g.AddMount(cgroupMnt)
}
g.Config.Linux.Personality = s.Personality
g.SetProcessCwd(s.WorkDir)
g.SetProcessArgs(finalCmd)
if s.Terminal != nil {
g.SetProcessTerminal(*s.Terminal)
}
for key, val := range s.Annotations {
g.AddAnnotation(key, val)
}
if s.IntelRdt != nil {
if s.IntelRdt.ClosID != "" {
g.SetLinuxIntelRdtClosID(s.IntelRdt.ClosID)
}
}
if s.ResourceLimits != nil {
out, err := json.Marshal(s.ResourceLimits)
if err != nil {
return nil, err
}
err = json.Unmarshal(out, g.Config.Linux.Resources)
if err != nil {
return nil, err
}
g.Config.Linux.Resources = s.ResourceLimits
}
weightDevices, err := WeightDevices(s.WeightDevice)
if err != nil {
return nil, err
}
if len(weightDevices) > 0 {
for _, dev := range weightDevices {
g.AddLinuxResourcesBlockIOWeightDevice(dev.Major, dev.Minor, *dev.Weight)
}
}
// Devices
// set the default rule at the beginning of device configuration
if !inUserNS && !s.IsPrivileged() {
g.AddLinuxResourcesDevice(false, "", nil, nil, "rwm")
}
var userDevices []spec.LinuxDevice
// add default devices from containers.conf
for _, device := range rtc.Containers.Devices.Get() {
if err = DevicesFromPath(&g, device); err != nil {
return nil, err
}
}
if len(compatibleOptions.HostDeviceList) > 0 && len(s.Devices) == 0 {
userDevices = compatibleOptions.HostDeviceList
} else {
userDevices = s.Devices
}
// add default devices specified by caller
for _, device := range userDevices {
if err = DevicesFromPath(&g, device.Path); err != nil {
return nil, err
}
}
s.HostDeviceList = userDevices
// set the devices cgroup when not running in a user namespace
if isRootless && len(s.DeviceCgroupRule) > 0 {
return nil, fmt.Errorf("device cgroup rules are not supported in rootless mode or in a user namespace")
}
if !isRootless && !s.IsPrivileged() {
for _, dev := range s.DeviceCgroupRule {
g.AddLinuxResourcesDevice(true, dev.Type, dev.Major, dev.Minor, dev.Access)
}
}
BlockAccessToKernelFilesystems(s.IsPrivileged(), s.PidNS.IsHost(), s.Mask, s.Unmask, &g)
g.ClearProcessEnv()
for name, val := range s.Env {
g.AddProcessEnv(name, val)
}
addRlimits(s, &g)
// NAMESPACES
if err := specConfigureNamespaces(s, &g, rt, pod); err != nil {
return nil, err
}
configSpec := g.Config
if err := securityConfigureGenerator(s, &g, newImage, rtc); err != nil {
return nil, err
}
// BIND MOUNTS
configSpec.Mounts = SupersedeUserMounts(mounts, configSpec.Mounts)
// Process mounts to ensure correct options
if err := InitFSMounts(configSpec.Mounts); err != nil {
return nil, err
}
// Add annotations
if configSpec.Annotations == nil {
configSpec.Annotations = make(map[string]string)
}
if s.Remove != nil && *s.Remove {
configSpec.Annotations[define.InspectAnnotationAutoremove] = define.InspectResponseTrue
}
if len(s.VolumesFrom) > 0 {
configSpec.Annotations[define.VolumesFromAnnotation] = strings.Join(s.VolumesFrom, ";")
}
if s.IsPrivileged() {
configSpec.Annotations[define.InspectAnnotationPrivileged] = define.InspectResponseTrue
}
if s.Init != nil && *s.Init {
configSpec.Annotations[define.InspectAnnotationInit] = define.InspectResponseTrue
}
if s.OOMScoreAdj != nil {
g.SetProcessOOMScoreAdj(*s.OOMScoreAdj)
}
setProcOpts(s, &g)
roFS := false
if s.ReadOnlyFilesystem != nil {
roFS = *s.ReadOnlyFilesystem
}
rwTmpfs := false
if s.ReadWriteTmpfs != nil {
rwTmpfs = *s.ReadWriteTmpfs
}
if roFS && !rwTmpfs {
setDevOptsReadOnly(&g)
}
return configSpec, nil
}
func WeightDevices(wtDevices map[string]spec.LinuxWeightDevice) ([]spec.LinuxWeightDevice, error) {
devs := []spec.LinuxWeightDevice{}
for k, v := range wtDevices {
statT := unix.Stat_t{}
if err := unix.Stat(k, &statT); err != nil {
return nil, fmt.Errorf("failed to inspect '%s' in --blkio-weight-device: %w", k, err)
}
dev := new(spec.LinuxWeightDevice)
dev.Major = (int64(unix.Major(uint64(statT.Rdev)))) //nolint: unconvert
dev.Minor = (int64(unix.Minor(uint64(statT.Rdev)))) //nolint: unconvert
dev.Weight = v.Weight
devs = append(devs, *dev)
}
return devs, nil
}
// subNegativeOne translates Hard or soft limits of -1 to the current
// processes Max limit
func subNegativeOne(u spec.POSIXRlimit) spec.POSIXRlimit {
if !rootless.IsRootless() ||
(int64(u.Hard) != -1 && int64(u.Soft) != -1) {
return u
}
ul, err := units.ParseUlimit(fmt.Sprintf("%s=%d:%d", u.Type, int64(u.Soft), int64(u.Hard)))
if err != nil {
logrus.Warnf("Failed to check %s ulimit %q", u.Type, err)
return u
}
rl, err := ul.GetRlimit()
if err != nil {
logrus.Warnf("Failed to check %s ulimit %q", u.Type, err)
return u
}
var rlimit unix.Rlimit
if err := unix.Getrlimit(rl.Type, &rlimit); err != nil {
logrus.Warnf("Failed to return RLIMIT_NOFILE ulimit %q", err)
return u
}
if int64(u.Hard) == -1 {
u.Hard = rlimit.Max
}
if int64(u.Soft) == -1 {
u.Soft = rlimit.Max
}
return u
}