Files
podman/pkg/specgen/generate/oci_linux.go
Matt Heon 458ba5a8af Fix podman stop and podman run --rmi
This started off as an attempt to make `podman stop` on a
container started with `--rm` actually remove the container,
instead of just cleaning it up and waiting for the cleanup
process to finish the removal.

In the process, I realized that `podman run --rmi` was rather
broken. It was only done as part of the Podman CLI, not the
cleanup process (meaning it only worked with attached containers)
and the way it was wired meant that I was fairly confident that
it wouldn't work if I did a `podman stop` on an attached
container run with `--rmi`. I rewired it to use the same
mechanism that `podman run --rm` uses, so it should be a lot more
durable now, and I also wired it into `podman inspect` so you can
tell that a container will remove its image.

Tests have been added for the changes to `podman run --rmi`. No
tests for `stop` on a `run --rm` container as that would be racy.

Fixes #22852
Fixes RHEL-39513

Signed-off-by: Matt Heon <mheon@redhat.com>
2024-08-20 09:51:18 -04:00

406 lines
10 KiB
Go

//go:build !remote
package generate
import (
"context"
"encoding/json"
"fmt"
"path"
"strings"
"github.com/containers/common/libimage"
"github.com/containers/common/pkg/cgroups"
"github.com/containers/common/pkg/config"
"github.com/containers/podman/v5/libpod"
"github.com/containers/podman/v5/libpod/define"
"github.com/containers/podman/v5/pkg/rootless"
"github.com/containers/podman/v5/pkg/specgen"
"github.com/docker/go-units"
spec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/runtime-tools/generate"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func setProcOpts(s *specgen.SpecGenerator, g *generate.Generator) {
if s.ProcOpts == nil {
return
}
for i := range g.Config.Mounts {
if g.Config.Mounts[i].Destination == "/proc" {
g.Config.Mounts[i].Options = s.ProcOpts
return
}
}
}
func setDevOptsReadOnly(g *generate.Generator) {
for i := range g.Config.Mounts {
if g.Config.Mounts[i].Destination == "/dev" {
g.Config.Mounts[i].Options = append(g.Config.Mounts[i].Options, "ro")
return
}
}
}
// canMountSys is a best-effort heuristic to detect whether mounting a new sysfs is permitted in the container
func canMountSys(isRootless, isNewUserns bool, s *specgen.SpecGenerator) bool {
if s.NetNS.IsHost() && (isRootless || isNewUserns) {
return false
}
if isNewUserns {
switch s.NetNS.NSMode {
case specgen.Slirp, specgen.Pasta, specgen.Private, specgen.NoNetwork, specgen.Bridge:
return true
default:
return false
}
}
return true
}
func getCgroupPermissions(unmask []string) string {
ro := "ro"
rw := "rw"
cgroup := "/sys/fs/cgroup"
cgroupv2, _ := cgroups.IsCgroup2UnifiedMode()
if !cgroupv2 {
return ro
}
if len(unmask) != 0 && unmask[0] == "ALL" {
return rw
}
for _, p := range unmask {
if path.Clean(p) == cgroup {
return rw
}
}
return ro
}
// SpecGenToOCI returns the base configuration for the container.
func SpecGenToOCI(ctx context.Context, s *specgen.SpecGenerator, rt *libpod.Runtime, rtc *config.Config, newImage *libimage.Image, mounts []spec.Mount, pod *libpod.Pod, finalCmd []string, compatibleOptions *libpod.InfraInherit) (*spec.Spec, error) {
cgroupPerm := getCgroupPermissions(s.Unmask)
g, err := generate.New("linux")
if err != nil {
return nil, err
}
// Remove the default /dev/shm mount to ensure we overwrite it
g.RemoveMount("/dev/shm")
g.HostSpecific = true
addCgroup := true
isRootless := rootless.IsRootless()
isNewUserns := s.UserNS.IsContainer() || s.UserNS.IsPath() || s.UserNS.IsPrivate() || s.UserNS.IsPod() || s.UserNS.IsAuto()
canMountSys := canMountSys(isRootless, isNewUserns, s)
if s.IsPrivileged() && canMountSys {
cgroupPerm = "rw"
g.RemoveMount("/sys")
sysMnt := spec.Mount{
Destination: "/sys",
Type: "sysfs",
Source: "sysfs",
Options: []string{"rprivate", "nosuid", "noexec", "nodev", "rw"},
}
g.AddMount(sysMnt)
}
if !canMountSys {
addCgroup = false
g.RemoveMount("/sys")
r := "ro"
if s.IsPrivileged() {
r = "rw"
}
sysMnt := spec.Mount{
Destination: "/sys",
Type: define.TypeBind,
Source: "/sys",
Options: []string{"rprivate", "nosuid", "noexec", "nodev", r, "rbind"},
}
g.AddMount(sysMnt)
g.RemoveMount("/sys/fs/cgroup")
sysFsCgroupMnt := spec.Mount{
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Source: "/sys/fs/cgroup",
Options: []string{"rprivate", "nosuid", "noexec", "nodev", r},
}
g.AddMount(sysFsCgroupMnt)
if !s.IsPrivileged() && isRootless {
g.AddLinuxMaskedPaths("/sys/kernel")
}
}
gid5Available := true
if isRootless {
nGids, err := rootless.GetAvailableGids()
if err != nil {
return nil, err
}
gid5Available = nGids >= 5
}
// When using a different user namespace, check that the GID 5 is mapped inside
// the container.
if gid5Available && (s.IDMappings != nil && len(s.IDMappings.GIDMap) > 0) {
mappingFound := false
for _, r := range s.IDMappings.GIDMap {
if r.ContainerID <= 5 && 5 < r.ContainerID+r.Size {
mappingFound = true
break
}
}
if !mappingFound {
gid5Available = false
}
}
if !gid5Available {
// If we have no GID mappings, the gid=5 default option would fail, so drop it.
g.RemoveMount("/dev/pts")
devPts := spec.Mount{
Destination: "/dev/pts",
Type: define.TypeDevpts,
Source: define.TypeDevpts,
Options: []string{"rprivate", "nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620"},
}
g.AddMount(devPts)
}
inUserNS := isRootless || isNewUserns
if inUserNS && s.IpcNS.IsHost() {
g.RemoveMount("/dev/mqueue")
devMqueue := spec.Mount{
Destination: "/dev/mqueue",
Type: define.TypeBind, // constant ?
Source: "/dev/mqueue",
Options: []string{define.TypeBind, "nosuid", "noexec", "nodev"},
}
g.AddMount(devMqueue)
}
if inUserNS && s.PidNS.IsHost() {
g.RemoveMount("/proc")
procMount := spec.Mount{
Destination: "/proc",
Type: define.TypeBind,
Source: "/proc",
Options: []string{"rbind", "nosuid", "noexec", "nodev"},
}
g.AddMount(procMount)
}
if addCgroup {
cgroupMnt := spec.Mount{
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Source: "cgroup",
Options: []string{"rprivate", "nosuid", "noexec", "nodev", "relatime", cgroupPerm},
}
g.AddMount(cgroupMnt)
}
g.Config.Linux.Personality = s.Personality
g.SetProcessCwd(s.WorkDir)
g.SetProcessArgs(finalCmd)
if s.Terminal != nil {
g.SetProcessTerminal(*s.Terminal)
}
for key, val := range s.Annotations {
g.AddAnnotation(key, val)
}
if s.IntelRdt != nil {
if s.IntelRdt.ClosID != "" {
g.SetLinuxIntelRdtClosID(s.IntelRdt.ClosID)
}
}
if s.ResourceLimits != nil {
out, err := json.Marshal(s.ResourceLimits)
if err != nil {
return nil, err
}
err = json.Unmarshal(out, g.Config.Linux.Resources)
if err != nil {
return nil, err
}
g.Config.Linux.Resources = s.ResourceLimits
}
weightDevices, err := WeightDevices(s.WeightDevice)
if err != nil {
return nil, err
}
if len(weightDevices) > 0 {
for _, dev := range weightDevices {
g.AddLinuxResourcesBlockIOWeightDevice(dev.Major, dev.Minor, *dev.Weight)
}
}
// Devices
// set the default rule at the beginning of device configuration
if !inUserNS && !s.IsPrivileged() {
g.AddLinuxResourcesDevice(false, "", nil, nil, "rwm")
}
var userDevices []spec.LinuxDevice
// add default devices from containers.conf
for _, device := range rtc.Containers.Devices.Get() {
if err = DevicesFromPath(&g, device); err != nil {
return nil, err
}
}
if len(compatibleOptions.HostDeviceList) > 0 && len(s.Devices) == 0 {
userDevices = compatibleOptions.HostDeviceList
} else {
userDevices = s.Devices
}
// add default devices specified by caller
for _, device := range userDevices {
if err = DevicesFromPath(&g, device.Path); err != nil {
return nil, err
}
}
s.HostDeviceList = userDevices
// set the devices cgroup when not running in a user namespace
if isRootless && len(s.DeviceCgroupRule) > 0 {
return nil, fmt.Errorf("device cgroup rules are not supported in rootless mode or in a user namespace")
}
if !isRootless && !s.IsPrivileged() {
for _, dev := range s.DeviceCgroupRule {
g.AddLinuxResourcesDevice(true, dev.Type, dev.Major, dev.Minor, dev.Access)
}
}
BlockAccessToKernelFilesystems(s.IsPrivileged(), s.PidNS.IsHost(), s.Mask, s.Unmask, &g)
g.ClearProcessEnv()
for name, val := range s.Env {
g.AddProcessEnv(name, val)
}
addRlimits(s, &g)
// NAMESPACES
if err := specConfigureNamespaces(s, &g, rt, pod); err != nil {
return nil, err
}
configSpec := g.Config
if err := securityConfigureGenerator(s, &g, newImage, rtc); err != nil {
return nil, err
}
// BIND MOUNTS
configSpec.Mounts = SupersedeUserMounts(mounts, configSpec.Mounts)
// Process mounts to ensure correct options
if err := InitFSMounts(configSpec.Mounts); err != nil {
return nil, err
}
// Add annotations
if configSpec.Annotations == nil {
configSpec.Annotations = make(map[string]string)
}
if s.Remove != nil && *s.Remove {
configSpec.Annotations[define.InspectAnnotationAutoremove] = define.InspectResponseTrue
}
if s.RemoveImage != nil && *s.RemoveImage {
configSpec.Annotations[define.InspectAnnotationAutoremoveImage] = define.InspectResponseTrue
}
if len(s.VolumesFrom) > 0 {
configSpec.Annotations[define.VolumesFromAnnotation] = strings.Join(s.VolumesFrom, ";")
}
if s.IsPrivileged() {
configSpec.Annotations[define.InspectAnnotationPrivileged] = define.InspectResponseTrue
}
if s.Init != nil && *s.Init {
configSpec.Annotations[define.InspectAnnotationInit] = define.InspectResponseTrue
}
if s.OOMScoreAdj != nil {
g.SetProcessOOMScoreAdj(*s.OOMScoreAdj)
}
setProcOpts(s, &g)
roFS := false
if s.ReadOnlyFilesystem != nil {
roFS = *s.ReadOnlyFilesystem
}
rwTmpfs := false
if s.ReadWriteTmpfs != nil {
rwTmpfs = *s.ReadWriteTmpfs
}
if roFS && !rwTmpfs {
setDevOptsReadOnly(&g)
}
return configSpec, nil
}
func WeightDevices(wtDevices map[string]spec.LinuxWeightDevice) ([]spec.LinuxWeightDevice, error) {
devs := []spec.LinuxWeightDevice{}
for k, v := range wtDevices {
statT := unix.Stat_t{}
if err := unix.Stat(k, &statT); err != nil {
return nil, fmt.Errorf("failed to inspect '%s' in --blkio-weight-device: %w", k, err)
}
dev := new(spec.LinuxWeightDevice)
dev.Major = (int64(unix.Major(uint64(statT.Rdev)))) //nolint: unconvert
dev.Minor = (int64(unix.Minor(uint64(statT.Rdev)))) //nolint: unconvert
dev.Weight = v.Weight
devs = append(devs, *dev)
}
return devs, nil
}
// subNegativeOne translates Hard or soft limits of -1 to the current
// processes Max limit
func subNegativeOne(u spec.POSIXRlimit) spec.POSIXRlimit {
if !rootless.IsRootless() ||
(int64(u.Hard) != -1 && int64(u.Soft) != -1) {
return u
}
ul, err := units.ParseUlimit(fmt.Sprintf("%s=%d:%d", u.Type, int64(u.Soft), int64(u.Hard)))
if err != nil {
logrus.Warnf("Failed to check %s ulimit %q", u.Type, err)
return u
}
rl, err := ul.GetRlimit()
if err != nil {
logrus.Warnf("Failed to check %s ulimit %q", u.Type, err)
return u
}
var rlimit unix.Rlimit
if err := unix.Getrlimit(rl.Type, &rlimit); err != nil {
logrus.Warnf("Failed to return RLIMIT_NOFILE ulimit %q", err)
return u
}
if int64(u.Hard) == -1 {
u.Hard = rlimit.Max
}
if int64(u.Soft) == -1 {
u.Soft = rlimit.Max
}
return u
}