mirror of
https://github.com/containers/podman.git
synced 2025-05-17 23:26:08 +08:00

When conmon is started, it blocks and waits for us to signal it to start via a pipe. This works, but when conmon exits before it waits for the start message, it causes podman to fail with `write child: broken pipe`. This error is meaningless to podman users. The real error is that conmon failed, so we should not return early if we fail to send the start message to conmon. Instead, ignore the EPIPE error case, as it is safe to assume that conmon died, and for other errors make sure to kill conmon so that the following wait() call does not hang forever. This also fixes problems with conmon zombie processes being leaked because wait() was never called. Signed-off-by: Paul Holzinger <pholzing@redhat.com>
327 lines
10 KiB
Go
327 lines
10 KiB
Go
package libpod
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"runtime"
|
|
"strings"
|
|
|
|
runcconfig "github.com/opencontainers/runc/libcontainer/configs"
|
|
"github.com/opencontainers/runc/libcontainer/devices"
|
|
|
|
"github.com/containers/common/pkg/cgroups"
|
|
"github.com/containers/common/pkg/config"
|
|
"github.com/containers/podman/v4/pkg/errorhandling"
|
|
"github.com/containers/podman/v4/pkg/rootless"
|
|
"github.com/containers/podman/v4/utils"
|
|
pmount "github.com/containers/storage/pkg/mount"
|
|
spec "github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/opencontainers/selinux/go-selinux/label"
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) {
|
|
type result struct {
|
|
restoreDuration int64
|
|
err error
|
|
}
|
|
ch := make(chan result)
|
|
go func() {
|
|
runtime.LockOSThread()
|
|
restoreDuration, err := func() (int64, error) {
|
|
fd, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/ns/mnt", os.Getpid(), unix.Gettid()))
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
defer errorhandling.CloseQuiet(fd)
|
|
|
|
// create a new mountns on the current thread
|
|
if err = unix.Unshare(unix.CLONE_NEWNS); err != nil {
|
|
return 0, err
|
|
}
|
|
defer func() {
|
|
if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil {
|
|
logrus.Errorf("Unable to clone new namespace: %q", err)
|
|
}
|
|
}()
|
|
|
|
// don't spread our mounts around. We are setting only /sys to be slave
|
|
// so that the cleanup process is still able to umount the storage and the
|
|
// changes are propagated to the host.
|
|
err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "")
|
|
if err != nil {
|
|
return 0, fmt.Errorf("cannot make /sys slave: %w", err)
|
|
}
|
|
|
|
mounts, err := pmount.GetMounts()
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
for _, m := range mounts {
|
|
if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") {
|
|
continue
|
|
}
|
|
err = unix.Unmount(m.Mountpoint, 0)
|
|
if err != nil && !os.IsNotExist(err) {
|
|
return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err)
|
|
}
|
|
}
|
|
return r.createOCIContainer(ctr, restoreOptions)
|
|
}()
|
|
ch <- result{
|
|
restoreDuration: restoreDuration,
|
|
err: err,
|
|
}
|
|
}()
|
|
res := <-ch
|
|
return res.restoreDuration, res.err
|
|
}
|
|
|
|
// Run the closure with the container's socket label set
|
|
func (r *ConmonOCIRuntime) withContainerSocketLabel(ctr *Container, closure func() error) error {
|
|
runtime.LockOSThread()
|
|
if err := label.SetSocketLabel(ctr.ProcessLabel()); err != nil {
|
|
return err
|
|
}
|
|
err := closure()
|
|
// Ignore error returned from SetSocketLabel("") call,
|
|
// can't recover.
|
|
if labelErr := label.SetSocketLabel(""); labelErr == nil {
|
|
// Unlock the thread only if the process label could be restored
|
|
// successfully. Otherwise leave the thread locked and the Go runtime
|
|
// will terminate it once it returns to the threads pool.
|
|
runtime.UnlockOSThread()
|
|
} else {
|
|
logrus.Errorf("Unable to reset socket label: %q", labelErr)
|
|
}
|
|
return err
|
|
}
|
|
|
|
// moveConmonToCgroupAndSignal gets a container's cgroupParent and moves the conmon process to that cgroup
|
|
// it then signals for conmon to start by sending nonce data down the start fd
|
|
func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec.Cmd, startFd *os.File) error {
|
|
mustCreateCgroup := true
|
|
|
|
if ctr.config.NoCgroups {
|
|
mustCreateCgroup = false
|
|
}
|
|
|
|
// If cgroup creation is disabled - just signal.
|
|
switch ctr.config.CgroupsMode {
|
|
case "disabled", "no-conmon", cgroupSplit:
|
|
mustCreateCgroup = false
|
|
}
|
|
|
|
// $INVOCATION_ID is set by systemd when running as a service.
|
|
if ctr.runtime.RemoteURI() == "" && os.Getenv("INVOCATION_ID") != "" {
|
|
mustCreateCgroup = false
|
|
}
|
|
|
|
if mustCreateCgroup {
|
|
// Usually rootless users are not allowed to configure cgroupfs.
|
|
// There are cases though, where it is allowed, e.g. if the cgroup
|
|
// is manually configured and chowned). Avoid detecting all
|
|
// such cases and simply use a lower log level.
|
|
logLevel := logrus.WarnLevel
|
|
if rootless.IsRootless() {
|
|
logLevel = logrus.InfoLevel
|
|
}
|
|
// TODO: This should be a switch - we are not guaranteed that
|
|
// there are only 2 valid cgroup managers
|
|
cgroupParent := ctr.CgroupParent()
|
|
cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon")
|
|
cgroupResources, err := GetLimits(ctr.LinuxResources())
|
|
if err != nil {
|
|
logrus.StandardLogger().Log(logLevel, "Could not get ctr resources")
|
|
}
|
|
if ctr.CgroupManager() == config.SystemdCgroupsManager {
|
|
unitName := createUnitName("libpod-conmon", ctr.ID())
|
|
realCgroupParent := cgroupParent
|
|
splitParent := strings.Split(cgroupParent, "/")
|
|
if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 {
|
|
realCgroupParent = splitParent[len(splitParent)-1]
|
|
}
|
|
|
|
logrus.Infof("Running conmon under slice %s and unitName %s", realCgroupParent, unitName)
|
|
if err := utils.RunUnderSystemdScope(cmd.Process.Pid, realCgroupParent, unitName); err != nil {
|
|
logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err)
|
|
}
|
|
} else {
|
|
control, err := cgroups.New(cgroupPath, &cgroupResources)
|
|
if err != nil {
|
|
logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
|
|
} else if err := control.AddPid(cmd.Process.Pid); err != nil {
|
|
// we need to remove this defer and delete the cgroup once conmon exits
|
|
// maybe need a conmon monitor?
|
|
logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
/* We set the cgroup, now the child can start creating children */
|
|
return writeConmonPipeData(startFd)
|
|
}
|
|
|
|
// GetLimits converts spec resource limits to cgroup consumable limits
|
|
func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) {
|
|
if resource == nil {
|
|
resource = &spec.LinuxResources{}
|
|
}
|
|
final := &runcconfig.Resources{}
|
|
devs := []*devices.Rule{}
|
|
|
|
// Devices
|
|
for _, entry := range resource.Devices {
|
|
if entry.Major == nil || entry.Minor == nil {
|
|
continue
|
|
}
|
|
runeType := 'a'
|
|
switch entry.Type {
|
|
case "b":
|
|
runeType = 'b'
|
|
case "c":
|
|
runeType = 'c'
|
|
}
|
|
|
|
devs = append(devs, &devices.Rule{
|
|
Type: devices.Type(runeType),
|
|
Major: *entry.Major,
|
|
Minor: *entry.Minor,
|
|
Permissions: devices.Permissions(entry.Access),
|
|
Allow: entry.Allow,
|
|
})
|
|
}
|
|
final.Devices = devs
|
|
|
|
// HugepageLimits
|
|
pageLimits := []*runcconfig.HugepageLimit{}
|
|
for _, entry := range resource.HugepageLimits {
|
|
pageLimits = append(pageLimits, &runcconfig.HugepageLimit{
|
|
Pagesize: entry.Pagesize,
|
|
Limit: entry.Limit,
|
|
})
|
|
}
|
|
final.HugetlbLimit = pageLimits
|
|
|
|
// Networking
|
|
netPriorities := []*runcconfig.IfPrioMap{}
|
|
if resource.Network != nil {
|
|
for _, entry := range resource.Network.Priorities {
|
|
netPriorities = append(netPriorities, &runcconfig.IfPrioMap{
|
|
Interface: entry.Name,
|
|
Priority: int64(entry.Priority),
|
|
})
|
|
}
|
|
}
|
|
final.NetPrioIfpriomap = netPriorities
|
|
rdma := make(map[string]runcconfig.LinuxRdma)
|
|
for name, entry := range resource.Rdma {
|
|
rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects}
|
|
}
|
|
final.Rdma = rdma
|
|
|
|
// Memory
|
|
if resource.Memory != nil {
|
|
if resource.Memory.Limit != nil {
|
|
final.Memory = *resource.Memory.Limit
|
|
}
|
|
if resource.Memory.Reservation != nil {
|
|
final.MemoryReservation = *resource.Memory.Reservation
|
|
}
|
|
if resource.Memory.Swap != nil {
|
|
final.MemorySwap = *resource.Memory.Swap
|
|
}
|
|
if resource.Memory.Swappiness != nil {
|
|
final.MemorySwappiness = resource.Memory.Swappiness
|
|
}
|
|
}
|
|
|
|
// CPU
|
|
if resource.CPU != nil {
|
|
if resource.CPU.Period != nil {
|
|
final.CpuPeriod = *resource.CPU.Period
|
|
}
|
|
if resource.CPU.Quota != nil {
|
|
final.CpuQuota = *resource.CPU.Quota
|
|
}
|
|
if resource.CPU.RealtimePeriod != nil {
|
|
final.CpuRtPeriod = *resource.CPU.RealtimePeriod
|
|
}
|
|
if resource.CPU.RealtimeRuntime != nil {
|
|
final.CpuRtRuntime = *resource.CPU.RealtimeRuntime
|
|
}
|
|
if resource.CPU.Shares != nil {
|
|
final.CpuShares = *resource.CPU.Shares
|
|
}
|
|
final.CpusetCpus = resource.CPU.Cpus
|
|
final.CpusetMems = resource.CPU.Mems
|
|
}
|
|
|
|
// BlkIO
|
|
if resource.BlockIO != nil {
|
|
if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 {
|
|
for _, entry := range resource.BlockIO.ThrottleReadBpsDevice {
|
|
throttle := runcconfig.NewThrottleDevice(entry.Major, entry.Minor, entry.Rate)
|
|
final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle)
|
|
}
|
|
}
|
|
if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 {
|
|
for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice {
|
|
throttle := runcconfig.NewThrottleDevice(entry.Major, entry.Minor, entry.Rate)
|
|
final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle)
|
|
}
|
|
}
|
|
if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 {
|
|
for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice {
|
|
throttle := runcconfig.NewThrottleDevice(entry.Major, entry.Minor, entry.Rate)
|
|
final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle)
|
|
}
|
|
}
|
|
if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 {
|
|
for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice {
|
|
throttle := runcconfig.NewThrottleDevice(entry.Major, entry.Minor, entry.Rate)
|
|
final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle)
|
|
}
|
|
}
|
|
if resource.BlockIO.LeafWeight != nil {
|
|
final.BlkioLeafWeight = *resource.BlockIO.LeafWeight
|
|
}
|
|
if resource.BlockIO.Weight != nil {
|
|
final.BlkioWeight = *resource.BlockIO.Weight
|
|
}
|
|
if len(resource.BlockIO.WeightDevice) > 0 {
|
|
for _, entry := range resource.BlockIO.WeightDevice {
|
|
var w, lw uint16
|
|
if entry.Weight != nil {
|
|
w = *entry.Weight
|
|
}
|
|
if entry.LeafWeight != nil {
|
|
lw = *entry.LeafWeight
|
|
}
|
|
weight := runcconfig.NewWeightDevice(entry.Major, entry.Minor, w, lw)
|
|
final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Pids
|
|
if resource.Pids != nil {
|
|
final.PidsLimit = resource.Pids.Limit
|
|
}
|
|
|
|
// Networking
|
|
if resource.Network != nil {
|
|
if resource.Network.ClassID != nil {
|
|
final.NetClsClassid = *resource.Network.ClassID
|
|
}
|
|
}
|
|
|
|
// Unified state
|
|
final.Unified = resource.Unified
|
|
|
|
return *final, nil
|
|
}
|