//go:build !remote && linux && cgo

package libpod

import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"syscall"
	"unsafe"

	"github.com/containers/podman/v5/libpod/define"
	"github.com/containers/podman/v5/pkg/rootless"
	"github.com/containers/psgo"
	"github.com/containers/storage/pkg/reexec"
	"github.com/google/shlex"
	"github.com/sirupsen/logrus"
	"golang.org/x/exp/slices"
	"golang.org/x/sys/unix"
)

/*
#include <stdlib.h>
void fork_exec_ps();
void create_argv(int len);
void set_argv(int pos, char *arg);
*/
import "C"
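
// The three C helpers declared in the cgo preamble above are implemented in a
// companion C file in this package (not shown here). As their names and their
// use below suggest, create_argv allocates the argv array, set_argv fills the
// individual slots, and fork_exec_ps forks and execs ps with that argv and
// then exits on our behalf.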

const (
	// podmanTopCommand is the reexec key used to safely set up the environment before ps is executed.
	podmanTopCommand = "podman-top"

	// podmanTopExitCode is a special exit code used to signal that podman failed to do something in the
	// reexec command itself, not in ps. This is used to give a better error.
	podmanTopExitCode = 255
)

func init() {
	reexec.Register(podmanTopCommand, podmanTopMain)
}
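
// Note: reexec.Register only records the handler; the dispatch happens when the
// program's main() calls reexec.Init() and finds os.Args[0] set to one of the
// registered keys. A minimal sketch of that wiring (assumed; it lives in
// podman's main package, not in this file):
//
//	func main() {
//		if reexec.Init() {
//			// We were re-executed as e.g. "podman-top"; the registered
//			// handler already ran (and normally exits the process).
//			return
//		}
//		// normal podman startup...
//	}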

// podmanTopMain is the main function for the reexec.
func podmanTopMain() {
	if err := podmanTopInner(); err != nil {
		fmt.Fprint(os.Stderr, err.Error())
		os.Exit(podmanTopExitCode)
	}
	os.Exit(0)
}

// podmanTopInner expects os.Args = {command name} {pid} {psPath} [args...].
// We are reexec'd in a new mountns, so we need to apply some security settings in order
// to safely execute ps in the container's pid namespace. Most notably, make sure podman and
// ps are read-only to prevent a process from overwriting them.
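// For illustration, a typical (hypothetical) invocation would be
// os.Args = {"podman-top", "12345", "/usr/bin/ps", "-ef"}.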
func podmanTopInner() error {
	if len(os.Args) < 3 {
		return fmt.Errorf("internal error, need at least two arguments")
	}

	// We have to lock the thread as we a) switch namespaces below and b) use PR_SET_PDEATHSIG.
	// Also, do not unlock: this thread should not be reused by the Go runtime, and we exit at the end anyway.
	runtime.LockOSThread()
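
	// Harden this re-executed process before doing anything else:
	// PR_SET_PDEATHSIG has the kernel SIGKILL us if the parent podman process dies,
	// PR_SET_DUMPABLE makes the process non-dumpable (and not ptrace-able by
	// unprivileged processes), and PR_SET_NO_NEW_PRIVS prevents gaining privileges
	// through setuid/setgid or file-capability binaries on exec.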
	if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
		return fmt.Errorf("PR_SET_PDEATHSIG: %w", err)
	}
	if err := unix.Prctl(unix.PR_SET_DUMPABLE, 0, 0, 0, 0); err != nil {
		return fmt.Errorf("PR_SET_DUMPABLE: %w", err)
	}

	if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
		return fmt.Errorf("PR_SET_NO_NEW_PRIVS: %w", err)
	}

	if err := unix.Mount("none", "/", "", unix.MS_REC|unix.MS_PRIVATE, ""); err != nil {
		return fmt.Errorf("make / mount private: %w", err)
	}

	psPath := os.Args[2]

	// try to mount everything read only
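	// (mount_setattr(2) with AT_RECURSIVE requires Linux 5.12 or newer,
	// hence the ENOSYS fallback below.)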
	if err := unix.MountSetattr(0, "/", unix.AT_RECURSIVE, &unix.MountAttr{
		Attr_set: unix.MOUNT_ATTR_RDONLY,
	}); err != nil {
		if err != unix.ENOSYS {
			return fmt.Errorf("mount_setattr / readonly: %w", err)
		}
		// Old kernel without mount_setattr, e.g. on RHEL 8.8.
		// Bind mount the directories read-only for both podman and ps.
		psPath, err = remountReadOnly(psPath)
		if err != nil {
			return err
		}
		_, err = remountReadOnly(reexec.Self())
		if err != nil {
			return err
		}
	}

	// Extra safety check: make sure the ps path is actually mounted read-only.
	err := unix.Access(psPath, unix.W_OK)
	if err == nil {
		return fmt.Errorf("%q was not mounted read only, this can be dangerous so we will not execute it", psPath)
	}

	pid := os.Args[1]
	// join the pid namespace of pid
	pidFD, err := os.Open(fmt.Sprintf("/proc/%s/ns/pid", pid))
	if err != nil {
		return fmt.Errorf("open pidns: %w", err)
	}
	if err := unix.Setns(int(pidFD.Fd()), unix.CLONE_NEWPID); err != nil {
		return fmt.Errorf("setns NEWPID: %w", err)
	}
	pidFD.Close()

	args := []string{psPath}
	args = append(args, os.Args[3:]...)

	C.create_argv(C.int(len(args)))
	for i, arg := range args {
		cArg := C.CString(arg)
		C.set_argv(C.int(i), cArg)
		defer C.free(unsafe.Pointer(cArg))
	}
	// Now try to close all open fds except the std streams.
	// While Go opens everything O_CLOEXEC, fds could still be leaked from
	// the parent, e.g. bash; in that case an attacker might be able to
	// read from or write to them.
	// Do this as the last step: it has to happen before the fork because the child
	// is immediately in the pid namespace, so we cannot close them in the child.
	entries, err := os.ReadDir("/proc/self/fd")
	if err != nil {
		return err
	}
	for _, e := range entries {
		i, err := strconv.Atoi(e.Name())
		// IsFdInherited checks that we got the fd from a parent process, and only then do we close it.
		// If we closed all fds, that would include the ones opened by the Go runtime,
		// which could then panic because of that.
		if err == nil && i > unix.Stderr && rootless.IsFdInherited(i) {
			_ = unix.Close(i)
		}
	}

	// This function will always exit for us.
	C.fork_exec_ps()
	return nil
}

// remountReadOnly remounts the parent directory of the given path read-only and
// returns the resolved path or an error. The path can then be used to exec the
// binary, as we know it is on a read-only mount now.
func remountReadOnly(path string) (string, error) {
	resolvedPath, err := filepath.EvalSymlinks(path)
	if err != nil {
		return "", fmt.Errorf("resolve symlink for %s: %w", path, err)
	}
	dir := filepath.Dir(resolvedPath)
	// create the mount point
	if err := unix.Mount(dir, dir, "", unix.MS_BIND, ""); err != nil {
		return "", fmt.Errorf("mount %s read only: %w", dir, err)
	}
	// remount read-only
	if err := unix.Mount(dir, dir, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, ""); err != nil {
		return "", fmt.Errorf("mount %s read only: %w", dir, err)
	}
	return resolvedPath, nil
}

// Top gathers statistics about the running processes in a container. It returns
// a []string for output.
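//
// Illustrative call (descriptor values here are examples, not defaults):
//
//	output, err := ctr.Top([]string{"user", "pid", "ppid", "%cpu"})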
func (c *Container) Top(descriptors []string) ([]string, error) {
	if c.config.NoCgroups {
		return nil, fmt.Errorf("cannot run top on container %s as it did not create a cgroup: %w", c.ID(), define.ErrNoCgroups)
	}

	conStat, err := c.State()
	if err != nil {
		return nil, fmt.Errorf("unable to look up state for %s: %w", c.ID(), err)
	}
	if conStat != define.ContainerStateRunning {
		return nil, errors.New("top can only be used on running containers")
	}

	// Also support comma-separated input.
	psgoDescriptors := []string{}
	for _, d := range descriptors {
		for _, s := range strings.Split(d, ",") {
			if s != "" {
				psgoDescriptors = append(psgoDescriptors, s)
			}
		}
	}

	// If we encountered an ErrUnknownDescriptor error, fall back to executing
	// ps(1). This ensures backwards compatibility for users depending on ps(1)
	// and makes sure we're ~compatible with docker.
	output, psgoErr := c.GetContainerPidInformation(psgoDescriptors)
	if psgoErr == nil {
		return output, nil
	}
	if !errors.Is(psgoErr, psgo.ErrUnknownDescriptor) {
		return nil, psgoErr
	}

	psDescriptors := descriptors
	if len(descriptors) == 1 {
		// Note that the descriptors to ps(1) must be shlexed (see #12452).
		psDescriptors = make([]string, 0, len(descriptors))
		shSplit, err := shlex.Split(descriptors[0])
		if err != nil {
			return nil, fmt.Errorf("parsing ps args: %w", err)
		}
		for _, s := range shSplit {
			if s != "" {
				psDescriptors = append(psDescriptors, s)
			}
		}
	}

	// Only use ps(1) from the host when we know the container was not started with CAP_SYS_PTRACE;
	// with it the container can access /proc/$pid/ files and potentially escape the container fs.
	if c.config.Spec.Process.Capabilities != nil &&
		!slices.Contains(c.config.Spec.Process.Capabilities.Effective, "CAP_SYS_PTRACE") {
		var retry bool
		output, retry, err = c.execPS(psDescriptors)
		if err != nil {
			if !retry {
				return nil, err
			}
			logrus.Warnf("Falling back to container ps(1), could not execute ps(1) from the host: %v", err)
			output, err = c.execPSinContainer(psDescriptors)
			if err != nil {
				return nil, fmt.Errorf("executing ps(1) in container: %w", err)
			}
		}
	} else {
		output, err = c.execPSinContainer(psDescriptors)
		if err != nil {
			return nil, fmt.Errorf("executing ps(1) in container: %w", err)
		}
	}

	// Trick: filter the ps command itself from the output instead of
	// checking/requiring PIDs in the output.
	filtered := []string{}
	cmd := strings.Join(descriptors, " ")
	for _, line := range output {
		if !strings.Contains(line, cmd) {
			filtered = append(filtered, line)
		}
	}

	return filtered, nil
}

// GetContainerPidInformation returns process-related data of all processes in
// the container. The output data can be controlled via the `descriptors`
// argument, which expects format descriptors and supports all AIX format
// descriptors of ps(1) plus some additional ones to, for instance, inspect the
// set of effective capabilities. Each element in the returned string slice
// is a tab-separated string.
//
// For more details, please refer to github.com/containers/psgo.
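//
// For example (illustrative), descriptors such as
// []string{"pid", "comm", "capeff"} combine standard ps(1) descriptors with
// the psgo-specific "capeff" descriptor for effective capabilities.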
func (c *Container) GetContainerPidInformation(descriptors []string) ([]string, error) {
	pid := strconv.Itoa(c.state.PID)
	// NOTE: psgo returns a [][]string to give users the ability to apply
	// filters on the data. We need to change the API here to return a
	// [][]string if we want to make use of filtering.
	opts := psgo.JoinNamespaceOpts{FillMappings: rootless.IsRootless()}

	psgoOutput, err := psgo.JoinNamespaceAndProcessInfoWithOptions(pid, descriptors, &opts)
	if err != nil {
		return nil, err
	}
	res := []string{}
	for _, out := range psgoOutput {
		res = append(res, strings.Join(out, "\t"))
	}
	return res, nil
}

// execPS executes ps(1) from the host within the container's pid namespace.
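// It does so by re-executing the podman binary under the podmanTopCommand
// reexec key in a fresh mount namespace (see podmanTopInner above). The bool
// return value reports whether falling back to execPSinContainer is worth a try.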
func (c *Container) execPS(psArgs []string) ([]string, bool, error) {
	rPipe, wPipe, err := os.Pipe()
	if err != nil {
		return nil, false, err
	}
	defer rPipe.Close()

	outErrChan := make(chan error)
	stdout := []string{}
	go func() {
		defer close(outErrChan)
		scanner := bufio.NewScanner(rPipe)
		for scanner.Scan() {
			stdout = append(stdout, scanner.Text())
		}
		if err := scanner.Err(); err != nil {
			outErrChan <- err
		}
	}()

	psPath, err := exec.LookPath("ps")
	if err != nil {
		wPipe.Close()
		return nil, true, err
	}
	args := append([]string{podmanTopCommand, strconv.Itoa(c.state.PID), psPath}, psArgs...)

	cmd := reexec.Command(args...)
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Unshareflags: unix.CLONE_NEWNS,
	}
	var errBuf bytes.Buffer
	cmd.Stdout = wPipe
	cmd.Stderr = &errBuf
	// A nil Env means "use the current environment", so explicitly unset everything
	// except HOME to not leak any sensitive env vars.
	cmd.Env = []string{fmt.Sprintf("HOME=%s", os.Getenv("HOME"))}

	retryContainerExec := true
	err = cmd.Run()
	wPipe.Close()
	if err != nil {
		exitError := &exec.ExitError{}
		if errors.As(err, &exitError) {
			if exitError.ExitCode() != podmanTopExitCode {
				// The ps command itself failed: likely invalid args, so there is
				// no point in retrying.
				err = fmt.Errorf("ps(1) failed with exit code %d: %s", exitError.ExitCode(), errBuf.String())
				retryContainerExec = false
			} else {
				// The podman-top reexec setup failed somewhere.
				err = fmt.Errorf("could not execute ps(1) in the container pid namespace: %s", errBuf.String())
			}
		} else {
			err = fmt.Errorf("could not reexec podman-top command: %w", err)
		}
	}

	if err := <-outErrChan; err != nil {
		return nil, retryContainerExec, fmt.Errorf("failed to read ps stdout: %w", err)
	}
	return stdout, retryContainerExec, err
}

// execPSinContainer executes ps(1) with the specified args inside the container via an exec session.
// This should be a bit safer than execPS(), but it requires ps(1) to be installed in the container.
func (c *Container) execPSinContainer(args []string) ([]string, error) {
	rPipe, wPipe, err := os.Pipe()
	if err != nil {
		return nil, err
	}
	defer rPipe.Close()

	var errBuf bytes.Buffer
	streams := new(define.AttachStreams)
	streams.OutputStream = wPipe
	streams.ErrorStream = &errBuf
	streams.AttachOutput = true
	streams.AttachError = true

	outErrChan := make(chan error)
	stdout := []string{}
	go func() {
		defer close(outErrChan)
		scanner := bufio.NewScanner(rPipe)
		for scanner.Scan() {
			stdout = append(stdout, scanner.Text())
		}
		if err := scanner.Err(); err != nil {
			outErrChan <- err
		}
	}()

	cmd := append([]string{"ps"}, args...)
	config := new(ExecConfig)
	config.Command = cmd
	ec, err := c.Exec(config, streams, nil)
	wPipe.Close()
	if err != nil {
		return nil, err
	} else if ec != 0 {
		return nil, fmt.Errorf("runtime failed with exit status: %d and output: %s", ec, errBuf.String())
	}

	if logrus.GetLevel() >= logrus.DebugLevel {
		// If we're running in debug mode or higher, we might want to have a
		// look at stderr which includes debug logs from conmon.
		logrus.Debugf(errBuf.String())
	}

	if err := <-outErrChan; err != nil {
		return nil, fmt.Errorf("failed to read ps stdout: %w", err)
	}
	return stdout, nil
}