Merge pull request #19066 from Luap99/ps

top: do not depend on ps(1) in container
This commit is contained in:
OpenShift Merge Robot
2023-07-14 13:17:59 +02:00
committed by GitHub
17 changed files with 376 additions and 71 deletions

View File

@ -379,8 +379,6 @@ alt_build_task:
ALT_NAME: 'Build Each Commit'
- env:
ALT_NAME: 'Windows Cross'
- env:
ALT_NAME: 'Build Without CGO'
- env:
ALT_NAME: 'Alt Arch. Cross'
# This task cannot make use of the shared repo.tbz artifact.

View File

@ -451,13 +451,6 @@ local-cross: $(CROSS_BUILD_TARGETS) ## Cross compile podman binary for multiple
.PHONY: cross
cross: local-cross
.PHONY: build-no-cgo
build-no-cgo:
BUILDTAGS="containers_image_openpgp exclude_graphdriver_btrfs \
exclude_graphdriver_devicemapper exclude_disk_quota" \
CGO_ENABLED=0 \
$(MAKE) all
.PHONY: completions
completions: podman podman-remote
# key = shell, value = completion filename

View File

@ -276,9 +276,6 @@ function _run_altbuild() {
make podman-remote-release-windows_amd64.zip
make podman.msi
;;
*Without*)
make build-no-cgo
;;
*RPM*)
make package
;;

View File

@ -7,7 +7,9 @@ podman\-pod\-top - Display the running processes of containers in a pod
**podman pod top** [*options*] *pod* [*format-descriptors*]
## DESCRIPTION
Display the running processes of containers in a pod. The *format-descriptors* are ps (1) compatible AIX format descriptors but extended to print additional information, such as the seccomp mode or the effective capabilities of a given process. The descriptors can either be passed as separate arguments or as a single comma-separated argument. Note that if additional options of ps(1) are specified, Podman falls back to executing ps with the specified arguments and options in the container.
Display the running processes of containers in a pod. The *format-descriptors* are ps (1) compatible AIX format
descriptors but extended to print additional information, such as the seccomp mode or the effective capabilities
of a given process. The descriptors can either be passed as separate arguments or as a single comma-separated argument.
## OPTIONS

View File

@ -9,7 +9,14 @@ podman\-top - Display the running processes of a container
**podman container top** [*options*] *container* [*format-descriptors*]
## DESCRIPTION
Display the running processes of the container. The *format-descriptors* are ps (1) compatible AIX format descriptors but extended to print additional information, such as the seccomp mode or the effective capabilities of a given process. The descriptors can either be passed as separated arguments or as a single comma-separated argument. Note that options and or flags of ps(1) can also be specified; in this case, Podman falls back to executing ps with the specified arguments and flags in the container. Please use the "h*" descriptors to extract host-related information. For instance, `podman top $name hpid huser` to display the PID and user of the processes in the host context.
Display the running processes of the container. The *format-descriptors* are ps (1) compatible AIX format
descriptors but extended to print additional information, such as the seccomp mode or the effective capabilities
of a given process. The descriptors can either be passed as separated arguments or as a single comma-separated
argument. Note that options and or flags of ps(1) can also be specified; in this case, Podman falls back to
executing ps(1) from the host with the specified arguments and flags in the container namespace. If the container
has the `CAP_SYS_PTRACE` capability then we will execute ps(1) in the container so it must be installed there.
Please use the "h*" descriptors to extract host-related information. For instance, `podman top $name hpid huser`
to display the PID and user of the processes in the host context.
## OPTIONS
@ -90,7 +97,7 @@ PID SECCOMP COMMAND %CPU
8 filter vi /etc/ 0.000
```
Podman falls back to executing ps(1) in the container if an unknown descriptor is specified.
Podman falls back to executing ps(1) from the host in the container namespace if an unknown descriptor is specified.
```
$ podman top -l -- aux

View File

@ -0,0 +1,81 @@
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mount.h>
#include <sys/wait.h>
#include <unistd.h>
/* keep special_exit_code in sync with container_top_linux.go */
int special_exit_code = 255;
char **argv = NULL;
void
create_argv (int len)
{
/* allocate one extra element because we need a final NULL in c */
argv = malloc (sizeof (char *) * (len + 1));
if (argv == NULL)
{
fprintf (stderr, "failed to allocate ps argv");
exit (special_exit_code);
}
/* add final NULL */
argv[len] = NULL;
}
void
set_argv (int pos, char *arg)
{
argv[pos] = arg;
}
/*
We use cgo code here so we can fork then exec separately,
this is done so we can mount proc after the fork because the pid namespace is
only active after spawning childs.
*/
void
fork_exec_ps ()
{
int r, status = 0;
pid_t pid;
if (argv == NULL)
{
fprintf (stderr, "argv not initialized");
exit (special_exit_code);
}
pid = fork ();
if (pid < 0)
{
fprintf (stderr, "fork: %m");
exit (special_exit_code);
}
if (pid == 0)
{
r = mount ("proc", "/proc", "proc", 0, NULL);
if (r < 0)
{
fprintf (stderr, "mount proc: %m");
exit (special_exit_code);
}
/* use execve to unset all env vars, we do not want to leak anything into the container */
execve (argv[0], argv, NULL);
fprintf (stderr, "execve: %m");
exit (special_exit_code);
}
r = waitpid (pid, &status, 0);
if (r < 0)
{
fprintf (stderr, "waitpid: %m");
exit (special_exit_code);
}
if (WIFEXITED (status))
exit (WEXITSTATUS (status));
if (WIFSIGNALED (status))
exit (128 + WTERMSIG (status));
exit (special_exit_code);
}

View File

@ -1,23 +1,183 @@
//go:build linux
// +build linux
//go:build linux && cgo
// +build linux,cgo
package libpod
import (
"bufio"
"bytes"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"syscall"
"unsafe"
"github.com/containers/common/pkg/util"
"github.com/containers/podman/v4/libpod/define"
"github.com/containers/podman/v4/pkg/rootless"
"github.com/containers/psgo"
"github.com/containers/storage/pkg/reexec"
"github.com/google/shlex"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
/*
#include <stdlib.h>
void fork_exec_ps();
void create_argv(int len);
void set_argv(int pos, char *arg);
*/
import "C"
const (
// podmanTopCommand is the reexec key to safely setup the environment for ps to be executed
podmanTopCommand = "podman-top"
// podmanTopExitCode is a special exec code to signal that podman failed to to something in
// reexec command not ps. This is used to give a better error.
podmanTopExitCode = 255
)
func init() {
reexec.Register(podmanTopCommand, podmanTopMain)
}
// podmanTopMain - main function for the reexec
func podmanTopMain() {
if err := podmanTopInner(); err != nil {
fmt.Fprint(os.Stderr, err.Error())
os.Exit(podmanTopExitCode)
}
os.Exit(0)
}
// podmanTopInner os.Args = {command name} {pid} {psPath} [args...]
// We are rexxec'd in a new mountns, then we need to set some security settings in order
// to safely execute ps in the container pid namespace. Most notably make sure podman and
// ps are read only to prevent a process from overwriting it.
func podmanTopInner() error {
if len(os.Args) < 3 {
return fmt.Errorf("internal error, need at least two arguments")
}
// We have to lock the thread as we a) switch namespace below and b) use PR_SET_PDEATHSIG
// Also do not unlock as this thread should not be reused by go we exit anyway at the end.
runtime.LockOSThread()
if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
return fmt.Errorf("PR_SET_PDEATHSIG: %w", err)
}
if err := unix.Prctl(unix.PR_SET_DUMPABLE, 0, 0, 0, 0); err != nil {
return fmt.Errorf("PR_SET_DUMPABLE: %w", err)
}
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return fmt.Errorf("PR_SET_NO_NEW_PRIVS: %w", err)
}
if err := unix.Mount("none", "/", "", unix.MS_REC|unix.MS_PRIVATE, ""); err != nil {
return fmt.Errorf("make / mount private: %w", err)
}
psPath := os.Args[2]
// try to mount everything read only
if err := unix.MountSetattr(0, "/", unix.AT_RECURSIVE, &unix.MountAttr{
Attr_set: unix.MOUNT_ATTR_RDONLY,
}); err != nil {
if err != unix.ENOSYS {
return fmt.Errorf("mount_setattr / readonly: %w", err)
}
// old kernel without mount_setattr, i.e. on RHEL 8.8
// Bind mount the directories readonly for both podman and ps.
psPath, err = remountReadOnly(psPath)
if err != nil {
return err
}
_, err = remountReadOnly(reexec.Self())
if err != nil {
return err
}
}
// extra safety check make sure the ps path is actually read only
err := unix.Access(psPath, unix.W_OK)
if err == nil {
return fmt.Errorf("%q was not mounted read only, this can be dangerous so we will not execute it", psPath)
}
pid := os.Args[1]
// join the pid namespace of pid
pidFD, err := os.Open(fmt.Sprintf("/proc/%s/ns/pid", pid))
if err != nil {
return fmt.Errorf("open pidns: %w", err)
}
if err := unix.Setns(int(pidFD.Fd()), unix.CLONE_NEWPID); err != nil {
return fmt.Errorf("setns NEWPID: %w", err)
}
pidFD.Close()
args := []string{psPath}
args = append(args, os.Args[3:]...)
C.create_argv(C.int(len(args)))
for i, arg := range args {
cArg := C.CString(arg)
C.set_argv(C.int(i), cArg)
defer C.free(unsafe.Pointer(cArg))
}
// Now try to close open fds except std streams
// While golang open everything O_CLOEXEC it could still leak fds from
// the parent, i.e. bash. In this case an attacker might be able to
// read/write from them.
// Do this as last step, it has to happen before to fork because the child
// will be immediately in pid namespace so we cannot close them in the child.
entries, err := os.ReadDir("/proc/self/fd")
if err != nil {
return err
}
for _, e := range entries {
i, err := strconv.Atoi(e.Name())
// IsFdInherited checks the we got the fd from a parent process and only close them,
// when we close all that would include the ones from the go runtime which
// then can panic because of that.
if err == nil && i > unix.Stderr && rootless.IsFdInherited(i) {
_ = unix.Close(i)
}
}
// this function will always exit for us
C.fork_exec_ps()
return nil
}
// remountReadOnly remounts the parent directory of the given path read only
// return the resolved path or an error. The path can then be used to exec the
// binary as we know it is on a read only mount now.
func remountReadOnly(path string) (string, error) {
resolvedPath, err := filepath.EvalSymlinks(path)
if err != nil {
return "", fmt.Errorf("resolve symlink for %s: %w", path, err)
}
dir := filepath.Dir(resolvedPath)
// create mount point
if err := unix.Mount(dir, dir, "", unix.MS_BIND, ""); err != nil {
return "", fmt.Errorf("mount %s read only: %w", dir, err)
}
// remount readonly
if err := unix.Mount(dir, dir, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, ""); err != nil {
return "", fmt.Errorf("mount %s read only: %w", dir, err)
}
return resolvedPath, nil
}
// Top gathers statistics about the running processes in a container. It returns a
// []string for output
func (c *Container) Top(descriptors []string) ([]string, error) {
@ -68,9 +228,27 @@ func (c *Container) Top(descriptors []string) ([]string, error) {
}
}
output, err = c.execPS(psDescriptors)
// Only use ps(1) from the host when we know the container was not started with CAP_SYS_PTRACE,
// with it the container can access /proc/$pid/ files and potentially escape the container fs.
if c.config.Spec.Process.Capabilities != nil &&
!util.StringInSlice("CAP_SYS_PTRACE", c.config.Spec.Process.Capabilities.Effective) {
var retry bool
output, retry, err = c.execPS(psDescriptors)
if err != nil {
return nil, fmt.Errorf("executing ps(1) in the container: %w", err)
if !retry {
return nil, err
}
logrus.Warnf("Falling back to container ps(1), could not execute ps(1) from the host: %v", err)
output, err = c.execPSinContainer(psDescriptors)
if err != nil {
return nil, fmt.Errorf("executing ps(1) in container: %w", err)
}
}
} else {
output, err = c.execPSinContainer(psDescriptors)
if err != nil {
return nil, fmt.Errorf("executing ps(1) in container: %w", err)
}
}
// Trick: filter the ps command from the output instead of
@ -113,8 +291,63 @@ func (c *Container) GetContainerPidInformation(descriptors []string) ([]string,
return res, nil
}
// execPS executes ps(1) with the specified args in the container.
func (c *Container) execPS(args []string) ([]string, error) {
// execute ps(1) from the host within the container pid namespace
func (c *Container) execPS(psArgs []string) ([]string, bool, error) {
rPipe, wPipe, err := os.Pipe()
if err != nil {
return nil, false, err
}
defer wPipe.Close()
defer rPipe.Close()
stdout := []string{}
go func() {
scanner := bufio.NewScanner(rPipe)
for scanner.Scan() {
stdout = append(stdout, scanner.Text())
}
}()
psPath, err := exec.LookPath("ps")
if err != nil {
return nil, true, err
}
args := append([]string{podmanTopCommand, strconv.Itoa(c.state.PID), psPath}, psArgs...)
cmd := reexec.Command(args...)
cmd.SysProcAttr = &syscall.SysProcAttr{
Unshareflags: unix.CLONE_NEWNS,
}
var errBuf bytes.Buffer
cmd.Stdout = wPipe
cmd.Stderr = &errBuf
// nil means use current env so explicitly unset all, to not leak any sensitive env vars
cmd.Env = []string{}
retryContainerExec := true
err = cmd.Run()
if err != nil {
exitError := &exec.ExitError{}
if errors.As(err, &exitError) {
if exitError.ExitCode() != podmanTopExitCode {
// ps command failed
err = fmt.Errorf("ps(1) failed with exit code %d: %s", exitError.ExitCode(), errBuf.String())
// ps command itself failed: likely invalid args, no point in retrying.
retryContainerExec = false
} else {
// podman-top reexec setup fails somewhere
err = fmt.Errorf("could not execute ps(1) in the container pid namespace: %s", errBuf.String())
}
} else {
err = fmt.Errorf("could not reexec podman-top command: %w", err)
}
}
return stdout, retryContainerExec, err
}
// execPS executes ps(1) with the specified args in the container vie exec session.
// This should be a bit safer then execPS() but it requires ps(1) to be installed in the container.
func (c *Container) execPSinContainer(args []string) ([]string, error) {
rPipe, wPipe, err := os.Pipe()
if err != nil {
return nil, err
@ -122,16 +355,10 @@ func (c *Container) execPS(args []string) ([]string, error) {
defer wPipe.Close()
defer rPipe.Close()
rErrPipe, wErrPipe, err := os.Pipe()
if err != nil {
return nil, err
}
defer wErrPipe.Close()
defer rErrPipe.Close()
var errBuf bytes.Buffer
streams := new(define.AttachStreams)
streams.OutputStream = wPipe
streams.ErrorStream = wErrPipe
streams.ErrorStream = &errBuf
streams.AttachOutput = true
streams.AttachError = true
@ -142,13 +369,6 @@ func (c *Container) execPS(args []string) ([]string, error) {
stdout = append(stdout, scanner.Text())
}
}()
stderr := []string{}
go func() {
scanner := bufio.NewScanner(rErrPipe)
for scanner.Scan() {
stderr = append(stderr, scanner.Text())
}
}()
cmd := append([]string{"ps"}, args...)
config := new(ExecConfig)
@ -157,15 +377,13 @@ func (c *Container) execPS(args []string) ([]string, error) {
if err != nil {
return nil, err
} else if ec != 0 {
return nil, fmt.Errorf("runtime failed with exit status: %d and output: %s", ec, strings.Join(stderr, " "))
return nil, fmt.Errorf("runtime failed with exit status: %d and output: %s", ec, errBuf.String())
}
if logrus.GetLevel() >= logrus.DebugLevel {
// If we're running in debug mode or higher, we might want to have a
// look at stderr which includes debug logs from conmon.
for _, log := range stderr {
logrus.Debugf("%s", log)
}
logrus.Debugf(errBuf.String())
}
return stdout, nil

View File

@ -1,5 +1,6 @@
//go:build !linux && !freebsd
// +build !linux,!freebsd
//go:build !(linux && cgo) && !freebsd
// +build !linux !cgo
// +build !freebsd
package libpod

View File

@ -51,9 +51,9 @@ const (
// AttachStreams contains streams that will be attached to the container
type AttachStreams struct {
// OutputStream will be attached to container's STDOUT
OutputStream io.WriteCloser
OutputStream io.Writer
// ErrorStream will be attached to container's STDERR
ErrorStream io.WriteCloser
ErrorStream io.Writer
// InputStream will be attached to container's STDIN
InputStream *bufio.Reader
// AttachOutput is whether to attach to STDOUT

View File

@ -376,17 +376,12 @@ func PodTop(w http.ResponseWriter, r *http.Request) {
runtime := r.Context().Value(api.RuntimeKey).(*libpod.Runtime)
decoder := r.Context().Value(api.DecoderKey).(*schema.Decoder)
psArgs := "-ef"
if utils.IsLibpodRequest(r) {
psArgs = ""
}
query := struct {
Delay int `schema:"delay"`
PsArgs string `schema:"ps_args"`
Stream bool `schema:"stream"`
}{
Delay: 5,
PsArgs: psArgs,
}
if err := decoder.Decode(&query, r.URL.Query()); err != nil {
utils.Error(w, http.StatusBadRequest, fmt.Errorf("failed to parse parameters for %s: %w", r.URL.String(), err))

View File

@ -444,7 +444,7 @@ func (s *APIServer) registerContainersHandlers(r *mux.Router) error {
// name: ps_args
// type: string
// default: -ef
// description: arguments to pass to ps such as aux. Requires ps(1) to be installed in the container if no ps(1) compatible AIX descriptors are used.
// description: arguments to pass to ps such as aux.
// produces:
// - application/json
// responses:
@ -1174,10 +1174,9 @@ func (s *APIServer) registerContainersHandlers(r *mux.Router) error {
// - in: query
// name: ps_args
// type: string
// default: -ef
// default:
// description: |
// arguments to pass to ps such as aux.
// Requires ps(1) to be installed in the container if no ps(1) compatible AIX descriptors are used.
// produces:
// - application/json
// responses:

View File

@ -309,10 +309,9 @@ func (s *APIServer) registerPodsHandlers(r *mux.Router) error {
// - in: query
// name: ps_args
// type: string
// default: -ef
// default:
// description: |
// arguments to pass to ps such as aux.
// Requires ps(1) to be installed in the container if no ps(1) compatible AIX descriptors are used.
// responses:
// 200:
// $ref: "#/responses/podTopResponse"

View File

@ -295,9 +295,9 @@ type ResizeExecTTYOptions struct {
//go:generate go run ../generator/generator.go ExecStartAndAttachOptions
type ExecStartAndAttachOptions struct {
// OutputStream will be attached to container's STDOUT
OutputStream *io.WriteCloser
OutputStream *io.Writer
// ErrorStream will be attached to container's STDERR
ErrorStream *io.WriteCloser
ErrorStream *io.Writer
// InputStream will be attached to container's STDIN
InputStream *bufio.Reader
// AttachOutput is whether to attach to STDOUT

View File

@ -20,30 +20,30 @@ func (o *ExecStartAndAttachOptions) ToParams() (url.Values, error) {
}
// WithOutputStream set field OutputStream to given value
func (o *ExecStartAndAttachOptions) WithOutputStream(value io.WriteCloser) *ExecStartAndAttachOptions {
func (o *ExecStartAndAttachOptions) WithOutputStream(value io.Writer) *ExecStartAndAttachOptions {
o.OutputStream = &value
return o
}
// GetOutputStream returns value of field OutputStream
func (o *ExecStartAndAttachOptions) GetOutputStream() io.WriteCloser {
func (o *ExecStartAndAttachOptions) GetOutputStream() io.Writer {
if o.OutputStream == nil {
var z io.WriteCloser
var z io.Writer
return z
}
return *o.OutputStream
}
// WithErrorStream set field ErrorStream to given value
func (o *ExecStartAndAttachOptions) WithErrorStream(value io.WriteCloser) *ExecStartAndAttachOptions {
func (o *ExecStartAndAttachOptions) WithErrorStream(value io.Writer) *ExecStartAndAttachOptions {
o.ErrorStream = &value
return o
}
// GetErrorStream returns value of field ErrorStream
func (o *ExecStartAndAttachOptions) GetErrorStream() io.WriteCloser {
func (o *ExecStartAndAttachOptions) GetErrorStream() io.Writer {
if o.ErrorStream == nil {
var z io.WriteCloser
var z io.Writer
return z
}
return *o.ErrorStream

View File

@ -410,7 +410,7 @@ var _ = Describe("Podman containers ", func() {
// With bogus descriptors
_, err = containers.Top(bt.conn, cid, new(containers.TopOptions).WithDescriptors([]string{"Me,Neither"}))
Expect(err).ToNot(HaveOccurred())
Expect(err).To(HaveOccurred())
})
It("podman bogus container does not exist in local storage", func() {

View File

@ -123,10 +123,14 @@ if root; then
t GET containers/$CTRNAME/top?stream=false 200 \
.Titles='[
"UID",
"PID",
"USER",
"PPID",
"C",
"STIME",
"TTY",
"TIME",
"COMMAND"
"CMD"
]'
podman rm -f $CTRNAME
@ -135,9 +139,9 @@ fi
CTRNAME=test123
podman run --name $CTRNAME -d $IMAGE top
t GET libpod/containers/$CTRNAME/top?ps_args=--invalid 500 \
.cause~".*unrecognized option.*"
.cause~".*unknown gnu long option.*"
t GET containers/$CTRNAME/top?ps_args=--invalid 500 \
.cause~".*unrecognized option.*"
.cause~".*unknown gnu long option.*"
podman rm -f $CTRNAME

View File

@ -88,7 +88,7 @@ var _ = Describe("Podman top", func() {
})
It("podman top with ps(1) options", func() {
session := podmanTest.Podman([]string{"run", "-d", ALPINE, "top", "-d", "2"})
session := podmanTest.Podman([]string{"run", "-d", fedoraMinimal, "sleep", "inf"})
session.WaitWithDefaultTimeout()
Expect(session).Should(Exit(0))
@ -100,7 +100,18 @@ var _ = Describe("Podman top", func() {
result = podmanTest.Podman([]string{"top", session.OutputToString(), "ax -o args"})
result.WaitWithDefaultTimeout()
Expect(result).Should(Exit(0))
Expect(result.OutputToStringArray()).To(Equal([]string{"COMMAND", "top -d 2"}))
Expect(result.OutputToStringArray()).To(Equal([]string{"COMMAND", "sleep inf"}))
// Now make sure we use ps in the container with CAP_SYS_PTRACE
session = podmanTest.Podman([]string{"run", "-d", "--cap-add=SYS_PTRACE", fedoraMinimal, "sleep", "inf"})
session.WaitWithDefaultTimeout()
Expect(session).Should(Exit(0))
// Because the image does not contain this must fail and we know we use the correct podman exec fallback.
exec := podmanTest.Podman([]string{"top", session.OutputToString(), "aux"})
exec.WaitWithDefaultTimeout()
Expect(exec).Should(Exit(125))
Expect(exec.ErrorToString()).Should(ContainSubstring("OCI runtime attempted to invoke a command that was not found"))
})
It("podman top with comma-separated options", func() {