//go:build !remote && linux && cgo package libpod import ( "bufio" "bytes" "errors" "fmt" "os" "os/exec" "path/filepath" "runtime" "slices" "strconv" "strings" "syscall" "unsafe" "github.com/containers/podman/v5/libpod/define" "github.com/containers/podman/v5/pkg/rootless" "github.com/containers/psgo" "github.com/containers/storage/pkg/reexec" "github.com/google/shlex" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) /* #include void fork_exec_ps(); void create_argv(int len); void set_argv(int pos, char *arg); void set_userns(); */ import "C" const ( // podmanTopCommand is the reexec key to safely setup the environment for ps to be executed podmanTopCommand = "podman-top" // podmanTopExitCode is a special exec code to signal that podman failed to to something in // reexec command not ps. This is used to give a better error. podmanTopExitCode = 255 ) func init() { reexec.Register(podmanTopCommand, podmanTopMain) } // podmanTopMain - main function for the reexec func podmanTopMain() { if err := podmanTopInner(); err != nil { fmt.Fprint(os.Stderr, err.Error()) os.Exit(podmanTopExitCode) } os.Exit(0) } // podmanTopInner os.Args = {command name} {pid} {userns(1/0)} {psPath} [args...] // We are rexxec'd in a new mountns, then we need to set some security settings in order // to safely execute ps in the container pid namespace. Most notably make sure podman and // ps are read only to prevent a process from overwriting it. func podmanTopInner() error { if len(os.Args) < 4 { return fmt.Errorf("internal error, need at least three arguments") } // We have to lock the thread as we a) switch namespace below and b) use PR_SET_PDEATHSIG // Also do not unlock as this thread should not be reused by go we exit anyway at the end. runtime.LockOSThread() if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil { return fmt.Errorf("PR_SET_PDEATHSIG: %w", err) } if err := unix.Prctl(unix.PR_SET_DUMPABLE, 0, 0, 0, 0); err != nil { return fmt.Errorf("PR_SET_DUMPABLE: %w", err) } if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { return fmt.Errorf("PR_SET_NO_NEW_PRIVS: %w", err) } if err := unix.Mount("none", "/", "", unix.MS_REC|unix.MS_PRIVATE, ""); err != nil { return fmt.Errorf("make / mount private: %w", err) } psPath := os.Args[3] // try to mount everything read only if err := unix.MountSetattr(0, "/", unix.AT_RECURSIVE, &unix.MountAttr{ Attr_set: unix.MOUNT_ATTR_RDONLY, }); err != nil { if err != unix.ENOSYS { return fmt.Errorf("mount_setattr / readonly: %w", err) } // old kernel without mount_setattr, i.e. on RHEL 8.8 // Bind mount the directories readonly for both podman and ps. psPath, err = remountReadOnly(psPath) if err != nil { return err } _, err = remountReadOnly(reexec.Self()) if err != nil { return err } } // extra safety check make sure the ps path is actually read only err := unix.Access(psPath, unix.W_OK) if err == nil { return fmt.Errorf("%q was not mounted read only, this can be dangerous so we will not execute it", psPath) } pid := os.Args[1] // join the pid namespace of pid pidFD, err := os.Open(fmt.Sprintf("/proc/%s/ns/pid", pid)) if err != nil { return fmt.Errorf("open pidns: %w", err) } if err := unix.Setns(int(pidFD.Fd()), unix.CLONE_NEWPID); err != nil { return fmt.Errorf("setns NEWPID: %w", err) } pidFD.Close() userns := os.Args[2] if userns == "1" { C.set_userns() } args := []string{psPath} args = append(args, os.Args[4:]...) C.create_argv(C.int(len(args))) for i, arg := range args { cArg := C.CString(arg) C.set_argv(C.int(i), cArg) defer C.free(unsafe.Pointer(cArg)) } // Now try to close open fds except std streams // While golang open everything O_CLOEXEC it could still leak fds from // the parent, i.e. bash. In this case an attacker might be able to // read/write from them. // Do this as last step, it has to happen before to fork because the child // will be immediately in pid namespace so we cannot close them in the child. entries, err := os.ReadDir("/proc/self/fd") if err != nil { return err } for _, e := range entries { i, err := strconv.Atoi(e.Name()) // IsFdInherited checks the we got the fd from a parent process and only close them, // when we close all that would include the ones from the go runtime which // then can panic because of that. if err == nil && i > unix.Stderr && rootless.IsFdInherited(i) { _ = unix.Close(i) } } // this function will always exit for us C.fork_exec_ps() return nil } // remountReadOnly remounts the parent directory of the given path read only // return the resolved path or an error. The path can then be used to exec the // binary as we know it is on a read only mount now. func remountReadOnly(path string) (string, error) { resolvedPath, err := filepath.EvalSymlinks(path) if err != nil { return "", fmt.Errorf("resolve symlink for %s: %w", path, err) } dir := filepath.Dir(resolvedPath) // create mount point if err := unix.Mount(dir, dir, "", unix.MS_BIND, ""); err != nil { return "", fmt.Errorf("mount %s read only: %w", dir, err) } // remount readonly if err := unix.Mount(dir, dir, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, ""); err != nil { return "", fmt.Errorf("mount %s read only: %w", dir, err) } return resolvedPath, nil } // Top gathers statistics about the running processes in a container. It returns a // []string for output func (c *Container) Top(descriptors []string) ([]string, error) { if c.config.NoCgroups { return nil, fmt.Errorf("cannot run top on container %s as it did not create a cgroup: %w", c.ID(), define.ErrNoCgroups) } conStat, err := c.State() if err != nil { return nil, fmt.Errorf("unable to look up state for %s: %w", c.ID(), err) } if conStat != define.ContainerStateRunning { return nil, errors.New("top can only be used on running containers") } // Also support comma-separated input. psgoDescriptors := []string{} for _, d := range descriptors { for _, s := range strings.Split(d, ",") { if s != "" { psgoDescriptors = append(psgoDescriptors, s) } } } // If we encountered an ErrUnknownDescriptor error, fallback to executing // ps(1). This ensures backwards compatibility to users depending on ps(1) // and makes sure we're ~compatible with docker. output, psgoErr := c.GetContainerPidInformation(psgoDescriptors) if psgoErr == nil { return output, nil } if !errors.Is(psgoErr, psgo.ErrUnknownDescriptor) { return nil, psgoErr } psDescriptors := descriptors if len(descriptors) == 1 { // Note that the descriptors to ps(1) must be shlexed (see #12452). psDescriptors = make([]string, 0, len(descriptors)) shSplit, err := shlex.Split(descriptors[0]) if err != nil { return nil, fmt.Errorf("parsing ps args: %w", err) } for _, s := range shSplit { if s != "" { psDescriptors = append(psDescriptors, s) } } } // Only use ps(1) from the host when we know the container was not started with CAP_SYS_PTRACE, // with it the container can access /proc/$pid/ files and potentially escape the container fs. if c.config.Spec.Process.Capabilities != nil && !slices.Contains(c.config.Spec.Process.Capabilities.Effective, "CAP_SYS_PTRACE") { var retry bool output, retry, err = c.execPS(psDescriptors) if err != nil { if !retry { return nil, err } logrus.Warnf("Falling back to container ps(1), could not execute ps(1) from the host: %v", err) output, err = c.execPSinContainer(psDescriptors) if err != nil { return nil, fmt.Errorf("executing ps(1) in container: %w", err) } } } else { output, err = c.execPSinContainer(psDescriptors) if err != nil { return nil, fmt.Errorf("executing ps(1) in container: %w", err) } } // Trick: filter the ps command from the output instead of // checking/requiring PIDs in the output. filtered := []string{} cmd := strings.Join(descriptors, " ") for _, line := range output { if !strings.Contains(line, cmd) { filtered = append(filtered, line) } } return filtered, nil } // GetContainerPidInformation returns process-related data of all processes in // the container. The output data can be controlled via the `descriptors` // argument which expects format descriptors and supports all AIXformat // descriptors of ps (1) plus some additional ones to for instance inspect the // set of effective capabilities. Each element in the returned string slice // is a tab-separated string. // // For more details, please refer to github.com/containers/psgo. func (c *Container) GetContainerPidInformation(descriptors []string) ([]string, error) { pid := strconv.Itoa(c.state.PID) // NOTE: psgo returns a [][]string to give users the ability to apply // filters on the data. We need to change the API here // to return a [][]string if we want to make use of // filtering. opts := psgo.JoinNamespaceOpts{FillMappings: rootless.IsRootless()} psgoOutput, err := psgo.JoinNamespaceAndProcessInfoWithOptions(pid, descriptors, &opts) if err != nil { return nil, err } res := []string{} for _, out := range psgoOutput { res = append(res, strings.Join(out, "\t")) } return res, nil } // execute ps(1) from the host within the container pid namespace func (c *Container) execPS(psArgs []string) ([]string, bool, error) { rPipe, wPipe, err := os.Pipe() if err != nil { return nil, false, err } defer rPipe.Close() outErrChan := make(chan error) stdout := []string{} go func() { defer close(outErrChan) scanner := bufio.NewScanner(rPipe) for scanner.Scan() { stdout = append(stdout, scanner.Text()) } if err := scanner.Err(); err != nil { outErrChan <- err } }() psPath, err := exec.LookPath("ps") if err != nil { wPipe.Close() return nil, true, err } // see podmanTopInner() userns := "0" if len(c.config.IDMappings.UIDMap) > 0 { userns = "1" } args := append([]string{podmanTopCommand, strconv.Itoa(c.state.PID), userns, psPath}, psArgs...) cmd := reexec.Command(args...) cmd.SysProcAttr = &syscall.SysProcAttr{ Unshareflags: unix.CLONE_NEWNS, } var errBuf bytes.Buffer cmd.Stdout = wPipe cmd.Stderr = &errBuf // nil means use current env so explicitly unset all, to not leak any sensitive env vars cmd.Env = []string{fmt.Sprintf("HOME=%s", os.Getenv("HOME"))} retryContainerExec := true err = cmd.Run() wPipe.Close() if err != nil { exitError := &exec.ExitError{} if errors.As(err, &exitError) { if exitError.ExitCode() != podmanTopExitCode { // ps command failed err = fmt.Errorf("ps(1) failed with exit code %d: %s", exitError.ExitCode(), errBuf.String()) // ps command itself failed: likely invalid args, no point in retrying. retryContainerExec = false } else { // podman-top reexec setup fails somewhere err = fmt.Errorf("could not execute ps(1) in the container pid namespace: %s", errBuf.String()) } } else { err = fmt.Errorf("could not reexec podman-top command: %w", err) } } if err := <-outErrChan; err != nil { return nil, retryContainerExec, fmt.Errorf("failed to read ps stdout: %w", err) } return stdout, retryContainerExec, err } // execPS executes ps(1) with the specified args in the container via exec session. // This should be a bit safer then execPS() but it requires ps(1) to be installed in the container. func (c *Container) execPSinContainer(args []string) ([]string, error) { rPipe, wPipe, err := os.Pipe() if err != nil { return nil, err } defer rPipe.Close() var errBuf bytes.Buffer streams := new(define.AttachStreams) streams.OutputStream = wPipe streams.ErrorStream = &errBuf streams.AttachOutput = true streams.AttachError = true outErrChan := make(chan error) stdout := []string{} go func() { defer close(outErrChan) scanner := bufio.NewScanner(rPipe) for scanner.Scan() { stdout = append(stdout, scanner.Text()) } if err := scanner.Err(); err != nil { outErrChan <- err } }() cmd := append([]string{"ps"}, args...) config := new(ExecConfig) config.Command = cmd ec, err := c.Exec(config, streams, nil) wPipe.Close() if err != nil { return nil, err } else if ec != 0 { return nil, fmt.Errorf("runtime failed with exit status: %d and output: %s", ec, errBuf.String()) } if logrus.GetLevel() >= logrus.DebugLevel { // If we're running in debug mode or higher, we might want to have a // look at stderr which includes debug logs from conmon. logrus.Debug(errBuf.String()) } if err := <-outErrChan; err != nil { return nil, fmt.Errorf("failed to read ps stdout: %w", err) } return stdout, nil }