mirror of
https://github.com/containers/podman.git
synced 2026-03-13 08:01:19 +08:00
rootless: use nsfs file handles to persist namespaces
use name_to_handle_at and open_by_handle_at to persist rootless namespaces without needing a pause process. The namespace file handles are stored in a file and can be used to rejoin the namespaces, as long as the namespaces still exist. Fall back to the pause process approach only when the kernel doesn't support nsfs handles (EOPNOTSUPP). The feature is currently only enabled when the PODMAN_NO_PAUSE_PROCESS environment variable is set. These changes in the kernel are required (landed in Linux 6.18): https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3ab378cfa793 Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
This commit is contained in:
@@ -301,6 +301,11 @@ otherwise in the home directory of the user under
|
||||
|
||||
In Rootless mode temporary configuration data is stored in `${XDG_RUNTIME_DIR}/containers`.
|
||||
|
||||
#### **PODMAN_NO_PAUSE_PROCESS**
|
||||
|
||||
In Rootless mode, when set to a value other than "0", Podman does not use a pause process.
|
||||
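Instead, namespace file handles are stored so that later invocations can rejoin the existing user and mount namespaces for as long as those namespaces are still alive. If the kernel does not support nsfs file handles (they require Linux 6.18 or later), Podman falls back to the pause process.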
|
||||
|
||||
## Remote Access
|
||||
|
||||
The Podman command can be used with remote services using the `--remote` flag. Connections can
|
||||
|
||||
@@ -544,25 +544,25 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (retErr error) {
|
||||
}
|
||||
unLockFunc()
|
||||
unLockFunc = nil
|
||||
pausePid, err := util.GetRootlessPauseProcessPidPath()
|
||||
stateDir, err := util.GetRootlessStateDir()
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not get pause process pid file path: %w", err)
|
||||
return fmt.Errorf("could not get rootless state directory: %w", err)
|
||||
}
|
||||
|
||||
// create the path in case it does not already exist
|
||||
// https://github.com/containers/podman/issues/8539
|
||||
if err := os.MkdirAll(filepath.Dir(pausePid), 0o700); err != nil {
|
||||
return fmt.Errorf("could not create pause process pid file directory: %w", err)
|
||||
if err := os.MkdirAll(stateDir, 0o700); err != nil {
|
||||
return fmt.Errorf("could not create rootless state directory: %w", err)
|
||||
}
|
||||
|
||||
became, ret, err := rootless.BecomeRootInUserNS(pausePid)
|
||||
became, ret, err := rootless.BecomeRootInUserNS(stateDir)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if became {
|
||||
// Check if the pause process was created. If it was created, then
|
||||
// move it to its own systemd scope.
|
||||
systemdCommon.MovePauseProcessToScope(pausePid)
|
||||
systemdCommon.MovePauseProcessToScope(rootless.GetPausePidPath(stateDir))
|
||||
|
||||
// gocritic complains because defer is not run on os.Exit()
|
||||
// However this is fine because the lock is released anyway when the process exits
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
package libpod
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
@@ -10,14 +11,22 @@ import (
|
||||
|
||||
"github.com/containers/podman/v6/pkg/rootless"
|
||||
"github.com/containers/podman/v6/pkg/util"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func (r *Runtime) stopPauseProcess() error {
|
||||
if rootless.IsRootless() {
|
||||
pausePidPath, err := util.GetRootlessPauseProcessPidPath()
|
||||
stateDir, err := util.GetRootlessStateDir()
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not get pause process pid file path: %w", err)
|
||||
return fmt.Errorf("could not get rootless state directory: %w", err)
|
||||
}
|
||||
|
||||
nsHandlesPath := rootless.GetNamespaceHandlesPath(stateDir)
|
||||
if err := os.Remove(nsHandlesPath); err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
logrus.Warnf("Failed to remove namespace handles file %s: %v", nsHandlesPath, err)
|
||||
}
|
||||
|
||||
pausePidPath := rootless.GetPausePidPath(stateDir)
|
||||
data, err := os.ReadFile(pausePidPath)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
|
||||
@@ -66,12 +66,12 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool,
|
||||
return nil
|
||||
}
|
||||
|
||||
pausePidPath, err := util.GetRootlessPauseProcessPidPath()
|
||||
stateDir, err := util.GetRootlessStateDir()
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not get pause process pid file path: %w", err)
|
||||
return fmt.Errorf("could not get rootless state directory: %w", err)
|
||||
}
|
||||
|
||||
became, ret, err := rootless.TryJoinPauseProcess(pausePidPath)
|
||||
became, ret, err := rootless.TryJoinPauseProcess(stateDir)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -91,23 +91,23 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool,
|
||||
}
|
||||
|
||||
if len(paths) > 0 {
|
||||
became, ret, err = rootless.TryJoinFromFilePaths(pausePidPath, paths)
|
||||
became, ret, err = rootless.TryJoinFromFilePaths(stateDir, paths)
|
||||
// TryJoinFromFilePaths fails with ESRCH when none of the PIDs are valid anymore.
|
||||
// In this case create a new userns.
|
||||
if errors.Is(err, unix.ESRCH) {
|
||||
logrus.Warnf("Failed to join existing conmon namespace, creating a new rootless podman user namespace. If there are existing container running please stop them with %q to reset the namespace", os.Args[0]+" system migrate")
|
||||
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
|
||||
became, ret, err = rootless.BecomeRootInUserNS(stateDir)
|
||||
}
|
||||
} else {
|
||||
logrus.Info("Creating a new rootless user namespace")
|
||||
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
|
||||
became, ret, err = rootless.BecomeRootInUserNS(stateDir)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("fatal error, invalid internal status, unable to create a new pause process: %w. Try running %q and if that doesn't work reboot to recover", err, os.Args[0]+" system migrate")
|
||||
}
|
||||
if !noMoveProcess {
|
||||
systemd.MovePauseProcessToScope(pausePidPath)
|
||||
systemd.MovePauseProcessToScope(rootless.GetPausePidPath(stateDir))
|
||||
}
|
||||
if became {
|
||||
os.Exit(ret)
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
@@ -13,10 +14,28 @@ import (
|
||||
"go.podman.io/storage/pkg/lockfile"
|
||||
)
|
||||
|
||||
// GetNamespaceHandlesPath returns the location of the file that stores the
// namespace file handles, i.e. the "ns_handles" entry inside stateDir.
func GetNamespaceHandlesPath(stateDir string) string {
	const handlesFile = "ns_handles"

	return filepath.Join(stateDir, handlesFile)
}
|
||||
|
||||
// GetPausePidPath returns the location of the pause process PID file,
// i.e. the "pause.pid" entry inside stateDir.
func GetPausePidPath(stateDir string) string {
	const pidFile = "pause.pid"

	return filepath.Join(stateDir, pidFile)
}
|
||||
|
||||
// TryJoinPauseProcess attempts to join the namespaces of the pause PID via
|
||||
// TryJoinFromFilePaths. If joining fails, it attempts to delete the specified
|
||||
// file.
|
||||
func TryJoinPauseProcess(pausePidPath string) (bool, int, error) {
|
||||
func TryJoinPauseProcess(stateDir string) (bool, int, error) {
|
||||
nsHandlesPath := GetNamespaceHandlesPath(stateDir)
|
||||
if err := fileutils.Exists(nsHandlesPath); err == nil {
|
||||
return false, -1, nil
|
||||
}
|
||||
|
||||
pausePidPath := GetPausePidPath(stateDir)
|
||||
if err := fileutils.Exists(pausePidPath); err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
return false, -1, nil
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include <asm-generic/errno-base.h>
|
||||
#define _GNU_SOURCE
|
||||
#include <sched.h>
|
||||
#include <stdio.h>
|
||||
@@ -18,11 +19,32 @@
|
||||
#include <sys/prctl.h>
|
||||
#include <dirent.h>
|
||||
#include <sys/select.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/file.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#define ETC_PREEXEC_HOOKS "/etc/containers/pre-exec-hooks"
|
||||
#define LIBEXECPODMAN "/usr/libexec/podman"
|
||||
|
||||
#ifndef FD_NSFS_ROOT
|
||||
/* Copied from /usr/include/linux/fcntl.h. */
|
||||
#define FD_NSFS_ROOT -10003
|
||||
#endif
|
||||
|
||||
/* Fixed-size buffer that is cast to `struct file_handle *` when calling
   name_to_handle_at/open_by_handle_at.  Having a fixed size lets the handle
   be stored in, and read back from, a file as a plain blob.  */
struct ns_file_handle
{
  /* Number of bytes of f_handle in use; never larger than MAX_HANDLE_SZ.  */
  unsigned int handle_bytes;
  /* Opaque handle type, as filled in by name_to_handle_at.  */
  int handle_type;
  /* Opaque handle payload.  */
  unsigned char f_handle[MAX_HANDLE_SZ];
};

/* The on-disk record: one handle for the user namespace and one for the
   mount namespace.  */
struct ns_handles
{
  struct ns_file_handle userns;
  struct ns_file_handle mntns;
};
|
||||
|
||||
#ifndef TEMP_FAILURE_RETRY
|
||||
#define TEMP_FAILURE_RETRY(expression) \
|
||||
(__extension__ \
|
||||
@@ -59,7 +81,8 @@ cleanup_dirp (DIR **p)
|
||||
closedir (dir);
|
||||
}
|
||||
|
||||
int rename_noreplace (int olddirfd, const char *oldpath, int newdirfd, const char *newpath)
|
||||
int
|
||||
rename_noreplace (int olddirfd, const char *oldpath, int newdirfd, const char *newpath)
|
||||
{
|
||||
int ret;
|
||||
|
||||
@@ -121,6 +144,224 @@ rootless_gid ()
|
||||
return rootless_gid_init;
|
||||
}
|
||||
|
||||
static int
|
||||
get_ns_handles (struct ns_handles *handles)
|
||||
{
|
||||
cleanup_close int mnt_fd = -1;
|
||||
cleanup_close int user_fd = -1;
|
||||
int mount_id;
|
||||
|
||||
handles->userns.handle_bytes = MAX_HANDLE_SZ;
|
||||
handles->mntns.handle_bytes = MAX_HANDLE_SZ;
|
||||
|
||||
mnt_fd = open ("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
|
||||
if (mnt_fd < 0)
|
||||
return -1;
|
||||
|
||||
if (name_to_handle_at (mnt_fd, "", (struct file_handle *) &handles->mntns, &mount_id, AT_EMPTY_PATH) < 0)
|
||||
return -1;
|
||||
|
||||
user_fd = open ("/proc/self/ns/user", O_RDONLY | O_CLOEXEC);
|
||||
if (user_fd < 0)
|
||||
return -1;
|
||||
|
||||
if (name_to_handle_at (user_fd, "", (struct file_handle *) &handles->userns, &mount_id, AT_EMPTY_PATH) < 0)
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Join the namespace referred to by NS_FD.  On failure, print a message
   mentioning NAME to stderr and terminate the process immediately.  */
static void
join_namespace_or_die (const char *name, int ns_fd)
{
  if (setns (ns_fd, 0) >= 0)
    return;

  fprintf (stderr, "cannot set %s namespace\n", name);
  _exit (EXIT_FAILURE);
}
|
||||
|
||||
static int
|
||||
set_ns_handles (const char *path)
|
||||
{
|
||||
cleanup_close int fd = -1;
|
||||
struct ns_handles handles;
|
||||
ssize_t bytes_read;
|
||||
cleanup_close int userns_fd = -1;
|
||||
cleanup_close int mntns_fd = -1;
|
||||
|
||||
fd = open (path, O_RDONLY | O_CLOEXEC);
|
||||
if (fd < 0)
|
||||
return -1;
|
||||
|
||||
bytes_read = TEMP_FAILURE_RETRY (read (fd, &handles, sizeof (handles)));
|
||||
if (bytes_read != sizeof (handles))
|
||||
{
|
||||
if (bytes_read >= 0)
|
||||
errno = EINVAL;
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (handles.userns.handle_bytes > MAX_HANDLE_SZ ||
|
||||
handles.mntns.handle_bytes > MAX_HANDLE_SZ)
|
||||
{
|
||||
errno = EINVAL;
|
||||
return -1;
|
||||
}
|
||||
|
||||
mntns_fd = open_by_handle_at (FD_NSFS_ROOT, (struct file_handle *) &handles.mntns, O_RDONLY);
|
||||
if (mntns_fd < 0)
|
||||
return -1;
|
||||
|
||||
userns_fd = open_by_handle_at (FD_NSFS_ROOT, (struct file_handle *) &handles.userns, O_RDONLY);
|
||||
if (userns_fd < 0)
|
||||
return -1;
|
||||
|
||||
if (setns (userns_fd, 0) != 0)
|
||||
return -1;
|
||||
|
||||
/* This is a fatal error we can't recover from since we have already joined the userns. */
|
||||
join_namespace_or_die ("mnt", mntns_fd);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Acquire an exclusive lock on the namespace handles lock file,
   "<STATE_DIR>/ns_handles.lock", creating the file with mode 0600 if it does
   not exist yet.  The call blocks until the lock is available.

   Returns the lock fd on success; the caller closes it to drop the lock.
   Returns -1 on error with errno set (ENAMETOOLONG when the lock path does
   not fit in PATH_MAX, otherwise as set by open or flock).  */
static int
acquire_ns_handles_lock (const char *state_dir)
{
  char lock_path[PATH_MAX];
  int lock_fd;
  int ret;
  int saved_errno;

  ret = snprintf (lock_path, PATH_MAX, "%s/ns_handles.lock", state_dir);
  if (ret >= PATH_MAX)
    {
      errno = ENAMETOOLONG;
      return -1;
    }

  lock_fd = open (lock_path, O_RDWR | O_CREAT | O_CLOEXEC, 0600);
  if (lock_fd < 0)
    return -1;

  /* A blocking flock can be interrupted by a signal handler; retry instead
     of reporting EINTR as a failure to take the lock.  */
  if (TEMP_FAILURE_RETRY (flock (lock_fd, LOCK_EX)) < 0)
    {
      /* Keep the errno from flock: close may overwrite it.  */
      saved_errno = errno;
      close (lock_fd);
      errno = saved_errno;
      return -1;
    }

  return lock_fd;
}
|
||||
|
||||
/* Save namespace handles to the specified file. */
|
||||
static int
|
||||
save_ns_handles (const char *path, struct ns_handles *handles)
|
||||
{
|
||||
cleanup_close int fd = -1;
|
||||
char tmp_path[PATH_MAX];
|
||||
int ret;
|
||||
int saved_errno;
|
||||
ssize_t written;
|
||||
|
||||
ret = snprintf (tmp_path, PATH_MAX, "%s.XXXXXX", path);
|
||||
if (ret >= PATH_MAX)
|
||||
{
|
||||
errno = ENAMETOOLONG;
|
||||
return -1;
|
||||
}
|
||||
|
||||
fd = mkstemp (tmp_path);
|
||||
if (fd < 0)
|
||||
return -1;
|
||||
|
||||
written = TEMP_FAILURE_RETRY (write (fd, handles, sizeof (*handles)));
|
||||
if (written != sizeof (*handles))
|
||||
{
|
||||
saved_errno = errno;
|
||||
unlink (tmp_path);
|
||||
errno = saved_errno;
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (rename_noreplace (AT_FDCWD, tmp_path, AT_FDCWD, path) < 0)
|
||||
{
|
||||
saved_errno = errno;
|
||||
unlink (tmp_path);
|
||||
errno = saved_errno;
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
get_and_save_ns_handles_with_lock (const char *state_dir)
|
||||
{
|
||||
char ns_handles_path[PATH_MAX];
|
||||
cleanup_close int lock_fd = -1;
|
||||
struct ns_handles handles;
|
||||
int ret;
|
||||
int saved_errno;
|
||||
char *env = getenv ("PODMAN_NO_PAUSE_PROCESS");
|
||||
|
||||
ret = snprintf (ns_handles_path, PATH_MAX, "%s/ns_handles", state_dir);
|
||||
if (ret >= PATH_MAX)
|
||||
{
|
||||
errno = ENAMETOOLONG;
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (env == NULL || strcmp(env, "0") == 0)
|
||||
{
|
||||
if (unlink(ns_handles_path) < 0 && errno != ENOENT)
|
||||
return -1;
|
||||
|
||||
/* Pretend the kernel does not support it and move on. */
|
||||
errno = EOPNOTSUPP;
|
||||
return -1;
|
||||
}
|
||||
|
||||
lock_fd = acquire_ns_handles_lock (state_dir);
|
||||
if (lock_fd < 0)
|
||||
return -1;
|
||||
|
||||
/* Now that we hold the lock, revalidate the file. */
|
||||
if (set_ns_handles (ns_handles_path) == 0)
|
||||
return 0;
|
||||
|
||||
ret = unlink (ns_handles_path);
|
||||
if (ret != 0 && errno != ENOENT)
|
||||
{
|
||||
saved_errno = errno;
|
||||
close (lock_fd);
|
||||
lock_fd = -1;
|
||||
errno = saved_errno;
|
||||
return -1;
|
||||
}
|
||||
|
||||
ret = get_ns_handles (&handles);
|
||||
if (ret < 0)
|
||||
{
|
||||
saved_errno = errno;
|
||||
close (lock_fd);
|
||||
lock_fd = -1; /* Prevent cleanup from running. */
|
||||
errno = saved_errno;
|
||||
return -1;
|
||||
}
|
||||
|
||||
ret = save_ns_handles (ns_handles_path, &handles);
|
||||
saved_errno = errno;
|
||||
close (lock_fd);
|
||||
lock_fd = -1; /* Prevent cleanup from running. */
|
||||
errno = saved_errno;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* exec the specified executable and exit if it fails. */
|
||||
static void
|
||||
exec_binary (const char *path, char **argv, int argc)
|
||||
@@ -519,8 +760,9 @@ static void __attribute__((constructor)) init()
|
||||
}
|
||||
}
|
||||
|
||||
/* Shortcut. If we are able to join the pause pid file, do it now so we don't
|
||||
need to re-exec. */
|
||||
/* Shortcut. If we are able to join the existing namespace, do it now so we
|
||||
don't need to re-exec. First try using namespace file handles, then fall back
|
||||
to the pause.pid approach for older kernels. */
|
||||
xdg_runtime_dir = getenv ("XDG_RUNTIME_DIR");
|
||||
if (geteuid () != 0 && xdg_runtime_dir && xdg_runtime_dir[0] && can_use_shortcut (argv))
|
||||
{
|
||||
@@ -533,7 +775,6 @@ static void __attribute__((constructor)) init()
|
||||
uid_t uid;
|
||||
gid_t gid;
|
||||
char path[PATH_MAX];
|
||||
const char *const suffix = "/libpod/tmp/pause.pid";
|
||||
char uid_fmt[16];
|
||||
char gid_fmt[16];
|
||||
size_t len;
|
||||
@@ -546,7 +787,39 @@ static void __attribute__((constructor)) init()
|
||||
_exit (EXIT_FAILURE);
|
||||
}
|
||||
|
||||
len = snprintf (path, PATH_MAX, "%s%s", xdg_runtime_dir, suffix);
|
||||
uid = geteuid ();
|
||||
gid = getegid ();
|
||||
|
||||
len = snprintf (path, PATH_MAX, "%s/libpod/tmp/ns_handles", xdg_runtime_dir);
|
||||
if (len >= PATH_MAX)
|
||||
{
|
||||
errno = ENAMETOOLONG;
|
||||
fprintf (stderr, "invalid value for XDG_RUNTIME_DIR: %m");
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (set_ns_handles (path) == 0)
|
||||
goto joined;
|
||||
|
||||
/* If the handle is stale, give up with the shortcut. */
|
||||
if (errno == ESTALE)
|
||||
return;
|
||||
|
||||
/* Fall back to pause.pid if:
|
||||
- ENOENT ns_handles file doesn't exist
|
||||
- EOPNOTSUPP kernel doesn't support open_by_handle_at
|
||||
- ENOSYS syscall not available
|
||||
- EPERM (could be seccomp when running in a container)
|
||||
*/
|
||||
if (errno != ENOENT && errno != EOPNOTSUPP && errno != ENOSYS && errno != EPERM)
|
||||
{
|
||||
/* Anything else is fatal. */
|
||||
fprintf (stderr, "error opening namespace handles: %m\n");
|
||||
_exit (EXIT_FAILURE);
|
||||
}
|
||||
|
||||
/* Fall back to pause.pid for compatibility with older versions or if the kernel is too old. */
|
||||
len = snprintf (path, PATH_MAX, "%s/libpod/tmp/pause.pid", xdg_runtime_dir);
|
||||
if (len >= PATH_MAX)
|
||||
{
|
||||
errno = ENAMETOOLONG;
|
||||
@@ -559,7 +832,6 @@ static void __attribute__((constructor)) init()
|
||||
return;
|
||||
|
||||
r = TEMP_FAILURE_RETRY (read (fd, buf, sizeof (buf) - 1));
|
||||
|
||||
if (r < 0)
|
||||
return;
|
||||
buf[r] = '\0';
|
||||
@@ -568,9 +840,6 @@ static void __attribute__((constructor)) init()
|
||||
if (pid == LONG_MAX)
|
||||
return;
|
||||
|
||||
uid = geteuid ();
|
||||
gid = getegid ();
|
||||
|
||||
userns_fd = open_namespace (pid, "user");
|
||||
if (userns_fd < 0)
|
||||
return;
|
||||
@@ -582,15 +851,10 @@ static void __attribute__((constructor)) init()
|
||||
if (setns (userns_fd, 0) < 0)
|
||||
return;
|
||||
|
||||
/* The user namespace was joined, after this point errors are
|
||||
not recoverable anymore. */
|
||||
|
||||
if (setns (mntns_fd, 0) < 0)
|
||||
{
|
||||
fprintf (stderr, "cannot join mount namespace for %ld: %m", pid);
|
||||
exit (EXIT_FAILURE);
|
||||
}
|
||||
/* This is a fatal error we can't recover from since we have already joined the userns. */
|
||||
join_namespace_or_die ("mnt", mntns_fd);
|
||||
|
||||
joined:
|
||||
sprintf (uid_fmt, "%d", uid);
|
||||
sprintf (gid_fmt, "%d", gid);
|
||||
|
||||
@@ -598,6 +862,8 @@ static void __attribute__((constructor)) init()
|
||||
setenv ("_CONTAINERS_ROOTLESS_UID", uid_fmt, 1);
|
||||
setenv ("_CONTAINERS_ROOTLESS_GID", gid_fmt, 1);
|
||||
|
||||
/* We are in the user+mount namespace, these errors are not recoverable. */
|
||||
|
||||
if (syscall_setresgid (0, 0, 0) < 0)
|
||||
{
|
||||
fprintf (stderr, "cannot setresgid: %m\n");
|
||||
@@ -649,10 +915,19 @@ reexec_in_user_namespace_wait (int pid, int options)
|
||||
}
|
||||
|
||||
static int
|
||||
create_pause_process (const char *pause_pid_file_path, char **argv)
|
||||
create_pause_process (const char *state_dir, char **argv)
|
||||
{
|
||||
pid_t pid;
|
||||
int p[2];
|
||||
char pause_pid_file_path[PATH_MAX];
|
||||
int ret;
|
||||
|
||||
ret = snprintf (pause_pid_file_path, PATH_MAX, "%s/pause.pid", state_dir);
|
||||
if (ret >= PATH_MAX)
|
||||
{
|
||||
errno = ENAMETOOLONG;
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (pipe (p) < 0)
|
||||
return -1;
|
||||
@@ -699,7 +974,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv)
|
||||
|
||||
sprintf (pid_str, "%d", pid);
|
||||
|
||||
if (asprintf (&tmp_file_path, "%s.XXXXXX", pause_pid_file_path) < 0)
|
||||
if (asprintf (&tmp_file_path, "%s/pause.pid.XXXXXX", state_dir) < 0)
|
||||
{
|
||||
fprintf (stderr, "unable to print to string\n");
|
||||
kill (pid, SIGKILL);
|
||||
@@ -777,18 +1052,8 @@ create_pause_process (const char *pause_pid_file_path, char **argv)
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
join_namespace_or_die (const char *name, int ns_fd)
|
||||
{
|
||||
if (setns (ns_fd, 0) < 0)
|
||||
{
|
||||
fprintf (stderr, "cannot set %s namespace\n", name);
|
||||
_exit (EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
reexec_userns_join (int pid_to_join, char *pause_pid_file_path)
|
||||
reexec_userns_join (int pid_to_join, char *state_dir)
|
||||
{
|
||||
cleanup_close int userns_fd = -1;
|
||||
cleanup_close int mntns_fd = -1;
|
||||
@@ -910,10 +1175,24 @@ reexec_userns_join (int pid_to_join, char *pause_pid_file_path)
|
||||
_exit (EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
|
||||
if (state_dir && state_dir[0] != '\0')
|
||||
{
|
||||
/* We ignore errors here as we didn't create the namespace anyway. */
|
||||
create_pause_process (pause_pid_file_path, argv);
|
||||
/* Try to use namespace file handles instead of a pause process. */
|
||||
if (get_and_save_ns_handles_with_lock (state_dir) < 0)
|
||||
{
|
||||
/* Fall back to pause process only if kernel doesn't support nsfs handles,
|
||||
if they are blocked (e.g. seccomp), or if the state directory doesn't exist yet. */
|
||||
if (errno == EOPNOTSUPP || errno == EPERM || errno == ENOSYS || errno == ENOENT)
|
||||
{
|
||||
if (create_pause_process (state_dir, argv) < 0)
|
||||
_exit (EXIT_FAILURE);
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf (stderr, "cannot save namespace handles: %m\n");
|
||||
_exit (EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (sigprocmask (SIG_SETMASK, &oldsigset, NULL) < 0)
|
||||
{
|
||||
@@ -947,7 +1226,7 @@ check_proc_sys_userns_file (const char *path)
|
||||
}
|
||||
|
||||
int
|
||||
reexec_in_user_namespace (int ready, char *pause_pid_file_path)
|
||||
reexec_in_user_namespace (int ready, char *state_dir)
|
||||
{
|
||||
cleanup_free char **argv = NULL;
|
||||
cleanup_free char *argv0 = NULL;
|
||||
@@ -1073,12 +1352,27 @@ reexec_in_user_namespace (int ready, char *pause_pid_file_path)
|
||||
_exit (EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
|
||||
if (state_dir && state_dir[0] != '\0')
|
||||
{
|
||||
if (create_pause_process (pause_pid_file_path, argv) < 0)
|
||||
/* Try to use namespace file handles instead of a pause process. */
|
||||
if (get_and_save_ns_handles_with_lock (state_dir) < 0)
|
||||
{
|
||||
TEMP_FAILURE_RETRY (write (ready, "2", 1));
|
||||
_exit (EXIT_FAILURE);
|
||||
/* Fall back to pause process only if kernel doesn't support nsfs handles,
|
||||
if they are blocked (e.g. seccomp), or if the state directory doesn't exist yet. */
|
||||
if (errno == EOPNOTSUPP || errno == EPERM || errno == ENOSYS || errno == ENOENT)
|
||||
{
|
||||
if (create_pause_process (state_dir, argv) < 0)
|
||||
{
|
||||
TEMP_FAILURE_RETRY (write (ready, "2", 1));
|
||||
_exit (EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf (stderr, "cannot save namespace handles: %m\n");
|
||||
TEMP_FAILURE_RETRY (write (ready, "2", 1));
|
||||
_exit (EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -31,9 +31,9 @@ import (
|
||||
#include <sys/types.h>
|
||||
extern uid_t rootless_uid();
|
||||
extern uid_t rootless_gid();
|
||||
extern int reexec_in_user_namespace(int ready, char *pause_pid_file_path);
|
||||
extern int reexec_in_user_namespace(int ready, char *state_dir);
|
||||
extern int reexec_in_user_namespace_wait(int pid, int options);
|
||||
extern int reexec_userns_join(int pid, char *pause_pid_file_path);
|
||||
extern int reexec_userns_join(int pid, char *state_dir);
|
||||
extern int is_fd_inherited(int fd);
|
||||
*/
|
||||
import "C"
|
||||
@@ -142,7 +142,7 @@ func tryMappingTool(uid bool, pid int, hostID int, mappings []idtools.IDMap) err
|
||||
// joinUserAndMountNS re-exec podman in a new userNS and join the user and mount
|
||||
// namespace of the specified PID without looking up its parent. Useful to join directly
|
||||
// the conmon process.
|
||||
func joinUserAndMountNS(pid uint, pausePid string) (bool, int, error) {
|
||||
func joinUserAndMountNS(pid uint, stateDir string) (bool, int, error) {
|
||||
hasCapSysAdmin, err := unshare.HasCapSysAdmin()
|
||||
if err != nil {
|
||||
return false, 0, err
|
||||
@@ -151,10 +151,10 @@ func joinUserAndMountNS(pid uint, pausePid string) (bool, int, error) {
|
||||
return false, 0, nil
|
||||
}
|
||||
|
||||
cPausePid := C.CString(pausePid)
|
||||
defer C.free(unsafe.Pointer(cPausePid))
|
||||
cStateDir := C.CString(stateDir)
|
||||
defer C.free(unsafe.Pointer(cStateDir))
|
||||
|
||||
pidC := C.reexec_userns_join(C.int(pid), cPausePid)
|
||||
pidC := C.reexec_userns_join(C.int(pid), cStateDir)
|
||||
if int(pidC) < 0 {
|
||||
return false, -1, fmt.Errorf("cannot re-exec process to join the existing user namespace")
|
||||
}
|
||||
@@ -212,7 +212,7 @@ func copyMappings(from, to string) error {
|
||||
return os.WriteFile(to, content, 0o600)
|
||||
}
|
||||
|
||||
func becomeRootInUserNS(pausePid string) (_ bool, _ int, retErr error) {
|
||||
func becomeRootInUserNS(stateDir string) (_ bool, _ int, retErr error) {
|
||||
hasCapSysAdmin, err := unshare.HasCapSysAdmin()
|
||||
if err != nil {
|
||||
return false, 0, err
|
||||
@@ -245,8 +245,8 @@ func becomeRootInUserNS(pausePid string) (_ bool, _ int, retErr error) {
|
||||
}
|
||||
}
|
||||
|
||||
cPausePid := C.CString(pausePid)
|
||||
defer C.free(unsafe.Pointer(cPausePid))
|
||||
cStateDir := C.CString(stateDir)
|
||||
defer C.free(unsafe.Pointer(cStateDir))
|
||||
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
@@ -279,7 +279,7 @@ func becomeRootInUserNS(pausePid string) (_ bool, _ int, retErr error) {
|
||||
}
|
||||
}()
|
||||
|
||||
pidC := C.reexec_in_user_namespace(C.int(r.Fd()), cPausePid)
|
||||
pidC := C.reexec_in_user_namespace(C.int(r.Fd()), cStateDir)
|
||||
pid = int(pidC)
|
||||
if pid < 0 {
|
||||
return false, -1, fmt.Errorf("cannot re-exec process")
|
||||
@@ -357,7 +357,8 @@ func becomeRootInUserNS(pausePid string) (_ bool, _ int, retErr error) {
|
||||
// We have lost the race for writing the PID file, as probably another
|
||||
// process created a namespace and wrote the PID.
|
||||
// Try to join it.
|
||||
data, err := os.ReadFile(pausePid)
|
||||
pausePidPath := stateDir + "/pause.pid"
|
||||
data, err := os.ReadFile(pausePidPath)
|
||||
if err == nil {
|
||||
var pid uint64
|
||||
pid, err = strconv.ParseUint(string(data), 10, 0)
|
||||
@@ -417,15 +418,15 @@ func waitAndProxySignalsToChild(pid C.int) (bool, int, error) {
|
||||
// into a new user namespace and the return code from the re-executed podman process.
|
||||
// If podman was re-executed the caller needs to propagate the error code returned by the child
|
||||
// process.
|
||||
func BecomeRootInUserNS(pausePid string) (bool, int, error) {
|
||||
return becomeRootInUserNS(pausePid)
|
||||
func BecomeRootInUserNS(stateDir string) (bool, int, error) {
|
||||
return becomeRootInUserNS(stateDir)
|
||||
}
|
||||
|
||||
// TryJoinFromFilePaths attempts to join the namespaces of the pid files in paths.
|
||||
// This is useful when there are already running containers and we
|
||||
// don't have a pause process yet. We can use the paths to the conmon
|
||||
// processes to attempt joining their namespaces.
|
||||
func TryJoinFromFilePaths(pausePidPath string, paths []string) (bool, int, error) {
|
||||
func TryJoinFromFilePaths(stateDir string, paths []string) (bool, int, error) {
|
||||
var lastErr error
|
||||
|
||||
for _, path := range paths {
|
||||
@@ -435,16 +436,16 @@ func TryJoinFromFilePaths(pausePidPath string, paths []string) (bool, int, error
|
||||
continue
|
||||
}
|
||||
|
||||
pausePid, err := strconv.Atoi(string(data))
|
||||
pid, err := strconv.Atoi(string(data))
|
||||
if err != nil {
|
||||
lastErr = fmt.Errorf("cannot parse file %q: %w", path, err)
|
||||
continue
|
||||
}
|
||||
|
||||
if pausePid > 0 && unix.Kill(pausePid, 0) == nil {
|
||||
joined, pid, err := joinUserAndMountNS(uint(pausePid), pausePidPath)
|
||||
if pid > 0 && unix.Kill(pid, 0) == nil {
|
||||
joined, ret, err := joinUserAndMountNS(uint(pid), stateDir)
|
||||
if err == nil {
|
||||
return joined, pid, nil
|
||||
return joined, ret, nil
|
||||
}
|
||||
lastErr = err
|
||||
}
|
||||
|
||||
@@ -1,13 +1,27 @@
|
||||
package rootless
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/moby/sys/user"
|
||||
spec "github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestGetNamespaceHandlesPath(t *testing.T) {
|
||||
stateDir := "libpod"
|
||||
result := GetNamespaceHandlesPath(stateDir)
|
||||
assert.Equal(t, filepath.Join(stateDir, "ns_handles"), result)
|
||||
}
|
||||
|
||||
func TestGetPausePidPath(t *testing.T) {
|
||||
stateDir := "libpod"
|
||||
result := GetPausePidPath(stateDir)
|
||||
assert.Equal(t, filepath.Join(stateDir, "pause.pid"), result)
|
||||
}
|
||||
|
||||
func TestMaybeSplitMappings(t *testing.T) {
|
||||
mappings := []spec.LinuxIDMapping{
|
||||
{
|
||||
|
||||
@@ -25,15 +25,15 @@ func GetRootlessConfigHomeDir() (string, error) {
|
||||
return homedir.GetConfigHome()
|
||||
}
|
||||
|
||||
// GetRootlessPauseProcessPidPath returns the path to the file that holds the pid for
|
||||
// the pause process.
|
||||
func GetRootlessPauseProcessPidPath() (string, error) {
|
||||
// GetRootlessStateDir returns the directory that holds the rootless state
|
||||
// (pause.pid and ns_handles files).
|
||||
func GetRootlessStateDir() (string, error) {
|
||||
runtimeDir, err := homedir.GetRuntimeDir()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Note this path must be kept in sync with pkg/rootless/rootless_linux.go
|
||||
// Note this path must be kept in sync with pkg/rootless/rootless_linux.c
|
||||
// We only want a single pause process per user, so we do not want to use
|
||||
// the tmpdir which can be changed via --tmpdir.
|
||||
return filepath.Join(runtimeDir, "libpod", "tmp", "pause.pid"), nil
|
||||
return filepath.Join(runtimeDir, "libpod", "tmp"), nil
|
||||
}
|
||||
|
||||
@@ -833,11 +833,11 @@ func TestProcessOptions(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetRootlessPauseProcessPidPath(t *testing.T) {
|
||||
func TestGetRootlessStateDir(t *testing.T) {
|
||||
if runtime.GOOS == "windows" {
|
||||
t.Skip("Not implemented on Windows")
|
||||
}
|
||||
dir, err := GetRootlessPauseProcessPidPath()
|
||||
dir, err := GetRootlessStateDir()
|
||||
assert.NoError(t, err)
|
||||
assert.NotEqual(t, dir, "libpod/tmp/pause.pid")
|
||||
assert.NotEqual(t, dir, "libpod/tmp")
|
||||
}
|
||||
|
||||
@@ -23,10 +23,10 @@ func GetContainerPidInformationDescriptors() ([]string, error) {
|
||||
return nil, fmt.Errorf("GetContainerPidInformationDescriptors: %w", errNotImplemented)
|
||||
}
|
||||
|
||||
// GetRootlessPauseProcessPidPath returns the path to the file that holds the pid for
|
||||
// the pause process
|
||||
func GetRootlessPauseProcessPidPath() (string, error) {
|
||||
return "", fmt.Errorf("GetRootlessPauseProcessPidPath: %w", errNotImplemented)
|
||||
// GetRootlessStateDir returns the directory that holds the rootless state
|
||||
// (pause.pid and ns_handles files).
|
||||
func GetRootlessStateDir() (string, error) {
|
||||
return "", fmt.Errorf("GetRootlessStateDir: %w", errNotImplemented)
|
||||
}
|
||||
|
||||
// GetRootlessRuntimeDir returns the runtime directory
|
||||
|
||||
@@ -212,7 +212,9 @@ function __run_healthcheck_container() {
|
||||
|
||||
kill -9 ${conmon_pid}
|
||||
|
||||
run_podman rm -f -t0 $cname
|
||||
# When conmon is killed, the ns_handles may become stale and produce a warning
|
||||
# about creating a new rootless user namespace. This is expected behavior.
|
||||
run_podman 0+w rm -f -t0 $cname
|
||||
|
||||
run_podman 125 container inspect $cname
|
||||
assert "$output" =~ "no such container \"$cname\"" "Container should be removed"
|
||||
|
||||
@@ -60,13 +60,15 @@ SocketMode=0660
|
||||
WantedBy=sockets.target
|
||||
EOF
|
||||
|
||||
# ensure pause die before each test runs
|
||||
# ensure pause process/ns_handles are removed before each test runs
|
||||
if is_rootless; then
|
||||
local pause_pid_file="$XDG_RUNTIME_DIR/libpod/tmp/pause.pid"
|
||||
local ns_handles_file="$XDG_RUNTIME_DIR/libpod/tmp/ns_handles"
|
||||
if [ -f $pause_pid_file ]; then
|
||||
kill -9 $(< $pause_pid_file) 2> /dev/null
|
||||
rm -f $pause_pid_file
|
||||
fi
|
||||
rm -f $ns_handles_file
|
||||
fi
|
||||
systemctl_start "$SERVICE_NAME.socket"
|
||||
}
|
||||
@@ -105,13 +107,17 @@ function teardown() {
|
||||
cid="$output"
|
||||
|
||||
local pause_pid_file="$XDG_RUNTIME_DIR/libpod/tmp/pause.pid"
|
||||
if [ ! -f $pause_pid_file ]; then
|
||||
# This seems unlikely, but not impossible
|
||||
die "Pause pid file does not exist: $pause_pid_file"
|
||||
fi
|
||||
local ns_handles_file="$XDG_RUNTIME_DIR/libpod/tmp/ns_handles"
|
||||
|
||||
echo "kill -9 $(< $pause_pid_file) [pause process]"
|
||||
kill -9 $(< $pause_pid_file)
|
||||
if [ -f $ns_handles_file ]; then
|
||||
echo "Removing ns_handles file: $ns_handles_file"
|
||||
rm -f $ns_handles_file
|
||||
elif [ -f $pause_pid_file ]; then
|
||||
echo "kill -9 $(< $pause_pid_file) [pause process]"
|
||||
kill -9 $(< $pause_pid_file)
|
||||
else
|
||||
die "Neither ns_handles file nor pause.pid file exists"
|
||||
fi
|
||||
|
||||
run curl -s --max-time 3 --unix-socket $SERVICE_SOCK_ADDR $_PING
|
||||
echo "curl output: $output"
|
||||
|
||||
@@ -16,11 +16,19 @@ function setup_file() {
|
||||
}
|
||||
|
||||
function _check_pause_process() {
|
||||
# do not mark this variable as local; our caller expects it
|
||||
# do not mark these variables as local; our caller expects them
|
||||
pause_pid_file="$XDG_RUNTIME_DIR/libpod/tmp/pause.pid"
|
||||
test -e $pause_pid_file || die "Pause pid file $pause_pid_file missing"
|
||||
ns_handles_file="$XDG_RUNTIME_DIR/libpod/tmp/ns_handles"
|
||||
pause_pid=""
|
||||
|
||||
# Check that either ns_handles or pause.pid exists
|
||||
if [ -e $ns_handles_file ]; then
|
||||
# ns_handles file exists, no pause process needed
|
||||
return
|
||||
fi
|
||||
|
||||
test -e $pause_pid_file || die "Neither ns_handles file ($ns_handles_file) nor pause.pid file ($pause_pid_file) exists"
|
||||
|
||||
# do not mark this variable as local; our caller expects it
|
||||
pause_pid=$(<$pause_pid_file)
|
||||
test -d /proc/$pause_pid || die "Pause process $pause_pid (from $pause_pid_file) is not running"
|
||||
|
||||
@@ -51,26 +59,31 @@ function _check_pause_process() {
|
||||
# Use podman system migrate to stop the currently running pause process
|
||||
run_podman system migrate
|
||||
|
||||
# After migrate, there must be no pause process
|
||||
# After migrate, there must be no pause process or ns_handles
|
||||
# Note: pause_pid_file and ns_handles_file are set by _check_pause_process above
|
||||
test -e $pause_pid_file && die "Pause pid file $pause_pid_file still exists, even after podman system migrate"
|
||||
test -e $ns_handles_file && die "ns_handles file $ns_handles_file still exists, even after podman system migrate"
|
||||
|
||||
run kill -0 $pause_pid
|
||||
test $status -eq 0 && die "Pause process $pause_pid is still running even after podman system migrate"
|
||||
if [[ -n "$pause_pid" ]]; then
|
||||
run kill -0 $pause_pid
|
||||
test $status -eq 0 && die "Pause process $pause_pid is still running even after podman system migrate"
|
||||
fi
|
||||
|
||||
run_podman $(podman_isolation_opts ${PODMAN_TMPDIR}) $getns
|
||||
tmpdir_userns="$output"
|
||||
|
||||
# And now we should once again have a pause process
|
||||
# And now we should once again have a pause process or ns_handles
|
||||
_check_pause_process
|
||||
|
||||
# and all podmans, with & without --tmpdir, should use the same ns
|
||||
run_podman $getns
|
||||
assert "$output" == "$tmpdir_userns" \
|
||||
"podman should use the same userns created using a tmpdir"
|
||||
if [ -e $pause_pid_file ]; then
|
||||
run_podman $getns
|
||||
assert "$output" == "$tmpdir_userns" \
|
||||
"podman should use the same userns created using a tmpdir"
|
||||
|
||||
run_podman --tmpdir $PODMAN_TMPDIR/tmp2 $getns
|
||||
assert "$output" == "$tmpdir_userns" \
|
||||
"podman with tmpdir2 should use the same userns created using a tmpdir"
|
||||
run_podman --tmpdir $PODMAN_TMPDIR/tmp2 $getns
|
||||
assert "$output" == "$tmpdir_userns" \
|
||||
"podman with tmpdir2 should use the same userns created using a tmpdir"
|
||||
fi
|
||||
}
|
||||
|
||||
# https://github.com/containers/podman/issues/16091
|
||||
@@ -105,9 +118,14 @@ function _check_pause_process() {
|
||||
run_podman unshare readlink /proc/self/ns/user
|
||||
userns="$output"
|
||||
|
||||
# check for pause pid and then kill it
|
||||
# Check for pause pid or ns_handles file, and remove/kill it
|
||||
# Note: _check_pause_process sets ns_handles_file and pause_pid
|
||||
_check_pause_process
|
||||
kill -9 $pause_pid
|
||||
if [ -e $ns_handles_file ]; then
|
||||
rm -f $ns_handles_file
|
||||
elif [ -n "$pause_pid" ]; then
|
||||
kill -9 $pause_pid
|
||||
fi
|
||||
|
||||
# Now again directly start podman run and make sure it can forward signals
|
||||
# We're forced to use $PODMAN because run_podman cannot be backgrounded
|
||||
@@ -161,9 +179,14 @@ function _check_pause_process() {
|
||||
run_podman inspect --format '{{.State.ConmonPid}}' $cname
|
||||
conmon_pid="$output"
|
||||
|
||||
# check for pause pid and then kill it
|
||||
# Check for pause pid or ns_handles file, and remove/kill it
|
||||
# Note: _check_pause_process sets ns_handles_file and pause_pid
|
||||
_check_pause_process
|
||||
kill -9 $pause_pid
|
||||
if [ -e $ns_handles_file ]; then
|
||||
rm -f $ns_handles_file
|
||||
elif [ -n "$pause_pid" ]; then
|
||||
kill -9 $pause_pid
|
||||
fi
|
||||
|
||||
# kill conmon
|
||||
kill -9 $conmon_pid
|
||||
|
||||
Reference in New Issue
Block a user