rootless: use nsfs file handles to persist namespaces

use name_to_handle_at and open_by_handle_at to persist rootless
namespaces without needing a pause process.

The namespace file handles are stored in a file and can be used to
rejoin the namespaces, as long as the namespaces still exist.

Fall back to the pause process approach only when the kernel doesn't
support nsfs handles (EOPNOTSUPP).

The feature is currently only enabled when the PODMAN_NO_PAUSE_PROCESS
environment variable is set.

These changes in the kernel are required (landed in Linux 6.18):

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3ab378cfa793

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
This commit is contained in:
Giuseppe Scrivano
2026-01-15 13:08:37 +01:00
parent 95230cb315
commit f172ff789b
14 changed files with 483 additions and 110 deletions

View File

@@ -301,6 +301,11 @@ otherwise in the home directory of the user under
In Rootless mode temporary configuration data is stored in `${XDG_RUNTIME_DIR}/containers`.
#### **PODMAN_NO_PAUSE_PROCESS**
In Rootless mode, when set to a value other than "0", Podman does not use a pause process.
Namespace file handles are stored to allow rejoining the existing user and mount namespaces if they are still alive.
## Remote Access
The Podman command can be used with remote services using the `--remote` flag. Connections can

View File

@@ -544,25 +544,25 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (retErr error) {
}
unLockFunc()
unLockFunc = nil
pausePid, err := util.GetRootlessPauseProcessPidPath()
stateDir, err := util.GetRootlessStateDir()
if err != nil {
return fmt.Errorf("could not get pause process pid file path: %w", err)
return fmt.Errorf("could not get rootless state directory: %w", err)
}
// create the path in case it does not already exist
// https://github.com/containers/podman/issues/8539
if err := os.MkdirAll(filepath.Dir(pausePid), 0o700); err != nil {
return fmt.Errorf("could not create pause process pid file directory: %w", err)
if err := os.MkdirAll(stateDir, 0o700); err != nil {
return fmt.Errorf("could not create rootless state directory: %w", err)
}
became, ret, err := rootless.BecomeRootInUserNS(pausePid)
became, ret, err := rootless.BecomeRootInUserNS(stateDir)
if err != nil {
return err
}
if became {
// Check if the pause process was created. If it was created, then
// move it to its own systemd scope.
systemdCommon.MovePauseProcessToScope(pausePid)
systemdCommon.MovePauseProcessToScope(rootless.GetPausePidPath(stateDir))
// gocritic complains because defer is not run on os.Exit()
// However this is fine because the lock is released anyway when the process exits

View File

@@ -3,6 +3,7 @@
package libpod
import (
"errors"
"fmt"
"os"
"strconv"
@@ -10,14 +11,22 @@ import (
"github.com/containers/podman/v6/pkg/rootless"
"github.com/containers/podman/v6/pkg/util"
"github.com/sirupsen/logrus"
)
func (r *Runtime) stopPauseProcess() error {
if rootless.IsRootless() {
pausePidPath, err := util.GetRootlessPauseProcessPidPath()
stateDir, err := util.GetRootlessStateDir()
if err != nil {
return fmt.Errorf("could not get pause process pid file path: %w", err)
return fmt.Errorf("could not get rootless state directory: %w", err)
}
nsHandlesPath := rootless.GetNamespaceHandlesPath(stateDir)
if err := os.Remove(nsHandlesPath); err != nil && !errors.Is(err, os.ErrNotExist) {
logrus.Warnf("Failed to remove namespace handles file %s: %v", nsHandlesPath, err)
}
pausePidPath := rootless.GetPausePidPath(stateDir)
data, err := os.ReadFile(pausePidPath)
if err != nil {
if os.IsNotExist(err) {

View File

@@ -66,12 +66,12 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool,
return nil
}
pausePidPath, err := util.GetRootlessPauseProcessPidPath()
stateDir, err := util.GetRootlessStateDir()
if err != nil {
return fmt.Errorf("could not get pause process pid file path: %w", err)
return fmt.Errorf("could not get rootless state directory: %w", err)
}
became, ret, err := rootless.TryJoinPauseProcess(pausePidPath)
became, ret, err := rootless.TryJoinPauseProcess(stateDir)
if err != nil {
return err
}
@@ -91,23 +91,23 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool,
}
if len(paths) > 0 {
became, ret, err = rootless.TryJoinFromFilePaths(pausePidPath, paths)
became, ret, err = rootless.TryJoinFromFilePaths(stateDir, paths)
// TryJoinFromFilePaths fails with ESRCH when the PID are all not valid anymore
// In this case create a new userns.
if errors.Is(err, unix.ESRCH) {
logrus.Warnf("Failed to join existing conmon namespace, creating a new rootless podman user namespace. If there are existing container running please stop them with %q to reset the namespace", os.Args[0]+" system migrate")
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
became, ret, err = rootless.BecomeRootInUserNS(stateDir)
}
} else {
logrus.Info("Creating a new rootless user namespace")
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
became, ret, err = rootless.BecomeRootInUserNS(stateDir)
}
if err != nil {
return fmt.Errorf("fatal error, invalid internal status, unable to create a new pause process: %w. Try running %q and if that doesn't work reboot to recover", err, os.Args[0]+" system migrate")
}
if !noMoveProcess {
systemd.MovePauseProcessToScope(pausePidPath)
systemd.MovePauseProcessToScope(rootless.GetPausePidPath(stateDir))
}
if became {
os.Exit(ret)

View File

@@ -4,6 +4,7 @@ import (
"errors"
"fmt"
"os"
"path/filepath"
"sort"
"sync"
@@ -13,10 +14,28 @@ import (
"go.podman.io/storage/pkg/lockfile"
)
// GetNamespaceHandlesPath builds the location of the namespace handles file:
// the "ns_handles" entry directly under stateDir.
func GetNamespaceHandlesPath(stateDir string) string {
	const nsHandlesFile = "ns_handles"
	return filepath.Join(stateDir, nsHandlesFile)
}
// GetPausePidPath builds the location of the pause process PID file:
// the "pause.pid" entry directly under stateDir.
func GetPausePidPath(stateDir string) string {
	const pausePidFile = "pause.pid"
	return filepath.Join(stateDir, pausePidFile)
}
// TryJoinPauseProcess attempts to join the namespaces of the pause PID via
// TryJoinFromFilePaths. If a namespace handles file already exists in
// stateDir, it returns without joining anything. If joining fails, it
// attempts to delete the pause PID file.
func TryJoinPauseProcess(pausePidPath string) (bool, int, error) {
func TryJoinPauseProcess(stateDir string) (bool, int, error) {
nsHandlesPath := GetNamespaceHandlesPath(stateDir)
if err := fileutils.Exists(nsHandlesPath); err == nil {
return false, -1, nil
}
pausePidPath := GetPausePidPath(stateDir)
if err := fileutils.Exists(pausePidPath); err != nil {
if errors.Is(err, os.ErrNotExist) {
return false, -1, nil

View File

@@ -1,3 +1,4 @@
#include <asm-generic/errno-base.h>
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
@@ -18,11 +19,32 @@
#include <sys/prctl.h>
#include <dirent.h>
#include <sys/select.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <stdio.h>
#define ETC_PREEXEC_HOOKS "/etc/containers/pre-exec-hooks"
#define LIBEXECPODMAN "/usr/libexec/podman"
#ifndef FD_NSFS_ROOT
/* Copied from /usr/include/linux/fcntl.h. */
#define FD_NSFS_ROOT -10003
#endif
/* Used by name_to_handle_at/open_by_handle_at.
   Fixed-size stand-in for struct file_handle: pointers to it are cast to
   struct file_handle * when handed to those syscalls, and the f_handle
   buffer is always MAX_HANDLE_SZ bytes so the whole struct can be stored
   and loaded as a flat record.  */
struct ns_file_handle
{
/* Size of f_handle in bytes; set to MAX_HANDLE_SZ before calling
   name_to_handle_at and checked against MAX_HANDLE_SZ after loading.  */
unsigned int handle_bytes;
/* Handle type, filled in by the kernel.  */
int handle_type;
/* Opaque handle data.  */
unsigned char f_handle[MAX_HANDLE_SZ];
};
/* The pair of namespace handles persisted for rootless mode.  This struct
   is read from and written to the ns_handles file as a single raw record
   of sizeof (struct ns_handles) bytes.  */
struct ns_handles
{
/* Handle for the user namespace.  */
struct ns_file_handle userns;
/* Handle for the mount namespace.  */
struct ns_file_handle mntns;
};
#ifndef TEMP_FAILURE_RETRY
#define TEMP_FAILURE_RETRY(expression) \
(__extension__ \
@@ -59,7 +81,8 @@ cleanup_dirp (DIR **p)
closedir (dir);
}
int rename_noreplace (int olddirfd, const char *oldpath, int newdirfd, const char *newpath)
int
rename_noreplace (int olddirfd, const char *oldpath, int newdirfd, const char *newpath)
{
int ret;
@@ -121,6 +144,224 @@ rootless_gid ()
return rootless_gid_init;
}
static int
get_ns_handles (struct ns_handles *handles)
{
cleanup_close int mnt_fd = -1;
cleanup_close int user_fd = -1;
int mount_id;
handles->userns.handle_bytes = MAX_HANDLE_SZ;
handles->mntns.handle_bytes = MAX_HANDLE_SZ;
mnt_fd = open ("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
if (mnt_fd < 0)
return -1;
if (name_to_handle_at (mnt_fd, "", (struct file_handle *) &handles->mntns, &mount_id, AT_EMPTY_PATH) < 0)
return -1;
user_fd = open ("/proc/self/ns/user", O_RDONLY | O_CLOEXEC);
if (user_fd < 0)
return -1;
if (name_to_handle_at (user_fd, "", (struct file_handle *) &handles->userns, &mount_id, AT_EMPTY_PATH) < 0)
return -1;
return 0;
}
/* Enter the namespace referred to by NS_FD.  On failure, report it on
   stderr using NAME as the namespace label and terminate the process
   immediately with EXIT_FAILURE; this function only returns on success.  */
static void
join_namespace_or_die (const char *name, int ns_fd)
{
  if (setns (ns_fd, 0) >= 0)
    return;

  fprintf (stderr, "cannot set %s namespace\n", name);
  _exit (EXIT_FAILURE);
}
static int
set_ns_handles (const char *path)
{
cleanup_close int fd = -1;
struct ns_handles handles;
ssize_t bytes_read;
cleanup_close int userns_fd = -1;
cleanup_close int mntns_fd = -1;
fd = open (path, O_RDONLY | O_CLOEXEC);
if (fd < 0)
return -1;
bytes_read = TEMP_FAILURE_RETRY (read (fd, &handles, sizeof (handles)));
if (bytes_read != sizeof (handles))
{
if (bytes_read >= 0)
errno = EINVAL;
return -1;
}
if (handles.userns.handle_bytes > MAX_HANDLE_SZ ||
handles.mntns.handle_bytes > MAX_HANDLE_SZ)
{
errno = EINVAL;
return -1;
}
mntns_fd = open_by_handle_at (FD_NSFS_ROOT, (struct file_handle *) &handles.mntns, O_RDONLY);
if (mntns_fd < 0)
return -1;
userns_fd = open_by_handle_at (FD_NSFS_ROOT, (struct file_handle *) &handles.userns, O_RDONLY);
if (userns_fd < 0)
return -1;
if (setns (userns_fd, 0) != 0)
return -1;
/* This is a fatal error we can't recover from since we have already joined the userns. */
join_namespace_or_die ("mnt", mntns_fd);
return 0;
}
/* Acquire an exclusive lock on the namespace handles lock file
   ("ns_handles.lock" inside STATE_DIR), creating the file if needed.

   Blocks until the lock is available.  An flock() call interrupted by a
   signal (EINTR) is retried rather than reported as a failure, matching how
   read/write are wrapped elsewhere in this file.

   Returns the lock fd on success, -1 with errno set on error.  The lock is
   held for as long as the returned fd stays open.  */
static int
acquire_ns_handles_lock (const char *state_dir)
{
  char lock_path[PATH_MAX];
  int lock_fd;
  int ret;
  int saved_errno;

  ret = snprintf (lock_path, PATH_MAX, "%s/ns_handles.lock", state_dir);
  if (ret >= PATH_MAX)
    {
      errno = ENAMETOOLONG;
      return -1;
    }

  lock_fd = open (lock_path, O_RDWR | O_CREAT | O_CLOEXEC, 0600);
  if (lock_fd < 0)
    return -1;

  /* Retry on EINTR so a caught signal does not abort the whole operation.  */
  if (TEMP_FAILURE_RETRY (flock (lock_fd, LOCK_EX)) < 0)
    {
      /* Keep the flock() error visible to the caller; close() may clobber it.  */
      saved_errno = errno;
      close (lock_fd);
      errno = saved_errno;
      return -1;
    }

  return lock_fd;
}
/* Save namespace handles to the specified file.

   The record is written to a mkstemp() temporary named PATH.XXXXXX and then
   moved to PATH with rename_noreplace(), so PATH never holds a partially
   written record.  The temporary file is removed if anything fails.

   Returns 0 on success, -1 with errno set on error.  A short write is
   reported as EIO so that callers inspecting errno never see a stale value
   left behind by an earlier, unrelated call.  */
static int
save_ns_handles (const char *path, struct ns_handles *handles)
{
  cleanup_close int fd = -1;
  char tmp_path[PATH_MAX];
  int ret;
  int saved_errno;
  ssize_t written;

  ret = snprintf (tmp_path, PATH_MAX, "%s.XXXXXX", path);
  if (ret >= PATH_MAX)
    {
      errno = ENAMETOOLONG;
      return -1;
    }

  fd = mkstemp (tmp_path);
  if (fd < 0)
    return -1;

  written = TEMP_FAILURE_RETRY (write (fd, handles, sizeof (*handles)));
  if (written < 0 || (size_t) written != sizeof (*handles))
    {
      /* write() only sets errno when it returns -1; a short write needs an
         explicit error code.  */
      saved_errno = written < 0 ? errno : EIO;
      unlink (tmp_path);
      errno = saved_errno;
      return -1;
    }

  if (rename_noreplace (AT_FDCWD, tmp_path, AT_FDCWD, path) < 0)
    {
      /* Preserve the rename error across the cleanup unlink().  */
      saved_errno = errno;
      unlink (tmp_path);
      errno = saved_errno;
      return -1;
    }

  return 0;
}
/* Record the current user and mount namespaces as file handles in
   STATE_DIR/ns_handles, or join the namespaces already recorded there.

   When PODMAN_NO_PAUSE_PROCESS is unset or set to "0" the feature is
   disabled: any existing ns_handles file is removed and the function fails
   with errno = EOPNOTSUPP.

   Otherwise the ns_handles lock is taken and:
     - if the existing file can be used to join its namespaces, return 0;
     - else the file is removed, handles for the current namespaces are
       collected and written to a fresh file.

   Returns 0 on success, -1 with errno set on error.  */
static int
get_and_save_ns_handles_with_lock (const char *state_dir)
{
char ns_handles_path[PATH_MAX];
cleanup_close int lock_fd = -1;
struct ns_handles handles;
int ret;
int saved_errno;
char *env = getenv ("PODMAN_NO_PAUSE_PROCESS");
ret = snprintf (ns_handles_path, PATH_MAX, "%s/ns_handles", state_dir);
if (ret >= PATH_MAX)
{
errno = ENAMETOOLONG;
return -1;
}
/* Feature gate: only active when the variable is set to something other
   than "0".  */
if (env == NULL || strcmp(env, "0") == 0)
{
/* Drop any handles file so it is not picked up while the feature is off.  */
if (unlink(ns_handles_path) < 0 && errno != ENOENT)
return -1;
/* Pretend the kernel does not support it and move on. */
errno = EOPNOTSUPP;
return -1;
}
lock_fd = acquire_ns_handles_lock (state_dir);
if (lock_fd < 0)
return -1;
/* Now that we hold the lock, revalidate the file. */
if (set_ns_handles (ns_handles_path) == 0)
return 0;
/* The existing file is missing or unusable: remove it before writing a new
   one.  A file that is already gone is not an error.  */
ret = unlink (ns_handles_path);
if (ret != 0 && errno != ENOENT)
{
/* Close the lock by hand so errno from unlink() can be restored afterwards.  */
saved_errno = errno;
close (lock_fd);
lock_fd = -1;
errno = saved_errno;
return -1;
}
ret = get_ns_handles (&handles);
if (ret < 0)
{
saved_errno = errno;
close (lock_fd);
lock_fd = -1; /* Prevent cleanup from running. */
errno = saved_errno;
return -1;
}
ret = save_ns_handles (ns_handles_path, &handles);
/* Release the lock while keeping errno from save_ns_handles() intact for
   the caller.  */
saved_errno = errno;
close (lock_fd);
lock_fd = -1; /* Prevent cleanup from running. */
errno = saved_errno;
return ret;
}
/* exec the specified executable and exit if it fails. */
static void
exec_binary (const char *path, char **argv, int argc)
@@ -519,8 +760,9 @@ static void __attribute__((constructor)) init()
}
}
/* Shortcut. If we are able to join the pause pid file, do it now so we don't
need to re-exec. */
/* Shortcut. If we are able to join the existing namespace, do it now so we
don't need to re-exec. First try using namespace file handles, then fall back
to the pause.pid approach for older kernels. */
xdg_runtime_dir = getenv ("XDG_RUNTIME_DIR");
if (geteuid () != 0 && xdg_runtime_dir && xdg_runtime_dir[0] && can_use_shortcut (argv))
{
@@ -533,7 +775,6 @@ static void __attribute__((constructor)) init()
uid_t uid;
gid_t gid;
char path[PATH_MAX];
const char *const suffix = "/libpod/tmp/pause.pid";
char uid_fmt[16];
char gid_fmt[16];
size_t len;
@@ -546,7 +787,39 @@ static void __attribute__((constructor)) init()
_exit (EXIT_FAILURE);
}
len = snprintf (path, PATH_MAX, "%s%s", xdg_runtime_dir, suffix);
uid = geteuid ();
gid = getegid ();
len = snprintf (path, PATH_MAX, "%s/libpod/tmp/ns_handles", xdg_runtime_dir);
if (len >= PATH_MAX)
{
errno = ENAMETOOLONG;
fprintf (stderr, "invalid value for XDG_RUNTIME_DIR: %m");
exit (EXIT_FAILURE);
}
if (set_ns_handles (path) == 0)
goto joined;
/* If the handle is stale, give up with the shortcut. */
if (errno == ESTALE)
return;
/* Fall back to pause.pid if:
- ENOENT ns_handles file doesn't exist
- EOPNOTSUPP kernel doesn't support open_by_handle_at
- ENOSYS syscall not available
- EPERM (could be seccomp when running in a container)
*/
if (errno != ENOENT && errno != EOPNOTSUPP && errno != ENOSYS && errno != EPERM)
{
/* Anything else is fatal. */
fprintf (stderr, "error opening namespace handles: %m\n");
_exit (EXIT_FAILURE);
}
/* Fall back to pause.pid for compatibility with older versions or if the kernel is too old. */
len = snprintf (path, PATH_MAX, "%s/libpod/tmp/pause.pid", xdg_runtime_dir);
if (len >= PATH_MAX)
{
errno = ENAMETOOLONG;
@@ -559,7 +832,6 @@ static void __attribute__((constructor)) init()
return;
r = TEMP_FAILURE_RETRY (read (fd, buf, sizeof (buf) - 1));
if (r < 0)
return;
buf[r] = '\0';
@@ -568,9 +840,6 @@ static void __attribute__((constructor)) init()
if (pid == LONG_MAX)
return;
uid = geteuid ();
gid = getegid ();
userns_fd = open_namespace (pid, "user");
if (userns_fd < 0)
return;
@@ -582,15 +851,10 @@ static void __attribute__((constructor)) init()
if (setns (userns_fd, 0) < 0)
return;
/* The user namespace was joined, after this point errors are
not recoverable anymore. */
if (setns (mntns_fd, 0) < 0)
{
fprintf (stderr, "cannot join mount namespace for %ld: %m", pid);
exit (EXIT_FAILURE);
}
/* This is a fatal error we can't recover from since we have already joined the userns. */
join_namespace_or_die ("mnt", mntns_fd);
joined:
sprintf (uid_fmt, "%d", uid);
sprintf (gid_fmt, "%d", gid);
@@ -598,6 +862,8 @@ static void __attribute__((constructor)) init()
setenv ("_CONTAINERS_ROOTLESS_UID", uid_fmt, 1);
setenv ("_CONTAINERS_ROOTLESS_GID", gid_fmt, 1);
/* We are in the user+mount namespace, these errors are not recoverable. */
if (syscall_setresgid (0, 0, 0) < 0)
{
fprintf (stderr, "cannot setresgid: %m\n");
@@ -649,10 +915,19 @@ reexec_in_user_namespace_wait (int pid, int options)
}
static int
create_pause_process (const char *pause_pid_file_path, char **argv)
create_pause_process (const char *state_dir, char **argv)
{
pid_t pid;
int p[2];
char pause_pid_file_path[PATH_MAX];
int ret;
ret = snprintf (pause_pid_file_path, PATH_MAX, "%s/pause.pid", state_dir);
if (ret >= PATH_MAX)
{
errno = ENAMETOOLONG;
return -1;
}
if (pipe (p) < 0)
return -1;
@@ -699,7 +974,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv)
sprintf (pid_str, "%d", pid);
if (asprintf (&tmp_file_path, "%s.XXXXXX", pause_pid_file_path) < 0)
if (asprintf (&tmp_file_path, "%s/pause.pid.XXXXXX", state_dir) < 0)
{
fprintf (stderr, "unable to print to string\n");
kill (pid, SIGKILL);
@@ -777,18 +1052,8 @@ create_pause_process (const char *pause_pid_file_path, char **argv)
}
}
static void
join_namespace_or_die (const char *name, int ns_fd)
{
if (setns (ns_fd, 0) < 0)
{
fprintf (stderr, "cannot set %s namespace\n", name);
_exit (EXIT_FAILURE);
}
}
int
reexec_userns_join (int pid_to_join, char *pause_pid_file_path)
reexec_userns_join (int pid_to_join, char *state_dir)
{
cleanup_close int userns_fd = -1;
cleanup_close int mntns_fd = -1;
@@ -910,10 +1175,24 @@ reexec_userns_join (int pid_to_join, char *pause_pid_file_path)
_exit (EXIT_FAILURE);
}
if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
if (state_dir && state_dir[0] != '\0')
{
/* We ignore errors here as we didn't create the namespace anyway. */
create_pause_process (pause_pid_file_path, argv);
/* Try to use namespace file handles instead of a pause process. */
if (get_and_save_ns_handles_with_lock (state_dir) < 0)
{
/* Fall back to pause process only if kernel doesn't support nsfs handles,
if they are blocked (e.g. seccomp), or if the state directory doesn't exist yet. */
if (errno == EOPNOTSUPP || errno == EPERM || errno == ENOSYS || errno == ENOENT)
{
if (create_pause_process (state_dir, argv) < 0)
_exit (EXIT_FAILURE);
}
else
{
fprintf (stderr, "cannot save namespace handles: %m\n");
_exit (EXIT_FAILURE);
}
}
}
if (sigprocmask (SIG_SETMASK, &oldsigset, NULL) < 0)
{
@@ -947,7 +1226,7 @@ check_proc_sys_userns_file (const char *path)
}
int
reexec_in_user_namespace (int ready, char *pause_pid_file_path)
reexec_in_user_namespace (int ready, char *state_dir)
{
cleanup_free char **argv = NULL;
cleanup_free char *argv0 = NULL;
@@ -1073,12 +1352,27 @@ reexec_in_user_namespace (int ready, char *pause_pid_file_path)
_exit (EXIT_FAILURE);
}
if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
if (state_dir && state_dir[0] != '\0')
{
if (create_pause_process (pause_pid_file_path, argv) < 0)
/* Try to use namespace file handles instead of a pause process. */
if (get_and_save_ns_handles_with_lock (state_dir) < 0)
{
TEMP_FAILURE_RETRY (write (ready, "2", 1));
_exit (EXIT_FAILURE);
/* Fall back to pause process only if kernel doesn't support nsfs handles,
if they are blocked (e.g. seccomp), or if the state directory doesn't exist yet. */
if (errno == EOPNOTSUPP || errno == EPERM || errno == ENOSYS || errno == ENOENT)
{
if (create_pause_process (state_dir, argv) < 0)
{
TEMP_FAILURE_RETRY (write (ready, "2", 1));
_exit (EXIT_FAILURE);
}
}
else
{
fprintf (stderr, "cannot save namespace handles: %m\n");
TEMP_FAILURE_RETRY (write (ready, "2", 1));
_exit (EXIT_FAILURE);
}
}
}

View File

@@ -31,9 +31,9 @@ import (
#include <sys/types.h>
extern uid_t rootless_uid();
extern uid_t rootless_gid();
extern int reexec_in_user_namespace(int ready, char *pause_pid_file_path);
extern int reexec_in_user_namespace(int ready, char *state_dir);
extern int reexec_in_user_namespace_wait(int pid, int options);
extern int reexec_userns_join(int pid, char *pause_pid_file_path);
extern int reexec_userns_join(int pid, char *state_dir);
extern int is_fd_inherited(int fd);
*/
import "C"
@@ -142,7 +142,7 @@ func tryMappingTool(uid bool, pid int, hostID int, mappings []idtools.IDMap) err
// joinUserAndMountNS re-exec podman in a new userNS and join the user and mount
// namespace of the specified PID without looking up its parent. Useful to join directly
// the conmon process.
func joinUserAndMountNS(pid uint, pausePid string) (bool, int, error) {
func joinUserAndMountNS(pid uint, stateDir string) (bool, int, error) {
hasCapSysAdmin, err := unshare.HasCapSysAdmin()
if err != nil {
return false, 0, err
@@ -151,10 +151,10 @@ func joinUserAndMountNS(pid uint, pausePid string) (bool, int, error) {
return false, 0, nil
}
cPausePid := C.CString(pausePid)
defer C.free(unsafe.Pointer(cPausePid))
cStateDir := C.CString(stateDir)
defer C.free(unsafe.Pointer(cStateDir))
pidC := C.reexec_userns_join(C.int(pid), cPausePid)
pidC := C.reexec_userns_join(C.int(pid), cStateDir)
if int(pidC) < 0 {
return false, -1, fmt.Errorf("cannot re-exec process to join the existing user namespace")
}
@@ -212,7 +212,7 @@ func copyMappings(from, to string) error {
return os.WriteFile(to, content, 0o600)
}
func becomeRootInUserNS(pausePid string) (_ bool, _ int, retErr error) {
func becomeRootInUserNS(stateDir string) (_ bool, _ int, retErr error) {
hasCapSysAdmin, err := unshare.HasCapSysAdmin()
if err != nil {
return false, 0, err
@@ -245,8 +245,8 @@ func becomeRootInUserNS(pausePid string) (_ bool, _ int, retErr error) {
}
}
cPausePid := C.CString(pausePid)
defer C.free(unsafe.Pointer(cPausePid))
cStateDir := C.CString(stateDir)
defer C.free(unsafe.Pointer(cStateDir))
runtime.LockOSThread()
defer runtime.UnlockOSThread()
@@ -279,7 +279,7 @@ func becomeRootInUserNS(pausePid string) (_ bool, _ int, retErr error) {
}
}()
pidC := C.reexec_in_user_namespace(C.int(r.Fd()), cPausePid)
pidC := C.reexec_in_user_namespace(C.int(r.Fd()), cStateDir)
pid = int(pidC)
if pid < 0 {
return false, -1, fmt.Errorf("cannot re-exec process")
@@ -357,7 +357,8 @@ func becomeRootInUserNS(pausePid string) (_ bool, _ int, retErr error) {
// We have lost the race for writing the PID file, as probably another
// process created a namespace and wrote the PID.
// Try to join it.
data, err := os.ReadFile(pausePid)
pausePidPath := stateDir + "/pause.pid"
data, err := os.ReadFile(pausePidPath)
if err == nil {
var pid uint64
pid, err = strconv.ParseUint(string(data), 10, 0)
@@ -417,15 +418,15 @@ func waitAndProxySignalsToChild(pid C.int) (bool, int, error) {
// into a new user namespace and the return code from the re-executed podman process.
// If podman was re-executed the caller needs to propagate the error code returned by the child
// process.
func BecomeRootInUserNS(pausePid string) (bool, int, error) {
return becomeRootInUserNS(pausePid)
func BecomeRootInUserNS(stateDir string) (bool, int, error) {
return becomeRootInUserNS(stateDir)
}
// TryJoinFromFilePaths attempts to join the namespaces of the pid files in paths.
// This is useful when there are already running containers and we
// don't have a pause process yet. We can use the paths to the conmon
// processes to attempt joining their namespaces.
func TryJoinFromFilePaths(pausePidPath string, paths []string) (bool, int, error) {
func TryJoinFromFilePaths(stateDir string, paths []string) (bool, int, error) {
var lastErr error
for _, path := range paths {
@@ -435,16 +436,16 @@ func TryJoinFromFilePaths(pausePidPath string, paths []string) (bool, int, error
continue
}
pausePid, err := strconv.Atoi(string(data))
pid, err := strconv.Atoi(string(data))
if err != nil {
lastErr = fmt.Errorf("cannot parse file %q: %w", path, err)
continue
}
if pausePid > 0 && unix.Kill(pausePid, 0) == nil {
joined, pid, err := joinUserAndMountNS(uint(pausePid), pausePidPath)
if pid > 0 && unix.Kill(pid, 0) == nil {
joined, ret, err := joinUserAndMountNS(uint(pid), stateDir)
if err == nil {
return joined, pid, nil
return joined, ret, nil
}
lastErr = err
}

View File

@@ -1,13 +1,27 @@
package rootless
import (
"path/filepath"
"reflect"
"testing"
"github.com/moby/sys/user"
spec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/assert"
)
func TestGetNamespaceHandlesPath(t *testing.T) {
stateDir := "libpod"
result := GetNamespaceHandlesPath(stateDir)
assert.Equal(t, filepath.Join(stateDir, "ns_handles"), result)
}
func TestGetPausePidPath(t *testing.T) {
stateDir := "libpod"
result := GetPausePidPath(stateDir)
assert.Equal(t, filepath.Join(stateDir, "pause.pid"), result)
}
func TestMaybeSplitMappings(t *testing.T) {
mappings := []spec.LinuxIDMapping{
{

View File

@@ -25,15 +25,15 @@ func GetRootlessConfigHomeDir() (string, error) {
return homedir.GetConfigHome()
}
// GetRootlessPauseProcessPidPath returns the path to the file that holds the pid for
// the pause process.
func GetRootlessPauseProcessPidPath() (string, error) {
// GetRootlessStateDir returns the directory that holds the rootless state
// (pause.pid and ns_handles files).
func GetRootlessStateDir() (string, error) {
runtimeDir, err := homedir.GetRuntimeDir()
if err != nil {
return "", err
}
// Note this path must be kept in sync with pkg/rootless/rootless_linux.go
// Note this path must be kept in sync with pkg/rootless/rootless_linux.c
// We only want a single pause process per user, so we do not want to use
// the tmpdir which can be changed via --tmpdir.
return filepath.Join(runtimeDir, "libpod", "tmp", "pause.pid"), nil
return filepath.Join(runtimeDir, "libpod", "tmp"), nil
}

View File

@@ -833,11 +833,11 @@ func TestProcessOptions(t *testing.T) {
}
}
func TestGetRootlessPauseProcessPidPath(t *testing.T) {
func TestGetRootlessStateDir(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("Not implemented on Windows")
}
dir, err := GetRootlessPauseProcessPidPath()
dir, err := GetRootlessStateDir()
assert.NoError(t, err)
assert.NotEqual(t, dir, "libpod/tmp/pause.pid")
assert.NotEqual(t, dir, "libpod/tmp")
}

View File

@@ -23,10 +23,10 @@ func GetContainerPidInformationDescriptors() ([]string, error) {
return nil, fmt.Errorf("GetContainerPidInformationDescriptors: %w", errNotImplemented)
}
// GetRootlessPauseProcessPidPath returns the path to the file that holds the pid for
// the pause process
func GetRootlessPauseProcessPidPath() (string, error) {
return "", fmt.Errorf("GetRootlessPauseProcessPidPath: %w", errNotImplemented)
// GetRootlessStateDir returns the directory that holds the rootless state
// (pause.pid and ns_handles files).
func GetRootlessStateDir() (string, error) {
return "", fmt.Errorf("GetRootlessStateDir: %w", errNotImplemented)
}
// GetRootlessRuntimeDir returns the runtime directory

View File

@@ -212,7 +212,9 @@ function __run_healthcheck_container() {
kill -9 ${conmon_pid}
run_podman rm -f -t0 $cname
# When conmon is killed, the ns_handles may become stale and produce a warning
# about creating a new rootless user namespace. This is expected behavior.
run_podman 0+w rm -f -t0 $cname
run_podman 125 container inspect $cname
assert "$output" =~ "no such container \"$cname\"" "Container should be removed"

View File

@@ -60,13 +60,15 @@ SocketMode=0660
WantedBy=sockets.target
EOF
# ensure pause die before each test runs
# ensure pause process/ns_handles are removed before each test runs
if is_rootless; then
local pause_pid_file="$XDG_RUNTIME_DIR/libpod/tmp/pause.pid"
local ns_handles_file="$XDG_RUNTIME_DIR/libpod/tmp/ns_handles"
if [ -f $pause_pid_file ]; then
kill -9 $(< $pause_pid_file) 2> /dev/null
rm -f $pause_pid_file
fi
rm -f $ns_handles_file
fi
systemctl_start "$SERVICE_NAME.socket"
}
@@ -105,13 +107,17 @@ function teardown() {
cid="$output"
local pause_pid_file="$XDG_RUNTIME_DIR/libpod/tmp/pause.pid"
if [ ! -f $pause_pid_file ]; then
# This seems unlikely, but not impossible
die "Pause pid file does not exist: $pause_pid_file"
fi
local ns_handles_file="$XDG_RUNTIME_DIR/libpod/tmp/ns_handles"
echo "kill -9 $(< $pause_pid_file) [pause process]"
kill -9 $(< $pause_pid_file)
if [ -f $ns_handles_file ]; then
echo "Removing ns_handles file: $ns_handles_file"
rm -f $ns_handles_file
elif [ -f $pause_pid_file ]; then
echo "kill -9 $(< $pause_pid_file) [pause process]"
kill -9 $(< $pause_pid_file)
else
die "Neither ns_handles file nor pause.pid file exists"
fi
run curl -s --max-time 3 --unix-socket $SERVICE_SOCK_ADDR $_PING
echo "curl output: $output"

View File

@@ -16,11 +16,19 @@ function setup_file() {
}
function _check_pause_process() {
# do not mark this variable as local; our caller expects it
# do not mark these variables as local; our caller expects them
pause_pid_file="$XDG_RUNTIME_DIR/libpod/tmp/pause.pid"
test -e $pause_pid_file || die "Pause pid file $pause_pid_file missing"
ns_handles_file="$XDG_RUNTIME_DIR/libpod/tmp/ns_handles"
pause_pid=""
# Check that either ns_handles or pause.pid exists
if [ -e $ns_handles_file ]; then
# ns_handles file exists, no pause process needed
return
fi
test -e $pause_pid_file || die "Neither ns_handles file ($ns_handles_file) nor pause.pid file ($pause_pid_file) exists"
# do not mark this variable as local; our caller expects it
pause_pid=$(<$pause_pid_file)
test -d /proc/$pause_pid || die "Pause process $pause_pid (from $pause_pid_file) is not running"
@@ -51,26 +59,31 @@ function _check_pause_process() {
# Use podman system migrate to stop the currently running pause process
run_podman system migrate
# After migrate, there must be no pause process
# After migrate, there must be no pause process or ns_handles
# Note: pause_pid_file and ns_handles_file are set by _check_pause_process above
test -e $pause_pid_file && die "Pause pid file $pause_pid_file still exists, even after podman system migrate"
test -e $ns_handles_file && die "ns_handles file $ns_handles_file still exists, even after podman system migrate"
run kill -0 $pause_pid
test $status -eq 0 && die "Pause process $pause_pid is still running even after podman system migrate"
if [[ -n "$pause_pid" ]]; then
run kill -0 $pause_pid
test $status -eq 0 && die "Pause process $pause_pid is still running even after podman system migrate"
fi
run_podman $(podman_isolation_opts ${PODMAN_TMPDIR}) $getns
tmpdir_userns="$output"
# And now we should once again have a pause process
# And now we should once again have a pause process or ns_handles
_check_pause_process
# and all podmans, with & without --tmpdir, should use the same ns
run_podman $getns
assert "$output" == "$tmpdir_userns" \
"podman should use the same userns created using a tmpdir"
if [ -e $pause_pid_file ]; then
run_podman $getns
assert "$output" == "$tmpdir_userns" \
"podman should use the same userns created using a tmpdir"
run_podman --tmpdir $PODMAN_TMPDIR/tmp2 $getns
assert "$output" == "$tmpdir_userns" \
"podman with tmpdir2 should use the same userns created using a tmpdir"
run_podman --tmpdir $PODMAN_TMPDIR/tmp2 $getns
assert "$output" == "$tmpdir_userns" \
"podman with tmpdir2 should use the same userns created using a tmpdir"
fi
}
# https://github.com/containers/podman/issues/16091
@@ -105,9 +118,14 @@ function _check_pause_process() {
run_podman unshare readlink /proc/self/ns/user
userns="$output"
# check for pause pid and then kill it
# Check for pause pid or ns_handles file, and remove/kill it
# Note: _check_pause_process sets ns_handles_file and pause_pid
_check_pause_process
kill -9 $pause_pid
if [ -e $ns_handles_file ]; then
rm -f $ns_handles_file
elif [ -n "$pause_pid" ]; then
kill -9 $pause_pid
fi
# Now again directly start podman run and make sure it can forward signals
# We're forced to use $PODMAN because run_podman cannot be backgrounded
@@ -161,9 +179,14 @@ function _check_pause_process() {
run_podman inspect --format '{{.State.ConmonPid}}' $cname
conmon_pid="$output"
# check for pause pid and then kill it
# Check for pause pid or ns_handles file, and remove/kill it
# Note: _check_pause_process sets ns_handles_file and pause_pid
_check_pause_process
kill -9 $pause_pid
if [ -e $ns_handles_file ]; then
rm -f $ns_handles_file
elif [ -n "$pause_pid" ]; then
kill -9 $pause_pid
fi
# kill conmon
kill -9 $conmon_pid