Merge pull request #27604 from Luap99/migrate

podman system migrate fixes when pause process and conmon got killed
This commit is contained in:
openshift-merge-bot[bot]
2025-11-25 19:31:33 +00:00
committed by GitHub
3 changed files with 49 additions and 14 deletions

View File

@@ -4,6 +4,7 @@ package abi
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"os" "os"
@@ -14,6 +15,7 @@ import (
"go.podman.io/common/pkg/config" "go.podman.io/common/pkg/config"
"go.podman.io/common/pkg/systemd" "go.podman.io/common/pkg/systemd"
"go.podman.io/storage/pkg/unshare" "go.podman.io/storage/pkg/unshare"
"golang.org/x/sys/unix"
) )
// Default path for system runtime state // Default path for system runtime state
@@ -59,6 +61,8 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool,
} }
} }
} }
// Return early: we are already re-exec'd or running as root here, so there is no need to join the rootless userns.
return nil return nil
} }
@@ -74,36 +78,41 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool,
if became { if became {
os.Exit(ret) os.Exit(ret)
} }
if noMoveProcess {
return nil
}
// if there is no pid file, try to join existing containers, and create a pause process. // if there is no pid file, try to join existing containers, and create a pause process.
ctrs, err := ic.Libpod.GetRunningContainers() ctrs, err := ic.Libpod.GetRunningContainers()
if err != nil { if err != nil {
logrus.Error(err.Error()) return err
os.Exit(1)
} }
paths := []string{} paths := make([]string, 0, len(ctrs))
for _, ctr := range ctrs { for _, ctr := range ctrs {
paths = append(paths, ctr.ConfigNoCopy().ConmonPidFile) paths = append(paths, ctr.ConfigNoCopy().ConmonPidFile)
} }
if len(paths) > 0 { if len(paths) > 0 {
became, ret, err = rootless.TryJoinFromFilePaths(pausePidPath, paths) became, ret, err = rootless.TryJoinFromFilePaths(pausePidPath, paths)
} else { // TryJoinFromFilePaths fails with ESRCH when none of the PIDs are valid anymore.
// In this case create a new userns.
if errors.Is(err, unix.ESRCH) {
logrus.Warnf("Failed to join existing conmon namespace, creating a new rootless podman user namespace. If there are existing container running please stop them with %q to reset the namespace", os.Args[0]+" system migrate")
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath) became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
if err == nil {
systemd.MovePauseProcessToScope(pausePidPath)
} }
} else {
logrus.Info("Creating a new rootless user namespace")
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
} }
if err != nil { if err != nil {
logrus.Error(fmt.Errorf("invalid internal status, try resetting the pause process with %q: %w", os.Args[0]+" system migrate", err)) return fmt.Errorf("fatal error, invalid internal status, unable to create a new pause process: %w. Try running %q and if that doesn't work reboot to recover", err, os.Args[0]+" system migrate")
os.Exit(1) }
if !noMoveProcess {
systemd.MovePauseProcessToScope(pausePidPath)
} }
if became { if became {
os.Exit(ret) os.Exit(ret)
} }
logrus.Error("Internal error, failed to re-exec podman into user namespace without error. This should never happen, if you see this please report a bug")
return nil return nil
} }

View File

@@ -384,8 +384,7 @@ can_use_shortcut (char **argv)
|| strcmp (argv[argc], "version") == 0 || strcmp (argv[argc], "version") == 0
|| strcmp (argv[argc], "context") == 0 || strcmp (argv[argc], "context") == 0
|| strcmp (argv[argc], "search") == 0 || strcmp (argv[argc], "search") == 0
|| strcmp (argv[argc], "compose") == 0 || strcmp (argv[argc], "compose") == 0)
|| (strcmp (argv[argc], "system") == 0 && argv[argc+1] && strcmp (argv[argc+1], "service") != 0))
{ {
ret = false; ret = false;
break; break;

View File

@@ -149,3 +149,30 @@ function _check_pause_process() {
# This used to hang trying to unmount the netns. # This used to hang trying to unmount the netns.
run_podman rm -f -t0 $cname run_podman rm -f -t0 $cname
} }
# regression test for https://issues.redhat.com/browse/RHEL-130252
# Starts a detached container, SIGKILLs both the pause process and the
# container's conmon, then checks the output of "podman system migrate" and
# that the container can still be removed afterwards.
@test "podman system migrate works with conmon being killed" {
skip_if_not_rootless "pause process is only used as rootless"
skip_if_remote "system migrate not supported via remote"
local cname=c-$(safename)
run_podman run --name $cname --stop-signal SIGKILL -d $IMAGE sleep 100
# Record conmon's PID before anything gets killed.
run_podman inspect --format '{{.State.ConmonPid}}' $cname
conmon_pid="$output"
# check for pause pid and then kill it
# NOTE(review): $pause_pid is presumably set by _check_pause_process — confirm against the helper.
_check_pause_process
kill -9 $pause_pid
# kill conmon
kill -9 $conmon_pid
# Run podman system migrate now that both the pause process and conmon are gone.
# NOTE(review): the leading 125 is presumably the expected exit status for run_podman — confirm.
run_podman 125 system migrate
# The output must contain the fallback warning about failing to join the
# conmon namespace as well as the "conmon process killed" message.
assert "$output" =~ "Failed to join existing conmon namespace" "fallback to userns creating"
assert "$output" =~ "conmon process killed"
# Now the removal command should work fine without errors.
run_podman rm $cname
}