Fall back to a new rootless user namespace when joining conmon's fails.

SetupRootless now calls rootless.BecomeRootInUserNS when
rootless.TryJoinFromFilePaths returns ESRCH, logs which path was taken,
calls systemd.MovePauseProcessToScope after either path succeeds (unless
noMoveProcess is set), and returns a more detailed error otherwise.
A system test runs "podman system migrate" after both the pause process
and conmon have been killed.

diff --git a/pkg/domain/infra/abi/system_linux.go b/pkg/domain/infra/abi/system_linux.go
index da16961e69..6f1f6ff229 100644
--- a/pkg/domain/infra/abi/system_linux.go
+++ b/pkg/domain/infra/abi/system_linux.go
@@ -4,6 +4,7 @@ package abi
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"os"
 
@@ -14,6 +15,7 @@ import (
 	"go.podman.io/common/pkg/config"
 	"go.podman.io/common/pkg/systemd"
 	"go.podman.io/storage/pkg/unshare"
+	"golang.org/x/sys/unix"
 )
 
 // Default path for system runtime state
@@ -90,14 +92,22 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool,
 
 	if len(paths) > 0 {
 		became, ret, err = rootless.TryJoinFromFilePaths(pausePidPath, paths)
-	} else {
-		became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
-		if err == nil && !noMoveProcess {
-			systemd.MovePauseProcessToScope(pausePidPath)
+		// TryJoinFromFilePaths fails with ESRCH when none of the PIDs are valid anymore.
+		// In this case create a new userns.
+		if errors.Is(err, unix.ESRCH) {
+			logrus.Warnf("Failed to join existing conmon namespace, creating a new rootless podman user namespace. If there are existing container running please stop them with %q to reset the namespace", os.Args[0]+" system migrate")
+			became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
 		}
+	} else {
+		logrus.Info("Creating a new rootless user namespace")
+		became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
 	}
+
 	if err != nil {
-		return fmt.Errorf("invalid internal status, try resetting the pause process with %q: %w", os.Args[0]+" system migrate", err)
+		return fmt.Errorf("fatal error, invalid internal status, unable to create a new pause process: %w. \nTry running %q and if that doesn't work reboot to recover", err, os.Args[0]+" system migrate")
+	}
+	if !noMoveProcess {
+		systemd.MovePauseProcessToScope(pausePidPath)
 	}
 	if became {
 		os.Exit(ret)
diff --git a/test/system/550-pause-process.bats b/test/system/550-pause-process.bats
index da657e77e1..7818f36bb9 100644
--- a/test/system/550-pause-process.bats
+++ b/test/system/550-pause-process.bats
@@ -149,3 +149,30 @@ function _check_pause_process() {
     # This used to hang trying to unmount the netns.
     run_podman rm -f -t0 $cname
 }
+
+# regression test for https://issues.redhat.com/browse/RHEL-130252
+@test "podman system migrate works with conmon being killed" {
+    skip_if_not_rootless "pause process is only used as rootless"
+    skip_if_remote "system migrate not supported via remote"
+
+    local cname=c-$(safename)
+    run_podman run --name $cname --stop-signal SIGKILL -d $IMAGE sleep 100
+
+    run_podman inspect --format '{{.State.ConmonPid}}' $cname
+    conmon_pid="$output"
+
+    # check for pause pid and then kill it
+    _check_pause_process
+    kill -9 $pause_pid
+
+    # kill conmon
+    kill -9 $conmon_pid
+
+    # Use podman system migrate to stop the currently running pause process
+    run_podman 125 system migrate
+    assert "$output" =~ "Failed to join existing conmon namespace" "fallback to userns creating"
+    assert "$output" =~ "conmon process killed"
+
+    # Now the removal command should work fine without errors.
+    run_podman rm $cname
+}