Merge pull request #27604 from Luap99/migrate

podman system migrate fixes when pause process and conmon got killed
This commit is contained in:
openshift-merge-bot[bot]
2025-11-25 19:31:33 +00:00
committed by GitHub
3 changed files with 49 additions and 14 deletions

View File

@@ -4,6 +4,7 @@ package abi
import (
"context"
"errors"
"fmt"
"os"
@@ -14,6 +15,7 @@ import (
"go.podman.io/common/pkg/config"
"go.podman.io/common/pkg/systemd"
"go.podman.io/storage/pkg/unshare"
"golang.org/x/sys/unix"
)
// Default path for system runtime state
@@ -59,6 +61,8 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool,
}
}
}
// Return early: we are already re-exec'd or root here, so there is no need to join the rootless userns.
return nil
}
@@ -74,36 +78,41 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool,
if became {
os.Exit(ret)
}
if noMoveProcess {
return nil
}
// if there is no pid file, try to join existing containers, and create a pause process.
ctrs, err := ic.Libpod.GetRunningContainers()
if err != nil {
logrus.Error(err.Error())
os.Exit(1)
return err
}
paths := []string{}
paths := make([]string, 0, len(ctrs))
for _, ctr := range ctrs {
paths = append(paths, ctr.ConfigNoCopy().ConmonPidFile)
}
if len(paths) > 0 {
became, ret, err = rootless.TryJoinFromFilePaths(pausePidPath, paths)
} else {
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
if err == nil {
systemd.MovePauseProcessToScope(pausePidPath)
// TryJoinFromFilePaths fails with ESRCH when none of the PIDs are valid anymore.
// In this case, create a new userns.
if errors.Is(err, unix.ESRCH) {
logrus.Warnf("Failed to join existing conmon namespace, creating a new rootless podman user namespace. If there are existing container running please stop them with %q to reset the namespace", os.Args[0]+" system migrate")
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
}
} else {
logrus.Info("Creating a new rootless user namespace")
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
}
if err != nil {
logrus.Error(fmt.Errorf("invalid internal status, try resetting the pause process with %q: %w", os.Args[0]+" system migrate", err))
os.Exit(1)
return fmt.Errorf("fatal error, invalid internal status, unable to create a new pause process: %w. Try running %q and if that doesn't work reboot to recover", err, os.Args[0]+" system migrate")
}
if !noMoveProcess {
systemd.MovePauseProcessToScope(pausePidPath)
}
if became {
os.Exit(ret)
}
logrus.Error("Internal error, failed to re-exec podman into user namespace without error. This should never happen, if you see this please report a bug")
return nil
}

View File

@@ -384,8 +384,7 @@ can_use_shortcut (char **argv)
|| strcmp (argv[argc], "version") == 0
|| strcmp (argv[argc], "context") == 0
|| strcmp (argv[argc], "search") == 0
|| strcmp (argv[argc], "compose") == 0
|| (strcmp (argv[argc], "system") == 0 && argv[argc+1] && strcmp (argv[argc+1], "service") != 0))
|| strcmp (argv[argc], "compose") == 0)
{
ret = false;
break;

View File

@@ -149,3 +149,30 @@ function _check_pause_process() {
# This used to hang trying to unmount the netns.
run_podman rm -f -t0 $cname
}
# regression test for https://issues.redhat.com/browse/RHEL-130252
# Scenario: start a rootless container, kill both the pause process and the
# container's conmon with SIGKILL, then run "podman system migrate". The
# migrate output must contain the namespace-join fallback warning and the
# "conmon process killed" message, and the container must be removable afterwards.
@test "podman system migrate works with conmon being killed" {
skip_if_not_rootless "pause process is only used as rootless"
skip_if_remote "system migrate not supported via remote"
local cname=c-$(safename)
# Start a detached container; its conmon PID is queried right below.
run_podman run --name $cname --stop-signal SIGKILL -d $IMAGE sleep 100
# Save the conmon PID reported by inspect so conmon can be killed later.
run_podman inspect --format '{{.State.ConmonPid}}' $cname
conmon_pid="$output"
# check for pause pid and then kill it
# NOTE(review): pause_pid is presumably set by _check_pause_process (its body is not visible here) — confirm.
_check_pause_process
kill -9 $pause_pid
# kill conmon
kill -9 $conmon_pid
# Use podman system migrate to stop the currently running pause process
# NOTE(review): the leading 125 is presumably the exit status run_podman expects — confirm against the helper.
run_podman 125 system migrate
assert "$output" =~ "Failed to join existing conmon namespace" "fallback to userns creating"
assert "$output" =~ "conmon process killed"
# Now the removal command should work fine without errors.
run_podman rm $cname
}