mirror of
https://github.com/containers/podman.git
synced 2025-11-28 17:18:58 +08:00
SetupRootless: handle the case where the conmon PIDs are not valid
When trying to join the conmon PIDs to recreate the pause process based on their namespace, a PID may no longer be valid, e.g. when conmon crashed or was killed. Currently this is a big issue that can be reproduced using:

    $ podman run -d quay.io/libpod/testimage:20241011 sleep 100
    $ killall -9 conmon
    $ killall catatonit

After that, all commands fail because we keep trying to rejoin the namespace of the non-existent conmon process. To address that, fall back to creating a new namespace if we fail to join the conmon PIDs.

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
@@ -4,6 +4,7 @@ package abi
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
@@ -14,6 +15,7 @@ import (
|
|||||||
"go.podman.io/common/pkg/config"
|
"go.podman.io/common/pkg/config"
|
||||||
"go.podman.io/common/pkg/systemd"
|
"go.podman.io/common/pkg/systemd"
|
||||||
"go.podman.io/storage/pkg/unshare"
|
"go.podman.io/storage/pkg/unshare"
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Default path for system runtime state
|
// Default path for system runtime state
|
||||||
@@ -90,14 +92,22 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool,
|
|||||||
|
|
||||||
if len(paths) > 0 {
|
if len(paths) > 0 {
|
||||||
became, ret, err = rootless.TryJoinFromFilePaths(pausePidPath, paths)
|
became, ret, err = rootless.TryJoinFromFilePaths(pausePidPath, paths)
|
||||||
} else {
|
// TryJoinFromFilePaths fails with ESRCH when the PID are all not valid anymore
|
||||||
|
// In this case create a new userns.
|
||||||
|
if errors.Is(err, unix.ESRCH) {
|
||||||
|
logrus.Warnf("Failed to join existing conmon namespace, creating a new rootless podman user namespace. If there are existing container running please stop them with %q to reset the namespace", os.Args[0]+" system migrate")
|
||||||
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
|
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
|
||||||
if err == nil && !noMoveProcess {
|
|
||||||
systemd.MovePauseProcessToScope(pausePidPath)
|
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
logrus.Info("Creating a new rootless user namespace")
|
||||||
|
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("invalid internal status, try resetting the pause process with %q: %w", os.Args[0]+" system migrate", err)
|
return fmt.Errorf("fatal error, invalid internal status, unable to create a new pause process: %w. Try running %q and if that doesn't work reboot to recover", err, os.Args[0]+" system migrate")
|
||||||
|
}
|
||||||
|
if !noMoveProcess {
|
||||||
|
systemd.MovePauseProcessToScope(pausePidPath)
|
||||||
}
|
}
|
||||||
if became {
|
if became {
|
||||||
os.Exit(ret)
|
os.Exit(ret)
|
||||||
|
|||||||
@@ -149,3 +149,30 @@ function _check_pause_process() {
|
|||||||
# This used to hang trying to unmount the netns.
|
# This used to hang trying to unmount the netns.
|
||||||
run_podman rm -f -t0 $cname
|
run_podman rm -f -t0 $cname
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# regression test for https://issues.redhat.com/browse/RHEL-130252
|
||||||
|
@test "podman system migrate works with conmon being killed" {
|
||||||
|
skip_if_not_rootless "pause process is only used as rootless"
|
||||||
|
skip_if_remote "system migrate not supported via remote"
|
||||||
|
|
||||||
|
local cname=c-$(safename)
|
||||||
|
run_podman run --name $cname --stop-signal SIGKILL -d $IMAGE sleep 100
|
||||||
|
|
||||||
|
run_podman inspect --format '{{.State.ConmonPid}}' $cname
|
||||||
|
conmon_pid="$output"
|
||||||
|
|
||||||
|
# check for pause pid and then kill it
|
||||||
|
_check_pause_process
|
||||||
|
kill -9 $pause_pid
|
||||||
|
|
||||||
|
# kill conmon
|
||||||
|
kill -9 $conmon_pid
|
||||||
|
|
||||||
|
# Use podman system migrate to stop the currently running pause process
|
||||||
|
run_podman 125 system migrate
|
||||||
|
assert "$output" =~ "Failed to join existing conmon namespace" "fallback to userns creating"
|
||||||
|
assert "$output" =~ "conmon process killed"
|
||||||
|
|
||||||
|
# Now the removal command should work fine without errors.
|
||||||
|
run_podman rm $cname
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user