Files
podman/libpod/container_internal_linux.go
Adrian Reber f7c8fd8a3d Add support to checkpoint/restore containers
runc uses CRIU to support checkpoint and restore of containers. This
brings an initial checkpoint/restore implementation to podman.

None of the additional runc flags are yet supported and container
migration optimization (pre-copy/post-copy) is also left for the future.

The current status is that it is possible to checkpoint and restore a
container. I am testing on RHEL-7.x and as the combination of RHEL-7 and
CRIU has seccomp troubles I have to create the container without
seccomp.

With the following steps I am able to checkpoint and restore a
container:

 # podman run --security-opt="seccomp=unconfined" -d registry.fedoraproject.org/f27/httpd
 # curl -I 10.22.0.78:8080
 HTTP/1.1 403 Forbidden # <-- this is actually a good answer
 # podman container checkpoint <container>
 # curl -I 10.22.0.78:8080
 curl: (7) Failed connect to 10.22.0.78:8080; No route to host
 # podman container restore <container>
 # curl -I 10.22.0.78:8080
 HTTP/1.1 403 Forbidden

I am using CRIU, runc and conmon from git. All required changes for
checkpoint/restore support in podman have been merged in the
corresponding projects.

To have the same IP address in the restored container as before
checkpointing, CNI is told which IP address to use.

If the saved network configuration cannot be found during restore, the
container is restored with a new IP address.

For CRIU to restore established TCP connections the IP address of the
network namespace used for restore needs to be the same. For TCP
connections in the listening state the IP address can change.

During restore only one network interface with one IP address is handled
correctly. Support to restore containers with more advanced network
configuration will be implemented later.

v2:
 * comment typo
 * print debug messages during cleanup of restore files
 * use createContainer() instead of createOCIContainer()
 * introduce helper CheckpointPath()
 * do not try to restore a container that is paused
 * use existing helper functions for cleanup
 * restructure code flow for better readability
 * do not try to restore if checkpoint/inventory.img is missing
 * git add checkpoint.go restore.go

v3:
 * move checkpoint/restore under 'podman container'

v4:
 * incorporated changes from latest reviews

Signed-off-by: Adrian Reber <areber@redhat.com>
2018-10-03 21:41:39 +02:00

468 lines
14 KiB
Go

// +build linux
package libpod
import (
"context"
"encoding/json"
"fmt"
"io/ioutil"
"net"
"os"
"path"
"path/filepath"
"strings"
"syscall"
"time"
cnitypes "github.com/containernetworking/cni/pkg/types/current"
crioAnnotations "github.com/containers/libpod/pkg/annotations"
"github.com/containers/libpod/pkg/chrootuser"
"github.com/containers/libpod/pkg/rootless"
"github.com/containers/storage/pkg/idtools"
spec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/runtime-tools/generate"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func (c *Container) mountSHM(shmOptions string) error {
if err := unix.Mount("shm", c.config.ShmDir, "tmpfs", unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV,
label.FormatMountLabel(shmOptions, c.config.MountLabel)); err != nil {
return errors.Wrapf(err, "failed to mount shm tmpfs %q", c.config.ShmDir)
}
return nil
}
func (c *Container) unmountSHM(mount string) error {
if err := unix.Unmount(mount, unix.MNT_DETACH); err != nil {
if err != syscall.EINVAL {
logrus.Warnf("container %s failed to unmount %s : %v", c.ID(), mount, err)
}
}
return nil
}
// prepare mounts the container and sets up other required resources like net
// namespaces
func (c *Container) prepare() (err error) {
// Mount storage if not mounted
if err := c.mountStorage(); err != nil {
return err
}
// Set up network namespace if not already set up
if c.config.CreateNetNS && c.state.NetNS == nil && !c.config.PostConfigureNetNS {
if err := c.runtime.createNetNS(c); err != nil {
// Tear down storage before exiting to make sure we
// don't leak mounts
if err2 := c.cleanupStorage(); err2 != nil {
logrus.Errorf("Error cleaning up storage for container %s: %v", c.ID(), err2)
}
return err
}
}
return nil
}
// cleanupNetwork unmounts and cleans up the container's network
func (c *Container) cleanupNetwork() error {
if c.state.NetNS == nil {
logrus.Debugf("Network is already cleaned up, skipping...")
return nil
}
// Stop the container's network namespace (if it has one)
if err := c.runtime.teardownNetNS(c); err != nil {
logrus.Errorf("unable to cleanup network for container %s: %q", c.ID(), err)
}
c.state.NetNS = nil
c.state.NetworkStatus = nil
if c.valid {
return c.save()
}
return nil
}
// Generate spec for a container
// Accepts a map of the container's dependencies
func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) {
g := generate.NewFromSpec(c.config.Spec)
// If network namespace was requested, add it now
if c.config.CreateNetNS {
if c.config.PostConfigureNetNS {
g.AddOrReplaceLinuxNamespace(spec.NetworkNamespace, "")
} else {
g.AddOrReplaceLinuxNamespace(spec.NetworkNamespace, c.state.NetNS.Path())
}
}
// Check if the spec file mounts contain the label Relabel flags z or Z.
// If they do, relabel the source directory and then remove the option.
for _, m := range g.Mounts() {
var options []string
for _, o := range m.Options {
switch o {
case "z":
fallthrough
case "Z":
if err := label.Relabel(m.Source, c.MountLabel(), label.IsShared(o)); err != nil {
return nil, errors.Wrapf(err, "relabel failed %q", m.Source)
}
default:
options = append(options, o)
}
}
m.Options = options
}
g.SetProcessSelinuxLabel(c.ProcessLabel())
g.SetLinuxMountLabel(c.MountLabel())
// Remove the default /dev/shm mount to ensure we overwrite it
g.RemoveMount("/dev/shm")
// Add bind mounts to container
for dstPath, srcPath := range c.state.BindMounts {
newMount := spec.Mount{
Type: "bind",
Source: srcPath,
Destination: dstPath,
Options: []string{"bind", "private"},
}
if c.IsReadOnly() {
newMount.Options = append(newMount.Options, "ro")
}
if !MountExists(g.Mounts(), dstPath) {
g.AddMount(newMount)
} else {
logrus.Warnf("User mount overriding libpod mount at %q", dstPath)
}
}
var err error
if !rootless.IsRootless() {
if c.state.ExtensionStageHooks, err = c.setupOCIHooks(ctx, g.Config); err != nil {
return nil, errors.Wrapf(err, "error setting up OCI Hooks")
}
}
// Bind builtin image volumes
if c.config.Rootfs == "" && c.config.ImageVolumes {
if err := c.addLocalVolumes(ctx, &g); err != nil {
return nil, errors.Wrapf(err, "error mounting image volumes")
}
}
if c.config.User != "" {
if !c.state.Mounted {
return nil, errors.Wrapf(ErrCtrStateInvalid, "container %s must be mounted in order to translate User field", c.ID())
}
uid, gid, err := chrootuser.GetUser(c.state.Mountpoint, c.config.User)
if err != nil {
return nil, err
}
// User and Group must go together
g.SetProcessUID(uid)
g.SetProcessGID(gid)
}
// Add addition groups if c.config.GroupAdd is not empty
if len(c.config.Groups) > 0 {
if !c.state.Mounted {
return nil, errors.Wrapf(ErrCtrStateInvalid, "container %s must be mounted in order to add additional groups", c.ID())
}
for _, group := range c.config.Groups {
gid, err := chrootuser.GetGroup(c.state.Mountpoint, group)
if err != nil {
return nil, err
}
g.AddProcessAdditionalGid(gid)
}
}
// Look up and add groups the user belongs to, if a group wasn't directly specified
if !rootless.IsRootless() && !strings.Contains(c.config.User, ":") {
groups, err := chrootuser.GetAdditionalGroupsForUser(c.state.Mountpoint, uint64(g.Config.Process.User.UID))
if err != nil && errors.Cause(err) != chrootuser.ErrNoSuchUser {
return nil, err
}
for _, gid := range groups {
g.AddProcessAdditionalGid(gid)
}
}
// Add shared namespaces from other containers
if c.config.IPCNsCtr != "" {
if err := c.addNamespaceContainer(&g, IPCNS, c.config.IPCNsCtr, spec.IPCNamespace); err != nil {
return nil, err
}
}
if c.config.MountNsCtr != "" {
if err := c.addNamespaceContainer(&g, MountNS, c.config.MountNsCtr, spec.MountNamespace); err != nil {
return nil, err
}
}
if c.config.NetNsCtr != "" {
if err := c.addNamespaceContainer(&g, NetNS, c.config.NetNsCtr, spec.NetworkNamespace); err != nil {
return nil, err
}
}
if c.config.PIDNsCtr != "" {
if err := c.addNamespaceContainer(&g, PIDNS, c.config.PIDNsCtr, string(spec.PIDNamespace)); err != nil {
return nil, err
}
}
if c.config.UserNsCtr != "" {
if err := c.addNamespaceContainer(&g, UserNS, c.config.UserNsCtr, spec.UserNamespace); err != nil {
return nil, err
}
}
if c.config.UTSNsCtr != "" {
if err := c.addNamespaceContainer(&g, UTSNS, c.config.UTSNsCtr, spec.UTSNamespace); err != nil {
return nil, err
}
}
if c.config.CgroupNsCtr != "" {
if err := c.addNamespaceContainer(&g, CgroupNS, c.config.CgroupNsCtr, spec.CgroupNamespace); err != nil {
return nil, err
}
}
if c.config.Rootfs == "" {
if err := idtools.MkdirAllAs(c.state.RealMountpoint, 0700, c.RootUID(), c.RootGID()); err != nil {
return nil, err
}
}
g.SetRootPath(c.state.RealMountpoint)
g.AddAnnotation(crioAnnotations.Created, c.config.CreatedTime.Format(time.RFC3339Nano))
g.AddAnnotation("org.opencontainers.image.stopSignal", fmt.Sprintf("%d", c.config.StopSignal))
for _, i := range c.config.Spec.Linux.Namespaces {
if string(i.Type) == spec.UTSNamespace {
hostname := c.Hostname()
g.SetHostname(hostname)
g.AddProcessEnv("HOSTNAME", hostname)
break
}
}
// Only add container environment variable if not already present
foundContainerEnv := false
for _, env := range g.Config.Process.Env {
if strings.HasPrefix(env, "container=") {
foundContainerEnv = true
break
}
}
if !foundContainerEnv {
g.AddProcessEnv("container", "libpod")
}
if rootless.IsRootless() {
g.SetLinuxCgroupsPath("")
} else if c.runtime.config.CgroupManager == SystemdCgroupsManager {
// When runc is set to use Systemd as a cgroup manager, it
// expects cgroups to be passed as follows:
// slice:prefix:name
systemdCgroups := fmt.Sprintf("%s:libpod:%s", path.Base(c.config.CgroupParent), c.ID())
logrus.Debugf("Setting CGroups for container %s to %s", c.ID(), systemdCgroups)
g.SetLinuxCgroupsPath(systemdCgroups)
} else {
cgroupPath, err := c.CGroupPath()
if err != nil {
return nil, err
}
logrus.Debugf("Setting CGroup path for container %s to %s", c.ID(), cgroupPath)
g.SetLinuxCgroupsPath(cgroupPath)
}
// Mounts need to be sorted so paths will not cover other paths
mounts := sortMounts(g.Mounts())
g.ClearMounts()
for _, m := range mounts {
g.AddMount(m)
}
return g.Config, nil
}
// Add an existing container's namespace to the spec
func (c *Container) addNamespaceContainer(g *generate.Generator, ns LinuxNS, ctr string, specNS string) error {
nsCtr, err := c.runtime.state.Container(ctr)
if err != nil {
return errors.Wrapf(err, "error retrieving dependency %s of container %s from state", ctr, c.ID())
}
// TODO need unlocked version of this for use in pods
nsPath, err := nsCtr.NamespacePath(ns)
if err != nil {
return err
}
if err := g.AddOrReplaceLinuxNamespace(specNS, nsPath); err != nil {
return err
}
return nil
}
func (c *Container) checkpoint(ctx context.Context, keep bool) (err error) {
if c.state.State != ContainerStateRunning {
return errors.Wrapf(ErrCtrStateInvalid, "%q is not running, cannot checkpoint", c.state.State)
}
if err := c.runtime.ociRuntime.checkpointContainer(c); err != nil {
return err
}
// Save network.status. This is needed to restore the container with
// the same IP. Currently limited to one IP address in a container
// with one interface.
formatJSON, err := json.MarshalIndent(c.state.NetworkStatus, "", " ")
if err != nil {
return err
}
if err := ioutil.WriteFile(filepath.Join(c.bundlePath(), "network.status"), formatJSON, 0644); err != nil {
return err
}
logrus.Debugf("Checkpointed container %s", c.ID())
c.state.State = ContainerStateStopped
// Cleanup Storage and Network
if err := c.cleanup(ctx); err != nil {
return err
}
if !keep {
// Remove log file
os.Remove(filepath.Join(c.bundlePath(), "dump.log"))
// Remove statistic file
os.Remove(filepath.Join(c.bundlePath(), "stats-dump"))
}
return c.save()
}
func (c *Container) restore(ctx context.Context, keep bool) (err error) {
if (c.state.State != ContainerStateConfigured) && (c.state.State != ContainerStateExited) {
return errors.Wrapf(ErrCtrStateInvalid, "container %s is running or paused, cannot restore", c.ID())
}
// Let's try to stat() CRIU's inventory file. If it does not exist, it makes
// no sense to try a restore. This is a minimal check if a checkpoint exist.
if _, err := os.Stat(filepath.Join(c.CheckpointPath(), "inventory.img")); os.IsNotExist(err) {
return errors.Wrapf(err, "A complete checkpoint for this container cannot be found, cannot restore")
}
// Read network configuration from checkpoint
// Currently only one interface with one IP is supported.
networkStatusFile, err := os.Open(filepath.Join(c.bundlePath(), "network.status"))
if err == nil {
// The file with the network.status does exist. Let's restore the
// container with the same IP address as during checkpointing.
defer networkStatusFile.Close()
var networkStatus []*cnitypes.Result
networkJSON, err := ioutil.ReadAll(networkStatusFile)
if err != nil {
return err
}
json.Unmarshal(networkJSON, &networkStatus)
// Take the first IP address
var IP net.IP
if len(networkStatus) > 0 {
if len(networkStatus[0].IPs) > 0 {
IP = networkStatus[0].IPs[0].Address.IP
}
}
if IP != nil {
env := fmt.Sprintf("IP=%s", IP)
// Tell CNI which IP address we want.
os.Setenv("CNI_ARGS", env)
logrus.Debugf("Restoring container with %s", env)
}
}
if err := c.prepare(); err != nil {
return err
}
defer func() {
if err != nil {
if err2 := c.cleanup(ctx); err2 != nil {
logrus.Errorf("error cleaning up container %s: %v", c.ID(), err2)
}
}
}()
// TODO: use existing way to request static IPs, once it is merged in ocicni
// https://github.com/cri-o/ocicni/pull/23/
// CNI_ARGS was used to request a certain IP address. Unconditionally remove it.
os.Unsetenv("CNI_ARGS")
// Read config
jsonPath := filepath.Join(c.bundlePath(), "config.json")
logrus.Debugf("generate.NewFromFile at %v", jsonPath)
g, err := generate.NewFromFile(jsonPath)
if err != nil {
logrus.Debugf("generate.NewFromFile failed with %v", err)
return err
}
// We want to have the same network namespace as before.
if c.config.CreateNetNS {
g.AddOrReplaceLinuxNamespace(spec.NetworkNamespace, c.state.NetNS.Path())
}
// Save the OCI spec to disk
if err := c.saveSpec(g.Spec()); err != nil {
return err
}
if err := c.makeBindMounts(); err != nil {
return err
}
// Cleanup for a working restore.
c.removeConmonFiles()
if err := c.runtime.ociRuntime.createContainer(c, c.config.CgroupParent, true); err != nil {
return err
}
logrus.Debugf("Restored container %s", c.ID())
c.state.State = ContainerStateRunning
if !keep {
// Delete all checkpoint related files. At this point, in theory, all files
// should exist. Still ignoring errors for now as the container should be
// restored and running. Not erroring out just because some cleanup operation
// failed. Starting with the checkpoint directory
err = os.RemoveAll(c.CheckpointPath())
if err != nil {
logrus.Debugf("Non-fatal: removal of checkpoint directory (%s) failed: %v", c.CheckpointPath(), err)
}
cleanup := [...]string{"restore.log", "dump.log", "stats-dump", "stats-restore", "network.status"}
for _, delete := range cleanup {
file := filepath.Join(c.bundlePath(), delete)
err = os.Remove(file)
if err != nil {
logrus.Debugf("Non-fatal: removal of checkpoint file (%s) failed: %v", file, err)
}
}
}
return c.save()
}