Merge pull request #1848 from adrianreber/master

Add tcp-established to checkpoint/restore
This commit is contained in:
OpenShift Merge Robot
2018-11-28 07:00:24 -08:00
committed by GitHub
16 changed files with 245 additions and 36 deletions

View File

@ -18,8 +18,8 @@ gce_instance:
env:
FEDORA_CNI_COMMIT: "412b6d31280682bb4fab4446f113c22ff1886554"
CNI_COMMIT: "7480240de9749f9a0a5c8614b17f1f03e0c06ab9"
CRIO_COMMIT: "662dbb31b5d4f5ed54511a47cde7190c61c28677"
CRIU_COMMIT: "584cbe4643c3fc7dc901ff08bf923ca0fe7326f9"
CRIO_COMMIT: "7a283c391abb7bd25086a8ff91dbb36ebdd24466"
CRIU_COMMIT: "c74b83cd49c00589c0c0468ba5fe685b67fdbd0a"
RUNC_COMMIT: "78ef28e63bec2ee4c139b5e3e0d691eb9bdc748d"
# File to update in home-dir with task-specific env. var values
ENVLIB: ".bash_profile"

View File

@ -64,7 +64,7 @@ RUN set -x \
&& rm -rf "$GOPATH"
# Install conmon
ENV CRIO_COMMIT 662dbb31b5d4f5ed54511a47cde7190c61c28677
ENV CRIO_COMMIT 7a283c391abb7bd25086a8ff91dbb36ebdd24466
RUN set -x \
&& export GOPATH="$(mktemp -d)" \
&& git clone https://github.com/kubernetes-sigs/cri-o.git "$GOPATH/src/github.com/kubernetes-sigs/cri-o.git" \
@ -112,8 +112,7 @@ RUN set -x \
&& go get -u github.com/mailru/easyjson/... \
&& install -D -m 755 "$GOPATH"/bin/easyjson /usr/bin/
# Install criu
ENV CRIU_COMMIT 584cbe4643c3fc7dc901ff08bf923ca0fe7326f9
# Install latest stable criu version
RUN set -x \
&& cd /tmp \
&& git clone https://github.com/checkpoint-restore/criu.git \

View File

@ -68,7 +68,7 @@ RUN set -x \
&& install -D -m 755 "$GOPATH"/bin/easyjson /usr/bin/
# Install conmon
ENV CRIO_COMMIT 662dbb31b5d4f5ed54511a47cde7190c61c28677
ENV CRIO_COMMIT 7a283c391abb7bd25086a8ff91dbb36ebdd24466
RUN set -x \
&& export GOPATH="$(mktemp -d)" \
&& git clone https://github.com/kubernetes-sigs/cri-o.git "$GOPATH/src/github.com/kubernetes-sigs/cri-o.git" \

View File

@ -72,7 +72,7 @@ RUN set -x \
&& install -D -m 755 "$GOPATH"/bin/easyjson /usr/bin/
# Install conmon
ENV CRIO_COMMIT 662dbb31b5d4f5ed54511a47cde7190c61c28677
ENV CRIO_COMMIT 7a283c391abb7bd25086a8ff91dbb36ebdd24466
RUN set -x \
&& export GOPATH="$(mktemp -d)" \
&& git clone https://github.com/kubernetes-sigs/cri-o.git "$GOPATH/src/github.com/kubernetes-sigs/cri-o.git" \

View File

@ -27,6 +27,10 @@ var (
Name: "leave-running, R",
Usage: "leave the container running after writing checkpoint to disk",
},
cli.BoolFlag{
Name: "tcp-established",
Usage: "checkpoint a container with established TCP connections",
},
cli.BoolFlag{
Name: "all, a",
Usage: "checkpoint all running containers",
@ -55,8 +59,9 @@ func checkpointCmd(c *cli.Context) error {
defer runtime.Shutdown(false)
options := libpod.ContainerCheckpointOptions{
Keep: c.Bool("keep"),
KeepRunning: c.Bool("leave-running"),
Keep: c.Bool("keep"),
KeepRunning: c.Bool("leave-running"),
TCPEstablished: c.Bool("tcp-established"),
}
if err := checkAllAndLatest(c); err != nil {

View File

@ -26,6 +26,10 @@ var (
// restore --all would make more sense if there would be
// dedicated state for container which are checkpointed.
// TODO: add ContainerStateCheckpointed
cli.BoolFlag{
Name: "tcp-established",
Usage: "checkpoint a container with established TCP connections",
},
cli.BoolFlag{
Name: "all, a",
Usage: "restore all checkpointed containers",
@ -53,16 +57,19 @@ func restoreCmd(c *cli.Context) error {
}
defer runtime.Shutdown(false)
keep := c.Bool("keep")
options := libpod.ContainerCheckpointOptions{
Keep: c.Bool("keep"),
TCPEstablished: c.Bool("tcp-established"),
}
if err := checkAllAndLatest(c); err != nil {
return err
}
containers, lastError := getAllOrLatestContainers(c, runtime, libpod.ContainerStateRunning, "checkpointed")
containers, lastError := getAllOrLatestContainers(c, runtime, libpod.ContainerStateExited, "checkpointed")
for _, ctr := range containers {
if err = ctr.Restore(context.TODO(), keep); err != nil {
if err = ctr.Restore(context.TODO(), options); err != nil {
if lastError != nil {
fmt.Fprintln(os.Stderr, lastError)
}

View File

@ -716,16 +716,22 @@ _podman_container_attach() {
}
_podman_container_checkpoint() {
local options_with_args="
--help -h
"
local boolean_options="
--keep
-a
--all
-h
--help
-k
--keep
-l
--latest
-R
--leave-running
--tcp-established
"
case "$cur" in
-*)
COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
COMPREPLY=($(compgen -W "$boolean_options" -- "$cur"))
;;
*)
__podman_complete_containers_running
@ -794,16 +800,20 @@ _podman_container_restart() {
}
_podman_container_restore() {
local options_with_args="
--help -h
"
local boolean_options="
--keep
-a
--all
-h
--help
-k
--keep
-l
--latest
--tcp-established
"
case "$cur" in
-*)
COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
COMPREPLY=($(compgen -W "$boolean_options" -- "$cur"))
;;
*)
__podman_complete_containers_created

View File

@ -29,6 +29,13 @@ Instead of providing the container name or ID, checkpoint the last created conta
Leave the container running after checkpointing instead of stopping it.
**--tcp-established**
Checkpoint a container with established TCP connections. If the checkpoint
image contains established TCP connections, this options is required during
restore. Defaults to not checkpointing containers with established TCP
connections.
## EXAMPLE
podman container checkpoint mywebserver

View File

@ -32,6 +32,14 @@ Restore all checkpointed containers.
Instead of providing the container name or ID, restore the last created container.
**--tcp-established**
Restore a container with established TCP connections. If the checkpoint image
contains established TCP connections, this option is required during restore.
If the checkpoint image does not contain established TCP connections this
option is ignored. Defaults to not restoring containers with established TCP
connections.
## EXAMPLE
podman container restore mywebserver

View File

@ -833,10 +833,16 @@ func (c *Container) Refresh(ctx context.Context) error {
}
// ContainerCheckpointOptions is a struct used to pass the parameters
// for checkpointing to corresponding functions
// for checkpointing (and restoring) to the corresponding functions
type ContainerCheckpointOptions struct {
Keep bool
// Keep tells the API to not delete checkpoint artifacts
Keep bool
// KeepRunning tells the API to keep the container running
// after writing the checkpoint to disk
KeepRunning bool
// TCPEstablished tells the API to checkpoint a container
// even if it contains established TCP connections
TCPEstablished bool
}
// Checkpoint checkpoints a container
@ -855,7 +861,7 @@ func (c *Container) Checkpoint(ctx context.Context, options ContainerCheckpointO
}
// Restore restores a container
func (c *Container) Restore(ctx context.Context, keep bool) (err error) {
func (c *Container) Restore(ctx context.Context, options ContainerCheckpointOptions) (err error) {
logrus.Debugf("Trying to restore container %s", c)
if !c.batched {
c.lock.Lock()
@ -866,5 +872,5 @@ func (c *Container) Restore(ctx context.Context, keep bool) (err error) {
}
}
return c.restore(ctx, keep)
return c.restore(ctx, options)
}

View File

@ -606,7 +606,7 @@ func (c *Container) init(ctx context.Context) error {
}
// With the spec complete, do an OCI create
if err := c.runtime.ociRuntime.createContainer(c, c.config.CgroupParent, false); err != nil {
if err := c.runtime.ociRuntime.createContainer(c, c.config.CgroupParent, nil); err != nil {
return err
}

View File

@ -514,7 +514,7 @@ func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointO
return c.save()
}
func (c *Container) restore(ctx context.Context, keep bool) (err error) {
func (c *Container) restore(ctx context.Context, options ContainerCheckpointOptions) (err error) {
if !criu.CheckForCriu() {
return errors.Errorf("restoring a container requires at least CRIU %d", criu.MinCriuVersion)
@ -602,7 +602,7 @@ func (c *Container) restore(ctx context.Context, keep bool) (err error) {
// Cleanup for a working restore.
c.removeConmonFiles()
if err := c.runtime.ociRuntime.createContainer(c, c.config.CgroupParent, true); err != nil {
if err := c.runtime.ociRuntime.createContainer(c, c.config.CgroupParent, &options); err != nil {
return err
}
@ -610,7 +610,7 @@ func (c *Container) restore(ctx context.Context, keep bool) (err error) {
c.state.State = ContainerStateRunning
if !keep {
if !options.Keep {
// Delete all checkpoint related files. At this point, in theory, all files
// should exist. Still ignoring errors for now as the container should be
// restored and running. Not erroring out just because some cleanup operation

View File

@ -227,7 +227,7 @@ func bindPorts(ports []ocicni.PortMapping) ([]*os.File, error) {
return files, nil
}
func (r *OCIRuntime) createOCIContainer(ctr *Container, cgroupParent string, restoreContainer bool) (err error) {
func (r *OCIRuntime) createOCIContainer(ctr *Container, cgroupParent string, restoreOptions *ContainerCheckpointOptions) (err error) {
var stderrBuf bytes.Buffer
runtimeDir, err := util.GetRootlessRuntimeDir()
@ -289,8 +289,11 @@ func (r *OCIRuntime) createOCIContainer(ctr *Container, cgroupParent string, res
args = append(args, "--syslog")
}
if restoreContainer {
if restoreOptions != nil {
args = append(args, "--restore", ctr.CheckpointPath())
if restoreOptions.TCPEstablished {
args = append(args, "--restore-arg", "--tcp-established")
}
}
logrus.WithFields(logrus.Fields{
@ -866,6 +869,9 @@ func (r *OCIRuntime) checkpointContainer(ctr *Container, options ContainerCheckp
if options.KeepRunning {
args = append(args, "--leave-running")
}
if options.TCPEstablished {
args = append(args, "--tcp-established")
}
args = append(args, ctr.ID())
return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...)
}

View File

@ -65,10 +65,10 @@ func newPipe() (parent *os.File, child *os.File, err error) {
// CreateContainer creates a container in the OCI runtime
// TODO terminal support for container
// Presently just ignoring conmon opts related to it
func (r *OCIRuntime) createContainer(ctr *Container, cgroupParent string, restoreContainer bool) (err error) {
func (r *OCIRuntime) createContainer(ctr *Container, cgroupParent string, restoreOptions *ContainerCheckpointOptions) (err error) {
if ctr.state.UserNSRoot == "" {
// no need of an intermediate mount ns
return r.createOCIContainer(ctr, cgroupParent, restoreContainer)
return r.createOCIContainer(ctr, cgroupParent, restoreOptions)
}
var wg sync.WaitGroup
wg.Add(1)
@ -106,7 +106,7 @@ func (r *OCIRuntime) createContainer(ctr *Container, cgroupParent string, restor
if err != nil {
return
}
err = r.createOCIContainer(ctr, cgroupParent, restoreContainer)
err = r.createOCIContainer(ctr, cgroupParent, restoreOptions)
}()
wg.Wait()

View File

@ -15,7 +15,7 @@ func newPipe() (parent *os.File, child *os.File, err error) {
return nil, nil, ErrNotImplemented
}
func (r *OCIRuntime) createContainer(ctr *Container, cgroupParent string, restoreContainer bool) (err error) {
func (r *OCIRuntime) createContainer(ctr *Container, cgroupParent string, restoreOptions *ContainerCheckpointOptions) (err error) {
return ErrNotImplemented
}

View File

@ -2,6 +2,7 @@ package integration
import (
"fmt"
"net"
"os"
"github.com/containers/libpod/pkg/criu"
@ -126,4 +127,164 @@ var _ = Describe("Podman checkpoint", func() {
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0))
})
It("podman checkpoint latest running container", func() {
session1 := podmanTest.Podman([]string{"run", "-it", "--security-opt", "seccomp=unconfined", "--name", "first", "-d", ALPINE, "top"})
session1.WaitWithDefaultTimeout()
Expect(session1.ExitCode()).To(Equal(0))
session2 := podmanTest.Podman([]string{"run", "-it", "--security-opt", "seccomp=unconfined", "--name", "second", "-d", ALPINE, "top"})
session2.WaitWithDefaultTimeout()
Expect(session2.ExitCode()).To(Equal(0))
result := podmanTest.Podman([]string{"container", "checkpoint", "-l"})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(0))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1))
ps := podmanTest.Podman([]string{"ps", "-q", "--no-trunc"})
ps.WaitWithDefaultTimeout()
Expect(ps.ExitCode()).To(Equal(0))
Expect(ps.LineInOutputContains(session1.OutputToString())).To(BeTrue())
Expect(ps.LineInOutputContains(session2.OutputToString())).To(BeFalse())
result = podmanTest.Podman([]string{"container", "restore", "-l"})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(0))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(2))
Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Up"))
Expect(podmanTest.GetContainerStatus()).To(Not(ContainSubstring("Exited")))
result = podmanTest.Podman([]string{"rm", "-fa"})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(0))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0))
})
It("podman checkpoint all running container", func() {
session1 := podmanTest.Podman([]string{"run", "-it", "--security-opt", "seccomp=unconfined", "--name", "first", "-d", ALPINE, "top"})
session1.WaitWithDefaultTimeout()
Expect(session1.ExitCode()).To(Equal(0))
session2 := podmanTest.Podman([]string{"run", "-it", "--security-opt", "seccomp=unconfined", "--name", "second", "-d", ALPINE, "top"})
session2.WaitWithDefaultTimeout()
Expect(session2.ExitCode()).To(Equal(0))
result := podmanTest.Podman([]string{"container", "checkpoint", "-a"})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(0))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0))
ps := podmanTest.Podman([]string{"ps", "-q", "--no-trunc"})
ps.WaitWithDefaultTimeout()
Expect(ps.ExitCode()).To(Equal(0))
Expect(ps.LineInOutputContains(session1.OutputToString())).To(BeFalse())
Expect(ps.LineInOutputContains(session2.OutputToString())).To(BeFalse())
result = podmanTest.Podman([]string{"container", "restore", "-a"})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(0))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(2))
Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Up"))
Expect(podmanTest.GetContainerStatus()).To(Not(ContainSubstring("Exited")))
result = podmanTest.Podman([]string{"rm", "-fa"})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(0))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0))
})
It("podman checkpoint container with established tcp connections", func() {
Skip("Seems to not work (yet) in CI")
podmanTest.RestoreArtifact(redis)
session := podmanTest.Podman([]string{"run", "-it", "--security-opt", "seccomp=unconfined", "--network", "host", "-d", redis})
session.WaitWithDefaultTimeout()
Expect(session.ExitCode()).To(Equal(0))
// Open a network connection to the redis server
conn, err := net.Dial("tcp", "127.0.0.1:6379")
if err != nil {
os.Exit(1)
}
// This should fail as the container has established TCP connections
result := podmanTest.Podman([]string{"container", "checkpoint", "-l"})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(125))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1))
Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Up"))
// Now it should work thanks to "--tcp-established"
result = podmanTest.Podman([]string{"container", "checkpoint", "-l", "--tcp-established"})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(0))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0))
Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Exited"))
// Restore should fail as the checkpoint image contains established TCP connections
result = podmanTest.Podman([]string{"container", "restore", "-l"})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(125))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0))
Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Exited"))
// Now it should work thanks to "--tcp-established"
result = podmanTest.Podman([]string{"container", "restore", "-l", "--tcp-established"})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(0))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1))
Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Up"))
result = podmanTest.Podman([]string{"rm", "-fa"})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(0))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0))
conn.Close()
})
It("podman checkpoint with --leave-running", func() {
session := podmanTest.Podman([]string{"run", "-it", "--security-opt", "seccomp=unconfined", "-d", ALPINE, "top"})
session.WaitWithDefaultTimeout()
Expect(session.ExitCode()).To(Equal(0))
cid := session.OutputToString()
// Checkpoint container, but leave it running
result := podmanTest.Podman([]string{"container", "checkpoint", "--leave-running", cid})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(0))
// Make sure it is still running
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1))
Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Up"))
// Stop the container
result = podmanTest.Podman([]string{"container", "stop", cid})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(0))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0))
Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Exited"))
// Restore the stopped container from the previous checkpoint
result = podmanTest.Podman([]string{"container", "restore", cid})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(0))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1))
Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Up"))
result = podmanTest.Podman([]string{"rm", "-fa"})
result.WaitWithDefaultTimeout()
Expect(result.ExitCode()).To(Equal(0))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0))
})
})