mirror of
https://github.com/containers/podman.git
synced 2025-06-26 04:46:57 +08:00

Allow Podman containers to request to use a specific OCI runtime if multiple runtimes are configured. This is the first step to properly supporting containers in a multi-runtime environment. The biggest changes are that all OCI runtimes are now initialized when Podman creates its runtime, and containers now use the runtime requested in their configuration (instead of always the default runtime). Signed-off-by: Matthew Heon <matthew.heon@pm.me>
519 lines
15 KiB
Go
519 lines
15 KiB
Go
package libpod
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"net"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/containers/libpod/pkg/util"
|
|
"github.com/cri-o/ocicni/pkg/ocicni"
|
|
spec "github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/opencontainers/selinux/go-selinux/label"
|
|
"github.com/pkg/errors"
|
|
"github.com/sirupsen/logrus"
|
|
|
|
// TODO import these functions into libpod and remove the import
|
|
// Trying to keep libpod from depending on CRI-O code
|
|
"github.com/containers/libpod/utils"
|
|
)
|
|
|
|
// OCI code is undergoing heavy rewrite
|
|
|
|
const (
|
|
// CgroupfsCgroupsManager represents cgroupfs native cgroup manager
|
|
CgroupfsCgroupsManager = "cgroupfs"
|
|
// SystemdCgroupsManager represents systemd native cgroup manager
|
|
SystemdCgroupsManager = "systemd"
|
|
|
|
// ContainerCreateTimeout represents the value of container creating timeout
|
|
ContainerCreateTimeout = 240 * time.Second
|
|
|
|
// Timeout before declaring that runtime has failed to kill a given
|
|
// container
|
|
killContainerTimeout = 5 * time.Second
|
|
// DefaultShmSize is the default shm size
|
|
DefaultShmSize = 64 * 1024 * 1024
|
|
// NsRunDir is the default directory in which running network namespaces
|
|
// are stored
|
|
NsRunDir = "/var/run/netns"
|
|
)
|
|
|
|
// OCIRuntime represents an OCI-compatible runtime that libpod can call into
|
|
// to perform container operations
|
|
type OCIRuntime struct {
|
|
name string
|
|
path string
|
|
conmonPath string
|
|
conmonEnv []string
|
|
cgroupManager string
|
|
tmpDir string
|
|
exitsDir string
|
|
socketsDir string
|
|
logSizeMax int64
|
|
noPivot bool
|
|
reservePorts bool
|
|
supportsJSON bool
|
|
}
|
|
|
|
// syncInfo is used to return data from monitor process to daemon
|
|
type syncInfo struct {
|
|
Pid int `json:"pid"`
|
|
Message string `json:"message,omitempty"`
|
|
}
|
|
|
|
// ociError is used to parse the OCI runtime JSON log. It is not part of the
|
|
// OCI runtime specifications, it follows what runc does
|
|
type ociError struct {
|
|
Level string `json:"level,omitempty"`
|
|
Time string `json:"time,omitempty"`
|
|
Msg string `json:"msg,omitempty"`
|
|
}
|
|
|
|
// Make a new OCI runtime with provided options.
|
|
// The first path that points to a valid executable will be used.
|
|
func newOCIRuntime(name string, paths []string, conmonPath string, runtimeCfg *RuntimeConfig, supportsJSON bool) (*OCIRuntime, error) {
|
|
if name == "" {
|
|
return nil, errors.Wrapf(ErrInvalidArg, "the OCI runtime must be provided a non-empty name")
|
|
}
|
|
|
|
runtime := new(OCIRuntime)
|
|
runtime.name = name
|
|
runtime.conmonPath = conmonPath
|
|
|
|
runtime.conmonEnv = runtimeCfg.ConmonEnvVars
|
|
runtime.cgroupManager = runtimeCfg.CgroupManager
|
|
runtime.tmpDir = runtimeCfg.TmpDir
|
|
runtime.logSizeMax = runtimeCfg.MaxLogSize
|
|
runtime.noPivot = runtimeCfg.NoPivotRoot
|
|
runtime.reservePorts = runtimeCfg.EnablePortReservation
|
|
|
|
// TODO: probe OCI runtime for feature and enable automatically if
|
|
// available.
|
|
runtime.supportsJSON = supportsJSON
|
|
|
|
foundPath := false
|
|
for _, path := range paths {
|
|
stat, err := os.Stat(path)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
continue
|
|
}
|
|
return nil, errors.Wrapf(err, "cannot stat %s", path)
|
|
}
|
|
if !stat.Mode().IsRegular() {
|
|
continue
|
|
}
|
|
foundPath = true
|
|
runtime.path = path
|
|
break
|
|
}
|
|
if !foundPath {
|
|
return nil, errors.Wrapf(ErrInvalidArg, "no valid executable found for OCI runtime %s", name)
|
|
}
|
|
|
|
runtime.exitsDir = filepath.Join(runtime.tmpDir, "exits")
|
|
runtime.socketsDir = filepath.Join(runtime.tmpDir, "socket")
|
|
|
|
if runtime.cgroupManager != CgroupfsCgroupsManager && runtime.cgroupManager != SystemdCgroupsManager {
|
|
return nil, errors.Wrapf(ErrInvalidArg, "invalid cgroup manager specified: %s", runtime.cgroupManager)
|
|
}
|
|
|
|
// Create the exit files and attach sockets directories
|
|
if err := os.MkdirAll(runtime.exitsDir, 0750); err != nil {
|
|
// The directory is allowed to exist
|
|
if !os.IsExist(err) {
|
|
return nil, errors.Wrapf(err, "error creating OCI runtime exit files directory %s",
|
|
runtime.exitsDir)
|
|
}
|
|
}
|
|
if err := os.MkdirAll(runtime.socketsDir, 0750); err != nil {
|
|
// The directory is allowed to exist
|
|
if !os.IsExist(err) {
|
|
return nil, errors.Wrapf(err, "error creating OCI runtime attach sockets directory %s",
|
|
runtime.socketsDir)
|
|
}
|
|
}
|
|
|
|
return runtime, nil
|
|
}
|
|
|
|
// Create systemd unit name for cgroup scopes
|
|
func createUnitName(prefix string, name string) string {
|
|
return fmt.Sprintf("%s-%s.scope", prefix, name)
|
|
}
|
|
|
|
func bindPorts(ports []ocicni.PortMapping) ([]*os.File, error) {
|
|
var files []*os.File
|
|
notifySCTP := false
|
|
for _, i := range ports {
|
|
switch i.Protocol {
|
|
case "udp":
|
|
addr, err := net.ResolveUDPAddr("udp", fmt.Sprintf("%s:%d", i.HostIP, i.HostPort))
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "cannot resolve the UDP address")
|
|
}
|
|
|
|
server, err := net.ListenUDP("udp", addr)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "cannot listen on the UDP port")
|
|
}
|
|
f, err := server.File()
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "cannot get file for UDP socket")
|
|
}
|
|
files = append(files, f)
|
|
break
|
|
|
|
case "tcp":
|
|
addr, err := net.ResolveTCPAddr("tcp4", fmt.Sprintf("%s:%d", i.HostIP, i.HostPort))
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "cannot resolve the TCP address")
|
|
}
|
|
|
|
server, err := net.ListenTCP("tcp4", addr)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "cannot listen on the TCP port")
|
|
}
|
|
f, err := server.File()
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "cannot get file for TCP socket")
|
|
}
|
|
files = append(files, f)
|
|
break
|
|
case "sctp":
|
|
if !notifySCTP {
|
|
notifySCTP = true
|
|
logrus.Warnf("port reservation for SCTP is not supported")
|
|
}
|
|
break
|
|
default:
|
|
return nil, fmt.Errorf("unknown protocol %s", i.Protocol)
|
|
|
|
}
|
|
}
|
|
return files, nil
|
|
}
|
|
|
|
// updateContainerStatus retrieves the current status of the container from the
|
|
// runtime. It updates the container's state but does not save it.
|
|
// If useRunc is false, we will not directly hit runc to see the container's
|
|
// status, but will instead only check for the existence of the conmon exit file
|
|
// and update state to stopped if it exists.
|
|
func (r *OCIRuntime) updateContainerStatus(ctr *Container, useRuntime bool) error {
|
|
exitFile := ctr.exitFilePath()
|
|
|
|
runtimeDir, err := util.GetRootlessRuntimeDir()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// If not using the OCI runtime, we don't need to do most of this.
|
|
if !useRuntime {
|
|
// If the container's not running, nothing to do.
|
|
if ctr.state.State != ContainerStateRunning && ctr.state.State != ContainerStatePaused {
|
|
return nil
|
|
}
|
|
|
|
// Check for the exit file conmon makes
|
|
info, err := os.Stat(exitFile)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
// Container is still running, no error
|
|
return nil
|
|
}
|
|
|
|
return errors.Wrapf(err, "error running stat on container %s exit file", ctr.ID())
|
|
}
|
|
|
|
// Alright, it exists. Transition to Stopped state.
|
|
ctr.state.State = ContainerStateStopped
|
|
|
|
// Read the exit file to get our stopped time and exit code.
|
|
return ctr.handleExitFile(exitFile, info)
|
|
}
|
|
|
|
// Store old state so we know if we were already stopped
|
|
oldState := ctr.state.State
|
|
|
|
state := new(spec.State)
|
|
|
|
cmd := exec.Command(r.path, "state", ctr.ID())
|
|
cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir))
|
|
outPipe, err := cmd.StdoutPipe()
|
|
if err != nil {
|
|
return errors.Wrapf(err, "getting stdout pipe")
|
|
}
|
|
errPipe, err := cmd.StderrPipe()
|
|
if err != nil {
|
|
return errors.Wrapf(err, "getting stderr pipe")
|
|
}
|
|
|
|
if err := cmd.Start(); err != nil {
|
|
out, err2 := ioutil.ReadAll(errPipe)
|
|
if err2 != nil {
|
|
return errors.Wrapf(err, "error getting container %s state", ctr.ID())
|
|
}
|
|
if strings.Contains(string(out), "does not exist") {
|
|
ctr.removeConmonFiles()
|
|
ctr.state.ExitCode = -1
|
|
ctr.state.FinishedTime = time.Now()
|
|
ctr.state.State = ContainerStateExited
|
|
return nil
|
|
}
|
|
return errors.Wrapf(err, "error getting container %s state. stderr/out: %s", ctr.ID(), out)
|
|
}
|
|
defer cmd.Wait()
|
|
|
|
errPipe.Close()
|
|
out, err := ioutil.ReadAll(outPipe)
|
|
if err != nil {
|
|
return errors.Wrapf(err, "error reading stdout: %s", ctr.ID())
|
|
}
|
|
if err := json.NewDecoder(bytes.NewBuffer(out)).Decode(state); err != nil {
|
|
return errors.Wrapf(err, "error decoding container status for container %s", ctr.ID())
|
|
}
|
|
ctr.state.PID = state.Pid
|
|
|
|
switch state.Status {
|
|
case "created":
|
|
ctr.state.State = ContainerStateCreated
|
|
case "paused":
|
|
ctr.state.State = ContainerStatePaused
|
|
case "running":
|
|
ctr.state.State = ContainerStateRunning
|
|
case "stopped":
|
|
ctr.state.State = ContainerStateStopped
|
|
default:
|
|
return errors.Wrapf(ErrInternal, "unrecognized status returned by runtime for container %s: %s",
|
|
ctr.ID(), state.Status)
|
|
}
|
|
|
|
// Only grab exit status if we were not already stopped
|
|
// If we were, it should already be in the database
|
|
if ctr.state.State == ContainerStateStopped && oldState != ContainerStateStopped {
|
|
var fi os.FileInfo
|
|
chWait := make(chan error)
|
|
defer close(chWait)
|
|
|
|
_, err := WaitForFile(exitFile, chWait, time.Second*5)
|
|
if err == nil {
|
|
fi, err = os.Stat(exitFile)
|
|
}
|
|
if err != nil {
|
|
ctr.state.ExitCode = -1
|
|
ctr.state.FinishedTime = time.Now()
|
|
logrus.Errorf("No exit file for container %s found: %v", ctr.ID(), err)
|
|
return nil
|
|
}
|
|
|
|
return ctr.handleExitFile(exitFile, fi)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// startContainer starts the given container
|
|
// Sets time the container was started, but does not save it.
|
|
func (r *OCIRuntime) startContainer(ctr *Container) error {
|
|
// TODO: streams should probably *not* be our STDIN/OUT/ERR - redirect to buffers?
|
|
runtimeDir, err := util.GetRootlessRuntimeDir()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
|
|
if notify, ok := os.LookupEnv("NOTIFY_SOCKET"); ok {
|
|
env = append(env, fmt.Sprintf("NOTIFY_SOCKET=%s", notify))
|
|
}
|
|
if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "start", ctr.ID()); err != nil {
|
|
return err
|
|
}
|
|
|
|
ctr.state.StartedTime = time.Now()
|
|
|
|
return nil
|
|
}
|
|
|
|
// killContainer sends the given signal to the given container
|
|
func (r *OCIRuntime) killContainer(ctr *Container, signal uint) error {
|
|
logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID())
|
|
runtimeDir, err := util.GetRootlessRuntimeDir()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
|
|
if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "kill", ctr.ID(), fmt.Sprintf("%d", signal)); err != nil {
|
|
return errors.Wrapf(err, "error sending signal to container %s", ctr.ID())
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// deleteContainer deletes a container from the OCI runtime
|
|
func (r *OCIRuntime) deleteContainer(ctr *Container) error {
|
|
runtimeDir, err := util.GetRootlessRuntimeDir()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
|
|
return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "delete", "--force", ctr.ID())
|
|
}
|
|
|
|
// pauseContainer pauses the given container
|
|
func (r *OCIRuntime) pauseContainer(ctr *Container) error {
|
|
runtimeDir, err := util.GetRootlessRuntimeDir()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
|
|
return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "pause", ctr.ID())
|
|
}
|
|
|
|
// unpauseContainer unpauses the given container
|
|
func (r *OCIRuntime) unpauseContainer(ctr *Container) error {
|
|
runtimeDir, err := util.GetRootlessRuntimeDir()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
|
|
return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, "resume", ctr.ID())
|
|
}
|
|
|
|
// execContainer executes a command in a running container
|
|
// TODO: Add --detach support
|
|
// TODO: Convert to use conmon
|
|
// TODO: add --pid-file and use that to generate exec session tracking
|
|
func (r *OCIRuntime) execContainer(c *Container, cmd, capAdd, env []string, tty bool, cwd, user, sessionID string, streams *AttachStreams, preserveFDs int) (*exec.Cmd, error) {
|
|
if len(cmd) == 0 {
|
|
return nil, errors.Wrapf(ErrInvalidArg, "must provide a command to execute")
|
|
}
|
|
|
|
if sessionID == "" {
|
|
return nil, errors.Wrapf(ErrEmptyID, "must provide a session ID for exec")
|
|
}
|
|
|
|
runtimeDir, err := util.GetRootlessRuntimeDir()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
args := []string{}
|
|
|
|
// TODO - should we maintain separate logpaths for exec sessions?
|
|
args = append(args, "exec")
|
|
|
|
if cwd != "" {
|
|
args = append(args, "--cwd", cwd)
|
|
}
|
|
|
|
args = append(args, "--pid-file", c.execPidPath(sessionID))
|
|
|
|
if tty {
|
|
args = append(args, "--tty")
|
|
} else {
|
|
args = append(args, "--tty=false")
|
|
}
|
|
|
|
if user != "" {
|
|
args = append(args, "--user", user)
|
|
}
|
|
|
|
if preserveFDs > 0 {
|
|
args = append(args, fmt.Sprintf("--preserve-fds=%d", preserveFDs))
|
|
}
|
|
if c.config.Spec.Process.NoNewPrivileges {
|
|
args = append(args, "--no-new-privs")
|
|
}
|
|
|
|
for _, cap := range capAdd {
|
|
args = append(args, "--cap", cap)
|
|
}
|
|
|
|
for _, envVar := range env {
|
|
args = append(args, "--env", envVar)
|
|
}
|
|
|
|
// Append container ID, name and command
|
|
args = append(args, c.ID())
|
|
args = append(args, cmd...)
|
|
|
|
logrus.Debugf("Starting runtime %s with following arguments: %v", r.path, args)
|
|
|
|
execCmd := exec.Command(r.path, args...)
|
|
|
|
if streams.AttachOutput {
|
|
execCmd.Stdout = streams.OutputStream
|
|
}
|
|
if streams.AttachInput {
|
|
execCmd.Stdin = streams.InputStream
|
|
}
|
|
if streams.AttachError {
|
|
execCmd.Stderr = streams.ErrorStream
|
|
}
|
|
|
|
execCmd.Env = append(execCmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir))
|
|
|
|
if preserveFDs > 0 {
|
|
for fd := 3; fd < 3+preserveFDs; fd++ {
|
|
execCmd.ExtraFiles = append(execCmd.ExtraFiles, os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)))
|
|
}
|
|
}
|
|
|
|
if err := execCmd.Start(); err != nil {
|
|
return nil, errors.Wrapf(err, "cannot start container %s", c.ID())
|
|
}
|
|
|
|
if preserveFDs > 0 {
|
|
for fd := 3; fd < 3+preserveFDs; fd++ {
|
|
// These fds were passed down to the runtime. Close them
|
|
// and not interfere
|
|
os.NewFile(uintptr(fd), fmt.Sprintf("fd-%d", fd)).Close()
|
|
}
|
|
}
|
|
|
|
return execCmd, nil
|
|
}
|
|
|
|
// checkpointContainer checkpoints the given container
|
|
func (r *OCIRuntime) checkpointContainer(ctr *Container, options ContainerCheckpointOptions) error {
|
|
label.SetSocketLabel(ctr.ProcessLabel())
|
|
// imagePath is used by CRIU to store the actual checkpoint files
|
|
imagePath := ctr.CheckpointPath()
|
|
// workPath will be used to store dump.log and stats-dump
|
|
workPath := ctr.bundlePath()
|
|
logrus.Debugf("Writing checkpoint to %s", imagePath)
|
|
logrus.Debugf("Writing checkpoint logs to %s", workPath)
|
|
args := []string{}
|
|
args = append(args, "checkpoint")
|
|
args = append(args, "--image-path")
|
|
args = append(args, imagePath)
|
|
args = append(args, "--work-path")
|
|
args = append(args, workPath)
|
|
if options.KeepRunning {
|
|
args = append(args, "--leave-running")
|
|
}
|
|
if options.TCPEstablished {
|
|
args = append(args, "--tcp-established")
|
|
}
|
|
args = append(args, ctr.ID())
|
|
return utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, nil, r.path, args...)
|
|
}
|
|
|
|
func (r *OCIRuntime) featureCheckCheckpointing() bool {
|
|
// Check if the runtime implements checkpointing. Currently only
|
|
// runc's checkpoint/restore implementation is supported.
|
|
cmd := exec.Command(r.path, "checkpoint", "-h")
|
|
if err := cmd.Start(); err != nil {
|
|
return false
|
|
}
|
|
if err := cmd.Wait(); err == nil {
|
|
return true
|
|
}
|
|
return false
|
|
}
|