mirror of
https://github.com/containers/podman.git
synced 2025-05-17 15:18:43 +08:00
773 lines
26 KiB
Go
773 lines
26 KiB
Go
//go:build linux
|
|
// +build linux
|
|
|
|
package libpod
|
|
|
|
import (
|
|
"crypto/rand"
|
|
"crypto/sha256"
|
|
"errors"
|
|
"fmt"
|
|
"net"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
|
|
"github.com/containernetworking/plugins/pkg/ns"
|
|
"github.com/containers/common/libnetwork/resolvconf"
|
|
"github.com/containers/common/libnetwork/slirp4netns"
|
|
"github.com/containers/common/libnetwork/types"
|
|
netUtil "github.com/containers/common/libnetwork/util"
|
|
"github.com/containers/common/pkg/netns"
|
|
"github.com/containers/common/pkg/util"
|
|
"github.com/containers/podman/v4/libpod/define"
|
|
"github.com/containers/podman/v4/pkg/rootless"
|
|
"github.com/containers/podman/v4/utils"
|
|
"github.com/containers/storage/pkg/lockfile"
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/opencontainers/selinux/go-selinux/label"
|
|
"github.com/sirupsen/logrus"
|
|
"github.com/vishvananda/netlink"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
const (
|
|
// rootlessNetNsName is the file name for the rootless network namespace bind mount
|
|
rootlessNetNsName = "rootless-netns"
|
|
|
|
// rootlessNetNsSilrp4netnsPidFile is the name of the rootless netns slirp4netns pid file
|
|
rootlessNetNsSilrp4netnsPidFile = "rootless-netns-slirp4netns.pid"
|
|
|
|
// persistentCNIDir is the directory where the CNI files are stored
|
|
persistentCNIDir = "/var/lib/cni"
|
|
)
|
|
|
|
type RootlessNetNS struct {
|
|
ns ns.NetNS
|
|
dir string
|
|
Lock *lockfile.LockFile
|
|
}
|
|
|
|
// getPath will join the given path to the rootless netns dir
|
|
func (r *RootlessNetNS) getPath(path string) string {
|
|
return filepath.Join(r.dir, path)
|
|
}
|
|
|
|
// Do - run the given function in the rootless netns.
|
|
// It does not lock the rootlessCNI lock, the caller
|
|
// should only lock when needed, e.g. for network operations.
|
|
func (r *RootlessNetNS) Do(toRun func() error) error {
|
|
err := r.ns.Do(func(_ ns.NetNS) error {
|
|
// Before we can run the given function,
|
|
// we have to set up all mounts correctly.
|
|
|
|
// The order of the mounts is IMPORTANT.
|
|
// The idea of the extra mount ns is to make /run and /var/lib/cni writeable
|
|
// for the cni plugins but not affecting the podman user namespace.
|
|
// Because the plugins also need access to XDG_RUNTIME_DIR/netns some special setup is needed.
|
|
|
|
// The following bind mounts are needed
|
|
// 1. XDG_RUNTIME_DIR -> XDG_RUNTIME_DIR/rootless-netns/XDG_RUNTIME_DIR
|
|
// 2. /run/systemd -> XDG_RUNTIME_DIR/rootless-netns/run/systemd (only if it exists)
|
|
// 3. XDG_RUNTIME_DIR/rootless-netns/resolv.conf -> /etc/resolv.conf or XDG_RUNTIME_DIR/rootless-netns/run/symlink/target
|
|
// 4. XDG_RUNTIME_DIR/rootless-netns/var/lib/cni -> /var/lib/cni (if /var/lib/cni does not exist, use the parent dir)
|
|
// 5. XDG_RUNTIME_DIR/rootless-netns/run -> /run
|
|
|
|
// Create a new mount namespace,
|
|
// this must happen inside the netns thread.
|
|
err := unix.Unshare(unix.CLONE_NEWNS)
|
|
if err != nil {
|
|
return fmt.Errorf("cannot create a new mount namespace: %w", err)
|
|
}
|
|
|
|
xdgRuntimeDir, err := util.GetRuntimeDir()
|
|
if err != nil {
|
|
return fmt.Errorf("could not get runtime directory: %w", err)
|
|
}
|
|
newXDGRuntimeDir := r.getPath(xdgRuntimeDir)
|
|
// 1. Mount the netns into the new run to keep them accessible.
|
|
// Otherwise cni setup will fail because it cannot access the netns files.
|
|
err = unix.Mount(xdgRuntimeDir, newXDGRuntimeDir, "none", unix.MS_BIND|unix.MS_SHARED|unix.MS_REC, "")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to mount runtime directory for rootless netns: %w", err)
|
|
}
|
|
|
|
// 2. Also keep /run/systemd if it exists.
|
|
// Many files are symlinked into this dir, for example /dev/log.
|
|
runSystemd := "/run/systemd"
|
|
_, err = os.Stat(runSystemd)
|
|
if err == nil {
|
|
newRunSystemd := r.getPath(runSystemd)
|
|
err = unix.Mount(runSystemd, newRunSystemd, "none", unix.MS_BIND|unix.MS_REC, "")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to mount /run/systemd directory for rootless netns: %w", err)
|
|
}
|
|
}
|
|
|
|
// 3. On some distros /etc/resolv.conf is symlinked to somewhere under /run.
|
|
// Because the kernel will follow the symlink before mounting, it is not
|
|
// possible to mount a file at /etc/resolv.conf. We have to ensure that
|
|
// the link target will be available in the mount ns.
|
|
// see: https://github.com/containers/podman/issues/10855
|
|
resolvePath := "/etc/resolv.conf"
|
|
linkCount := 0
|
|
for i := 1; i < len(resolvePath); i++ {
|
|
// Do not use filepath.EvalSymlinks, we only want the first symlink under /run.
|
|
// If /etc/resolv.conf has more than one symlink under /run, e.g.
|
|
// -> /run/systemd/resolve/stub-resolv.conf -> /run/systemd/resolve/resolv.conf
|
|
// we would put the netns resolv.conf file to the last path. However this will
|
|
// break dns because the second link does not exist in the mount ns.
|
|
// see https://github.com/containers/podman/issues/11222
|
|
//
|
|
// We also need to resolve all path components not just the last file.
|
|
// see https://github.com/containers/podman/issues/12461
|
|
|
|
if resolvePath[i] != '/' {
|
|
// if we are at the last char we need to inc i by one because there is no final slash
|
|
if i == len(resolvePath)-1 {
|
|
i++
|
|
} else {
|
|
// not the end of path, keep going
|
|
continue
|
|
}
|
|
}
|
|
path := resolvePath[:i]
|
|
|
|
fi, err := os.Lstat(path)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to stat resolv.conf path: %w", err)
|
|
}
|
|
|
|
// no link, just continue
|
|
if fi.Mode()&os.ModeSymlink == 0 {
|
|
continue
|
|
}
|
|
|
|
link, err := os.Readlink(path)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to read resolv.conf symlink: %w", err)
|
|
}
|
|
linkCount++
|
|
if filepath.IsAbs(link) {
|
|
// link is as an absolute path
|
|
resolvePath = filepath.Join(link, resolvePath[i:])
|
|
} else {
|
|
// link is as a relative, join it with the previous path
|
|
base := filepath.Dir(path)
|
|
resolvePath = filepath.Join(base, link, resolvePath[i:])
|
|
}
|
|
// set i back to zero since we now have a new base path
|
|
i = 0
|
|
|
|
// we have to stop at the first path under /run because we will have an empty /run and will create the path anyway
|
|
// if we would continue we would need to recreate all links under /run
|
|
if strings.HasPrefix(resolvePath, "/run/") {
|
|
break
|
|
}
|
|
// make sure wo do not loop forever
|
|
if linkCount == 255 {
|
|
return errors.New("too many symlinks while resolving /etc/resolv.conf")
|
|
}
|
|
}
|
|
logrus.Debugf("The path of /etc/resolv.conf in the mount ns is %q", resolvePath)
|
|
// When /etc/resolv.conf on the host is a symlink to /run/systemd/resolve/stub-resolv.conf,
|
|
// we have to mount an empty filesystem on /run/systemd/resolve in the child namespace,
|
|
// so as to isolate the directory from the host mount namespace.
|
|
//
|
|
// Otherwise our bind-mount for /run/systemd/resolve/stub-resolv.conf is unmounted
|
|
// when systemd-resolved unlinks and recreates /run/systemd/resolve/stub-resolv.conf on the host.
|
|
// see: https://github.com/containers/podman/issues/10929
|
|
if strings.HasPrefix(resolvePath, "/run/systemd/resolve/") {
|
|
rsr := r.getPath("/run/systemd/resolve")
|
|
err = unix.Mount("", rsr, define.TypeTmpfs, unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV, "")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to mount tmpfs on %q for rootless netns: %w", rsr, err)
|
|
}
|
|
}
|
|
if strings.HasPrefix(resolvePath, "/run/") {
|
|
resolvePath = r.getPath(resolvePath)
|
|
err = os.MkdirAll(filepath.Dir(resolvePath), 0700)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create rootless-netns resolv.conf directory: %w", err)
|
|
}
|
|
// we want to bind mount on this file so we have to create the file first
|
|
_, err = os.OpenFile(resolvePath, os.O_CREATE|os.O_RDONLY, 0700)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create rootless-netns resolv.conf file: %w", err)
|
|
}
|
|
}
|
|
// mount resolv.conf to make use of the host dns
|
|
err = unix.Mount(r.getPath("resolv.conf"), resolvePath, "none", unix.MS_BIND, "")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to mount resolv.conf for rootless netns: %w", err)
|
|
}
|
|
|
|
// 4. CNI plugins need access to /var/lib/cni and /run
|
|
varDir := ""
|
|
varTarget := persistentCNIDir
|
|
// we can only mount to a target dir which exists, check /var/lib/cni recursively
|
|
// while we could always use /var there are cases where a user might store the cni
|
|
// configs under /var/custom and this would break
|
|
for {
|
|
if _, err := os.Stat(varTarget); err == nil {
|
|
varDir = r.getPath(varTarget)
|
|
break
|
|
}
|
|
varTarget = filepath.Dir(varTarget)
|
|
if varTarget == "/" {
|
|
break
|
|
}
|
|
}
|
|
if varDir == "" {
|
|
return errors.New("failed to stat /var directory")
|
|
}
|
|
// make sure to mount var first
|
|
err = unix.Mount(varDir, varTarget, "none", unix.MS_BIND, "")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to mount %s for rootless netns: %w", varTarget, err)
|
|
}
|
|
|
|
// 5. Mount the new prepared run dir to /run, it has to be recursive to keep the other bind mounts.
|
|
runDir := r.getPath("run")
|
|
err = unix.Mount(runDir, "/run", "none", unix.MS_BIND|unix.MS_REC, "")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to mount /run for rootless netns: %w", err)
|
|
}
|
|
|
|
// run the given function in the correct namespace
|
|
err = toRun()
|
|
return err
|
|
})
|
|
return err
|
|
}
|
|
|
|
// Clean up the rootless network namespace if needed.
|
|
// It checks if we have running containers with the bridge network mode.
|
|
// Cleanup() expects that r.Lock is locked
|
|
func (r *RootlessNetNS) Cleanup(runtime *Runtime) error {
|
|
_, err := os.Stat(r.dir)
|
|
if os.IsNotExist(err) {
|
|
// the directory does not exist, so no need for cleanup
|
|
return nil
|
|
}
|
|
activeNetns := func(c *Container) bool {
|
|
// no bridge => no need to check
|
|
if !c.config.NetMode.IsBridge() {
|
|
return false
|
|
}
|
|
|
|
// we cannot use c.state() because it will try to lock the container
|
|
// locking is a problem because cleanup is called after net teardown
|
|
// at this stage the container is already locked.
|
|
// also do not try to lock only containers which are not currently in net
|
|
// teardown because this will result in an ABBA deadlock between the rootless
|
|
// rootless netns lock and the container lock
|
|
// because we need to get the state we have to sync otherwise this will not
|
|
// work because the state is empty by default
|
|
// I do not like this but I do not see a better way at moment
|
|
err := c.syncContainer()
|
|
if err != nil {
|
|
return false
|
|
}
|
|
|
|
// only check for an active netns, we cannot use the container state
|
|
// because not running does not mean that the netns does not need cleanup
|
|
// only if the netns is empty we know that we do not need cleanup
|
|
return c.state.NetNS != ""
|
|
}
|
|
ctrs, err := runtime.GetContainers(false, activeNetns)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// no cleanup if we found no other containers with a netns
|
|
// we will always find one container (the container cleanup that is currently calling us)
|
|
if len(ctrs) > 1 {
|
|
return nil
|
|
}
|
|
logrus.Debug("Cleaning up rootless network namespace")
|
|
err = netns.UnmountNS(r.ns.Path())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// make the following errors not fatal
|
|
err = r.ns.Close()
|
|
if err != nil {
|
|
logrus.Error(err)
|
|
}
|
|
b, err := os.ReadFile(r.getPath(rootlessNetNsSilrp4netnsPidFile))
|
|
if err == nil {
|
|
var i int
|
|
i, err = strconv.Atoi(string(b))
|
|
if err == nil {
|
|
// kill the slirp process so we do not leak it
|
|
err = syscall.Kill(i, syscall.SIGTERM)
|
|
}
|
|
}
|
|
if err != nil {
|
|
logrus.Errorf("Failed to kill slirp4netns process: %v", err)
|
|
}
|
|
err = os.RemoveAll(r.dir)
|
|
if err != nil {
|
|
logrus.Error(err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// GetRootlessNetNs returns the rootless netns object. If create is set to true
|
|
// the rootless network namespace will be created if it does not already exist.
|
|
// If called as root it returns always nil.
|
|
// On success the returned RootlessCNI lock is locked and must be unlocked by the caller.
|
|
func (r *Runtime) GetRootlessNetNs(new bool) (*RootlessNetNS, error) {
|
|
if !rootless.IsRootless() {
|
|
return nil, nil
|
|
}
|
|
var rootlessNetNS *RootlessNetNS
|
|
runDir := r.config.Engine.TmpDir
|
|
|
|
lfile := filepath.Join(runDir, "rootless-netns.lock")
|
|
lock, err := lockfile.GetLockFile(lfile)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get rootless-netns lockfile: %w", err)
|
|
}
|
|
lock.Lock()
|
|
defer func() {
|
|
// In case of an error (early exit) rootlessNetNS will be nil.
|
|
// Make sure to unlock otherwise we could deadlock.
|
|
if rootlessNetNS == nil {
|
|
lock.Unlock()
|
|
}
|
|
}()
|
|
|
|
rootlessNetNsDir := filepath.Join(runDir, rootlessNetNsName)
|
|
err = os.MkdirAll(rootlessNetNsDir, 0700)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("could not create rootless-netns directory: %w", err)
|
|
}
|
|
|
|
nsDir, err := netns.GetNSRunDir()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// create a hash from the static dir
|
|
// the cleanup will check if there are running containers
|
|
// if you run a several libpod instances with different root/runroot directories this check will fail
|
|
// we want one netns for each libpod static dir so we use the hash to prevent name collisions
|
|
hash := sha256.Sum256([]byte(r.config.Engine.StaticDir))
|
|
netnsName := fmt.Sprintf("%s-%x", rootlessNetNsName, hash[:10])
|
|
|
|
path := filepath.Join(nsDir, netnsName)
|
|
nsReference, err := ns.GetNS(path)
|
|
if err != nil {
|
|
if !new {
|
|
// return an error if we could not get the namespace and should no create one
|
|
return nil, fmt.Errorf("getting rootless network namespace: %w", err)
|
|
}
|
|
|
|
// When the netns is not valid but the file exists we have to remove it first,
|
|
// https://github.com/containers/common/pull/1381 changed the behavior from
|
|
// NewNSWithName()so it will now error when the file already exists.
|
|
// https://github.com/containers/podman/issues/17903#issuecomment-1494329622
|
|
if errors.As(err, &ns.NSPathNotNSErr{}) {
|
|
logrus.Infof("rootless netns is no longer valid: %v", err)
|
|
// ignore errors, if something is wrong NewNSWithName() will fail below anyway
|
|
_ = os.Remove(path)
|
|
}
|
|
|
|
// create a new namespace
|
|
logrus.Debugf("creating rootless network namespace with name %q", netnsName)
|
|
nsReference, err = netns.NewNSWithName(netnsName)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("creating rootless network namespace: %w", err)
|
|
}
|
|
res, err := slirp4netns.Setup(&slirp4netns.SetupOptions{
|
|
Config: r.config,
|
|
ContainerID: "rootless-netns",
|
|
Netns: nsReference.Path(),
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to start rootless-netns slirp4netns: %w", err)
|
|
}
|
|
// create pid file for the slirp4netns process
|
|
// this is need to kill the process in the cleanup
|
|
pid := strconv.Itoa(res.Pid)
|
|
err = os.WriteFile(filepath.Join(rootlessNetNsDir, rootlessNetNsSilrp4netnsPidFile), []byte(pid), 0700)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("unable to write rootless-netns slirp4netns pid file: %w", err)
|
|
}
|
|
|
|
if utils.RunsOnSystemd() {
|
|
// move to systemd scope to prevent systemd from killing it
|
|
err = utils.MoveRootlessNetnsSlirpProcessToUserSlice(res.Pid)
|
|
if err != nil {
|
|
// only log this, it is not fatal but can lead to issues when running podman inside systemd units
|
|
logrus.Errorf("failed to move the rootless netns slirp4netns process to the systemd user.slice: %v", err)
|
|
}
|
|
}
|
|
|
|
// build a new resolv.conf file which uses the slirp4netns dns server address
|
|
resolveIP, err := slirp4netns.GetDNS(res.Subnet)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to determine default slirp4netns DNS address: %w", err)
|
|
}
|
|
|
|
if err := resolvconf.New(&resolvconf.Params{
|
|
Path: filepath.Join(rootlessNetNsDir, "resolv.conf"),
|
|
// fake the netns since we want to filter localhost
|
|
Namespaces: []specs.LinuxNamespace{
|
|
{Type: specs.NetworkNamespace},
|
|
},
|
|
IPv6Enabled: res.IPv6,
|
|
KeepHostServers: true,
|
|
Nameservers: []string{resolveIP.String()},
|
|
}); err != nil {
|
|
return nil, fmt.Errorf("failed to create rootless netns resolv.conf: %w", err)
|
|
}
|
|
// create cni directories to store files
|
|
// they will be bind mounted to the correct location in an extra mount ns
|
|
err = os.MkdirAll(filepath.Join(rootlessNetNsDir, persistentCNIDir), 0700)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("could not create rootless-netns var directory: %w", err)
|
|
}
|
|
runDir := filepath.Join(rootlessNetNsDir, "run")
|
|
err = os.MkdirAll(runDir, 0700)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("could not create rootless-netns run directory: %w", err)
|
|
}
|
|
// relabel the new run directory to the iptables /run label
|
|
// this is important, otherwise the iptables command will fail
|
|
err = label.Relabel(runDir, "system_u:object_r:iptables_var_run_t:s0", false)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("could not create relabel rootless-netns run directory: %w", err)
|
|
}
|
|
// create systemd run directory
|
|
err = os.MkdirAll(filepath.Join(runDir, "systemd"), 0700)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("could not create rootless-netns systemd directory: %w", err)
|
|
}
|
|
// create the directory for the netns files at the same location
|
|
// relative to the rootless-netns location
|
|
err = os.MkdirAll(filepath.Join(rootlessNetNsDir, nsDir), 0700)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("could not create rootless-netns netns directory: %w", err)
|
|
}
|
|
}
|
|
|
|
// The CNI plugins and netavark need access to iptables in $PATH. As it turns out debian doesn't put
|
|
// /usr/sbin in $PATH for rootless users. This will break rootless networking completely.
|
|
// We might break existing users and we cannot expect everyone to change their $PATH so
|
|
// let's add /usr/sbin to $PATH ourselves.
|
|
path = os.Getenv("PATH")
|
|
if !strings.Contains(path, "/usr/sbin") {
|
|
path += ":/usr/sbin"
|
|
os.Setenv("PATH", path)
|
|
}
|
|
|
|
// Important set rootlessNetNS as last step.
|
|
// Do not return any errors after this.
|
|
rootlessNetNS = &RootlessNetNS{
|
|
ns: nsReference,
|
|
dir: rootlessNetNsDir,
|
|
Lock: lock,
|
|
}
|
|
return rootlessNetNS, nil
|
|
}
|
|
|
|
// Create and configure a new network namespace for a container
|
|
func (r *Runtime) configureNetNS(ctr *Container, ctrNS string) (status map[string]types.StatusBlock, rerr error) {
|
|
if err := r.exposeMachinePorts(ctr.config.PortMappings); err != nil {
|
|
return nil, err
|
|
}
|
|
defer func() {
|
|
// make sure to unexpose the gvproxy ports when an error happens
|
|
if rerr != nil {
|
|
if err := r.unexposeMachinePorts(ctr.config.PortMappings); err != nil {
|
|
logrus.Errorf("failed to free gvproxy machine ports: %v", err)
|
|
}
|
|
}
|
|
}()
|
|
if ctr.config.NetMode.IsSlirp4netns() {
|
|
return nil, r.setupSlirp4netns(ctr, ctrNS)
|
|
}
|
|
if ctr.config.NetMode.IsPasta() {
|
|
return nil, r.setupPasta(ctr, ctrNS)
|
|
}
|
|
networks, err := ctr.networks()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
// All networks have been removed from the container.
|
|
// This is effectively forcing net=none.
|
|
if len(networks) == 0 {
|
|
return nil, nil
|
|
}
|
|
|
|
netOpts := ctr.getNetworkOptions(networks)
|
|
netStatus, err := r.setUpNetwork(ctrNS, netOpts)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer func() {
|
|
// do not forget to tear down the netns when a later error happened.
|
|
if rerr != nil {
|
|
if err := r.teardownNetworkBackend(ctrNS, netOpts); err != nil {
|
|
logrus.Warnf("failed to teardown network after failed setup: %v", err)
|
|
}
|
|
}
|
|
}()
|
|
|
|
// set up rootless port forwarder when rootless with ports and the network status is empty,
|
|
// if this is called from network reload the network status will not be empty and we should
|
|
// not set up port because they are still active
|
|
if rootless.IsRootless() && len(ctr.config.PortMappings) > 0 && ctr.getNetworkStatus() == nil {
|
|
// set up port forwarder for rootless netns
|
|
// TODO: support slirp4netns port forwarder as well
|
|
// make sure to fix this in container.handleRestartPolicy() as well
|
|
// Important we have to call this after r.setUpNetwork() so that
|
|
// we can use the proper netStatus
|
|
err = r.setupRootlessPortMappingViaRLK(ctr, ctrNS, netStatus)
|
|
}
|
|
return netStatus, err
|
|
}
|
|
|
|
// Create and configure a new network namespace for a container
|
|
func (r *Runtime) createNetNS(ctr *Container) (n string, q map[string]types.StatusBlock, retErr error) {
|
|
ctrNS, err := netns.NewNS()
|
|
if err != nil {
|
|
return "", nil, fmt.Errorf("creating network namespace for container %s: %w", ctr.ID(), err)
|
|
}
|
|
defer func() {
|
|
if retErr != nil {
|
|
if err := netns.UnmountNS(ctrNS.Path()); err != nil {
|
|
logrus.Errorf("Unmounting partially created network namespace for container %s: %v", ctr.ID(), err)
|
|
}
|
|
if err := ctrNS.Close(); err != nil {
|
|
logrus.Errorf("Closing partially created network namespace for container %s: %v", ctr.ID(), err)
|
|
}
|
|
}
|
|
}()
|
|
|
|
logrus.Debugf("Made network namespace at %s for container %s", ctrNS.Path(), ctr.ID())
|
|
|
|
var networkStatus map[string]types.StatusBlock
|
|
networkStatus, err = r.configureNetNS(ctr, ctrNS.Path())
|
|
return ctrNS.Path(), networkStatus, err
|
|
}
|
|
|
|
// Configure the network namespace using the container process
|
|
func (r *Runtime) setupNetNS(ctr *Container) error {
|
|
nsProcess := fmt.Sprintf("/proc/%d/ns/net", ctr.state.PID)
|
|
|
|
b := make([]byte, 16)
|
|
|
|
if _, err := rand.Reader.Read(b); err != nil {
|
|
return fmt.Errorf("failed to generate random netns name: %w", err)
|
|
}
|
|
nsPath, err := netns.GetNSRunDir()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
nsPath = filepath.Join(nsPath, fmt.Sprintf("netns-%x-%x-%x-%x-%x", b[0:4], b[4:6], b[6:8], b[8:10], b[10:]))
|
|
|
|
if err := os.MkdirAll(filepath.Dir(nsPath), 0711); err != nil {
|
|
return err
|
|
}
|
|
|
|
mountPointFd, err := os.Create(nsPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := mountPointFd.Close(); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := unix.Mount(nsProcess, nsPath, "none", unix.MS_BIND, ""); err != nil {
|
|
return fmt.Errorf("cannot mount %s: %w", nsPath, err)
|
|
}
|
|
|
|
networkStatus, err := r.configureNetNS(ctr, nsPath)
|
|
|
|
// Assign NetNS attributes to container
|
|
ctr.state.NetNS = nsPath
|
|
ctr.state.NetworkStatus = networkStatus
|
|
return err
|
|
}
|
|
|
|
// Tear down a network namespace, undoing all state associated with it.
|
|
func (r *Runtime) teardownNetNS(ctr *Container) error {
|
|
if err := r.unexposeMachinePorts(ctr.config.PortMappings); err != nil {
|
|
// do not return an error otherwise we would prevent network cleanup
|
|
logrus.Errorf("failed to free gvproxy machine ports: %v", err)
|
|
}
|
|
|
|
// Do not check the error here, we want to always umount the netns
|
|
// This will ensure that the container interface will be deleted
|
|
// even when there is a CNI or netavark bug.
|
|
prevErr := r.teardownNetwork(ctr)
|
|
|
|
// First unmount the namespace
|
|
if err := netns.UnmountNS(ctr.state.NetNS); err != nil {
|
|
if prevErr != nil {
|
|
logrus.Error(prevErr)
|
|
}
|
|
return fmt.Errorf("unmounting network namespace for container %s: %w", ctr.ID(), err)
|
|
}
|
|
|
|
ctr.state.NetNS = ""
|
|
|
|
return prevErr
|
|
}
|
|
|
|
func getContainerNetNS(ctr *Container) (string, *Container, error) {
|
|
if ctr.state.NetNS != "" {
|
|
return ctr.state.NetNS, nil, nil
|
|
}
|
|
if ctr.config.NetNsCtr != "" {
|
|
c, err := ctr.runtime.GetContainer(ctr.config.NetNsCtr)
|
|
if err != nil {
|
|
return "", nil, err
|
|
}
|
|
if err = c.syncContainer(); err != nil {
|
|
return "", c, err
|
|
}
|
|
netNs, c2, err := getContainerNetNS(c)
|
|
if c2 != nil {
|
|
c = c2
|
|
}
|
|
return netNs, c, err
|
|
}
|
|
return "", nil, nil
|
|
}
|
|
|
|
// TODO (5.0): return the statistics per network interface
|
|
// This would allow better compat with docker.
|
|
func getContainerNetIO(ctr *Container) (*netlink.LinkStatistics, error) {
|
|
var netStats *netlink.LinkStatistics
|
|
|
|
netNSPath, otherCtr, netPathErr := getContainerNetNS(ctr)
|
|
if netPathErr != nil {
|
|
return nil, netPathErr
|
|
}
|
|
if netNSPath == "" {
|
|
// If netNSPath is empty, it was set as none, and no netNS was set up
|
|
// this is a valid state and thus return no error, nor any statistics
|
|
return nil, nil
|
|
}
|
|
|
|
netMode := ctr.config.NetMode
|
|
netStatus := ctr.getNetworkStatus()
|
|
if otherCtr != nil {
|
|
netMode = otherCtr.config.NetMode
|
|
netStatus = otherCtr.getNetworkStatus()
|
|
}
|
|
if netMode.IsSlirp4netns() {
|
|
// create a fake status with correct interface name for the logic below
|
|
netStatus = map[string]types.StatusBlock{
|
|
"slirp4netns": {
|
|
Interfaces: map[string]types.NetInterface{"tap0": {}},
|
|
},
|
|
}
|
|
}
|
|
err := ns.WithNetNSPath(netNSPath, func(_ ns.NetNS) error {
|
|
for _, status := range netStatus {
|
|
for dev := range status.Interfaces {
|
|
link, err := netlink.LinkByName(dev)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if netStats == nil {
|
|
netStats = link.Attrs().Statistics
|
|
continue
|
|
}
|
|
// Currently only Tx/RxBytes are used.
|
|
// In the future we should return all stats per interface so that
|
|
// api users have a better options.
|
|
stats := link.Attrs().Statistics
|
|
netStats.TxBytes += stats.TxBytes
|
|
netStats.RxBytes += stats.RxBytes
|
|
}
|
|
}
|
|
return nil
|
|
})
|
|
return netStats, err
|
|
}
|
|
|
|
// joinedNetworkNSPath returns netns path and bool if netns was set
|
|
func (c *Container) joinedNetworkNSPath() (string, bool) {
|
|
for _, namespace := range c.config.Spec.Linux.Namespaces {
|
|
if namespace.Type == specs.NetworkNamespace {
|
|
return namespace.Path, true
|
|
}
|
|
}
|
|
return "", false
|
|
}
|
|
|
|
func (c *Container) inspectJoinedNetworkNS(networkns string) (q types.StatusBlock, retErr error) {
|
|
var result types.StatusBlock
|
|
err := ns.WithNetNSPath(networkns, func(_ ns.NetNS) error {
|
|
ifaces, err := net.Interfaces()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
routes, err := netlink.RouteList(nil, netlink.FAMILY_ALL)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
var gateway net.IP
|
|
for _, route := range routes {
|
|
// default gateway
|
|
if route.Dst == nil {
|
|
gateway = route.Gw
|
|
}
|
|
}
|
|
result.Interfaces = make(map[string]types.NetInterface)
|
|
for _, iface := range ifaces {
|
|
if iface.Flags&net.FlagLoopback != 0 {
|
|
continue
|
|
}
|
|
addrs, err := iface.Addrs()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if len(addrs) == 0 {
|
|
continue
|
|
}
|
|
subnets := make([]types.NetAddress, 0, len(addrs))
|
|
for _, address := range addrs {
|
|
if ipnet, ok := address.(*net.IPNet); ok {
|
|
if ipnet.IP.IsLinkLocalMulticast() || ipnet.IP.IsLinkLocalUnicast() {
|
|
continue
|
|
}
|
|
subnet := types.NetAddress{
|
|
IPNet: types.IPNet{
|
|
IPNet: *ipnet,
|
|
},
|
|
}
|
|
if ipnet.Contains(gateway) {
|
|
subnet.Gateway = gateway
|
|
}
|
|
subnets = append(subnets, subnet)
|
|
}
|
|
}
|
|
result.Interfaces[iface.Name] = types.NetInterface{
|
|
Subnets: subnets,
|
|
MacAddress: types.HardwareAddr(iface.HardwareAddr),
|
|
}
|
|
}
|
|
return nil
|
|
})
|
|
return result, err
|
|
}
|
|
|
|
func getPastaIP(state *ContainerState) (net.IP, error) {
|
|
var ip string
|
|
err := ns.WithNetNSPath(state.NetNS, func(_ ns.NetNS) error {
|
|
// get the first ip in the netns
|
|
ip = netUtil.GetLocalIP()
|
|
return nil
|
|
})
|
|
return net.ParseIP(ip), err
|
|
}
|