//go:build linux package cgroups import ( "bufio" "bytes" "context" "errors" "fmt" "math" "os" "path/filepath" "strconv" "strings" "github.com/containers/storage/pkg/unshare" systemdDbus "github.com/coreos/go-systemd/v22/dbus" "github.com/godbus/dbus/v5" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/configs" "github.com/sirupsen/logrus" "golang.org/x/exp/maps" ) var ( // ErrCgroupDeleted means the cgroup was deleted ErrCgroupDeleted = errors.New("cgroup deleted") // ErrCgroupV1Rootless means the cgroup v1 were attempted to be used in rootless environment ErrCgroupV1Rootless = errors.New("no support for CGroups V1 in rootless environments") ErrStatCgroup = errors.New("no cgroup available for gathering user statistics") ) // CgroupControl controls a cgroup hierarchy type CgroupControl struct { cgroup2 bool config *configs.Cgroup systemd bool // List of additional cgroup subsystems joined that // do not have a custom handler. additionalControllers []controller } type controller struct { name string symlink bool } type controllerHandler interface { Create(*CgroupControl) (bool, error) Apply(*CgroupControl, *configs.Resources) error Destroy(*CgroupControl) error Stat(*CgroupControl, *cgroups.Stats) error } const ( cgroupRoot = "/sys/fs/cgroup" // CPU is the cpu controller CPU = "cpu" // CPUAcct is the cpuacct controller CPUAcct = "cpuacct" // CPUset is the cpuset controller CPUset = "cpuset" // Memory is the memory controller Memory = "memory" // Pids is the pids controller Pids = "pids" // Blkio is the blkio controller Blkio = "blkio" ) var handlers map[string]controllerHandler func init() { handlers = map[string]controllerHandler{ CPU: getCPUHandler(), CPUset: getCpusetHandler(), Memory: getMemoryHandler(), Pids: getPidsHandler(), Blkio: getBlkioHandler(), } } // getAvailableControllers get the available controllers func getAvailableControllers(exclude map[string]controllerHandler, cgroup2 bool) ([]controller, error) { if cgroup2 { controllers := []controller{} controllersFile := cgroupRoot + "/cgroup.controllers" // rootless cgroupv2: check available controllers for current user, systemd or servicescope will inherit if unshare.IsRootless() { userSlice, err := getCgroupPathForCurrentProcess() if err != nil { return controllers, err } // userSlice already contains '/' so not adding here basePath := cgroupRoot + userSlice controllersFile = fmt.Sprintf("%s/cgroup.controllers", basePath) } controllersFileBytes, err := os.ReadFile(controllersFile) if err != nil { return nil, fmt.Errorf("failed while reading controllers for cgroup v2: %w", err) } for _, controllerName := range strings.Fields(string(controllersFileBytes)) { c := controller{ name: controllerName, symlink: false, } controllers = append(controllers, c) } return controllers, nil } subsystems, _ := cgroupV1GetAllSubsystems() controllers := []controller{} // cgroupv1 and rootless: No subsystem is available: delegation is unsafe. if unshare.IsRootless() { return controllers, nil } for _, name := range subsystems { if _, found := exclude[name]; found { continue } fileInfo, err := os.Stat(cgroupRoot + "/" + name) if err != nil { continue } c := controller{ name: name, symlink: !fileInfo.IsDir(), } controllers = append(controllers, c) } return controllers, nil } // AvailableControllers get string:bool map of all the available controllers func AvailableControllers(exclude map[string]controllerHandler, cgroup2 bool) ([]string, error) { availableControllers, err := getAvailableControllers(exclude, cgroup2) if err != nil { return nil, err } controllerList := []string{} for _, controller := range availableControllers { controllerList = append(controllerList, controller.name) } return controllerList, nil } func cgroupV1GetAllSubsystems() ([]string, error) { f, err := os.Open("/proc/cgroups") if err != nil { return nil, err } defer f.Close() subsystems := []string{} s := bufio.NewScanner(f) for s.Scan() { text := s.Text() if text[0] != '#' { parts := strings.Fields(text) if len(parts) >= 4 && parts[3] != "0" { subsystems = append(subsystems, parts[0]) } } } if err := s.Err(); err != nil { return nil, err } return subsystems, nil } func getCgroupPathForCurrentProcess() (string, error) { path := fmt.Sprintf("/proc/%d/cgroup", os.Getpid()) f, err := os.Open(path) if err != nil { return "", err } defer f.Close() cgroupPath := "" s := bufio.NewScanner(f) for s.Scan() { text := s.Text() procEntries := strings.SplitN(text, "::", 2) // set process cgroupPath only if entry is valid if len(procEntries) > 1 { cgroupPath = procEntries[1] } } if err := s.Err(); err != nil { return cgroupPath, err } return cgroupPath, nil } // getCgroupv1Path is a helper function to get the cgroup v1 path func (c *CgroupControl) getCgroupv1Path(name string) string { return filepath.Join(cgroupRoot, name, c.config.Path) } // initialize initializes the specified hierarchy func (c *CgroupControl) initialize() (err error) { createdSoFar := map[string]controllerHandler{} defer func() { if err != nil { for name, ctr := range createdSoFar { if err := ctr.Destroy(c); err != nil { logrus.Warningf("error cleaning up controller %s for %s", name, c.config.Path) } } } }() if c.cgroup2 { if err := createCgroupv2Path(filepath.Join(cgroupRoot, c.config.Path)); err != nil { return fmt.Errorf("creating cgroup path %s: %w", c.config.Path, err) } } for name, handler := range handlers { created, err := handler.Create(c) if err != nil { return err } if created { createdSoFar[name] = handler } } if !c.cgroup2 { // We won't need to do this for cgroup v2 for _, ctr := range c.additionalControllers { if ctr.symlink { continue } path := c.getCgroupv1Path(ctr.name) if err := os.MkdirAll(path, 0o755); err != nil { return fmt.Errorf("creating cgroup path for %s: %w", ctr.name, err) } } } return nil } func readFileAsUint64(path string) (uint64, error) { data, err := os.ReadFile(path) if err != nil { return 0, err } v := cleanString(string(data)) if v == "max" { return math.MaxUint64, nil } ret, err := strconv.ParseUint(v, 10, 64) if err != nil { return ret, fmt.Errorf("parse %s from %s: %w", v, path, err) } return ret, nil } func readFileByKeyAsUint64(path, key string) (uint64, error) { content, err := os.ReadFile(path) if err != nil { return 0, err } for _, line := range strings.Split(string(content), "\n") { fields := strings.SplitN(line, " ", 2) if fields[0] == key { v := cleanString(fields[1]) if v == "max" { return math.MaxUint64, nil } ret, err := strconv.ParseUint(v, 10, 64) if err != nil { return ret, fmt.Errorf("parse %s from %s: %w", v, path, err) } return ret, nil } } return 0, fmt.Errorf("no key named %s from %s", key, path) } // New creates a new cgroup control func New(path string, resources *configs.Resources) (*CgroupControl, error) { cgroup2, err := IsCgroup2UnifiedMode() if err != nil { return nil, err } control := &CgroupControl{ cgroup2: cgroup2, config: &configs.Cgroup{ Path: path, Resources: resources, }, } if !cgroup2 { controllers, err := getAvailableControllers(handlers, false) if err != nil { return nil, err } control.additionalControllers = controllers } if err := control.initialize(); err != nil { return nil, err } return control, nil } // NewSystemd creates a new cgroup control func NewSystemd(path string, resources *configs.Resources) (*CgroupControl, error) { cgroup2, err := IsCgroup2UnifiedMode() if err != nil { return nil, err } control := &CgroupControl{ cgroup2: cgroup2, systemd: true, config: &configs.Cgroup{ Path: path, Resources: resources, Rootless: unshare.IsRootless(), }, } return control, nil } // Load loads an existing cgroup control func Load(path string) (*CgroupControl, error) { cgroup2, err := IsCgroup2UnifiedMode() if err != nil { return nil, err } control := &CgroupControl{ cgroup2: cgroup2, systemd: false, config: &configs.Cgroup{ Path: path, }, } if !cgroup2 { controllers, err := getAvailableControllers(handlers, false) if err != nil { return nil, err } control.additionalControllers = controllers } if !cgroup2 { oneExists := false // check that the cgroup exists at least under one controller for name := range handlers { p := control.getCgroupv1Path(name) if _, err := os.Stat(p); err == nil { oneExists = true break } } // if there is no controller at all, raise an error if !oneExists { if unshare.IsRootless() { return nil, ErrCgroupV1Rootless } // compatible with the error code // used by containerd/cgroups return nil, ErrCgroupDeleted } } return control, nil } // CreateSystemdUnit creates the systemd cgroup func (c *CgroupControl) CreateSystemdUnit(path string) error { if !c.systemd { return fmt.Errorf("the cgroup controller is not using systemd") } conn, err := systemdDbus.NewWithContext(context.TODO()) if err != nil { return err } defer conn.Close() return systemdCreate(c.config.Resources, path, conn) } // CreateSystemdUserUnit creates the systemd cgroup for the specified user func (c *CgroupControl) CreateSystemdUserUnit(path string, uid int) error { if !c.systemd { return fmt.Errorf("the cgroup controller is not using systemd") } conn, err := UserConnection(uid) if err != nil { return err } defer conn.Close() return systemdCreate(c.config.Resources, path, conn) } func dbusAuthConnection(uid int, createBus func(opts ...dbus.ConnOption) (*dbus.Conn, error)) (*dbus.Conn, error) { conn, err := createBus() if err != nil { return nil, err } methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))} err = conn.Auth(methods) if err != nil { conn.Close() return nil, err } if err := conn.Hello(); err != nil { return nil, err } return conn, nil } // Delete cleans a cgroup func (c *CgroupControl) Delete() error { return c.DeleteByPath(c.config.Path) } // DeleteByPathConn deletes the specified cgroup path using the specified // dbus connection if needed. func (c *CgroupControl) DeleteByPathConn(path string, conn *systemdDbus.Conn) error { if c.systemd { return systemdDestroyConn(path, conn) } if c.cgroup2 { return rmDirRecursively(filepath.Join(cgroupRoot, c.config.Path)) } var lastError error for _, h := range handlers { if err := h.Destroy(c); err != nil { lastError = err } } for _, ctr := range c.additionalControllers { if ctr.symlink { continue } p := c.getCgroupv1Path(ctr.name) if err := rmDirRecursively(p); err != nil { lastError = fmt.Errorf("remove %s: %w", p, err) } } return lastError } // DeleteByPath deletes the specified cgroup path func (c *CgroupControl) DeleteByPath(path string) error { if c.systemd { conn, err := systemdDbus.NewWithContext(context.TODO()) if err != nil { return err } defer conn.Close() return c.DeleteByPathConn(path, conn) } return c.DeleteByPathConn(path, nil) } // Update updates the cgroups func (c *CgroupControl) Update(resources *configs.Resources) error { for _, h := range handlers { if err := h.Apply(c, resources); err != nil { return err } } return nil } // AddPid moves the specified pid to the cgroup func (c *CgroupControl) AddPid(pid int) error { pidString := []byte(fmt.Sprintf("%d\n", pid)) if c.cgroup2 { path := filepath.Join(cgroupRoot, c.config.Path) return fs2.CreateCgroupPath(path, c.config) } names := maps.Keys(handlers) for _, c := range c.additionalControllers { if !c.symlink { names = append(names, c.name) } } for _, n := range names { // If we aren't using cgroup2, we won't write correctly to unified hierarchy if !c.cgroup2 && n == "unified" { continue } p := filepath.Join(c.getCgroupv1Path(n), "tasks") if err := os.WriteFile(p, pidString, 0o644); err != nil { return fmt.Errorf("write %s: %w", p, err) } } return nil } // Stat returns usage statistics for the cgroup func (c *CgroupControl) Stat() (*cgroups.Stats, error) { m := cgroups.Stats{} found := false for _, h := range handlers { if err := h.Stat(c, &m); err != nil { if !errors.Is(err, os.ErrNotExist) { return nil, err } logrus.Warningf("Failed to retrieve cgroup stats: %v", err) continue } found = true } if !found { return nil, ErrStatCgroup } return &m, nil } func readCgroupMapPath(path string) (map[string][]string, error) { ret := map[string][]string{} f, err := os.Open(path) if err != nil { if errors.Is(err, os.ErrNotExist) { return ret, nil } return nil, fmt.Errorf("open file %s: %w", path, err) } defer f.Close() scanner := bufio.NewScanner(f) for scanner.Scan() { line := scanner.Text() parts := strings.Fields(line) if len(parts) < 2 { continue } ret[parts[0]] = parts[1:] } if err := scanner.Err(); err != nil { return nil, fmt.Errorf("parsing file %s: %w", path, err) } return ret, nil } func readCgroup2MapFile(ctr *CgroupControl, name string) (map[string][]string, error) { p := filepath.Join(cgroupRoot, ctr.config.Path, name) return readCgroupMapPath(p) } func (c *CgroupControl) createCgroupDirectory(controller string) (bool, error) { cPath := c.getCgroupv1Path(controller) _, err := os.Stat(cPath) if err == nil { return false, nil } if !errors.Is(err, os.ErrNotExist) { return false, err } if err := os.MkdirAll(cPath, 0o755); err != nil { return false, fmt.Errorf("creating cgroup for %s: %w", controller, err) } return true, nil } var TestMode bool func createCgroupv2Path(path string) (deferredError error) { if !strings.HasPrefix(path, cgroupRoot+"/") { return fmt.Errorf("invalid cgroup path %s", path) } content, err := os.ReadFile(cgroupRoot + "/cgroup.controllers") if err != nil { return err } ctrs := bytes.Fields(content) res := append([]byte("+"), bytes.Join(ctrs, []byte(" +"))...) current := "/sys/fs" elements := strings.Split(path, "/") for i, e := range elements[3:] { current = filepath.Join(current, e) if i > 0 { if err := os.Mkdir(current, 0o755); err != nil { if !os.IsExist(err) { return err } } else { // If the directory was created, be sure it is not left around on errors. defer func() { if deferredError != nil { os.Remove(current) } }() } } // We enable the controllers for all the path components except the last one. It is not allowed to add // PIDs if there are already enabled controllers. if i < len(elements[3:])-1 { if err := os.WriteFile(filepath.Join(current, "cgroup.subtree_control"), res, 0o755); err != nil { return err } } } return nil } func cleanString(s string) string { return strings.Trim(s, "\n") } func readAcct(ctr *CgroupControl, name string) (uint64, error) { p := filepath.Join(ctr.getCgroupv1Path(CPUAcct), name) return readFileAsUint64(p) } func readAcctList(ctr *CgroupControl, name string) ([]uint64, error) { p := filepath.Join(ctr.getCgroupv1Path(CPUAcct), name) data, err := os.ReadFile(p) if err != nil { return nil, err } r := []uint64{} for _, s := range strings.Split(string(data), " ") { s = cleanString(s) if s == "" { break } v, err := strconv.ParseUint(s, 10, 64) if err != nil { return nil, fmt.Errorf("parsing %s: %w", s, err) } r = append(r, v) } return r, nil } func cpusetCopyFromParent(path string, cgroupv2 bool) error { for _, file := range []string{"cpuset.cpus", "cpuset.mems"} { if _, err := cpusetCopyFileFromParent(path, file, cgroupv2); err != nil { return err } } return nil } func cpusetCopyFileFromParent(dir, file string, cgroupv2 bool) ([]byte, error) { if dir == cgroupRoot { return nil, fmt.Errorf("could not find parent to initialize cpuset %s", file) } path := filepath.Join(dir, file) parentPath := path if cgroupv2 { parentPath = fmt.Sprintf("%s.effective", parentPath) } data, err := os.ReadFile(parentPath) if err != nil { // if the file doesn't exist, it is likely that the cpuset controller // is not enabled in the kernel. if os.IsNotExist(err) { return nil, nil } return nil, err } if strings.Trim(string(data), "\n") != "" { return data, nil } data, err = cpusetCopyFileFromParent(filepath.Dir(dir), file, cgroupv2) if err != nil { return nil, err } if err := os.WriteFile(path, data, 0o644); err != nil { return nil, fmt.Errorf("write %s: %w", path, err) } return data, nil } // SystemCPUUsage returns the system usage for all the cgroups func SystemCPUUsage() (uint64, error) { cgroupv2, err := IsCgroup2UnifiedMode() if err != nil { return 0, err } if !cgroupv2 { p := filepath.Join(cgroupRoot, CPUAcct, "cpuacct.usage") return readFileAsUint64(p) } files, err := os.ReadDir(cgroupRoot) if err != nil { return 0, err } var total uint64 for _, file := range files { if !file.IsDir() { continue } p := filepath.Join(cgroupRoot, file.Name(), "cpu.stat") values, err := readCgroupMapPath(p) if err != nil { return 0, err } if val, found := values["usage_usec"]; found { v, err := strconv.ParseUint(cleanString(val[0]), 10, 64) if err != nil { return 0, err } total += v * 1000 } } return total, nil }