Merge pull request #14876 from cdoern/cgroup

resource limits for pods
This commit is contained in:
OpenShift Merge Robot
2022-07-21 23:01:03 +02:00
committed by GitHub
15 changed files with 530 additions and 227 deletions

View File

@ -56,22 +56,6 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
)
_ = cmd.RegisterFlagCompletionFunc(authfileFlagName, completion.AutocompleteDefault)
blkioWeightFlagName := "blkio-weight"
createFlags.StringVar(
&cf.BlkIOWeight,
blkioWeightFlagName, "",
"Block IO weight (relative weight) accepts a weight value between 10 and 1000.",
)
_ = cmd.RegisterFlagCompletionFunc(blkioWeightFlagName, completion.AutocompleteNone)
blkioWeightDeviceFlagName := "blkio-weight-device"
createFlags.StringSliceVar(
&cf.BlkIOWeightDevice,
blkioWeightDeviceFlagName, []string{},
"Block IO weight (relative device weight, format: `DEVICE_NAME:WEIGHT`)",
)
_ = cmd.RegisterFlagCompletionFunc(blkioWeightDeviceFlagName, completion.AutocompleteDefault)
capAddFlagName := "cap-add"
createFlags.StringSliceVar(
&cf.CapAdd,
@ -127,14 +111,6 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
)
_ = cmd.RegisterFlagCompletionFunc(deviceReadIopsFlagName, completion.AutocompleteDefault)
deviceWriteBpsFlagName := "device-write-bps"
createFlags.StringSliceVar(
&cf.DeviceWriteBPs,
deviceWriteBpsFlagName, []string{},
"Limit write rate (bytes per second) to a device (e.g. --device-write-bps=/dev/sda:1mb)",
)
_ = cmd.RegisterFlagCompletionFunc(deviceWriteBpsFlagName, completion.AutocompleteDefault)
deviceWriteIopsFlagName := "device-write-iops"
createFlags.StringSliceVar(
&cf.DeviceWriteIOPs,
@ -783,14 +759,6 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
)
_ = cmd.RegisterFlagCompletionFunc(deviceFlagName, completion.AutocompleteDefault)
deviceReadBpsFlagName := "device-read-bps"
createFlags.StringSliceVar(
&cf.DeviceReadBPs,
deviceReadBpsFlagName, []string{},
"Limit read rate (bytes per second) from a device (e.g. --device-read-bps=/dev/sda:1mb)",
)
_ = cmd.RegisterFlagCompletionFunc(deviceReadBpsFlagName, completion.AutocompleteDefault)
volumesFromFlagName := "volumes-from"
createFlags.StringArrayVar(
&cf.VolumesFrom,
@ -848,22 +816,6 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
)
_ = cmd.RegisterFlagCompletionFunc(cpuRtRuntimeFlagName, completion.AutocompleteNone)
cpuSharesFlagName := "cpu-shares"
createFlags.Uint64VarP(
&cf.CPUShares,
cpuSharesFlagName, "c", 0,
"CPU shares (relative weight)",
)
_ = cmd.RegisterFlagCompletionFunc(cpuSharesFlagName, completion.AutocompleteNone)
cpusetMemsFlagName := "cpuset-mems"
createFlags.StringVar(
&cf.CPUSetMems,
cpusetMemsFlagName, "",
"Memory nodes (MEMs) in which to allow execution (0-3, 0,1). Only effective on NUMA systems.",
)
_ = cmd.RegisterFlagCompletionFunc(cpusetMemsFlagName, completion.AutocompleteNone)
memoryReservationFlagName := "memory-reservation"
createFlags.StringVar(
&cf.MemoryReservation,
@ -872,14 +824,6 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
)
_ = cmd.RegisterFlagCompletionFunc(memoryReservationFlagName, completion.AutocompleteNone)
memorySwapFlagName := "memory-swap"
createFlags.StringVar(
&cf.MemorySwap,
memorySwapFlagName, "",
"Swap limit equal to memory plus swap: '-1' to enable unlimited swap",
)
_ = cmd.RegisterFlagCompletionFunc(memorySwapFlagName, completion.AutocompleteNone)
memorySwappinessFlagName := "memory-swappiness"
createFlags.Int64Var(
&cf.MemorySwappiness,
@ -913,4 +857,60 @@ func DefineCreateFlags(cmd *cobra.Command, cf *entities.ContainerCreateOptions,
"Memory limit "+sizeWithUnitFormat,
)
_ = cmd.RegisterFlagCompletionFunc(memoryFlagName, completion.AutocompleteNone)
cpuSharesFlagName := "cpu-shares"
createFlags.Uint64VarP(
&cf.CPUShares,
cpuSharesFlagName, "c", 0,
"CPU shares (relative weight)",
)
_ = cmd.RegisterFlagCompletionFunc(cpuSharesFlagName, completion.AutocompleteNone)
cpusetMemsFlagName := "cpuset-mems"
createFlags.StringVar(
&cf.CPUSetMems,
cpusetMemsFlagName, "",
"Memory nodes (MEMs) in which to allow execution (0-3, 0,1). Only effective on NUMA systems.",
)
_ = cmd.RegisterFlagCompletionFunc(cpusetMemsFlagName, completion.AutocompleteNone)
memorySwapFlagName := "memory-swap"
createFlags.StringVar(
&cf.MemorySwap,
memorySwapFlagName, "",
"Swap limit equal to memory plus swap: '-1' to enable unlimited swap",
)
_ = cmd.RegisterFlagCompletionFunc(memorySwapFlagName, completion.AutocompleteNone)
deviceReadBpsFlagName := "device-read-bps"
createFlags.StringSliceVar(
&cf.DeviceReadBPs,
deviceReadBpsFlagName, []string{},
"Limit read rate (bytes per second) from a device (e.g. --device-read-bps=/dev/sda:1mb)",
)
_ = cmd.RegisterFlagCompletionFunc(deviceReadBpsFlagName, completion.AutocompleteDefault)
deviceWriteBpsFlagName := "device-write-bps"
createFlags.StringSliceVar(
&cf.DeviceWriteBPs,
deviceWriteBpsFlagName, []string{},
"Limit write rate (bytes per second) to a device (e.g. --device-write-bps=/dev/sda:1mb)",
)
_ = cmd.RegisterFlagCompletionFunc(deviceWriteBpsFlagName, completion.AutocompleteDefault)
blkioWeightFlagName := "blkio-weight"
createFlags.StringVar(
&cf.BlkIOWeight,
blkioWeightFlagName, "",
"Block IO weight (relative weight) accepts a weight value between 10 and 1000.",
)
_ = cmd.RegisterFlagCompletionFunc(blkioWeightFlagName, completion.AutocompleteNone)
blkioWeightDeviceFlagName := "blkio-weight-device"
createFlags.StringSliceVar(
&cf.BlkIOWeightDevice,
blkioWeightDeviceFlagName, []string{},
"Block IO weight (relative device weight, format: `DEVICE_NAME:WEIGHT`)",
)
_ = cmd.RegisterFlagCompletionFunc(blkioWeightDeviceFlagName, completion.AutocompleteDefault)
}

View File

@ -11,6 +11,14 @@ podman\-container\-clone - Creates a copy of an existing container
## OPTIONS
#### **--blkio-weight**=*weight*
Block IO weight (relative weight) accepts a weight value between 10 and 1000.
#### **--blkio-weight-device**=*device:weight*
Block IO weight (relative device weight, format: `DEVICE_NAME:WEIGHT`).
#### **--cpu-period**=*limit*
Set the CPU period for the Completely Fair Scheduler (CFS), which is a
@ -126,6 +134,14 @@ If none are specified, the original container's CPU memory nodes are used.
Remove the original container that we are cloning once used to mimic the configuration.
#### **--device-read-bps**=*path:rate*
Limit read rate (bytes per second) from a device (e.g. --device-read-bps=/dev/sda:1mb).
#### **--device-write-bps**=*path:rate*
Limit write rate (bytes per second) to a device (e.g. --device-write-bps=/dev/sda:1mb).
#### **--force**, **-f**
Force removal of the original container that we are cloning. Can only be used in conjunction with **--destroy**.

View File

@ -11,10 +11,55 @@ podman\-pod\-clone - Creates a copy of an existing pod
## OPTIONS
#### **--blkio-weight**=*weight*
Block IO weight (relative weight) accepts a weight value between 10 and 1000.
#### **--blkio-weight-device**=*device:weight*
Block IO weight (relative device weight, format: `DEVICE_NAME:WEIGHT`).
#### **--cgroup-parent**=*path*
Path to cgroups under which the cgroup for the pod will be created. If the path is not absolute, the path is considered to be relative to the cgroups path of the init process. Cgroups will be created if they do not already exist.
#### **--cpu-shares**, **-c**=*shares*
CPU shares (relative weight)
By default, all containers get the same proportion of CPU cycles. This proportion
can be modified by changing the container's CPU share weighting relative
to the weighting of all other running containers.
To modify the proportion from the default of 1024, use the **--cpu-shares**
flag to set the weighting to 2 or higher.
The proportion will only apply when CPU-intensive processes are running.
When tasks in one container are idle, other containers can use the
left-over CPU time. The actual amount of CPU time will vary depending on
the number of containers running on the system.
For example, consider three containers, one has a cpu-share of 1024 and
two others have a cpu-share setting of 512. When processes in all three
containers attempt to use 100% of CPU, the first container would receive
50% of the total CPU time. If you add a fourth container with a cpu-share
of 1024, the first container only gets 33% of the CPU. The remaining containers
receive 16.5%, 16.5% and 33% of the CPU.
On a multi-core system, the shares of CPU time are distributed over all CPU
cores. Even if a container is limited to less than 100% of CPU time, it can
use 100% of each individual CPU core.
For example, consider a system with more than three cores. If you start one
container **{C0}** with **-c=512** running one process, and another container
**{C1}** with **-c=1024** running two processes, this can result in the following
division of CPU shares:
PID container CPU CPU share
100 {C0} 0 100% of CPU0
101 {C1} 1 100% of CPU1
102 {C1} 2 100% of CPU2
#### **--cpus**
Set a number of CPUs for the pod that overrides the original pod's CPU limits. If none are specified, the original pod's Nano CPUs are used.
@ -23,6 +68,15 @@ Set a number of CPUs for the pod that overrides the original pods CPU limits. If
CPUs in which to allow execution (0-3, 0,1). If none are specified, the original pod's CPUset is used.
#### **--cpuset-mems**=*nodes*
Memory nodes (MEMs) in which to allow execution (0-3, 0,1). Only effective on NUMA systems.
If there are four memory nodes on the system (0-3), use `--cpuset-mems=0,1`
then processes in the container will only use memory from the first
two memory nodes.
#### **--destroy**
Remove the original pod that we are cloning once used to mimic the configuration.
@ -48,6 +102,10 @@ device. The devices that Podman will load modules for when necessary are:
Limit read rate (bytes per second) from a device (e.g. --device-read-bps=/dev/sda:1mb).
#### **--device-write-bps**=*path:rate*
Limit write rate (bytes per second) to a device (e.g. --device-write-bps=/dev/sda:1mb).
#### **--gidmap**=*pod_gid:host_gid:amount*
GID map for the user namespace. Using this flag will run all containers in the pod with user namespace enabled. It conflicts with the `--userns` and `--subgidname` flags.
@ -90,6 +148,17 @@ RAM. If a limit of 0 is specified (not using **-m**), the container's memory is
not limited. The actual limit may be rounded up to a multiple of the operating
system's page size (the value would be very large, that's millions of trillions).
#### **--memory-swap**=*limit*
A limit value equal to memory plus swap. Must be used with the **-m**
(**--memory**) flag. The swap `LIMIT` should always be larger than **-m**
(**--memory**) value. By default, the swap `LIMIT` will be set to double
the value of --memory.
The format of `LIMIT` is `<number>[<unit>]`. Unit can be `b` (bytes),
`k` (kibibytes), `m` (mebibytes), or `g` (gibibytes). If you don't specify a
unit, `b` is used. Set LIMIT to `-1` to enable unlimited swap.
#### **--name**, **-n**
Set a custom name for the cloned pod. The default if not specified is of the syntax: **<ORIGINAL_NAME>-clone**

View File

@ -23,6 +23,9 @@ podman generates a UUID for each pod, and if a name is not assigned
to the container with **--name** then a random string name will be generated
for it. The name is useful any place you need to identify a pod.
Note: resource limit related flags work by setting the limits explicitly in the pod's cgroup,
which, by default, is the cgroup parent for all containers joining the pod. Containers are still delegated the ability to set their own resource limits when joining a pod, meaning that if you run **podman pod create --cpus=5** you can also run **podman container create --pod=`<pod_id|pod_name>` --cpus=4** and the container will only see the smaller limit. Containers do NOT get the pod level cgroup resources if they specify their own cgroup when joining a pod, such as with **--cgroupns=host**.
## OPTIONS
#### **--add-host**=*host:ip*
@ -33,10 +36,55 @@ Add a line to /etc/hosts. The format is hostname:ip. The **--add-host**
option can be set multiple times.
The /etc/hosts file is shared between all containers in the pod.
#### **--blkio-weight**=*weight*
Block IO weight (relative weight) accepts a weight value between 10 and 1000.
#### **--blkio-weight-device**=*device:weight*
Block IO weight (relative device weight, format: `DEVICE_NAME:WEIGHT`).
#### **--cgroup-parent**=*path*
Path to cgroups under which the cgroup for the pod will be created. If the path is not absolute, the path is considered to be relative to the cgroups path of the init process. Cgroups will be created if they do not already exist.
#### **--cpu-shares**, **-c**=*shares*
CPU shares (relative weight)
By default, all containers get the same proportion of CPU cycles. This proportion
can be modified by changing the container's CPU share weighting relative
to the weighting of all other running containers.
To modify the proportion from the default of 1024, use the **--cpu-shares**
flag to set the weighting to 2 or higher.
The proportion will only apply when CPU-intensive processes are running.
When tasks in one container are idle, other containers can use the
left-over CPU time. The actual amount of CPU time will vary depending on
the number of containers running on the system.
For example, consider three containers, one has a cpu-share of 1024 and
two others have a cpu-share setting of 512. When processes in all three
containers attempt to use 100% of CPU, the first container would receive
50% of the total CPU time. If you add a fourth container with a cpu-share
of 1024, the first container only gets 33% of the CPU. The remaining containers
receive 16.5%, 16.5% and 33% of the CPU.
On a multi-core system, the shares of CPU time are distributed over all CPU
cores. Even if a container is limited to less than 100% of CPU time, it can
use 100% of each individual CPU core.
For example, consider a system with more than three cores. If you start one
container **{C0}** with **-c=512** running one process, and another container
**{C1}** with **-c=1024** running two processes, this can result in the following
division of CPU shares:
PID container CPU CPU share
100 {C0} 0 100% of CPU0
101 {C1} 1 100% of CPU1
102 {C1} 2 100% of CPU2
#### **--cpus**=*amount*
Set the total number of CPUs delegated to the pod. Default is 0.000 which indicates that there is no limit on computation power.
@ -52,7 +100,15 @@ Examples of the List Format:
0-4,9 # bits 0, 1, 2, 3, 4, and 9 set
0-2,7,12-14 # bits 0, 1, 2, 7, 12, 13, and 14 set
#### **--device**=*host-device[:container-device][:permissions]*
#### **--cpuset-mems**=*nodes*
Memory nodes (MEMs) in which to allow execution (0-3, 0,1). Only effective on NUMA systems.
If there are four memory nodes on the system (0-3), use `--cpuset-mems=0,1`
then processes in the container will only use memory from the first
two memory nodes.
#### **--device**=_host-device_[**:**_container-device_][**:**_permissions_]
Add a host device to the pod. Optional *permissions* parameter
can be used to specify device permissions. It is a combination of
@ -73,6 +129,10 @@ device. The devices that Podman will load modules for when necessary are:
Limit read rate (bytes per second) from a device (e.g. --device-read-bps=/dev/sda:1mb)
#### **--device-write-bps**=*path:rate*
Limit write rate (bytes per second) to a device (e.g. --device-write-bps=/dev/sda:1mb).
#### **--dns**=*ipaddr*
Set custom DNS servers in the /etc/resolv.conf file that will be shared between all containers in the pod. A special option, "none" is allowed which disables creation of /etc/resolv.conf for the pod.
@ -174,6 +234,16 @@ RAM. If a limit of 0 is specified (not using **-m**), the container's memory is
not limited. The actual limit may be rounded up to a multiple of the operating
system's page size (the value would be very large, that's millions of trillions).
#### **--memory-swap**=*limit*
A limit value equal to memory plus swap. Must be used with the **-m**
(**--memory**) flag. The swap `LIMIT` should always be larger than **-m**
(**--memory**) value. By default, the swap `LIMIT` will be set to double
the value of --memory.
The format of `LIMIT` is `<number>[<unit>]`. Unit can be `b` (bytes),
`k` (kibibytes), `m` (mebibytes), or `g` (gibibytes). If you don't specify a
unit, `b` is used. Set LIMIT to `-1` to enable unlimited swap.
#### **--name**, **-n**=*name*
@ -603,7 +673,7 @@ $ podman pod create --network net1:ip=10.89.1.5 --network net2:ip=10.89.10.10
```
## SEE ALSO
**[podman(1)](podman.1.md)**, **[podman-pod(1)](podman-pod.1.md)**, **[podman-kube-play(1)](podman-kube-play.1.md)**, **containers.conf(1)**
**[podman(1)](podman.1.md)**, **[podman-pod(1)](podman-pod.1.md)**, **[podman-kube-play(1)](podman-kube-play.1.md)**, **containers.conf(1)**, **[cgroups(7)](https://man7.org/linux/man-pages/man7/cgroups.7.html)**
## HISTORY

View File

@ -57,20 +57,32 @@ type InspectPodData struct {
CPUPeriod uint64 `json:"cpu_period,omitempty"`
// CPUQuota contains the CPU quota of the pod
CPUQuota int64 `json:"cpu_quota,omitempty"`
// CPUShares contains the cpu shares for the pod
CPUShares uint64 `json:"cpu_shares,omitempty"`
// CPUSetCPUs contains linux specific CPU data for the pod
CPUSetCPUs string `json:"cpuset_cpus,omitempty"`
// CPUSetMems contains linux specific CPU data for the pod
CPUSetMems string `json:"cpuset_mems,omitempty"`
// Mounts contains volume related information for the pod
Mounts []InspectMount `json:"mounts,omitempty"`
// Devices contains the specified host devices
Devices []InspectDevice `json:"devices,omitempty"`
// BlkioDeviceReadBps contains the Read/Access limit for the pod's devices
BlkioDeviceReadBps []InspectBlkioThrottleDevice `json:"device_read_bps,omitempty"`
// BlkioDeviceWriteBps contains the Write limit for the pod's devices
BlkioDeviceWriteBps []InspectBlkioThrottleDevice `json:"device_write_bps,omitempty"`
// VolumesFrom contains the containers that the pod inherits mounts from
VolumesFrom []string `json:"volumes_from,omitempty"`
// SecurityOpt contains the specified security labels and related SELinux information
SecurityOpts []string `json:"security_opt,omitempty"`
// MemoryLimit contains the specified cgroup memory limit for the pod
MemoryLimit uint64 `json:"memory_limit,omitempty"`
// MemorySwap contains the specified memory swap limit for the pod
MemorySwap uint64 `json:"memory_swap,omitempty"`
// BlkioWeight contains the blkio weight limit for the pod
BlkioWeight uint64 `json:"blkio_weight,omitempty"`
// BlkioWeightDevice contains the blkio weight device limits for the pod
BlkioWeightDevice []InspectBlkioWeightDevice `json:"blkio_weight_device,omitempty"`
}
// InspectPodInfraConfig contains the configuration of the pod's infra

View File

@ -2145,6 +2145,18 @@ func WithServiceContainer(id string) PodCreateOption {
}
}
// WithPodResources returns a pod creation option that stores the given
// resource limits in the pod's configuration so they can be applied to the
// pod's cgroup. Containers in the pod inherit them unless they override them.
//
// The option fails with define.ErrPodFinalized when the pod is already valid.
func WithPodResources(resources specs.LinuxResources) PodCreateOption {
	return func(pod *Pod) error {
		if !pod.valid {
			pod.config.ResourceLimits = resources
			return nil
		}
		return define.ErrPodFinalized
	}
}
// WithVolatile sets the volatile flag for the container storage.
// The option can potentially cause data loss when used on a container that must survive a machine reboot.
func WithVolatile() CtrCreateOption {

View File

@ -83,6 +83,9 @@ type PodConfig struct {
// ID of the pod's lock
LockID uint32 `json:"lockID"`
// ResourceLimits hold the pod level resource limits
ResourceLimits specs.LinuxResources
}
// podState represents a pod's state
@ -116,18 +119,7 @@ func (p *Pod) ResourceLim() *specs.LinuxResources {
empty := &specs.LinuxResources{
CPU: &specs.LinuxCPU{},
}
infra, err := p.runtime.GetContainer(p.state.InfraContainerID)
if err != nil {
return empty
}
conf := infra.config.Spec
if err != nil {
return empty
}
if conf.Linux == nil || conf.Linux.Resources == nil {
return empty
}
if err = JSONDeepCopy(conf.Linux.Resources, resCopy); err != nil {
if err := JSONDeepCopy(p.config.ResourceLimits, resCopy); err != nil {
return nil
}
if resCopy.CPU != nil {
@ -139,51 +131,91 @@ func (p *Pod) ResourceLim() *specs.LinuxResources {
// CPUPeriod returns the pod CPU period
func (p *Pod) CPUPeriod() uint64 {
if p.state.InfraContainerID == "" {
resLim := p.ResourceLim()
if resLim.CPU == nil || resLim.CPU.Period == nil {
return 0
}
infra, err := p.runtime.GetContainer(p.state.InfraContainerID)
if err != nil {
return 0
}
conf := infra.config.Spec
if conf != nil && conf.Linux != nil && conf.Linux.Resources != nil && conf.Linux.Resources.CPU != nil && conf.Linux.Resources.CPU.Period != nil {
return *conf.Linux.Resources.CPU.Period
}
return 0
return *resLim.CPU.Period
}
// CPUQuota returns the pod CPU quota
func (p *Pod) CPUQuota() int64 {
if p.state.InfraContainerID == "" {
resLim := p.ResourceLim()
if resLim.CPU == nil || resLim.CPU.Quota == nil {
return 0
}
infra, err := p.runtime.GetContainer(p.state.InfraContainerID)
if err != nil {
return 0
}
conf := infra.config.Spec
if conf != nil && conf.Linux != nil && conf.Linux.Resources != nil && conf.Linux.Resources.CPU != nil && conf.Linux.Resources.CPU.Quota != nil {
return *conf.Linux.Resources.CPU.Quota
}
return 0
return *resLim.CPU.Quota
}
// MemoryLimit returns the pod Memory Limit
func (p *Pod) MemoryLimit() uint64 {
if p.state.InfraContainerID == "" {
resLim := p.ResourceLim()
if resLim.Memory == nil || resLim.Memory.Limit == nil {
return 0
}
infra, err := p.runtime.GetContainer(p.state.InfraContainerID)
return uint64(*resLim.Memory.Limit)
}
// MemorySwap returns the pod's memory swap limit as recorded in its resource
// limits. It returns 0 when no swap limit is set or when the resource limits
// could not be read.
func (p *Pod) MemorySwap() uint64 {
	resLim := p.ResourceLim()
	// ResourceLim returns nil when copying the limits fails; guard against
	// that so callers such as Inspect cannot panic here.
	if resLim == nil || resLim.Memory == nil || resLim.Memory.Swap == nil {
		return 0
	}
	return uint64(*resLim.Memory.Swap)
}
// BlkioWeight returns the pod's block IO weight as recorded in its resource
// limits. It returns 0 when no weight is set or when the resource limits
// could not be read.
func (p *Pod) BlkioWeight() uint64 {
	resLim := p.ResourceLim()
	// ResourceLim returns nil when copying the limits fails; guard against
	// that so callers such as Inspect cannot panic here.
	if resLim == nil || resLim.BlockIO == nil || resLim.BlockIO.Weight == nil {
		return 0
	}
	return uint64(*resLim.BlockIO.Weight)
}
// CPUSetMems returns the pod's cpuset memory nodes as recorded in its
// resource limits. It returns the empty string when no CPU limits are set or
// when the resource limits could not be read.
func (p *Pod) CPUSetMems() string {
	resLim := p.ResourceLim()
	// ResourceLim returns nil when copying the limits fails; guard against
	// that so callers such as Inspect cannot panic here.
	if resLim == nil || resLim.CPU == nil {
		return ""
	}
	return resLim.CPU.Mems
}
// CPUShares returns the pod's CPU shares as recorded in its resource limits.
// It returns 0 when no shares are set or when the resource limits could not
// be read.
func (p *Pod) CPUShares() uint64 {
	resLim := p.ResourceLim()
	// ResourceLim returns nil when copying the limits fails; guard against
	// that so callers such as Inspect cannot panic here.
	if resLim == nil || resLim.CPU == nil || resLim.CPU.Shares == nil {
		return 0
	}
	return *resLim.CPU.Shares
}
// BlkiThrottleReadBps returns the pod throttle devices
func (p *Pod) BlkiThrottleReadBps() []define.InspectBlkioThrottleDevice {
resLim := p.ResourceLim()
if resLim.BlockIO == nil || resLim.BlockIO.ThrottleReadBpsDevice == nil {
return []define.InspectBlkioThrottleDevice{}
}
devs, err := blkioDeviceThrottle(nil, resLim.BlockIO.ThrottleReadBpsDevice)
if err != nil {
return 0
return []define.InspectBlkioThrottleDevice{}
}
conf := infra.config.Spec
if conf != nil && conf.Linux != nil && conf.Linux.Resources != nil && conf.Linux.Resources.Memory != nil && conf.Linux.Resources.Memory.Limit != nil {
val := *conf.Linux.Resources.Memory.Limit
return uint64(val)
return devs
}
// BlkiThrottleWriteBps returns the pod throttle devices
func (p *Pod) BlkiThrottleWriteBps() []define.InspectBlkioThrottleDevice {
resLim := p.ResourceLim()
if resLim.BlockIO == nil || resLim.BlockIO.ThrottleWriteBpsDevice == nil {
return []define.InspectBlkioThrottleDevice{}
}
return 0
devs, err := blkioDeviceThrottle(nil, resLim.BlockIO.ThrottleWriteBpsDevice)
if err != nil {
return []define.InspectBlkioThrottleDevice{}
}
return devs
}
// NetworkMode returns the Network mode given by the user ex: pod, private...

View File

@ -659,7 +659,6 @@ func (p *Pod) Inspect() (*define.InspectPodData, error) {
var infraConfig *define.InspectPodInfraConfig
var inspectMounts []define.InspectMount
var devices []define.InspectDevice
var deviceLimits []define.InspectBlkioThrottleDevice
var infraSecurity []string
if p.state.InfraContainerID != "" {
infra, err := p.runtime.GetContainer(p.state.InfraContainerID)
@ -683,18 +682,6 @@ func (p *Pod) Inspect() (*define.InspectPodData, error) {
if err != nil {
return nil, err
}
var nodes map[string]string
devices, err = infra.GetDevices(false, *infra.config.Spec, nodes)
if err != nil {
return nil, err
}
spec := infra.config.Spec
if spec.Linux != nil && spec.Linux.Resources != nil && spec.Linux.Resources.BlockIO != nil {
deviceLimits, err = blkioDeviceThrottle(nodes, spec.Linux.Resources.BlockIO.ThrottleReadBpsDevice)
if err != nil {
return nil, err
}
}
if len(infra.config.ContainerNetworkConfig.DNSServer) > 0 {
infraConfig.DNSServer = make([]string, 0, len(infra.config.ContainerNetworkConfig.DNSServer))
@ -731,33 +718,38 @@ func (p *Pod) Inspect() (*define.InspectPodData, error) {
}
inspectData := define.InspectPodData{
ID: p.ID(),
Name: p.Name(),
Namespace: p.Namespace(),
Created: p.CreatedTime(),
CreateCommand: p.config.CreateCommand,
ExitPolicy: string(p.config.ExitPolicy),
State: podState,
Hostname: p.config.Hostname,
Labels: p.Labels(),
CreateCgroup: p.config.UsePodCgroup,
CgroupParent: p.CgroupParent(),
CgroupPath: p.state.CgroupPath,
CreateInfra: infraConfig != nil,
InfraContainerID: p.state.InfraContainerID,
InfraConfig: infraConfig,
SharedNamespaces: sharesNS,
NumContainers: uint(len(containers)),
Containers: ctrs,
CPUSetCPUs: p.ResourceLim().CPU.Cpus,
CPUPeriod: p.CPUPeriod(),
CPUQuota: p.CPUQuota(),
MemoryLimit: p.MemoryLimit(),
Mounts: inspectMounts,
Devices: devices,
BlkioDeviceReadBps: deviceLimits,
VolumesFrom: p.VolumesFrom(),
SecurityOpts: infraSecurity,
ID: p.ID(),
Name: p.Name(),
Namespace: p.Namespace(),
Created: p.CreatedTime(),
CreateCommand: p.config.CreateCommand,
ExitPolicy: string(p.config.ExitPolicy),
State: podState,
Hostname: p.config.Hostname,
Labels: p.Labels(),
CreateCgroup: p.config.UsePodCgroup,
CgroupParent: p.CgroupParent(),
CgroupPath: p.state.CgroupPath,
CreateInfra: infraConfig != nil,
InfraContainerID: p.state.InfraContainerID,
InfraConfig: infraConfig,
SharedNamespaces: sharesNS,
NumContainers: uint(len(containers)),
Containers: ctrs,
CPUSetCPUs: p.ResourceLim().CPU.Cpus,
CPUPeriod: p.CPUPeriod(),
CPUQuota: p.CPUQuota(),
MemoryLimit: p.MemoryLimit(),
Mounts: inspectMounts,
Devices: devices,
BlkioDeviceReadBps: p.BlkiThrottleReadBps(),
VolumesFrom: p.VolumesFrom(),
SecurityOpts: infraSecurity,
MemorySwap: p.MemorySwap(),
BlkioWeight: p.BlkioWeight(),
CPUSetMems: p.CPUSetMems(),
BlkioDeviceWriteBps: p.BlkiThrottleWriteBps(),
CPUShares: p.CPUShares(),
}
return &inspectData, nil

View File

@ -80,7 +80,7 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option
p.InfraContainerSpec.CgroupParent = pod.state.CgroupPath
// cgroupfs + rootless = permission denied when creating the cgroup.
if !rootless.IsRootless() {
res, err := GetLimits(p.InfraContainerSpec.ResourceLimits)
res, err := GetLimits(p.ResourceLimits)
if err != nil {
return nil, err
}
@ -113,7 +113,7 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option
// If we are set to use pod cgroups, set the cgroup parent that
// all containers in the pod will share
if pod.config.UsePodCgroup {
cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID()), p.InfraContainerSpec.ResourceLimits)
cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID()), p.ResourceLimits)
if err != nil {
return nil, fmt.Errorf("unable to create pod cgroup for pod %s: %w", pod.ID(), err)
}

View File

@ -302,60 +302,6 @@ func CompleteSpec(ctx context.Context, r *libpod.Runtime, s *specgen.SpecGenerat
return warnings, nil
}
// FinishThrottleDevices takes the temporary representation of the throttle
// devices in the specgen and looks up the major and minor device numbers. It
// then sets the throttle devices proper in the specgen.
//
// Each of the four throttle maps on s is keyed by the path that is passed to
// unix.Stat; the first stat failure is returned unchanged.
//
// NOTE(review): only the read-bps loop allocates s.ResourceLimits.BlockIO, and
// nothing here checks s.ResourceLimits itself — presumably callers guarantee
// both are set; confirm before relying on the write/IOPS loops alone.
func FinishThrottleDevices(s *specgen.SpecGenerator) error {
// Read bandwidth (bytes per second) throttles.
if bps := s.ThrottleReadBpsDevice; len(bps) > 0 {
for k, v := range bps {
statT := unix.Stat_t{}
if err := unix.Stat(k, &statT); err != nil {
return err
}
v.Major = (int64(unix.Major(uint64(statT.Rdev)))) //nolint: unconvert
v.Minor = (int64(unix.Minor(uint64(statT.Rdev)))) //nolint: unconvert
// Allocate the block IO section on first use.
if s.ResourceLimits.BlockIO == nil {
s.ResourceLimits.BlockIO = new(spec.LinuxBlockIO)
}
s.ResourceLimits.BlockIO.ThrottleReadBpsDevice = append(s.ResourceLimits.BlockIO.ThrottleReadBpsDevice, v)
}
}
// Write bandwidth (bytes per second) throttles.
if bps := s.ThrottleWriteBpsDevice; len(bps) > 0 {
for k, v := range bps {
statT := unix.Stat_t{}
if err := unix.Stat(k, &statT); err != nil {
return err
}
v.Major = (int64(unix.Major(uint64(statT.Rdev)))) //nolint: unconvert
v.Minor = (int64(unix.Minor(uint64(statT.Rdev)))) //nolint: unconvert
s.ResourceLimits.BlockIO.ThrottleWriteBpsDevice = append(s.ResourceLimits.BlockIO.ThrottleWriteBpsDevice, v)
}
}
// Read IOPS throttles.
if iops := s.ThrottleReadIOPSDevice; len(iops) > 0 {
for k, v := range iops {
statT := unix.Stat_t{}
if err := unix.Stat(k, &statT); err != nil {
return err
}
v.Major = (int64(unix.Major(uint64(statT.Rdev)))) //nolint: unconvert
v.Minor = (int64(unix.Minor(uint64(statT.Rdev)))) //nolint: unconvert
s.ResourceLimits.BlockIO.ThrottleReadIOPSDevice = append(s.ResourceLimits.BlockIO.ThrottleReadIOPSDevice, v)
}
}
// Write IOPS throttles.
if iops := s.ThrottleWriteIOPSDevice; len(iops) > 0 {
for k, v := range iops {
statT := unix.Stat_t{}
if err := unix.Stat(k, &statT); err != nil {
return err
}
v.Major = (int64(unix.Major(uint64(statT.Rdev)))) //nolint: unconvert
v.Minor = (int64(unix.Minor(uint64(statT.Rdev)))) //nolint: unconvert
s.ResourceLimits.BlockIO.ThrottleWriteIOPSDevice = append(s.ResourceLimits.BlockIO.ThrottleWriteIOPSDevice, v)
}
}
return nil
}
// ConfigToSpec takes a completed container config and converts it back into a specgenerator for purposes of cloning an existing container
func ConfigToSpec(rt *libpod.Runtime, specg *specgen.SpecGenerator, contaierID string) (*libpod.Container, *libpod.InfraInherit, error) {
c, err := rt.LookupContainer(contaierID)
@ -540,3 +486,63 @@ func mapSecurityConfig(c *libpod.ContainerConfig, s *specgen.SpecGenerator) {
s.Groups = c.Groups
s.HostUsers = c.HostUsers
}
// FinishThrottleDevices takes the temporary representation of the throttle
// devices in the specgen and looks up their major and minor device numbers.
// It then sets the throttle devices proper in the specgen's block IO resource
// limits, allocating s.ResourceLimits and s.ResourceLimits.BlockIO when they
// are unset.
//
// The maps are processed in the order read-bps, write-bps, read-IOPS,
// write-IOPS. The first device path that cannot be stat'ed aborts processing
// with a wrapped error; devices appended before the failure stay in place.
func FinishThrottleDevices(s *specgen.SpecGenerator) error {
	if s.ResourceLimits == nil {
		s.ResourceLimits = &spec.LinuxResources{}
	}
	if s.ResourceLimits.BlockIO == nil {
		s.ResourceLimits.BlockIO = &spec.LinuxBlockIO{}
	}
	blkio := s.ResourceLimits.BlockIO

	if err := appendThrottleDevices(&blkio.ThrottleReadBpsDevice, s.ThrottleReadBpsDevice); err != nil {
		return err
	}
	if err := appendThrottleDevices(&blkio.ThrottleWriteBpsDevice, s.ThrottleWriteBpsDevice); err != nil {
		return err
	}
	if err := appendThrottleDevices(&blkio.ThrottleReadIOPSDevice, s.ThrottleReadIOPSDevice); err != nil {
		return err
	}
	return appendThrottleDevices(&blkio.ThrottleWriteIOPSDevice, s.ThrottleWriteIOPSDevice)
}

// appendThrottleDevices stats every device path in devs, fills in the major
// and minor numbers of the associated throttle entry and appends it to *dst.
// Entries are appended one by one so a failure leaves earlier ones in *dst.
func appendThrottleDevices(dst *[]spec.LinuxThrottleDevice, devs map[string]spec.LinuxThrottleDevice) error {
	for path, dev := range devs {
		var statT unix.Stat_t
		if err := unix.Stat(path, &statT); err != nil {
			return fmt.Errorf("could not parse throttle device at %s: %w", path, err)
		}
		dev.Major = (int64(unix.Major(uint64(statT.Rdev)))) //nolint: unconvert
		dev.Minor = (int64(unix.Minor(uint64(statT.Rdev)))) //nolint: unconvert
		*dst = append(*dst, dev)
	}
	return nil
}

View File

@ -55,9 +55,6 @@ func MakeContainer(ctx context.Context, rt *libpod.Runtime, s *specgen.SpecGener
}
}
if err := FinishThrottleDevices(s); err != nil {
return nil, nil, nil, err
}
// Set defaults for unset namespaces
if s.PidNS.IsDefault() {
defaultNS, err := GetDefaultNamespaceMode("pid", rtc, pod)

View File

@ -309,6 +309,17 @@ func SpecGenToOCI(ctx context.Context, s *specgen.SpecGenerator, rt *libpod.Runt
}
g.Config.Linux.Resources = s.ResourceLimits
}
weightDevices, err := WeightDevices(s.WeightDevice)
if err != nil {
return nil, err
}
if len(weightDevices) > 0 {
for _, dev := range weightDevices {
g.AddLinuxResourcesBlockIOWeightDevice(dev.Major, dev.Minor, *dev.Weight)
}
}
// Devices
// set the default rule at the beginning of device configuration
if !inUserNS && !s.Privileged {
@ -345,14 +356,6 @@ func SpecGenToOCI(ctx context.Context, s *specgen.SpecGenerator, rt *libpod.Runt
}
}
for k, v := range s.WeightDevice {
statT := unix.Stat_t{}
if err := unix.Stat(k, &statT); err != nil {
return nil, fmt.Errorf("failed to inspect '%s' in --blkio-weight-device: %w", k, err)
}
g.AddLinuxResourcesBlockIOWeightDevice((int64(unix.Major(uint64(statT.Rdev)))), (int64(unix.Minor(uint64(statT.Rdev)))), *v.Weight) //nolint: unconvert
}
BlockAccessToKernelFilesystems(s.Privileged, s.PidNS.IsHost(), s.Mask, s.Unmask, &g)
g.ClearProcessEnv()
@ -413,3 +416,19 @@ func SpecGenToOCI(ctx context.Context, s *specgen.SpecGenerator, rt *libpod.Runt
return configSpec, nil
}
// WeightDevices converts the per-device block IO weights in wtDevices, keyed
// by device path, into a list of weight-device entries.
//
// Every key is stat'ed and the major/minor numbers derived from the resulting
// Rdev are stored on the entry together with the Weight taken from the map
// value. Only Weight is carried over from the map value; its other fields are
// not copied.
//
// On success the returned slice is never nil (it is empty for empty or nil
// input). Entries follow map iteration order, so callers must not rely on
// their ordering. The first path that cannot be stat'ed aborts the conversion
// and is reported in the returned error.
func WeightDevices(wtDevices map[string]spec.LinuxWeightDevice) ([]spec.LinuxWeightDevice, error) {
	// The final length is known, so allocate the backing array once.
	devs := make([]spec.LinuxWeightDevice, 0, len(wtDevices))
	for path, wt := range wtDevices {
		var statT unix.Stat_t
		if err := unix.Stat(path, &statT); err != nil {
			return nil, fmt.Errorf("failed to inspect '%s' in --blkio-weight-device: %w", path, err)
		}
		// Build the entry by value rather than via new() followed by a
		// dereference-and-copy into the slice.
		var dev spec.LinuxWeightDevice
		dev.Major = (int64(unix.Major(uint64(statT.Rdev)))) //nolint: unconvert
		dev.Minor = (int64(unix.Minor(uint64(statT.Rdev)))) //nolint: unconvert
		dev.Weight = wt.Weight
		devs = append(devs, dev)
	}
	return devs, nil
}

View File

@ -13,6 +13,7 @@ import (
"github.com/containers/podman/v4/pkg/domain/entities"
"github.com/containers/podman/v4/pkg/specgen"
"github.com/containers/podman/v4/pkg/specgenutil"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
)
@ -21,6 +22,10 @@ func MakePod(p *entities.PodSpec, rt *libpod.Runtime) (*libpod.Pod, error) {
return nil, err
}
if p.PodSpecGen.ResourceLimits == nil {
p.PodSpecGen.ResourceLimits = &specs.LinuxResources{}
}
if !p.PodSpecGen.NoInfra {
imageName, err := PullOrBuildInfraImage(rt, p.PodSpecGen.InfraImage)
if err != nil {
@ -38,10 +43,33 @@ func MakePod(p *entities.PodSpec, rt *libpod.Runtime) (*libpod.Pod, error) {
}
}
if !p.PodSpecGen.NoInfra {
err := FinishThrottleDevices(p.PodSpecGen.InfraContainerSpec)
if err != nil {
return nil, err
}
if p.PodSpecGen.InfraContainerSpec.ResourceLimits.BlockIO != nil {
p.PodSpecGen.ResourceLimits.BlockIO = p.PodSpecGen.InfraContainerSpec.ResourceLimits.BlockIO
}
weightDevices, err := WeightDevices(p.PodSpecGen.InfraContainerSpec.WeightDevice)
if err != nil {
return nil, err
}
if p.PodSpecGen.ResourceLimits != nil && len(weightDevices) > 0 {
if p.PodSpecGen.ResourceLimits.BlockIO == nil {
p.PodSpecGen.ResourceLimits.BlockIO = &specs.LinuxBlockIO{}
}
p.PodSpecGen.ResourceLimits.BlockIO.WeightDevice = weightDevices
}
}
options, err := createPodOptions(&p.PodSpecGen)
if err != nil {
return nil, err
}
pod, err := rt.NewPod(context.Background(), p.PodSpecGen, options...)
if err != nil {
return nil, err
@ -55,6 +83,11 @@ func MakePod(p *entities.PodSpec, rt *libpod.Runtime) (*libpod.Pod, error) {
return nil, err
}
p.PodSpecGen.InfraContainerSpec.User = "" // infraSpec user will get incorrectly assigned via the container creation process, overwrite here
// The infra container's resource limits and weight devices were only used to
// parse the pod-level limits. Clear them here so that the infra container does
// not get these resources applied to its own cgroup.
p.PodSpecGen.InfraContainerSpec.ResourceLimits = nil
p.PodSpecGen.InfraContainerSpec.WeightDevice = nil
rtSpec, spec, opts, err := MakeContainer(context.Background(), rt, p.PodSpecGen.InfraContainerSpec, false, nil)
if err != nil {
return nil, err
@ -122,6 +155,10 @@ func createPodOptions(p *specgen.PodSpecGenerator) ([]libpod.PodCreateOption, er
options = append(options, libpod.WithPodHostname(p.Hostname))
}
if p.ResourceLimits != nil {
options = append(options, libpod.WithPodResources(*p.ResourceLimits))
}
options = append(options, libpod.WithPodExitPolicy(p.ExitPolicy))
return options, nil

View File

@ -74,6 +74,12 @@ func getCPULimits(c *entities.ContainerCreateOptions) *specs.LinuxCPU {
func getIOLimits(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions) (*specs.LinuxBlockIO, error) {
var err error
io := &specs.LinuxBlockIO{}
if s.ResourceLimits == nil {
s.ResourceLimits = &specs.LinuxResources{}
}
if s.ResourceLimits.BlockIO == nil {
s.ResourceLimits.BlockIO = &specs.LinuxBlockIO{}
}
hasLimits := false
if b := c.BlkIOWeight; len(b) > 0 {
u, err := strconv.ParseUint(b, 10, 16)
@ -82,6 +88,7 @@ func getIOLimits(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions) (
}
nu := uint16(u)
io.Weight = &nu
s.ResourceLimits.BlockIO.Weight = &nu
hasLimits = true
}
@ -96,6 +103,7 @@ func getIOLimits(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions) (
if s.ThrottleReadBpsDevice, err = parseThrottleBPSDevices(bps); err != nil {
return nil, err
}
hasLimits = true
}
@ -123,6 +131,8 @@ func getIOLimits(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions) (
if !hasLimits {
return nil, nil
}
io = s.ResourceLimits.BlockIO
return io, nil
}
@ -509,7 +519,7 @@ func FillOutSpecGen(s *specgen.SpecGenerator, c *entities.ContainerCreateOptions
return err
}
}
if s.ResourceLimits.BlockIO == nil || (len(c.BlkIOWeight) != 0 || len(c.BlkIOWeightDevice) != 0) {
if s.ResourceLimits.BlockIO == nil || (len(c.BlkIOWeight) != 0 || len(c.BlkIOWeightDevice) != 0 || len(c.DeviceReadBPs) != 0 || len(c.DeviceWriteBPs) != 0) {
s.ResourceLimits.BlockIO, err = getIOLimits(s, c)
if err != nil {
return err

View File

@ -2,12 +2,17 @@
load helpers
LOOPDEVICE=
# Per-test cleanup. This is a long, ugly way to clean up pods and remove the
# pause image: it removes every pod and container, removes the pause image,
# runs the shared basic_teardown, and finally detaches the loop device if a
# test left one recorded in LOOPDEVICE.
function teardown() {
# Force-remove (-f) all (-a) pods, then all remaining containers, with -t 0
run_podman pod rm -f -t 0 -a
run_podman rm -f -t 0 -a
# NOTE(review): --ignore presumably tolerates the pause image being absent -- confirm
run_podman rmi --ignore $(pause_image)
basic_teardown
# LOOPDEVICE is still non-empty only when a test attached a loop device and
# did not get as far as detaching and clearing it itself
if [[ -n "$LOOPDEVICE" ]]; then
losetup -d $LOOPDEVICE
fi
}
@ -474,31 +479,57 @@ spec:
@test "pod resource limits" {
skip_if_remote "resource limits only implemented on non-remote"
if is_rootless; then
if is_rootless || ! is_cgroupsv2; then
skip "only meaningful for rootful"
fi
local name1="resources1"
run_podman --cgroup-manager=systemd pod create --name=$name1 --cpus=5 --memory=10m
run_podman --cgroup-manager=systemd pod start $name1
run_podman pod inspect --format '{{.CgroupPath}}' $name1
local path1="$output"
local actual1=$(< /sys/fs/cgroup/$path1/cpu.max)
is "$actual1" "500000 100000" "resource limits set properly"
local actual2=$(< /sys/fs/cgroup/$path1/memory.max)
is "$actual2" "10485760" "resource limits set properly"
run_podman pod --cgroup-manager=systemd rm -f $name1
# create loopback device
lofile=${PODMAN_TMPDIR}/disk.img
fallocate -l 1k ${lofile}
LOOPDEVICE=$(losetup --show -f $lofile)
local name2="resources2"
run_podman --cgroup-manager=cgroupfs pod create --cpus=5 --memory=10m --name=$name2
run_podman --cgroup-manager=cgroupfs pod start $name2
run_podman pod inspect --format '{{.CgroupPath}}' $name2
local path2="$output"
local actual2=$(< /sys/fs/cgroup/$path2/cpu.max)
is "$actual2" "500000 100000" "resource limits set properly"
local actual2=$(< /sys/fs/cgroup/$path2/memory.max)
is "$actual2" "10485760" "resource limits set properly"
run_podman --cgroup-manager=cgroupfs pod rm $name2
# tr needed because losetup seems to use %2d
lomajmin=$(losetup -l --noheadings --output MAJ:MIN $LOOPDEVICE | tr -d ' ')
run grep -w bfq /sys/block/$(basename ${LOOPDEVICE})/queue/scheduler
if [ $status -ne 0 ]; then
skip "BFQ scheduler is not supported on the system"
if [ -f ${lofile} ]; then
run_podman '?' rm -t 0 --all --force --ignore
while read path dev; do
if [[ "$path" == "$lofile" ]]; then
losetup -d $dev
fi
done < <(losetup -l --noheadings --output BACK-FILE,NAME)
rm ${lofile}
fi
fi
echo bfq > /sys/block/$(basename ${LOOPDEVICE})/queue/scheduler
expected_limits="
cpu.max | 500000 100000
memory.max | 5242880
memory.swap.max | 1068498944
io.max | $lomajmin rbps=1048576 wbps=1048576 riops=max wiops=max
"
for cgm in systemd cgroupfs; do
local name=resources-$cgm
run_podman --cgroup-manager=$cgm pod create --name=$name --cpus=5 --memory=5m --memory-swap=1g --cpu-shares=1000 --cpuset-cpus=0 --cpuset-mems=0 --device-read-bps=${LOOPDEVICE}:1mb --device-write-bps=${LOOPDEVICE}:1mb --blkio-weight-device=${LOOPDEVICE}:123 --blkio-weight=50
run_podman --cgroup-manager=$cgm pod start $name
run_podman pod inspect --format '{{.CgroupPath}}' $name
local cgroup_path="$output"
while read unit expect; do
local actual=$(< /sys/fs/cgroup/$cgroup_path/$unit)
is "$actual" "$expect" "resource limit under $cgm: $unit"
done < <(parse_table "$expected_limits")
run_podman --cgroup-manager=$cgm pod rm -f $name
done
# Clean up, and prevent duplicate cleanup in teardown
losetup -d $LOOPDEVICE
LOOPDEVICE=
}
@test "podman pod ps doesn't race with pod rm" {