runtime-spec: update pids.limit handling to match new guidance

The main update is actually in github.com/opencontainers/cgroups, but we
need to also update runtime-spec to a newer pre-release version to get
the updates from there as well.

In short, the behaviour change is that "0" is now treated as a valid
value to set in "pids.max", "-1" means "max", and unset/nil means "do
nothing". As described in the opencontainers/cgroups PR, this change is
actually backwards compatible because our internal state.json stores
PidsLimit, and that entry is marked as "omitempty". So, an old runc
would omit PidsLimit=0 in state.json, and this will be parsed by a new
runc as being "nil" -- and both would treat this case as "do not set
anything".

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
This commit is contained in:
Aleksa Sarai
2025-10-22 23:57:21 +11:00
parent eec1f7e34b
commit 3b75374cc7
18 changed files with 247 additions and 52 deletions

View File

@@ -10,6 +10,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- The deprecated `libcontainer/userns` package has been removed; use
`github.com/moby/sys/userns` instead.
### Breaking ###
- The handling of `pids.limit` has been updated to match the newer guidance
from the OCI runtime specification. In particular, a maximum limit value of
`0` is now treated as an actual limit (due to limitations with systemd, it
is treated the same as a limit value of `1`). We expect that only users who
explicitly set `pids.limit` to `0` will see a behaviour change.
(opencontainers/cgroups#48, #4949)
### Fixed ###
- cgroups: provide iocost statistics for cgroupv2. (opencontainers/cgroups#43)
- cgroups: retry DBus connection when it fails with EAGAIN.
(opencontainers/cgroups#45)
- cgroups: improve `cpuacct.usage_all` resilience when parsing data from
patched kernels (such as the Tencent kernels). (opencontainers/cgroups#46,
opencontainers/cgroups#50)
## [1.4.0-rc.1] - 2025-09-05
> おめェもボスになったんだろぉ?

4
go.mod
View File

@@ -14,8 +14,8 @@ require (
github.com/moby/sys/user v0.4.0
github.com/moby/sys/userns v0.1.0
github.com/mrunalp/fileutils v0.5.1
github.com/opencontainers/cgroups v0.0.5
github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0
github.com/opencontainers/cgroups v0.0.6
github.com/opencontainers/runtime-spec v1.3.0
github.com/opencontainers/selinux v1.13.0
github.com/seccomp/libseccomp-golang v0.11.1
github.com/sirupsen/logrus v1.9.3

8
go.sum
View File

@@ -46,10 +46,10 @@ github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g
github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28=
github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q=
github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
github.com/opencontainers/cgroups v0.0.5 h1:DRITAqcOnY0uSBzIpt1RYWLjh5DPDiqUs4fY6Y0ktls=
github.com/opencontainers/cgroups v0.0.5/go.mod h1:oWVzJsKK0gG9SCRBfTpnn16WcGEqDI8PAcpMGbqWxcs=
github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0 h1:RLn0YfUWkiqPGtgUANvJrcjIkCHGRl3jcz/c557M28M=
github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/cgroups v0.0.6 h1:tfZFWTIIGaUUFImTyuTg+Mr5x8XRiSdZESgEBW7UxuI=
github.com/opencontainers/cgroups v0.0.6/go.mod h1:oWVzJsKK0gG9SCRBfTpnn16WcGEqDI8PAcpMGbqWxcs=
github.com/opencontainers/runtime-spec v1.3.0 h1:YZupQUdctfhpZy3TM39nN9Ika5CBWT5diQ8ibYCRkxg=
github.com/opencontainers/runtime-spec v1.3.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/selinux v1.13.0 h1:Zza88GWezyT7RLql12URvoxsbLfjFx988+LGaWfbL84=
github.com/opencontainers/selinux v1.13.0/go.mod h1:XxWTed+A/s5NNq4GmYScVy+9jzXhGBVEOAyucdRUY8s=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=

View File

@@ -526,20 +526,22 @@ func TestPidsSystemd(t *testing.T) {
testPids(t, true)
}
// mkPtr returns a pointer to a freshly allocated copy of v. It is handy for
// filling optional pointer-typed fields from literal values.
func mkPtr[T any](v T) *T {
	p := new(T)
	*p = v
	return p
}
func testPids(t *testing.T, systemd bool) {
if testing.Short() {
return
}
config := newTemplateConfig(t, &tParam{systemd: systemd})
config.Cgroups.Resources.PidsLimit = -1
config.Cgroups.Resources.PidsLimit = mkPtr[int64](-1)
// Running multiple processes, expecting it to succeed with no pids limit.
runContainerOk(t, config, "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true")
// Enforce a permissive limit. This needs to be fairly hand-wavey due to the
// issues with running Go binaries with pids restrictions (see below).
config.Cgroups.Resources.PidsLimit = 64
config.Cgroups.Resources.PidsLimit = mkPtr[int64](64)
runContainerOk(t, config, "/bin/sh", "-c", `
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
@@ -548,7 +550,7 @@ func testPids(t *testing.T, systemd bool) {
// Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause
// this to fail reliably.
config.Cgroups.Resources.PidsLimit = 64
config.Cgroups.Resources.PidsLimit = mkPtr[int64](64)
out, _, err := runContainer(t, config, "/bin/sh", "-c", `
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |

View File

@@ -85,7 +85,8 @@ stdin. If this option is used, all other options are ignored.
(i.e. use unlimited swap).
**--pids-limit** _num_
: Set the maximum number of processes allowed in the container.
: Set the maximum number of processes allowed in the container. Use **-1** to
unset the limit.
**--l3-cache-schema** _value_
: Set the value for Intel RDT/CAT L3 cache schema.

View File

@@ -252,7 +252,9 @@ other options are ignored.
}
}
r.Pids.Limit = int64(context.Int("pids-limit"))
if context.IsSet("pids-limit") {
r.Pids.Limit = i64Ptr(int64(context.Int("pids-limit")))
}
}
// Fix up values

View File

@@ -90,8 +90,8 @@ type Resources struct {
// Cgroup's SCHED_IDLE value.
CPUIdle *int64 `json:"cpu_idle,omitempty"`
// Process limit; set <= `0' to disable limit.
PidsLimit int64 `json:"pids_limit,omitempty"`
// Process limit; set < `0' to disable limit. `nil` means "keep current limit".
PidsLimit *int64 `json:"pids_limit,omitempty"`
// Specifies per cgroup weight, range is from 10 to 1000.
BlkioWeight uint16 `json:"blkio_weight,omitempty"`

View File

@@ -129,12 +129,16 @@ func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
defer fd.Close()
scanner := bufio.NewScanner(fd)
scanner.Scan() // skipping header line
scanner.Scan() // Read header line.
const want = "cpu user system"
if hdr := scanner.Text(); !strings.HasPrefix(hdr, want) {
return nil, nil, malformedLine(path, file, hdr)
}
for scanner.Scan() {
// Each line is: cpu user system
fields := strings.SplitN(scanner.Text(), " ", 3)
if len(fields) != 3 {
// Each line is: cpu user system. Keep N at 4 to ignore extra fields.
fields := strings.SplitN(scanner.Text(), " ", 4)
if len(fields) < 3 {
continue
}

View File

@@ -19,19 +19,24 @@ func (s *PidsGroup) Apply(path string, _ *cgroups.Resources, pid int) error {
}
// Set writes the "pids.max" file of the cgroup at path from r.PidsLimit.
// With the pointer-typed PidsLimit: nil returns without writing anything, a
// positive value is written in decimal, 0 is written as "1", and a negative
// value is written as "max". Any error from cgroups.WriteFile is returned
// unchanged.
func (s *PidsGroup) Set(path string, r *cgroups.Resources) error {
if r.PidsLimit != 0 {
// "max" is the fallback value.
limit := "max"
if r.PidsLimit > 0 {
limit = strconv.FormatInt(r.PidsLimit, 10)
}
if err := cgroups.WriteFile(path, "pids.max", limit); err != nil {
return err
}
if r.PidsLimit == nil {
return nil
}
// "max" is the fallback value.
val := "max"
if limit := *r.PidsLimit; limit > 0 {
val = strconv.FormatInt(limit, 10)
} else if limit == 0 {
// systemd doesn't support setting pids.max to "0", so when setting
// TasksMax we need to remap it to "1". We do the same thing here to
// avoid flip-flop behaviour between the fs and systemd drivers. In
// practice, the pids cgroup behaviour is basically identical.
val = "1"
}
if err := cgroups.WriteFile(path, "pids.max", val); err != nil {
return err
}
return nil
}

View File

@@ -165,11 +165,22 @@ func statIo(dirPath string, stats *cgroups.Stats) error {
case "wios":
op = "Write"
targetTable = &parsedStats.IoServicedRecursive
case "cost.usage":
op = "Count"
targetTable = &parsedStats.IoCostUsage
case "cost.wait":
op = "Count"
targetTable = &parsedStats.IoCostWait
case "cost.indebt":
op = "Count"
targetTable = &parsedStats.IoCostIndebt
case "cost.indelay":
op = "Count"
targetTable = &parsedStats.IoCostIndelay
default:
// Skip over entries we cannot map to cgroupv1 stats for now.
// In the future we should expand the stats struct to include
// them.
logrus.Debugf("cgroupv2 io stats: skipping over unmappable %s entry", item)
logrus.Debugf("cgroupv2 io stats: unknown entry %s", item)
continue
}

View File

@@ -4,6 +4,7 @@ import (
"errors"
"math"
"os"
"strconv"
"strings"
"golang.org/x/sys/unix"
@@ -13,19 +14,26 @@ import (
)
// isPidsSet reports whether r carries a pids limit. With the pointer-typed
// PidsLimit this means the field is non-nil, so an explicit 0 counts as set.
func isPidsSet(r *cgroups.Resources) bool {
return r.PidsLimit != 0
return r.PidsLimit != nil
}
// setPids writes the "pids.max" file under dirPath from r.PidsLimit. It is a
// no-op when isPidsSet(r) is false. With the pointer-typed PidsLimit, a
// positive value is written in decimal, 0 is written as "1", and a negative
// value is written as "max". Any error from cgroups.WriteFile is returned
// unchanged.
func setPids(dirPath string, r *cgroups.Resources) error {
if !isPidsSet(r) {
return nil
}
if val := numToStr(r.PidsLimit); val != "" {
if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil {
return err
}
val := "max"
if limit := *r.PidsLimit; limit > 0 {
val = strconv.FormatInt(limit, 10)
} else if limit == 0 {
// systemd doesn't support setting pids.max to "0", so when setting
// TasksMax we need to remap it to "1". We do the same thing here to
// avoid flip-flop behaviour between the fs and systemd drivers. In
// practice, the pids cgroup behaviour is basically identical.
val = "1"
}
if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil {
return err
}
return nil
}

View File

@@ -159,6 +159,10 @@ type BlkioStats struct {
IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"`
SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
IoCostUsage []BlkioStatEntry `json:"io_cost_usage,omitempty"`
IoCostWait []BlkioStatEntry `json:"io_cost_wait,omitempty"`
IoCostIndebt []BlkioStatEntry `json:"io_cost_indebt,omitempty"`
IoCostIndelay []BlkioStatEntry `json:"io_cost_indelay,omitempty"`
}
type HugetlbStats struct {

View File

@@ -4,10 +4,13 @@ import (
"context"
"errors"
"fmt"
"math/rand/v2"
"sync"
"time"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"golang.org/x/sys/unix"
)
var (
@@ -64,10 +67,27 @@ func (d *dbusConnManager) getConnection() (*systemdDbus.Conn, error) {
}
// newConnection opens a new systemd D-Bus connection. When dbusRootless is
// set it uses newUserSystemdDbus, otherwise systemdDbus.NewWithContext. An
// attempt that fails with EAGAIN is retried (up to 7 attempts in total) with
// an exponentially growing sleep in between; any other result, success or
// failure, is returned immediately. If every attempt fails with EAGAIN, the
// last error is returned wrapped.
func (d *dbusConnManager) newConnection() (*systemdDbus.Conn, error) {
if dbusRootless {
return newUserSystemdDbus()
newDbusConn := func() (*systemdDbus.Conn, error) {
if dbusRootless {
return newUserSystemdDbus()
}
return systemdDbus.NewWithContext(context.TODO())
}
return systemdDbus.NewWithContext(context.TODO())
var err error
for retry := range 7 {
var conn *systemdDbus.Conn
conn, err = newDbusConn()
if !errors.Is(err, unix.EAGAIN) {
return conn, err
}
// Exponential backoff (100ms * 2^attempt + ~12.5% jitter).
// At most we would expect 15 seconds of delay with 7 attempts.
// NOTE(review): time.Duration(n) is n nanoseconds, while the bound passed
// to rand.Int64N below is computed from delay.Milliseconds(). As written
// the added jitter is at most (delay in ms)/8 nanoseconds, which is far
// below the ~12.5% stated above. Presumably the random value was meant to
// be multiplied by time.Millisecond -- confirm.
// NOTE(review): the sleep also runs after the final failed attempt, just
// before the error is returned.
delay := 100 * time.Millisecond << retry
delay += time.Duration(rand.Int64N(1 + (delay.Milliseconds() >> 3)))
time.Sleep(delay)
}
return nil, fmt.Errorf("dbus connection failed after several retries: %w", err)
}
// resetConnection resets the connection to its initial state

View File

@@ -2,6 +2,7 @@ package systemd
import (
"errors"
"math"
"os"
"path/filepath"
"strings"
@@ -97,9 +98,17 @@ func genV1ResourcesProperties(r *cgroups.Resources, cm *dbusConnManager) ([]syst
newProp("BlockIOWeight", uint64(r.BlkioWeight)))
}
if r.PidsLimit > 0 || r.PidsLimit == -1 {
if r.PidsLimit != nil {
var tasksMax uint64
if limit := *r.PidsLimit; limit < 0 {
tasksMax = math.MaxUint64 // "infinity"
} else if limit == 0 {
tasksMax = 1 // systemd does not accept "0" for TasksMax
} else {
tasksMax = uint64(limit)
}
properties = append(properties,
newProp("TasksMax", uint64(r.PidsLimit)))
newProp("TasksMax", tasksMax))
}
err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)

View File

@@ -176,6 +176,9 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
}
}
if num == 0 {
num = 1 // systemd does not accept "0" for TasksMax
}
props = append(props,
newProp("TasksMax", num))
@@ -256,9 +259,17 @@ func genV2ResourcesProperties(dirPath string, r *cgroups.Resources, cm *dbusConn
addCPUQuota(cm, &properties, &r.CpuQuota, r.CpuPeriod)
if r.PidsLimit > 0 || r.PidsLimit == -1 {
if r.PidsLimit != nil {
var tasksMax uint64
if limit := *r.PidsLimit; limit < 0 {
tasksMax = math.MaxUint64 // "infinity"
} else if limit == 0 {
tasksMax = 1 // systemd does not accept "0" for TasksMax
} else {
tasksMax = uint64(limit)
}
properties = append(properties,
newProp("TasksMax", uint64(r.PidsLimit)))
newProp("TasksMax", tasksMax))
}
err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)

View File

@@ -31,6 +31,8 @@ type Spec struct {
VM *VM `json:"vm,omitempty" platform:"vm"`
// ZOS is platform-specific configuration for z/OS based containers.
ZOS *ZOS `json:"zos,omitempty" platform:"zos"`
// FreeBSD is platform-specific configuration for FreeBSD based containers.
FreeBSD *FreeBSD `json:"freebsd,omitempty" platform:"freebsd"`
}
// Scheduler represents the scheduling attributes for a process. It is based on
@@ -170,7 +172,7 @@ type Mount struct {
// Destination is the absolute path where the mount will be placed in the container.
Destination string `json:"destination"`
// Type specifies the mount kind.
Type string `json:"type,omitempty" platform:"linux,solaris,zos"`
Type string `json:"type,omitempty" platform:"linux,solaris,zos,freebsd"`
// Source specifies the source path of the mount.
Source string `json:"source,omitempty"`
// Options are fstab style mount options.
@@ -434,7 +436,7 @@ type LinuxCPU struct {
// LinuxPids for Linux cgroup 'pids' resource management (Linux 4.3)
type LinuxPids struct {
// Maximum number of PIDs. Default is "no limit".
Limit int64 `json:"limit"`
Limit *int64 `json:"limit,omitempty"`
}
// LinuxNetwork identification and priority configuration
@@ -688,6 +690,32 @@ type WindowsHyperV struct {
UtilityVMPath string `json:"utilityVMPath,omitempty"`
}
// IOMems contains information about iomem addresses that should be passed to the VM.
//
// NOTE(review): FirstMFN and NrMFNs are pointers without "omitempty", so a nil
// value is marshalled as JSON null rather than omitted; only FirstGFN is
// dropped from the output when unset. Presumably this marks the two fields as
// required -- confirm against the specification text.
type IOMems struct {
// Guest Frame Number to map the iomem range. If GFN is not specified, the mapping will be done to the same Frame Number as was provided in FirstMFN.
FirstGFN *uint64 `json:"firstGFN,omitempty"`
// Physical page number of iomem regions.
FirstMFN *uint64 `json:"firstMFN"`
// Number of pages to be mapped.
NrMFNs *uint64 `json:"nrMFNs"`
}
// HWConfig is the hardware configuration for the VM image. Every field is
// optional and is omitted from the JSON encoding when unset.
type HWConfig struct {
// DeviceTree is the path to the container device-tree file that should be passed to the VM configuration.
DeviceTree string `json:"deviceTree,omitempty"`
// VCPUs is the number of virtual cpus for the VM.
VCPUs *uint32 `json:"vcpus,omitempty"`
// Memory is the maximum memory in bytes allocated to the VM.
Memory *uint64 `json:"memory,omitempty"`
// DtDevs lists the host device tree nodes to passthrough to the VM.
DtDevs []string `json:"dtdevs,omitempty"`
// IOMems allows auto-translated domains to access specific hardware I/O memory pages.
IOMems []IOMems `json:"iomems,omitempty"`
// Irqs allows the VM to access specific physical IRQs.
Irqs []uint32 `json:"irqs,omitempty"`
}
// VM contains information for virtual-machine-based containers.
type VM struct {
// Hypervisor specifies hypervisor-related configuration for virtual-machine-based containers.
@@ -696,6 +724,8 @@ type VM struct {
Kernel VMKernel `json:"kernel"`
// Image specifies guest image related configuration for virtual-machine-based containers.
Image VMImage `json:"image,omitempty"`
// Hardware configuration that should be passed to the VM.
HwConfig *HWConfig `json:"hwconfig,omitempty"`
}
// VMHypervisor contains information about the hypervisor to use for a virtual machine.
@@ -963,3 +993,75 @@ const (
// SchedFlagUtilClampMax represents the utilization clamp maximum scheduling flag
SchedFlagUtilClampMax LinuxSchedulerFlag = "SCHED_FLAG_UTIL_CLAMP_MAX"
)
// FreeBSD contains platform-specific configuration for FreeBSD based containers.
// Both fields are optional and are omitted from the JSON encoding when unset.
type FreeBSD struct {
// Devices lists the devices which are accessible in the container.
Devices []FreeBSDDevice `json:"devices,omitempty"`
// Jail is the jail definition for this container.
Jail *FreeBSDJail `json:"jail,omitempty"`
}
// FreeBSDDevice describes a single device entry of FreeBSD.Devices, i.e. a
// device that is accessible in the container.
type FreeBSDDevice struct {
// Path to the device, relative to /dev.
Path string `json:"path"`
// FileMode permission bits for the device. Optional; omitted from the JSON
// encoding when nil.
Mode *os.FileMode `json:"mode,omitempty"`
}
// FreeBSDJail describes how to configure the container's jail. Every field is
// optional and is omitted from the JSON encoding when unset.
type FreeBSDJail struct {
// Parent is the parent jail name - this can be used to share a single vnet
// across several containers.
Parent string `json:"parent,omitempty"`
// Host selects whether to use parent UTS names or override in the container.
Host FreeBSDSharing `json:"host,omitempty"`
// Ip4 is the IPv4 address sharing mode for the container.
Ip4 FreeBSDSharing `json:"ip4,omitempty"`
// Ip4Addr lists the IPv4 addresses for the container.
Ip4Addr []string `json:"ip4Addr,omitempty"`
// Ip6 is the IPv6 address sharing mode for the container.
Ip6 FreeBSDSharing `json:"ip6,omitempty"`
// Ip6Addr lists the IPv6 addresses for the container.
Ip6Addr []string `json:"ip6Addr,omitempty"`
// Vnet selects which network stack to use for the container.
Vnet FreeBSDSharing `json:"vnet,omitempty"`
// Interface, if set, is the interface that the Ip4Addr and Ip6Addr
// addresses will be added to.
Interface string `json:"interface,omitempty"`
// VnetInterfaces lists interfaces to be moved to the container's vnet.
VnetInterfaces []string `json:"vnetInterfaces,omitempty"`
// SysVMsg is the SystemV IPC message sharing mode for the container.
SysVMsg FreeBSDSharing `json:"sysvmsg,omitempty"`
// SysVSem is the SystemV semaphore sharing mode for the container.
SysVSem FreeBSDSharing `json:"sysvsem,omitempty"`
// SysVShm is the SystemV memory sharing mode for the container.
SysVShm FreeBSDSharing `json:"sysvshm,omitempty"`
// EnforceStatfs controls mount visibility (see jail(8) for details).
EnforceStatfs *int `json:"enforceStatfs,omitempty"`
// Allow holds the jail capabilities.
Allow *FreeBSDJailAllow `json:"allow,omitempty"`
}
// FreeBSDSharing values are used to control access to features in the
// container, either disabling the feature, sharing state with the parent or
// creating new private state in the container.
type FreeBSDSharing string
const (
// FreeBSDShareDisable is the "disable" value (the feature is disabled).
FreeBSDShareDisable FreeBSDSharing = "disable"
// FreeBSDShareNew is the "new" value (presumably: new private state is
// created in the container -- confirm against the specification text).
FreeBSDShareNew FreeBSDSharing = "new"
// FreeBSDShareInherit is the "inherit" value (presumably: state is shared
// with the parent -- confirm against the specification text).
FreeBSDShareInherit FreeBSDSharing = "inherit"
)
// FreeBSDJailAllow describes jail capabilities. Every field is optional and
// is omitted from the JSON encoding when it holds its zero value. All fields
// are boolean switches except Mount, which is a list of strings.
//
// NOTE(review): the field names look like they mirror the allow.* parameters
// of jail(8) (e.g. allow.set_hostname, allow.raw_sockets); the exact mapping
// and the meaning of the Mount entries are not visible here -- confirm
// against the runtime-spec FreeBSD documentation.
type FreeBSDJailAllow struct {
SetHostname bool `json:"setHostname,omitempty"`
RawSockets bool `json:"rawSockets,omitempty"`
Chflags bool `json:"chflags,omitempty"`
Mount []string `json:"mount,omitempty"`
Quotas bool `json:"quotas,omitempty"`
SocketAf bool `json:"socketAf,omitempty"`
Mlock bool `json:"mlock,omitempty"`
ReservedPorts bool `json:"reservedPorts,omitempty"`
Suser bool `json:"suser,omitempty"`
}

View File

@@ -6,12 +6,12 @@ const (
// VersionMajor is for API-incompatible changes
VersionMajor = 1
// VersionMinor is for functionality added in a backwards-compatible manner
VersionMinor = 2
VersionMinor = 3
// VersionPatch is for backwards-compatible bug fixes
VersionPatch = 1
VersionPatch = 0
// VersionDev indicates development branch. Releases will be empty string.
VersionDev = "+dev"
VersionDev = ""
)
// Version is the specification version that the package types support.

4
vendor/modules.txt vendored
View File

@@ -68,7 +68,7 @@ github.com/moby/sys/userns
# github.com/mrunalp/fileutils v0.5.1
## explicit; go 1.13
github.com/mrunalp/fileutils
# github.com/opencontainers/cgroups v0.0.5
# github.com/opencontainers/cgroups v0.0.6
## explicit; go 1.23.0
github.com/opencontainers/cgroups
github.com/opencontainers/cgroups/devices
@@ -79,7 +79,7 @@ github.com/opencontainers/cgroups/fscommon
github.com/opencontainers/cgroups/internal/path
github.com/opencontainers/cgroups/manager
github.com/opencontainers/cgroups/systemd
# github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0
# github.com/opencontainers/runtime-spec v1.3.0
## explicit
github.com/opencontainers/runtime-spec/specs-go
github.com/opencontainers/runtime-spec/specs-go/features