Decouple cgroup devices handling

This commit separates the functionality of setting cgroup device
rules out of libct/cgroups to libct/cgroups/devices package. This
package, if imported, sets the function variables in libct/cgroups and
libct/cgroups/systemd, so that a cgroup manager can use those to manage
devices. If those function variables are nil (when libct/cgroups/devices
are not imported), a cgroup manager returns the ErrDevicesUnsupported
in case any device rules are set in Resources.

It also consolidates the code from libct/cgroups/ebpf and
libct/cgroups/ebpf/devicefilter into libct/cgroups/devices.

Moved some tests in libct/cg/sd that require device management to
libct/sd/devices.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
Kir Kolyshkin
2022-04-06 20:47:25 -07:00
parent 25f1856236
commit b6967fa84c
18 changed files with 674 additions and 545 deletions

View File

@@ -1,9 +1,24 @@
package cgroups
import (
"errors"
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
// ErrDevicesUnsupported is an error returned when a cgroup manager
// is not configured to set device rules.
ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules")
// DevicesSetV1 and DevicesSetV2 are functions to set devices for
// cgroup v1 and v2, respectively. Unless libcontainer/cgroups/devices
// package is imported, it is set to nil, so cgroup managers can't
// manage devices.
DevicesSetV1 func(path string, r *configs.Resources) error
DevicesSetV2 func(path string, r *configs.Resources) error
)
type Manager interface {
// Apply creates a cgroup, if not yet created, and adds a process
// with the specified pid into that cgroup. A special value of -1

View File

@@ -1,10 +1,10 @@
// Package devicefilter contains eBPF device filter program
// Implements creation of eBPF device filter program.
//
// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
// Based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
//
// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
package devicefilter
package devices
import (
"errors"
@@ -13,7 +13,6 @@ import (
"strconv"
"github.com/cilium/ebpf/asm"
devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/devices"
"golang.org/x/sys/unix"
)
@@ -30,7 +29,7 @@ func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
// gives us a guarantee that the behaviour of devices filtering is the same
// as cgroupv1, including security hardenings to avoid misconfiguration
// (such as punching holes in wildcard rules).
emu := new(devicesemulator.Emulator)
emu := new(Emulator)
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, "", err

View File

@@ -1,4 +1,4 @@
package devicefilter
package devices
import (
"strings"

View File

@@ -0,0 +1,16 @@
// Package devices contains functionality to manage cgroup devices, which
// is exposed indirectly via libcontainer/cgroups managers.
//
// To enable cgroup managers to manage devices, this package must be imported.
package devices
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
)
func init() {
cgroups.DevicesSetV1 = setV1
cgroups.DevicesSetV2 = setV2
systemd.GenerateDeviceProps = systemdProperties
}

View File

@@ -1,4 +1,4 @@
package ebpf
package devices
import (
"errors"

View File

@@ -0,0 +1,233 @@
package devices
import (
"bufio"
"errors"
"fmt"
"os"
"strings"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/godbus/dbus/v5"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
)
// systemdProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
func systemdProperties(r *configs.Resources) ([]systemdDbus.Property, error) {
if r.SkipDevices {
return nil, nil
}
properties := []systemdDbus.Property{
// Always run in the strictest white-list mode.
newProp("DevicePolicy", "strict"),
// Empty the DeviceAllow array before filling it.
newProp("DeviceAllow", []deviceAllowEntry{}),
}
// Figure out the set of rules.
configEmu := Emulator{}
for _, rule := range r.Devices {
if err := configEmu.Apply(*rule); err != nil {
return nil, fmt.Errorf("unable to apply rule for systemd: %w", err)
}
}
// systemd doesn't support blacklists. So we log a warning, and tell
// systemd to act as a deny-all whitelist. This ruleset will be replaced
// with our normal fallback code. This may result in spurious errors, but
// the only other option is to error out here.
if configEmu.IsBlacklist() {
// However, if we're dealing with an allow-all rule then we can do it.
if configEmu.IsAllowAll() {
return allowAllDevices(), nil
}
logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
return properties, nil
}
// Now generate the set of rules we actually need to apply. Unlike the
// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
// whitelist which is the default for devices.Emulator.
finalRules, err := configEmu.Rules()
if err != nil {
return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err)
}
var deviceAllowList []deviceAllowEntry
for _, rule := range finalRules {
if !rule.Allow {
// Should never happen.
return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
}
switch rule.Type {
case devices.BlockDevice, devices.CharDevice:
default:
// Should never happen.
return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
}
entry := deviceAllowEntry{
Perms: string(rule.Permissions),
}
// systemd has a fairly odd (though understandable) syntax here, and
// because of the OCI configuration format we have to do quite a bit of
// trickery to convert things:
//
// * Concrete rules with non-wildcard major/minor numbers have to use
// /dev/{block,char} paths. This is slightly odd because it means
// that we cannot add whitelist rules for devices that don't exist,
// but there's not too much we can do about that.
//
// However, path globbing is not support for path-based rules so we
// need to handle wildcards in some other manner.
//
// * Wildcard-minor rules have to specify a "device group name" (the
// second column in /proc/devices).
//
// * Wildcard (major and minor) rules can just specify a glob with the
// type ("char-*" or "block-*").
//
// The only type of rule we can't handle is wildcard-major rules, and
// so we'll give a warning in that case (note that the fallback code
// will insert any rules systemd couldn't handle). What amazing fun.
if rule.Major == devices.Wildcard {
// "_ *:n _" rules aren't supported by systemd.
if rule.Minor != devices.Wildcard {
logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
continue
}
// "_ *:* _" rules just wildcard everything.
prefix, err := groupPrefix(rule.Type)
if err != nil {
return nil, err
}
entry.Path = prefix + "*"
} else if rule.Minor == devices.Wildcard {
// "_ n:* _" rules require a device group from /proc/devices.
group, err := findDeviceGroup(rule.Type, rule.Major)
if err != nil {
return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err)
}
if group == "" {
// Couldn't find a group.
logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
continue
}
entry.Path = group
} else {
// "_ n:m _" rules are just a path in /dev/{block,char}/.
switch rule.Type {
case devices.BlockDevice:
entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
case devices.CharDevice:
entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
}
}
deviceAllowList = append(deviceAllowList, entry)
}
properties = append(properties, newProp("DeviceAllow", deviceAllowList))
return properties, nil
}
func newProp(name string, units interface{}) systemdDbus.Property {
return systemdDbus.Property{
Name: name,
Value: dbus.MakeVariant(units),
}
}
func groupPrefix(ruleType devices.Type) (string, error) {
switch ruleType {
case devices.BlockDevice:
return "block-", nil
case devices.CharDevice:
return "char-", nil
default:
return "", fmt.Errorf("device type %v has no group prefix", ruleType)
}
}
// findDeviceGroup tries to find the device group name (as listed in
// /proc/devices) with the type prefixed as required for DeviceAllow, for a
// given (type, major) combination. If more than one device group exists, an
// arbitrary one is chosen.
func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
fh, err := os.Open("/proc/devices")
if err != nil {
return "", err
}
defer fh.Close()
prefix, err := groupPrefix(ruleType)
if err != nil {
return "", err
}
scanner := bufio.NewScanner(fh)
var currentType devices.Type
for scanner.Scan() {
// We need to strip spaces because the first number is column-aligned.
line := strings.TrimSpace(scanner.Text())
// Handle the "header" lines.
switch line {
case "Block devices:":
currentType = devices.BlockDevice
continue
case "Character devices:":
currentType = devices.CharDevice
continue
case "":
continue
}
// Skip lines unrelated to our type.
if currentType != ruleType {
continue
}
// Parse out the (major, name).
var (
currMajor int64
currName string
)
if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
if err == nil {
err = errors.New("wrong number of fields")
}
return "", fmt.Errorf("scan /proc/devices line %q: %w", line, err)
}
if currMajor == ruleMajor {
return prefix + currName, nil
}
}
if err := scanner.Err(); err != nil {
return "", fmt.Errorf("reading /proc/devices: %w", err)
}
// Couldn't find the device group.
return "", nil
}
// DeviceAllow is the dbus type "a(ss)" which means we need a struct
// to represent it in Go.
type deviceAllowEntry struct {
Path string
Perms string
}
func allowAllDevices() []systemdDbus.Property {
// Setting mode to auto and removing all DeviceAllow rules
// results in allowing access to all devices.
return []systemdDbus.Property{
newProp("DevicePolicy", "auto"),
newProp("DeviceAllow", []deviceAllowEntry{}),
}
}

View File

@@ -0,0 +1,253 @@
package devices
import (
"bytes"
"os"
"os/exec"
"strings"
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
)
// TestPodSkipDevicesUpdate checks that updating a pod having SkipDevices: true
// does not result in spurious "permission denied" errors in a container
// running under the pod. The test is somewhat similar in nature to the
// @test "update devices [minimal transition rules]" in tests/integration,
// but uses a pod.
func TestPodSkipDevicesUpdate(t *testing.T) {
if !systemd.IsRunningSystemd() {
t.Skip("Test requires systemd.")
}
if os.Geteuid() != 0 {
t.Skip("Test requires root.")
}
podName := "system-runc_test_pod" + t.Name() + ".slice"
podConfig := &configs.Cgroup{
Systemd: true,
Parent: "system.slice",
Name: podName,
Resources: &configs.Resources{
PidsLimit: 42,
Memory: 32 * 1024 * 1024,
SkipDevices: true,
},
}
// Create "pod" cgroup (a systemd slice to hold containers).
pm := newManager(t, podConfig)
if err := pm.Apply(-1); err != nil {
t.Fatal(err)
}
if err := pm.Set(podConfig.Resources); err != nil {
t.Fatal(err)
}
containerConfig := &configs.Cgroup{
Parent: podName,
ScopePrefix: "test",
Name: "PodSkipDevicesUpdate",
Resources: &configs.Resources{
Devices: []*devices.Rule{
// Allow access to /dev/null.
{
Type: devices.CharDevice,
Major: 1,
Minor: 3,
Permissions: "rwm",
Allow: true,
},
},
},
}
// Create a "container" within the "pod" cgroup.
// This is not a real container, just a process in the cgroup.
cmd := exec.Command("bash", "-c", "while true; do echo > /dev/null; done")
cmd.Env = append(os.Environ(), "LANG=C")
var stderr bytes.Buffer
cmd.Stderr = &stderr
if err := cmd.Start(); err != nil {
t.Fatal(err)
}
// Make sure to not leave a zombie.
defer func() {
// These may fail, we don't care.
_ = cmd.Process.Kill()
_ = cmd.Wait()
}()
// Put the process into a cgroup.
cm := newManager(t, containerConfig)
if err := cm.Apply(cmd.Process.Pid); err != nil {
t.Fatal(err)
}
// Check that we put the "container" into the "pod" cgroup.
if !strings.HasPrefix(cm.Path("devices"), pm.Path("devices")) {
t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q",
cm.Path("devices"), pm.Path("devices"))
}
if err := cm.Set(containerConfig.Resources); err != nil {
t.Fatal(err)
}
// Now update the pod a few times.
for i := 0; i < 42; i++ {
podConfig.Resources.PidsLimit++
podConfig.Resources.Memory += 1024 * 1024
if err := pm.Set(podConfig.Resources); err != nil {
t.Fatal(err)
}
}
// Kill the "container".
if err := cmd.Process.Kill(); err != nil {
t.Fatal(err)
}
_ = cmd.Wait()
// "Container" stderr should be empty.
if stderr.Len() != 0 {
t.Fatalf("container stderr not empty: %s", stderr.String())
}
}
func testSkipDevices(t *testing.T, skipDevices bool, expected []string) {
if !systemd.IsRunningSystemd() {
t.Skip("Test requires systemd.")
}
if os.Geteuid() != 0 {
t.Skip("Test requires root.")
}
podConfig := &configs.Cgroup{
Parent: "system.slice",
Name: "system-runc_test_pods.slice",
Resources: &configs.Resources{
SkipDevices: skipDevices,
},
}
// Create "pods" cgroup (a systemd slice to hold containers).
pm := newManager(t, podConfig)
if err := pm.Apply(-1); err != nil {
t.Fatal(err)
}
if err := pm.Set(podConfig.Resources); err != nil {
t.Fatal(err)
}
config := &configs.Cgroup{
Parent: "system-runc_test_pods.slice",
ScopePrefix: "test",
Name: "SkipDevices",
Resources: &configs.Resources{
Devices: []*devices.Rule{
// Allow access to /dev/full only.
{
Type: devices.CharDevice,
Major: 1,
Minor: 7,
Permissions: "rwm",
Allow: true,
},
},
},
}
// Create a "container" within the "pods" cgroup.
// This is not a real container, just a process in the cgroup.
cmd := exec.Command("bash", "-c", "read; echo > /dev/full; cat /dev/null; true")
cmd.Env = append(os.Environ(), "LANG=C")
stdinR, stdinW, err := os.Pipe()
if err != nil {
t.Fatal(err)
}
cmd.Stdin = stdinR
var stderr bytes.Buffer
cmd.Stderr = &stderr
err = cmd.Start()
stdinR.Close()
defer stdinW.Close()
if err != nil {
t.Fatal(err)
}
// Make sure to not leave a zombie.
defer func() {
// These may fail, we don't care.
_, _ = stdinW.WriteString("hey\n")
_ = cmd.Wait()
}()
// Put the process into a cgroup.
m := newManager(t, config)
if err := m.Apply(cmd.Process.Pid); err != nil {
t.Fatal(err)
}
// Check that we put the "container" into the "pod" cgroup.
if !strings.HasPrefix(m.Path("devices"), pm.Path("devices")) {
t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q",
m.Path("devices"), pm.Path("devices"))
}
if err := m.Set(config.Resources); err != nil {
// failed to write "c 1:7 rwm": write /sys/fs/cgroup/devices/system.slice/system-runc_test_pods.slice/test-SkipDevices.scope/devices.allow: operation not permitted
if skipDevices == false && strings.HasSuffix(err.Error(), "/devices.allow: operation not permitted") {
// Cgroup v1 devices controller gives EPERM on trying
// to enable devices that are not enabled
// (skipDevices=false) in a parent cgroup.
// If this happens, test is passing.
return
}
t.Fatal(err)
}
// Check that we can access /dev/full but not /dev/zero.
if _, err := stdinW.WriteString("wow\n"); err != nil {
t.Fatal(err)
}
if err := cmd.Wait(); err != nil {
t.Fatal(err)
}
for _, exp := range expected {
if !strings.Contains(stderr.String(), exp) {
t.Errorf("expected %q, got: %s", exp, stderr.String())
}
}
}
func TestSkipDevicesTrue(t *testing.T) {
testSkipDevices(t, true, []string{
"echo: write error: No space left on device",
"cat: /dev/null: Operation not permitted",
})
}
func TestSkipDevicesFalse(t *testing.T) {
// If SkipDevices is not set for the parent slice, access to both
// devices should fail. This is done to assess the test correctness.
// For cgroup v1, we check for m.Set returning EPERM.
// For cgroup v2, we check for the errors below.
testSkipDevices(t, false, []string{
"/dev/full: Operation not permitted",
"cat: /dev/null: Operation not permitted",
})
}
func newManager(t *testing.T, config *configs.Cgroup) (m cgroups.Manager) {
t.Helper()
var err error
if cgroups.IsCgroup2UnifiedMode() {
m, err = systemd.NewUnifiedManager(config, "")
} else {
m, err = systemd.NewLegacyManager(config, nil)
}
if err != nil {
t.Fatal(err)
}
t.Cleanup(func() { _ = m.Destroy() })
return m
}

View File

@@ -0,0 +1,84 @@
package devices
import (
"bytes"
"errors"
"reflect"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
)
var testingSkipFinalCheck bool
func setV1(path string, r *configs.Resources) error {
if userns.RunningInUserNS() || r.SkipDevices {
return nil
}
// Generate two emulators, one for the current state of the cgroup and one
// for the requested state by the user.
current, err := loadEmulator(path)
if err != nil {
return err
}
target, err := buildEmulator(r.Devices)
if err != nil {
return err
}
// Compute the minimal set of transition rules needed to achieve the
// requested state.
transitionRules, err := current.Transition(target)
if err != nil {
return err
}
for _, rule := range transitionRules {
file := "devices.deny"
if rule.Allow {
file = "devices.allow"
}
if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil {
return err
}
}
// Final safety check -- ensure that the resulting state is what was
// requested. This is only really correct for white-lists, but for
// black-lists we can at least check that the cgroup is in the right mode.
//
// This safety-check is skipped for the unit tests because we cannot
// currently mock devices.list correctly.
if !testingSkipFinalCheck {
currentAfter, err := loadEmulator(path)
if err != nil {
return err
}
if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) {
return errors.New("resulting devices cgroup doesn't precisely match target")
} else if target.IsBlacklist() != currentAfter.IsBlacklist() {
return errors.New("resulting devices cgroup doesn't match target mode")
}
}
return nil
}
func loadEmulator(path string) (*Emulator, error) {
list, err := cgroups.ReadFile(path, "devices.list")
if err != nil {
return nil, err
}
return EmulatorFromList(bytes.NewBufferString(list))
}
func buildEmulator(rules []*devices.Rule) (*Emulator, error) {
// This defaults to a white-list -- which is what we want!
emu := &Emulator{}
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, err
}
}
return emu, nil
}

View File

@@ -1,21 +1,34 @@
package fs
package devices
import (
"os"
"path"
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
)
func TestDevicesSetAllow(t *testing.T) {
path := tempDir(t, "devices")
func init() {
testingSkipFinalCheck = true
cgroups.TestMode = true
}
writeFileContents(t, path, map[string]string{
func TestSetV1Allow(t *testing.T) {
dir := t.TempDir()
for file, contents := range map[string]string{
"devices.allow": "",
"devices.deny": "",
"devices.list": "a *:* rwm",
})
} {
err := os.WriteFile(path.Join(dir, file), []byte(contents), 0o600)
if err != nil {
t.Fatal(err)
}
}
r := &configs.Resources{
Devices: []*devices.Rule{
@@ -29,13 +42,12 @@ func TestDevicesSetAllow(t *testing.T) {
},
}
d := &DevicesGroup{TestingSkipFinalCheck: true}
if err := d.Set(path, r); err != nil {
if err := setV1(dir, r); err != nil {
t.Fatal(err)
}
// The default deny rule must be written.
value, err := fscommon.GetCgroupParamString(path, "devices.deny")
value, err := fscommon.GetCgroupParamString(dir, "devices.deny")
if err != nil {
t.Fatal(err)
}
@@ -44,7 +56,7 @@ func TestDevicesSetAllow(t *testing.T) {
}
// Permitted rule must be written.
if value, err := fscommon.GetCgroupParamString(path, "devices.allow"); err != nil {
if value, err := fscommon.GetCgroupParamString(dir, "devices.allow"); err != nil {
t.Fatal(err)
} else if value != "c 1:5 rwm" {
t.Errorf("Got the wrong value (%q), set devices.allow failed.", value)

View File

@@ -1,12 +1,10 @@
package fs2
package devices
import (
"fmt"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf"
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
@@ -53,11 +51,11 @@ func canSkipEBPFError(r *configs.Resources) bool {
return true
}
func setDevices(dirPath string, r *configs.Resources) error {
func setV2(dirPath string, r *configs.Resources) error {
if r.SkipDevices {
return nil
}
insts, license, err := devicefilter.DeviceFilter(r.Devices)
insts, license, err := DeviceFilter(r.Devices)
if err != nil {
return err
}
@@ -66,7 +64,7 @@ func setDevices(dirPath string, r *configs.Resources) error {
return fmt.Errorf("cannot get dir FD for %s", dirPath)
}
defer unix.Close(dirFD)
if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
if _, err := LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
if !canSkipEBPFError(r) {
return err
}

View File

@@ -1,20 +1,11 @@
package fs
import (
"bytes"
"errors"
"reflect"
"github.com/opencontainers/runc/libcontainer/cgroups"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
)
type DevicesGroup struct {
TestingSkipFinalCheck bool
}
type DevicesGroup struct{}
func (s *DevicesGroup) Name() string {
return "devices"
@@ -33,75 +24,14 @@ func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error {
return apply(path, pid)
}
func loadEmulator(path string) (*cgroupdevices.Emulator, error) {
list, err := cgroups.ReadFile(path, "devices.list")
if err != nil {
return nil, err
}
return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list))
}
func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) {
// This defaults to a white-list -- which is what we want!
emu := &cgroupdevices.Emulator{}
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, err
}
}
return emu, nil
}
func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
if userns.RunningInUserNS() || r.SkipDevices {
return nil
}
// Generate two emulators, one for the current state of the cgroup and one
// for the requested state by the user.
current, err := loadEmulator(path)
if err != nil {
return err
}
target, err := buildEmulator(r.Devices)
if err != nil {
return err
}
// Compute the minimal set of transition rules needed to achieve the
// requested state.
transitionRules, err := current.Transition(target)
if err != nil {
return err
}
for _, rule := range transitionRules {
file := "devices.deny"
if rule.Allow {
file = "devices.allow"
}
if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil {
return err
if cgroups.DevicesSetV1 == nil {
if len(r.Devices) == 0 {
return nil
}
return cgroups.ErrDevicesUnsupported
}
// Final safety check -- ensure that the resulting state is what was
// requested. This is only really correct for white-lists, but for
// black-lists we can at least check that the cgroup is in the right mode.
//
// This safety-check is skipped for the unit tests because we cannot
// currently mock devices.list correctly.
if !s.TestingSkipFinalCheck {
currentAfter, err := loadEmulator(path)
if err != nil {
return err
}
if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) {
return errors.New("resulting devices cgroup doesn't precisely match target")
} else if target.IsBlacklist() != currentAfter.IsBlacklist() {
return errors.New("resulting devices cgroup doesn't match target mode")
}
}
return nil
return cgroups.DevicesSetV1(path, r)
}
func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {

View File

@@ -182,7 +182,7 @@ func (m *manager) Set(r *configs.Resources) error {
if err := sys.Set(path, r); err != nil {
// When rootless is true, errors from the device subsystem
// are ignored, as it is really not expected to work.
if m.cgroups.Rootless && sys.Name() == "devices" {
if m.cgroups.Rootless && sys.Name() == "devices" && !errors.Is(err, cgroups.ErrDevicesUnsupported) {
continue
}
// However, errors from other subsystems are not ignored.

View File

@@ -175,8 +175,10 @@ func (m *manager) Set(r *configs.Resources) error {
// When rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if err := setDevices(m.dirPath, r); err != nil && !m.config.Rootless {
return err
if err := setDevices(m.dirPath, r); err != nil {
if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) {
return err
}
}
// cpuset (since kernel 5.0)
if err := setCpuset(m.dirPath, r); err != nil {
@@ -201,6 +203,16 @@ func (m *manager) Set(r *configs.Resources) error {
return nil
}
func setDevices(dirPath string, r *configs.Resources) error {
if cgroups.DevicesSetV2 == nil {
if len(r.Devices) > 0 {
return cgroups.ErrDevicesUnsupported
}
return nil
}
return cgroups.DevicesSetV2(dirPath, r)
}
func (m *manager) setUnified(res map[string]string) error {
for k, v := range res {
if strings.Contains(k, "/") {

View File

@@ -16,6 +16,7 @@ import (
dbus "github.com/godbus/dbus/v5"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
@@ -32,6 +33,8 @@ var (
isRunningSystemdOnce sync.Once
isRunningSystemd bool
GenerateDeviceProps func(*configs.Resources) ([]systemdDbus.Property, error)
)
// NOTE: This function comes from package github.com/coreos/go-systemd/util
@@ -313,3 +316,16 @@ func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems st
}
return nil
}
// generateDeviceProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, error) {
if GenerateDeviceProps == nil {
if len(r.Devices) > 0 {
return nil, cgroups.ErrDevicesUnsupported
}
return nil, nil
}
return GenerateDeviceProps(r)
}

View File

@@ -1,20 +1,11 @@
package systemd
import (
"bufio"
"errors"
"fmt"
"os"
"reflect"
"strings"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/sirupsen/logrus"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
)
// freezeBeforeSet answers whether there is a need to freeze the cgroup before
@@ -81,213 +72,3 @@ func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) (
}
return
}
func groupPrefix(ruleType devices.Type) (string, error) {
switch ruleType {
case devices.BlockDevice:
return "block-", nil
case devices.CharDevice:
return "char-", nil
default:
return "", fmt.Errorf("device type %v has no group prefix", ruleType)
}
}
// findDeviceGroup tries to find the device group name (as listed in
// /proc/devices) with the type prefixed as required for DeviceAllow, for a
// given (type, major) combination. If more than one device group exists, an
// arbitrary one is chosen.
func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
fh, err := os.Open("/proc/devices")
if err != nil {
return "", err
}
defer fh.Close()
prefix, err := groupPrefix(ruleType)
if err != nil {
return "", err
}
scanner := bufio.NewScanner(fh)
var currentType devices.Type
for scanner.Scan() {
// We need to strip spaces because the first number is column-aligned.
line := strings.TrimSpace(scanner.Text())
// Handle the "header" lines.
switch line {
case "Block devices:":
currentType = devices.BlockDevice
continue
case "Character devices:":
currentType = devices.CharDevice
continue
case "":
continue
}
// Skip lines unrelated to our type.
if currentType != ruleType {
continue
}
// Parse out the (major, name).
var (
currMajor int64
currName string
)
if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
if err == nil {
err = errors.New("wrong number of fields")
}
return "", fmt.Errorf("scan /proc/devices line %q: %w", line, err)
}
if currMajor == ruleMajor {
return prefix + currName, nil
}
}
if err := scanner.Err(); err != nil {
return "", fmt.Errorf("reading /proc/devices: %w", err)
}
// Couldn't find the device group.
return "", nil
}
// DeviceAllow is the dbus type "a(ss)" which means we need a struct
// to represent it in Go.
type deviceAllowEntry struct {
Path string
Perms string
}
func allowAllDevices() []systemdDbus.Property {
// Setting mode to auto and removing all DeviceAllow rules
// results in allowing access to all devices.
return []systemdDbus.Property{
newProp("DevicePolicy", "auto"),
newProp("DeviceAllow", []deviceAllowEntry{}),
}
}
// generateDeviceProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, error) {
if r.SkipDevices {
return nil, nil
}
properties := []systemdDbus.Property{
// Always run in the strictest white-list mode.
newProp("DevicePolicy", "strict"),
// Empty the DeviceAllow array before filling it.
newProp("DeviceAllow", []deviceAllowEntry{}),
}
// Figure out the set of rules.
configEmu := &cgroupdevices.Emulator{}
for _, rule := range r.Devices {
if err := configEmu.Apply(*rule); err != nil {
return nil, fmt.Errorf("unable to apply rule for systemd: %w", err)
}
}
// systemd doesn't support blacklists. So we log a warning, and tell
// systemd to act as a deny-all whitelist. This ruleset will be replaced
// with our normal fallback code. This may result in spurious errors, but
// the only other option is to error out here.
if configEmu.IsBlacklist() {
// However, if we're dealing with an allow-all rule then we can do it.
if configEmu.IsAllowAll() {
return allowAllDevices(), nil
}
logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
return properties, nil
}
// Now generate the set of rules we actually need to apply. Unlike the
// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
// whitelist which is the default for devices.Emulator.
finalRules, err := configEmu.Rules()
if err != nil {
return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err)
}
var deviceAllowList []deviceAllowEntry
for _, rule := range finalRules {
if !rule.Allow {
// Should never happen.
return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
}
switch rule.Type {
case devices.BlockDevice, devices.CharDevice:
default:
// Should never happen.
return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
}
entry := deviceAllowEntry{
Perms: string(rule.Permissions),
}
// systemd has a fairly odd (though understandable) syntax here, and
// because of the OCI configuration format we have to do quite a bit of
// trickery to convert things:
//
// * Concrete rules with non-wildcard major/minor numbers have to use
// /dev/{block,char} paths. This is slightly odd because it means
// that we cannot add whitelist rules for devices that don't exist,
// but there's not too much we can do about that.
//
// However, path globbing is not support for path-based rules so we
// need to handle wildcards in some other manner.
//
// * Wildcard-minor rules have to specify a "device group name" (the
// second column in /proc/devices).
//
// * Wildcard (major and minor) rules can just specify a glob with the
// type ("char-*" or "block-*").
//
// The only type of rule we can't handle is wildcard-major rules, and
// so we'll give a warning in that case (note that the fallback code
// will insert any rules systemd couldn't handle). What amazing fun.
if rule.Major == devices.Wildcard {
// "_ *:n _" rules aren't supported by systemd.
if rule.Minor != devices.Wildcard {
logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
continue
}
// "_ *:* _" rules just wildcard everything.
prefix, err := groupPrefix(rule.Type)
if err != nil {
return nil, err
}
entry.Path = prefix + "*"
} else if rule.Minor == devices.Wildcard {
// "_ n:* _" rules require a device group from /proc/devices.
group, err := findDeviceGroup(rule.Type, rule.Major)
if err != nil {
return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err)
}
if group == "" {
// Couldn't find a group.
logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
continue
}
entry.Path = group
} else {
// "_ n:m _" rules are just a path in /dev/{block,char}/.
switch rule.Type {
case devices.BlockDevice:
entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
case devices.CharDevice:
entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
}
}
deviceAllowList = append(deviceAllowList, entry)
}
properties = append(properties, newProp("DeviceAllow", deviceAllowList))
return properties, nil
}

View File

@@ -8,235 +8,11 @@ import (
"strings"
"testing"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"golang.org/x/sys/unix"
)
// TestPodSkipDevicesUpdate checks that updating a pod having SkipDevices: true
// does not result in spurious "permission denied" errors in a container
// running under the pod. The test is somewhat similar in nature to the
// @test "update devices [minimal transition rules]" in tests/integration,
// but uses a pod.
func TestPodSkipDevicesUpdate(t *testing.T) {
if !IsRunningSystemd() {
t.Skip("Test requires systemd.")
}
if os.Geteuid() != 0 {
t.Skip("Test requires root.")
}
podName := "system-runc_test_pod" + t.Name() + ".slice"
podConfig := &configs.Cgroup{
Systemd: true,
Parent: "system.slice",
Name: podName,
Resources: &configs.Resources{
PidsLimit: 42,
Memory: 32 * 1024 * 1024,
SkipDevices: true,
},
}
// Create "pod" cgroup (a systemd slice to hold containers).
pm := newManager(t, podConfig)
if err := pm.Apply(-1); err != nil {
t.Fatal(err)
}
if err := pm.Set(podConfig.Resources); err != nil {
t.Fatal(err)
}
containerConfig := &configs.Cgroup{
Parent: podName,
ScopePrefix: "test",
Name: "PodSkipDevicesUpdate",
Resources: &configs.Resources{
Devices: []*devices.Rule{
// Allow access to /dev/null.
{
Type: devices.CharDevice,
Major: 1,
Minor: 3,
Permissions: "rwm",
Allow: true,
},
},
},
}
// Create a "container" within the "pod" cgroup.
// This is not a real container, just a process in the cgroup.
cmd := exec.Command("bash", "-c", "while true; do echo > /dev/null; done")
cmd.Env = append(os.Environ(), "LANG=C")
var stderr bytes.Buffer
cmd.Stderr = &stderr
if err := cmd.Start(); err != nil {
t.Fatal(err)
}
// Make sure to not leave a zombie.
defer func() {
// These may fail, we don't care.
_ = cmd.Process.Kill()
_ = cmd.Wait()
}()
// Put the process into a cgroup.
cm := newManager(t, containerConfig)
if err := cm.Apply(cmd.Process.Pid); err != nil {
t.Fatal(err)
}
// Check that we put the "container" into the "pod" cgroup.
if !strings.HasPrefix(cm.Path("devices"), pm.Path("devices")) {
t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q",
cm.Path("devices"), pm.Path("devices"))
}
if err := cm.Set(containerConfig.Resources); err != nil {
t.Fatal(err)
}
// Now update the pod a few times.
for i := 0; i < 42; i++ {
podConfig.Resources.PidsLimit++
podConfig.Resources.Memory += 1024 * 1024
if err := pm.Set(podConfig.Resources); err != nil {
t.Fatal(err)
}
}
// Kill the "container".
if err := cmd.Process.Kill(); err != nil {
t.Fatal(err)
}
_ = cmd.Wait()
// "Container" stderr should be empty.
if stderr.Len() != 0 {
t.Fatalf("container stderr not empty: %s", stderr.String())
}
}
func testSkipDevices(t *testing.T, skipDevices bool, expected []string) {
if !IsRunningSystemd() {
t.Skip("Test requires systemd.")
}
if os.Geteuid() != 0 {
t.Skip("Test requires root.")
}
podConfig := &configs.Cgroup{
Parent: "system.slice",
Name: "system-runc_test_pods.slice",
Resources: &configs.Resources{
SkipDevices: skipDevices,
},
}
// Create "pods" cgroup (a systemd slice to hold containers).
pm := newManager(t, podConfig)
if err := pm.Apply(-1); err != nil {
t.Fatal(err)
}
if err := pm.Set(podConfig.Resources); err != nil {
t.Fatal(err)
}
config := &configs.Cgroup{
Parent: "system-runc_test_pods.slice",
ScopePrefix: "test",
Name: "SkipDevices",
Resources: &configs.Resources{
Devices: []*devices.Rule{
// Allow access to /dev/full only.
{
Type: devices.CharDevice,
Major: 1,
Minor: 7,
Permissions: "rwm",
Allow: true,
},
},
},
}
// Create a "container" within the "pods" cgroup.
// This is not a real container, just a process in the cgroup.
cmd := exec.Command("bash", "-c", "read; echo > /dev/full; cat /dev/null; true")
cmd.Env = append(os.Environ(), "LANG=C")
stdinR, stdinW, err := os.Pipe()
if err != nil {
t.Fatal(err)
}
cmd.Stdin = stdinR
var stderr bytes.Buffer
cmd.Stderr = &stderr
err = cmd.Start()
stdinR.Close()
defer stdinW.Close()
if err != nil {
t.Fatal(err)
}
// Make sure to not leave a zombie.
defer func() {
// These may fail, we don't care.
_, _ = stdinW.WriteString("hey\n")
_ = cmd.Wait()
}()
// Put the process into a cgroup.
m := newManager(t, config)
if err := m.Apply(cmd.Process.Pid); err != nil {
t.Fatal(err)
}
// Check that we put the "container" into the "pod" cgroup.
if !strings.HasPrefix(m.Path("devices"), pm.Path("devices")) {
t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q",
m.Path("devices"), pm.Path("devices"))
}
if err := m.Set(config.Resources); err != nil {
// failed to write "c 1:7 rwm": write /sys/fs/cgroup/devices/system.slice/system-runc_test_pods.slice/test-SkipDevices.scope/devices.allow: operation not permitted
if skipDevices == false && strings.HasSuffix(err.Error(), "/devices.allow: operation not permitted") {
// Cgroup v1 devices controller gives EPERM on trying
// to enable devices that are not enabled
// (skipDevices=false) in a parent cgroup.
// If this happens, test is passing.
return
}
t.Fatal(err)
}
// Check that we can access /dev/full but not /dev/zero.
if _, err := stdinW.WriteString("wow\n"); err != nil {
t.Fatal(err)
}
if err := cmd.Wait(); err != nil {
t.Fatal(err)
}
for _, exp := range expected {
if !strings.Contains(stderr.String(), exp) {
t.Errorf("expected %q, got: %s", exp, stderr.String())
}
}
}
func TestSkipDevicesTrue(t *testing.T) {
testSkipDevices(t, true, []string{
"echo: write error: No space left on device",
"cat: /dev/null: Operation not permitted",
})
}
func TestSkipDevicesFalse(t *testing.T) {
// If SkipDevices is not set for the parent slice, access to both
// devices should fail. This is done to assess the test correctness.
// For cgroup v1, we check for m.Set returning EPERM.
// For cgroup v2, we check for the errors below.
testSkipDevices(t, false, []string{
"/dev/full: Operation not permitted",
"cat: /dev/null: Operation not permitted",
})
}
func TestFreezeBeforeSet(t *testing.T) {
requireV1(t)

View File

@@ -12,6 +12,8 @@ import (
securejoin "github.com/cyphar/filepath-securejoin"
"golang.org/x/sys/unix"
//nolint:revive // Enable cgroup manager to manage devices
_ "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/cgroups/manager"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/configs/validate"

View File

@@ -6,6 +6,8 @@ import (
"testing"
"github.com/opencontainers/runc/libcontainer"
//nolint:revive // Enable cgroup manager to manage devices
_ "github.com/opencontainers/runc/libcontainer/cgroups/devices"
_ "github.com/opencontainers/runc/libcontainer/nsenter"
"github.com/sirupsen/logrus"