mirror of
https://github.com/opencontainers/runc.git
synced 2025-12-24 11:50:58 +08:00
411 lines
12 KiB
Go
411 lines
12 KiB
Go
package validate
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
|
"github.com/opencontainers/runc/libcontainer/intelrdt"
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
selinux "github.com/opencontainers/selinux/go-selinux"
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
type check func(config *configs.Config) error
|
|
|
|
func Validate(config *configs.Config) error {
|
|
checks := []check{
|
|
cgroupsCheck,
|
|
rootfs,
|
|
network,
|
|
uts,
|
|
security,
|
|
namespaces,
|
|
sysctl,
|
|
intelrdtCheck,
|
|
rootlessEUIDCheck,
|
|
mountsStrict,
|
|
scheduler,
|
|
ioPriority,
|
|
}
|
|
for _, c := range checks {
|
|
if err := c(config); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
// Relaxed validation rules for backward compatibility
|
|
warns := []check{
|
|
mountsWarn,
|
|
}
|
|
for _, c := range warns {
|
|
if err := c(config); err != nil {
|
|
logrus.WithError(err).Warn("configuration")
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// rootfs validates if the rootfs is an absolute path and is not a symlink
|
|
// to the container's root filesystem.
|
|
func rootfs(config *configs.Config) error {
|
|
if _, err := os.Stat(config.Rootfs); err != nil {
|
|
return fmt.Errorf("invalid rootfs: %w", err)
|
|
}
|
|
cleaned, err := filepath.Abs(config.Rootfs)
|
|
if err != nil {
|
|
return fmt.Errorf("invalid rootfs: %w", err)
|
|
}
|
|
if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
|
|
return fmt.Errorf("invalid rootfs: %w", err)
|
|
}
|
|
if filepath.Clean(config.Rootfs) != cleaned {
|
|
return errors.New("invalid rootfs: not an absolute path, or a symlink")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func network(config *configs.Config) error {
|
|
if !config.Namespaces.Contains(configs.NEWNET) {
|
|
if len(config.Networks) > 0 || len(config.Routes) > 0 {
|
|
return errors.New("unable to apply network settings without a private NET namespace")
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func uts(config *configs.Config) error {
|
|
if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) {
|
|
return errors.New("unable to set hostname without a private UTS namespace")
|
|
}
|
|
if config.Domainname != "" && !config.Namespaces.Contains(configs.NEWUTS) {
|
|
return errors.New("unable to set domainname without a private UTS namespace")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func security(config *configs.Config) error {
|
|
// restrict sys without mount namespace
|
|
if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) &&
|
|
!config.Namespaces.Contains(configs.NEWNS) {
|
|
return errors.New("unable to restrict sys entries without a private MNT namespace")
|
|
}
|
|
if config.ProcessLabel != "" && !selinux.GetEnabled() {
|
|
return errors.New("selinux label is specified in config, but selinux is disabled or not supported")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func namespaces(config *configs.Config) error {
|
|
if config.Namespaces.Contains(configs.NEWUSER) {
|
|
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
|
|
return errors.New("user namespaces aren't enabled in the kernel")
|
|
}
|
|
hasPath := config.Namespaces.PathOf(configs.NEWUSER) != ""
|
|
hasMappings := config.UIDMappings != nil || config.GIDMappings != nil
|
|
if !hasPath && !hasMappings {
|
|
return errors.New("user namespaces enabled, but no namespace path to join nor mappings to apply specified")
|
|
}
|
|
// The hasPath && hasMappings validation case is handled in specconv --
|
|
// we cache the mappings in Config during specconv in the hasPath case,
|
|
// so we cannot do that validation here.
|
|
} else {
|
|
if config.UIDMappings != nil || config.GIDMappings != nil {
|
|
return errors.New("user namespace mappings specified, but user namespace isn't enabled in the config")
|
|
}
|
|
}
|
|
|
|
if config.Namespaces.Contains(configs.NEWCGROUP) {
|
|
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
|
|
return errors.New("cgroup namespaces aren't enabled in the kernel")
|
|
}
|
|
}
|
|
|
|
if config.Namespaces.Contains(configs.NEWTIME) {
|
|
if _, err := os.Stat("/proc/self/timens_offsets"); os.IsNotExist(err) {
|
|
return errors.New("time namespaces aren't enabled in the kernel")
|
|
}
|
|
hasPath := config.Namespaces.PathOf(configs.NEWTIME) != ""
|
|
hasOffsets := config.TimeOffsets != nil
|
|
if hasPath && hasOffsets {
|
|
return errors.New("time namespace enabled, but both namespace path and time offsets specified -- you may only provide one")
|
|
}
|
|
} else {
|
|
if config.TimeOffsets != nil {
|
|
return errors.New("time namespace offsets specified, but time namespace isn't enabled in the config")
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// convertSysctlVariableToDotsSeparator can return sysctl variables in dots separator format.
|
|
// The '/' separator is also accepted in place of a '.'.
|
|
// Convert the sysctl variables to dots separator format for validation.
|
|
// More info: sysctl(8), sysctl.d(5).
|
|
//
|
|
// For example:
|
|
// Input sysctl variable "net/ipv4/conf/eno2.100.rp_filter"
|
|
// will return the converted value "net.ipv4.conf.eno2/100.rp_filter"
|
|
func convertSysctlVariableToDotsSeparator(val string) string {
|
|
if val == "" {
|
|
return val
|
|
}
|
|
firstSepIndex := strings.IndexAny(val, "./")
|
|
if firstSepIndex == -1 || val[firstSepIndex] == '.' {
|
|
return val
|
|
}
|
|
|
|
f := func(r rune) rune {
|
|
switch r {
|
|
case '.':
|
|
return '/'
|
|
case '/':
|
|
return '.'
|
|
}
|
|
return r
|
|
}
|
|
return strings.Map(f, val)
|
|
}
|
|
|
|
// sysctl validates that the specified sysctl keys are valid or not.
|
|
// /proc/sys isn't completely namespaced and depending on which namespaces
|
|
// are specified, a subset of sysctls are permitted.
|
|
func sysctl(config *configs.Config) error {
|
|
validSysctlMap := map[string]bool{
|
|
"kernel.msgmax": true,
|
|
"kernel.msgmnb": true,
|
|
"kernel.msgmni": true,
|
|
"kernel.sem": true,
|
|
"kernel.shmall": true,
|
|
"kernel.shmmax": true,
|
|
"kernel.shmmni": true,
|
|
"kernel.shm_rmid_forced": true,
|
|
}
|
|
|
|
var (
|
|
netOnce sync.Once
|
|
hostnet bool
|
|
hostnetErr error
|
|
)
|
|
|
|
for s := range config.Sysctl {
|
|
s := convertSysctlVariableToDotsSeparator(s)
|
|
if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") {
|
|
if config.Namespaces.Contains(configs.NEWIPC) {
|
|
continue
|
|
} else {
|
|
return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s)
|
|
}
|
|
}
|
|
if strings.HasPrefix(s, "net.") {
|
|
// Is container using host netns?
|
|
// Here "host" means "current", not "initial".
|
|
netOnce.Do(func() {
|
|
if !config.Namespaces.Contains(configs.NEWNET) {
|
|
hostnet = true
|
|
return
|
|
}
|
|
path := config.Namespaces.PathOf(configs.NEWNET)
|
|
if path == "" {
|
|
// own netns, so hostnet = false
|
|
return
|
|
}
|
|
hostnet, hostnetErr = isHostNetNS(path)
|
|
})
|
|
if hostnetErr != nil {
|
|
return fmt.Errorf("invalid netns path: %w", hostnetErr)
|
|
}
|
|
if hostnet {
|
|
return fmt.Errorf("sysctl %q not allowed in host network namespace", s)
|
|
}
|
|
continue
|
|
}
|
|
if config.Namespaces.Contains(configs.NEWUTS) {
|
|
switch s {
|
|
case "kernel.domainname":
|
|
// This is namespaced and there's no explicit OCI field for it.
|
|
continue
|
|
case "kernel.hostname":
|
|
// This is namespaced but there's a conflicting (dedicated) OCI field for it.
|
|
return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname")
|
|
}
|
|
}
|
|
return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func intelrdtCheck(config *configs.Config) error {
|
|
if config.IntelRdt != nil {
|
|
if config.IntelRdt.ClosID == "." || config.IntelRdt.ClosID == ".." || strings.Contains(config.IntelRdt.ClosID, "/") {
|
|
return fmt.Errorf("invalid intelRdt.ClosID %q", config.IntelRdt.ClosID)
|
|
}
|
|
|
|
if !intelrdt.IsCATEnabled() && config.IntelRdt.L3CacheSchema != "" {
|
|
return errors.New("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled")
|
|
}
|
|
if !intelrdt.IsMBAEnabled() && config.IntelRdt.MemBwSchema != "" {
|
|
return errors.New("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled")
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func cgroupsCheck(config *configs.Config) error {
|
|
c := config.Cgroups
|
|
if c == nil {
|
|
return nil
|
|
}
|
|
|
|
if (c.Name != "" || c.Parent != "") && c.Path != "" {
|
|
return fmt.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c)
|
|
}
|
|
|
|
r := c.Resources
|
|
if r == nil {
|
|
return nil
|
|
}
|
|
|
|
if !cgroups.IsCgroup2UnifiedMode() && r.Unified != nil {
|
|
return cgroups.ErrV1NoUnified
|
|
}
|
|
|
|
if cgroups.IsCgroup2UnifiedMode() {
|
|
_, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func checkBindOptions(m *configs.Mount) error {
|
|
if !m.IsBind() {
|
|
return nil
|
|
}
|
|
// We must reject bind-mounts that also have filesystem-specific mount
|
|
// options, because the kernel will completely ignore these flags and we
|
|
// cannot set them per-mountpoint.
|
|
//
|
|
// It should be noted that (due to how the kernel caches superblocks), data
|
|
// options could also silently ignored for other filesystems even when
|
|
// doing a fresh mount, but there is no real way to avoid this (and it
|
|
// matches how everything else works). There have been proposals to make it
|
|
// possible for userspace to detect this caching, but this wouldn't help
|
|
// runc because the behaviour wouldn't even be desirable for most users.
|
|
if m.Data != "" {
|
|
return errors.New("bind mounts cannot have any filesystem-specific options applied")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func checkIDMapMounts(config *configs.Config, m *configs.Mount) error {
|
|
// Make sure MOUNT_ATTR_IDMAP is not set on any of our mounts. This
|
|
// attribute is handled differently to all other attributes (through
|
|
// m.IDMapping), so make sure we never store it in the actual config. This
|
|
// really shouldn't ever happen.
|
|
if m.RecAttr != nil && (m.RecAttr.Attr_set|m.RecAttr.Attr_clr)&unix.MOUNT_ATTR_IDMAP != 0 {
|
|
return errors.New("mount configuration cannot contain recAttr for MOUNT_ATTR_IDMAP")
|
|
}
|
|
if !m.IsIDMapped() {
|
|
return nil
|
|
}
|
|
if !m.IsBind() {
|
|
return errors.New("id-mapped mounts are only supported for bind-mounts")
|
|
}
|
|
if config.RootlessEUID {
|
|
return errors.New("id-mapped mounts are not supported for rootless containers")
|
|
}
|
|
if m.IDMapping.UserNSPath == "" {
|
|
if len(m.IDMapping.UIDMappings) == 0 || len(m.IDMapping.GIDMappings) == 0 {
|
|
return errors.New("id-mapped mounts must have both uid and gid mappings specified")
|
|
}
|
|
} else {
|
|
if m.IDMapping.UIDMappings != nil || m.IDMapping.GIDMappings != nil {
|
|
// should never happen
|
|
return errors.New("[internal error] id-mapped mounts cannot have both userns_path and uid and gid mappings specified")
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func mountsWarn(config *configs.Config) error {
|
|
for _, m := range config.Mounts {
|
|
if !filepath.IsAbs(m.Destination) {
|
|
return fmt.Errorf("mount %+v: relative destination path is **deprecated**, using it as relative to /", m)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func mountsStrict(config *configs.Config) error {
|
|
for _, m := range config.Mounts {
|
|
if err := checkBindOptions(m); err != nil {
|
|
return fmt.Errorf("invalid mount %+v: %w", m, err)
|
|
}
|
|
if err := checkIDMapMounts(config, m); err != nil {
|
|
return fmt.Errorf("invalid mount %+v: %w", m, err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func isHostNetNS(path string) (bool, error) {
|
|
const currentProcessNetns = "/proc/self/ns/net"
|
|
|
|
var st1, st2 unix.Stat_t
|
|
|
|
if err := unix.Stat(currentProcessNetns, &st1); err != nil {
|
|
return false, &os.PathError{Op: "stat", Path: currentProcessNetns, Err: err}
|
|
}
|
|
if err := unix.Stat(path, &st2); err != nil {
|
|
return false, &os.PathError{Op: "stat", Path: path, Err: err}
|
|
}
|
|
|
|
return (st1.Dev == st2.Dev) && (st1.Ino == st2.Ino), nil
|
|
}
|
|
|
|
// scheduler is to validate scheduler configs according to https://man7.org/linux/man-pages/man2/sched_setattr.2.html
|
|
func scheduler(config *configs.Config) error {
|
|
s := config.Scheduler
|
|
if s == nil {
|
|
return nil
|
|
}
|
|
if s.Policy == "" {
|
|
return errors.New("scheduler policy is required")
|
|
}
|
|
if s.Policy == specs.SchedOther || s.Policy == specs.SchedBatch {
|
|
if s.Nice < -20 || s.Nice > 19 {
|
|
return fmt.Errorf("invalid scheduler.nice: %d when scheduler.policy is %s", s.Nice, string(s.Policy))
|
|
}
|
|
}
|
|
if s.Priority != 0 && (s.Policy != specs.SchedFIFO && s.Policy != specs.SchedRR) {
|
|
return errors.New("scheduler.priority can only be specified for SchedFIFO or SchedRR policy")
|
|
}
|
|
if s.Policy != specs.SchedDeadline && (s.Runtime != 0 || s.Deadline != 0 || s.Period != 0) {
|
|
return errors.New("scheduler runtime/deadline/period can only be specified for SchedDeadline policy")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func ioPriority(config *configs.Config) error {
|
|
if config.IOPriority == nil {
|
|
return nil
|
|
}
|
|
priority := config.IOPriority.Priority
|
|
if priority < 0 || priority > 7 {
|
|
return fmt.Errorf("invalid ioPriority.Priority: %d", priority)
|
|
}
|
|
return nil
|
|
}
|