// Package specconv implements conversion of specifications to libcontainer // configurations package specconv import ( "errors" "fmt" "os" "path/filepath" "sort" "strings" "sync" "time" systemdDbus "github.com/coreos/go-systemd/v22/dbus" dbus "github.com/godbus/dbus/v5" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runc/libcontainer/internal/userns" "github.com/opencontainers/runc/libcontainer/seccomp" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) var ( initMapsOnce sync.Once namespaceMapping map[specs.LinuxNamespaceType]configs.NamespaceType mountPropagationMapping map[string]int recAttrFlags map[string]struct { clear bool flag uint64 } mountFlags, extensionFlags map[string]struct { clear bool flag int } complexFlags map[string]func(*configs.Mount) ) func initMaps() { initMapsOnce.Do(func() { namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{ specs.PIDNamespace: configs.NEWPID, specs.NetworkNamespace: configs.NEWNET, specs.MountNamespace: configs.NEWNS, specs.UserNamespace: configs.NEWUSER, specs.IPCNamespace: configs.NEWIPC, specs.UTSNamespace: configs.NEWUTS, specs.CgroupNamespace: configs.NEWCGROUP, specs.TimeNamespace: configs.NEWTIME, } mountPropagationMapping = map[string]int{ "rprivate": unix.MS_PRIVATE | unix.MS_REC, "private": unix.MS_PRIVATE, "rslave": unix.MS_SLAVE | unix.MS_REC, "slave": unix.MS_SLAVE, "rshared": unix.MS_SHARED | unix.MS_REC, "shared": unix.MS_SHARED, "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, "unbindable": unix.MS_UNBINDABLE, } mountFlags = map[string]struct { clear bool flag int }{ // "acl" cannot be mapped to MS_POSIXACL: https://github.com/opencontainers/runc/issues/3738 "async": {true, unix.MS_SYNCHRONOUS}, "atime": {true, unix.MS_NOATIME}, "bind": {false, unix.MS_BIND}, "defaults": {false, 0}, "dev": {true, unix.MS_NODEV}, "diratime": {true, unix.MS_NODIRATIME}, "dirsync": {false, unix.MS_DIRSYNC}, "exec": {true, unix.MS_NOEXEC}, "iversion": {false, unix.MS_I_VERSION}, "lazytime": {false, unix.MS_LAZYTIME}, "loud": {true, unix.MS_SILENT}, "mand": {false, unix.MS_MANDLOCK}, "noatime": {false, unix.MS_NOATIME}, "nodev": {false, unix.MS_NODEV}, "nodiratime": {false, unix.MS_NODIRATIME}, "noexec": {false, unix.MS_NOEXEC}, "noiversion": {true, unix.MS_I_VERSION}, "nolazytime": {true, unix.MS_LAZYTIME}, "nomand": {true, unix.MS_MANDLOCK}, "norelatime": {true, unix.MS_RELATIME}, "nostrictatime": {true, unix.MS_STRICTATIME}, "nosuid": {false, unix.MS_NOSUID}, "nosymfollow": {false, unix.MS_NOSYMFOLLOW}, // since kernel 5.10 "rbind": {false, unix.MS_BIND | unix.MS_REC}, "relatime": {false, unix.MS_RELATIME}, "remount": {false, unix.MS_REMOUNT}, "ro": {false, unix.MS_RDONLY}, "rw": {true, unix.MS_RDONLY}, "silent": {false, unix.MS_SILENT}, "strictatime": {false, unix.MS_STRICTATIME}, "suid": {true, unix.MS_NOSUID}, "sync": {false, unix.MS_SYNCHRONOUS}, "symfollow": {true, unix.MS_NOSYMFOLLOW}, // since kernel 5.10 } recAttrFlags = map[string]struct { clear bool flag uint64 }{ "rro": {false, unix.MOUNT_ATTR_RDONLY}, "rrw": {true, unix.MOUNT_ATTR_RDONLY}, "rnosuid": {false, unix.MOUNT_ATTR_NOSUID}, "rsuid": {true, unix.MOUNT_ATTR_NOSUID}, "rnodev": {false, unix.MOUNT_ATTR_NODEV}, "rdev": {true, unix.MOUNT_ATTR_NODEV}, "rnoexec": {false, unix.MOUNT_ATTR_NOEXEC}, "rexec": {true, unix.MOUNT_ATTR_NOEXEC}, "rnodiratime": {false, unix.MOUNT_ATTR_NODIRATIME}, "rdiratime": {true, unix.MOUNT_ATTR_NODIRATIME}, "rrelatime": {false, unix.MOUNT_ATTR_RELATIME}, "rnorelatime": {true, unix.MOUNT_ATTR_RELATIME}, "rnoatime": {false, unix.MOUNT_ATTR_NOATIME}, "ratime": {true, unix.MOUNT_ATTR_NOATIME}, "rstrictatime": {false, unix.MOUNT_ATTR_STRICTATIME}, "rnostrictatime": {true, unix.MOUNT_ATTR_STRICTATIME}, "rnosymfollow": {false, unix.MOUNT_ATTR_NOSYMFOLLOW}, // since kernel 5.14 "rsymfollow": {true, unix.MOUNT_ATTR_NOSYMFOLLOW}, // since kernel 5.14 } extensionFlags = map[string]struct { clear bool flag int }{ "tmpcopyup": {false, configs.EXT_COPYUP}, } complexFlags = map[string]func(*configs.Mount){ "idmap": func(m *configs.Mount) { m.IDMapping = new(configs.MountIDMapping) m.IDMapping.Recursive = false // noop }, "ridmap": func(m *configs.Mount) { m.IDMapping = new(configs.MountIDMapping) m.IDMapping.Recursive = true }, } }) } // KnownNamespaces returns the list of the known namespaces. // Used by `runc features`. func KnownNamespaces() []string { initMaps() var res []string for k := range namespaceMapping { res = append(res, string(k)) } sort.Strings(res) return res } // KnownMountOptions returns the list of the known mount options. // Used by `runc features`. func KnownMountOptions() []string { initMaps() var res []string for k := range mountFlags { res = append(res, k) } for k := range mountPropagationMapping { res = append(res, k) } for k := range recAttrFlags { res = append(res, k) } for k := range extensionFlags { res = append(res, k) } sort.Strings(res) return res } // AllowedDevices is the set of devices which are automatically included for // all containers. // // # XXX (cyphar) // // This behaviour is at the very least "questionable" (if not outright // wrong) according to the runtime-spec. // // Yes, we have to include certain devices other than the ones the user // specifies, but several devices listed here are not part of the spec // (including "mknod for any device"?!). In addition, these rules are // appended to the user-provided set which means that users *cannot disable // this behaviour*. // // ... unfortunately I'm too scared to change this now because who knows how // many people depend on this (incorrect and arguably insecure) behaviour. var AllowedDevices = []*devices.Device{ // allow mknod for any device { Rule: devices.Rule{ Type: devices.CharDevice, Major: devices.Wildcard, Minor: devices.Wildcard, Permissions: "m", Allow: true, }, }, { Rule: devices.Rule{ Type: devices.BlockDevice, Major: devices.Wildcard, Minor: devices.Wildcard, Permissions: "m", Allow: true, }, }, { Path: "/dev/null", FileMode: 0o666, Uid: 0, Gid: 0, Rule: devices.Rule{ Type: devices.CharDevice, Major: 1, Minor: 3, Permissions: "rwm", Allow: true, }, }, { Path: "/dev/random", FileMode: 0o666, Uid: 0, Gid: 0, Rule: devices.Rule{ Type: devices.CharDevice, Major: 1, Minor: 8, Permissions: "rwm", Allow: true, }, }, { Path: "/dev/full", FileMode: 0o666, Uid: 0, Gid: 0, Rule: devices.Rule{ Type: devices.CharDevice, Major: 1, Minor: 7, Permissions: "rwm", Allow: true, }, }, { Path: "/dev/tty", FileMode: 0o666, Uid: 0, Gid: 0, Rule: devices.Rule{ Type: devices.CharDevice, Major: 5, Minor: 0, Permissions: "rwm", Allow: true, }, }, { Path: "/dev/zero", FileMode: 0o666, Uid: 0, Gid: 0, Rule: devices.Rule{ Type: devices.CharDevice, Major: 1, Minor: 5, Permissions: "rwm", Allow: true, }, }, { Path: "/dev/urandom", FileMode: 0o666, Uid: 0, Gid: 0, Rule: devices.Rule{ Type: devices.CharDevice, Major: 1, Minor: 9, Permissions: "rwm", Allow: true, }, }, // /dev/pts/ - pts namespaces are "coming soon" { Rule: devices.Rule{ Type: devices.CharDevice, Major: 136, Minor: devices.Wildcard, Permissions: "rwm", Allow: true, }, }, { Rule: devices.Rule{ Type: devices.CharDevice, Major: 5, Minor: 2, Permissions: "rwm", Allow: true, }, }, } type CreateOpts struct { CgroupName string UseSystemdCgroup bool NoPivotRoot bool NoNewKeyring bool Spec *specs.Spec RootlessEUID bool RootlessCgroups bool } // getwd is a wrapper similar to os.Getwd, except it always gets // the value from the kernel, which guarantees the returned value // to be absolute and clean. func getwd() (wd string, err error) { for { wd, err = unix.Getwd() if err != unix.EINTR { break } } return wd, os.NewSyscallError("getwd", err) } // CreateLibcontainerConfig creates a new libcontainer configuration from a // given specification and a cgroup name func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { // runc's cwd will always be the bundle path cwd, err := getwd() if err != nil { return nil, err } spec := opts.Spec if spec.Root == nil { return nil, errors.New("root must be specified") } rootfsPath := spec.Root.Path if !filepath.IsAbs(rootfsPath) { rootfsPath = filepath.Join(cwd, rootfsPath) } labels := []string{} for k, v := range spec.Annotations { labels = append(labels, k+"="+v) } config := &configs.Config{ Rootfs: rootfsPath, NoPivotRoot: opts.NoPivotRoot, Readonlyfs: spec.Root.Readonly, Hostname: spec.Hostname, Domainname: spec.Domainname, Labels: append(labels, "bundle="+cwd), NoNewKeyring: opts.NoNewKeyring, RootlessEUID: opts.RootlessEUID, RootlessCgroups: opts.RootlessCgroups, } for _, m := range spec.Mounts { cm, err := createLibcontainerMount(cwd, m) if err != nil { return nil, fmt.Errorf("invalid mount %+v: %w", m, err) } config.Mounts = append(config.Mounts, cm) } defaultDevs, err := createDevices(spec, config) if err != nil { return nil, err } c, err := CreateCgroupConfig(opts, defaultDevs) if err != nil { return nil, err } config.Cgroups = c // set linux-specific config if spec.Linux != nil { initMaps() if spec.Linux.RootfsPropagation != "" { var exists bool if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists { return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation) } if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) { return nil, errors.New("rootfsPropagation of [r]private is not safe without pivot_root") } } for _, ns := range spec.Linux.Namespaces { t, exists := namespaceMapping[ns.Type] if !exists { return nil, fmt.Errorf("namespace %q does not exist", ns) } if config.Namespaces.Contains(t) { return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns) } config.Namespaces.Add(t, ns.Path) } if config.Namespaces.IsPrivate(configs.NEWNET) { config.Networks = []*configs.Network{ { Type: "loopback", }, } } if config.Namespaces.Contains(configs.NEWUSER) { if err := setupUserNamespace(spec, config); err != nil { return nil, err } // For idmap and ridmap mounts without explicit mappings, use the // ones from the container's userns. If we are joining another // userns, stash the path. for _, m := range config.Mounts { if m.IDMapping != nil && m.IDMapping.UIDMappings == nil && m.IDMapping.GIDMappings == nil { if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" { m.IDMapping.UserNSPath = path } else { m.IDMapping.UIDMappings = config.UIDMappings m.IDMapping.GIDMappings = config.GIDMappings } } } } config.MaskPaths = spec.Linux.MaskedPaths config.ReadonlyPaths = spec.Linux.ReadonlyPaths config.MountLabel = spec.Linux.MountLabel config.Sysctl = spec.Linux.Sysctl config.TimeOffsets = spec.Linux.TimeOffsets if spec.Linux.Seccomp != nil { seccomp, err := SetupSeccomp(spec.Linux.Seccomp) if err != nil { return nil, err } config.Seccomp = seccomp } if spec.Linux.IntelRdt != nil { config.IntelRdt = &configs.IntelRdt{ ClosID: spec.Linux.IntelRdt.ClosID, L3CacheSchema: spec.Linux.IntelRdt.L3CacheSchema, MemBwSchema: spec.Linux.IntelRdt.MemBwSchema, } } if spec.Linux.Personality != nil { if len(spec.Linux.Personality.Flags) > 0 { logrus.Warnf("ignoring unsupported personality flags: %+v because personality flag has not supported at this time", spec.Linux.Personality.Flags) } domain, err := getLinuxPersonalityFromStr(string(spec.Linux.Personality.Domain)) if err != nil { return nil, err } config.Personality = &configs.LinuxPersonality{ Domain: domain, } } } // Set the host UID that should own the container's cgroup. // This must be performed after setupUserNamespace, so that // config.HostRootUID() returns the correct result. // // Only set it if the container will have its own cgroup // namespace and the cgroupfs will be mounted read/write. // hasCgroupNS := config.Namespaces.IsPrivate(configs.NEWCGROUP) hasRwCgroupfs := false if hasCgroupNS { for _, m := range config.Mounts { if m.Source == "cgroup" && filepath.Clean(m.Destination) == "/sys/fs/cgroup" && (m.Flags&unix.MS_RDONLY) == 0 { hasRwCgroupfs = true break } } } processUid := 0 if spec.Process != nil { // Chown the cgroup to the UID running the process, // which is not necessarily UID 0 in the container // namespace (e.g., an unprivileged UID in the host // user namespace). processUid = int(spec.Process.User.UID) } if hasCgroupNS && hasRwCgroupfs { ownerUid, err := config.HostUID(processUid) // There are two error cases; we can ignore both. // // 1. uidMappings is unset. Either there is no user // namespace (fine), or it is an error (which is // checked elsewhere). // // 2. The user is unmapped in the user namespace. This is an // unusual configuration and might be an error. But it too // will be checked elsewhere, so we can ignore it here. // if err == nil { config.Cgroups.OwnerUID = &ownerUid } } if spec.Process != nil { config.OomScoreAdj = spec.Process.OOMScoreAdj config.NoNewPrivileges = spec.Process.NoNewPrivileges config.Umask = spec.Process.User.Umask config.ProcessLabel = spec.Process.SelinuxLabel if spec.Process.Capabilities != nil { config.Capabilities = &configs.Capabilities{ Bounding: spec.Process.Capabilities.Bounding, Effective: spec.Process.Capabilities.Effective, Permitted: spec.Process.Capabilities.Permitted, Inheritable: spec.Process.Capabilities.Inheritable, Ambient: spec.Process.Capabilities.Ambient, } } if spec.Process.Scheduler != nil { s := *spec.Process.Scheduler config.Scheduler = &s } if spec.Process.IOPriority != nil { ioPriority := *spec.Process.IOPriority config.IOPriority = &ioPriority } } createHooks(spec, config) config.Version = specs.Version return config, nil } func toConfigIDMap(specMaps []specs.LinuxIDMapping) []configs.IDMap { if specMaps == nil { return nil } idmaps := make([]configs.IDMap, len(specMaps)) for i, id := range specMaps { idmaps[i] = configs.IDMap{ ContainerID: int64(id.ContainerID), HostID: int64(id.HostID), Size: int64(id.Size), } } return idmaps } func createLibcontainerMount(cwd string, m specs.Mount) (*configs.Mount, error) { if !filepath.IsAbs(m.Destination) { // Relax validation for backward compatibility // TODO (runc v1.x.x): change warning to an error // return nil, fmt.Errorf("mount destination %s is not absolute", m.Destination) logrus.Warnf("mount destination %s is not absolute. Support for non-absolute mount destinations will be removed in a future release.", m.Destination) } mnt := parseMountOptions(m.Options) mnt.Destination = m.Destination mnt.Source = m.Source mnt.Device = m.Type if mnt.Flags&unix.MS_BIND != 0 { // Any "type" the user specified is meaningless (and ignored) for // bind-mounts -- so we set it to "bind" because rootfs_linux.go // (incorrectly) relies on this for some checks. mnt.Device = "bind" if !filepath.IsAbs(mnt.Source) { mnt.Source = filepath.Join(cwd, m.Source) } } if m.UIDMappings != nil || m.GIDMappings != nil { if mnt.IDMapping == nil { // Neither "idmap" nor "ridmap" were specified. mnt.IDMapping = new(configs.MountIDMapping) } mnt.IDMapping.UIDMappings = toConfigIDMap(m.UIDMappings) mnt.IDMapping.GIDMappings = toConfigIDMap(m.GIDMappings) } // None of the mount arguments can contain a null byte. Normally such // strings would either cause some other failure or would just be truncated // when we hit the null byte, but because we serialise these strings as // netlink messages (which don't have special null-byte handling) we need // to block this as early as possible. if strings.IndexByte(mnt.Source, 0) >= 0 || strings.IndexByte(mnt.Destination, 0) >= 0 || strings.IndexByte(mnt.Device, 0) >= 0 { return nil, errors.New("mount field contains null byte") } return mnt, nil } // checkPropertyName checks if systemd property name is valid. A valid name // should consist of latin letters only, and have least 3 of them. func checkPropertyName(s string) error { if len(s) < 3 { return errors.New("too short") } // Check ASCII characters rather than Unicode runes, // so we have to use indexes rather than range. for i := 0; i < len(s); i++ { ch := s[i] if (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') { continue } return errors.New("contains non-alphabetic character") } return nil } // getLinuxPersonalityFromStr converts the string domain received from spec to equivalent integer. func getLinuxPersonalityFromStr(domain string) (int, error) { if domain == string(specs.PerLinux32) { return configs.PerLinux32, nil } else if domain == string(specs.PerLinux) { return configs.PerLinux, nil } return -1, fmt.Errorf("invalid personality domain %s", domain) } // Some systemd properties are documented as having "Sec" suffix // (e.g. TimeoutStopSec) but are expected to have "USec" suffix // here, so let's provide conversion to improve compatibility. func convertSecToUSec(value dbus.Variant) (dbus.Variant, error) { var sec uint64 const M = 1000000 vi := value.Value() switch value.Signature().String() { case "y": sec = uint64(vi.(byte)) * M case "n": sec = uint64(vi.(int16)) * M case "q": sec = uint64(vi.(uint16)) * M case "i": sec = uint64(vi.(int32)) * M case "u": sec = uint64(vi.(uint32)) * M case "x": sec = uint64(vi.(int64)) * M case "t": sec = vi.(uint64) * M case "d": sec = uint64(vi.(float64) * M) default: return value, errors.New("not a number") } return dbus.MakeVariant(sec), nil } func initSystemdProps(spec *specs.Spec) ([]systemdDbus.Property, error) { const keyPrefix = "org.systemd.property." var sp []systemdDbus.Property for k, v := range spec.Annotations { name := strings.TrimPrefix(k, keyPrefix) if len(name) == len(k) { // prefix not there continue } if err := checkPropertyName(name); err != nil { return nil, fmt.Errorf("annotation %s name incorrect: %w", k, err) } value, err := dbus.ParseVariant(v, dbus.Signature{}) if err != nil { return nil, fmt.Errorf("annotation %s=%s value parse error: %w", k, v, err) } // Check for Sec suffix. if trimName := strings.TrimSuffix(name, "Sec"); len(trimName) < len(name) { // Check for a lowercase ascii a-z just before Sec. if ch := trimName[len(trimName)-1]; ch >= 'a' && ch <= 'z' { // Convert from Sec to USec. name = trimName + "USec" value, err = convertSecToUSec(value) if err != nil { return nil, fmt.Errorf("annotation %s=%s value parse error: %w", k, v, err) } } } sp = append(sp, systemdDbus.Property{Name: name, Value: value}) } return sp, nil } func CreateCgroupConfig(opts *CreateOpts, defaultDevs []*devices.Device) (*cgroups.Cgroup, error) { var ( myCgroupPath string spec = opts.Spec useSystemdCgroup = opts.UseSystemdCgroup name = opts.CgroupName ) c := &cgroups.Cgroup{ Systemd: useSystemdCgroup, Rootless: opts.RootlessCgroups, Resources: &cgroups.Resources{}, } if useSystemdCgroup { sp, err := initSystemdProps(spec) if err != nil { return nil, err } c.SystemdProps = sp } if spec.Linux != nil && spec.Linux.CgroupsPath != "" { if useSystemdCgroup { myCgroupPath = spec.Linux.CgroupsPath } else { myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath) } } if useSystemdCgroup { if myCgroupPath == "" { // Default for c.Parent is set by systemd cgroup drivers. c.ScopePrefix = "runc" c.Name = name } else { // Parse the path from expected "slice:prefix:name" // for e.g. "system.slice:docker:1234" parts := strings.Split(myCgroupPath, ":") if len(parts) != 3 { return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups, got %q instead", myCgroupPath) } c.Parent = parts[0] c.ScopePrefix = parts[1] c.Name = parts[2] } } else { if myCgroupPath == "" { c.Name = name } c.Path = myCgroupPath } // In rootless containers, any attempt to make cgroup changes is likely to fail. // libcontainer will validate this but ignores the error. if spec.Linux != nil { r := spec.Linux.Resources if r != nil { for i, d := range r.Devices { var ( t = "a" major = int64(-1) minor = int64(-1) ) if d.Type != "" { t = d.Type } if d.Major != nil { major = *d.Major } if d.Minor != nil { minor = *d.Minor } if d.Access == "" { return nil, fmt.Errorf("device access at %d field cannot be empty", i) } dt, err := stringToCgroupDeviceRune(t) if err != nil { return nil, err } c.Resources.Devices = append(c.Resources.Devices, &devices.Rule{ Type: dt, Major: major, Minor: minor, Permissions: devices.Permissions(d.Access), Allow: d.Allow, }) } if r.Memory != nil { if r.Memory.Limit != nil { c.Resources.Memory = *r.Memory.Limit } if r.Memory.Reservation != nil { c.Resources.MemoryReservation = *r.Memory.Reservation } if r.Memory.Swap != nil { c.Resources.MemorySwap = *r.Memory.Swap } if r.Memory.Kernel != nil || r.Memory.KernelTCP != nil { //nolint:staticcheck // Ignore SA1019. Need to keep deprecated package for compatibility. logrus.Warn("Kernel memory settings are ignored and will be removed") } if r.Memory.Swappiness != nil { c.Resources.MemorySwappiness = r.Memory.Swappiness } if r.Memory.DisableOOMKiller != nil { c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller } if r.Memory.CheckBeforeUpdate != nil { c.Resources.MemoryCheckBeforeUpdate = *r.Memory.CheckBeforeUpdate } } if r.CPU != nil { if r.CPU.Shares != nil { c.Resources.CpuShares = *r.CPU.Shares // CpuWeight is used for cgroupv2 and should be converted c.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(c.Resources.CpuShares) } if r.CPU.Quota != nil { c.Resources.CpuQuota = *r.CPU.Quota } if r.CPU.Burst != nil { c.Resources.CpuBurst = r.CPU.Burst } if r.CPU.Period != nil { c.Resources.CpuPeriod = *r.CPU.Period } if r.CPU.RealtimeRuntime != nil { c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime } if r.CPU.RealtimePeriod != nil { c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod } c.Resources.CpusetCpus = r.CPU.Cpus c.Resources.CpusetMems = r.CPU.Mems c.Resources.CPUIdle = r.CPU.Idle } if r.Pids != nil { c.Resources.PidsLimit = r.Pids.Limit } if r.BlockIO != nil { if r.BlockIO.Weight != nil { c.Resources.BlkioWeight = *r.BlockIO.Weight } if r.BlockIO.LeafWeight != nil { c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight } for _, wd := range r.BlockIO.WeightDevice { var weight, leafWeight uint16 if wd.Weight != nil { weight = *wd.Weight } if wd.LeafWeight != nil { leafWeight = *wd.LeafWeight } weightDevice := cgroups.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight) c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice) } for _, td := range r.BlockIO.ThrottleReadBpsDevice { rate := td.Rate throttleDevice := cgroups.NewThrottleDevice(td.Major, td.Minor, rate) c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice) } for _, td := range r.BlockIO.ThrottleWriteBpsDevice { rate := td.Rate throttleDevice := cgroups.NewThrottleDevice(td.Major, td.Minor, rate) c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice) } for _, td := range r.BlockIO.ThrottleReadIOPSDevice { rate := td.Rate throttleDevice := cgroups.NewThrottleDevice(td.Major, td.Minor, rate) c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice) } for _, td := range r.BlockIO.ThrottleWriteIOPSDevice { rate := td.Rate throttleDevice := cgroups.NewThrottleDevice(td.Major, td.Minor, rate) c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice) } } for _, l := range r.HugepageLimits { c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &cgroups.HugepageLimit{ Pagesize: l.Pagesize, Limit: l.Limit, }) } if len(r.Rdma) > 0 { c.Resources.Rdma = make(map[string]cgroups.LinuxRdma, len(r.Rdma)) for k, v := range r.Rdma { c.Resources.Rdma[k] = cgroups.LinuxRdma{ HcaHandles: v.HcaHandles, HcaObjects: v.HcaObjects, } } } if r.Network != nil { if r.Network.ClassID != nil { c.Resources.NetClsClassid = *r.Network.ClassID } for _, m := range r.Network.Priorities { c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &cgroups.IfPrioMap{ Interface: m.Name, Priority: int64(m.Priority), }) } } if len(r.Unified) > 0 { // copy the map c.Resources.Unified = make(map[string]string, len(r.Unified)) for k, v := range r.Unified { c.Resources.Unified[k] = v } } } } // Append the default allowed devices to the end of the list. for _, device := range defaultDevs { c.Resources.Devices = append(c.Resources.Devices, &device.Rule) } return c, nil } func stringToCgroupDeviceRune(s string) (devices.Type, error) { switch s { case "a": return devices.WildcardDevice, nil case "b": return devices.BlockDevice, nil case "c": return devices.CharDevice, nil default: return 0, fmt.Errorf("invalid cgroup device type %q", s) } } func stringToDeviceRune(s string) (devices.Type, error) { switch s { case "p": return devices.FifoDevice, nil case "u", "c": return devices.CharDevice, nil case "b": return devices.BlockDevice, nil default: return 0, fmt.Errorf("invalid device type %q", s) } } func createDevices(spec *specs.Spec, config *configs.Config) ([]*devices.Device, error) { // If a spec device is redundant with a default device, remove that default // device (the spec one takes priority). dedupedAllowDevs := []*devices.Device{} next: for _, ad := range AllowedDevices { if ad.Path != "" && spec.Linux != nil { for _, sd := range spec.Linux.Devices { if sd.Path == ad.Path { continue next } } } dedupedAllowDevs = append(dedupedAllowDevs, ad) if ad.Path != "" { config.Devices = append(config.Devices, ad) } } // Merge in additional devices from the spec. if spec.Linux != nil { for _, d := range spec.Linux.Devices { var uid, gid uint32 var filemode os.FileMode = 0o666 if d.UID != nil { uid = *d.UID } if d.GID != nil { gid = *d.GID } dt, err := stringToDeviceRune(d.Type) if err != nil { return nil, err } if d.FileMode != nil { filemode = *d.FileMode &^ unix.S_IFMT } device := &devices.Device{ Rule: devices.Rule{ Type: dt, Major: d.Major, Minor: d.Minor, }, Path: d.Path, FileMode: filemode, Uid: uid, Gid: gid, } config.Devices = append(config.Devices, device) } } return dedupedAllowDevs, nil } func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { if spec.Linux != nil { config.UIDMappings = toConfigIDMap(spec.Linux.UIDMappings) config.GIDMappings = toConfigIDMap(spec.Linux.GIDMappings) } if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" { // Cache the current userns mappings in our configuration, so that we // can calculate uid and gid mappings within runc. These mappings are // never used for configuring the container if the path is set. uidMap, gidMap, err := userns.GetUserNamespaceMappings(path) if err != nil { return fmt.Errorf("failed to cache mappings for userns: %w", err) } // We cannot allow uid or gid mappings to be set if we are also asked // to join a userns. if config.UIDMappings != nil || config.GIDMappings != nil { // FIXME: It turns out that containerd and CRIO pass both a userns // path and the mappings of the namespace in the same config.json. // Such a configuration is technically not valid, but we used to // require mappings be specified, and thus users worked around our // bug -- so we can't regress it at the moment. But we also don't // want to produce broken behaviour if the mapping doesn't match // the userns. So (for now) we output a warning if the actual // userns mappings match the configuration, otherwise we return an // error. if !userns.IsSameMapping(uidMap, config.UIDMappings) || !userns.IsSameMapping(gidMap, config.GIDMappings) { return errors.New("user namespaces enabled, but both namespace path and non-matching mapping specified -- you may only provide one") } logrus.Warnf("config.json has both a userns path to join and a matching userns mapping specified -- you may only provide one. Future versions of runc may return an error with this configuration, please report a bug on if you see this warning and cannot update your configuration.") } config.UIDMappings = uidMap config.GIDMappings = gidMap logrus.WithFields(logrus.Fields{ "uid_map": uidMap, "gid_map": gidMap, }).Debugf("config uses path-based userns configuration -- current uid and gid mappings cached") } rootUID, err := config.HostRootUID() if err != nil { return err } rootGID, err := config.HostRootGID() if err != nil { return err } for _, node := range config.Devices { node.Uid = uint32(rootUID) node.Gid = uint32(rootGID) } return nil } // parseMountOptions parses options and returns a configs.Mount // structure with fields that depends on options set accordingly. func parseMountOptions(options []string) *configs.Mount { var ( data []string m configs.Mount recAttrSet, recAttrClr uint64 ) initMaps() for _, o := range options { // If the option does not exist in the mountFlags table, // or the flag is not supported on the platform, // then it is a data value for a specific fs type. if f, exists := mountFlags[o]; exists && f.flag != 0 { // FIXME: The *atime flags are special (they are more of an enum // with quite hairy semantics) and thus arguably setting some of // them should clear unrelated flags. if f.clear { m.Flags &= ^f.flag m.ClearedFlags |= f.flag } else { m.Flags |= f.flag m.ClearedFlags &= ^f.flag } } else if f, exists := mountPropagationMapping[o]; exists && f != 0 { m.PropagationFlags = append(m.PropagationFlags, f) } else if f, exists := recAttrFlags[o]; exists { if f.clear { recAttrClr |= f.flag recAttrSet &= ^f.flag } else { recAttrSet |= f.flag recAttrClr &= ^f.flag if f.flag&unix.MOUNT_ATTR__ATIME == f.flag { // https://man7.org/linux/man-pages/man2/mount_setattr.2.html // "cannot simply specify the access-time setting in attr_set, but must also include MOUNT_ATTR__ATIME in the attr_clr field." recAttrClr |= unix.MOUNT_ATTR__ATIME } } } else if f, exists := extensionFlags[o]; exists { if f.clear { m.Extensions &= ^f.flag } else { m.Extensions |= f.flag } } else if fn, exists := complexFlags[o]; exists { fn(&m) } else { data = append(data, o) } } m.Data = strings.Join(data, ",") if recAttrSet != 0 || recAttrClr != 0 { m.RecAttr = &unix.MountAttr{ Attr_set: recAttrSet, Attr_clr: recAttrClr, } } return &m } func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { if config == nil { return nil, nil } // No default action specified, no syscalls listed, assume seccomp disabled if config.DefaultAction == "" && len(config.Syscalls) == 0 { return nil, nil } newConfig := new(configs.Seccomp) newConfig.Syscalls = []*configs.Syscall{} // The list of flags defined in runtime-spec is a subset of the flags // in the seccomp() syscall. if config.Flags == nil { // No flags are set explicitly (not even the empty set); // set the default of specs.LinuxSeccompFlagSpecAllow, // if it is supported by the libseccomp and the kernel. if err := seccomp.FlagSupported(specs.LinuxSeccompFlagSpecAllow); err == nil { newConfig.Flags = []specs.LinuxSeccompFlag{specs.LinuxSeccompFlagSpecAllow} } } else { // Fail early if some flags are unknown or unsupported. for _, flag := range config.Flags { if err := seccomp.FlagSupported(flag); err != nil { return nil, err } newConfig.Flags = append(newConfig.Flags, flag) } } if len(config.Architectures) > 0 { newConfig.Architectures = []string{} for _, arch := range config.Architectures { newArch, err := seccomp.ConvertStringToArch(string(arch)) if err != nil { return nil, err } newConfig.Architectures = append(newConfig.Architectures, newArch) } } // Convert default action from string representation newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction)) if err != nil { return nil, err } newConfig.DefaultAction = newDefaultAction newConfig.DefaultErrnoRet = config.DefaultErrnoRet newConfig.ListenerPath = config.ListenerPath newConfig.ListenerMetadata = config.ListenerMetadata // Loop through all syscall blocks and convert them to libcontainer format for _, call := range config.Syscalls { newAction, err := seccomp.ConvertStringToAction(string(call.Action)) if err != nil { return nil, err } for _, name := range call.Names { newCall := configs.Syscall{ Name: name, Action: newAction, ErrnoRet: call.ErrnoRet, Args: []*configs.Arg{}, } // Loop through all the arguments of the syscall and convert them for _, arg := range call.Args { newOp, err := seccomp.ConvertStringToOperator(string(arg.Op)) if err != nil { return nil, err } newArg := configs.Arg{ Index: arg.Index, Value: arg.Value, ValueTwo: arg.ValueTwo, Op: newOp, } newCall.Args = append(newCall.Args, &newArg) } newConfig.Syscalls = append(newConfig.Syscalls, &newCall) } } return newConfig, nil } func createHooks(rspec *specs.Spec, config *configs.Config) { config.Hooks = configs.Hooks{} if rspec.Hooks != nil { for _, h := range rspec.Hooks.Prestart { //nolint:staticcheck // Ignore SA1019. Need to keep deprecated package for compatibility. cmd := createCommandHook(h) config.Hooks[configs.Prestart] = append(config.Hooks[configs.Prestart], configs.NewCommandHook(cmd)) } for _, h := range rspec.Hooks.CreateRuntime { cmd := createCommandHook(h) config.Hooks[configs.CreateRuntime] = append(config.Hooks[configs.CreateRuntime], configs.NewCommandHook(cmd)) } for _, h := range rspec.Hooks.CreateContainer { cmd := createCommandHook(h) config.Hooks[configs.CreateContainer] = append(config.Hooks[configs.CreateContainer], configs.NewCommandHook(cmd)) } for _, h := range rspec.Hooks.StartContainer { cmd := createCommandHook(h) config.Hooks[configs.StartContainer] = append(config.Hooks[configs.StartContainer], configs.NewCommandHook(cmd)) } for _, h := range rspec.Hooks.Poststart { cmd := createCommandHook(h) config.Hooks[configs.Poststart] = append(config.Hooks[configs.Poststart], configs.NewCommandHook(cmd)) } for _, h := range rspec.Hooks.Poststop { cmd := createCommandHook(h) config.Hooks[configs.Poststop] = append(config.Hooks[configs.Poststop], configs.NewCommandHook(cmd)) } } } func createCommandHook(h specs.Hook) configs.Command { cmd := configs.Command{ Path: h.Path, Args: h.Args, Env: h.Env, } if h.Timeout != nil { d := time.Duration(*h.Timeout) * time.Second cmd.Timeout = &d } return cmd }