mirror of
https://github.com/opencontainers/runc.git
synced 2025-10-16 04:20:52 +08:00

Without this, multiple runc containers can accidentally share the same cgroup(s) (and change each other's limits) when runc is invoked from the same directory (i.e., the same cwd across multiple runc executions). After these changes, each runc container runs in its own cgroup(s). Before, the only workaround was to invoke runc from a unique (e.g. temporary) cwd for each container. Common cgroup configuration (and hierarchical limits) can still be set by having multiple runc containers share the same cgroup parent, which is the cgroup of the process executing runc. Signed-off-by: Fabio Kung <fabio.kung@gmail.com>
412 lines
11 KiB
Go
412 lines
11 KiB
Go
// +build linux
|
|
|
|
package main
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"runtime"
|
|
"strings"
|
|
"syscall"
|
|
|
|
"github.com/Sirupsen/logrus"
|
|
"github.com/codegangsta/cli"
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
|
"github.com/opencontainers/runc/libcontainer/devices"
|
|
"github.com/opencontainers/specs"
|
|
)
|
|
|
|
var specCommand = cli.Command{
|
|
Name: "spec",
|
|
Usage: "create a new specification file",
|
|
Action: func(context *cli.Context) {
|
|
spec := specs.LinuxSpec{
|
|
Spec: specs.Spec{
|
|
Version: specs.Version,
|
|
Platform: specs.Platform{
|
|
OS: runtime.GOOS,
|
|
Arch: runtime.GOARCH,
|
|
},
|
|
Root: specs.Root{
|
|
Path: "rootfs",
|
|
Readonly: true,
|
|
},
|
|
Process: specs.Process{
|
|
Terminal: true,
|
|
User: specs.User{},
|
|
Args: []string{
|
|
"sh",
|
|
},
|
|
Env: []string{
|
|
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
|
|
"TERM=xterm",
|
|
},
|
|
},
|
|
Hostname: "shell",
|
|
Mounts: []specs.Mount{
|
|
{
|
|
Type: "proc",
|
|
Source: "proc",
|
|
Destination: "/proc",
|
|
Options: "",
|
|
},
|
|
{
|
|
Type: "tmpfs",
|
|
Source: "tmpfs",
|
|
Destination: "/dev",
|
|
Options: "nosuid,strictatime,mode=755,size=65536k",
|
|
},
|
|
{
|
|
Type: "devpts",
|
|
Source: "devpts",
|
|
Destination: "/dev/pts",
|
|
Options: "nosuid,noexec,newinstance,ptmxmode=0666,mode=0620,gid=5",
|
|
},
|
|
{
|
|
Type: "tmpfs",
|
|
Source: "shm",
|
|
Destination: "/dev/shm",
|
|
Options: "nosuid,noexec,nodev,mode=1777,size=65536k",
|
|
},
|
|
{
|
|
Type: "mqueue",
|
|
Source: "mqueue",
|
|
Destination: "/dev/mqueue",
|
|
Options: "nosuid,noexec,nodev",
|
|
},
|
|
{
|
|
Type: "sysfs",
|
|
Source: "sysfs",
|
|
Destination: "/sys",
|
|
Options: "nosuid,noexec,nodev",
|
|
},
|
|
{
|
|
Type: "cgroup",
|
|
Source: "cgroup",
|
|
Destination: "/sys/fs/cgroup",
|
|
Options: "nosuid,noexec,nodev,relatime,ro",
|
|
},
|
|
},
|
|
},
|
|
Linux: specs.Linux{
|
|
Namespaces: []specs.Namespace{
|
|
{
|
|
Type: "pid",
|
|
},
|
|
{
|
|
Type: "network",
|
|
},
|
|
{
|
|
Type: "ipc",
|
|
},
|
|
{
|
|
Type: "uts",
|
|
},
|
|
{
|
|
Type: "mount",
|
|
},
|
|
},
|
|
Capabilities: []string{
|
|
"AUDIT_WRITE",
|
|
"KILL",
|
|
"NET_BIND_SERVICE",
|
|
},
|
|
Devices: []string{
|
|
"null",
|
|
"random",
|
|
"full",
|
|
"tty",
|
|
"zero",
|
|
"urandom",
|
|
},
|
|
Resources: specs.Resources{
|
|
Memory: specs.Memory{
|
|
Swappiness: -1,
|
|
},
|
|
},
|
|
},
|
|
}
|
|
data, err := json.MarshalIndent(&spec, "", "\t")
|
|
if err != nil {
|
|
logrus.Fatal(err)
|
|
}
|
|
fmt.Printf("%s", data)
|
|
},
|
|
}
|
|
|
|
var namespaceMapping = map[string]configs.NamespaceType{
|
|
"pid": configs.NEWPID,
|
|
"network": configs.NEWNET,
|
|
"mount": configs.NEWNS,
|
|
"user": configs.NEWUSER,
|
|
"ipc": configs.NEWIPC,
|
|
"uts": configs.NEWUTS,
|
|
}
|
|
|
|
// loadSpec loads the specification from the provided path.
|
|
// If the path is empty then the default path will be "config.json"
|
|
func loadSpec(path string) (*specs.LinuxSpec, error) {
|
|
if path == "" {
|
|
path = "config.json"
|
|
}
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return nil, fmt.Errorf("JSON specification file for %s not found", path)
|
|
}
|
|
return nil, err
|
|
}
|
|
defer f.Close()
|
|
var s *specs.LinuxSpec
|
|
if err := json.NewDecoder(f).Decode(&s); err != nil {
|
|
return nil, err
|
|
}
|
|
return s, checkSpecVersion(s)
|
|
}
|
|
|
|
// checkSpecVersion makes sure that the spec version matches runc's while we are in the initial
|
|
// development period. It is better to hard fail than have missing fields or options in the spec.
|
|
func checkSpecVersion(s *specs.LinuxSpec) error {
|
|
if s.Version != specs.Version {
|
|
return fmt.Errorf("spec version is not compatible with implemented version %q: spec %q", specs.Version, s.Version)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func createLibcontainerConfig(cgroupName string, spec *specs.LinuxSpec) (*configs.Config, error) {
|
|
cwd, err := os.Getwd()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
rootfsPath := spec.Root.Path
|
|
if !filepath.IsAbs(rootfsPath) {
|
|
rootfsPath = filepath.Join(cwd, rootfsPath)
|
|
}
|
|
config := &configs.Config{
|
|
Rootfs: rootfsPath,
|
|
Capabilities: spec.Linux.Capabilities,
|
|
Readonlyfs: spec.Root.Readonly,
|
|
Hostname: spec.Hostname,
|
|
Privatefs: true,
|
|
}
|
|
for _, ns := range spec.Linux.Namespaces {
|
|
t, exists := namespaceMapping[ns.Type]
|
|
if !exists {
|
|
return nil, fmt.Errorf("namespace %q does not exist", ns)
|
|
}
|
|
config.Namespaces.Add(t, ns.Path)
|
|
}
|
|
if config.Namespaces.Contains(configs.NEWNET) {
|
|
config.Networks = []*configs.Network{
|
|
{
|
|
Type: "loopback",
|
|
},
|
|
}
|
|
}
|
|
for _, m := range spec.Mounts {
|
|
config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
|
|
}
|
|
if err := createDevices(spec, config); err != nil {
|
|
return nil, err
|
|
}
|
|
if err := setupUserNamespace(spec, config); err != nil {
|
|
return nil, err
|
|
}
|
|
c, err := createCgroupConfig(cgroupName, spec, config.Devices)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
config.Cgroups = c
|
|
if config.Readonlyfs {
|
|
setReadonly(config)
|
|
config.MaskPaths = []string{
|
|
"/proc/kcore",
|
|
}
|
|
config.ReadonlyPaths = []string{
|
|
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
|
|
}
|
|
}
|
|
config.Sysctl = spec.Linux.Sysctl
|
|
return config, nil
|
|
}
|
|
|
|
func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
|
|
flags, data := parseMountOptions(m.Options)
|
|
source := m.Source
|
|
if m.Type == "bind" {
|
|
if !filepath.IsAbs(source) {
|
|
source = filepath.Join(cwd, m.Source)
|
|
}
|
|
}
|
|
return &configs.Mount{
|
|
Device: m.Type,
|
|
Source: source,
|
|
Destination: m.Destination,
|
|
Data: data,
|
|
Flags: flags,
|
|
}
|
|
}
|
|
|
|
func createCgroupConfig(name string, spec *specs.LinuxSpec, devices []*configs.Device) (*configs.Cgroup, error) {
|
|
myCgroupPath, err := cgroups.GetThisCgroupDir("devices")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
c := &configs.Cgroup{
|
|
Name: name,
|
|
Parent: myCgroupPath,
|
|
AllowedDevices: append(devices, allowedDevices...),
|
|
}
|
|
r := spec.Linux.Resources
|
|
c.Memory = r.Memory.Limit
|
|
c.MemoryReservation = r.Memory.Reservation
|
|
c.MemorySwap = r.Memory.Swap
|
|
c.KernelMemory = r.Memory.Kernel
|
|
c.MemorySwappiness = r.Memory.Swappiness
|
|
c.CpuShares = r.CPU.Shares
|
|
c.CpuQuota = r.CPU.Quota
|
|
c.CpuPeriod = r.CPU.Period
|
|
c.CpuRtRuntime = r.CPU.RealtimeRuntime
|
|
c.CpuRtPeriod = r.CPU.RealtimePeriod
|
|
c.CpusetCpus = r.CPU.Cpus
|
|
c.CpusetMems = r.CPU.Mems
|
|
c.BlkioThrottleReadBpsDevice = r.BlockIO.ThrottleReadBpsDevice
|
|
c.BlkioThrottleWriteBpsDevice = r.BlockIO.ThrottleWriteBpsDevice
|
|
c.BlkioThrottleReadIOpsDevice = r.BlockIO.ThrottleReadIOpsDevice
|
|
c.BlkioThrottleWriteIOpsDevice = r.BlockIO.ThrottleWriteIOpsDevice
|
|
c.BlkioWeight = r.BlockIO.Weight
|
|
c.BlkioWeightDevice = r.BlockIO.WeightDevice
|
|
for _, l := range r.HugepageLimits {
|
|
c.HugetlbLimit = append(c.HugetlbLimit, &configs.HugepageLimit{
|
|
Pagesize: l.Pagesize,
|
|
Limit: l.Limit,
|
|
})
|
|
}
|
|
c.OomKillDisable = r.DisableOOMKiller
|
|
c.NetClsClassid = r.Network.ClassID
|
|
for _, m := range r.Network.Priorities {
|
|
c.NetPrioIfpriomap = append(c.NetPrioIfpriomap, &configs.IfPrioMap{
|
|
Interface: m.Name,
|
|
Priority: m.Priority,
|
|
})
|
|
}
|
|
return c, nil
|
|
}
|
|
|
|
func createDevices(spec *specs.LinuxSpec, config *configs.Config) error {
|
|
for _, name := range spec.Linux.Devices {
|
|
d, err := devices.DeviceFromPath(filepath.Join("/dev", name), "rwm")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
config.Devices = append(config.Devices, d)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func setReadonly(config *configs.Config) {
|
|
for _, m := range config.Mounts {
|
|
if m.Device == "sysfs" {
|
|
m.Flags |= syscall.MS_RDONLY
|
|
}
|
|
}
|
|
}
|
|
|
|
func setupUserNamespace(spec *specs.LinuxSpec, config *configs.Config) error {
|
|
if len(spec.Linux.UIDMappings) == 0 {
|
|
return nil
|
|
}
|
|
config.Namespaces.Add(configs.NEWUSER, "")
|
|
create := func(m specs.IDMapping) configs.IDMap {
|
|
return configs.IDMap{
|
|
HostID: int(m.HostID),
|
|
ContainerID: int(m.ContainerID),
|
|
Size: int(m.Size),
|
|
}
|
|
}
|
|
for _, m := range spec.Linux.UIDMappings {
|
|
config.UidMappings = append(config.UidMappings, create(m))
|
|
}
|
|
for _, m := range spec.Linux.GIDMappings {
|
|
config.GidMappings = append(config.GidMappings, create(m))
|
|
}
|
|
rootUID, err := config.HostUID()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rootGID, err := config.HostGID()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for _, node := range config.Devices {
|
|
node.Uid = uint32(rootUID)
|
|
node.Gid = uint32(rootGID)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// parseMountOptions parses a comma-separated mount option string and returns
// the mount flags together with any mount data that it contains.
//
// Options found in the flag table either set or clear bits in the returned
// flags; they are applied left to right, so a later option overrides an
// earlier one ("ro,rw" yields no MS_RDONLY). "defaults" is recognised and
// contributes nothing. Every other non-empty option is treated as a data
// value for the specific filesystem type and is returned, in its original
// order, joined with commas. Empty segments, such as those produced by an
// empty string or by stray commas, are ignored.
func parseMountOptions(options string) (int, string) {
	var (
		flag int
		data []string
	)
	// clear tells whether the option removes the flag bits instead of
	// setting them.
	flags := map[string]struct {
		clear bool
		flag  int
	}{
		"async":         {true, syscall.MS_SYNCHRONOUS},
		"atime":         {true, syscall.MS_NOATIME},
		"bind":          {false, syscall.MS_BIND},
		"defaults":      {false, 0},
		"dev":           {true, syscall.MS_NODEV},
		"diratime":      {true, syscall.MS_NODIRATIME},
		"dirsync":       {false, syscall.MS_DIRSYNC},
		"exec":          {true, syscall.MS_NOEXEC},
		"mand":          {false, syscall.MS_MANDLOCK},
		"noatime":       {false, syscall.MS_NOATIME},
		"nodev":         {false, syscall.MS_NODEV},
		"nodiratime":    {false, syscall.MS_NODIRATIME},
		"noexec":        {false, syscall.MS_NOEXEC},
		"nomand":        {true, syscall.MS_MANDLOCK},
		"norelatime":    {true, syscall.MS_RELATIME},
		"nostrictatime": {true, syscall.MS_STRICTATIME},
		"nosuid":        {false, syscall.MS_NOSUID},
		"private":       {false, syscall.MS_PRIVATE},
		"rbind":         {false, syscall.MS_BIND | syscall.MS_REC},
		"relatime":      {false, syscall.MS_RELATIME},
		"remount":       {false, syscall.MS_REMOUNT},
		"ro":            {false, syscall.MS_RDONLY},
		"rprivate":      {false, syscall.MS_PRIVATE | syscall.MS_REC},
		"rshared":       {false, syscall.MS_SHARED | syscall.MS_REC},
		"rslave":        {false, syscall.MS_SLAVE | syscall.MS_REC},
		"runbindable":   {false, syscall.MS_UNBINDABLE | syscall.MS_REC},
		"rw":            {true, syscall.MS_RDONLY},
		"shared":        {false, syscall.MS_SHARED},
		"slave":         {false, syscall.MS_SLAVE},
		"strictatime":   {false, syscall.MS_STRICTATIME},
		"suid":          {true, syscall.MS_NOSUID},
		"sync":          {false, syscall.MS_SYNCHRONOUS},
		"unbindable":    {false, syscall.MS_UNBINDABLE},
	}
	for _, o := range strings.Split(options, ",") {
		// strings.Split yields "" for an empty input and around stray
		// commas; such segments are neither flags nor data.
		if o == "" {
			continue
		}
		f, exists := flags[o]
		if !exists {
			// Not a generic mount flag: it is a data value for a specific
			// filesystem type.
			data = append(data, o)
			continue
		}
		// Known options are consumed here even when they carry no flag
		// bits ("defaults"), so they are never forwarded as mount data.
		if f.clear {
			flag &^= f.flag
		} else {
			flag |= f.flag
		}
	}
	return flag, strings.Join(data, ",")
}
|