Files
runc/utils_linux.go
Xiaochen Shen 27560ace2f libcontainer: intelrdt: add support for Intel RDT/MBA in runc
Memory Bandwidth Allocation (MBA) is a resource allocation sub-feature
of Intel Resource Director Technology (RDT) which is supported on some
Intel Xeon platforms. Intel RDT/MBA provides indirect and approximate
throttle over memory bandwidth for the software. A user controls the
resource by indicating the percentage of maximum memory bandwidth.

Hardware details of Intel RDT/MBA can be found in section 17.18 of
Intel Software Developer Manual:
https://software.intel.com/en-us/articles/intel-sdm

In Linux 4.12 kernel and newer, Intel RDT/MBA is enabled by kernel
config CONFIG_INTEL_RDT. If hardware support, CPU flags `rdt_a` and
`mba` will be set in /proc/cpuinfo.

Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
|   |-- L3
|   |   |-- cbm_mask
|   |   |-- min_cbm_bits
|   |   |-- num_closids
|   |-- MB
|       |-- bandwidth_gran
|       |-- delay_linear
|       |-- min_bandwidth
|       |-- num_closids
|-- ...
|-- schemata
|-- tasks
|-- <container_id>
    |-- ...
    |-- schemata
    |-- tasks

For MBA support for `runc`, we will reuse the infrastructure and code
base of Intel RDT/CAT which implemented in #1279. We could also make
use of `tasks` and `schemata` configuration for memory bandwidth
resource constraints.

The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the
task ID to the "tasks" file (which will automatically remove them from
the previous group to which they belonged). New tasks created by
fork(2) and clone(2) are added to the same group as their parent.

The file `schemata` has a list of all the resources available to this
group. Each resource (L3 cache, memory bandwidth) has its own line and
format.

Memory bandwidth schema:
It has allocation values for memory bandwidth on each socket, which
contains L3 cache id and memory bandwidth percentage.
    Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."

The minimum bandwidth percentage value for each CPU model is predefined
and can be looked up through "info/MB/min_bandwidth". The bandwidth
granularity that is allocated is also dependent on the CPU model and
can be looked up at "info/MB/bandwidth_gran". The available bandwidth
control steps are: min_bw + N * bw_gran. Intermediate values are
rounded to the next control step available on the hardware.

For more information about Intel RDT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt

An example for runc:
Consider a two-socket machine with two L3 caches where the minimum
memory bandwidth of 10% with a memory bandwidth granularity of 10%.
Tasks inside the container may use a maximum memory bandwidth of 20%
on socket 0 and 70% on socket 1.

"linux": {
    "intelRdt": {
        "memBwSchema": "MB:0=20;1=70"
    }
}

Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2018-10-16 14:29:29 +08:00

440 lines
12 KiB
Go

// +build linux
package main
import (
"errors"
"fmt"
"net"
"os"
"os/exec"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/specconv"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/coreos/go-systemd/activation"
"github.com/sirupsen/logrus"
"github.com/urfave/cli"
"golang.org/x/sys/unix"
)
var errEmptyID = errors.New("container id cannot be empty")
// loadFactory returns the configured factory instance for execing containers.
func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
root := context.GlobalString("root")
abs, err := filepath.Abs(root)
if err != nil {
return nil, err
}
// We default to cgroupfs, and can only use systemd if the system is a
// systemd box.
cgroupManager := libcontainer.Cgroupfs
rootlessCg, err := shouldUseRootlessCgroupManager(context)
if err != nil {
return nil, err
}
if rootlessCg {
cgroupManager = libcontainer.RootlessCgroupfs
}
if context.GlobalBool("systemd-cgroup") {
if systemd.UseSystemd() {
cgroupManager = libcontainer.SystemdCgroups
} else {
return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
}
}
intelRdtManager := libcontainer.IntelRdtFs
if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() {
intelRdtManager = nil
}
// We resolve the paths for {newuidmap,newgidmap} from the context of runc,
// to avoid doing a path lookup in the nsexec context. TODO: The binary
// names are not currently configurable.
newuidmap, err := exec.LookPath("newuidmap")
if err != nil {
newuidmap = ""
}
newgidmap, err := exec.LookPath("newgidmap")
if err != nil {
newgidmap = ""
}
return libcontainer.New(abs, cgroupManager, intelRdtManager,
libcontainer.CriuPath(context.GlobalString("criu")),
libcontainer.NewuidmapPath(newuidmap),
libcontainer.NewgidmapPath(newgidmap))
}
// getContainer returns the specified container instance by loading it from state
// with the default factory.
func getContainer(context *cli.Context) (libcontainer.Container, error) {
id := context.Args().First()
if id == "" {
return nil, errEmptyID
}
factory, err := loadFactory(context)
if err != nil {
return nil, err
}
return factory.Load(id)
}
func fatalf(t string, v ...interface{}) {
fatal(fmt.Errorf(t, v...))
}
func getDefaultImagePath(context *cli.Context) string {
cwd, err := os.Getwd()
if err != nil {
panic(err)
}
return filepath.Join(cwd, "checkpoint")
}
// newProcess returns a new libcontainer Process with the arguments from the
// spec and stdio from the current process.
func newProcess(p specs.Process, init bool) (*libcontainer.Process, error) {
lp := &libcontainer.Process{
Args: p.Args,
Env: p.Env,
// TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
User: fmt.Sprintf("%d:%d", p.User.UID, p.User.GID),
Cwd: p.Cwd,
Label: p.SelinuxLabel,
NoNewPrivileges: &p.NoNewPrivileges,
AppArmorProfile: p.ApparmorProfile,
Init: init,
}
if p.ConsoleSize != nil {
lp.ConsoleWidth = uint16(p.ConsoleSize.Width)
lp.ConsoleHeight = uint16(p.ConsoleSize.Height)
}
if p.Capabilities != nil {
lp.Capabilities = &configs.Capabilities{}
lp.Capabilities.Bounding = p.Capabilities.Bounding
lp.Capabilities.Effective = p.Capabilities.Effective
lp.Capabilities.Inheritable = p.Capabilities.Inheritable
lp.Capabilities.Permitted = p.Capabilities.Permitted
lp.Capabilities.Ambient = p.Capabilities.Ambient
}
for _, gid := range p.User.AdditionalGids {
lp.AdditionalGroups = append(lp.AdditionalGroups, strconv.FormatUint(uint64(gid), 10))
}
for _, rlimit := range p.Rlimits {
rl, err := createLibContainerRlimit(rlimit)
if err != nil {
return nil, err
}
lp.Rlimits = append(lp.Rlimits, rl)
}
return lp, nil
}
func destroy(container libcontainer.Container) {
if err := container.Destroy(); err != nil {
logrus.Error(err)
}
}
// setupIO modifies the given process config according to the options.
func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, detach bool, sockpath string) (*tty, error) {
if createTTY {
process.Stdin = nil
process.Stdout = nil
process.Stderr = nil
t := &tty{}
if !detach {
parent, child, err := utils.NewSockPair("console")
if err != nil {
return nil, err
}
process.ConsoleSocket = child
t.postStart = append(t.postStart, parent, child)
t.consoleC = make(chan error, 1)
go func() {
if err := t.recvtty(process, parent); err != nil {
t.consoleC <- err
}
t.consoleC <- nil
}()
} else {
// the caller of runc will handle receiving the console master
conn, err := net.Dial("unix", sockpath)
if err != nil {
return nil, err
}
uc, ok := conn.(*net.UnixConn)
if !ok {
return nil, fmt.Errorf("casting to UnixConn failed")
}
t.postStart = append(t.postStart, uc)
socket, err := uc.File()
if err != nil {
return nil, err
}
t.postStart = append(t.postStart, socket)
process.ConsoleSocket = socket
}
return t, nil
}
// when runc will detach the caller provides the stdio to runc via runc's 0,1,2
// and the container's process inherits runc's stdio.
if detach {
if err := inheritStdio(process); err != nil {
return nil, err
}
return &tty{}, nil
}
return setupProcessPipes(process, rootuid, rootgid)
}
// createPidFile creates a file with the processes pid inside it atomically
// it creates a temp file with the paths filename + '.' infront of it
// then renames the file
func createPidFile(path string, process *libcontainer.Process) error {
pid, err := process.Pid()
if err != nil {
return err
}
var (
tmpDir = filepath.Dir(path)
tmpName = filepath.Join(tmpDir, fmt.Sprintf(".%s", filepath.Base(path)))
)
f, err := os.OpenFile(tmpName, os.O_RDWR|os.O_CREATE|os.O_EXCL|os.O_SYNC, 0666)
if err != nil {
return err
}
_, err = fmt.Fprintf(f, "%d", pid)
f.Close()
if err != nil {
return err
}
return os.Rename(tmpName, path)
}
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
rootlessCg, err := shouldUseRootlessCgroupManager(context)
if err != nil {
return nil, err
}
config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
CgroupName: id,
UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
NoPivotRoot: context.Bool("no-pivot"),
NoNewKeyring: context.Bool("no-new-keyring"),
Spec: spec,
RootlessEUID: os.Geteuid() != 0,
RootlessCgroups: rootlessCg,
})
if err != nil {
return nil, err
}
factory, err := loadFactory(context)
if err != nil {
return nil, err
}
return factory.Create(id, config)
}
type runner struct {
init bool
enableSubreaper bool
shouldDestroy bool
detach bool
listenFDs []*os.File
preserveFDs int
pidFile string
consoleSocket string
container libcontainer.Container
action CtAct
notifySocket *notifySocket
criuOpts *libcontainer.CriuOpts
}
func (r *runner) run(config *specs.Process) (int, error) {
if err := r.checkTerminal(config); err != nil {
r.destroy()
return -1, err
}
process, err := newProcess(*config, r.init)
if err != nil {
r.destroy()
return -1, err
}
if len(r.listenFDs) > 0 {
process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
}
baseFd := 3 + len(process.ExtraFiles)
for i := baseFd; i < baseFd+r.preserveFDs; i++ {
process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
}
rootuid, err := r.container.Config().HostRootUID()
if err != nil {
r.destroy()
return -1, err
}
rootgid, err := r.container.Config().HostRootGID()
if err != nil {
r.destroy()
return -1, err
}
var (
detach = r.detach || (r.action == CT_ACT_CREATE)
)
// Setting up IO is a two stage process. We need to modify process to deal
// with detaching containers, and then we get a tty after the container has
// started.
handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
if err != nil {
r.destroy()
return -1, err
}
defer tty.Close()
switch r.action {
case CT_ACT_CREATE:
err = r.container.Start(process)
case CT_ACT_RESTORE:
err = r.container.Restore(process, r.criuOpts)
case CT_ACT_RUN:
err = r.container.Run(process)
default:
panic("Unknown action")
}
if err != nil {
r.destroy()
return -1, err
}
if err := tty.waitConsole(); err != nil {
r.terminate(process)
r.destroy()
return -1, err
}
if err = tty.ClosePostStart(); err != nil {
r.terminate(process)
r.destroy()
return -1, err
}
if r.pidFile != "" {
if err = createPidFile(r.pidFile, process); err != nil {
r.terminate(process)
r.destroy()
return -1, err
}
}
status, err := handler.forward(process, tty, detach)
if err != nil {
r.terminate(process)
}
if detach {
return 0, nil
}
r.destroy()
return status, err
}
func (r *runner) destroy() {
if r.shouldDestroy {
destroy(r.container)
}
}
func (r *runner) terminate(p *libcontainer.Process) {
_ = p.Signal(unix.SIGKILL)
_, _ = p.Wait()
}
func (r *runner) checkTerminal(config *specs.Process) error {
detach := r.detach || (r.action == CT_ACT_CREATE)
// Check command-line for sanity.
if detach && config.Terminal && r.consoleSocket == "" {
return fmt.Errorf("cannot allocate tty if runc will detach without setting console socket")
}
if (!detach || !config.Terminal) && r.consoleSocket != "" {
return fmt.Errorf("cannot use console socket if runc will not detach or allocate tty")
}
return nil
}
func validateProcessSpec(spec *specs.Process) error {
if spec.Cwd == "" {
return fmt.Errorf("Cwd property must not be empty")
}
if !filepath.IsAbs(spec.Cwd) {
return fmt.Errorf("Cwd must be an absolute path")
}
if len(spec.Args) == 0 {
return fmt.Errorf("args must not be empty")
}
return nil
}
type CtAct uint8
const (
CT_ACT_CREATE CtAct = iota + 1
CT_ACT_RUN
CT_ACT_RESTORE
)
func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
id := context.Args().First()
if id == "" {
return -1, errEmptyID
}
notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id)
if notifySocket != nil {
notifySocket.setupSpec(context, spec)
}
container, err := createContainer(context, id, spec)
if err != nil {
return -1, err
}
if notifySocket != nil {
err := notifySocket.setupSocket()
if err != nil {
return -1, err
}
}
// Support on-demand socket activation by passing file descriptors into the container init process.
listenFDs := []*os.File{}
if os.Getenv("LISTEN_FDS") != "" {
listenFDs = activation.Files(false)
}
r := &runner{
enableSubreaper: !context.Bool("no-subreaper"),
shouldDestroy: true,
container: container,
listenFDs: listenFDs,
notifySocket: notifySocket,
consoleSocket: context.String("console-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
preserveFDs: context.Int("preserve-fds"),
action: action,
criuOpts: criuOpts,
init: true,
}
return r.run(spec.Process)
}