Files
runc/libcontainer/process_linux.go
Aleksa Sarai d2f49696b0 runc: add support for rootless containers
This enables the support for the rootless container mode. There are many
restrictions on what rootless containers can do, so many different runC
commands have been disabled:

* runc checkpoint
* runc events
* runc pause
* runc ps
* runc restore
* runc resume
* runc update

The following commands work:

* runc create
* runc delete
* runc exec
* runc kill
* runc list
* runc run
* runc spec
* runc state

In addition, any specification options that imply joining cgroups have
also been disabled. This is due to support for unprivileged subtree
management not being available from Linux upstream.

Signed-off-by: Aleksa Sarai <asarai@suse.de>
2017-03-23 20:45:24 +11:00

488 lines
13 KiB
Go

// +build linux
package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"strconv"
"syscall"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
type parentProcess interface {
// pid returns the pid for the running process.
pid() int
// start starts the process execution.
start() error
// send a SIGKILL to the process and wait for the exit.
terminate() error
// wait waits on the process returning the process state.
wait() (*os.ProcessState, error)
// startTime returns the process start time.
startTime() (string, error)
signal(os.Signal) error
externalDescriptors() []string
setExternalDescriptors(fds []string)
}
type setnsProcess struct {
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
cgroupPaths map[string]string
config *initConfig
fds []string
process *Process
bootstrapData io.Reader
}
func (p *setnsProcess) startTime() (string, error) {
return system.GetProcessStartTime(p.pid())
}
func (p *setnsProcess) signal(sig os.Signal) error {
s, ok := sig.(syscall.Signal)
if !ok {
return errors.New("os: unsupported signal type")
}
return syscall.Kill(p.pid(), s)
}
func (p *setnsProcess) start() (err error) {
defer p.parentPipe.Close()
err = p.cmd.Start()
p.childPipe.Close()
if err != nil {
return newSystemErrorWithCause(err, "starting setns process")
}
if p.bootstrapData != nil {
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
}
if err = p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "executing setns process")
}
// We can't join cgroups if we're in a rootless container.
if !p.config.Rootless && len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
}
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for process")
}
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
return newSystemErrorWithCause(err, "writing config to pipe")
}
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
switch sync.Type {
case procReady:
// This shouldn't happen.
panic("unexpected procReady in setns")
case procHooks:
// This shouldn't happen.
panic("unexpected procHooks in setns")
default:
return newSystemError(fmt.Errorf("invalid JSON payload from child"))
}
})
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "calling shutdown on init pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return ierr
}
return nil
}
// execSetns runs the process that executes C code to perform the setns calls
// because setns support requires the C process to fork off a child and perform the setns
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
func (p *setnsProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
return newSystemErrorWithCause(err, "waiting on setns process to finish")
}
if !status.Success() {
p.cmd.Wait()
return newSystemError(&exec.ExitError{ProcessState: status})
}
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return newSystemErrorWithCause(err, "reading pid from init pipe")
}
process, err := os.FindProcess(pid.Pid)
if err != nil {
return err
}
p.cmd.Process = process
p.process.ops = p
return nil
}
// terminate sends a SIGKILL to the forked process for the setns routine then waits to
// avoid the process becoming a zombie.
func (p *setnsProcess) terminate() error {
if p.cmd.Process == nil {
return nil
}
err := p.cmd.Process.Kill()
if _, werr := p.wait(); err == nil {
err = werr
}
return err
}
func (p *setnsProcess) wait() (*os.ProcessState, error) {
err := p.cmd.Wait()
// Return actual ProcessState even on Wait error
return p.cmd.ProcessState, err
}
func (p *setnsProcess) pid() int {
return p.cmd.Process.Pid
}
func (p *setnsProcess) externalDescriptors() []string {
return p.fds
}
func (p *setnsProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
type initProcess struct {
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
config *initConfig
manager cgroups.Manager
container *linuxContainer
fds []string
process *Process
bootstrapData io.Reader
sharePidns bool
rootDir *os.File
}
func (p *initProcess) pid() int {
return p.cmd.Process.Pid
}
func (p *initProcess) externalDescriptors() []string {
return p.fds
}
// execSetns runs the process that executes C code to perform the setns calls
// because setns support requires the C process to fork off a child and perform the setns
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
// This is called by initProcess.start function
func (p *initProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
return err
}
if !status.Success() {
p.cmd.Wait()
return &exec.ExitError{ProcessState: status}
}
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return err
}
process, err := os.FindProcess(pid.Pid)
if err != nil {
return err
}
p.cmd.Process = process
p.process.ops = p
return nil
}
func (p *initProcess) start() error {
defer p.parentPipe.Close()
err := p.cmd.Start()
p.process.ops = p
p.childPipe.Close()
p.rootDir.Close()
if err != nil {
p.process.ops = nil
return newSystemErrorWithCause(err, "starting init process command")
}
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "running exec setns process for init")
}
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(p.pid())
if err != nil {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
}
p.setExternalDescriptors(fds)
if !p.container.config.Rootless {
// Do this before syncing with child so that no children can escape the
// cgroup. We can't do this if we're not running as root.
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
}
}
defer func() {
if err != nil && !p.container.config.Rootless {
// TODO: should not be the responsibility to call here
p.manager.Destroy()
}
}()
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating network interfaces")
}
if err := p.sendConfig(); err != nil {
return newSystemErrorWithCause(err, "sending config to init process")
}
var (
sentRun bool
sentResume bool
)
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
switch sync.Type {
case procReady:
// We can't set cgroups if we're in a rootless container.
if !p.container.config.Rootless {
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
}
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for ready process")
}
// call prestart hooks
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"),
}
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
}
// Sync with child.
if err := writeSync(p.parentPipe, procRun); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'run'")
}
sentRun = true
case procHooks:
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"),
}
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
// Sync with child.
if err := writeSync(p.parentPipe, procResume); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'resume'")
}
sentResume = true
default:
return newSystemError(fmt.Errorf("invalid JSON payload from child"))
}
return nil
})
if !sentRun {
return newSystemErrorWithCause(ierr, "container init")
}
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "shutting down init pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return ierr
}
return nil
}
func (p *initProcess) wait() (*os.ProcessState, error) {
err := p.cmd.Wait()
if err != nil {
return p.cmd.ProcessState, err
}
// we should kill all processes in cgroup when init is died if we use host PID namespace
if p.sharePidns {
signalAllProcesses(p.manager, syscall.SIGKILL)
}
return p.cmd.ProcessState, nil
}
func (p *initProcess) terminate() error {
if p.cmd.Process == nil {
return nil
}
err := p.cmd.Process.Kill()
if _, werr := p.wait(); err == nil {
err = werr
}
return err
}
func (p *initProcess) startTime() (string, error) {
return system.GetProcessStartTime(p.pid())
}
func (p *initProcess) sendConfig() error {
// send the config to the container's init process, we don't use JSON Encode
// here because there might be a problem in JSON decoder in some cases, see:
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
return utils.WriteJSON(p.parentPipe, p.config)
}
func (p *initProcess) createNetworkInterfaces() error {
for _, config := range p.config.Config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
n := &network{
Network: *config,
}
if err := strategy.create(n, p.pid()); err != nil {
return err
}
p.config.Networks = append(p.config.Networks, n)
}
return nil
}
func (p *initProcess) signal(sig os.Signal) error {
s, ok := sig.(syscall.Signal)
if !ok {
return errors.New("os: unsupported signal type")
}
return syscall.Kill(p.pid(), s)
}
func (p *initProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func getPipeFds(pid int) ([]string, error) {
fds := make([]string, 3)
dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
for i := 0; i < 3; i++ {
// XXX: This breaks if the path is not a valid symlink (which can
// happen in certain particularly unlucky mount namespace setups).
f := filepath.Join(dirPath, strconv.Itoa(i))
target, err := os.Readlink(f)
if err != nil {
// Ignore permission errors, for rootless containers and other
// non-dumpable processes. if we can't get the fd for a particular
// file, there's not much we can do.
if os.IsPermission(err) {
continue
}
return fds, err
}
fds[i] = target
}
return fds, nil
}
// InitializeIO creates pipes for use with the process's stdio and returns the
// opposite side for each. Do not use this if you want to have a pseudoterminal
// set up for you by libcontainer (TODO: fix that too).
// TODO: This is mostly unnecessary, and should be handled by clients.
func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
var fds []uintptr
i = &IO{}
// cleanup in case of an error
defer func() {
if err != nil {
for _, fd := range fds {
syscall.Close(int(fd))
}
}
}()
// STDIN
r, w, err := os.Pipe()
if err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdin, i.Stdin = r, w
// STDOUT
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdout, i.Stdout = w, r
// STDERR
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stderr, i.Stderr = w, r
// change ownership of the pipes incase we are in a user namespace
for _, fd := range fds {
if err := syscall.Fchown(int(fd), rootuid, rootgid); err != nil {
return nil, err
}
}
return i, nil
}