mirror of
https://github.com/opencontainers/runc.git
synced 2025-10-05 07:27:03 +08:00

This enables the support for the rootless container mode. There are many restrictions on what rootless containers can do, so many different runC commands have been disabled: * runc checkpoint * runc events * runc pause * runc ps * runc restore * runc resume * runc update The following commands work: * runc create * runc delete * runc exec * runc kill * runc list * runc run * runc spec * runc state In addition, any specification options that imply joining cgroups have also been disabled. This is due to support for unprivileged subtree management not being available from Linux upstream. Signed-off-by: Aleksa Sarai <asarai@suse.de>
488 lines
13 KiB
Go
488 lines
13 KiB
Go
// +build linux
|
|
|
|
package libcontainer
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strconv"
|
|
"syscall"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
|
"github.com/opencontainers/runc/libcontainer/system"
|
|
"github.com/opencontainers/runc/libcontainer/utils"
|
|
)
|
|
|
|
type parentProcess interface {
|
|
// pid returns the pid for the running process.
|
|
pid() int
|
|
|
|
// start starts the process execution.
|
|
start() error
|
|
|
|
// send a SIGKILL to the process and wait for the exit.
|
|
terminate() error
|
|
|
|
// wait waits on the process returning the process state.
|
|
wait() (*os.ProcessState, error)
|
|
|
|
// startTime returns the process start time.
|
|
startTime() (string, error)
|
|
|
|
signal(os.Signal) error
|
|
|
|
externalDescriptors() []string
|
|
|
|
setExternalDescriptors(fds []string)
|
|
}
|
|
|
|
type setnsProcess struct {
|
|
cmd *exec.Cmd
|
|
parentPipe *os.File
|
|
childPipe *os.File
|
|
cgroupPaths map[string]string
|
|
config *initConfig
|
|
fds []string
|
|
process *Process
|
|
bootstrapData io.Reader
|
|
}
|
|
|
|
func (p *setnsProcess) startTime() (string, error) {
|
|
return system.GetProcessStartTime(p.pid())
|
|
}
|
|
|
|
func (p *setnsProcess) signal(sig os.Signal) error {
|
|
s, ok := sig.(syscall.Signal)
|
|
if !ok {
|
|
return errors.New("os: unsupported signal type")
|
|
}
|
|
return syscall.Kill(p.pid(), s)
|
|
}
|
|
|
|
func (p *setnsProcess) start() (err error) {
|
|
defer p.parentPipe.Close()
|
|
err = p.cmd.Start()
|
|
p.childPipe.Close()
|
|
if err != nil {
|
|
return newSystemErrorWithCause(err, "starting setns process")
|
|
}
|
|
if p.bootstrapData != nil {
|
|
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
|
|
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
|
|
}
|
|
}
|
|
if err = p.execSetns(); err != nil {
|
|
return newSystemErrorWithCause(err, "executing setns process")
|
|
}
|
|
// We can't join cgroups if we're in a rootless container.
|
|
if !p.config.Rootless && len(p.cgroupPaths) > 0 {
|
|
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
|
|
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
|
|
}
|
|
}
|
|
// set rlimits, this has to be done here because we lose permissions
|
|
// to raise the limits once we enter a user-namespace
|
|
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
|
|
return newSystemErrorWithCause(err, "setting rlimits for process")
|
|
}
|
|
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
|
|
return newSystemErrorWithCause(err, "writing config to pipe")
|
|
}
|
|
|
|
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
|
|
switch sync.Type {
|
|
case procReady:
|
|
// This shouldn't happen.
|
|
panic("unexpected procReady in setns")
|
|
case procHooks:
|
|
// This shouldn't happen.
|
|
panic("unexpected procHooks in setns")
|
|
default:
|
|
return newSystemError(fmt.Errorf("invalid JSON payload from child"))
|
|
}
|
|
})
|
|
|
|
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
|
|
return newSystemErrorWithCause(err, "calling shutdown on init pipe")
|
|
}
|
|
// Must be done after Shutdown so the child will exit and we can wait for it.
|
|
if ierr != nil {
|
|
p.wait()
|
|
return ierr
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// execSetns runs the process that executes C code to perform the setns calls
|
|
// because setns support requires the C process to fork off a child and perform the setns
|
|
// before the go runtime boots, we wait on the process to die and receive the child's pid
|
|
// over the provided pipe.
|
|
func (p *setnsProcess) execSetns() error {
|
|
status, err := p.cmd.Process.Wait()
|
|
if err != nil {
|
|
p.cmd.Wait()
|
|
return newSystemErrorWithCause(err, "waiting on setns process to finish")
|
|
}
|
|
if !status.Success() {
|
|
p.cmd.Wait()
|
|
return newSystemError(&exec.ExitError{ProcessState: status})
|
|
}
|
|
var pid *pid
|
|
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
|
|
p.cmd.Wait()
|
|
return newSystemErrorWithCause(err, "reading pid from init pipe")
|
|
}
|
|
process, err := os.FindProcess(pid.Pid)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
p.cmd.Process = process
|
|
p.process.ops = p
|
|
return nil
|
|
}
|
|
|
|
// terminate sends a SIGKILL to the forked process for the setns routine then waits to
|
|
// avoid the process becoming a zombie.
|
|
func (p *setnsProcess) terminate() error {
|
|
if p.cmd.Process == nil {
|
|
return nil
|
|
}
|
|
err := p.cmd.Process.Kill()
|
|
if _, werr := p.wait(); err == nil {
|
|
err = werr
|
|
}
|
|
return err
|
|
}
|
|
|
|
func (p *setnsProcess) wait() (*os.ProcessState, error) {
|
|
err := p.cmd.Wait()
|
|
|
|
// Return actual ProcessState even on Wait error
|
|
return p.cmd.ProcessState, err
|
|
}
|
|
|
|
func (p *setnsProcess) pid() int {
|
|
return p.cmd.Process.Pid
|
|
}
|
|
|
|
func (p *setnsProcess) externalDescriptors() []string {
|
|
return p.fds
|
|
}
|
|
|
|
func (p *setnsProcess) setExternalDescriptors(newFds []string) {
|
|
p.fds = newFds
|
|
}
|
|
|
|
type initProcess struct {
|
|
cmd *exec.Cmd
|
|
parentPipe *os.File
|
|
childPipe *os.File
|
|
config *initConfig
|
|
manager cgroups.Manager
|
|
container *linuxContainer
|
|
fds []string
|
|
process *Process
|
|
bootstrapData io.Reader
|
|
sharePidns bool
|
|
rootDir *os.File
|
|
}
|
|
|
|
func (p *initProcess) pid() int {
|
|
return p.cmd.Process.Pid
|
|
}
|
|
|
|
func (p *initProcess) externalDescriptors() []string {
|
|
return p.fds
|
|
}
|
|
|
|
// execSetns runs the process that executes C code to perform the setns calls
|
|
// because setns support requires the C process to fork off a child and perform the setns
|
|
// before the go runtime boots, we wait on the process to die and receive the child's pid
|
|
// over the provided pipe.
|
|
// This is called by initProcess.start function
|
|
func (p *initProcess) execSetns() error {
|
|
status, err := p.cmd.Process.Wait()
|
|
if err != nil {
|
|
p.cmd.Wait()
|
|
return err
|
|
}
|
|
if !status.Success() {
|
|
p.cmd.Wait()
|
|
return &exec.ExitError{ProcessState: status}
|
|
}
|
|
var pid *pid
|
|
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
|
|
p.cmd.Wait()
|
|
return err
|
|
}
|
|
process, err := os.FindProcess(pid.Pid)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
p.cmd.Process = process
|
|
p.process.ops = p
|
|
return nil
|
|
}
|
|
|
|
func (p *initProcess) start() error {
|
|
defer p.parentPipe.Close()
|
|
err := p.cmd.Start()
|
|
p.process.ops = p
|
|
p.childPipe.Close()
|
|
p.rootDir.Close()
|
|
if err != nil {
|
|
p.process.ops = nil
|
|
return newSystemErrorWithCause(err, "starting init process command")
|
|
}
|
|
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
|
|
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
|
|
}
|
|
if err := p.execSetns(); err != nil {
|
|
return newSystemErrorWithCause(err, "running exec setns process for init")
|
|
}
|
|
// Save the standard descriptor names before the container process
|
|
// can potentially move them (e.g., via dup2()). If we don't do this now,
|
|
// we won't know at checkpoint time which file descriptor to look up.
|
|
fds, err := getPipeFds(p.pid())
|
|
if err != nil {
|
|
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
|
|
}
|
|
p.setExternalDescriptors(fds)
|
|
if !p.container.config.Rootless {
|
|
// Do this before syncing with child so that no children can escape the
|
|
// cgroup. We can't do this if we're not running as root.
|
|
if err := p.manager.Apply(p.pid()); err != nil {
|
|
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
|
|
}
|
|
}
|
|
defer func() {
|
|
if err != nil && !p.container.config.Rootless {
|
|
// TODO: should not be the responsibility to call here
|
|
p.manager.Destroy()
|
|
}
|
|
}()
|
|
if err := p.createNetworkInterfaces(); err != nil {
|
|
return newSystemErrorWithCause(err, "creating network interfaces")
|
|
}
|
|
if err := p.sendConfig(); err != nil {
|
|
return newSystemErrorWithCause(err, "sending config to init process")
|
|
}
|
|
var (
|
|
sentRun bool
|
|
sentResume bool
|
|
)
|
|
|
|
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
|
|
switch sync.Type {
|
|
case procReady:
|
|
// We can't set cgroups if we're in a rootless container.
|
|
if !p.container.config.Rootless {
|
|
if err := p.manager.Set(p.config.Config); err != nil {
|
|
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
|
|
}
|
|
}
|
|
// set rlimits, this has to be done here because we lose permissions
|
|
// to raise the limits once we enter a user-namespace
|
|
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
|
|
return newSystemErrorWithCause(err, "setting rlimits for ready process")
|
|
}
|
|
// call prestart hooks
|
|
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
|
|
if p.config.Config.Hooks != nil {
|
|
s := configs.HookState{
|
|
Version: p.container.config.Version,
|
|
ID: p.container.id,
|
|
Pid: p.pid(),
|
|
Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"),
|
|
}
|
|
for i, hook := range p.config.Config.Hooks.Prestart {
|
|
if err := hook.Run(s); err != nil {
|
|
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// Sync with child.
|
|
if err := writeSync(p.parentPipe, procRun); err != nil {
|
|
return newSystemErrorWithCause(err, "writing syncT 'run'")
|
|
}
|
|
sentRun = true
|
|
case procHooks:
|
|
if p.config.Config.Hooks != nil {
|
|
s := configs.HookState{
|
|
Version: p.container.config.Version,
|
|
ID: p.container.id,
|
|
Pid: p.pid(),
|
|
Bundle: utils.SearchLabels(p.config.Config.Labels, "bundle"),
|
|
}
|
|
for i, hook := range p.config.Config.Hooks.Prestart {
|
|
if err := hook.Run(s); err != nil {
|
|
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
|
|
}
|
|
}
|
|
}
|
|
// Sync with child.
|
|
if err := writeSync(p.parentPipe, procResume); err != nil {
|
|
return newSystemErrorWithCause(err, "writing syncT 'resume'")
|
|
}
|
|
sentResume = true
|
|
default:
|
|
return newSystemError(fmt.Errorf("invalid JSON payload from child"))
|
|
}
|
|
|
|
return nil
|
|
})
|
|
|
|
if !sentRun {
|
|
return newSystemErrorWithCause(ierr, "container init")
|
|
}
|
|
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
|
|
return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
|
|
}
|
|
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
|
|
return newSystemErrorWithCause(err, "shutting down init pipe")
|
|
}
|
|
|
|
// Must be done after Shutdown so the child will exit and we can wait for it.
|
|
if ierr != nil {
|
|
p.wait()
|
|
return ierr
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (p *initProcess) wait() (*os.ProcessState, error) {
|
|
err := p.cmd.Wait()
|
|
if err != nil {
|
|
return p.cmd.ProcessState, err
|
|
}
|
|
// we should kill all processes in cgroup when init is died if we use host PID namespace
|
|
if p.sharePidns {
|
|
signalAllProcesses(p.manager, syscall.SIGKILL)
|
|
}
|
|
return p.cmd.ProcessState, nil
|
|
}
|
|
|
|
func (p *initProcess) terminate() error {
|
|
if p.cmd.Process == nil {
|
|
return nil
|
|
}
|
|
err := p.cmd.Process.Kill()
|
|
if _, werr := p.wait(); err == nil {
|
|
err = werr
|
|
}
|
|
return err
|
|
}
|
|
|
|
func (p *initProcess) startTime() (string, error) {
|
|
return system.GetProcessStartTime(p.pid())
|
|
}
|
|
|
|
func (p *initProcess) sendConfig() error {
|
|
// send the config to the container's init process, we don't use JSON Encode
|
|
// here because there might be a problem in JSON decoder in some cases, see:
|
|
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
|
|
return utils.WriteJSON(p.parentPipe, p.config)
|
|
}
|
|
|
|
func (p *initProcess) createNetworkInterfaces() error {
|
|
for _, config := range p.config.Config.Networks {
|
|
strategy, err := getStrategy(config.Type)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
n := &network{
|
|
Network: *config,
|
|
}
|
|
if err := strategy.create(n, p.pid()); err != nil {
|
|
return err
|
|
}
|
|
p.config.Networks = append(p.config.Networks, n)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (p *initProcess) signal(sig os.Signal) error {
|
|
s, ok := sig.(syscall.Signal)
|
|
if !ok {
|
|
return errors.New("os: unsupported signal type")
|
|
}
|
|
return syscall.Kill(p.pid(), s)
|
|
}
|
|
|
|
func (p *initProcess) setExternalDescriptors(newFds []string) {
|
|
p.fds = newFds
|
|
}
|
|
|
|
func getPipeFds(pid int) ([]string, error) {
|
|
fds := make([]string, 3)
|
|
|
|
dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
|
|
for i := 0; i < 3; i++ {
|
|
// XXX: This breaks if the path is not a valid symlink (which can
|
|
// happen in certain particularly unlucky mount namespace setups).
|
|
f := filepath.Join(dirPath, strconv.Itoa(i))
|
|
target, err := os.Readlink(f)
|
|
if err != nil {
|
|
// Ignore permission errors, for rootless containers and other
|
|
// non-dumpable processes. if we can't get the fd for a particular
|
|
// file, there's not much we can do.
|
|
if os.IsPermission(err) {
|
|
continue
|
|
}
|
|
return fds, err
|
|
}
|
|
fds[i] = target
|
|
}
|
|
return fds, nil
|
|
}
|
|
|
|
// InitializeIO creates pipes for use with the process's stdio and returns the
|
|
// opposite side for each. Do not use this if you want to have a pseudoterminal
|
|
// set up for you by libcontainer (TODO: fix that too).
|
|
// TODO: This is mostly unnecessary, and should be handled by clients.
|
|
func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
|
|
var fds []uintptr
|
|
i = &IO{}
|
|
// cleanup in case of an error
|
|
defer func() {
|
|
if err != nil {
|
|
for _, fd := range fds {
|
|
syscall.Close(int(fd))
|
|
}
|
|
}
|
|
}()
|
|
// STDIN
|
|
r, w, err := os.Pipe()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
fds = append(fds, r.Fd(), w.Fd())
|
|
p.Stdin, i.Stdin = r, w
|
|
// STDOUT
|
|
if r, w, err = os.Pipe(); err != nil {
|
|
return nil, err
|
|
}
|
|
fds = append(fds, r.Fd(), w.Fd())
|
|
p.Stdout, i.Stdout = w, r
|
|
// STDERR
|
|
if r, w, err = os.Pipe(); err != nil {
|
|
return nil, err
|
|
}
|
|
fds = append(fds, r.Fd(), w.Fd())
|
|
p.Stderr, i.Stderr = w, r
|
|
// change ownership of the pipes incase we are in a user namespace
|
|
for _, fd := range fds {
|
|
if err := syscall.Fchown(int(fd), rootuid, rootgid); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
return i, nil
|
|
}
|