mirror of
https://github.com/opencontainers/runc.git
synced 2025-10-05 15:37:02 +08:00

This enables the support for the rootless container mode. There are many restrictions on what rootless containers can do, so many different runC commands have been disabled: * runc checkpoint * runc events * runc pause * runc ps * runc restore * runc resume * runc update The following commands work: * runc create * runc delete * runc exec * runc kill * runc list * runc run * runc spec * runc state In addition, any specification options that imply joining cgroups have also been disabled. This is due to support for unprivileged subtree management not being available from Linux upstream. Signed-off-by: Aleksa Sarai <asarai@suse.de>
1495 lines
40 KiB
Go
1495 lines
40 KiB
Go
// +build linux
|
|
|
|
package libcontainer
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"reflect"
|
|
"strings"
|
|
"sync"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/Sirupsen/logrus"
|
|
"github.com/golang/protobuf/proto"
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
|
"github.com/opencontainers/runc/libcontainer/criurpc"
|
|
"github.com/opencontainers/runc/libcontainer/system"
|
|
"github.com/opencontainers/runc/libcontainer/utils"
|
|
"github.com/syndtr/gocapability/capability"
|
|
"github.com/vishvananda/netlink/nl"
|
|
)
|
|
|
|
const stdioFdCount = 3
|
|
|
|
type linuxContainer struct {
|
|
id string
|
|
root string
|
|
config *configs.Config
|
|
cgroupManager cgroups.Manager
|
|
initArgs []string
|
|
initProcess parentProcess
|
|
initProcessStartTime string
|
|
criuPath string
|
|
m sync.Mutex
|
|
criuVersion int
|
|
state containerState
|
|
created time.Time
|
|
}
|
|
|
|
// State represents a running container's state
|
|
type State struct {
|
|
BaseState
|
|
|
|
// Platform specific fields below here
|
|
|
|
// Specifies if the container was started under the rootless mode.
|
|
Rootless bool `json:"rootless"`
|
|
|
|
// Path to all the cgroups setup for a container. Key is cgroup subsystem name
|
|
// with the value as the path.
|
|
CgroupPaths map[string]string `json:"cgroup_paths"`
|
|
|
|
// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
|
|
// with the value as the path.
|
|
NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`
|
|
|
|
// Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
|
|
ExternalDescriptors []string `json:"external_descriptors,omitempty"`
|
|
}
|
|
|
|
// Container is a libcontainer container object.
|
|
//
|
|
// Each container is thread-safe within the same process. Since a container can
|
|
// be destroyed by a separate process, any function may return that the container
|
|
// was not found.
|
|
type Container interface {
|
|
BaseContainer
|
|
|
|
// Methods below here are platform specific
|
|
|
|
// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
|
|
//
|
|
// errors:
|
|
// Systemerror - System error.
|
|
Checkpoint(criuOpts *CriuOpts) error
|
|
|
|
// Restore restores the checkpointed container to a running state using the criu(8) utility.
|
|
//
|
|
// errors:
|
|
// Systemerror - System error.
|
|
Restore(process *Process, criuOpts *CriuOpts) error
|
|
|
|
// If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
|
|
// the execution of any user processes. Asynchronously, when the container finished being paused the
|
|
// state is changed to PAUSED.
|
|
// If the Container state is PAUSED, do nothing.
|
|
//
|
|
// errors:
|
|
// ContainerNotExists - Container no longer exists,
|
|
// ContainerNotRunning - Container not running or created,
|
|
// Systemerror - System error.
|
|
Pause() error
|
|
|
|
// If the Container state is PAUSED, resumes the execution of any user processes in the
|
|
// Container before setting the Container state to RUNNING.
|
|
// If the Container state is RUNNING, do nothing.
|
|
//
|
|
// errors:
|
|
// ContainerNotExists - Container no longer exists,
|
|
// ContainerNotPaused - Container is not paused,
|
|
// Systemerror - System error.
|
|
Resume() error
|
|
|
|
// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
|
|
//
|
|
// errors:
|
|
// Systemerror - System error.
|
|
NotifyOOM() (<-chan struct{}, error)
|
|
|
|
// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
|
|
//
|
|
// errors:
|
|
// Systemerror - System error.
|
|
NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
|
|
}
|
|
|
|
// ID returns the container's unique ID
|
|
func (c *linuxContainer) ID() string {
|
|
return c.id
|
|
}
|
|
|
|
// Config returns the container's configuration
|
|
func (c *linuxContainer) Config() configs.Config {
|
|
return *c.config
|
|
}
|
|
|
|
func (c *linuxContainer) Status() (Status, error) {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
return c.currentStatus()
|
|
}
|
|
|
|
func (c *linuxContainer) State() (*State, error) {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
return c.currentState()
|
|
}
|
|
|
|
func (c *linuxContainer) Processes() ([]int, error) {
|
|
pids, err := c.cgroupManager.GetAllPids()
|
|
if err != nil {
|
|
return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
|
|
}
|
|
return pids, nil
|
|
}
|
|
|
|
func (c *linuxContainer) Stats() (*Stats, error) {
|
|
var (
|
|
err error
|
|
stats = &Stats{}
|
|
)
|
|
if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
|
|
return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
|
|
}
|
|
for _, iface := range c.config.Networks {
|
|
switch iface.Type {
|
|
case "veth":
|
|
istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
|
|
if err != nil {
|
|
return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
|
|
}
|
|
stats.Interfaces = append(stats.Interfaces, istats)
|
|
}
|
|
}
|
|
return stats, nil
|
|
}
|
|
|
|
func (c *linuxContainer) Set(config configs.Config) error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
status, err := c.currentStatus()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if status == Stopped {
|
|
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
|
|
}
|
|
c.config = &config
|
|
return c.cgroupManager.Set(c.config)
|
|
}
|
|
|
|
func (c *linuxContainer) Start(process *Process) error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
status, err := c.currentStatus()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if status == Stopped {
|
|
if err := c.createExecFifo(); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if err := c.start(process, status == Stopped); err != nil {
|
|
if status == Stopped {
|
|
c.deleteExecFifo()
|
|
}
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) Run(process *Process) error {
|
|
c.m.Lock()
|
|
status, err := c.currentStatus()
|
|
if err != nil {
|
|
c.m.Unlock()
|
|
return err
|
|
}
|
|
c.m.Unlock()
|
|
if err := c.Start(process); err != nil {
|
|
return err
|
|
}
|
|
if status == Stopped {
|
|
return c.exec()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) Exec() error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
return c.exec()
|
|
}
|
|
|
|
func (c *linuxContainer) exec() error {
|
|
path := filepath.Join(c.root, execFifoFilename)
|
|
f, err := os.OpenFile(path, os.O_RDONLY, 0)
|
|
if err != nil {
|
|
return newSystemErrorWithCause(err, "open exec fifo for reading")
|
|
}
|
|
defer f.Close()
|
|
data, err := ioutil.ReadAll(f)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if len(data) > 0 {
|
|
os.Remove(path)
|
|
return nil
|
|
}
|
|
return fmt.Errorf("cannot start an already running container")
|
|
}
|
|
|
|
func (c *linuxContainer) start(process *Process, isInit bool) error {
|
|
parent, err := c.newParentProcess(process, isInit)
|
|
if err != nil {
|
|
return newSystemErrorWithCause(err, "creating new parent process")
|
|
}
|
|
if err := parent.start(); err != nil {
|
|
// terminate the process to ensure that it properly is reaped.
|
|
if err := parent.terminate(); err != nil {
|
|
logrus.Warn(err)
|
|
}
|
|
return newSystemErrorWithCause(err, "starting container process")
|
|
}
|
|
// generate a timestamp indicating when the container was started
|
|
c.created = time.Now().UTC()
|
|
c.state = &runningState{
|
|
c: c,
|
|
}
|
|
if isInit {
|
|
c.state = &createdState{
|
|
c: c,
|
|
}
|
|
state, err := c.updateState(parent)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
c.initProcessStartTime = state.InitProcessStartTime
|
|
|
|
if c.config.Hooks != nil {
|
|
s := configs.HookState{
|
|
Version: c.config.Version,
|
|
ID: c.id,
|
|
Pid: parent.pid(),
|
|
Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
|
|
}
|
|
for i, hook := range c.config.Hooks.Poststart {
|
|
if err := hook.Run(s); err != nil {
|
|
if err := parent.terminate(); err != nil {
|
|
logrus.Warn(err)
|
|
}
|
|
return newSystemErrorWithCausef(err, "running poststart hook %d", i)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) Signal(s os.Signal, all bool) error {
|
|
if all {
|
|
return signalAllProcesses(c.cgroupManager, s)
|
|
}
|
|
if err := c.initProcess.signal(s); err != nil {
|
|
return newSystemErrorWithCause(err, "signaling init process")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) createExecFifo() error {
|
|
rootuid, err := c.Config().HostUID()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rootgid, err := c.Config().HostGID()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
fifoName := filepath.Join(c.root, execFifoFilename)
|
|
if _, err := os.Stat(fifoName); err == nil {
|
|
return fmt.Errorf("exec fifo %s already exists", fifoName)
|
|
}
|
|
oldMask := syscall.Umask(0000)
|
|
if err := syscall.Mkfifo(fifoName, 0622); err != nil {
|
|
syscall.Umask(oldMask)
|
|
return err
|
|
}
|
|
syscall.Umask(oldMask)
|
|
if err := os.Chown(fifoName, rootuid, rootgid); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) deleteExecFifo() {
|
|
fifoName := filepath.Join(c.root, execFifoFilename)
|
|
os.Remove(fifoName)
|
|
}
|
|
|
|
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
|
|
parentPipe, childPipe, err := utils.NewSockPair("init")
|
|
if err != nil {
|
|
return nil, newSystemErrorWithCause(err, "creating new init pipe")
|
|
}
|
|
cmd, err := c.commandTemplate(p, childPipe)
|
|
if err != nil {
|
|
return nil, newSystemErrorWithCause(err, "creating new command template")
|
|
}
|
|
if !doInit {
|
|
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
|
|
}
|
|
|
|
// We only set up rootDir if we're not doing a `runc exec`. The reason for
|
|
// this is to avoid cases where a racing, unprivileged process inside the
|
|
// container can get access to the statedir file descriptor (which would
|
|
// allow for container rootfs escape).
|
|
rootDir, err := os.Open(c.root)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
|
|
cmd.Env = append(cmd.Env,
|
|
fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
|
|
return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
|
|
}
|
|
|
|
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
|
|
cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
|
|
cmd.Stdin = p.Stdin
|
|
cmd.Stdout = p.Stdout
|
|
cmd.Stderr = p.Stderr
|
|
cmd.Dir = c.config.Rootfs
|
|
if cmd.SysProcAttr == nil {
|
|
cmd.SysProcAttr = &syscall.SysProcAttr{}
|
|
}
|
|
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
|
|
if p.ConsoleSocket != nil {
|
|
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
|
|
cmd.Env = append(cmd.Env,
|
|
fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
|
|
)
|
|
}
|
|
cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
|
|
cmd.Env = append(cmd.Env,
|
|
fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
|
|
)
|
|
// NOTE: when running a container with no PID namespace and the parent process spawning the container is
|
|
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
|
|
// even with the parent still running.
|
|
if c.config.ParentDeathSignal > 0 {
|
|
cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal)
|
|
}
|
|
return cmd, nil
|
|
}
|
|
|
|
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
|
|
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
|
|
nsMaps := make(map[configs.NamespaceType]string)
|
|
for _, ns := range c.config.Namespaces {
|
|
if ns.Path != "" {
|
|
nsMaps[ns.Type] = ns.Path
|
|
}
|
|
}
|
|
_, sharePidns := nsMaps[configs.NEWPID]
|
|
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return &initProcess{
|
|
cmd: cmd,
|
|
childPipe: childPipe,
|
|
parentPipe: parentPipe,
|
|
manager: c.cgroupManager,
|
|
config: c.newInitConfig(p),
|
|
container: c,
|
|
process: p,
|
|
bootstrapData: data,
|
|
sharePidns: sharePidns,
|
|
rootDir: rootDir,
|
|
}, nil
|
|
}
|
|
|
|
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
|
|
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
|
|
state, err := c.currentState()
|
|
if err != nil {
|
|
return nil, newSystemErrorWithCause(err, "getting container's current state")
|
|
}
|
|
// for setns process, we don't have to set cloneflags as the process namespaces
|
|
// will only be set via setns syscall
|
|
data, err := c.bootstrapData(0, state.NamespacePaths)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return &setnsProcess{
|
|
cmd: cmd,
|
|
cgroupPaths: c.cgroupManager.GetPaths(),
|
|
childPipe: childPipe,
|
|
parentPipe: parentPipe,
|
|
config: c.newInitConfig(p),
|
|
process: p,
|
|
bootstrapData: data,
|
|
}, nil
|
|
}
|
|
|
|
func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
|
|
cfg := &initConfig{
|
|
Config: c.config,
|
|
Args: process.Args,
|
|
Env: process.Env,
|
|
User: process.User,
|
|
AdditionalGroups: process.AdditionalGroups,
|
|
Cwd: process.Cwd,
|
|
Capabilities: process.Capabilities,
|
|
PassedFilesCount: len(process.ExtraFiles),
|
|
ContainerId: c.ID(),
|
|
NoNewPrivileges: c.config.NoNewPrivileges,
|
|
Rootless: c.config.Rootless,
|
|
AppArmorProfile: c.config.AppArmorProfile,
|
|
ProcessLabel: c.config.ProcessLabel,
|
|
Rlimits: c.config.Rlimits,
|
|
}
|
|
if process.NoNewPrivileges != nil {
|
|
cfg.NoNewPrivileges = *process.NoNewPrivileges
|
|
}
|
|
if process.AppArmorProfile != "" {
|
|
cfg.AppArmorProfile = process.AppArmorProfile
|
|
}
|
|
if process.Label != "" {
|
|
cfg.ProcessLabel = process.Label
|
|
}
|
|
if len(process.Rlimits) > 0 {
|
|
cfg.Rlimits = process.Rlimits
|
|
}
|
|
cfg.CreateConsole = process.ConsoleSocket != nil
|
|
return cfg
|
|
}
|
|
|
|
func (c *linuxContainer) Destroy() error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
return c.state.destroy()
|
|
}
|
|
|
|
func (c *linuxContainer) Pause() error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
status, err := c.currentStatus()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
switch status {
|
|
case Running, Created:
|
|
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
|
|
return err
|
|
}
|
|
return c.state.transition(&pausedState{
|
|
c: c,
|
|
})
|
|
}
|
|
return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning)
|
|
}
|
|
|
|
func (c *linuxContainer) Resume() error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
status, err := c.currentStatus()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if status != Paused {
|
|
return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
|
|
}
|
|
if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
|
|
return err
|
|
}
|
|
return c.state.transition(&runningState{
|
|
c: c,
|
|
})
|
|
}
|
|
|
|
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
|
|
return notifyOnOOM(c.cgroupManager.GetPaths())
|
|
}
|
|
|
|
func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
|
|
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
|
|
}
|
|
|
|
// checkCriuVersion checks Criu version greater than or equal to minVersion
|
|
func (c *linuxContainer) checkCriuVersion(minVersion string) error {
|
|
var x, y, z, versionReq int
|
|
|
|
_, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
|
|
if err != nil {
|
|
_, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6
|
|
}
|
|
versionReq = x*10000 + y*100 + z
|
|
|
|
out, err := exec.Command(c.criuPath, "-V").Output()
|
|
if err != nil {
|
|
return fmt.Errorf("Unable to execute CRIU command: %s", c.criuPath)
|
|
}
|
|
|
|
x = 0
|
|
y = 0
|
|
z = 0
|
|
if ep := strings.Index(string(out), "-"); ep >= 0 {
|
|
// criu Git version format
|
|
var version string
|
|
if sp := strings.Index(string(out), "GitID"); sp > 0 {
|
|
version = string(out)[sp:ep]
|
|
} else {
|
|
return fmt.Errorf("Unable to parse the CRIU version: %s", c.criuPath)
|
|
}
|
|
|
|
n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2
|
|
if err != nil {
|
|
n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6
|
|
y++
|
|
} else {
|
|
z++
|
|
}
|
|
if n < 2 || err != nil {
|
|
return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err)
|
|
}
|
|
} else {
|
|
// criu release version format
|
|
n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2
|
|
if err != nil {
|
|
n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6
|
|
}
|
|
if n < 2 || err != nil {
|
|
return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err)
|
|
}
|
|
}
|
|
|
|
c.criuVersion = x*10000 + y*100 + z
|
|
|
|
if c.criuVersion < versionReq {
|
|
return fmt.Errorf("CRIU version must be %s or higher", minVersion)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
const descriptorsFilename = "descriptors.json"
|
|
|
|
func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
|
|
mountDest := m.Destination
|
|
if strings.HasPrefix(mountDest, c.config.Rootfs) {
|
|
mountDest = mountDest[len(c.config.Rootfs):]
|
|
}
|
|
|
|
extMnt := &criurpc.ExtMountMap{
|
|
Key: proto.String(mountDest),
|
|
Val: proto.String(mountDest),
|
|
}
|
|
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
|
|
}
|
|
|
|
func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
|
|
for _, path := range c.config.MaskPaths {
|
|
fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
continue
|
|
}
|
|
return err
|
|
}
|
|
if fi.IsDir() {
|
|
continue
|
|
}
|
|
|
|
extMnt := &criurpc.ExtMountMap{
|
|
Key: proto.String(path),
|
|
Val: proto.String("/dev/null"),
|
|
}
|
|
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
|
|
// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
|
|
// support for doing unprivileged dumps, but the setup of
|
|
// rootless containers might make this complicated.
|
|
if c.config.Rootless {
|
|
return fmt.Errorf("cannot checkpoint a rootless container")
|
|
}
|
|
|
|
if err := c.checkCriuVersion("1.5.2"); err != nil {
|
|
return err
|
|
}
|
|
|
|
if criuOpts.ImagesDirectory == "" {
|
|
return fmt.Errorf("invalid directory to save checkpoint")
|
|
}
|
|
|
|
// Since a container can be C/R'ed multiple times,
|
|
// the checkpoint directory may already exist.
|
|
if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) {
|
|
return err
|
|
}
|
|
|
|
if criuOpts.WorkDirectory == "" {
|
|
criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
|
|
}
|
|
|
|
if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) {
|
|
return err
|
|
}
|
|
|
|
workDir, err := os.Open(criuOpts.WorkDirectory)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer workDir.Close()
|
|
|
|
imageDir, err := os.Open(criuOpts.ImagesDirectory)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer imageDir.Close()
|
|
|
|
rpcOpts := criurpc.CriuOpts{
|
|
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
|
|
WorkDirFd: proto.Int32(int32(workDir.Fd())),
|
|
LogLevel: proto.Int32(4),
|
|
LogFile: proto.String("dump.log"),
|
|
Root: proto.String(c.config.Rootfs),
|
|
ManageCgroups: proto.Bool(true),
|
|
NotifyScripts: proto.Bool(true),
|
|
Pid: proto.Int32(int32(c.initProcess.pid())),
|
|
ShellJob: proto.Bool(criuOpts.ShellJob),
|
|
LeaveRunning: proto.Bool(criuOpts.LeaveRunning),
|
|
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
|
|
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
|
|
FileLocks: proto.Bool(criuOpts.FileLocks),
|
|
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
|
|
}
|
|
|
|
// append optional criu opts, e.g., page-server and port
|
|
if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
|
|
rpcOpts.Ps = &criurpc.CriuPageServerInfo{
|
|
Address: proto.String(criuOpts.PageServer.Address),
|
|
Port: proto.Int32(criuOpts.PageServer.Port),
|
|
}
|
|
}
|
|
|
|
//pre-dump may need parentImage param to complete iterative migration
|
|
if criuOpts.ParentImage != "" {
|
|
rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
|
|
rpcOpts.TrackMem = proto.Bool(true)
|
|
}
|
|
|
|
// append optional manage cgroups mode
|
|
if criuOpts.ManageCgroupsMode != 0 {
|
|
if err := c.checkCriuVersion("1.7"); err != nil {
|
|
return err
|
|
}
|
|
mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
|
|
rpcOpts.ManageCgroupsMode = &mode
|
|
}
|
|
|
|
var t criurpc.CriuReqType
|
|
if criuOpts.PreDump {
|
|
t = criurpc.CriuReqType_PRE_DUMP
|
|
} else {
|
|
t = criurpc.CriuReqType_DUMP
|
|
}
|
|
req := &criurpc.CriuReq{
|
|
Type: &t,
|
|
Opts: &rpcOpts,
|
|
}
|
|
|
|
//no need to dump these information in pre-dump
|
|
if !criuOpts.PreDump {
|
|
for _, m := range c.config.Mounts {
|
|
switch m.Device {
|
|
case "bind":
|
|
c.addCriuDumpMount(req, m)
|
|
break
|
|
case "cgroup":
|
|
binds, err := getCgroupMounts(m)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for _, b := range binds {
|
|
c.addCriuDumpMount(req, b)
|
|
}
|
|
break
|
|
}
|
|
}
|
|
|
|
if err := c.addMaskPaths(req); err != nil {
|
|
return err
|
|
}
|
|
|
|
for _, node := range c.config.Devices {
|
|
m := &configs.Mount{Destination: node.Path, Source: node.Path}
|
|
c.addCriuDumpMount(req, m)
|
|
}
|
|
|
|
// Write the FD info to a file in the image directory
|
|
fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
err = c.criuSwrk(nil, req, criuOpts, false)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
|
|
mountDest := m.Destination
|
|
if strings.HasPrefix(mountDest, c.config.Rootfs) {
|
|
mountDest = mountDest[len(c.config.Rootfs):]
|
|
}
|
|
|
|
extMnt := &criurpc.ExtMountMap{
|
|
Key: proto.String(mountDest),
|
|
Val: proto.String(m.Source),
|
|
}
|
|
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
|
|
}
|
|
|
|
func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
|
|
for _, iface := range c.config.Networks {
|
|
switch iface.Type {
|
|
case "veth":
|
|
veth := new(criurpc.CriuVethPair)
|
|
veth.IfOut = proto.String(iface.HostInterfaceName)
|
|
veth.IfIn = proto.String(iface.Name)
|
|
req.Opts.Veths = append(req.Opts.Veths, veth)
|
|
break
|
|
case "loopback":
|
|
break
|
|
}
|
|
}
|
|
for _, i := range criuOpts.VethPairs {
|
|
veth := new(criurpc.CriuVethPair)
|
|
veth.IfOut = proto.String(i.HostInterfaceName)
|
|
veth.IfIn = proto.String(i.ContainerInterfaceName)
|
|
req.Opts.Veths = append(req.Opts.Veths, veth)
|
|
}
|
|
}
|
|
|
|
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
|
|
// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
|
|
// support for unprivileged restore at the moment.
|
|
if c.config.Rootless {
|
|
return fmt.Errorf("cannot restore a rootless container")
|
|
}
|
|
|
|
if err := c.checkCriuVersion("1.5.2"); err != nil {
|
|
return err
|
|
}
|
|
if criuOpts.WorkDirectory == "" {
|
|
criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
|
|
}
|
|
// Since a container can be C/R'ed multiple times,
|
|
// the work directory may already exist.
|
|
if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
|
|
return err
|
|
}
|
|
workDir, err := os.Open(criuOpts.WorkDirectory)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer workDir.Close()
|
|
if criuOpts.ImagesDirectory == "" {
|
|
return fmt.Errorf("invalid directory to restore checkpoint")
|
|
}
|
|
imageDir, err := os.Open(criuOpts.ImagesDirectory)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer imageDir.Close()
|
|
// CRIU has a few requirements for a root directory:
|
|
// * it must be a mount point
|
|
// * its parent must not be overmounted
|
|
// c.config.Rootfs is bind-mounted to a temporary directory
|
|
// to satisfy these requirements.
|
|
root := filepath.Join(c.root, "criu-root")
|
|
if err := os.Mkdir(root, 0755); err != nil {
|
|
return err
|
|
}
|
|
defer os.Remove(root)
|
|
root, err = filepath.EvalSymlinks(root)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer syscall.Unmount(root, syscall.MNT_DETACH)
|
|
t := criurpc.CriuReqType_RESTORE
|
|
req := &criurpc.CriuReq{
|
|
Type: &t,
|
|
Opts: &criurpc.CriuOpts{
|
|
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
|
|
WorkDirFd: proto.Int32(int32(workDir.Fd())),
|
|
EvasiveDevices: proto.Bool(true),
|
|
LogLevel: proto.Int32(4),
|
|
LogFile: proto.String("restore.log"),
|
|
RstSibling: proto.Bool(true),
|
|
Root: proto.String(root),
|
|
ManageCgroups: proto.Bool(true),
|
|
NotifyScripts: proto.Bool(true),
|
|
ShellJob: proto.Bool(criuOpts.ShellJob),
|
|
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
|
|
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
|
|
FileLocks: proto.Bool(criuOpts.FileLocks),
|
|
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
|
|
},
|
|
}
|
|
|
|
for _, m := range c.config.Mounts {
|
|
switch m.Device {
|
|
case "bind":
|
|
c.addCriuRestoreMount(req, m)
|
|
break
|
|
case "cgroup":
|
|
binds, err := getCgroupMounts(m)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for _, b := range binds {
|
|
c.addCriuRestoreMount(req, b)
|
|
}
|
|
break
|
|
}
|
|
}
|
|
|
|
if len(c.config.MaskPaths) > 0 {
|
|
m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
|
|
c.addCriuRestoreMount(req, m)
|
|
}
|
|
|
|
for _, node := range c.config.Devices {
|
|
m := &configs.Mount{Destination: node.Path, Source: node.Path}
|
|
c.addCriuRestoreMount(req, m)
|
|
}
|
|
|
|
if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
|
|
c.restoreNetwork(req, criuOpts)
|
|
}
|
|
|
|
// append optional manage cgroups mode
|
|
if criuOpts.ManageCgroupsMode != 0 {
|
|
if err := c.checkCriuVersion("1.7"); err != nil {
|
|
return err
|
|
}
|
|
mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
|
|
req.Opts.ManageCgroupsMode = &mode
|
|
}
|
|
|
|
var (
|
|
fds []string
|
|
fdJSON []byte
|
|
)
|
|
if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := json.Unmarshal(fdJSON, &fds); err != nil {
|
|
return err
|
|
}
|
|
for i := range fds {
|
|
if s := fds[i]; strings.Contains(s, "pipe:") {
|
|
inheritFd := new(criurpc.InheritFd)
|
|
inheritFd.Key = proto.String(s)
|
|
inheritFd.Fd = proto.Int32(int32(i))
|
|
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
|
|
}
|
|
}
|
|
return c.criuSwrk(process, req, criuOpts, true)
|
|
}
|
|
|
|
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
|
|
// XXX: Do we need to deal with this case? AFAIK criu still requires root.
|
|
if err := c.cgroupManager.Apply(pid); err != nil {
|
|
return err
|
|
}
|
|
|
|
path := fmt.Sprintf("/proc/%d/cgroup", pid)
|
|
cgroupsPaths, err := cgroups.ParseCgroupFile(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
for c, p := range cgroupsPaths {
|
|
cgroupRoot := &criurpc.CgroupRoot{
|
|
Ctrl: proto.String(c),
|
|
Path: proto.String(p),
|
|
}
|
|
req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error {
|
|
fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
|
|
criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
|
|
criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
|
|
defer criuClient.Close()
|
|
defer criuServer.Close()
|
|
|
|
args := []string{"swrk", "3"}
|
|
logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
|
|
logrus.Debugf("Using CRIU with following args: %s", args)
|
|
cmd := exec.Command(c.criuPath, args...)
|
|
if process != nil {
|
|
cmd.Stdin = process.Stdin
|
|
cmd.Stdout = process.Stdout
|
|
cmd.Stderr = process.Stderr
|
|
}
|
|
cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
|
|
|
|
if err := cmd.Start(); err != nil {
|
|
return err
|
|
}
|
|
criuServer.Close()
|
|
|
|
defer func() {
|
|
criuClient.Close()
|
|
_, err := cmd.Process.Wait()
|
|
if err != nil {
|
|
return
|
|
}
|
|
}()
|
|
|
|
if applyCgroups {
|
|
err := c.criuApplyCgroups(cmd.Process.Pid, req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
var extFds []string
|
|
if process != nil {
|
|
extFds, err = getPipeFds(cmd.Process.Pid)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
|
|
val := reflect.ValueOf(req.GetOpts())
|
|
v := reflect.Indirect(val)
|
|
for i := 0; i < v.NumField(); i++ {
|
|
st := v.Type()
|
|
name := st.Field(i).Name
|
|
if strings.HasPrefix(name, "XXX_") {
|
|
continue
|
|
}
|
|
value := val.MethodByName("Get" + name).Call([]reflect.Value{})
|
|
logrus.Debugf("CRIU option %s with value %v", name, value[0])
|
|
}
|
|
data, err := proto.Marshal(req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
_, err = criuClient.Write(data)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
buf := make([]byte, 10*4096)
|
|
for true {
|
|
n, err := criuClient.Read(buf)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if n == 0 {
|
|
return fmt.Errorf("unexpected EOF")
|
|
}
|
|
if n == len(buf) {
|
|
return fmt.Errorf("buffer is too small")
|
|
}
|
|
|
|
resp := new(criurpc.CriuResp)
|
|
err = proto.Unmarshal(buf[:n], resp)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !resp.GetSuccess() {
|
|
typeString := req.GetType().String()
|
|
return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
|
|
}
|
|
|
|
t := resp.GetType()
|
|
switch {
|
|
case t == criurpc.CriuReqType_NOTIFY:
|
|
if err := c.criuNotifications(resp, process, opts, extFds); err != nil {
|
|
return err
|
|
}
|
|
t = criurpc.CriuReqType_NOTIFY
|
|
req = &criurpc.CriuReq{
|
|
Type: &t,
|
|
NotifySuccess: proto.Bool(true),
|
|
}
|
|
data, err = proto.Marshal(req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
_, err = criuClient.Write(data)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
continue
|
|
case t == criurpc.CriuReqType_RESTORE:
|
|
case t == criurpc.CriuReqType_DUMP:
|
|
break
|
|
case t == criurpc.CriuReqType_PRE_DUMP:
|
|
// In pre-dump mode CRIU is in a loop and waits for
|
|
// the final DUMP command.
|
|
// The current runc pre-dump approach, however, is
|
|
// start criu in PRE_DUMP once for a single pre-dump
|
|
// and not the whole series of pre-dump, pre-dump, ...m, dump
|
|
// If we got the message CriuReqType_PRE_DUMP it means
|
|
// CRIU was successful and we need to forcefully stop CRIU
|
|
logrus.Debugf("PRE_DUMP finished. Send close signal to CRIU service")
|
|
criuClient.Close()
|
|
// Process status won't be success, because one end of sockets is closed
|
|
_, err := cmd.Process.Wait()
|
|
if err != nil {
|
|
logrus.Debugf("After PRE_DUMP CRIU exiting failed")
|
|
return err
|
|
}
|
|
return nil
|
|
default:
|
|
return fmt.Errorf("unable to parse the response %s", resp.String())
|
|
}
|
|
|
|
break
|
|
}
|
|
|
|
// cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
|
|
// Here we want to wait only the CRIU process.
|
|
st, err := cmd.Process.Wait()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !st.Success() {
|
|
return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// block any external network activity
|
|
func lockNetwork(config *configs.Config) error {
|
|
for _, config := range config.Networks {
|
|
strategy, err := getStrategy(config.Type)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := strategy.detach(config); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func unlockNetwork(config *configs.Config) error {
|
|
for _, config := range config.Networks {
|
|
strategy, err := getStrategy(config.Type)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err = strategy.attach(config); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string) error {
|
|
notify := resp.GetNotify()
|
|
if notify == nil {
|
|
return fmt.Errorf("invalid response: %s", resp.String())
|
|
}
|
|
switch {
|
|
case notify.GetScript() == "post-dump":
|
|
f, err := os.Create(filepath.Join(c.root, "checkpoint"))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
f.Close()
|
|
case notify.GetScript() == "network-unlock":
|
|
if err := unlockNetwork(c.config); err != nil {
|
|
return err
|
|
}
|
|
case notify.GetScript() == "network-lock":
|
|
if err := lockNetwork(c.config); err != nil {
|
|
return err
|
|
}
|
|
case notify.GetScript() == "setup-namespaces":
|
|
if c.config.Hooks != nil {
|
|
s := configs.HookState{
|
|
Version: c.config.Version,
|
|
ID: c.id,
|
|
Pid: int(notify.GetPid()),
|
|
Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
|
|
}
|
|
for i, hook := range c.config.Hooks.Prestart {
|
|
if err := hook.Run(s); err != nil {
|
|
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
|
|
}
|
|
}
|
|
}
|
|
case notify.GetScript() == "post-restore":
|
|
pid := notify.GetPid()
|
|
r, err := newRestoredProcess(int(pid), fds)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
process.ops = r
|
|
if err := c.state.transition(&restoredState{
|
|
imageDir: opts.ImagesDirectory,
|
|
c: c,
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
// create a timestamp indicating when the restored checkpoint was started
|
|
c.created = time.Now().UTC()
|
|
if _, err := c.updateState(r); err != nil {
|
|
return err
|
|
}
|
|
if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
|
|
if !os.IsNotExist(err) {
|
|
logrus.Error(err)
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
|
|
c.initProcess = process
|
|
state, err := c.currentState()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
err = c.saveState(state)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return state, nil
|
|
}
|
|
|
|
func (c *linuxContainer) saveState(s *State) error {
|
|
f, err := os.Create(filepath.Join(c.root, stateFilename))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer f.Close()
|
|
return utils.WriteJSON(f, s)
|
|
}
|
|
|
|
func (c *linuxContainer) deleteState() error {
|
|
return os.Remove(filepath.Join(c.root, stateFilename))
|
|
}
|
|
|
|
func (c *linuxContainer) currentStatus() (Status, error) {
|
|
if err := c.refreshState(); err != nil {
|
|
return -1, err
|
|
}
|
|
return c.state.status(), nil
|
|
}
|
|
|
|
// refreshState needs to be called to verify that the current state on the
|
|
// container is what is true. Because consumers of libcontainer can use it
|
|
// out of process we need to verify the container's status based on runtime
|
|
// information and not rely on our in process info.
|
|
func (c *linuxContainer) refreshState() error {
|
|
paused, err := c.isPaused()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if paused {
|
|
return c.state.transition(&pausedState{c: c})
|
|
}
|
|
t, err := c.runType()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
switch t {
|
|
case Created:
|
|
return c.state.transition(&createdState{c: c})
|
|
case Running:
|
|
return c.state.transition(&runningState{c: c})
|
|
}
|
|
return c.state.transition(&stoppedState{c: c})
|
|
}
|
|
|
|
// doesInitProcessExist checks if the init process is still the same process
|
|
// as the initial one, it could happen that the original process has exited
|
|
// and a new process has been created with the same pid, in this case, the
|
|
// container would already be stopped.
|
|
func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) {
|
|
startTime, err := system.GetProcessStartTime(initPid)
|
|
if err != nil {
|
|
return false, newSystemErrorWithCausef(err, "getting init process %d start time", initPid)
|
|
}
|
|
if c.initProcessStartTime != startTime {
|
|
return false, nil
|
|
}
|
|
return true, nil
|
|
}
|
|
|
|
func (c *linuxContainer) runType() (Status, error) {
|
|
if c.initProcess == nil {
|
|
return Stopped, nil
|
|
}
|
|
pid := c.initProcess.pid()
|
|
// return Running if the init process is alive
|
|
if err := syscall.Kill(pid, 0); err != nil {
|
|
if err == syscall.ESRCH {
|
|
// It means the process does not exist anymore, could happen when the
|
|
// process exited just when we call the function, we should not return
|
|
// error in this case.
|
|
return Stopped, nil
|
|
}
|
|
return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid)
|
|
}
|
|
// check if the process is still the original init process.
|
|
exist, err := c.doesInitProcessExist(pid)
|
|
if !exist || err != nil {
|
|
return Stopped, err
|
|
}
|
|
// We'll create exec fifo and blocking on it after container is created,
|
|
// and delete it after start container.
|
|
if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
|
|
return Created, nil
|
|
}
|
|
return Running, nil
|
|
}
|
|
|
|
func (c *linuxContainer) isPaused() (bool, error) {
|
|
data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
|
|
if err != nil {
|
|
// If freezer cgroup is not mounted, the container would just be not paused.
|
|
if os.IsNotExist(err) {
|
|
return false, nil
|
|
}
|
|
return false, newSystemErrorWithCause(err, "checking if container is paused")
|
|
}
|
|
return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
|
|
}
|
|
|
|
func (c *linuxContainer) currentState() (*State, error) {
|
|
var (
|
|
startTime string
|
|
externalDescriptors []string
|
|
pid = -1
|
|
)
|
|
if c.initProcess != nil {
|
|
pid = c.initProcess.pid()
|
|
startTime, _ = c.initProcess.startTime()
|
|
externalDescriptors = c.initProcess.externalDescriptors()
|
|
}
|
|
state := &State{
|
|
BaseState: BaseState{
|
|
ID: c.ID(),
|
|
Config: *c.config,
|
|
InitProcessPid: pid,
|
|
InitProcessStartTime: startTime,
|
|
Created: c.created,
|
|
},
|
|
Rootless: c.config.Rootless,
|
|
CgroupPaths: c.cgroupManager.GetPaths(),
|
|
NamespacePaths: make(map[configs.NamespaceType]string),
|
|
ExternalDescriptors: externalDescriptors,
|
|
}
|
|
if pid > 0 {
|
|
for _, ns := range c.config.Namespaces {
|
|
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
|
|
}
|
|
for _, nsType := range configs.NamespaceTypes() {
|
|
if !configs.IsNamespaceSupported(nsType) {
|
|
continue
|
|
}
|
|
if _, ok := state.NamespacePaths[nsType]; !ok {
|
|
ns := configs.Namespace{Type: nsType}
|
|
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
|
|
}
|
|
}
|
|
}
|
|
return state, nil
|
|
}
|
|
|
|
// orderNamespacePaths sorts namespace paths into a list of paths that we
|
|
// can setns in order.
|
|
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
|
|
paths := []string{}
|
|
order := []configs.NamespaceType{
|
|
// The user namespace *must* be done first.
|
|
configs.NEWUSER,
|
|
configs.NEWIPC,
|
|
configs.NEWUTS,
|
|
configs.NEWNET,
|
|
configs.NEWPID,
|
|
configs.NEWNS,
|
|
}
|
|
|
|
// Remove namespaces that we don't need to join.
|
|
var nsTypes []configs.NamespaceType
|
|
for _, ns := range order {
|
|
if c.config.Namespaces.Contains(ns) {
|
|
nsTypes = append(nsTypes, ns)
|
|
}
|
|
}
|
|
for _, nsType := range nsTypes {
|
|
if p, ok := namespaces[nsType]; ok && p != "" {
|
|
// check if the requested namespace is supported
|
|
if !configs.IsNamespaceSupported(nsType) {
|
|
return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType))
|
|
}
|
|
// only set to join this namespace if it exists
|
|
if _, err := os.Lstat(p); err != nil {
|
|
return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
|
|
}
|
|
// do not allow namespace path with comma as we use it to separate
|
|
// the namespace paths
|
|
if strings.ContainsRune(p, ',') {
|
|
return nil, newSystemError(fmt.Errorf("invalid path %s", p))
|
|
}
|
|
paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(nsType), p))
|
|
}
|
|
}
|
|
return paths, nil
|
|
}
|
|
|
|
func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
|
|
data := bytes.NewBuffer(nil)
|
|
for _, im := range idMap {
|
|
line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
|
|
if _, err := data.WriteString(line); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
return data.Bytes(), nil
|
|
}
|
|
|
|
// bootstrapData encodes the necessary data in netlink binary format
|
|
// as a io.Reader.
|
|
// Consumer can write the data to a bootstrap program
|
|
// such as one that uses nsenter package to bootstrap the container's
|
|
// init process correctly, i.e. with correct namespaces, uid/gid
|
|
// mapping etc.
|
|
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
|
|
// create the netlink message
|
|
r := nl.NewNetlinkRequest(int(InitMsg), 0)
|
|
|
|
// write cloneFlags
|
|
r.AddData(&Int32msg{
|
|
Type: CloneFlagsAttr,
|
|
Value: uint32(cloneFlags),
|
|
})
|
|
|
|
// write custom namespace paths
|
|
if len(nsMaps) > 0 {
|
|
nsPaths, err := c.orderNamespacePaths(nsMaps)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r.AddData(&Bytemsg{
|
|
Type: NsPathsAttr,
|
|
Value: []byte(strings.Join(nsPaths, ",")),
|
|
})
|
|
}
|
|
|
|
// write namespace paths only when we are not joining an existing user ns
|
|
_, joinExistingUser := nsMaps[configs.NEWUSER]
|
|
if !joinExistingUser {
|
|
// write uid mappings
|
|
if len(c.config.UidMappings) > 0 {
|
|
b, err := encodeIDMapping(c.config.UidMappings)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r.AddData(&Bytemsg{
|
|
Type: UidmapAttr,
|
|
Value: b,
|
|
})
|
|
}
|
|
|
|
// write gid mappings
|
|
if len(c.config.GidMappings) > 0 {
|
|
b, err := encodeIDMapping(c.config.GidMappings)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r.AddData(&Bytemsg{
|
|
Type: GidmapAttr,
|
|
Value: b,
|
|
})
|
|
// The following only applies if we are root.
|
|
if !c.config.Rootless {
|
|
// check if we have CAP_SETGID to setgroup properly
|
|
pid, err := capability.NewPid(os.Getpid())
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
|
|
r.AddData(&Boolmsg{
|
|
Type: SetgroupAttr,
|
|
Value: true,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// write oom_score_adj
|
|
r.AddData(&Bytemsg{
|
|
Type: OomScoreAdjAttr,
|
|
Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)),
|
|
})
|
|
|
|
// write rootless
|
|
r.AddData(&Boolmsg{
|
|
Type: RootlessAttr,
|
|
Value: c.config.Rootless,
|
|
})
|
|
|
|
return bytes.NewReader(r.Serialize()), nil
|
|
}
|