mirror of
https://github.com/opencontainers/runc.git
synced 2025-09-27 03:46:19 +08:00

Function strings.Title is deprecated as of Go 1.18, because it does not handle some corner cases good enough. In this case, though, it is perfectly fine to use it since we have a single ASCII word as an argument, and strings.Title won't be removed until at least Go 2.0. Suppress the deprecation warning. The alternative is to not capitalize the namespace string; this will break restoring of a container checkpointed by earlier version of runc. Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2272 lines
65 KiB
Go
2272 lines
65 KiB
Go
package libcontainer
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net"
|
|
"os"
|
|
"os/exec"
|
|
"path"
|
|
"path/filepath"
|
|
"reflect"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/checkpoint-restore/go-criu/v5"
|
|
criurpc "github.com/checkpoint-restore/go-criu/v5/rpc"
|
|
securejoin "github.com/cyphar/filepath-securejoin"
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/sirupsen/logrus"
|
|
"github.com/vishvananda/netlink/nl"
|
|
"golang.org/x/sys/execabs"
|
|
"golang.org/x/sys/unix"
|
|
"google.golang.org/protobuf/proto"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
|
"github.com/opencontainers/runc/libcontainer/intelrdt"
|
|
"github.com/opencontainers/runc/libcontainer/system"
|
|
"github.com/opencontainers/runc/libcontainer/utils"
|
|
)
|
|
|
|
const stdioFdCount = 3
|
|
|
|
type linuxContainer struct {
|
|
id string
|
|
root string
|
|
config *configs.Config
|
|
cgroupManager cgroups.Manager
|
|
intelRdtManager *intelrdt.Manager
|
|
initProcess parentProcess
|
|
initProcessStartTime uint64
|
|
m sync.Mutex
|
|
criuVersion int
|
|
state containerState
|
|
created time.Time
|
|
fifo *os.File
|
|
}
|
|
|
|
// State represents a running container's state
|
|
type State struct {
|
|
BaseState
|
|
|
|
// Platform specific fields below here
|
|
|
|
// Specified if the container was started under the rootless mode.
|
|
// Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
|
|
Rootless bool `json:"rootless"`
|
|
|
|
// Paths to all the container's cgroups, as returned by (*cgroups.Manager).GetPaths
|
|
//
|
|
// For cgroup v1, a key is cgroup subsystem name, and the value is the path
|
|
// to the cgroup for this subsystem.
|
|
//
|
|
// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
|
|
CgroupPaths map[string]string `json:"cgroup_paths"`
|
|
|
|
// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
|
|
// with the value as the path.
|
|
NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`
|
|
|
|
// Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
|
|
ExternalDescriptors []string `json:"external_descriptors,omitempty"`
|
|
|
|
// Intel RDT "resource control" filesystem path
|
|
IntelRdtPath string `json:"intel_rdt_path"`
|
|
}
|
|
|
|
// Container is a libcontainer container object.
|
|
//
|
|
// Each container is thread-safe within the same process. Since a container can
|
|
// be destroyed by a separate process, any function may return that the container
|
|
// was not found.
|
|
type Container interface {
|
|
BaseContainer
|
|
|
|
// Methods below here are platform specific
|
|
|
|
// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
|
|
Checkpoint(criuOpts *CriuOpts) error
|
|
|
|
// Restore restores the checkpointed container to a running state using the criu(8) utility.
|
|
Restore(process *Process, criuOpts *CriuOpts) error
|
|
|
|
// If the Container state is RUNNING or CREATED, sets the Container state to PAUSED and pauses
|
|
// the execution of any user processes. Asynchronously, when the container finished being paused the
|
|
// state is changed to PAUSED.
|
|
// If the Container state is PAUSED, do nothing.
|
|
Pause() error
|
|
|
|
// If the Container state is PAUSED, resumes the execution of any user processes in the
|
|
// Container before setting the Container state to RUNNING.
|
|
// If the Container state is RUNNING, do nothing.
|
|
Resume() error
|
|
|
|
// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
|
|
NotifyOOM() (<-chan struct{}, error)
|
|
|
|
// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
|
|
NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
|
|
}
|
|
|
|
// ID returns the container's unique ID
|
|
func (c *linuxContainer) ID() string {
|
|
return c.id
|
|
}
|
|
|
|
// Config returns the container's configuration
|
|
func (c *linuxContainer) Config() configs.Config {
|
|
return *c.config
|
|
}
|
|
|
|
func (c *linuxContainer) Status() (Status, error) {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
return c.currentStatus()
|
|
}
|
|
|
|
func (c *linuxContainer) State() (*State, error) {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
return c.currentState()
|
|
}
|
|
|
|
func (c *linuxContainer) OCIState() (*specs.State, error) {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
return c.currentOCIState()
|
|
}
|
|
|
|
func (c *linuxContainer) Processes() ([]int, error) {
|
|
var pids []int
|
|
status, err := c.currentStatus()
|
|
if err != nil {
|
|
return pids, err
|
|
}
|
|
// for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
|
|
if status == Stopped && !c.cgroupManager.Exists() {
|
|
return pids, nil
|
|
}
|
|
|
|
pids, err = c.cgroupManager.GetAllPids()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("unable to get all container pids: %w", err)
|
|
}
|
|
return pids, nil
|
|
}
|
|
|
|
func (c *linuxContainer) Stats() (*Stats, error) {
|
|
var (
|
|
err error
|
|
stats = &Stats{}
|
|
)
|
|
if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
|
|
return stats, fmt.Errorf("unable to get container cgroup stats: %w", err)
|
|
}
|
|
if c.intelRdtManager != nil {
|
|
if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
|
|
return stats, fmt.Errorf("unable to get container Intel RDT stats: %w", err)
|
|
}
|
|
}
|
|
for _, iface := range c.config.Networks {
|
|
switch iface.Type {
|
|
case "veth":
|
|
istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
|
|
if err != nil {
|
|
return stats, fmt.Errorf("unable to get network stats for interface %q: %w", iface.HostInterfaceName, err)
|
|
}
|
|
stats.Interfaces = append(stats.Interfaces, istats)
|
|
}
|
|
}
|
|
return stats, nil
|
|
}
|
|
|
|
func (c *linuxContainer) Set(config configs.Config) error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
status, err := c.currentStatus()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if status == Stopped {
|
|
return ErrNotRunning
|
|
}
|
|
if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil {
|
|
// Set configs back
|
|
if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
|
|
logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
|
|
}
|
|
return err
|
|
}
|
|
if c.intelRdtManager != nil {
|
|
if err := c.intelRdtManager.Set(&config); err != nil {
|
|
// Set configs back
|
|
if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
|
|
logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
|
|
}
|
|
if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
|
|
logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
|
|
}
|
|
return err
|
|
}
|
|
}
|
|
// After config setting succeed, update config and states
|
|
c.config = &config
|
|
_, err = c.updateState(nil)
|
|
return err
|
|
}
|
|
|
|
func (c *linuxContainer) Start(process *Process) error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
if c.config.Cgroups.Resources.SkipDevices {
|
|
return errors.New("can't start container with SkipDevices set")
|
|
}
|
|
if process.Init {
|
|
if err := c.createExecFifo(); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if err := c.start(process); err != nil {
|
|
if process.Init {
|
|
c.deleteExecFifo()
|
|
}
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) Run(process *Process) error {
|
|
if err := c.Start(process); err != nil {
|
|
return err
|
|
}
|
|
if process.Init {
|
|
return c.exec()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) Exec() error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
return c.exec()
|
|
}
|
|
|
|
func (c *linuxContainer) exec() error {
|
|
path := filepath.Join(c.root, execFifoFilename)
|
|
pid := c.initProcess.pid()
|
|
blockingFifoOpenCh := awaitFifoOpen(path)
|
|
for {
|
|
select {
|
|
case result := <-blockingFifoOpenCh:
|
|
return handleFifoResult(result)
|
|
|
|
case <-time.After(time.Millisecond * 100):
|
|
stat, err := system.Stat(pid)
|
|
if err != nil || stat.State == system.Zombie {
|
|
// could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check.
|
|
// see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete).
|
|
if err := handleFifoResult(fifoOpen(path, false)); err != nil {
|
|
return errors.New("container process is already dead")
|
|
}
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func readFromExecFifo(execFifo io.Reader) error {
|
|
data, err := io.ReadAll(execFifo)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if len(data) <= 0 {
|
|
return errors.New("cannot start an already running container")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func awaitFifoOpen(path string) <-chan openResult {
|
|
fifoOpened := make(chan openResult)
|
|
go func() {
|
|
result := fifoOpen(path, true)
|
|
fifoOpened <- result
|
|
}()
|
|
return fifoOpened
|
|
}
|
|
|
|
func fifoOpen(path string, block bool) openResult {
|
|
flags := os.O_RDONLY
|
|
if !block {
|
|
flags |= unix.O_NONBLOCK
|
|
}
|
|
f, err := os.OpenFile(path, flags, 0)
|
|
if err != nil {
|
|
return openResult{err: fmt.Errorf("exec fifo: %w", err)}
|
|
}
|
|
return openResult{file: f}
|
|
}
|
|
|
|
func handleFifoResult(result openResult) error {
|
|
if result.err != nil {
|
|
return result.err
|
|
}
|
|
f := result.file
|
|
defer f.Close()
|
|
if err := readFromExecFifo(f); err != nil {
|
|
return err
|
|
}
|
|
return os.Remove(f.Name())
|
|
}
|
|
|
|
type openResult struct {
|
|
file *os.File
|
|
err error
|
|
}
|
|
|
|
func (c *linuxContainer) start(process *Process) (retErr error) {
|
|
parent, err := c.newParentProcess(process)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to create new parent process: %w", err)
|
|
}
|
|
|
|
logsDone := parent.forwardChildLogs()
|
|
if logsDone != nil {
|
|
defer func() {
|
|
// Wait for log forwarder to finish. This depends on
|
|
// runc init closing the _LIBCONTAINER_LOGPIPE log fd.
|
|
err := <-logsDone
|
|
if err != nil && retErr == nil {
|
|
retErr = fmt.Errorf("unable to forward init logs: %w", err)
|
|
}
|
|
}()
|
|
}
|
|
|
|
if err := parent.start(); err != nil {
|
|
return fmt.Errorf("unable to start container process: %w", err)
|
|
}
|
|
|
|
if process.Init {
|
|
c.fifo.Close()
|
|
if c.config.Hooks != nil {
|
|
s, err := c.currentOCIState()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil {
|
|
if err := ignoreTerminateErrors(parent.terminate()); err != nil {
|
|
logrus.Warn(fmt.Errorf("error running poststart hook: %w", err))
|
|
}
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) Signal(s os.Signal, all bool) error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
status, err := c.currentStatus()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if all {
|
|
// for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
|
|
if status == Stopped && !c.cgroupManager.Exists() {
|
|
return nil
|
|
}
|
|
return signalAllProcesses(c.cgroupManager, s)
|
|
}
|
|
// to avoid a PID reuse attack
|
|
if status == Running || status == Created || status == Paused {
|
|
if err := c.initProcess.signal(s); err != nil {
|
|
return fmt.Errorf("unable to signal init: %w", err)
|
|
}
|
|
if status == Paused {
|
|
// For cgroup v1, killing a process in a frozen cgroup
|
|
// does nothing until it's thawed. Only thaw the cgroup
|
|
// for SIGKILL.
|
|
if s, ok := s.(unix.Signal); ok && s == unix.SIGKILL {
|
|
_ = c.cgroupManager.Freeze(configs.Thawed)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
return ErrNotRunning
|
|
}
|
|
|
|
func (c *linuxContainer) createExecFifo() error {
|
|
rootuid, err := c.Config().HostRootUID()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rootgid, err := c.Config().HostRootGID()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
fifoName := filepath.Join(c.root, execFifoFilename)
|
|
if _, err := os.Stat(fifoName); err == nil {
|
|
return fmt.Errorf("exec fifo %s already exists", fifoName)
|
|
}
|
|
oldMask := unix.Umask(0o000)
|
|
if err := unix.Mkfifo(fifoName, 0o622); err != nil {
|
|
unix.Umask(oldMask)
|
|
return err
|
|
}
|
|
unix.Umask(oldMask)
|
|
return os.Chown(fifoName, rootuid, rootgid)
|
|
}
|
|
|
|
func (c *linuxContainer) deleteExecFifo() {
|
|
fifoName := filepath.Join(c.root, execFifoFilename)
|
|
os.Remove(fifoName)
|
|
}
|
|
|
|
// includeExecFifo opens the container's execfifo as a pathfd, so that the
|
|
// container cannot access the statedir (and the FIFO itself remains
|
|
// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
|
|
// fd, with _LIBCONTAINER_FIFOFD set to its fd number.
|
|
func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
|
|
fifoName := filepath.Join(c.root, execFifoFilename)
|
|
fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
c.fifo = fifo
|
|
|
|
cmd.ExtraFiles = append(cmd.ExtraFiles, fifo)
|
|
cmd.Env = append(cmd.Env,
|
|
"_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
|
|
parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("unable to create init pipe: %w", err)
|
|
}
|
|
messageSockPair := filePair{parentInitPipe, childInitPipe}
|
|
|
|
parentLogPipe, childLogPipe, err := os.Pipe()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("unable to create log pipe: %w", err)
|
|
}
|
|
logFilePair := filePair{parentLogPipe, childLogPipe}
|
|
|
|
cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
|
|
if !p.Init {
|
|
return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
|
|
}
|
|
|
|
// We only set up fifoFd if we're not doing a `runc exec`. The historic
|
|
// reason for this is that previously we would pass a dirfd that allowed
|
|
// for container rootfs escape (and not doing it in `runc exec` avoided
|
|
// that problem), but we no longer do that. However, there's no need to do
|
|
// this for `runc exec` so we just keep it this way to be safe.
|
|
if err := c.includeExecFifo(cmd); err != nil {
|
|
return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
|
|
}
|
|
return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
|
|
}
|
|
|
|
func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
|
|
cmd := exec.Command("/proc/self/exe", "init")
|
|
cmd.Args[0] = os.Args[0]
|
|
cmd.Stdin = p.Stdin
|
|
cmd.Stdout = p.Stdout
|
|
cmd.Stderr = p.Stderr
|
|
cmd.Dir = c.config.Rootfs
|
|
if cmd.SysProcAttr == nil {
|
|
cmd.SysProcAttr = &unix.SysProcAttr{}
|
|
}
|
|
cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))
|
|
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
|
|
if p.ConsoleSocket != nil {
|
|
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
|
|
cmd.Env = append(cmd.Env,
|
|
"_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
|
|
)
|
|
}
|
|
cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
|
|
cmd.Env = append(cmd.Env,
|
|
"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
|
|
"_LIBCONTAINER_STATEDIR="+c.root,
|
|
)
|
|
|
|
cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
|
|
cmd.Env = append(cmd.Env,
|
|
"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
|
|
"_LIBCONTAINER_LOGLEVEL="+p.LogLevel,
|
|
)
|
|
|
|
// NOTE: when running a container with no PID namespace and the parent process spawning the container is
|
|
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
|
|
// even with the parent still running.
|
|
if c.config.ParentDeathSignal > 0 {
|
|
cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
|
|
}
|
|
return cmd
|
|
}
|
|
|
|
// shouldSendMountSources says whether the child process must setup bind mounts with
|
|
// the source pre-opened (O_PATH) in the host user namespace.
|
|
// See https://github.com/opencontainers/runc/issues/2484
|
|
func (c *linuxContainer) shouldSendMountSources() bool {
|
|
// Passing the mount sources via SCM_RIGHTS is only necessary when
|
|
// both userns and mntns are active.
|
|
if !c.config.Namespaces.Contains(configs.NEWUSER) ||
|
|
!c.config.Namespaces.Contains(configs.NEWNS) {
|
|
return false
|
|
}
|
|
|
|
// nsexec.c send_mountsources() requires setns(mntns) capabilities
|
|
// CAP_SYS_CHROOT and CAP_SYS_ADMIN.
|
|
if c.config.RootlessEUID {
|
|
return false
|
|
}
|
|
|
|
// We need to send sources if there are bind-mounts.
|
|
for _, m := range c.config.Mounts {
|
|
if m.IsBind() {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
|
|
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
|
|
nsMaps := make(map[configs.NamespaceType]string)
|
|
for _, ns := range c.config.Namespaces {
|
|
if ns.Path != "" {
|
|
nsMaps[ns.Type] = ns.Path
|
|
}
|
|
}
|
|
_, sharePidns := nsMaps[configs.NEWPID]
|
|
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if c.shouldSendMountSources() {
|
|
// Elements on this slice will be paired with mounts (see StartInitialization() and
|
|
// prepareRootfs()). This slice MUST have the same size as c.config.Mounts.
|
|
mountFds := make([]int, len(c.config.Mounts))
|
|
for i, m := range c.config.Mounts {
|
|
if !m.IsBind() {
|
|
// Non bind-mounts do not use an fd.
|
|
mountFds[i] = -1
|
|
continue
|
|
}
|
|
|
|
// The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need
|
|
// to allocate a fd so that we know the number to pass in the environment variable. The fd
|
|
// must not be closed before cmd.Start(), so we reuse messageSockPair.child because the
|
|
// lifecycle of that fd is already taken care of.
|
|
cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child)
|
|
mountFds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
|
|
}
|
|
|
|
mountFdsJson, err := json.Marshal(mountFds)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Error creating _LIBCONTAINER_MOUNT_FDS: %w", err)
|
|
}
|
|
|
|
cmd.Env = append(cmd.Env,
|
|
"_LIBCONTAINER_MOUNT_FDS="+string(mountFdsJson),
|
|
)
|
|
}
|
|
|
|
init := &initProcess{
|
|
cmd: cmd,
|
|
messageSockPair: messageSockPair,
|
|
logFilePair: logFilePair,
|
|
manager: c.cgroupManager,
|
|
intelRdtManager: c.intelRdtManager,
|
|
config: c.newInitConfig(p),
|
|
container: c,
|
|
process: p,
|
|
bootstrapData: data,
|
|
sharePidns: sharePidns,
|
|
}
|
|
c.initProcess = init
|
|
return init, nil
|
|
}
|
|
|
|
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) {
|
|
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
|
|
state, err := c.currentState()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("unable to get container state: %w", err)
|
|
}
|
|
// for setns process, we don't have to set cloneflags as the process namespaces
|
|
// will only be set via setns syscall
|
|
data, err := c.bootstrapData(0, state.NamespacePaths, initSetns)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
proc := &setnsProcess{
|
|
cmd: cmd,
|
|
cgroupPaths: state.CgroupPaths,
|
|
rootlessCgroups: c.config.RootlessCgroups,
|
|
intelRdtPath: state.IntelRdtPath,
|
|
messageSockPair: messageSockPair,
|
|
logFilePair: logFilePair,
|
|
manager: c.cgroupManager,
|
|
config: c.newInitConfig(p),
|
|
process: p,
|
|
bootstrapData: data,
|
|
initProcessPid: state.InitProcessPid,
|
|
}
|
|
if len(p.SubCgroupPaths) > 0 {
|
|
if add, ok := p.SubCgroupPaths[""]; ok {
|
|
// cgroup v1: using the same path for all controllers.
|
|
// cgroup v2: the only possible way.
|
|
for k := range proc.cgroupPaths {
|
|
subPath := path.Join(proc.cgroupPaths[k], add)
|
|
if !strings.HasPrefix(subPath, proc.cgroupPaths[k]) {
|
|
return nil, fmt.Errorf("%s is not a sub cgroup path", add)
|
|
}
|
|
proc.cgroupPaths[k] = subPath
|
|
}
|
|
// cgroup v2: do not try to join init process's cgroup
|
|
// as a fallback (see (*setnsProcess).start).
|
|
proc.initProcessPid = 0
|
|
} else {
|
|
// Per-controller paths.
|
|
for ctrl, add := range p.SubCgroupPaths {
|
|
if val, ok := proc.cgroupPaths[ctrl]; ok {
|
|
subPath := path.Join(val, add)
|
|
if !strings.HasPrefix(subPath, val) {
|
|
return nil, fmt.Errorf("%s is not a sub cgroup path", add)
|
|
}
|
|
proc.cgroupPaths[ctrl] = subPath
|
|
} else {
|
|
return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return proc, nil
|
|
}
|
|
|
|
func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
|
|
cfg := &initConfig{
|
|
Config: c.config,
|
|
Args: process.Args,
|
|
Env: process.Env,
|
|
User: process.User,
|
|
AdditionalGroups: process.AdditionalGroups,
|
|
Cwd: process.Cwd,
|
|
Capabilities: process.Capabilities,
|
|
PassedFilesCount: len(process.ExtraFiles),
|
|
ContainerID: c.ID(),
|
|
NoNewPrivileges: c.config.NoNewPrivileges,
|
|
RootlessEUID: c.config.RootlessEUID,
|
|
RootlessCgroups: c.config.RootlessCgroups,
|
|
AppArmorProfile: c.config.AppArmorProfile,
|
|
ProcessLabel: c.config.ProcessLabel,
|
|
Rlimits: c.config.Rlimits,
|
|
CreateConsole: process.ConsoleSocket != nil,
|
|
ConsoleWidth: process.ConsoleWidth,
|
|
ConsoleHeight: process.ConsoleHeight,
|
|
}
|
|
if process.NoNewPrivileges != nil {
|
|
cfg.NoNewPrivileges = *process.NoNewPrivileges
|
|
}
|
|
if process.AppArmorProfile != "" {
|
|
cfg.AppArmorProfile = process.AppArmorProfile
|
|
}
|
|
if process.Label != "" {
|
|
cfg.ProcessLabel = process.Label
|
|
}
|
|
if len(process.Rlimits) > 0 {
|
|
cfg.Rlimits = process.Rlimits
|
|
}
|
|
if cgroups.IsCgroup2UnifiedMode() {
|
|
cfg.Cgroup2Path = c.cgroupManager.Path("")
|
|
}
|
|
|
|
return cfg
|
|
}
|
|
|
|
func (c *linuxContainer) Destroy() error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
return c.state.destroy()
|
|
}
|
|
|
|
func (c *linuxContainer) Pause() error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
status, err := c.currentStatus()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
switch status {
|
|
case Running, Created:
|
|
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
|
|
return err
|
|
}
|
|
return c.state.transition(&pausedState{
|
|
c: c,
|
|
})
|
|
}
|
|
return ErrNotRunning
|
|
}
|
|
|
|
func (c *linuxContainer) Resume() error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
status, err := c.currentStatus()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if status != Paused {
|
|
return ErrNotPaused
|
|
}
|
|
if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
|
|
return err
|
|
}
|
|
return c.state.transition(&runningState{
|
|
c: c,
|
|
})
|
|
}
|
|
|
|
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
|
|
// XXX(cyphar): This requires cgroups.
|
|
if c.config.RootlessCgroups {
|
|
logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
|
|
}
|
|
path := c.cgroupManager.Path("memory")
|
|
if cgroups.IsCgroup2UnifiedMode() {
|
|
return notifyOnOOMV2(path)
|
|
}
|
|
return notifyOnOOM(path)
|
|
}
|
|
|
|
func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
|
|
// XXX(cyphar): This requires cgroups.
|
|
if c.config.RootlessCgroups {
|
|
logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
|
|
}
|
|
return notifyMemoryPressure(c.cgroupManager.Path("memory"), level)
|
|
}
|
|
|
|
var criuFeatures *criurpc.CriuFeatures
|
|
|
|
func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
|
|
t := criurpc.CriuReqType_FEATURE_CHECK
|
|
|
|
// make sure the features we are looking for are really not from
|
|
// some previous check
|
|
criuFeatures = nil
|
|
|
|
req := &criurpc.CriuReq{
|
|
Type: &t,
|
|
// Theoretically this should not be necessary but CRIU
|
|
// segfaults if Opts is empty.
|
|
// Fixed in CRIU 2.12
|
|
Opts: rpcOpts,
|
|
Features: criuFeat,
|
|
}
|
|
|
|
err := c.criuSwrk(nil, req, criuOpts, nil)
|
|
if err != nil {
|
|
logrus.Debugf("%s", err)
|
|
return errors.New("CRIU feature check failed")
|
|
}
|
|
|
|
missingFeatures := false
|
|
|
|
// The outer if checks if the fields actually exist
|
|
if (criuFeat.MemTrack != nil) &&
|
|
(criuFeatures.MemTrack != nil) {
|
|
// The inner if checks if they are set to true
|
|
if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
|
|
missingFeatures = true
|
|
logrus.Debugf("CRIU does not support MemTrack")
|
|
}
|
|
}
|
|
|
|
// This needs to be repeated for every new feature check.
|
|
// Is there a way to put this in a function. Reflection?
|
|
if (criuFeat.LazyPages != nil) &&
|
|
(criuFeatures.LazyPages != nil) {
|
|
if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
|
|
missingFeatures = true
|
|
logrus.Debugf("CRIU does not support LazyPages")
|
|
}
|
|
}
|
|
|
|
if missingFeatures {
|
|
return errors.New("CRIU is missing features")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func compareCriuVersion(criuVersion int, minVersion int) error {
|
|
// simple function to perform the actual version compare
|
|
if criuVersion < minVersion {
|
|
return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// checkCriuVersion checks Criu version greater than or equal to minVersion
|
|
func (c *linuxContainer) checkCriuVersion(minVersion int) error {
|
|
// If the version of criu has already been determined there is no need
|
|
// to ask criu for the version again. Use the value from c.criuVersion.
|
|
if c.criuVersion != 0 {
|
|
return compareCriuVersion(c.criuVersion, minVersion)
|
|
}
|
|
|
|
criu := criu.MakeCriu()
|
|
var err error
|
|
c.criuVersion, err = criu.GetCriuVersion()
|
|
if err != nil {
|
|
return fmt.Errorf("CRIU version check failed: %w", err)
|
|
}
|
|
|
|
return compareCriuVersion(c.criuVersion, minVersion)
|
|
}
|
|
|
|
const descriptorsFilename = "descriptors.json"
|
|
|
|
func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
|
|
mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
|
|
if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
|
|
mountDest = dest[len(c.config.Rootfs):]
|
|
}
|
|
extMnt := &criurpc.ExtMountMap{
|
|
Key: proto.String(mountDest),
|
|
Val: proto.String(mountDest),
|
|
}
|
|
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
|
|
}
|
|
|
|
func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
|
|
for _, path := range c.config.MaskPaths {
|
|
fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
continue
|
|
}
|
|
return err
|
|
}
|
|
if fi.IsDir() {
|
|
continue
|
|
}
|
|
|
|
extMnt := &criurpc.ExtMountMap{
|
|
Key: proto.String(path),
|
|
Val: proto.String("/dev/null"),
|
|
}
|
|
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) {
|
|
// CRIU will evaluate a configuration starting with release 3.11.
|
|
// Settings in the configuration file will overwrite RPC settings.
|
|
// Look for annotations. The annotation 'org.criu.config'
|
|
// specifies if CRIU should use a different, container specific
|
|
// configuration file.
|
|
configFile, exists := utils.SearchLabels(c.config.Labels, "org.criu.config")
|
|
if exists {
|
|
// If the annotation 'org.criu.config' exists and is set
|
|
// to a non-empty string, tell CRIU to use that as a
|
|
// configuration file. If the file does not exist, CRIU
|
|
// will just ignore it.
|
|
if configFile != "" {
|
|
rpcOpts.ConfigFile = proto.String(configFile)
|
|
}
|
|
// If 'org.criu.config' exists and is set to an empty
|
|
// string, a runc specific CRIU configuration file will
|
|
// be not set at all.
|
|
} else {
|
|
// If the mentioned annotation has not been found, specify
|
|
// a default CRIU configuration file.
|
|
rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf")
|
|
}
|
|
}
|
|
|
|
func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool {
|
|
var minVersion int
|
|
switch t {
|
|
case configs.NEWNET:
|
|
// CRIU supports different external namespace with different released CRIU versions.
|
|
// For network namespaces to work we need at least criu 3.11.0 => 31100.
|
|
minVersion = 31100
|
|
case configs.NEWPID:
|
|
// For PID namespaces criu 31500 is needed.
|
|
minVersion = 31500
|
|
default:
|
|
return false
|
|
}
|
|
return c.checkCriuVersion(minVersion) == nil
|
|
}
|
|
|
|
func criuNsToKey(t configs.NamespaceType) string {
|
|
return "extRoot" + strings.Title(configs.NsName(t)) + "NS" //nolint:staticcheck // SA1019: strings.Title is deprecated
|
|
}
|
|
|
|
func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error {
|
|
if !c.criuSupportsExtNS(t) {
|
|
return nil
|
|
}
|
|
|
|
nsPath := c.config.Namespaces.PathOf(t)
|
|
if nsPath == "" {
|
|
return nil
|
|
}
|
|
// CRIU expects the information about an external namespace
|
|
// like this: --external <TYPE>[<inode>]:<key>
|
|
// This <key> is always 'extRoot<TYPE>NS'.
|
|
var ns unix.Stat_t
|
|
if err := unix.Stat(nsPath, &ns); err != nil {
|
|
return err
|
|
}
|
|
criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t))
|
|
rpcOpts.External = append(rpcOpts.External, criuExternal)
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error {
|
|
for _, ns := range c.config.Namespaces {
|
|
switch ns.Type {
|
|
case configs.NEWNET, configs.NEWPID:
|
|
// If the container is running in a network or PID namespace and has
|
|
// a path to the network or PID namespace configured, we will dump
|
|
// that network or PID namespace as an external namespace and we
|
|
// will expect that the namespace exists during restore.
|
|
// This basically means that CRIU will ignore the namespace
|
|
// and expect it to be setup correctly.
|
|
if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil {
|
|
return err
|
|
}
|
|
default:
|
|
// For all other namespaces except NET and PID CRIU has
|
|
// a simpler way of joining the existing namespace if set
|
|
nsPath := c.config.Namespaces.PathOf(ns.Type)
|
|
if nsPath == "" {
|
|
continue
|
|
}
|
|
if ns.Type == configs.NEWCGROUP {
|
|
// CRIU has no code to handle NEWCGROUP
|
|
return fmt.Errorf("Do not know how to handle namespace %v", ns.Type)
|
|
}
|
|
// CRIU has code to handle NEWTIME, but it does not seem to be defined in runc
|
|
|
|
// CRIU will issue a warning for NEWUSER:
|
|
// criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous'
|
|
rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{
|
|
Ns: proto.String(configs.NsName(ns.Type)),
|
|
NsFile: proto.String(nsPath),
|
|
})
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error {
|
|
if !c.criuSupportsExtNS(t) {
|
|
return nil
|
|
}
|
|
|
|
nsPath := c.config.Namespaces.PathOf(t)
|
|
if nsPath == "" {
|
|
return nil
|
|
}
|
|
// CRIU wants the information about an existing namespace
|
|
// like this: --inherit-fd fd[<fd>]:<key>
|
|
// The <key> needs to be the same as during checkpointing.
|
|
// We are always using 'extRoot<TYPE>NS' as the key in this.
|
|
nsFd, err := os.Open(nsPath)
|
|
if err != nil {
|
|
logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
|
|
return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
|
|
}
|
|
inheritFd := &criurpc.InheritFd{
|
|
Key: proto.String(criuNsToKey(t)),
|
|
// The offset of four is necessary because 0, 1, 2 and 3 are
|
|
// already used by stdin, stdout, stderr, 'criu swrk' socket.
|
|
Fd: proto.Int32(int32(4 + len(*extraFiles))),
|
|
}
|
|
rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd)
|
|
// All open FDs need to be transferred to CRIU via extraFiles
|
|
*extraFiles = append(*extraFiles, nsFd)
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
|
|
// Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
|
|
// (CLI prints a warning)
|
|
// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
|
|
// support for doing unprivileged dumps, but the setup of
|
|
// rootless containers might make this complicated.
|
|
|
|
// We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
|
|
if err := c.checkCriuVersion(30000); err != nil {
|
|
return err
|
|
}
|
|
|
|
if criuOpts.ImagesDirectory == "" {
|
|
return errors.New("invalid directory to save checkpoint")
|
|
}
|
|
|
|
// Since a container can be C/R'ed multiple times,
|
|
// the checkpoint directory may already exist.
|
|
if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) {
|
|
return err
|
|
}
|
|
|
|
imageDir, err := os.Open(criuOpts.ImagesDirectory)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer imageDir.Close()
|
|
|
|
rpcOpts := criurpc.CriuOpts{
|
|
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
|
|
LogLevel: proto.Int32(4),
|
|
LogFile: proto.String("dump.log"),
|
|
Root: proto.String(c.config.Rootfs),
|
|
ManageCgroups: proto.Bool(true),
|
|
NotifyScripts: proto.Bool(true),
|
|
Pid: proto.Int32(int32(c.initProcess.pid())),
|
|
ShellJob: proto.Bool(criuOpts.ShellJob),
|
|
LeaveRunning: proto.Bool(criuOpts.LeaveRunning),
|
|
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
|
|
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
|
|
FileLocks: proto.Bool(criuOpts.FileLocks),
|
|
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
|
|
OrphanPtsMaster: proto.Bool(true),
|
|
AutoDedup: proto.Bool(criuOpts.AutoDedup),
|
|
LazyPages: proto.Bool(criuOpts.LazyPages),
|
|
}
|
|
|
|
// if criuOpts.WorkDirectory is not set, criu default is used.
|
|
if criuOpts.WorkDirectory != "" {
|
|
if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
|
|
return err
|
|
}
|
|
workDir, err := os.Open(criuOpts.WorkDirectory)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer workDir.Close()
|
|
rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
|
|
}
|
|
|
|
c.handleCriuConfigurationFile(&rpcOpts)
|
|
|
|
// If the container is running in a network namespace and has
|
|
// a path to the network namespace configured, we will dump
|
|
// that network namespace as an external namespace and we
|
|
// will expect that the namespace exists during restore.
|
|
// This basically means that CRIU will ignore the namespace
|
|
// and expect to be setup correctly.
|
|
if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Same for possible external PID namespaces
|
|
if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil {
|
|
return err
|
|
}
|
|
|
|
// CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup
|
|
// is not set, CRIU uses ptrace() to pause the processes.
|
|
// Note cgroup v2 freezer is only supported since CRIU release 3.14.
|
|
if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil {
|
|
if fcg := c.cgroupManager.Path("freezer"); fcg != "" {
|
|
rpcOpts.FreezeCgroup = proto.String(fcg)
|
|
}
|
|
}
|
|
|
|
// append optional criu opts, e.g., page-server and port
|
|
if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
|
|
rpcOpts.Ps = &criurpc.CriuPageServerInfo{
|
|
Address: proto.String(criuOpts.PageServer.Address),
|
|
Port: proto.Int32(criuOpts.PageServer.Port),
|
|
}
|
|
}
|
|
|
|
// pre-dump may need parentImage param to complete iterative migration
|
|
if criuOpts.ParentImage != "" {
|
|
rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
|
|
rpcOpts.TrackMem = proto.Bool(true)
|
|
}
|
|
|
|
// append optional manage cgroups mode
|
|
if criuOpts.ManageCgroupsMode != 0 {
|
|
mode := criuOpts.ManageCgroupsMode
|
|
rpcOpts.ManageCgroupsMode = &mode
|
|
}
|
|
|
|
var t criurpc.CriuReqType
|
|
if criuOpts.PreDump {
|
|
feat := criurpc.CriuFeatures{
|
|
MemTrack: proto.Bool(true),
|
|
}
|
|
|
|
if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
|
|
return err
|
|
}
|
|
|
|
t = criurpc.CriuReqType_PRE_DUMP
|
|
} else {
|
|
t = criurpc.CriuReqType_DUMP
|
|
}
|
|
|
|
if criuOpts.LazyPages {
|
|
// lazy migration requested; check if criu supports it
|
|
feat := criurpc.CriuFeatures{
|
|
LazyPages: proto.Bool(true),
|
|
}
|
|
if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
|
|
return err
|
|
}
|
|
|
|
if fd := criuOpts.StatusFd; fd != -1 {
|
|
// check that the FD is valid
|
|
flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0)
|
|
if err != nil {
|
|
return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err)
|
|
}
|
|
// and writable
|
|
if flags&unix.O_WRONLY == 0 {
|
|
return fmt.Errorf("invalid --status-fd argument %d: not writable", fd)
|
|
}
|
|
|
|
if c.checkCriuVersion(31500) != nil {
|
|
// For criu 3.15+, use notifications (see case "status-ready"
|
|
// in criuNotifications). Otherwise, rely on criu status fd.
|
|
rpcOpts.StatusFd = proto.Int32(int32(fd))
|
|
}
|
|
}
|
|
}
|
|
|
|
req := &criurpc.CriuReq{
|
|
Type: &t,
|
|
Opts: &rpcOpts,
|
|
}
|
|
|
|
// no need to dump all this in pre-dump
|
|
if !criuOpts.PreDump {
|
|
hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
|
|
for _, m := range c.config.Mounts {
|
|
switch m.Device {
|
|
case "bind":
|
|
c.addCriuDumpMount(req, m)
|
|
case "cgroup":
|
|
if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
|
|
// real mount(s)
|
|
continue
|
|
}
|
|
// a set of "external" bind mounts
|
|
binds, err := getCgroupMounts(m)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for _, b := range binds {
|
|
c.addCriuDumpMount(req, b)
|
|
}
|
|
}
|
|
}
|
|
|
|
if err := c.addMaskPaths(req); err != nil {
|
|
return err
|
|
}
|
|
|
|
for _, node := range c.config.Devices {
|
|
m := &configs.Mount{Destination: node.Path, Source: node.Path}
|
|
c.addCriuDumpMount(req, m)
|
|
}
|
|
|
|
// Write the FD info to a file in the image directory
|
|
fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
err = os.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
err = c.criuSwrk(nil, req, criuOpts, nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
|
|
mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
|
|
if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
|
|
mountDest = dest[len(c.config.Rootfs):]
|
|
}
|
|
extMnt := &criurpc.ExtMountMap{
|
|
Key: proto.String(mountDest),
|
|
Val: proto.String(m.Source),
|
|
}
|
|
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
|
|
}
|
|
|
|
func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
|
|
for _, iface := range c.config.Networks {
|
|
switch iface.Type {
|
|
case "veth":
|
|
veth := new(criurpc.CriuVethPair)
|
|
veth.IfOut = proto.String(iface.HostInterfaceName)
|
|
veth.IfIn = proto.String(iface.Name)
|
|
req.Opts.Veths = append(req.Opts.Veths, veth)
|
|
case "loopback":
|
|
// Do nothing
|
|
}
|
|
}
|
|
for _, i := range criuOpts.VethPairs {
|
|
veth := new(criurpc.CriuVethPair)
|
|
veth.IfOut = proto.String(i.HostInterfaceName)
|
|
veth.IfIn = proto.String(i.ContainerInterfaceName)
|
|
req.Opts.Veths = append(req.Opts.Veths, veth)
|
|
}
|
|
}
|
|
|
|
// makeCriuRestoreMountpoints makes the actual mountpoints for the
|
|
// restore using CRIU. This function is inspired from the code in
|
|
// rootfs_linux.go
|
|
func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
|
|
switch m.Device {
|
|
case "cgroup":
|
|
// No mount point(s) need to be created:
|
|
//
|
|
// * for v1, mount points are saved by CRIU because
|
|
// /sys/fs/cgroup is a tmpfs mount
|
|
//
|
|
// * for v2, /sys/fs/cgroup is a real mount, but
|
|
// the mountpoint appears as soon as /sys is mounted
|
|
return nil
|
|
case "bind":
|
|
// The prepareBindMount() function checks if source
|
|
// exists. So it cannot be used for other filesystem types.
|
|
// TODO: pass something else than nil? Not sure if criu is
|
|
// impacted by issue #2484
|
|
if err := prepareBindMount(m, c.config.Rootfs, nil); err != nil {
|
|
return err
|
|
}
|
|
default:
|
|
// for all other filesystems just create the mountpoints
|
|
dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil {
|
|
return err
|
|
}
|
|
if err := os.MkdirAll(dest, 0o755); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// isPathInPrefixList is a small function for CRIU restore to make sure
|
|
// mountpoints, which are on a tmpfs, are not created in the roofs
|
|
func isPathInPrefixList(path string, prefix []string) bool {
|
|
for _, p := range prefix {
|
|
if strings.HasPrefix(path, p+"/") {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// prepareCriuRestoreMounts tries to set up the rootfs of the
|
|
// container to be restored in the same way runc does it for
|
|
// initial container creation. Even for a read-only rootfs container
|
|
// runc modifies the rootfs to add mountpoints which do not exist.
|
|
// This function also creates missing mountpoints as long as they
|
|
// are not on top of a tmpfs, as CRIU will restore tmpfs content anyway.
|
|
func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
|
|
// First get a list of a all tmpfs mounts
|
|
tmpfs := []string{}
|
|
for _, m := range mounts {
|
|
switch m.Device {
|
|
case "tmpfs":
|
|
tmpfs = append(tmpfs, m.Destination)
|
|
}
|
|
}
|
|
// Now go through all mounts and create the mountpoints
|
|
// if the mountpoints are not on a tmpfs, as CRIU will
|
|
// restore the complete tmpfs content from its checkpoint.
|
|
umounts := []string{}
|
|
defer func() {
|
|
for _, u := range umounts {
|
|
_ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error {
|
|
if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil {
|
|
if e != unix.EINVAL { //nolint:errorlint // unix errors are bare
|
|
// Ignore EINVAL as it means 'target is not a mount point.'
|
|
// It probably has already been unmounted.
|
|
logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e)
|
|
}
|
|
}
|
|
return nil
|
|
})
|
|
}
|
|
}()
|
|
for _, m := range mounts {
|
|
if !isPathInPrefixList(m.Destination, tmpfs) {
|
|
if err := c.makeCriuRestoreMountpoints(m); err != nil {
|
|
return err
|
|
}
|
|
// If the mount point is a bind mount, we need to mount
|
|
// it now so that runc can create the necessary mount
|
|
// points for mounts in bind mounts.
|
|
// This also happens during initial container creation.
|
|
// Without this CRIU restore will fail
|
|
// See: https://github.com/opencontainers/runc/issues/2748
|
|
// It is also not necessary to order the mount points
|
|
// because during initial container creation mounts are
|
|
// set up in the order they are configured.
|
|
if m.Device == "bind" {
|
|
if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(procfd string) error {
|
|
if err := mount(m.Source, m.Destination, procfd, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
umounts = append(umounts, m.Destination)
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
|
c.m.Lock()
|
|
defer c.m.Unlock()
|
|
|
|
var extraFiles []*os.File
|
|
|
|
// Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
|
|
// (CLI prints a warning)
|
|
// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
|
|
// support for unprivileged restore at the moment.
|
|
|
|
// We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
|
|
if err := c.checkCriuVersion(30000); err != nil {
|
|
return err
|
|
}
|
|
if criuOpts.ImagesDirectory == "" {
|
|
return errors.New("invalid directory to restore checkpoint")
|
|
}
|
|
imageDir, err := os.Open(criuOpts.ImagesDirectory)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer imageDir.Close()
|
|
// CRIU has a few requirements for a root directory:
|
|
// * it must be a mount point
|
|
// * its parent must not be overmounted
|
|
// c.config.Rootfs is bind-mounted to a temporary directory
|
|
// to satisfy these requirements.
|
|
root := filepath.Join(c.root, "criu-root")
|
|
if err := os.Mkdir(root, 0o755); err != nil {
|
|
return err
|
|
}
|
|
defer os.Remove(root)
|
|
root, err = filepath.EvalSymlinks(root)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = mount(c.config.Rootfs, root, "", "", unix.MS_BIND|unix.MS_REC, "")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer unix.Unmount(root, unix.MNT_DETACH) //nolint: errcheck
|
|
t := criurpc.CriuReqType_RESTORE
|
|
req := &criurpc.CriuReq{
|
|
Type: &t,
|
|
Opts: &criurpc.CriuOpts{
|
|
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
|
|
EvasiveDevices: proto.Bool(true),
|
|
LogLevel: proto.Int32(4),
|
|
LogFile: proto.String("restore.log"),
|
|
RstSibling: proto.Bool(true),
|
|
Root: proto.String(root),
|
|
ManageCgroups: proto.Bool(true),
|
|
NotifyScripts: proto.Bool(true),
|
|
ShellJob: proto.Bool(criuOpts.ShellJob),
|
|
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
|
|
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
|
|
FileLocks: proto.Bool(criuOpts.FileLocks),
|
|
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
|
|
OrphanPtsMaster: proto.Bool(true),
|
|
AutoDedup: proto.Bool(criuOpts.AutoDedup),
|
|
LazyPages: proto.Bool(criuOpts.LazyPages),
|
|
},
|
|
}
|
|
|
|
if criuOpts.LsmProfile != "" {
|
|
// CRIU older than 3.16 has a bug which breaks the possibility
|
|
// to set a different LSM profile.
|
|
if err := c.checkCriuVersion(31600); err != nil {
|
|
return errors.New("--lsm-profile requires at least CRIU 3.16")
|
|
}
|
|
req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile)
|
|
}
|
|
if criuOpts.LsmMountContext != "" {
|
|
if err := c.checkCriuVersion(31600); err != nil {
|
|
return errors.New("--lsm-mount-context requires at least CRIU 3.16")
|
|
}
|
|
req.Opts.LsmMountContext = proto.String(criuOpts.LsmMountContext)
|
|
}
|
|
|
|
if criuOpts.WorkDirectory != "" {
|
|
// Since a container can be C/R'ed multiple times,
|
|
// the work directory may already exist.
|
|
if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
|
|
return err
|
|
}
|
|
workDir, err := os.Open(criuOpts.WorkDirectory)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer workDir.Close()
|
|
req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
|
|
}
|
|
c.handleCriuConfigurationFile(req.Opts)
|
|
|
|
if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil {
|
|
return err
|
|
}
|
|
|
|
// This will modify the rootfs of the container in the same way runc
|
|
// modifies the container during initial creation.
|
|
if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil {
|
|
return err
|
|
}
|
|
|
|
hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
|
|
for _, m := range c.config.Mounts {
|
|
switch m.Device {
|
|
case "bind":
|
|
c.addCriuRestoreMount(req, m)
|
|
case "cgroup":
|
|
if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
|
|
continue
|
|
}
|
|
// cgroup v1 is a set of bind mounts, unless cgroupns is used
|
|
binds, err := getCgroupMounts(m)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for _, b := range binds {
|
|
c.addCriuRestoreMount(req, b)
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(c.config.MaskPaths) > 0 {
|
|
m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
|
|
c.addCriuRestoreMount(req, m)
|
|
}
|
|
|
|
for _, node := range c.config.Devices {
|
|
m := &configs.Mount{Destination: node.Path, Source: node.Path}
|
|
c.addCriuRestoreMount(req, m)
|
|
}
|
|
|
|
if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 {
|
|
c.restoreNetwork(req, criuOpts)
|
|
}
|
|
|
|
// append optional manage cgroups mode
|
|
if criuOpts.ManageCgroupsMode != 0 {
|
|
mode := criuOpts.ManageCgroupsMode
|
|
req.Opts.ManageCgroupsMode = &mode
|
|
}
|
|
|
|
var (
|
|
fds []string
|
|
fdJSON []byte
|
|
)
|
|
if fdJSON, err = os.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := json.Unmarshal(fdJSON, &fds); err != nil {
|
|
return err
|
|
}
|
|
for i := range fds {
|
|
if s := fds[i]; strings.Contains(s, "pipe:") {
|
|
inheritFd := new(criurpc.InheritFd)
|
|
inheritFd.Key = proto.String(s)
|
|
inheritFd.Fd = proto.Int32(int32(i))
|
|
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
|
|
}
|
|
}
|
|
err = c.criuSwrk(process, req, criuOpts, extraFiles)
|
|
|
|
// Now that CRIU is done let's close all opened FDs CRIU needed.
|
|
for _, fd := range extraFiles {
|
|
fd.Close()
|
|
}
|
|
|
|
return err
|
|
}
|
|
|
|
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
|
|
// need to apply cgroups only on restore
|
|
if req.GetType() != criurpc.CriuReqType_RESTORE {
|
|
return nil
|
|
}
|
|
|
|
// XXX: Do we need to deal with this case? AFAIK criu still requires root.
|
|
if err := c.cgroupManager.Apply(pid); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil {
|
|
return err
|
|
}
|
|
|
|
if cgroups.IsCgroup2UnifiedMode() {
|
|
return nil
|
|
}
|
|
// the stuff below is cgroupv1-specific
|
|
|
|
path := fmt.Sprintf("/proc/%d/cgroup", pid)
|
|
cgroupsPaths, err := cgroups.ParseCgroupFile(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
for c, p := range cgroupsPaths {
|
|
cgroupRoot := &criurpc.CgroupRoot{
|
|
Ctrl: proto.String(c),
|
|
Path: proto.String(p),
|
|
}
|
|
req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error {
|
|
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var logPath string
|
|
if opts != nil {
|
|
logPath = filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
|
|
} else {
|
|
// For the VERSION RPC 'opts' is set to 'nil' and therefore
|
|
// opts.WorkDirectory does not exist. Set logPath to "".
|
|
logPath = ""
|
|
}
|
|
criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
|
|
criuClientFileCon, err := net.FileConn(criuClient)
|
|
criuClient.Close()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
criuClientCon := criuClientFileCon.(*net.UnixConn)
|
|
defer criuClientCon.Close()
|
|
|
|
criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
|
|
defer criuServer.Close()
|
|
|
|
if c.criuVersion != 0 {
|
|
// If the CRIU Version is still '0' then this is probably
|
|
// the initial CRIU run to detect the version. Skip it.
|
|
logrus.Debugf("Using CRIU %d", c.criuVersion)
|
|
}
|
|
cmd := exec.Command("criu", "swrk", "3")
|
|
if process != nil {
|
|
cmd.Stdin = process.Stdin
|
|
cmd.Stdout = process.Stdout
|
|
cmd.Stderr = process.Stderr
|
|
}
|
|
cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
|
|
if extraFiles != nil {
|
|
cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
|
|
}
|
|
|
|
if err := cmd.Start(); err != nil {
|
|
return err
|
|
}
|
|
// we close criuServer so that even if CRIU crashes or unexpectedly exits, runc will not hang.
|
|
criuServer.Close()
|
|
// cmd.Process will be replaced by a restored init.
|
|
criuProcess := cmd.Process
|
|
|
|
var criuProcessState *os.ProcessState
|
|
defer func() {
|
|
if criuProcessState == nil {
|
|
criuClientCon.Close()
|
|
_, err := criuProcess.Wait()
|
|
if err != nil {
|
|
logrus.Warnf("wait on criuProcess returned %v", err)
|
|
}
|
|
}
|
|
}()
|
|
|
|
if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil {
|
|
return err
|
|
}
|
|
|
|
var extFds []string
|
|
if process != nil {
|
|
extFds, err = getPipeFds(criuProcess.Pid)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
|
|
// In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
|
|
// should be empty. For older CRIU versions it still will be
|
|
// available but empty. criurpc.CriuReqType_VERSION actually
|
|
// has no req.GetOpts().
|
|
if logrus.GetLevel() >= logrus.DebugLevel &&
|
|
!(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
|
|
req.GetType() == criurpc.CriuReqType_VERSION) {
|
|
|
|
val := reflect.ValueOf(req.GetOpts())
|
|
v := reflect.Indirect(val)
|
|
for i := 0; i < v.NumField(); i++ {
|
|
st := v.Type()
|
|
name := st.Field(i).Name
|
|
if 'A' <= name[0] && name[0] <= 'Z' {
|
|
value := val.MethodByName("Get" + name).Call([]reflect.Value{})
|
|
logrus.Debugf("CRIU option %s with value %v", name, value[0])
|
|
}
|
|
}
|
|
}
|
|
data, err := proto.Marshal(req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
_, err = criuClientCon.Write(data)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
buf := make([]byte, 10*4096)
|
|
oob := make([]byte, 4096)
|
|
for {
|
|
n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
|
|
if req.Opts != nil && req.Opts.StatusFd != nil {
|
|
// Close status_fd as soon as we got something back from criu,
|
|
// assuming it has consumed (reopened) it by this time.
|
|
// Otherwise it will might be left open forever and whoever
|
|
// is waiting on it will wait forever.
|
|
fd := int(*req.Opts.StatusFd)
|
|
_ = unix.Close(fd)
|
|
req.Opts.StatusFd = nil
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if n == 0 {
|
|
return errors.New("unexpected EOF")
|
|
}
|
|
if n == len(buf) {
|
|
return errors.New("buffer is too small")
|
|
}
|
|
|
|
resp := new(criurpc.CriuResp)
|
|
err = proto.Unmarshal(buf[:n], resp)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !resp.GetSuccess() {
|
|
typeString := req.GetType().String()
|
|
return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
|
|
}
|
|
|
|
t := resp.GetType()
|
|
switch {
|
|
case t == criurpc.CriuReqType_FEATURE_CHECK:
|
|
logrus.Debugf("Feature check says: %s", resp)
|
|
criuFeatures = resp.GetFeatures()
|
|
case t == criurpc.CriuReqType_NOTIFY:
|
|
if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil {
|
|
return err
|
|
}
|
|
t = criurpc.CriuReqType_NOTIFY
|
|
req = &criurpc.CriuReq{
|
|
Type: &t,
|
|
NotifySuccess: proto.Bool(true),
|
|
}
|
|
data, err = proto.Marshal(req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
_, err = criuClientCon.Write(data)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
continue
|
|
case t == criurpc.CriuReqType_RESTORE:
|
|
case t == criurpc.CriuReqType_DUMP:
|
|
case t == criurpc.CriuReqType_PRE_DUMP:
|
|
default:
|
|
return fmt.Errorf("unable to parse the response %s", resp.String())
|
|
}
|
|
|
|
break
|
|
}
|
|
|
|
_ = criuClientCon.CloseWrite()
|
|
// cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
|
|
// Here we want to wait only the CRIU process.
|
|
criuProcessState, err = criuProcess.Wait()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// In pre-dump mode CRIU is in a loop and waits for
|
|
// the final DUMP command.
|
|
// The current runc pre-dump approach, however, is
|
|
// start criu in PRE_DUMP once for a single pre-dump
|
|
// and not the whole series of pre-dump, pre-dump, ...m, dump
|
|
// If we got the message CriuReqType_PRE_DUMP it means
|
|
// CRIU was successful and we need to forcefully stop CRIU
|
|
if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP {
|
|
return fmt.Errorf("criu failed: %s\nlog file: %s", criuProcessState.String(), logPath)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// block any external network activity
|
|
func lockNetwork(config *configs.Config) error {
|
|
for _, config := range config.Networks {
|
|
strategy, err := getStrategy(config.Type)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := strategy.detach(config); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func unlockNetwork(config *configs.Config) error {
|
|
for _, config := range config.Networks {
|
|
strategy, err := getStrategy(config.Type)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err = strategy.attach(config); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error {
|
|
notify := resp.GetNotify()
|
|
if notify == nil {
|
|
return fmt.Errorf("invalid response: %s", resp.String())
|
|
}
|
|
script := notify.GetScript()
|
|
logrus.Debugf("notify: %s\n", script)
|
|
switch script {
|
|
case "post-dump":
|
|
f, err := os.Create(filepath.Join(c.root, "checkpoint"))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
f.Close()
|
|
case "network-unlock":
|
|
if err := unlockNetwork(c.config); err != nil {
|
|
return err
|
|
}
|
|
case "network-lock":
|
|
if err := lockNetwork(c.config); err != nil {
|
|
return err
|
|
}
|
|
case "setup-namespaces":
|
|
if c.config.Hooks != nil {
|
|
s, err := c.currentOCIState()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
s.Pid = int(notify.GetPid())
|
|
|
|
if err := c.config.Hooks[configs.Prestart].RunHooks(s); err != nil {
|
|
return err
|
|
}
|
|
if err := c.config.Hooks[configs.CreateRuntime].RunHooks(s); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
case "post-restore":
|
|
pid := notify.GetPid()
|
|
|
|
p, err := os.FindProcess(int(pid))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
cmd.Process = p
|
|
|
|
r, err := newRestoredProcess(cmd, fds)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
process.ops = r
|
|
if err := c.state.transition(&restoredState{
|
|
imageDir: opts.ImagesDirectory,
|
|
c: c,
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
// create a timestamp indicating when the restored checkpoint was started
|
|
c.created = time.Now().UTC()
|
|
if _, err := c.updateState(r); err != nil {
|
|
return err
|
|
}
|
|
if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
|
|
if !os.IsNotExist(err) {
|
|
logrus.Error(err)
|
|
}
|
|
}
|
|
case "orphan-pts-master":
|
|
scm, err := unix.ParseSocketControlMessage(oob)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
fds, err := unix.ParseUnixRights(&scm[0])
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
master := os.NewFile(uintptr(fds[0]), "orphan-pts-master")
|
|
defer master.Close()
|
|
|
|
// While we can access console.master, using the API is a good idea.
|
|
if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
|
|
return err
|
|
}
|
|
case "status-ready":
|
|
if opts.StatusFd != -1 {
|
|
// write \0 to status fd to notify that lazy page server is ready
|
|
_, err := unix.Write(opts.StatusFd, []byte{0})
|
|
if err != nil {
|
|
logrus.Warnf("can't write \\0 to status fd: %v", err)
|
|
}
|
|
_ = unix.Close(opts.StatusFd)
|
|
opts.StatusFd = -1
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
|
|
if process != nil {
|
|
c.initProcess = process
|
|
}
|
|
state, err := c.currentState()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
err = c.saveState(state)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return state, nil
|
|
}
|
|
|
|
func (c *linuxContainer) saveState(s *State) (retErr error) {
|
|
tmpFile, err := os.CreateTemp(c.root, "state-")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
defer func() {
|
|
if retErr != nil {
|
|
tmpFile.Close()
|
|
os.Remove(tmpFile.Name())
|
|
}
|
|
}()
|
|
|
|
err = utils.WriteJSON(tmpFile, s)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = tmpFile.Close()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
stateFilePath := filepath.Join(c.root, stateFilename)
|
|
return os.Rename(tmpFile.Name(), stateFilePath)
|
|
}
|
|
|
|
func (c *linuxContainer) currentStatus() (Status, error) {
|
|
if err := c.refreshState(); err != nil {
|
|
return -1, err
|
|
}
|
|
return c.state.status(), nil
|
|
}
|
|
|
|
// refreshState needs to be called to verify that the current state on the
|
|
// container is what is true. Because consumers of libcontainer can use it
|
|
// out of process we need to verify the container's status based on runtime
|
|
// information and not rely on our in process info.
|
|
func (c *linuxContainer) refreshState() error {
|
|
paused, err := c.isPaused()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if paused {
|
|
return c.state.transition(&pausedState{c: c})
|
|
}
|
|
t := c.runType()
|
|
switch t {
|
|
case Created:
|
|
return c.state.transition(&createdState{c: c})
|
|
case Running:
|
|
return c.state.transition(&runningState{c: c})
|
|
}
|
|
return c.state.transition(&stoppedState{c: c})
|
|
}
|
|
|
|
func (c *linuxContainer) runType() Status {
|
|
if c.initProcess == nil {
|
|
return Stopped
|
|
}
|
|
pid := c.initProcess.pid()
|
|
stat, err := system.Stat(pid)
|
|
if err != nil {
|
|
return Stopped
|
|
}
|
|
if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
|
|
return Stopped
|
|
}
|
|
// We'll create exec fifo and blocking on it after container is created,
|
|
// and delete it after start container.
|
|
if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
|
|
return Created
|
|
}
|
|
return Running
|
|
}
|
|
|
|
func (c *linuxContainer) isPaused() (bool, error) {
|
|
state, err := c.cgroupManager.GetFreezerState()
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
return state == configs.Frozen, nil
|
|
}
|
|
|
|
func (c *linuxContainer) currentState() (*State, error) {
|
|
var (
|
|
startTime uint64
|
|
externalDescriptors []string
|
|
pid = -1
|
|
)
|
|
if c.initProcess != nil {
|
|
pid = c.initProcess.pid()
|
|
startTime, _ = c.initProcess.startTime()
|
|
externalDescriptors = c.initProcess.externalDescriptors()
|
|
}
|
|
|
|
intelRdtPath := ""
|
|
if c.intelRdtManager != nil {
|
|
intelRdtPath = c.intelRdtManager.GetPath()
|
|
}
|
|
state := &State{
|
|
BaseState: BaseState{
|
|
ID: c.ID(),
|
|
Config: *c.config,
|
|
InitProcessPid: pid,
|
|
InitProcessStartTime: startTime,
|
|
Created: c.created,
|
|
},
|
|
Rootless: c.config.RootlessEUID && c.config.RootlessCgroups,
|
|
CgroupPaths: c.cgroupManager.GetPaths(),
|
|
IntelRdtPath: intelRdtPath,
|
|
NamespacePaths: make(map[configs.NamespaceType]string),
|
|
ExternalDescriptors: externalDescriptors,
|
|
}
|
|
if pid > 0 {
|
|
for _, ns := range c.config.Namespaces {
|
|
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
|
|
}
|
|
for _, nsType := range configs.NamespaceTypes() {
|
|
if !configs.IsNamespaceSupported(nsType) {
|
|
continue
|
|
}
|
|
if _, ok := state.NamespacePaths[nsType]; !ok {
|
|
ns := configs.Namespace{Type: nsType}
|
|
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
|
|
}
|
|
}
|
|
}
|
|
return state, nil
|
|
}
|
|
|
|
func (c *linuxContainer) currentOCIState() (*specs.State, error) {
|
|
bundle, annotations := utils.Annotations(c.config.Labels)
|
|
state := &specs.State{
|
|
Version: specs.Version,
|
|
ID: c.ID(),
|
|
Bundle: bundle,
|
|
Annotations: annotations,
|
|
}
|
|
status, err := c.currentStatus()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
state.Status = specs.ContainerState(status.String())
|
|
if status != Stopped {
|
|
if c.initProcess != nil {
|
|
state.Pid = c.initProcess.pid()
|
|
}
|
|
}
|
|
return state, nil
|
|
}
|
|
|
|
// orderNamespacePaths sorts namespace paths into a list of paths that we
|
|
// can setns in order.
|
|
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
|
|
paths := []string{}
|
|
for _, ns := range configs.NamespaceTypes() {
|
|
|
|
// Remove namespaces that we don't need to join.
|
|
if !c.config.Namespaces.Contains(ns) {
|
|
continue
|
|
}
|
|
|
|
if p, ok := namespaces[ns]; ok && p != "" {
|
|
// check if the requested namespace is supported
|
|
if !configs.IsNamespaceSupported(ns) {
|
|
return nil, fmt.Errorf("namespace %s is not supported", ns)
|
|
}
|
|
// only set to join this namespace if it exists
|
|
if _, err := os.Lstat(p); err != nil {
|
|
return nil, fmt.Errorf("namespace path: %w", err)
|
|
}
|
|
// do not allow namespace path with comma as we use it to separate
|
|
// the namespace paths
|
|
if strings.ContainsRune(p, ',') {
|
|
return nil, fmt.Errorf("invalid namespace path %s", p)
|
|
}
|
|
paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
|
|
}
|
|
|
|
}
|
|
|
|
return paths, nil
|
|
}
|
|
|
|
func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
|
|
data := bytes.NewBuffer(nil)
|
|
for _, im := range idMap {
|
|
line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
|
|
if _, err := data.WriteString(line); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
return data.Bytes(), nil
|
|
}
|
|
|
|
// netlinkError is an error wrapper type for use by custom netlink message
|
|
// types. Panics with errors are wrapped in netlinkError so that the recover
|
|
// in bootstrapData can distinguish intentional panics.
|
|
type netlinkError struct{ error }
|
|
|
|
// bootstrapData encodes the necessary data in netlink binary format
|
|
// as a io.Reader.
|
|
// Consumer can write the data to a bootstrap program
|
|
// such as one that uses nsenter package to bootstrap the container's
|
|
// init process correctly, i.e. with correct namespaces, uid/gid
|
|
// mapping etc.
|
|
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
|
|
// create the netlink message
|
|
r := nl.NewNetlinkRequest(int(InitMsg), 0)
|
|
|
|
// Our custom messages cannot bubble up an error using returns, instead
|
|
// they will panic with the specific error type, netlinkError. In that
|
|
// case, recover from the panic and return that as an error.
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
if e, ok := r.(netlinkError); ok {
|
|
Err = e.error
|
|
} else {
|
|
panic(r)
|
|
}
|
|
}
|
|
}()
|
|
|
|
// write cloneFlags
|
|
r.AddData(&Int32msg{
|
|
Type: CloneFlagsAttr,
|
|
Value: uint32(cloneFlags),
|
|
})
|
|
|
|
// write custom namespace paths
|
|
if len(nsMaps) > 0 {
|
|
nsPaths, err := c.orderNamespacePaths(nsMaps)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r.AddData(&Bytemsg{
|
|
Type: NsPathsAttr,
|
|
Value: []byte(strings.Join(nsPaths, ",")),
|
|
})
|
|
}
|
|
|
|
// write namespace paths only when we are not joining an existing user ns
|
|
_, joinExistingUser := nsMaps[configs.NEWUSER]
|
|
if !joinExistingUser {
|
|
// write uid mappings
|
|
if len(c.config.UidMappings) > 0 {
|
|
if c.config.RootlessEUID {
|
|
// We resolve the paths for new{u,g}idmap from
|
|
// the context of runc to avoid doing a path
|
|
// lookup in the nsexec context.
|
|
if path, err := execabs.LookPath("newuidmap"); err == nil {
|
|
r.AddData(&Bytemsg{
|
|
Type: UidmapPathAttr,
|
|
Value: []byte(path),
|
|
})
|
|
}
|
|
}
|
|
b, err := encodeIDMapping(c.config.UidMappings)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r.AddData(&Bytemsg{
|
|
Type: UidmapAttr,
|
|
Value: b,
|
|
})
|
|
}
|
|
|
|
// write gid mappings
|
|
if len(c.config.GidMappings) > 0 {
|
|
b, err := encodeIDMapping(c.config.GidMappings)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r.AddData(&Bytemsg{
|
|
Type: GidmapAttr,
|
|
Value: b,
|
|
})
|
|
if c.config.RootlessEUID {
|
|
if path, err := execabs.LookPath("newgidmap"); err == nil {
|
|
r.AddData(&Bytemsg{
|
|
Type: GidmapPathAttr,
|
|
Value: []byte(path),
|
|
})
|
|
}
|
|
}
|
|
if requiresRootOrMappingTool(c.config) {
|
|
r.AddData(&Boolmsg{
|
|
Type: SetgroupAttr,
|
|
Value: true,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
if c.config.OomScoreAdj != nil {
|
|
// write oom_score_adj
|
|
r.AddData(&Bytemsg{
|
|
Type: OomScoreAdjAttr,
|
|
Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)),
|
|
})
|
|
}
|
|
|
|
// write rootless
|
|
r.AddData(&Boolmsg{
|
|
Type: RootlessEUIDAttr,
|
|
Value: c.config.RootlessEUID,
|
|
})
|
|
|
|
// Bind mount source to open.
|
|
if it == initStandard && c.shouldSendMountSources() {
|
|
var mounts []byte
|
|
for _, m := range c.config.Mounts {
|
|
if m.IsBind() {
|
|
if strings.IndexByte(m.Source, 0) >= 0 {
|
|
return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
|
|
}
|
|
mounts = append(mounts, []byte(m.Source)...)
|
|
}
|
|
mounts = append(mounts, byte(0))
|
|
}
|
|
|
|
r.AddData(&Bytemsg{
|
|
Type: MountSourcesAttr,
|
|
Value: mounts,
|
|
})
|
|
}
|
|
|
|
return bytes.NewReader(r.Serialize()), nil
|
|
}
|
|
|
|
// ignoreTerminateErrors returns nil if the given err matches an error known
|
|
// to indicate that the terminate occurred successfully or err was nil, otherwise
|
|
// err is returned unaltered.
|
|
func ignoreTerminateErrors(err error) error {
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
// terminate() might return an error from either Kill or Wait.
|
|
// The (*Cmd).Wait documentation says: "If the command fails to run
|
|
// or doesn't complete successfully, the error is of type *ExitError".
|
|
// Filter out such errors (like "exit status 1" or "signal: killed").
|
|
var exitErr *exec.ExitError
|
|
if errors.As(err, &exitErr) {
|
|
return nil
|
|
}
|
|
if errors.Is(err, os.ErrProcessDone) {
|
|
return nil
|
|
}
|
|
s := err.Error()
|
|
if strings.Contains(s, "Wait was already called") {
|
|
return nil
|
|
}
|
|
return err
|
|
}
|
|
|
|
func requiresRootOrMappingTool(c *configs.Config) bool {
|
|
gidMap := []configs.IDMap{
|
|
{ContainerID: 0, HostID: os.Getegid(), Size: 1},
|
|
}
|
|
return !reflect.DeepEqual(c.GidMappings, gidMap)
|
|
}
|