mirror of
				https://github.com/opencontainers/runc.git
				synced 2025-10-31 02:56:25 +08:00 
			
		
		
		
	 871057d863
			
		
	
	871057d863
	
	
	
		
			
			Because we have the overlay solution, we can drop the runc-dmz binary solution, since it has too many limitations. Signed-off-by: lifubang <lifubang@acmcoder.com>
		
			
				
	
	
		
			1163 lines
		
	
	
		
			34 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			1163 lines
		
	
	
		
			34 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package libcontainer
 | |
| 
 | |
| import (
 | |
| 	"bytes"
 | |
| 	"errors"
 | |
| 	"fmt"
 | |
| 	"io"
 | |
| 	"os"
 | |
| 	"os/exec"
 | |
| 	"path"
 | |
| 	"path/filepath"
 | |
| 	"reflect"
 | |
| 	"strconv"
 | |
| 	"strings"
 | |
| 	"sync"
 | |
| 	"time"
 | |
| 
 | |
| 	"github.com/opencontainers/runtime-spec/specs-go"
 | |
| 	"github.com/sirupsen/logrus"
 | |
| 	"github.com/vishvananda/netlink/nl"
 | |
| 	"golang.org/x/sys/unix"
 | |
| 
 | |
| 	"github.com/opencontainers/runc/libcontainer/cgroups"
 | |
| 	"github.com/opencontainers/runc/libcontainer/configs"
 | |
| 	"github.com/opencontainers/runc/libcontainer/dmz"
 | |
| 	"github.com/opencontainers/runc/libcontainer/intelrdt"
 | |
| 	"github.com/opencontainers/runc/libcontainer/system"
 | |
| 	"github.com/opencontainers/runc/libcontainer/utils"
 | |
| )
 | |
| 
 | |
// stdioFdCount is the number of stdio descriptors (stdin, stdout, stderr)
// inherited by "runc init"; fds passed via cmd.ExtraFiles are numbered
// starting right after them (see the _LIBCONTAINER_* env vars below).
const stdioFdCount = 3
 | |
| 
 | |
// Container is a libcontainer container object.
type Container struct {
	id                   string          // unique container ID (returned by ID())
	stateDir             string          // per-container state directory (holds state.json and the exec fifo)
	config               *configs.Config // container configuration
	cgroupManager        cgroups.Manager // manages the container's cgroups
	intelRdtManager      *intelrdt.Manager // Intel RDT manager; nil when RDT is not in use
	initProcess          parentProcess   // the container's init process, if any
	initProcessStartTime uint64          // start time of init, used to detect PID reuse
	m                    sync.Mutex      // protects concurrent access to container state
	criuVersion          int             // detected CRIU version (checkpoint/restore) — presumably set elsewhere; not used in this chunk
	state                containerState  // current state-machine state
	created              time.Time       // container creation time
	fifo                 *os.File        // O_PATH fd of the exec fifo (see includeExecFifo)
}
 | |
| 
 | |
// State represents a running container's state.
type State struct {
	// BaseState holds the platform-independent state fields
	// (see its definition for details).
	BaseState

	// Platform specific fields below here

	// Specified if the container was started under the rootless mode.
	// Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
	Rootless bool `json:"rootless"`

	// Paths to all the container's cgroups, as returned by (*cgroups.Manager).GetPaths
	//
	// For cgroup v1, a key is cgroup subsystem name, and the value is the path
	// to the cgroup for this subsystem.
	//
	// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
	CgroupPaths map[string]string `json:"cgroup_paths"`

	// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
	// with the value as the path.
	NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`

	// Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
	ExternalDescriptors []string `json:"external_descriptors,omitempty"`

	// Intel RDT "resource control" filesystem path
	IntelRdtPath string `json:"intel_rdt_path"`
}
 | |
| 
 | |
| // ID returns the container's unique ID
 | |
| func (c *Container) ID() string {
 | |
| 	return c.id
 | |
| }
 | |
| 
 | |
| // Config returns the container's configuration
 | |
| func (c *Container) Config() configs.Config {
 | |
| 	return *c.config
 | |
| }
 | |
| 
 | |
| // Status returns the current status of the container.
 | |
| func (c *Container) Status() (Status, error) {
 | |
| 	c.m.Lock()
 | |
| 	defer c.m.Unlock()
 | |
| 	return c.currentStatus()
 | |
| }
 | |
| 
 | |
| // State returns the current container's state information.
 | |
| func (c *Container) State() (*State, error) {
 | |
| 	c.m.Lock()
 | |
| 	defer c.m.Unlock()
 | |
| 	return c.currentState(), nil
 | |
| }
 | |
| 
 | |
| // OCIState returns the current container's state information.
 | |
| func (c *Container) OCIState() (*specs.State, error) {
 | |
| 	c.m.Lock()
 | |
| 	defer c.m.Unlock()
 | |
| 	return c.currentOCIState()
 | |
| }
 | |
| 
 | |
| // ignoreCgroupError filters out cgroup-related errors that can be ignored,
 | |
| // because the container is stopped and its cgroup is gone.
 | |
| func (c *Container) ignoreCgroupError(err error) error {
 | |
| 	if err == nil {
 | |
| 		return nil
 | |
| 	}
 | |
| 	if errors.Is(err, os.ErrNotExist) && !c.hasInit() && !c.cgroupManager.Exists() {
 | |
| 		return nil
 | |
| 	}
 | |
| 	return err
 | |
| }
 | |
| 
 | |
| // Processes returns the PIDs inside this container. The PIDs are in the
 | |
| // namespace of the calling process.
 | |
| //
 | |
| // Some of the returned PIDs may no longer refer to processes in the container,
 | |
| // unless the container state is PAUSED in which case every PID in the slice is
 | |
| // valid.
 | |
| func (c *Container) Processes() ([]int, error) {
 | |
| 	pids, err := c.cgroupManager.GetAllPids()
 | |
| 	if err = c.ignoreCgroupError(err); err != nil {
 | |
| 		return nil, fmt.Errorf("unable to get all container pids: %w", err)
 | |
| 	}
 | |
| 	return pids, nil
 | |
| }
 | |
| 
 | |
| // Stats returns statistics for the container.
 | |
| func (c *Container) Stats() (*Stats, error) {
 | |
| 	var (
 | |
| 		err   error
 | |
| 		stats = &Stats{}
 | |
| 	)
 | |
| 	if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
 | |
| 		return stats, fmt.Errorf("unable to get container cgroup stats: %w", err)
 | |
| 	}
 | |
| 	if c.intelRdtManager != nil {
 | |
| 		if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
 | |
| 			return stats, fmt.Errorf("unable to get container Intel RDT stats: %w", err)
 | |
| 		}
 | |
| 	}
 | |
| 	for _, iface := range c.config.Networks {
 | |
| 		switch iface.Type {
 | |
| 		case "veth":
 | |
| 			istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
 | |
| 			if err != nil {
 | |
| 				return stats, fmt.Errorf("unable to get network stats for interface %q: %w", iface.HostInterfaceName, err)
 | |
| 			}
 | |
| 			stats.Interfaces = append(stats.Interfaces, istats)
 | |
| 		}
 | |
| 	}
 | |
| 	return stats, nil
 | |
| }
 | |
| 
 | |
// Set resources of container as configured. Can be used to change resources
// when the container is running.
//
// On failure the previously-applied cgroup (and Intel RDT) configuration is
// restored on a best-effort basis; a failed rollback is only logged, which
// can leave state.json and the actual configuration inconsistent.
func (c *Container) Set(config configs.Config) error {
	c.m.Lock()
	defer c.m.Unlock()
	status, err := c.currentStatus()
	if err != nil {
		return err
	}
	if status == Stopped {
		return ErrNotRunning
	}
	if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil {
		// Roll back to the old cgroup configuration (best effort).
		if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
			logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
		}
		return err
	}
	if c.intelRdtManager != nil {
		if err := c.intelRdtManager.Set(&config); err != nil {
			// Roll back both cgroup and Intel RDT configuration,
			// since the cgroup change above already succeeded.
			if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
				logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
			}
			if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
				logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
			}
			return err
		}
	}
	// After config setting succeed, update config and states
	c.config = &config
	_, err = c.updateState(nil)
	return err
}
 | |
| 
 | |
| // Start starts a process inside the container. Returns error if process fails
 | |
| // to start. You can track process lifecycle with passed Process structure.
 | |
| func (c *Container) Start(process *Process) error {
 | |
| 	c.m.Lock()
 | |
| 	defer c.m.Unlock()
 | |
| 	return c.start(process)
 | |
| }
 | |
| 
 | |
| // Run immediately starts the process inside the container. Returns an error if
 | |
| // the process fails to start. It does not block waiting for the exec fifo
 | |
| // after start returns but opens the fifo after start returns.
 | |
| func (c *Container) Run(process *Process) error {
 | |
| 	c.m.Lock()
 | |
| 	defer c.m.Unlock()
 | |
| 	if err := c.start(process); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if process.Init {
 | |
| 		return c.exec()
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // Exec signals the container to exec the users process at the end of the init.
 | |
| func (c *Container) Exec() error {
 | |
| 	c.m.Lock()
 | |
| 	defer c.m.Unlock()
 | |
| 	return c.exec()
 | |
| }
 | |
| 
 | |
// exec blocks on the exec fifo in the state dir until "runc init" writes to
// it (which releases the container's user process), while periodically
// checking that the init process is still alive.
func (c *Container) exec() error {
	path := filepath.Join(c.stateDir, execFifoFilename)
	pid := c.initProcess.pid()
	// Blocking open in a goroutine; it completes once the writer opens
	// the fifo from the other end.
	blockingFifoOpenCh := awaitFifoOpen(path)
	for {
		select {
		case result := <-blockingFifoOpenCh:
			return handleFifoResult(result)

		case <-time.After(time.Millisecond * 100):
			// Liveness check: if init died (or was reaped into a
			// zombie), the blocking open above may never return.
			stat, err := system.Stat(pid)
			if err != nil || stat.State == system.Zombie {
				// could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check.
				// see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete).
				if err := handleFifoResult(fifoOpen(path, false)); err != nil {
					return errors.New("container process is already dead")
				}
				return nil
			}
		}
	}
}
 | |
| 
 | |
| func readFromExecFifo(execFifo io.Reader) error {
 | |
| 	data, err := io.ReadAll(execFifo)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if len(data) <= 0 {
 | |
| 		return errors.New("cannot start an already running container")
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func awaitFifoOpen(path string) <-chan openResult {
 | |
| 	fifoOpened := make(chan openResult)
 | |
| 	go func() {
 | |
| 		result := fifoOpen(path, true)
 | |
| 		fifoOpened <- result
 | |
| 	}()
 | |
| 	return fifoOpened
 | |
| }
 | |
| 
 | |
| func fifoOpen(path string, block bool) openResult {
 | |
| 	flags := os.O_RDONLY
 | |
| 	if !block {
 | |
| 		flags |= unix.O_NONBLOCK
 | |
| 	}
 | |
| 	f, err := os.OpenFile(path, flags, 0)
 | |
| 	if err != nil {
 | |
| 		return openResult{err: fmt.Errorf("exec fifo: %w", err)}
 | |
| 	}
 | |
| 	return openResult{file: f}
 | |
| }
 | |
| 
 | |
| func handleFifoResult(result openResult) error {
 | |
| 	if result.err != nil {
 | |
| 		return result.err
 | |
| 	}
 | |
| 	f := result.file
 | |
| 	defer f.Close()
 | |
| 	if err := readFromExecFifo(f); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	return os.Remove(f.Name())
 | |
| }
 | |
| 
 | |
// openResult is the outcome of an attempt to open the exec fifo:
// exactly one of file (the opened fifo) or err is set.
type openResult struct {
	file *os.File
	err  error
}
 | |
| 
 | |
// start creates and starts the parent ("runc init") process for the given
// Process. For an init process it also creates the exec fifo and, after a
// successful start, runs the configured Poststart hooks. Callers must hold
// c.m.
func (c *Container) start(process *Process) (retErr error) {
	if c.config.Cgroups.Resources.SkipDevices {
		return errors.New("can't start container with SkipDevices set")
	}
	if process.Init {
		if c.initProcessStartTime != 0 {
			return errors.New("container already has init process")
		}
		if err := c.createExecFifo(); err != nil {
			return err
		}
		// Clean up the fifo if anything below fails.
		defer func() {
			if retErr != nil {
				c.deleteExecFifo()
			}
		}()
	}

	parent, err := c.newParentProcess(process)
	if err != nil {
		return fmt.Errorf("unable to create new parent process: %w", err)
	}
	// We do not need the cloned binaries once the process is spawned.
	defer process.closeClonedExes()

	logsDone := parent.forwardChildLogs()
	if logsDone != nil {
		defer func() {
			// Wait for log forwarder to finish. This depends on
			// runc init closing the _LIBCONTAINER_LOGPIPE log fd.
			err := <-logsDone
			if err != nil && retErr == nil {
				retErr = fmt.Errorf("unable to forward init logs: %w", err)
			}
		}()
	}

	// Before starting "runc init", mark all non-stdio open files as O_CLOEXEC
	// to make sure we don't leak any files into "runc init". Any files to be
	// passed to "runc init" through ExtraFiles will get dup2'd by the Go
	// runtime and thus their O_CLOEXEC flag will be cleared. This is some
	// additional protection against attacks like CVE-2024-21626, by making
	// sure we never leak files to "runc init" we didn't intend to.
	if err := utils.CloseExecFrom(3); err != nil {
		return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
	}
	if err := parent.start(); err != nil {
		return fmt.Errorf("unable to start container process: %w", err)
	}

	if process.Init {
		// Our O_PATH copy of the fifo is no longer needed in this
		// process once the child has it.
		c.fifo.Close()
		if c.config.Hooks != nil {
			s, err := c.currentOCIState()
			if err != nil {
				return err
			}

			if err := c.config.Hooks.Run(configs.Poststart, s); err != nil {
				// A failed Poststart hook aborts the start:
				// terminate the just-started process (best effort).
				if err := ignoreTerminateErrors(parent.terminate()); err != nil {
					logrus.Warn(fmt.Errorf("error running poststart hook: %w", err))
				}
				return err
			}
		}
	}
	return nil
}
 | |
| 
 | |
// Signal sends a specified signal to container's init.
//
// When s is SIGKILL and the container does not have its own PID namespace, all
// the container's processes are killed. In this scenario, the libcontainer
// user may be required to implement a proper child reaper.
func (c *Container) Signal(s os.Signal) error {
	c.m.Lock()
	defer c.m.Unlock()

	// When a container has its own PID namespace, inside it the init PID
	// is 1, and thus it is handled specially by the kernel. In particular,
	// killing init with SIGKILL from an ancestor namespace will also kill
	// all other processes in that PID namespace (see pid_namespaces(7)).
	//
	// OTOH, if PID namespace is shared, we should kill all pids to avoid
	// leftover processes. Handle this special case here.
	if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) {
		if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
			if c.config.RootlessCgroups { // may not have an access to cgroup
				logrus.WithError(err).Warn("failed to kill all processes, possibly due to lack of cgroup (Hint: enable cgroup v2 delegation)")
				// Some processes may leak when cgroup is not delegated
				// https://github.com/opencontainers/runc/pull/4395#pullrequestreview-2291179652
				return c.signal(s)
			}
			// For not rootless container, if there is no init process and no cgroup,
			// it means that the container is not running.
			if errors.Is(err, ErrCgroupNotExist) && !c.hasInit() {
				err = ErrNotRunning
			}
			return fmt.Errorf("unable to kill all processes: %w", err)
		}
		return nil
	}

	// Common case: signal only the init process.
	return c.signal(s)
}
 | |
| 
 | |
| func (c *Container) signal(s os.Signal) error {
 | |
| 	// To avoid a PID reuse attack, don't kill non-running container.
 | |
| 	if !c.hasInit() {
 | |
| 		return ErrNotRunning
 | |
| 	}
 | |
| 	if err := c.initProcess.signal(s); err != nil {
 | |
| 		return fmt.Errorf("unable to signal init: %w", err)
 | |
| 	}
 | |
| 	if s == unix.SIGKILL {
 | |
| 		// For cgroup v1, killing a process in a frozen cgroup
 | |
| 		// does nothing until it's thawed. Only thaw the cgroup
 | |
| 		// for SIGKILL.
 | |
| 		if paused, _ := c.isPaused(); paused {
 | |
| 			_ = c.cgroupManager.Freeze(configs.Thawed)
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func (c *Container) createExecFifo() (retErr error) {
 | |
| 	rootuid, err := c.Config().HostRootUID()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	rootgid, err := c.Config().HostRootGID()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	fifoName := filepath.Join(c.stateDir, execFifoFilename)
 | |
| 	if err := unix.Mkfifo(fifoName, 0o622); err != nil {
 | |
| 		return &os.PathError{Op: "mkfifo", Path: fifoName, Err: err}
 | |
| 	}
 | |
| 	defer func() {
 | |
| 		if retErr != nil {
 | |
| 			os.Remove(fifoName)
 | |
| 		}
 | |
| 	}()
 | |
| 	// Ensure permission bits (can be different because of umask).
 | |
| 	if err := os.Chmod(fifoName, 0o622); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	return os.Chown(fifoName, rootuid, rootgid)
 | |
| }
 | |
| 
 | |
| func (c *Container) deleteExecFifo() {
 | |
| 	fifoName := filepath.Join(c.stateDir, execFifoFilename)
 | |
| 	os.Remove(fifoName)
 | |
| }
 | |
| 
 | |
| // includeExecFifo opens the container's execfifo as a pathfd, so that the
 | |
| // container cannot access the statedir (and the FIFO itself remains
 | |
| // un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
 | |
| // fd, with _LIBCONTAINER_FIFOFD set to its fd number.
 | |
| func (c *Container) includeExecFifo(cmd *exec.Cmd) error {
 | |
| 	fifoName := filepath.Join(c.stateDir, execFifoFilename)
 | |
| 	fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	c.fifo = fifo
 | |
| 
 | |
| 	cmd.ExtraFiles = append(cmd.ExtraFiles, fifo)
 | |
| 	cmd.Env = append(cmd.Env,
 | |
| 		"_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
// newParentProcess builds the exec.Cmd for "runc init" (using a safe clone
// of /proc/self/exe), wires up all communication fds and their
// _LIBCONTAINER_* environment variables, and returns either an initProcess
// (p.Init) or a setnsProcess. The order of appends to cmd.ExtraFiles
// matters: each fd number passed to the child is derived from the slice
// position at the time of the append.
func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
	comm, err := newProcessComm()
	if err != nil {
		return nil, err
	}

	// Make sure we use a new safe copy of /proc/self/exe binary each time, this
	// is called to make sure that if a container manages to overwrite the file,
	// it cannot affect other containers on the system. For runc, this code will
	// only ever be called once, but libcontainer users might call this more than
	// once.
	p.closeClonedExes()
	var (
		exePath string
		safeExe *os.File
	)
	if dmz.IsSelfExeCloned() {
		// /proc/self/exe is already a cloned binary -- no need to do anything
		logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
		// We don't need to use /proc/thread-self here because the exe mm of a
		// thread-group is guaranteed to be the same for all threads by
		// definition. This lets us avoid having to do runtime.LockOSThread.
		exePath = "/proc/self/exe"
	} else {
		var err error
		safeExe, err = dmz.CloneSelfExe(c.stateDir)
		if err != nil {
			return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
		}
		exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
		p.clonedExes = append(p.clonedExes, safeExe)
		logrus.Debug("runc-dmz: using /proc/self/exe clone") // used for tests
	}

	cmd := exec.Command(exePath, "init")
	cmd.Args[0] = os.Args[0]
	cmd.Stdin = p.Stdin
	cmd.Stdout = p.Stdout
	cmd.Stderr = p.Stderr
	cmd.Dir = c.config.Rootfs
	if cmd.SysProcAttr == nil {
		cmd.SysProcAttr = &unix.SysProcAttr{}
	}
	cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))
	cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
	if p.ConsoleSocket != nil {
		cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
		cmd.Env = append(cmd.Env,
			"_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
		)
	}

	// init pipe: used to send the bootstrap data to the child.
	cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild)
	cmd.Env = append(cmd.Env,
		"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
	)
	// sync pipe: used for parent/child synchronization.
	cmd.ExtraFiles = append(cmd.ExtraFiles, comm.syncSockChild.File())
	cmd.Env = append(cmd.Env,
		"_LIBCONTAINER_SYNCPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
	)

	// log pipe: child's logrus output is forwarded through this fd.
	cmd.ExtraFiles = append(cmd.ExtraFiles, comm.logPipeChild)
	cmd.Env = append(cmd.Env,
		"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
	if p.LogLevel != "" {
		cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel)
	}

	if p.PidfdSocket != nil {
		cmd.ExtraFiles = append(cmd.ExtraFiles, p.PidfdSocket)
		cmd.Env = append(cmd.Env,
			"_LIBCONTAINER_PIDFD_SOCK="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
		)
	}

	// TODO: After https://go-review.googlesource.com/c/go/+/515799 included
	// in go versions supported by us, we can remove this logic.
	if safeExe != nil {
		// Due to a Go stdlib bug, we need to add safeExe to the set of
		// ExtraFiles otherwise it is possible for the stdlib to clobber the fd
		// during forkAndExecInChild1 and replace it with some other file that
		// might be malicious. This is less than ideal (because the descriptor
		// will be non-O_CLOEXEC) however we have protections in "runc init" to
		// stop us from leaking extra file descriptors.
		//
		// See <https://github.com/golang/go/issues/61751>.
		cmd.ExtraFiles = append(cmd.ExtraFiles, safeExe)

		// There is a race situation when we are opening a file, if there is a
		// small fd was closed at that time, maybe it will be reused by safeExe.
		// Because of Go stdlib fds shuffling bug, if the fd of safeExe is too
		// small, go stdlib will dup3 it to another fd, or dup3 a other fd to this
		// fd, then it will cause the fd type cmd.Path refers to a random path,
		// and it can lead to an error "permission denied" when starting the process.
		// Please see #4294.
		// So we should not use the original fd of safeExe, but use the fd after
		// shuffled by Go stdlib. Because Go stdlib will guarantee this fd refers to
		// the correct file.
		cmd.Path = "/proc/self/fd/" + strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)
	}

	// NOTE: when running a container with no PID namespace and the parent
	//       process spawning the container is PID1 the pdeathsig is being
	//       delivered to the container's init process by the kernel for some
	//       reason even with the parent still running.
	if c.config.ParentDeathSignal > 0 {
		cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
	}

	if p.Init {
		// We only set up fifoFd if we're not doing a `runc exec`. The historic
		// reason for this is that previously we would pass a dirfd that allowed
		// for container rootfs escape (and not doing it in `runc exec` avoided
		// that problem), but we no longer do that. However, there's no need to do
		// this for `runc exec` so we just keep it this way to be safe.
		if err := c.includeExecFifo(cmd); err != nil {
			return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
		}
		return c.newInitProcess(p, cmd, comm)
	}
	return c.newSetnsProcess(p, cmd, comm)
}
 | |
| 
 | |
| func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) {
 | |
| 	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
 | |
| 	nsMaps := make(map[configs.NamespaceType]string)
 | |
| 	for _, ns := range c.config.Namespaces {
 | |
| 		if ns.Path != "" {
 | |
| 			nsMaps[ns.Type] = ns.Path
 | |
| 		}
 | |
| 	}
 | |
| 	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 
 | |
| 	init := &initProcess{
 | |
| 		cmd:             cmd,
 | |
| 		comm:            comm,
 | |
| 		manager:         c.cgroupManager,
 | |
| 		intelRdtManager: c.intelRdtManager,
 | |
| 		config:          c.newInitConfig(p),
 | |
| 		container:       c,
 | |
| 		process:         p,
 | |
| 		bootstrapData:   data,
 | |
| 	}
 | |
| 	c.initProcess = init
 | |
| 	return init, nil
 | |
| }
 | |
| 
 | |
// newSetnsProcess wraps cmd into a setnsProcess for joining an existing
// container (`runc exec`). Namespaces are joined via setns(2), so no clone
// flags are passed in the bootstrap data. If p.SubCgroupPaths is set, the
// target cgroup paths are adjusted, rejecting any path that would escape
// the container's cgroup.
func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*setnsProcess, error) {
	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
	state := c.currentState()
	// for setns process, we don't have to set cloneflags as the process namespaces
	// will only be set via setns syscall
	data, err := c.bootstrapData(0, state.NamespacePaths)
	if err != nil {
		return nil, err
	}
	proc := &setnsProcess{
		cmd:             cmd,
		cgroupPaths:     state.CgroupPaths,
		rootlessCgroups: c.config.RootlessCgroups,
		intelRdtPath:    state.IntelRdtPath,
		comm:            comm,
		manager:         c.cgroupManager,
		config:          c.newInitConfig(p),
		process:         p,
		bootstrapData:   data,
		initProcessPid:  state.InitProcessPid,
	}
	if len(p.SubCgroupPaths) > 0 {
		if add, ok := p.SubCgroupPaths[""]; ok {
			// cgroup v1: using the same path for all controllers.
			// cgroup v2: the only possible way.
			for k := range proc.cgroupPaths {
				subPath := path.Join(proc.cgroupPaths[k], add)
				// path.Join cleans ".." components; a result outside
				// the original path means an attempted escape.
				if !strings.HasPrefix(subPath, proc.cgroupPaths[k]) {
					return nil, fmt.Errorf("%s is not a sub cgroup path", add)
				}
				proc.cgroupPaths[k] = subPath
			}
			// cgroup v2: do not try to join init process's cgroup
			// as a fallback (see (*setnsProcess).start).
			proc.initProcessPid = 0
		} else {
			// Per-controller paths.
			for ctrl, add := range p.SubCgroupPaths {
				if val, ok := proc.cgroupPaths[ctrl]; ok {
					subPath := path.Join(val, add)
					if !strings.HasPrefix(subPath, val) {
						return nil, fmt.Errorf("%s is not a sub cgroup path", add)
					}
					proc.cgroupPaths[ctrl] = subPath
				} else {
					return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
				}
			}
		}
	}
	return proc, nil
}
 | |
| 
 | |
// newInitConfig assembles the initConfig sent to "runc init": container-wide
// settings from c.config, overlaid with per-process settings from process
// (NoNewPrivileges, AppArmor profile, SELinux label and rlimits can be
// overridden per process).
func (c *Container) newInitConfig(process *Process) *initConfig {
	cfg := &initConfig{
		Config:           c.config,
		Args:             process.Args,
		Env:              process.Env,
		User:             process.User,
		AdditionalGroups: process.AdditionalGroups,
		Cwd:              process.Cwd,
		Capabilities:     process.Capabilities,
		PassedFilesCount: len(process.ExtraFiles),
		ContainerID:      c.ID(),
		NoNewPrivileges:  c.config.NoNewPrivileges,
		RootlessEUID:     c.config.RootlessEUID,
		RootlessCgroups:  c.config.RootlessCgroups,
		AppArmorProfile:  c.config.AppArmorProfile,
		ProcessLabel:     c.config.ProcessLabel,
		Rlimits:          c.config.Rlimits,
		CreateConsole:    process.ConsoleSocket != nil,
		ConsoleWidth:     process.ConsoleWidth,
		ConsoleHeight:    process.ConsoleHeight,
	}
	// Per-process overrides of container-wide defaults.
	if process.NoNewPrivileges != nil {
		cfg.NoNewPrivileges = *process.NoNewPrivileges
	}
	if process.AppArmorProfile != "" {
		cfg.AppArmorProfile = process.AppArmorProfile
	}
	if process.Label != "" {
		cfg.ProcessLabel = process.Label
	}
	if len(process.Rlimits) > 0 {
		cfg.Rlimits = process.Rlimits
	}
	if cgroups.IsCgroup2UnifiedMode() {
		cfg.Cgroup2Path = c.cgroupManager.Path("")
	}

	return cfg
}
 | |
| 
 | |
| // Destroy destroys the container, if its in a valid state.
 | |
| //
 | |
| // Any event registrations are removed before the container is destroyed.
 | |
| // No error is returned if the container is already destroyed.
 | |
| //
 | |
| // Running containers must first be stopped using Signal.
 | |
| // Paused containers must first be resumed using Resume.
 | |
| func (c *Container) Destroy() error {
 | |
| 	c.m.Lock()
 | |
| 	defer c.m.Unlock()
 | |
| 	if err := c.state.destroy(); err != nil {
 | |
| 		return fmt.Errorf("unable to destroy container: %w", err)
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // Pause pauses the container, if its state is RUNNING or CREATED, changing
 | |
| // its state to PAUSED. If the state is already PAUSED, does nothing.
 | |
| func (c *Container) Pause() error {
 | |
| 	c.m.Lock()
 | |
| 	defer c.m.Unlock()
 | |
| 	status, err := c.currentStatus()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	switch status {
 | |
| 	case Running, Created:
 | |
| 		if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		return c.state.transition(&pausedState{
 | |
| 			c: c,
 | |
| 		})
 | |
| 	}
 | |
| 	return ErrNotRunning
 | |
| }
 | |
| 
 | |
| // Resume resumes the execution of any user processes in the
 | |
| // container before setting the container state to RUNNING.
 | |
| // This is only performed if the current state is PAUSED.
 | |
| // If the Container state is RUNNING, does nothing.
 | |
| func (c *Container) Resume() error {
 | |
| 	c.m.Lock()
 | |
| 	defer c.m.Unlock()
 | |
| 	status, err := c.currentStatus()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if status != Paused {
 | |
| 		return ErrNotPaused
 | |
| 	}
 | |
| 	if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	return c.state.transition(&runningState{
 | |
| 		c: c,
 | |
| 	})
 | |
| }
 | |
| 
 | |
| // NotifyOOM returns a read-only channel signaling when the container receives
 | |
| // an OOM notification.
 | |
| func (c *Container) NotifyOOM() (<-chan struct{}, error) {
 | |
| 	// XXX(cyphar): This requires cgroups.
 | |
| 	if c.config.RootlessCgroups {
 | |
| 		logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
 | |
| 	}
 | |
| 	path := c.cgroupManager.Path("memory")
 | |
| 	if cgroups.IsCgroup2UnifiedMode() {
 | |
| 		return notifyOnOOMV2(path)
 | |
| 	}
 | |
| 	return notifyOnOOM(path)
 | |
| }
 | |
| 
 | |
| // NotifyMemoryPressure returns a read-only channel signaling when the
 | |
| // container reaches a given pressure level.
 | |
| func (c *Container) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
 | |
| 	// XXX(cyphar): This requires cgroups.
 | |
| 	if c.config.RootlessCgroups {
 | |
| 		logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
 | |
| 	}
 | |
| 	return notifyMemoryPressure(c.cgroupManager.Path("memory"), level)
 | |
| }
 | |
| 
 | |
| func (c *Container) updateState(process parentProcess) (*State, error) {
 | |
| 	if process != nil {
 | |
| 		c.initProcess = process
 | |
| 	}
 | |
| 	state := c.currentState()
 | |
| 	if err := c.saveState(state); err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	return state, nil
 | |
| }
 | |
| 
 | |
| func (c *Container) saveState(s *State) (retErr error) {
 | |
| 	tmpFile, err := os.CreateTemp(c.stateDir, "state-")
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	defer func() {
 | |
| 		if retErr != nil {
 | |
| 			tmpFile.Close()
 | |
| 			os.Remove(tmpFile.Name())
 | |
| 		}
 | |
| 	}()
 | |
| 
 | |
| 	err = utils.WriteJSON(tmpFile, s)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	err = tmpFile.Close()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	stateFilePath := filepath.Join(c.stateDir, stateFilename)
 | |
| 	return os.Rename(tmpFile.Name(), stateFilePath)
 | |
| }
 | |
| 
 | |
| func (c *Container) currentStatus() (Status, error) {
 | |
| 	if err := c.refreshState(); err != nil {
 | |
| 		return -1, err
 | |
| 	}
 | |
| 	return c.state.status(), nil
 | |
| }
 | |
| 
 | |
| // refreshState needs to be called to verify that the current state on the
 | |
| // container is what is true.  Because consumers of libcontainer can use it
 | |
| // out of process we need to verify the container's status based on runtime
 | |
| // information and not rely on our in process info.
 | |
| func (c *Container) refreshState() error {
 | |
| 	paused, err := c.isPaused()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if paused {
 | |
| 		return c.state.transition(&pausedState{c: c})
 | |
| 	}
 | |
| 	if !c.hasInit() {
 | |
| 		return c.state.transition(&stoppedState{c: c})
 | |
| 	}
 | |
| 	// The presence of exec fifo helps to distinguish between
 | |
| 	// the created and the running states.
 | |
| 	if _, err := os.Stat(filepath.Join(c.stateDir, execFifoFilename)); err == nil {
 | |
| 		return c.state.transition(&createdState{c: c})
 | |
| 	}
 | |
| 	return c.state.transition(&runningState{c: c})
 | |
| }
 | |
| 
 | |
| // hasInit tells whether the container init process exists.
 | |
| func (c *Container) hasInit() bool {
 | |
| 	if c.initProcess == nil {
 | |
| 		return false
 | |
| 	}
 | |
| 	pid := c.initProcess.pid()
 | |
| 	stat, err := system.Stat(pid)
 | |
| 	if err != nil {
 | |
| 		return false
 | |
| 	}
 | |
| 	if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
 | |
| 		return false
 | |
| 	}
 | |
| 	return true
 | |
| }
 | |
| 
 | |
| func (c *Container) isPaused() (bool, error) {
 | |
| 	state, err := c.cgroupManager.GetFreezerState()
 | |
| 	if err != nil {
 | |
| 		return false, err
 | |
| 	}
 | |
| 	return state == configs.Frozen, nil
 | |
| }
 | |
| 
 | |
// currentState assembles a point-in-time snapshot of the container — init
// pid and start time, config copy, cgroup/intelrdt paths, namespace paths,
// external descriptors — suitable for persisting via saveState.
func (c *Container) currentState() *State {
	var (
		startTime           uint64
		externalDescriptors []string
		pid                 = -1
	)
	// With no init process (stopped container) pid stays -1 and the
	// time/descriptor fields keep their zero values.
	if c.initProcess != nil {
		pid = c.initProcess.pid()
		startTime, _ = c.initProcess.startTime()
		externalDescriptors = c.initProcess.externalDescriptors()
	}

	intelRdtPath := ""
	if c.intelRdtManager != nil {
		intelRdtPath = c.intelRdtManager.GetPath()
	}
	state := &State{
		BaseState: BaseState{
			ID:                   c.ID(),
			Config:               *c.config,
			InitProcessPid:       pid,
			InitProcessStartTime: startTime,
			Created:              c.created,
		},
		Rootless:            c.config.RootlessEUID && c.config.RootlessCgroups,
		CgroupPaths:         c.cgroupManager.GetPaths(),
		IntelRdtPath:        intelRdtPath,
		NamespacePaths:      make(map[configs.NamespaceType]string),
		ExternalDescriptors: externalDescriptors,
	}
	if pid > 0 {
		// Record paths for the namespaces the config declares...
		for _, ns := range c.config.Namespaces {
			state.NamespacePaths[ns.Type] = ns.GetPath(pid)
		}
		// ...and fill in every other supported namespace type as well, so
		// the saved state always carries a complete set of namespace paths.
		for _, nsType := range configs.NamespaceTypes() {
			if !configs.IsNamespaceSupported(nsType) {
				continue
			}
			if _, ok := state.NamespacePaths[nsType]; !ok {
				ns := configs.Namespace{Type: nsType}
				state.NamespacePaths[ns.Type] = ns.GetPath(pid)
			}
		}
	}
	return state
}
 | |
| 
 | |
| func (c *Container) currentOCIState() (*specs.State, error) {
 | |
| 	bundle, annotations := utils.Annotations(c.config.Labels)
 | |
| 	state := &specs.State{
 | |
| 		Version:     specs.Version,
 | |
| 		ID:          c.ID(),
 | |
| 		Bundle:      bundle,
 | |
| 		Annotations: annotations,
 | |
| 	}
 | |
| 	status, err := c.currentStatus()
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	state.Status = specs.ContainerState(status.String())
 | |
| 	if status != Stopped {
 | |
| 		if c.initProcess != nil {
 | |
| 			state.Pid = c.initProcess.pid()
 | |
| 		}
 | |
| 	}
 | |
| 	return state, nil
 | |
| }
 | |
| 
 | |
| // orderNamespacePaths sorts namespace paths into a list of paths that we
 | |
| // can setns in order.
 | |
| func (c *Container) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
 | |
| 	paths := []string{}
 | |
| 	for _, ns := range configs.NamespaceTypes() {
 | |
| 
 | |
| 		// Remove namespaces that we don't need to join.
 | |
| 		if !c.config.Namespaces.Contains(ns) {
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		if p, ok := namespaces[ns]; ok && p != "" {
 | |
| 			// check if the requested namespace is supported
 | |
| 			if !configs.IsNamespaceSupported(ns) {
 | |
| 				return nil, fmt.Errorf("namespace %s is not supported", ns)
 | |
| 			}
 | |
| 			// only set to join this namespace if it exists
 | |
| 			if _, err := os.Lstat(p); err != nil {
 | |
| 				return nil, fmt.Errorf("namespace path: %w", err)
 | |
| 			}
 | |
| 			// do not allow namespace path with comma as we use it to separate
 | |
| 			// the namespace paths
 | |
| 			if strings.ContainsRune(p, ',') {
 | |
| 				return nil, fmt.Errorf("invalid namespace path %s", p)
 | |
| 			}
 | |
| 			paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
 | |
| 		}
 | |
| 
 | |
| 	}
 | |
| 
 | |
| 	return paths, nil
 | |
| }
 | |
| 
 | |
| func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
 | |
| 	data := bytes.NewBuffer(nil)
 | |
| 	for _, im := range idMap {
 | |
| 		line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
 | |
| 		if _, err := data.WriteString(line); err != nil {
 | |
| 			return nil, err
 | |
| 		}
 | |
| 	}
 | |
| 	return data.Bytes(), nil
 | |
| }
 | |
| 
 | |
// netlinkError is an error wrapper type for use by custom netlink message
// types. Panics with errors are wrapped in netlinkError so that the recover
// in bootstrapData can distinguish intentional panics (raised by our own
// message encoders) from genuine programmer bugs, which are re-raised.
type netlinkError struct{ error }
 | |
| 
 | |
// bootstrapData encodes the necessary data in netlink binary format
// as a io.Reader.
// Consumer can write the data to a bootstrap program
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (_ io.Reader, Err error) {
	// create the netlink message
	r := nl.NewNetlinkRequest(int(InitMsg), 0)

	// Our custom messages cannot bubble up an error using returns, instead
	// they will panic with the specific error type, netlinkError. In that
	// case, recover from the panic and return that as an error.
	defer func() {
		if r := recover(); r != nil {
			if e, ok := r.(netlinkError); ok {
				Err = e.error
			} else {
				// Not one of our wrapped errors: re-raise.
				panic(r)
			}
		}
	}()

	// write cloneFlags
	r.AddData(&Int32msg{
		Type:  CloneFlagsAttr,
		Value: uint32(cloneFlags),
	})

	// write custom namespace paths, comma-separated (orderNamespacePaths
	// rejects paths that themselves contain a comma)
	if len(nsMaps) > 0 {
		nsPaths, err := c.orderNamespacePaths(nsMaps)
		if err != nil {
			return nil, err
		}
		r.AddData(&Bytemsg{
			Type:  NsPathsAttr,
			Value: []byte(strings.Join(nsPaths, ",")),
		})
	}

	// write uid/gid mappings only when we are not joining an existing user ns
	_, joinExistingUser := nsMaps[configs.NEWUSER]
	if !joinExistingUser {
		// write uid mappings
		if len(c.config.UIDMappings) > 0 {
			if c.config.RootlessEUID {
				// We resolve the paths for new{u,g}idmap from
				// the context of runc to avoid doing a path
				// lookup in the nsexec context.
				if path, err := exec.LookPath("newuidmap"); err == nil {
					r.AddData(&Bytemsg{
						Type:  UidmapPathAttr,
						Value: []byte(path),
					})
				}
			}
			b, err := encodeIDMapping(c.config.UIDMappings)
			if err != nil {
				return nil, err
			}
			r.AddData(&Bytemsg{
				Type:  UidmapAttr,
				Value: b,
			})
		}

		// write gid mappings
		if len(c.config.GIDMappings) > 0 {
			b, err := encodeIDMapping(c.config.GIDMappings)
			if err != nil {
				return nil, err
			}
			r.AddData(&Bytemsg{
				Type:  GidmapAttr,
				Value: b,
			})
			if c.config.RootlessEUID {
				if path, err := exec.LookPath("newgidmap"); err == nil {
					r.AddData(&Bytemsg{
						Type:  GidmapPathAttr,
						Value: []byte(path),
					})
				}
			}
			if requiresRootOrMappingTool(c.config) {
				r.AddData(&Boolmsg{
					Type:  SetgroupAttr,
					Value: true,
				})
			}
		}
	}

	if c.config.OomScoreAdj != nil {
		// write oom_score_adj
		r.AddData(&Bytemsg{
			Type:  OomScoreAdjAttr,
			Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)),
		})
	}

	// write rootless
	r.AddData(&Boolmsg{
		Type:  RootlessEUIDAttr,
		Value: c.config.RootlessEUID,
	})

	// write boottime and monotonic time ns offsets.
	// NOTE(review): map iteration order is random, so the offset lines may
	// appear in any order — presumably the consumer treats each line
	// independently; confirm against the nsexec parser.
	if c.config.TimeOffsets != nil {
		var offsetSpec bytes.Buffer
		for clock, offset := range c.config.TimeOffsets {
			fmt.Fprintf(&offsetSpec, "%s %d %d\n", clock, offset.Secs, offset.Nanosecs)
		}
		r.AddData(&Bytemsg{
			Type:  TimeOffsetsAttr,
			Value: offsetSpec.Bytes(),
		})
	}

	return bytes.NewReader(r.Serialize()), nil
}
 | |
| 
 | |
| // ignoreTerminateErrors returns nil if the given err matches an error known
 | |
| // to indicate that the terminate occurred successfully or err was nil, otherwise
 | |
| // err is returned unaltered.
 | |
| func ignoreTerminateErrors(err error) error {
 | |
| 	if err == nil {
 | |
| 		return nil
 | |
| 	}
 | |
| 	// terminate() might return an error from either Kill or Wait.
 | |
| 	// The (*Cmd).Wait documentation says: "If the command fails to run
 | |
| 	// or doesn't complete successfully, the error is of type *ExitError".
 | |
| 	// Filter out such errors (like "exit status 1" or "signal: killed").
 | |
| 	var exitErr *exec.ExitError
 | |
| 	if errors.As(err, &exitErr) {
 | |
| 		return nil
 | |
| 	}
 | |
| 	if errors.Is(err, os.ErrProcessDone) {
 | |
| 		return nil
 | |
| 	}
 | |
| 	s := err.Error()
 | |
| 	if strings.Contains(s, "Wait was already called") {
 | |
| 		return nil
 | |
| 	}
 | |
| 	return err
 | |
| }
 | |
| 
 | |
| func requiresRootOrMappingTool(c *configs.Config) bool {
 | |
| 	gidMap := []configs.IDMap{
 | |
| 		{ContainerID: 0, HostID: int64(os.Getegid()), Size: 1},
 | |
| 	}
 | |
| 	return !reflect.DeepEqual(c.GIDMappings, gidMap)
 | |
| }
 |