package libcontainer

import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path"
	"path/filepath"
	"reflect"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/opencontainers/runtime-spec/specs-go"
	"github.com/sirupsen/logrus"
	"github.com/vishvananda/netlink/nl"
	"golang.org/x/sys/execabs"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/intelrdt"
	"github.com/opencontainers/runc/libcontainer/system"
	"github.com/opencontainers/runc/libcontainer/utils"
)

// stdioFdCount is the number of standard descriptors (stdin, stdout, stderr).
// It is the offset added to an index in exec.Cmd.ExtraFiles to obtain the fd
// number that the child process will see for that file.
const stdioFdCount = 3

// Container is a libcontainer container object.
type Container struct {
	// id is the container's unique ID, as returned by ID.
	id string
	// root is the directory in which the exec fifo and the state file of
	// this container are created.
	root   string
	config *configs.Config
	// cgroupManager is used for pids, stats, resource updates and freezing.
	cgroupManager cgroups.Manager
	// intelRdtManager may be nil; every use in this file is nil-checked.
	intelRdtManager *intelrdt.Manager
	// initProcess is nil when no init process is known, in which case the
	// container is treated as stopped.
	initProcess parentProcess
	// initProcessStartTime is compared with the start time of the process
	// currently owning the init PID to detect a dead or replaced init.
	initProcessStartTime uint64
	// m is taken by the exported methods that read or change the
	// container state (Status, State, Set, Start, Signal, Pause, ...).
	m           sync.Mutex
	criuVersion int
	state       containerState
	// created is reported as BaseState.Created.
	created time.Time
	// fifo is the O_PATH handle to the exec fifo that is passed to the
	// init process; it is closed again once the process has been started.
	fifo *os.File
}

// State represents a running container's state
type State struct {
	BaseState

	// Platform specific fields below here

	// Specified if the container was started under the rootless mode.
	// Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
	Rootless bool `json:"rootless"`

	// Paths to all the container's cgroups, as returned by (*cgroups.Manager).GetPaths
	//
	// For cgroup v1, a key is cgroup subsystem name, and the value is the path
	// to the cgroup for this subsystem.
	//
	// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
	CgroupPaths map[string]string `json:"cgroup_paths"`

	// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
	// with the value as the path.
	NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`

	// Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
	ExternalDescriptors []string `json:"external_descriptors,omitempty"`

	// Intel RDT "resource control" filesystem path
	IntelRdtPath string `json:"intel_rdt_path"`
}

// ID returns the container's unique ID
func (c *Container) ID() string {
	return c.id
}

// Config returns the container's configuration
//
// The returned value is a copy of the Config struct itself; any maps,
// slices or pointers inside it are still shared with the container.
func (c *Container) Config() configs.Config {
	return *c.config
}

// Status returns the current status of the container.
//
// The status is refreshed from runtime information (freezer state, init
// process, exec fifo) while holding the container lock.
func (c *Container) Status() (Status, error) {
	c.m.Lock()
	defer c.m.Unlock()
	return c.currentStatus()
}

// State returns the current container's state information.
func (c *Container) State() (*State, error) {
	c.m.Lock()
	defer c.m.Unlock()
	return c.currentState()
}

// OCIState returns the current container's state information
// in the OCI runtime-spec format.
func (c *Container) OCIState() (*specs.State, error) {
	c.m.Lock()
	defer c.m.Unlock()
	return c.currentOCIState()
}

// ignoreCgroupError filters out cgroup-related errors that can be ignored,
// because the container is stopped and its cgroup is gone.
//
// Only errors matching os.ErrNotExist are candidates; any other error, and
// any error seen while the container is not stopped or while its cgroup
// still exists, is returned unchanged.
func (c *Container) ignoreCgroupError(err error) error {
	if err == nil {
		return nil
	}
	if errors.Is(err, os.ErrNotExist) && c.runType() == Stopped && !c.cgroupManager.Exists() {
		return nil
	}
	return err
}

// Processes returns the PIDs inside this container. The PIDs are in the
// namespace of the calling process.
//
// Some of the returned PIDs may no longer refer to processes in the container,
// unless the container state is PAUSED in which case every PID in the slice is
// valid.
//
// For a stopped container whose cgroup no longer exists, a "not exist" error
// from the cgroup manager is not reported (see ignoreCgroupError).
func (c *Container) Processes() ([]int, error) {
	pids, err := c.cgroupManager.GetAllPids()
	if err = c.ignoreCgroupError(err); err != nil {
		return nil, fmt.Errorf("unable to get all container pids: %w", err)
	}
	return pids, nil
}

// Stats returns statistics for the container.
func (c *Container) Stats() (*Stats, error) { var ( err error stats = &Stats{} ) if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { return stats, fmt.Errorf("unable to get container cgroup stats: %w", err) } if c.intelRdtManager != nil { if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil { return stats, fmt.Errorf("unable to get container Intel RDT stats: %w", err) } } for _, iface := range c.config.Networks { switch iface.Type { case "veth": istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) if err != nil { return stats, fmt.Errorf("unable to get network stats for interface %q: %w", iface.HostInterfaceName, err) } stats.Interfaces = append(stats.Interfaces, istats) } } return stats, nil } // Set resources of container as configured. Can be used to change resources // when the container is running. func (c *Container) Set(config configs.Config) error { c.m.Lock() defer c.m.Unlock() status, err := c.currentStatus() if err != nil { return err } if status == Stopped { return ErrNotRunning } if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil { // Set configs back if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil { logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) } return err } if c.intelRdtManager != nil { if err := c.intelRdtManager.Set(&config); err != nil { // Set configs back if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil { logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) } if err2 := c.intelRdtManager.Set(c.config); err2 != nil { logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) } return err } } // After config setting succeed, update config and states c.config = &config _, err = c.updateState(nil) 
return err } // Start starts a process inside the container. Returns error if process fails // to start. You can track process lifecycle with passed Process structure. func (c *Container) Start(process *Process) error { c.m.Lock() defer c.m.Unlock() if c.config.Cgroups.Resources.SkipDevices { return errors.New("can't start container with SkipDevices set") } if process.Init { if err := c.createExecFifo(); err != nil { return err } } if err := c.start(process); err != nil { if process.Init { c.deleteExecFifo() } return err } return nil } // Run immediately starts the process inside the container. Returns an error if // the process fails to start. It does not block waiting for the exec fifo // after start returns but opens the fifo after start returns. func (c *Container) Run(process *Process) error { if err := c.Start(process); err != nil { return err } if process.Init { return c.exec() } return nil } // Exec signals the container to exec the users process at the end of the init. func (c *Container) Exec() error { c.m.Lock() defer c.m.Unlock() return c.exec() } func (c *Container) exec() error { path := filepath.Join(c.root, execFifoFilename) pid := c.initProcess.pid() blockingFifoOpenCh := awaitFifoOpen(path) for { select { case result := <-blockingFifoOpenCh: return handleFifoResult(result) case <-time.After(time.Millisecond * 100): stat, err := system.Stat(pid) if err != nil || stat.State == system.Zombie { // could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check. // see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete). 
if err := handleFifoResult(fifoOpen(path, false)); err != nil { return errors.New("container process is already dead") } return nil } } } } func readFromExecFifo(execFifo io.Reader) error { data, err := io.ReadAll(execFifo) if err != nil { return err } if len(data) <= 0 { return errors.New("cannot start an already running container") } return nil } func awaitFifoOpen(path string) <-chan openResult { fifoOpened := make(chan openResult) go func() { result := fifoOpen(path, true) fifoOpened <- result }() return fifoOpened } func fifoOpen(path string, block bool) openResult { flags := os.O_RDONLY if !block { flags |= unix.O_NONBLOCK } f, err := os.OpenFile(path, flags, 0) if err != nil { return openResult{err: fmt.Errorf("exec fifo: %w", err)} } return openResult{file: f} } func handleFifoResult(result openResult) error { if result.err != nil { return result.err } f := result.file defer f.Close() if err := readFromExecFifo(f); err != nil { return err } return os.Remove(f.Name()) } type openResult struct { file *os.File err error } func (c *Container) start(process *Process) (retErr error) { parent, err := c.newParentProcess(process) if err != nil { return fmt.Errorf("unable to create new parent process: %w", err) } logsDone := parent.forwardChildLogs() if logsDone != nil { defer func() { // Wait for log forwarder to finish. This depends on // runc init closing the _LIBCONTAINER_LOGPIPE log fd. 
err := <-logsDone if err != nil && retErr == nil { retErr = fmt.Errorf("unable to forward init logs: %w", err) } }() } if err := parent.start(); err != nil { return fmt.Errorf("unable to start container process: %w", err) } if process.Init { c.fifo.Close() if c.config.Hooks != nil { s, err := c.currentOCIState() if err != nil { return err } if err := c.config.Hooks.Run(configs.Poststart, s); err != nil { if err := ignoreTerminateErrors(parent.terminate()); err != nil { logrus.Warn(fmt.Errorf("error running poststart hook: %w", err)) } return err } } } return nil } // Signal sends a specified signal to container's init. // // When s is SIGKILL and the container does not have its own PID namespace, all // the container's processes are killed. In this scenario, the libcontainer // user may be required to implement a proper child reaper. func (c *Container) Signal(s os.Signal) error { c.m.Lock() defer c.m.Unlock() status, err := c.currentStatus() if err != nil { return err } // To avoid a PID reuse attack, don't kill non-running container. switch status { case Running, Created, Paused: default: return ErrNotRunning } // When a container has its own PID namespace, inside it the init PID // is 1, and thus it is handled specially by the kernel. In particular, // killing init with SIGKILL from an ancestor namespace will also kill // all other processes in that PID namespace (see pid_namespaces(7)). // // OTOH, if PID namespace is shared, we should kill all pids to avoid // leftover processes. if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) { err = signalAllProcesses(c.cgroupManager, unix.SIGKILL) } else { err = c.initProcess.signal(s) } if err != nil { return fmt.Errorf("unable to signal init: %w", err) } if status == Paused && s == unix.SIGKILL { // For cgroup v1, killing a process in a frozen cgroup // does nothing until it's thawed. Only thaw the cgroup // for SIGKILL. 
_ = c.cgroupManager.Freeze(configs.Thawed) } return nil } func (c *Container) createExecFifo() error { rootuid, err := c.Config().HostRootUID() if err != nil { return err } rootgid, err := c.Config().HostRootGID() if err != nil { return err } fifoName := filepath.Join(c.root, execFifoFilename) if _, err := os.Stat(fifoName); err == nil { return fmt.Errorf("exec fifo %s already exists", fifoName) } oldMask := unix.Umask(0o000) if err := unix.Mkfifo(fifoName, 0o622); err != nil { unix.Umask(oldMask) return err } unix.Umask(oldMask) return os.Chown(fifoName, rootuid, rootgid) } func (c *Container) deleteExecFifo() { fifoName := filepath.Join(c.root, execFifoFilename) os.Remove(fifoName) } // includeExecFifo opens the container's execfifo as a pathfd, so that the // container cannot access the statedir (and the FIFO itself remains // un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited // fd, with _LIBCONTAINER_FIFOFD set to its fd number. func (c *Container) includeExecFifo(cmd *exec.Cmd) error { fifoName := filepath.Join(c.root, execFifoFilename) fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0) if err != nil { return err } c.fifo = fifo cmd.ExtraFiles = append(cmd.ExtraFiles, fifo) cmd.Env = append(cmd.Env, "_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) return nil } func (c *Container) newParentProcess(p *Process) (parentProcess, error) { parentInitPipe, childInitPipe, err := utils.NewSockPair("init") if err != nil { return nil, fmt.Errorf("unable to create init pipe: %w", err) } messageSockPair := filePair{parentInitPipe, childInitPipe} parentLogPipe, childLogPipe, err := os.Pipe() if err != nil { return nil, fmt.Errorf("unable to create log pipe: %w", err) } logFilePair := filePair{parentLogPipe, childLogPipe} cmd := c.commandTemplate(p, childInitPipe, childLogPipe) if !p.Init { return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) } // We only set up fifoFd if we're not doing a `runc exec`. 
The historic // reason for this is that previously we would pass a dirfd that allowed // for container rootfs escape (and not doing it in `runc exec` avoided // that problem), but we no longer do that. However, there's no need to do // this for `runc exec` so we just keep it this way to be safe. if err := c.includeExecFifo(cmd); err != nil { return nil, fmt.Errorf("unable to setup exec fifo: %w", err) } return c.newInitProcess(p, cmd, messageSockPair, logFilePair) } func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd { cmd := exec.Command("/proc/self/exe", "init") cmd.Args[0] = os.Args[0] cmd.Stdin = p.Stdin cmd.Stdout = p.Stdout cmd.Stderr = p.Stderr cmd.Dir = c.config.Rootfs if cmd.SysProcAttr == nil { cmd.SysProcAttr = &unix.SysProcAttr{} } cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS")) cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) if p.ConsoleSocket != nil { cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) cmd.Env = append(cmd.Env, "_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), ) } cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe) cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), "_LIBCONTAINER_STATEDIR="+c.root, ) cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe) cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) if p.LogLevel != "" { cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel) } // NOTE: when running a container with no PID namespace and the parent process spawning the container is // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason // even with the parent still running. 
if c.config.ParentDeathSignal > 0 { cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal) } return cmd } // shouldSendMountSources says whether the child process must setup bind mounts with // the source pre-opened (O_PATH) in the host user namespace. // See https://github.com/opencontainers/runc/issues/2484 func (c *Container) shouldSendMountSources() bool { // Passing the mount sources via SCM_RIGHTS is only necessary when // both userns and mntns are active. if !c.config.Namespaces.Contains(configs.NEWUSER) || !c.config.Namespaces.Contains(configs.NEWNS) { return false } // nsexec.c send_mountsources() requires setns(mntns) capabilities // CAP_SYS_CHROOT and CAP_SYS_ADMIN. if c.config.RootlessEUID { return false } // We need to send sources if there are non-idmap bind-mounts. for _, m := range c.config.Mounts { if m.IsBind() && !m.IsIDMapped() { return true } } return false } // shouldSendIdmapSources says whether the child process must setup idmap mounts with // the mount_setattr already done in the host user namespace. func (c *Container) shouldSendIdmapSources() bool { // nsexec.c mount_setattr() requires CAP_SYS_ADMIN in: // * the user namespace the filesystem was mounted in; // * the user namespace we're trying to idmap the mount to; // * the owning user namespace of the mount namespace you're currently located in. // // See the comment from Christian Brauner: // https://github.com/opencontainers/runc/pull/3717#discussion_r1103607972 // // Let's just rule out rootless, we don't have those permission in the // rootless case. if c.config.RootlessEUID { return false } // For the time being we require userns to be in use. if !c.config.Namespaces.Contains(configs.NEWUSER) { return false } // We need to send sources if there are idmap bind-mounts. 
for _, m := range c.config.Mounts { if m.IsBind() && m.IsIDMapped() { return true } } return false } func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) error { if !c.shouldSendMountSources() { return nil } return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_MOUNT_FDS", func(m *configs.Mount) bool { return m.IsBind() && !m.IsIDMapped() }) } func (c *Container) sendIdmapSources(cmd *exec.Cmd, messageSockPair filePair) error { if !c.shouldSendIdmapSources() { return nil } return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool { return m.IsBind() && m.IsIDMapped() }) } func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envVar string, condition func(*configs.Mount) bool) error { // Elements on these slices will be paired with mounts (see StartInitialization() and // prepareRootfs()). These slices MUST have the same size as c.config.Mounts. fds := make([]int, len(c.config.Mounts)) for i, m := range c.config.Mounts { if !condition(m) { // The -1 fd is ignored later. fds[i] = -1 continue } // The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need // to allocate a fd so that we know the number to pass in the environment variable. The fd // must not be closed before cmd.Start(), so we reuse messageSockPair.child because the // lifecycle of that fd is already taken care of. 
cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child) fds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1 } fdsJSON, err := json.Marshal(fds) if err != nil { return fmt.Errorf("Error creating %v: %w", envVar, err) } cmd.Env = append(cmd.Env, envVar+"="+string(fdsJSON)) return nil } func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) nsMaps := make(map[configs.NamespaceType]string) for _, ns := range c.config.Namespaces { if ns.Path != "" { nsMaps[ns.Type] = ns.Path } } data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard) if err != nil { return nil, err } if err := c.sendMountSources(cmd, messageSockPair); err != nil { return nil, err } if err := c.sendIdmapSources(cmd, messageSockPair); err != nil { return nil, err } init := &initProcess{ cmd: cmd, messageSockPair: messageSockPair, logFilePair: logFilePair, manager: c.cgroupManager, intelRdtManager: c.intelRdtManager, config: c.newInitConfig(p), container: c, process: p, bootstrapData: data, } c.initProcess = init return init, nil } func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) state, err := c.currentState() if err != nil { return nil, fmt.Errorf("unable to get container state: %w", err) } // for setns process, we don't have to set cloneflags as the process namespaces // will only be set via setns syscall data, err := c.bootstrapData(0, state.NamespacePaths, initSetns) if err != nil { return nil, err } proc := &setnsProcess{ cmd: cmd, cgroupPaths: state.CgroupPaths, rootlessCgroups: c.config.RootlessCgroups, intelRdtPath: state.IntelRdtPath, messageSockPair: messageSockPair, logFilePair: logFilePair, manager: c.cgroupManager, config: c.newInitConfig(p), process: p, bootstrapData: data, 
initProcessPid: state.InitProcessPid, } if len(p.SubCgroupPaths) > 0 { if add, ok := p.SubCgroupPaths[""]; ok { // cgroup v1: using the same path for all controllers. // cgroup v2: the only possible way. for k := range proc.cgroupPaths { subPath := path.Join(proc.cgroupPaths[k], add) if !strings.HasPrefix(subPath, proc.cgroupPaths[k]) { return nil, fmt.Errorf("%s is not a sub cgroup path", add) } proc.cgroupPaths[k] = subPath } // cgroup v2: do not try to join init process's cgroup // as a fallback (see (*setnsProcess).start). proc.initProcessPid = 0 } else { // Per-controller paths. for ctrl, add := range p.SubCgroupPaths { if val, ok := proc.cgroupPaths[ctrl]; ok { subPath := path.Join(val, add) if !strings.HasPrefix(subPath, val) { return nil, fmt.Errorf("%s is not a sub cgroup path", add) } proc.cgroupPaths[ctrl] = subPath } else { return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl) } } } } return proc, nil } func (c *Container) newInitConfig(process *Process) *initConfig { cfg := &initConfig{ Config: c.config, Args: process.Args, Env: process.Env, User: process.User, AdditionalGroups: process.AdditionalGroups, Cwd: process.Cwd, Capabilities: process.Capabilities, PassedFilesCount: len(process.ExtraFiles), ContainerID: c.ID(), NoNewPrivileges: c.config.NoNewPrivileges, RootlessEUID: c.config.RootlessEUID, RootlessCgroups: c.config.RootlessCgroups, AppArmorProfile: c.config.AppArmorProfile, ProcessLabel: c.config.ProcessLabel, Rlimits: c.config.Rlimits, CreateConsole: process.ConsoleSocket != nil, ConsoleWidth: process.ConsoleWidth, ConsoleHeight: process.ConsoleHeight, } if process.NoNewPrivileges != nil { cfg.NoNewPrivileges = *process.NoNewPrivileges } if process.AppArmorProfile != "" { cfg.AppArmorProfile = process.AppArmorProfile } if process.Label != "" { cfg.ProcessLabel = process.Label } if len(process.Rlimits) > 0 { cfg.Rlimits = process.Rlimits } if cgroups.IsCgroup2UnifiedMode() { cfg.Cgroup2Path = c.cgroupManager.Path("") } return 
cfg } // Destroy destroys the container, if its in a valid state. // // Any event registrations are removed before the container is destroyed. // No error is returned if the container is already destroyed. // // Running containers must first be stopped using Signal. // Paused containers must first be resumed using Resume. func (c *Container) Destroy() error { c.m.Lock() defer c.m.Unlock() return c.state.destroy() } // Pause pauses the container, if its state is RUNNING or CREATED, changing // its state to PAUSED. If the state is already PAUSED, does nothing. func (c *Container) Pause() error { c.m.Lock() defer c.m.Unlock() status, err := c.currentStatus() if err != nil { return err } switch status { case Running, Created: if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { return err } return c.state.transition(&pausedState{ c: c, }) } return ErrNotRunning } // Resume resumes the execution of any user processes in the // container before setting the container state to RUNNING. // This is only performed if the current state is PAUSED. // If the Container state is RUNNING, does nothing. func (c *Container) Resume() error { c.m.Lock() defer c.m.Unlock() status, err := c.currentStatus() if err != nil { return err } if status != Paused { return ErrNotPaused } if err := c.cgroupManager.Freeze(configs.Thawed); err != nil { return err } return c.state.transition(&runningState{ c: c, }) } // NotifyOOM returns a read-only channel signaling when the container receives // an OOM notification. func (c *Container) NotifyOOM() (<-chan struct{}, error) { // XXX(cyphar): This requires cgroups. if c.config.RootlessCgroups { logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups") } path := c.cgroupManager.Path("memory") if cgroups.IsCgroup2UnifiedMode() { return notifyOnOOMV2(path) } return notifyOnOOM(path) } // NotifyMemoryPressure returns a read-only channel signaling when the // container reaches a given pressure level. 
func (c *Container) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { // XXX(cyphar): This requires cgroups. if c.config.RootlessCgroups { logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups") } return notifyMemoryPressure(c.cgroupManager.Path("memory"), level) } func (c *Container) updateState(process parentProcess) (*State, error) { if process != nil { c.initProcess = process } state, err := c.currentState() if err != nil { return nil, err } err = c.saveState(state) if err != nil { return nil, err } return state, nil } func (c *Container) saveState(s *State) (retErr error) { tmpFile, err := os.CreateTemp(c.root, "state-") if err != nil { return err } defer func() { if retErr != nil { tmpFile.Close() os.Remove(tmpFile.Name()) } }() err = utils.WriteJSON(tmpFile, s) if err != nil { return err } err = tmpFile.Close() if err != nil { return err } stateFilePath := filepath.Join(c.root, stateFilename) return os.Rename(tmpFile.Name(), stateFilePath) } func (c *Container) currentStatus() (Status, error) { if err := c.refreshState(); err != nil { return -1, err } return c.state.status(), nil } // refreshState needs to be called to verify that the current state on the // container is what is true. Because consumers of libcontainer can use it // out of process we need to verify the container's status based on runtime // information and not rely on our in process info. 
func (c *Container) refreshState() error {
	// A frozen cgroup takes precedence over whatever the init process says.
	paused, err := c.isPaused()
	if err != nil {
		return err
	}
	if paused {
		return c.state.transition(&pausedState{c: c})
	}
	t := c.runType()
	switch t {
	case Created:
		return c.state.transition(&createdState{c: c})
	case Running:
		return c.state.transition(&runningState{c: c})
	}
	return c.state.transition(&stoppedState{c: c})
}

// runType derives the status from the init process and the exec fifo:
//   - Stopped if there is no init process, it cannot be stat'ed, its start
//     time differs from the recorded one, or it is a zombie or dead;
//   - Created if the init process is alive and the exec fifo still exists;
//   - Running otherwise.
func (c *Container) runType() Status {
	if c.initProcess == nil {
		return Stopped
	}
	pid := c.initProcess.pid()
	stat, err := system.Stat(pid)
	if err != nil {
		return Stopped
	}
	// A different start time means the PID now belongs to another process.
	if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
		return Stopped
	}
	// We'll create exec fifo and blocking on it after container is created,
	// and delete it after start container.
	if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
		return Created
	}
	return Running
}

// isPaused reports whether the cgroup manager sees the container's
// freezer state as frozen.
func (c *Container) isPaused() (bool, error) {
	state, err := c.cgroupManager.GetFreezerState()
	if err != nil {
		return false, err
	}
	return state == configs.Frozen, nil
}

// currentState builds a State snapshot from the container's in-memory data.
//
// Without an init process the pid is reported as -1 and no namespace paths
// are filled in. With one, NamespacePaths holds the path of every
// configured namespace plus, for each remaining supported namespace type,
// the path derived from the init pid. In this implementation the returned
// error is always nil.
func (c *Container) currentState() (*State, error) {
	var (
		startTime           uint64
		externalDescriptors []string
		pid                 = -1
	)
	if c.initProcess != nil {
		pid = c.initProcess.pid()
		// A failure to read the start time leaves whatever value
		// startTime() returned alongside the error.
		startTime, _ = c.initProcess.startTime()
		externalDescriptors = c.initProcess.externalDescriptors()
	}

	intelRdtPath := ""
	if c.intelRdtManager != nil {
		intelRdtPath = c.intelRdtManager.GetPath()
	}
	state := &State{
		BaseState: BaseState{
			ID:                   c.ID(),
			Config:               *c.config,
			InitProcessPid:       pid,
			InitProcessStartTime: startTime,
			Created:              c.created,
		},
		Rootless:            c.config.RootlessEUID && c.config.RootlessCgroups,
		CgroupPaths:         c.cgroupManager.GetPaths(),
		IntelRdtPath:        intelRdtPath,
		NamespacePaths:      make(map[configs.NamespaceType]string),
		ExternalDescriptors: externalDescriptors,
	}
	if pid > 0 {
		for _, ns := range c.config.Namespaces {
			state.NamespacePaths[ns.Type] = ns.GetPath(pid)
		}
		// Fill in the namespaces that are supported but not configured.
		for _, nsType := range configs.NamespaceTypes() {
			if !configs.IsNamespaceSupported(nsType) {
				continue
			}
			if _, ok := state.NamespacePaths[nsType]; !ok {
				ns := configs.Namespace{Type: nsType}
				state.NamespacePaths[ns.Type] = ns.GetPath(pid)
			}
		}
	}
	return state, nil
}

// currentOCIState returns the container state in the OCI runtime-spec
// format. Bundle and annotations are derived from the config labels; Pid is
// only set when the container is not stopped and an init process is known.
func (c *Container) currentOCIState() (*specs.State, error) {
	bundle, annotations := utils.Annotations(c.config.Labels)
	state := &specs.State{
		Version:     specs.Version,
		ID:          c.ID(),
		Bundle:      bundle,
		Annotations: annotations,
	}
	status, err := c.currentStatus()
	if err != nil {
		return nil, err
	}
	state.Status = specs.ContainerState(status.String())
	if status != Stopped {
		if c.initProcess != nil {
			state.Pid = c.initProcess.pid()
		}
	}
	return state, nil
}

// orderNamespacePaths sorts namespace paths into a list of paths that we
// can setns in order.
//
// The order is that of configs.NamespaceTypes(). Namespace types missing
// from the container config and entries with an empty path are skipped.
// Each element of the result has the form "<name>:<path>". An error is
// returned for an unsupported namespace, a path that cannot be lstat'ed,
// or a path containing a comma.
func (c *Container) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
	paths := []string{}
	for _, ns := range configs.NamespaceTypes() {

		// Remove namespaces that we don't need to join.
		if !c.config.Namespaces.Contains(ns) {
			continue
		}

		if p, ok := namespaces[ns]; ok && p != "" {
			// check if the requested namespace is supported
			if !configs.IsNamespaceSupported(ns) {
				return nil, fmt.Errorf("namespace %s is not supported", ns)
			}
			// only set to join this namespace if it exists
			if _, err := os.Lstat(p); err != nil {
				return nil, fmt.Errorf("namespace path: %w", err)
			}
			// do not allow namespace path with comma as we use it to separate
			// the namespace paths
			if strings.ContainsRune(p, ',') {
				return nil, fmt.Errorf("invalid namespace path %s", p)
			}
			paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
		}

	}

	return paths, nil
}

// encodeIDMapping renders idMap as text, one
// "<container id> <host id> <size>" line per entry.
func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
	data := bytes.NewBuffer(nil)
	for _, im := range idMap {
		line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
		if _, err := data.WriteString(line); err != nil {
			return nil, err
		}
	}
	return data.Bytes(), nil
}

// netlinkError is an error wrapper type for use by custom netlink message
// types. Panics with errors are wrapped in netlinkError so that the recover
// in bootstrapData can distinguish intentional panics.
type netlinkError struct{ error }

// bootstrapData encodes the necessary data in netlink binary format
// as a io.Reader.
// Consumer can write the data to a bootstrap program
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
//
// The attributes are added in this order: clone flags, namespace paths,
// uid/gid mappings (skipped when joining an existing user namespace),
// oom_score_adj, the rootless flag, bind-mount sources, idmap mount sources
// and time namespace offsets. The two mount source attributes are only
// written for it == initStandard.
func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
	// create the netlink message
	r := nl.NewNetlinkRequest(int(InitMsg), 0)

	// Our custom messages cannot bubble up an error using returns, instead
	// they will panic with the specific error type, netlinkError. In that
	// case, recover from the panic and return that as an error.
	defer func() {
		if r := recover(); r != nil {
			if e, ok := r.(netlinkError); ok {
				Err = e.error
			} else {
				// Not one of ours: let the panic continue.
				panic(r)
			}
		}
	}()

	// write cloneFlags
	r.AddData(&Int32msg{
		Type:  CloneFlagsAttr,
		Value: uint32(cloneFlags),
	})

	// write custom namespace paths
	if len(nsMaps) > 0 {
		nsPaths, err := c.orderNamespacePaths(nsMaps)
		if err != nil {
			return nil, err
		}
		r.AddData(&Bytemsg{
			Type:  NsPathsAttr,
			Value: []byte(strings.Join(nsPaths, ",")),
		})
	}

	// write namespace paths only when we are not joining an existing user ns
	_, joinExistingUser := nsMaps[configs.NEWUSER]
	if !joinExistingUser {
		// write uid mappings
		if len(c.config.UIDMappings) > 0 {
			if c.config.RootlessEUID {
				// We resolve the paths for new{u,g}idmap from
				// the context of runc to avoid doing a path
				// lookup in the nsexec context.
				// A failed lookup is not an error; the path attribute is
				// simply left out.
				if path, err := execabs.LookPath("newuidmap"); err == nil {
					r.AddData(&Bytemsg{
						Type:  UidmapPathAttr,
						Value: []byte(path),
					})
				}
			}
			b, err := encodeIDMapping(c.config.UIDMappings)
			if err != nil {
				return nil, err
			}
			r.AddData(&Bytemsg{
				Type:  UidmapAttr,
				Value: b,
			})
		}

		// write gid mappings
		if len(c.config.GIDMappings) > 0 {
			b, err := encodeIDMapping(c.config.GIDMappings)
			if err != nil {
				return nil, err
			}
			r.AddData(&Bytemsg{
				Type:  GidmapAttr,
				Value: b,
			})
			if c.config.RootlessEUID {
				if path, err := execabs.LookPath("newgidmap"); err == nil {
					r.AddData(&Bytemsg{
						Type:  GidmapPathAttr,
						Value: []byte(path),
					})
				}
			}
			if requiresRootOrMappingTool(c.config) {
				r.AddData(&Boolmsg{
					Type:  SetgroupAttr,
					Value: true,
				})
			}
		}
	}

	if c.config.OomScoreAdj != nil {
		// write oom_score_adj
		r.AddData(&Bytemsg{
			Type:  OomScoreAdjAttr,
			Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)),
		})
	}

	// write rootless
	r.AddData(&Boolmsg{
		Type:  RootlessEUIDAttr,
		Value: c.config.RootlessEUID,
	})

	// Bind mount source to open.
	if it == initStandard && c.shouldSendMountSources() {
		var mounts []byte
		for _, m := range c.config.Mounts {
			if m.IsBind() && !m.IsIDMapped() {
				if strings.IndexByte(m.Source, 0) >= 0 {
					return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
				}
				mounts = append(mounts, []byte(m.Source)...)
			}
			// Every mount contributes a NUL terminator, with an empty
			// string for mounts that are not sent, so that the entries
			// stay aligned with c.config.Mounts.
			mounts = append(mounts, byte(0))
		}

		r.AddData(&Bytemsg{
			Type:  MountSourcesAttr,
			Value: mounts,
		})
	}

	// Idmap mount sources to open.
	if it == initStandard && c.shouldSendIdmapSources() {
		var mounts []byte
		for _, m := range c.config.Mounts {
			if m.IsBind() && m.IsIDMapped() {
				// While other parts of the code check this too (like
				// libcontainer/specconv/spec_linux.go) we do it here also because some libcontainer
				// users don't use those functions.
				if strings.IndexByte(m.Source, 0) >= 0 {
					return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
				}

				mounts = append(mounts, []byte(m.Source)...)
			}
			// Same layout as above: one NUL-terminated entry per mount.
			mounts = append(mounts, byte(0))
		}

		r.AddData(&Bytemsg{
			Type:  IdmapSourcesAttr,
			Value: mounts,
		})
	}

	// write boottime and monotonic time ns offsets.
	if c.config.TimeOffsets != nil {
		var offsetSpec bytes.Buffer
		// One "<clock> <secs> <nanosecs>" line per entry; the lines come
		// out in map iteration order.
		for clock, offset := range c.config.TimeOffsets {
			fmt.Fprintf(&offsetSpec, "%s %d %d\n", clock, offset.Secs, offset.Nanosecs)
		}
		r.AddData(&Bytemsg{
			Type:  TimeOffsetsAttr,
			Value: offsetSpec.Bytes(),
		})
	}

	return bytes.NewReader(r.Serialize()), nil
}

// ignoreTerminateErrors returns nil if the given err matches an error known
// to indicate that the terminate occurred successfully or err was nil, otherwise
// err is returned unaltered.
func ignoreTerminateErrors(err error) error {
	if err == nil {
		return nil
	}
	// terminate() might return an error from either Kill or Wait.
	// The (*Cmd).Wait documentation says: "If the command fails to run
	// or doesn't complete successfully, the error is of type *ExitError".
	// Filter out such errors (like "exit status 1" or "signal: killed").
	var exitErr *exec.ExitError
	if errors.As(err, &exitErr) {
		return nil
	}
	if errors.Is(err, os.ErrProcessDone) {
		return nil
	}
	// This case is matched on the message text.
	s := err.Error()
	if strings.Contains(s, "Wait was already called") {
		return nil
	}
	return err
}

// requiresRootOrMappingTool reports whether the GID mappings of c are
// anything other than the single mapping of container GID 0 to the
// effective GID of the current process with size 1.
func requiresRootOrMappingTool(c *configs.Config) bool {
	gidMap := []configs.IDMap{
		{ContainerID: 0, HostID: os.Getegid(), Size: 1},
	}
	return !reflect.DeepEqual(c.GIDMappings, gidMap)
}