mirror of
https://github.com/opencontainers/runc.git
synced 2025-10-18 21:34:40 +08:00

If we pass a file descriptor to the host filesystem while joining a container, there is a race condition where a process inside the container can ptrace(2) the joining process and stop it from closing its file descriptor to the stateDirFd. Then the process can access the *host* filesystem from that file descriptor. This was fixed in part by 5d93fed3d2
("Set init processes as non-dumpable"), but that fix is more of a hail-mary than an actual fix for the underlying issue. To fix this, don't open or pass the stateDirFd to the init process unless we're creating a new container. A proper fix for this would be to remove the need for even passing around directory file descriptors (which are quite dangerous in the context of mount namespaces). There is still an issue with containers that have CAP_SYS_PTRACE and are using the setns(2)-style of joining a container namespace. Currently I'm not really sure how to fix it without rampant layer violation. Fixes: CVE-2016-9962 Fixes: 5d93fed3d2
("Set init processes as non-dumpable") Signed-off-by: Aleksa Sarai <asarai@suse.de>
308 lines
8.6 KiB
Go
308 lines
8.6 KiB
Go
// +build linux
|
|
|
|
package libcontainer
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"runtime/debug"
|
|
"strconv"
|
|
"syscall"
|
|
|
|
"github.com/docker/docker/pkg/mount"
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
|
|
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
|
"github.com/opencontainers/runc/libcontainer/configs/validate"
|
|
"github.com/opencontainers/runc/libcontainer/utils"
|
|
)
|
|
|
|
// stateFilename is the name of the JSON state file kept inside each
// container's directory.
const stateFilename = "state.json"

// execFifoFilename is the name of the FIFO created inside each container's
// directory when the container is created.
const execFifoFilename = "exec.fifo"
|
|
|
|
// idRegex is the pattern a container id must match in full: one or more
// word characters or characters from the range '+' through '.'.
// NOTE(review): inside the character class `+-\.` is parsed as a range, so
// ',' is accepted as well as '+', '-' and '.' — confirm whether that is
// intended before tightening it.
var idRegex = regexp.MustCompile(`^[\w+-\.]+$`)

// maxIdLen is the maximum length, in bytes, accepted for a container id.
var maxIdLen = 1024
|
|
|
|
// InitArgs returns an options func to configure a LinuxFactory with the
|
|
// provided init binary path and arguments.
|
|
func InitArgs(args ...string) func(*LinuxFactory) error {
|
|
return func(l *LinuxFactory) error {
|
|
l.InitArgs = args
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// SystemdCgroups is an options func to configure a LinuxFactory to return
|
|
// containers that use systemd to create and manage cgroups.
|
|
func SystemdCgroups(l *LinuxFactory) error {
|
|
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
|
|
return &systemd.Manager{
|
|
Cgroups: config,
|
|
Paths: paths,
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Cgroupfs is an options func to configure a LinuxFactory to return
|
|
// containers that use the native cgroups filesystem implementation to
|
|
// create and manage cgroups.
|
|
func Cgroupfs(l *LinuxFactory) error {
|
|
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
|
|
return &fs.Manager{
|
|
Cgroups: config,
|
|
Paths: paths,
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.
|
|
func TmpfsRoot(l *LinuxFactory) error {
|
|
mounted, err := mount.Mounted(l.Root)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !mounted {
|
|
if err := syscall.Mount("tmpfs", l.Root, "tmpfs", 0, ""); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// CriuPath returns an option func to configure a LinuxFactory with the
|
|
// provided criupath
|
|
func CriuPath(criupath string) func(*LinuxFactory) error {
|
|
return func(l *LinuxFactory) error {
|
|
l.CriuPath = criupath
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// New returns a linux based container factory based in the root directory and
|
|
// configures the factory with the provided option funcs.
|
|
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
|
|
if root != "" {
|
|
if err := os.MkdirAll(root, 0700); err != nil {
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
}
|
|
l := &LinuxFactory{
|
|
Root: root,
|
|
InitArgs: []string{"/proc/self/exe", "init"},
|
|
Validator: validate.New(),
|
|
CriuPath: "criu",
|
|
}
|
|
Cgroupfs(l)
|
|
for _, opt := range options {
|
|
if err := opt(l); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
return l, nil
|
|
}
|
|
|
|
// LinuxFactory implements the default factory interface for linux based systems.
|
|
type LinuxFactory struct {
|
|
// Root directory for the factory to store state.
|
|
Root string
|
|
|
|
// InitArgs are arguments for calling the init responsibilities for spawning
|
|
// a container.
|
|
InitArgs []string
|
|
|
|
// CriuPath is the path to the criu binary used for checkpoint and restore of
|
|
// containers.
|
|
CriuPath string
|
|
|
|
// Validator provides validation to container configurations.
|
|
Validator validate.Validator
|
|
|
|
// NewCgroupsManager returns an initialized cgroups manager for a single container.
|
|
NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager
|
|
}
|
|
|
|
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
|
|
if l.Root == "" {
|
|
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
|
|
}
|
|
if err := l.validateID(id); err != nil {
|
|
return nil, err
|
|
}
|
|
if err := l.Validator.Validate(config); err != nil {
|
|
return nil, newGenericError(err, ConfigInvalid)
|
|
}
|
|
uid, err := config.HostUID()
|
|
if err != nil {
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
gid, err := config.HostGID()
|
|
if err != nil {
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
containerRoot := filepath.Join(l.Root, id)
|
|
if _, err := os.Stat(containerRoot); err == nil {
|
|
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
|
|
} else if !os.IsNotExist(err) {
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
if err := os.MkdirAll(containerRoot, 0711); err != nil {
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
if err := os.Chown(containerRoot, uid, gid); err != nil {
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
fifoName := filepath.Join(containerRoot, execFifoFilename)
|
|
oldMask := syscall.Umask(0000)
|
|
if err := syscall.Mkfifo(fifoName, 0622); err != nil {
|
|
syscall.Umask(oldMask)
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
syscall.Umask(oldMask)
|
|
if err := os.Chown(fifoName, uid, gid); err != nil {
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
c := &linuxContainer{
|
|
id: id,
|
|
root: containerRoot,
|
|
config: config,
|
|
initArgs: l.InitArgs,
|
|
criuPath: l.CriuPath,
|
|
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
|
|
}
|
|
c.state = &stoppedState{c: c}
|
|
return c, nil
|
|
}
|
|
|
|
func (l *LinuxFactory) Load(id string) (Container, error) {
|
|
if l.Root == "" {
|
|
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
|
|
}
|
|
containerRoot := filepath.Join(l.Root, id)
|
|
state, err := l.loadState(containerRoot, id)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r := &nonChildProcess{
|
|
processPid: state.InitProcessPid,
|
|
processStartTime: state.InitProcessStartTime,
|
|
fds: state.ExternalDescriptors,
|
|
}
|
|
c := &linuxContainer{
|
|
initProcess: r,
|
|
initProcessStartTime: state.InitProcessStartTime,
|
|
id: id,
|
|
config: &state.Config,
|
|
initArgs: l.InitArgs,
|
|
criuPath: l.CriuPath,
|
|
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
|
|
root: containerRoot,
|
|
created: state.Created,
|
|
}
|
|
c.state = &loadedState{c: c}
|
|
if err := c.refreshState(); err != nil {
|
|
return nil, err
|
|
}
|
|
return c, nil
|
|
}
|
|
|
|
func (l *LinuxFactory) Type() string {
|
|
return "libcontainer"
|
|
}
|
|
|
|
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
|
|
// This is a low level implementation detail of the reexec and should not be consumed externally
|
|
func (l *LinuxFactory) StartInitialization() (err error) {
|
|
var (
|
|
pipefd, rootfd int
|
|
envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE")
|
|
envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR")
|
|
)
|
|
|
|
// Get the INITPIPE.
|
|
pipefd, err = strconv.Atoi(envInitPipe)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
|
|
}
|
|
|
|
var (
|
|
pipe = os.NewFile(uintptr(pipefd), "pipe")
|
|
it = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
|
|
)
|
|
defer pipe.Close()
|
|
|
|
// Only init processes have STATEDIR.
|
|
rootfd = -1
|
|
if it == initStandard {
|
|
rootfd, err = strconv.Atoi(envStateDir)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err)
|
|
}
|
|
}
|
|
|
|
// clear the current process's environment to clean any libcontainer
|
|
// specific env vars.
|
|
os.Clearenv()
|
|
|
|
defer func() {
|
|
// We have an error during the initialization of the container's init,
|
|
// send it back to the parent process in the form of an initError.
|
|
if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
|
|
fmt.Fprintln(os.Stderr, err)
|
|
return
|
|
}
|
|
if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
|
|
fmt.Fprintln(os.Stderr, err)
|
|
return
|
|
}
|
|
}()
|
|
defer func() {
|
|
if e := recover(); e != nil {
|
|
err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
|
|
}
|
|
}()
|
|
|
|
i, err := newContainerInit(it, pipe, rootfd)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
|
|
return i.Init()
|
|
}
|
|
|
|
func (l *LinuxFactory) loadState(root, id string) (*State, error) {
|
|
f, err := os.Open(filepath.Join(root, stateFilename))
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists)
|
|
}
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
defer f.Close()
|
|
var state *State
|
|
if err := json.NewDecoder(f).Decode(&state); err != nil {
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
return state, nil
|
|
}
|
|
|
|
func (l *LinuxFactory) validateID(id string) error {
|
|
if !idRegex.MatchString(id) {
|
|
return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat)
|
|
}
|
|
if len(id) > maxIdLen {
|
|
return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat)
|
|
}
|
|
return nil
|
|
}
|