runc/libcontainer/mount_linux.go

package libcontainer

import (
	"errors"
	"fmt"
	"io/fs"
	"os"
	"strconv"
	"strings"

	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/internal/userns"
	"github.com/opencontainers/runc/libcontainer/utils"
)

// mountSourceType indicates what type of file descriptor is being returned. It
// is used to tell rootfs_linux.go whether or not to use move_mount(2) to
// install the mount.
//
// Within this file, mountViaFds consults the value: a mountSourceOpenTree
// source is installed with move_mount(2), any other source goes through
// classic mount(2).
type mountSourceType string

const (
	// An open_tree(2)-style file descriptor that must be installed using
	// move_mount(2).
	mountSourceOpenTree mountSourceType = "open_tree"
	// A plain file descriptor that can be mounted through /proc/thread-self/fd.
	mountSourcePlain mountSourceType = "plain-open"
)

// mountSource pairs an already-opened handle to a mount source with the
// mountSourceType describing how that handle has to be installed.
//
// Only Type takes part in JSON encoding (as "type"); the file handle itself
// is unexported and explicitly excluded from serialisation.
type mountSource struct {
	Type mountSourceType `json:"type"`
	file *os.File        `json:"-"`
}

// mountError holds an error from a failed mount or unmount operation.
//
// op names the operation that failed ("mount", "move_mount" or "unmount").
// source and target are the paths supplied by the caller, while srcFile and
// dstFd carry the file-descriptor based source and destination when those
// were used (nil / empty otherwise). flags and data are the mount(2)
// arguments of the failed call, and err is the underlying error, which is
// exposed through Unwrap.
type mountError struct {
	op      string
	source  string
	srcFile *mountSource
	target  string
	dstFd   string
	flags   uintptr
	data    string
	err     error
}

// int32plus is a collection of int types with >=32 bits. It serves as the
// type constraint for stringifyMountFlags, so that flag values can be passed
// in whichever integer type the caller happens to hold them in.
type int32plus interface {
	int | uint | int32 | uint32 | int64 | uint64 | uintptr
}

// stringifyMountFlags renders a mount(2) flag bitmask in a human-readable
// form intended for error messages.
//
// Every known MS_* flag whose bits are all present in flags is listed by
// name, in the order of the table below. Any bits that are set but do not
// belong to a listed flag are appended as one hexadecimal "0x..." entry.
// Entries are joined with "|"; a zero mask produces the empty string.
func stringifyMountFlags[Int int32plus](flags Int) string {
	known := []struct {
		name string
		bits Int
	}{
		{"MS_RDONLY", unix.MS_RDONLY},
		{"MS_NOSUID", unix.MS_NOSUID},
		{"MS_NODEV", unix.MS_NODEV},
		{"MS_NOEXEC", unix.MS_NOEXEC},
		{"MS_SYNCHRONOUS", unix.MS_SYNCHRONOUS},
		{"MS_REMOUNT", unix.MS_REMOUNT},
		{"MS_MANDLOCK", unix.MS_MANDLOCK},
		{"MS_DIRSYNC", unix.MS_DIRSYNC},
		{"MS_NOSYMFOLLOW", unix.MS_NOSYMFOLLOW},
		// No (1 << 9) flag.
		{"MS_NOATIME", unix.MS_NOATIME},
		{"MS_NODIRATIME", unix.MS_NODIRATIME},
		{"MS_BIND", unix.MS_BIND},
		{"MS_MOVE", unix.MS_MOVE},
		{"MS_REC", unix.MS_REC},
		// MS_VERBOSE was deprecated and swapped to MS_SILENT.
		{"MS_SILENT", unix.MS_SILENT},
		{"MS_POSIXACL", unix.MS_POSIXACL},
		{"MS_UNBINDABLE", unix.MS_UNBINDABLE},
		{"MS_PRIVATE", unix.MS_PRIVATE},
		{"MS_SLAVE", unix.MS_SLAVE},
		{"MS_SHARED", unix.MS_SHARED},
		{"MS_RELATIME", unix.MS_RELATIME},
		// MS_KERNMOUNT (1 << 22) is internal to the kernel.
		{"MS_I_VERSION", unix.MS_I_VERSION},
		{"MS_STRICTATIME", unix.MS_STRICTATIME},
		{"MS_LAZYTIME", unix.MS_LAZYTIME},
	}

	var (
		names   []string
		matched Int
	)
	for _, k := range known {
		// A flag only counts when all of its bits are present.
		if flags&k.bits != k.bits {
			continue
		}
		names = append(names, k.name)
		matched |= k.bits
	}

	// Bits we have no name for are still reported, as raw hex, so that no
	// information is lost from the error message.
	unknown := flags &^ matched
	if unknown != 0 {
		hex := strconv.FormatUint(uint64(unknown), 16)
		names = append(names, "0x"+hex)
	}
	return strings.Join(names, "|")
}

// Error formats the failed operation as a single line of the shape
//
//	<op> src=<source>, srcType=<type>, srcFd=<fd>, dst=<target>, dstFd=<fd>, flags=<flags>, data=<data>: <err>
//
// where every component other than the operation, "dst=" and the underlying
// error is only emitted when the corresponding field is set. The source file
// details are only printed when a source path is present as well.
func (e *mountError) Error() string {
	var msg strings.Builder

	msg.WriteString(e.op)
	msg.WriteString(" ")

	if e.source != "" {
		msg.WriteString("src=")
		msg.WriteString(e.source)
		msg.WriteString(", ")
		if sf := e.srcFile; sf != nil {
			msg.WriteString("srcType=")
			msg.WriteString(string(sf.Type))
			msg.WriteString(", ")
			msg.WriteString("srcFd=")
			msg.WriteString(strconv.Itoa(int(sf.file.Fd())))
			msg.WriteString(", ")
		}
	}

	msg.WriteString("dst=")
	msg.WriteString(e.target)
	if e.dstFd != "" {
		msg.WriteString(", dstFd=")
		msg.WriteString(e.dstFd)
	}

	if e.flags != 0 {
		msg.WriteString(", flags=")
		msg.WriteString(stringifyMountFlags(e.flags))
	}
	if e.data != "" {
		msg.WriteString(", data=")
		msg.WriteString(e.data)
	}

	msg.WriteString(": ")
	msg.WriteString(e.err.Error())
	return msg.String()
}

// Unwrap returns the underlying error.
// This is a convention used by Go 1.13+ standard library: it lets
// errors.Is and errors.As look through a *mountError at the error that
// caused the mount or unmount operation to fail.
func (e *mountError) Unwrap() error {
	return e.err
}

// mount is a simple unix.Mount wrapper, returning an error with more context
// in case it failed.
//
// It is the path-only form of mountViaFds: no source file and no destination
// fd are passed, so source and target are used as-is.
func mount(source, target, fstype string, flags uintptr, data string) error {
	return mountViaFds(source, nil, target, "", fstype, flags, data)
}

// mountViaFds is a unix.Mount wrapper which prefers file descriptors over
// paths: srcFile is used in place of source and dstFd in place of target
// whenever they are provided.
//
// A non-nil srcFile is installed according to its mountSourceType, except
// when flags contains MS_REMOUNT, in which case srcFile is ignored (a debug
// message is logged). Sources of type mountSourceOpenTree are installed with
// move_mount(2); note that this call is always handed dstFd as its
// destination. All other sources go through classic mount(2).
//
// The dstFd argument, if non-empty, is expected to be in the form of a path to
// an opened file descriptor on procfs (i.e. "/proc/thread-self/fd/NN").
//
// Whenever a file descriptor stands in for a source or target path, that path
// only serves to give context in the returned error. Failures are reported as
// a *mountError.
func mountViaFds(source string, srcFile *mountSource, target, dstFd, fstype string, flags uintptr, data string) error {
	// A remount acts on an existing mount, so a source handle has no meaning.
	if flags&unix.MS_REMOUNT != 0 && srcFile != nil {
		logrus.Debugf("mount source passed along with MS_REMOUNT -- ignoring srcFile")
		srcFile = nil
	}

	mountDst := target
	if dstFd != "" {
		mountDst = dstFd
	}

	mountSrc := source
	useMoveMount := srcFile != nil && srcFile.Type == mountSourceOpenTree
	if srcFile != nil {
		fd := srcFile.file.Fd()
		if useMoveMount {
			// move_mount(2) works on the fd directly, so no safe
			// /proc/thread-self handle is required; this path is never
			// handed to the kernel.
			mountSrc = "/proc/self/fd/" + strconv.Itoa(int(fd))
		} else {
			// Classic mount(2) needs a real path, so obtain a safe
			// /proc/thread-self/fd/... path and hold on to it until we
			// return.
			procPath, release := utils.ProcThreadSelfFd(fd)
			defer release()
			mountSrc = procPath
		}
	}

	var (
		opName string
		opErr  error
	)
	switch {
	case useMoveMount:
		opName = "move_mount"
		opErr = unix.MoveMount(int(srcFile.file.Fd()), "",
			unix.AT_FDCWD, dstFd,
			unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS)
	default:
		opName = "mount"
		opErr = unix.Mount(mountSrc, mountDst, fstype, flags, data)
	}
	if opErr == nil {
		return nil
	}

	// Report the caller-supplied paths rather than the procfs stand-ins.
	return &mountError{
		op:      opName,
		source:  source,
		srcFile: srcFile,
		target:  target,
		dstFd:   dstFd,
		flags:   flags,
		data:    data,
		err:     opErr,
	}
}

// unmount calls unix.Unmount on target with the given flags. A failure is
// returned as a *mountError carrying the operation name, the target and the
// flags, so that the message has the same shape as mount failures.
func unmount(target string, flags int) error {
	err := unix.Unmount(target, flags)
	if err == nil {
		return nil
	}
	failure := &mountError{
		op:     "unmount",
		target: target,
		flags:  uintptr(flags),
		err:    err,
	}
	return failure
}

// syscallMode translates Go's portable fs.FileMode into the mode bits used by
// the underlying system calls: the permission bits are carried over as-is,
// and the setuid, setgid and sticky bits are mapped to S_ISUID, S_ISGID and
// S_ISVTX respectively.
// Adapted from https://cs.opensource.google/go/go/+/refs/tags/go1.20.7:src/os/file_posix.go;l=61-75
func syscallMode(i fs.FileMode) (o uint32) {
	// No mapping for Go's ModeTemporary (plan9 only).
	special := [...]struct {
		goBit  fs.FileMode
		sysBit uint32
	}{
		{fs.ModeSetuid, unix.S_ISUID},
		{fs.ModeSetgid, unix.S_ISGID},
		{fs.ModeSticky, unix.S_ISVTX},
	}

	o = uint32(i.Perm())
	for _, s := range special {
		if i&s.goBit != 0 {
			o |= s.sysBit
		}
	}
	return o
}

// mountFd creates a "mount source fd" (either through open_tree(2) or just
// open(O_PATH)) based on the provided configuration. This function must be
// called from within the container's mount namespace.
//
// In the case of idmapped mount configurations, the returned mount source will
// be an open_tree(2) file with MOUNT_ATTR_IDMAP applied. For other
// bind-mounts, it will be an O_PATH. If the type of mount cannot be handled
// (anything that is not a bind-mount), a nil mountSource is returned together
// with an error.
//
// nsHandles supplies user namespace handles for id-mappings that are given as
// explicit UID/GID mappings. If it is nil, a temporary set of handles is
// created and released again before returning.
//
// On any error the returned mountSource is nil and the file descriptor that
// was opened for the mount source, if any, is closed again rather than
// leaked. On success the caller owns the returned file.
//
// This helper is only intended to be used by goCreateMountSources.
func mountFd(nsHandles *userns.Handles, m *configs.Mount) (_ *mountSource, retErr error) {
	if !m.IsBind() {
		return nil, errors.New("new mount api: only bind-mounts are supported")
	}
	if nsHandles == nil {
		nsHandles = new(userns.Handles)
		defer nsHandles.Release()
	}

	var mountFile *os.File
	var sourceType mountSourceType

	// Ideally, we would use OPEN_TREE_CLONE for everything, because we can
	// be sure that the file descriptor cannot be used to escape outside of
	// the mount root. Unfortunately, OPEN_TREE_CLONE is far more expensive
	// than open(2) because it requires doing mounts inside a new anonymous
	// mount namespace. So we use open(2) for standard bind-mounts, and
	// OPEN_TREE_CLONE when we need to set mount attributes here.
	//
	// While passing open(2)'d paths from the host rootfs isn't exactly the
	// safest thing in the world, the files will not survive across
	// execve(2) and "runc init" is non-dumpable so it should not be
	// possible for a malicious container process to gain access to the
	// file descriptors. We also don't do any of this for "runc exec",
	// lessening the risk even further.
	if m.IsIDMapped() {
		flags := uint(unix.OPEN_TREE_CLONE | unix.OPEN_TREE_CLOEXEC)
		if m.Flags&unix.MS_REC == unix.MS_REC {
			flags |= unix.AT_RECURSIVE
		}
		fd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags)
		if err != nil {
			return nil, &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err}
		}
		mountFile = os.NewFile(uintptr(fd), m.Source)
		sourceType = mountSourceOpenTree
		// From here on we own the open_tree(2) descriptor. Several steps
		// below can still fail, and in that case nobody else will ever see
		// the file, so close it instead of leaking it.
		defer func() {
			if retErr != nil {
				_ = mountFile.Close()
			}
		}()

		// Configure the id mapping.
		var usernsFile *os.File
		if m.IDMapping.UserNSPath == "" {
			usernsFile, err = nsHandles.Get(userns.Mapping{
				UIDMappings: m.IDMapping.UIDMappings,
				GIDMappings: m.IDMapping.GIDMappings,
			})
			if err != nil {
				return nil, fmt.Errorf("failed to create userns for %s id-mapping: %w", m.Source, err)
			}
		} else {
			usernsFile, err = os.Open(m.IDMapping.UserNSPath)
			if err != nil {
				return nil, fmt.Errorf("failed to open existing userns for %s id-mapping: %w", m.Source, err)
			}
		}
		// The userns handle is only needed while applying the attribute.
		defer usernsFile.Close()

		setAttrFlags := uint(unix.AT_EMPTY_PATH)
		// If the mount has "ridmap" set, we apply the configuration
		// recursively. This allows you to create "rbind" mounts where only
		// the top-level mount has an idmapping. I'm not sure why you'd
		// want that, but still...
		if m.IDMapping.Recursive {
			setAttrFlags |= unix.AT_RECURSIVE
		}
		if err := unix.MountSetattr(int(mountFile.Fd()), "", setAttrFlags, &unix.MountAttr{
			Attr_set:  unix.MOUNT_ATTR_IDMAP,
			Userns_fd: uint64(usernsFile.Fd()),
		}); err != nil {
			extraMsg := ""
			if errors.Is(err, unix.EINVAL) {
				extraMsg = " (maybe the filesystem used doesn't support idmap mounts on this kernel?)"
			}

			return nil, fmt.Errorf("failed to set MOUNT_ATTR_IDMAP on %s: %w%s", m.Source, err, extraMsg)
		}
	} else {
		var err error
		mountFile, err = os.OpenFile(m.Source, unix.O_PATH|unix.O_CLOEXEC, 0)
		if err != nil {
			return nil, err
		}
		sourceType = mountSourcePlain
	}
	return &mountSource{
		Type: sourceType,
		file: mountFile,
	}, nil
}