package libcontainer

import (
	"errors"
	"fmt"
	"io/fs"
	"os"
	"strconv"
	"strings"

	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/internal/userns"
	"github.com/opencontainers/runc/libcontainer/utils"
)

// mountSourceType indicates what type of file descriptor is being returned. It
// is used to tell rootfs_linux.go whether or not to use move_mount(2) to
// install the mount.
type mountSourceType string

const (
	// An open_tree(2)-style file descriptor that needs to be installed using
	// move_mount(2) to install.
	mountSourceOpenTree mountSourceType = "open_tree"
	// A plain file descriptor that can be mounted through /proc/thread-self/fd.
	mountSourcePlain mountSourceType = "plain-open"
)

// mountSource is a handle to the source of a mount: the open file together
// with the way it was opened (which decides how it has to be installed).
type mountSource struct {
	Type mountSourceType `json:"type"`
	file *os.File        `json:"-"`
}

// mountError holds an error from a failed mount or unmount operation.
type mountError struct {
	op      string       // operation name: "mount", "move_mount" or "unmount"
	source  string       // source path (context only when srcFile is used)
	srcFile *mountSource // source file descriptor, if any
	target  string       // target path (context only when dstFd is used)
	dstFd   string       // procfs path of the target fd, if any
	flags   uintptr      // mount(2) flags, or umount2(2) flags when op is "unmount"
	data    string       // filesystem-specific mount data
	err     error        // underlying error returned by the syscall wrapper
}

// int32plus is a collection of int types with >=32 bits.
type int32plus interface {
	int | uint | int32 | uint32 | int64 | uint64 | uintptr
}

// stringifyMountFlags converts mount(2) flags to a string that you can use in
// error messages.
//
// Known flags are rendered by name and joined with "|"; any leftover bits are
// appended as a single hexadecimal "0x..." value. Zero flags yield "".
func stringifyMountFlags[Int int32plus](flags Int) string {
	flagNames := []struct {
		name string
		bits Int
	}{
		{"MS_RDONLY", unix.MS_RDONLY},
		{"MS_NOSUID", unix.MS_NOSUID},
		{"MS_NODEV", unix.MS_NODEV},
		{"MS_NOEXEC", unix.MS_NOEXEC},
		{"MS_SYNCHRONOUS", unix.MS_SYNCHRONOUS},
		{"MS_REMOUNT", unix.MS_REMOUNT},
		{"MS_MANDLOCK", unix.MS_MANDLOCK},
		{"MS_DIRSYNC", unix.MS_DIRSYNC},
		{"MS_NOSYMFOLLOW", unix.MS_NOSYMFOLLOW},
		// No (1 << 9) flag.
		{"MS_NOATIME", unix.MS_NOATIME},
		{"MS_NODIRATIME", unix.MS_NODIRATIME},
		{"MS_BIND", unix.MS_BIND},
		{"MS_MOVE", unix.MS_MOVE},
		{"MS_REC", unix.MS_REC},
		// MS_VERBOSE was deprecated and swapped to MS_SILENT.
		{"MS_SILENT", unix.MS_SILENT},
		{"MS_POSIXACL", unix.MS_POSIXACL},
		{"MS_UNBINDABLE", unix.MS_UNBINDABLE},
		{"MS_PRIVATE", unix.MS_PRIVATE},
		{"MS_SLAVE", unix.MS_SLAVE},
		{"MS_SHARED", unix.MS_SHARED},
		{"MS_RELATIME", unix.MS_RELATIME},
		// MS_KERNMOUNT (1 << 22) is internal to the kernel.
		{"MS_I_VERSION", unix.MS_I_VERSION},
		{"MS_STRICTATIME", unix.MS_STRICTATIME},
		{"MS_LAZYTIME", unix.MS_LAZYTIME},
	}
	var (
		flagSet  []string
		seenBits Int
	)
	for _, flag := range flagNames {
		if flags&flag.bits == flag.bits {
			seenBits |= flag.bits
			flagSet = append(flagSet, flag.name)
		}
	}
	// If there were any remaining flags specified we don't know the name of,
	// just add them in an 0x... format.
	if remaining := flags &^ seenBits; remaining != 0 {
		flagSet = append(flagSet, "0x"+strconv.FormatUint(uint64(remaining), 16))
	}
	return strings.Join(flagSet, "|")
}

// stringifyUnmountFlags converts umount2(2) flags to a string that you can
// use in error messages. It follows the same output format as
// stringifyMountFlags: known names joined with "|", unknown bits as "0x...".
func stringifyUnmountFlags(flags uintptr) string {
	flagNames := []struct {
		name string
		bits uintptr
	}{
		{"MNT_FORCE", unix.MNT_FORCE},
		{"MNT_DETACH", unix.MNT_DETACH},
		{"MNT_EXPIRE", unix.MNT_EXPIRE},
		{"UMOUNT_NOFOLLOW", unix.UMOUNT_NOFOLLOW},
	}
	var flagSet []string
	remaining := flags
	for _, flag := range flagNames {
		if flags&flag.bits == flag.bits {
			remaining &^= flag.bits
			flagSet = append(flagSet, flag.name)
		}
	}
	if remaining != 0 {
		flagSet = append(flagSet, "0x"+strconv.FormatUint(uint64(remaining), 16))
	}
	return strings.Join(flagSet, "|")
}

// Error provides a string error representation.
//
// The message has the form
//
//	<op> [src=..., [srcType=..., srcFd=..., ]]dst=...[, dstFd=...][, flags=...][, data=...]: <err>
//
// where optional parts are only emitted when the corresponding field is set.
func (e *mountError) Error() string {
	out := e.op + " "

	if e.source != "" {
		out += "src=" + e.source + ", "
		if e.srcFile != nil {
			out += "srcType=" + string(e.srcFile.Type) + ", "
			out += "srcFd=" + strconv.Itoa(int(e.srcFile.file.Fd())) + ", "
		}
	}
	out += "dst=" + e.target
	if e.dstFd != "" {
		out += ", dstFd=" + e.dstFd
	}

	if e.flags != uintptr(0) {
		// umount2(2) flags reuse the low bit values of unrelated MS_* mount
		// flags (e.g. MNT_DETACH == MS_NOSUID numerically), so they need
		// their own name table to avoid a misleading message.
		if e.op == "unmount" {
			out += ", flags=" + stringifyUnmountFlags(e.flags)
		} else {
			out += ", flags=" + stringifyMountFlags(e.flags)
		}
	}
	if e.data != "" {
		out += ", data=" + e.data
	}

	out += ": " + e.err.Error()
	return out
}

// Unwrap returns the underlying error.
// This is a convention used by Go 1.13+ standard library.
func (e *mountError) Unwrap() error {
	return e.err
}

// mount is a simple unix.Mount wrapper, returning an error with more context
// in case it failed.
func mount(source, target, fstype string, flags uintptr, data string) error {
	return mountViaFds(source, nil, target, "", fstype, flags, data)
}

// mountViaFds is a unix.Mount wrapper which uses srcFile instead of source,
// and dstFd instead of target, unless those are empty.
// // If srcFile is non-nil and flags does not contain MS_REMOUNT, mountViaFds // will mount it according to the mountSourceType of the file descriptor. // // The dstFd argument, if non-empty, is expected to be in the form of a path to // an opened file descriptor on procfs (i.e. "/proc/thread-self/fd/NN"). // // If a file descriptor is used instead of a source or a target path, the // corresponding path is only used to add context to an error in case the mount // operation has failed. func mountViaFds(source string, srcFile *mountSource, target, dstFd, fstype string, flags uintptr, data string) error { // MS_REMOUNT and srcFile don't make sense together. if srcFile != nil && flags&unix.MS_REMOUNT != 0 { logrus.Debugf("mount source passed along with MS_REMOUNT -- ignoring srcFile") srcFile = nil } dst := target if dstFd != "" { dst = dstFd } src := source isMoveMount := srcFile != nil && srcFile.Type == mountSourceOpenTree if srcFile != nil { // If we're going to use the /proc/thread-self/... path for classic // mount(2), we need to get a safe handle to /proc/thread-self. This // isn't needed for move_mount(2) because in that case the path is just // a dummy string used for error info. srcFileFd := srcFile.file.Fd() if isMoveMount { src = "/proc/self/fd/" + strconv.Itoa(int(srcFileFd)) } else { var closer utils.ProcThreadSelfCloser src, closer = utils.ProcThreadSelfFd(srcFileFd) defer closer() } } var op string var err error if isMoveMount { op = "move_mount" err = unix.MoveMount(int(srcFile.file.Fd()), "", unix.AT_FDCWD, dstFd, unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS) } else { op = "mount" err = unix.Mount(src, dst, fstype, flags, data) } if err != nil { return &mountError{ op: op, source: source, srcFile: srcFile, target: target, dstFd: dstFd, flags: flags, data: data, err: err, } } return nil } // unmount is a simple unix.Unmount wrapper. 
func unmount(target string, flags int) error { err := unix.Unmount(target, flags) if err != nil { return &mountError{ op: "unmount", target: target, flags: uintptr(flags), err: err, } } return nil } // syscallMode returns the syscall-specific mode bits from Go's portable mode bits. // Copy from https://cs.opensource.google/go/go/+/refs/tags/go1.20.7:src/os/file_posix.go;l=61-75 func syscallMode(i fs.FileMode) (o uint32) { o |= uint32(i.Perm()) if i&fs.ModeSetuid != 0 { o |= unix.S_ISUID } if i&fs.ModeSetgid != 0 { o |= unix.S_ISGID } if i&fs.ModeSticky != 0 { o |= unix.S_ISVTX } // No mapping for Go's ModeTemporary (plan9 only). return } // mountFd creates a "mount source fd" (either through open_tree(2) or just // open(O_PATH)) based on the provided configuration. This function must be // called from within the container's mount namespace. // // In the case of idmapped mount configurations, the returned mount source will // be an open_tree(2) file with MOUNT_ATTR_IDMAP applied. For other // bind-mounts, it will be an O_PATH. If the type of mount cannot be handled, // the returned mountSource will be nil, indicating that the container init // process will need to do an old-fashioned mount(2) themselves. // // This helper is only intended to be used by goCreateMountSources. func mountFd(nsHandles *userns.Handles, m *configs.Mount) (*mountSource, error) { if !m.IsBind() { return nil, errors.New("new mount api: only bind-mounts are supported") } if nsHandles == nil { nsHandles = new(userns.Handles) defer nsHandles.Release() } var mountFile *os.File var sourceType mountSourceType // Ideally, we would use OPEN_TREE_CLONE for everything, because we can // be sure that the file descriptor cannot be used to escape outside of // the mount root. Unfortunately, OPEN_TREE_CLONE is far more expensive // than open(2) because it requires doing mounts inside a new anonymous // mount namespace. 
So we use open(2) for standard bind-mounts, and // OPEN_TREE_CLONE when we need to set mount attributes here. // // While passing open(2)'d paths from the host rootfs isn't exactly the // safest thing in the world, the files will not survive across // execve(2) and "runc init" is non-dumpable so it should not be // possible for a malicious container process to gain access to the // file descriptors. We also don't do any of this for "runc exec", // lessening the risk even further. if m.IsIDMapped() { flags := uint(unix.OPEN_TREE_CLONE | unix.OPEN_TREE_CLOEXEC) if m.Flags&unix.MS_REC == unix.MS_REC { flags |= unix.AT_RECURSIVE } fd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags) if err != nil { return nil, &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err} } mountFile = os.NewFile(uintptr(fd), m.Source) sourceType = mountSourceOpenTree // Configure the id mapping. var usernsFile *os.File if m.IDMapping.UserNSPath == "" { usernsFile, err = nsHandles.Get(userns.Mapping{ UIDMappings: m.IDMapping.UIDMappings, GIDMappings: m.IDMapping.GIDMappings, }) if err != nil { return nil, fmt.Errorf("failed to create userns for %s id-mapping: %w", m.Source, err) } } else { usernsFile, err = os.Open(m.IDMapping.UserNSPath) if err != nil { return nil, fmt.Errorf("failed to open existing userns for %s id-mapping: %w", m.Source, err) } } defer usernsFile.Close() setAttrFlags := uint(unix.AT_EMPTY_PATH) // If the mount has "ridmap" set, we apply the configuration // recursively. This allows you to create "rbind" mounts where only // the top-level mount has an idmapping. I'm not sure why you'd // want that, but still... 
if m.IDMapping.Recursive { setAttrFlags |= unix.AT_RECURSIVE } if err := unix.MountSetattr(int(mountFile.Fd()), "", setAttrFlags, &unix.MountAttr{ Attr_set: unix.MOUNT_ATTR_IDMAP, Userns_fd: uint64(usernsFile.Fd()), }); err != nil { extraMsg := "" if err == unix.EINVAL { extraMsg = " (maybe the filesystem used doesn't support idmap mounts on this kernel?)" } return nil, fmt.Errorf("failed to set MOUNT_ATTR_IDMAP on %s: %w%s", m.Source, err, extraMsg) } } else { var err error mountFile, err = os.OpenFile(m.Source, unix.O_PATH|unix.O_CLOEXEC, 0) if err != nil { return nil, err } sourceType = mountSourcePlain } return &mountSource{ Type: sourceType, file: mountFile, }, nil }