diff --git a/internal/pathrs/root_pathrslite.go b/internal/pathrs/root_pathrslite.go new file mode 100644 index 000000000..2dd73b160 --- /dev/null +++ b/internal/pathrs/root_pathrslite.go @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright (C) 2024-2025 Aleksa Sarai + * Copyright (C) 2024-2025 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package pathrs + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/cyphar/filepath-securejoin/pathrs-lite" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/internal/linux" +) + +// OpenInRoot opens the given path inside the root with the provided flags. It +// is effectively shorthand for [securejoin.OpenInRoot] followed by +// [securejoin.Reopen]. +func OpenInRoot(root, subpath string, flags int) (*os.File, error) { + handle, err := pathrs.OpenInRoot(root, subpath) + if err != nil { + return nil, err + } + defer handle.Close() + return pathrs.Reopen(handle, flags) +} + +// CreateInRoot creates a new file inside a root (as well as any missing parent +// directories) and returns a handle to said file. This effectively has +// open(O_CREAT|O_NOFOLLOW) semantics. If you want the creation to use O_EXCL, +// include it in the passed flags. The fileMode argument uses unix.* mode bits, +// *not* os.FileMode. +func CreateInRoot(root, subpath string, flags int, fileMode uint32) (*os.File, error) { + dir, filename := filepath.Split(subpath) + if filepath.Join("/", filename) == "/" { + return nil, fmt.Errorf("create in root subpath %q has bad trailing component %q", subpath, filename) + } + + dirFd, err := MkdirAllInRootOpen(root, dir, 0o755) + if err != nil { + return nil, err + } + defer dirFd.Close() + + // We know that the filename does not have any "/" components, and that + // dirFd is inside the root. O_NOFOLLOW will stop us from following + // trailing symlinks, so this is safe to do. libpathrs's Root::create_file + // works the same way. + flags |= unix.O_CREAT | unix.O_NOFOLLOW + fd, err := linux.Openat(int(dirFd.Fd()), filename, flags, fileMode) + if err != nil { + return nil, err + } + return os.NewFile(uintptr(fd), root+"/"+subpath), nil +} diff --git a/libcontainer/criu_linux.go b/libcontainer/criu_linux.go index e96873923..04e0d5f64 100644 --- a/libcontainer/criu_linux.go +++ b/libcontainer/criu_linux.go @@ -582,8 +582,12 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error { if isOnTmpfs(m.Destination, mounts) { continue } - if _, err := createMountpoint(c.config.Rootfs, mountEntry{Mount: m}); err != nil { - return fmt.Errorf("create criu restore mountpoint for %s mount: %w", m.Destination, err) + me := mountEntry{Mount: m} + if err := me.createOpenMountpoint(c.config.Rootfs); err != nil { + return fmt.Errorf("create criu restore mountpoint for %s mount: %w", me.Destination, err) + } + if me.dstFile != nil { + defer me.dstFile.Close() } // If the mount point is a bind mount, we need to mount // it now so that runc can create the necessary mount @@ -595,13 +599,20 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error { // because during initial container creation mounts are // set up in the order they are configured. if m.Device == "bind" { - if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error { + if err := utils.WithProcfdFile(me.dstFile, func(dstFd string) error { return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "") }); err != nil { return err } umounts = append(umounts, m.Destination) } + if me.dstFile != nil { + // As this is being done in a loop, the defer earlier will be + // delayed until all mountpoints are handled -- for a config with + // many mountpoints this could result in a lot of open files. So we + // opportunistically close the file as well as deferring it. + _ = me.dstFile.Close() + } } return nil } diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index ec3c77232..018fafda5 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -13,6 +13,7 @@ import ( "time" securejoin "github.com/cyphar/filepath-securejoin" + "github.com/cyphar/filepath-securejoin/pathrs-lite/procfs" "github.com/moby/sys/mountinfo" "github.com/moby/sys/userns" "github.com/mrunalp/fileutils" @@ -46,6 +47,7 @@ type mountConfig struct { type mountEntry struct { *configs.Mount srcFile *mountSource + dstFile *os.File } // srcName is only meant for error messages, it returns a "friendly" name. @@ -369,7 +371,7 @@ func mountCgroupV1(m mountEntry, c *mountConfig) error { } func mountCgroupV2(m mountEntry, c *mountConfig) error { - err := utils.WithProcfd(c.root, m.Destination, func(dstFd string) error { + err := utils.WithProcfdFile(m.dstFile, func(dstFd string) error { return mountViaFds(m.Source, nil, m.Destination, dstFd, "cgroup2", uintptr(m.Flags), m.Data) }) if err == nil || (!errors.Is(err, unix.EPERM) && !errors.Is(err, unix.EBUSY)) { @@ -398,14 +400,14 @@ func mountCgroupV2(m mountEntry, c *mountConfig) error { // // Mask `/sys/fs/cgroup` to ensure it is read-only, even when `/sys` is mounted // with `rbind,ro` (`runc spec --rootless` produces `rbind,ro` for `/sys`). - err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error { + err = utils.WithProcfdFile(m.dstFile, func(procfd string) error { return maskPaths([]string{procfd}, c.label) }) } return err } -func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) { +func doTmpfsCopyUp(m mountEntry, mountLabel string) (Err error) { // Set up a scratch dir for the tmpfs on the host. tmpdir, err := prepareTmp("/tmp") if err != nil { @@ -418,13 +420,19 @@ func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) { } defer os.RemoveAll(tmpDir) - // Configure the *host* tmpdir as if it's the container mount. We change - // m.Destination since we are going to mount *on the host*. - oldDest := m.Destination - m.Destination = tmpDir - err = mountPropagate(m, "/", mountLabel) - m.Destination = oldDest + tmpDirFile, err := os.OpenFile(tmpDir, unix.O_DIRECTORY|unix.O_CLOEXEC, 0) if err != nil { + return fmt.Errorf("tmpcopyup: %w", err) + } + defer tmpDirFile.Close() + + // Configure the *host* tmpdir as if it's the container mount. We change + // m.dstFile since we are going to mount *on the host*. + hostMount := mountEntry{ + Mount: m.Mount, + dstFile: tmpDirFile, + } + if err := hostMount.mountPropagate("/", mountLabel); err != nil { return err } defer func() { @@ -435,7 +443,7 @@ func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) { } }() - return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) (Err error) { + return utils.WithProcfdFile(m.dstFile, func(dstFd string) (Err error) { // Copy the container data to the host tmpdir. We append "/" to force // CopyDirectory to resolve the symlink rather than trying to copy the // symlink itself. @@ -497,72 +505,76 @@ func statfsToMountFlags(st unix.Statfs_t) int { var errRootfsToFile = errors.New("config tries to change rootfs to file") -func createMountpoint(rootfs string, m mountEntry) (string, error) { - dest, err := securejoin.SecureJoin(rootfs, m.Destination) +func (m *mountEntry) createOpenMountpoint(rootfs string) (Err error) { + unsafePath := utils.StripRoot(rootfs, m.Destination) + dstFile, err := pathrs.OpenInRoot(rootfs, unsafePath, unix.O_PATH) + defer func() { + if dstFile != nil && Err != nil { + _ = dstFile.Close() + } + }() if err != nil { - return "", err - } - if err := checkProcMount(rootfs, dest, m); err != nil { - return "", fmt.Errorf("check proc-safety of %s mount: %w", m.Destination, err) - } - - switch m.Device { - case "bind": - fi, _, err := m.srcStat() - if err != nil { - // Error out if the source of a bind mount does not exist as we - // will be unable to bind anything to it. - return "", err + if !errors.Is(err, unix.ENOENT) { + return fmt.Errorf("lookup mountpoint target: %w", err) } - // If the original source is not a directory, make the target a file. - if !fi.IsDir() { - // Make sure we aren't tricked into trying to make the root a file. - if rootfs == dest { - return "", fmt.Errorf("%w: file bind mount over rootfs", errRootfsToFile) - } - // Make the parent directory. - destDir, destBase := filepath.Split(dest) - destDirFd, err := pathrs.MkdirAllInRootOpen(rootfs, destDir, 0o755) + + // If the mountpoint doesn't already exist, we want to create a mountpoint + // that makes sense for the source. For file bind-mounts this is an empty + // file, for everything else it's a directory. + dstIsFile := false + if m.Device == "bind" { + fi, _, err := m.srcStat() if err != nil { - return "", fmt.Errorf("make parent dir of file bind-mount: %w", err) + // Error out if the source of a bind mount does not exist as we + // will be unable to bind anything to it. + return err } - defer destDirFd.Close() - // Make the target file. We want to avoid opening any file that is - // already there because it could be a "bad" file like an invalid - // device or hung tty that might cause a DoS, so we use mknodat. - // destBase does not contain any "/" components, and mknodat does - // not follow trailing symlinks, so we can safely just call mknodat - // here. - if err := unix.Mknodat(int(destDirFd.Fd()), destBase, unix.S_IFREG|0o644, 0); err != nil { - // If we get EEXIST, there was already an inode there and - // we can consider that a success. - if !errors.Is(err, unix.EEXIST) { - err = &os.PathError{Op: "mknod regular file", Path: dest, Err: err} - return "", fmt.Errorf("create target of file bind-mount: %w", err) - } - } - // Nothing left to do. - return dest, nil + dstIsFile = !fi.IsDir() } - case "tmpfs": + if dstIsFile { + dstFile, err = pathrs.CreateInRoot(rootfs, unsafePath, unix.O_CREAT|unix.O_EXCL|unix.O_NOFOLLOW, 0o644) + } else { + dstFile, err = pathrs.MkdirAllInRootOpen(rootfs, unsafePath, 0o755) + } + if err != nil { + return fmt.Errorf("make mountpoint %q: %w", m.Destination, err) + } + } + + if m.Device == "tmpfs" { // If the original target exists, copy the mode for the tmpfs mount. - if stat, err := os.Stat(dest); err == nil { - dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode())) - if m.Data != "" { - dt = dt + "," + m.Data - } - m.Data = dt - - // Nothing left to do. - return dest, nil + stat, err := dstFile.Stat() + if err != nil { + return fmt.Errorf("check tmpfs source mode: %w", err) } + dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode())) + if m.Data != "" { + dt = dt + "," + m.Data + } + m.Data = dt } - if err := pathrs.MkdirAllInRoot(rootfs, dest, 0o755); err != nil { - return "", err + dstFullPath, err := procfs.ProcSelfFdReadlink(dstFile) + if err != nil { + return fmt.Errorf("get mount destination real path: %w", err) } - return dest, nil + if !pathrs.IsLexicallyInRoot(rootfs, dstFullPath) { + return fmt.Errorf("mountpoint %q is outside of rootfs %q", dstFullPath, rootfs) + } + if relPath, err := filepath.Rel(rootfs, dstFullPath); err != nil { + return fmt.Errorf("get relative path of %q: %w", dstFullPath, err) + } else if relPath == "." { + return fmt.Errorf("mountpoint %q is on the top of rootfs %q", dstFullPath, rootfs) + } + // TODO: Make checkProcMount use dstFile directly to avoid the need to + // operate on paths here. + if err := checkProcMount(rootfs, dstFullPath, *m); err != nil { + return fmt.Errorf("check proc-safety of %s mount: %w", m.Destination, err) + } + // Update mountEntry. + m.dstFile = dstFile + return nil } func mountToRootfs(c *mountConfig, m mountEntry) error { @@ -592,36 +604,47 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { } else if !fi.IsDir() { return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device) } - if err := pathrs.MkdirAllInRoot(rootfs, dest, 0o755); err != nil { + dstFile, err := pathrs.MkdirAllInRootOpen(rootfs, dest, 0o755) + if err != nil { return err } - // Selinux kernels do not support labeling of /proc or /sys. - return mountPropagate(m, rootfs, "") + defer dstFile.Close() + // "proc" and "sys" mounts need special handling (without resolving the + // destination) to avoid attacks. + m.dstFile = dstFile + return m.mountPropagate(rootfs, "") } - dest, err := createMountpoint(rootfs, m) - if err != nil { + mountLabel := c.label + if err := m.createOpenMountpoint(rootfs); err != nil { return fmt.Errorf("create mountpoint for %s mount: %w", m.Destination, err) } - mountLabel := c.label + defer func() { + if m.dstFile != nil { + _ = m.dstFile.Close() + m.dstFile = nil + } + }() switch m.Device { case "mqueue": - if err := mountPropagate(m, rootfs, ""); err != nil { + if err := m.mountPropagate(rootfs, ""); err != nil { return err } - return label.SetFileLabel(dest, mountLabel) + return utils.WithProcfdFile(m.dstFile, func(dstFd string) error { + return label.SetFileLabel(dstFd, mountLabel) + }) case "tmpfs": + var err error if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP { - err = doTmpfsCopyUp(m, rootfs, mountLabel) + err = doTmpfsCopyUp(m, mountLabel) } else { - err = mountPropagate(m, rootfs, mountLabel) + err = m.mountPropagate(rootfs, mountLabel) } - return err case "bind": // open_tree()-related shenanigans are all handled in mountViaFds. - if err := mountPropagate(m, rootfs, mountLabel); err != nil { + if err := m.mountPropagate(rootfs, mountLabel); err != nil { return err } @@ -635,7 +658,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { // contrast to mount(8)'s current behaviour, but is what users probably // expect. See . if m.Flags & ^(unix.MS_BIND|unix.MS_REC|unix.MS_REMOUNT) != 0 || m.ClearedFlags != 0 { - if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { + if err := utils.WithProcfdFile(m.dstFile, func(dstFd string) error { flags := m.Flags | unix.MS_BIND | unix.MS_REMOUNT // The runtime-spec says we SHOULD map to the relevant mount(8) // behaviour. However, it's not clear whether we want the @@ -736,14 +759,14 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { return err } } - return setRecAttr(m.Mount, rootfs) + return setRecAttr(m) case "cgroup": if cgroups.IsCgroup2UnifiedMode() { return mountCgroupV2(m, c) } return mountCgroupV1(m, c) default: - return mountPropagate(m, rootfs, mountLabel) + return m.mountPropagate(rootfs, mountLabel) } } @@ -939,11 +962,9 @@ func bindMountDeviceNode(destDir *os.File, destName string, node *devices.Device } defer dstFile.Close() - dstFd, closer := utils.ProcThreadSelfFd(dstFile.Fd()) - defer closer() - - dstPath := filepath.Join(destDir.Name(), destName) - return mountViaFds(node.Path, nil, dstPath, dstFd, "bind", unix.MS_BIND, "") + return utils.WithProcfdFile(dstFile, func(dstFd string) error { + return mountViaFds(node.Path, nil, dstFile.Name(), dstFd, "bind", unix.MS_BIND, "") + }) } // Creates the device node in the rootfs of the container. @@ -1350,9 +1371,46 @@ func maskPaths(paths []string, mountLabel string) error { return nil } +func reopenAfterMount(rootfs string, f *os.File, flags int) (_ *os.File, Err error) { + fullPath, err := procfs.ProcSelfFdReadlink(f) + if err != nil { + return nil, fmt.Errorf("get full path: %w", err) + } + if !pathrs.IsLexicallyInRoot(rootfs, fullPath) { + return nil, fmt.Errorf("mountpoint %q is outside of rootfs %q", fullPath, rootfs) + } + unsafePath := utils.StripRoot(rootfs, fullPath) + reopened, err := pathrs.OpenInRoot(rootfs, unsafePath, flags) + if err != nil { + return nil, fmt.Errorf("re-open mountpoint %q: %w", unsafePath, err) + } + defer func() { + if Err != nil { + _ = reopened.Close() + } + }() + + // NOTE: The best we can do here is confirm that the new mountpoint handle + // matches the original target handle, but an attacker could've swapped a + // different path to replace it. In the worst case this could result in us + // applying later vfsmount flags onto the wrong mount. + // + // This is far from ideal, but the only way of doing this in a race-free + // way is to switch the new mount API (move_mount(2) does not require this + // re-opening step, and thus no such races are possible). + reopenedFullPath, err := procfs.ProcSelfFdReadlink(reopened) + if err != nil { + return nil, fmt.Errorf("check full path of re-opened mountpoint: %w", err) + } + if reopenedFullPath != fullPath { + return nil, fmt.Errorf("mountpoint %q was moved while re-opening", unsafePath) + } + return reopened, nil +} + // Do the mount operation followed by additional mounts required to take care // of propagation flags. This will always be scoped inside the container rootfs. -func mountPropagate(m mountEntry, rootfs string, mountLabel string) error { +func (m *mountEntry) mountPropagate(rootfs string, mountLabel string) error { var ( data = label.FormatMountLabel(m.Data, mountLabel) flags = m.Flags @@ -1365,19 +1423,30 @@ func mountPropagate(m mountEntry, rootfs string, mountLabel string) error { flags &= ^unix.MS_RDONLY } - // Because the destination is inside a container path which might be - // mutating underneath us, we verify that we are actually going to mount - // inside the container with WithProcfd() -- mounting through a procfd - // mounts on the target. - if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { + if err := utils.WithProcfdFile(m.dstFile, func(dstFd string) error { return mountViaFds(m.Source, m.srcFile, m.Destination, dstFd, m.Device, uintptr(flags), data) }); err != nil { return err } + + // We need to re-open the mountpoint after doing the mount, in order for us + // to operate on the new mount we just created. However, we cannot use + // pathrs.Reopen because we need to re-resolve from the parent directory to + // get a new handle to the top mount. + // + // TODO: Use move_mount(2) on newer kernels so that this is no longer + // necessary on modern systems. + newDstFile, err := reopenAfterMount(rootfs, m.dstFile, unix.O_PATH) + if err != nil { + return fmt.Errorf("reopen mountpoint after mount: %w", err) + } + _ = m.dstFile.Close() + m.dstFile = newDstFile + // We have to apply mount propagation flags in a separate WithProcfd() call // because the previous call invalidates the passed procfd -- the mount // target needs to be re-opened. - if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { + if err := utils.WithProcfdFile(m.dstFile, func(dstFd string) error { for _, pflag := range m.PropagationFlags { if err := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(pflag), ""); err != nil { return err @@ -1390,11 +1459,11 @@ func mountPropagate(m mountEntry, rootfs string, mountLabel string) error { return nil } -func setRecAttr(m *configs.Mount, rootfs string) error { +func setRecAttr(m mountEntry) error { if m.RecAttr == nil { return nil } - return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error { + return utils.WithProcfdFile(m.dstFile, func(procfd string) error { return unix.MountSetattr(-1, procfd, unix.AT_RECURSIVE, m.RecAttr) }) } diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index 1467d17eb..88291a0fe 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -66,11 +66,11 @@ func CleanPath(path string) string { return path } -// stripRoot returns the passed path, stripping the root path if it was +// StripRoot returns the passed path, stripping the root path if it was // (lexicially) inside it. Note that both passed paths will always be treated // as absolute, and the returned path will also always be absolute. In // addition, the paths are cleaned before stripping the root. -func stripRoot(root, path string) string { +func StripRoot(root, path string) string { // Make the paths clean and absolute. root, path = CleanPath("/"+root), CleanPath("/"+path) switch { diff --git a/libcontainer/utils/utils_test.go b/libcontainer/utils/utils_test.go index 06c042f5f..4b5fd833c 100644 --- a/libcontainer/utils/utils_test.go +++ b/libcontainer/utils/utils_test.go @@ -131,9 +131,9 @@ func TestStripRoot(t *testing.T) { {"/foo/bar", "foo/bar/baz/beef", "/baz/beef"}, {"foo/bar", "foo/bar/baz/beets", "/baz/beets"}, } { - got := stripRoot(test.root, test.path) + got := StripRoot(test.root, test.path) if got != test.out { - t.Errorf("stripRoot(%q, %q) -- got %q, expected %q", test.root, test.path, got, test.out) + t.Errorf("StripRoot(%q, %q) -- got %q, expected %q", test.root, test.path, got, test.out) } } } diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index 6c7dfa7d9..a457a09b0 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -153,8 +153,8 @@ func NewSockPair(name string) (parent, child *os.File, err error) { // the passed closure (the file handle will be freed once the closure returns). func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { // Remove the root then forcefully resolve inside the root. - unsafePath = stripRoot(root, unsafePath) - path, err := securejoin.SecureJoin(root, unsafePath) + unsafePath = StripRoot(root, unsafePath) + fullPath, err := securejoin.SecureJoin(root, unsafePath) if err != nil { return fmt.Errorf("resolving path inside rootfs failed: %w", err) } @@ -163,7 +163,7 @@ func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { defer closer() // Open the target path. - fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) + fh, err := os.OpenFile(fullPath, unix.O_PATH|unix.O_CLOEXEC, 0) if err != nil { return fmt.Errorf("open o_path procfd: %w", err) } @@ -173,13 +173,24 @@ func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { // Double-check the path is the one we expected. if realpath, err := os.Readlink(procfd); err != nil { return fmt.Errorf("procfd verification failed: %w", err) - } else if realpath != path { + } else if realpath != fullPath { return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) } return fn(procfd) } +// WithProcfdFile is a very minimal wrapper around [ProcThreadSelfFd], intended +// to make migrating from [WithProcfd] and [WithProcfdPath] usage easier. The +// caller is responsible for making sure that the provided file handle is +// actually safe to operate on. +func WithProcfdFile(file *os.File, fn func(procfd string) error) error { + fdpath, closer := ProcThreadSelfFd(file.Fd()) + defer closer() + + return fn(fdpath) +} + type ProcThreadSelfCloser func() var (