mirror of
https://github.com/opencontainers/runc.git
synced 2025-10-27 09:30:54 +08:00
tree-wide: use /proc/thread-self for thread-local state
With the idmap work, we will have a tainted Go thread in our thread-group that has a different mount namespace from the other threads. It seems that (due to some bad luck) the Go scheduler tends to make this thread the thread-group leader in our tests, which results in very baffling failures where /proc/self/mountinfo produces gibberish results.

In order to avoid this, switch to using /proc/thread-self for everything that is thread-local. This primarily includes switching all file descriptor paths (CLONE_FS), all of the places that check the current cgroup (technically we never will run a single runc thread in a separate cgroup, but better to be safe than sorry), and the aforementioned mountinfo code.

We don't need to do anything for the following because the results we need aren't thread-local:

 * Checks that certain namespaces are supported by stat(2)ing /proc/self/ns/...
 * /proc/self/exe and /proc/self/cmdline are not thread-local.
 * While threads can be in different cgroups, we do not do this for the runc binary (or libcontainer) and thus we do not need to switch to the thread-local version of /proc/self/cgroup.
 * All of the CLONE_NEWUSER files are not thread-local because you cannot set the user namespace of a single thread (setns(CLONE_NEWUSER) is blocked for multi-threaded programs).

Note that we have to use runtime.LockOSThread when we have an open handle to a tid-specific procfs file that we are operating on multiple times. Go can reschedule us such that we are running on a different thread and then kill the original thread (causing -ENOENT or similarly confusing errors). This is not strictly necessary for most usages of /proc/thread-self (such as using /proc/thread-self/fd/$n directly) since only operating on the actual inodes associated with the tid requires this locking, but because of the pre-3.17 fallback for CentOS, we have to do this in most cases.
In addition, CentOS's kernel is too old for /proc/thread-self, which requires us to emulate it -- however, in rootfs_linux.go we are in the container pid namespace but /proc is the host's procfs. This leads to the incredibly frustrating situation where there is no way (on pre-4.1 Linux) to figure out which /proc/self/task/... entry refers to the current tid. We can just use /proc/self in this case.

Yes, this is all pretty ugly. I also wish it weren't necessary.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
This commit is contained in:
@@ -7,9 +7,13 @@ import (
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"sync"
|
||||
|
||||
securejoin "github.com/cyphar/filepath-securejoin"
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
@@ -57,7 +61,10 @@ func CloseExecFrom(minFd int) error {
|
||||
return os.NewSyscallError("close_range", err)
|
||||
}
|
||||
|
||||
fdDir, err := os.Open("/proc/self/fd")
|
||||
procSelfFd, closer := ProcThreadSelf("fd")
|
||||
defer closer()
|
||||
|
||||
fdDir, err := os.Open(procSelfFd)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -98,3 +105,100 @@ func NewSockPair(name string) (parent, child *os.File, err error) {
|
||||
}
|
||||
return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
|
||||
}
|
||||
|
||||
// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
|
||||
// corresponding to the unsafePath resolved within the root. Before passing the
|
||||
// fd, this path is verified to have been inside the root -- so operating on it
|
||||
// through the passed fdpath should be safe. Do not access this path through
|
||||
// the original path strings, and do not attempt to use the pathname outside of
|
||||
// the passed closure (the file handle will be freed once the closure returns).
|
||||
func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
|
||||
// Remove the root then forcefully resolve inside the root.
|
||||
unsafePath = stripRoot(root, unsafePath)
|
||||
path, err := securejoin.SecureJoin(root, unsafePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("resolving path inside rootfs failed: %w", err)
|
||||
}
|
||||
|
||||
procSelfFd, closer := ProcThreadSelf("fd/")
|
||||
defer closer()
|
||||
|
||||
// Open the target path.
|
||||
fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("open o_path procfd: %w", err)
|
||||
}
|
||||
defer fh.Close()
|
||||
|
||||
procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
|
||||
// Double-check the path is the one we expected.
|
||||
if realpath, err := os.Readlink(procfd); err != nil {
|
||||
return fmt.Errorf("procfd verification failed: %w", err)
|
||||
} else if realpath != path {
|
||||
return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
|
||||
}
|
||||
|
||||
return fn(procfd)
|
||||
}
|
||||
|
||||
// ProcThreadSelfCloser is the cleanup callback returned alongside the path by
// ProcThreadSelf. It takes no arguments and returns nothing; ProcThreadSelf
// hands back runtime.UnlockOSThread as this value.
type ProcThreadSelfCloser func()
|
||||
|
||||
// Cached result of the /proc/thread-self availability probe performed by
// ProcThreadSelf.
var (
	// haveProcThreadSelfOnce ensures the probe runs a single time.
	haveProcThreadSelfOnce sync.Once

	// haveProcThreadSelf is set to true by the probe when stat-ing
	// /proc/thread-self/ succeeds; it stays false otherwise.
	haveProcThreadSelf bool
)
|
||||
|
||||
// ProcThreadSelf returns a string that is equivalent to
|
||||
// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
|
||||
// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
|
||||
// meaning that the passed string needs to be trusted. The caller _must_ call
|
||||
// the returned procThreadSelfCloser function (which is runtime.UnlockOSThread)
|
||||
// *only once* after it has finished using the returned path string.
|
||||
func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
|
||||
haveProcThreadSelfOnce.Do(func() {
|
||||
if _, err := os.Stat("/proc/thread-self/"); err == nil {
|
||||
haveProcThreadSelf = true
|
||||
} else {
|
||||
logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
|
||||
}
|
||||
})
|
||||
|
||||
// We need to lock our thread until the caller is done with the path string
|
||||
// because any non-atomic operation on the path (such as opening a file,
|
||||
// then reading it) could be interrupted by the Go runtime where the
|
||||
// underlying thread is swapped out and the original thread is killed,
|
||||
// resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
|
||||
// addition, the pre-3.17 fallback makes everything non-atomic because the
|
||||
// same thing could happen between unix.Gettid() and the path operations.
|
||||
//
|
||||
// In theory, we don't need to lock in the atomic user case when using
|
||||
// /proc/thread-self/, but it's better to be safe than sorry (and there are
|
||||
// only one or two truly atomic users of /proc/thread-self/).
|
||||
runtime.LockOSThread()
|
||||
|
||||
threadSelf := "/proc/thread-self/"
|
||||
if !haveProcThreadSelf {
|
||||
// Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
|
||||
threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
|
||||
if _, err := os.Stat(threadSelf); err != nil {
|
||||
// Unfortunately, this code is called from rootfs_linux.go where we
|
||||
// are running inside the pid namespace of the container but /proc
|
||||
// is the host's procfs. Unfortunately there is no real way to get
|
||||
// the correct tid to use here (the kernel age means we cannot do
|
||||
// things like set up a private fsopen("proc") -- even scanning
|
||||
// NSpid in all of the tasks in /proc/self/task/*/status requires
|
||||
// Linux 4.1).
|
||||
//
|
||||
// So, we just have to assume that /proc/self is acceptable in this
|
||||
// one specific case.
|
||||
if os.Getpid() == 1 {
|
||||
logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
|
||||
} else {
|
||||
// This should never happen, but the fallback should work in most cases...
|
||||
logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
|
||||
}
|
||||
threadSelf = "/proc/self/"
|
||||
}
|
||||
}
|
||||
return threadSelf + subpath, runtime.UnlockOSThread
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user