mirror of
https://github.com/opencontainers/runc.git
synced 2025-09-27 03:46:19 +08:00

With open_tree(OPEN_TREE_CLONE), it is possible to implement both the id-mapped mounts and bind-mount source file descriptor logic entirely in Go without requiring any complicated handling from nsexec. However, implementing it the naive way (do the OPEN_TREE_CLONE in the host namespace before the rootfs is set up -- which is what the existing implementation did) exposes issues in how mount ordering (in particular when handling mount sources from inside the container rootfs, but also in relation to mount propagation) was handled for idmapped mounts and bind-mount sources. In order to solve this problem completely, it is necessary to spawn a thread which joins the container mount namespace and provides mountfds when requested by the rootfs setup code (ensuring that the mount order and mount propagation of the source of the bind-mount are handled correctly). While the need to join the mount namespace leads to other complications (such as with the usage of /proc/self -- fixed in a later patch), the resulting code is still reasonable and is the only real way to solve the issue. This allows us to reduce the amount of C code we have in nsexec, as well as simplifying a whole host of places that were made more complicated with the addition of id-mapped mounts and the bind sourcefd logic. Because we join the container namespace, we can continue to use regular O_PATH file descriptors for non-id-mapped bind-mount sources (which means we don't have to raise the kernel requirement for that case). In addition, we can easily add support for id-mappings that don't match the container's user namespace. The approach taken here is to use Go's officially supported mechanism for spawning a process in a user namespace, but (ab)use PTRACE_TRACEME to avoid actually having to exec a different process. 
The most efficient way to implement this would be to do clone() in cgo directly to run a function that just does kill(getpid(), SIGSTOP) -- we can always switch to that if it turns out this approach is too slow. It should be noted that the included micro-benchmark seems to indicate this is Fast Enough(TM): goos: linux goarch: amd64 pkg: github.com/opencontainers/runc/libcontainer/userns cpu: Intel(R) Core(TM) i5-10210U CPU @ 1.60GHz BenchmarkSpawnProc BenchmarkSpawnProc-8 1670 770065 ns/op Fixes:fda12ab101
("Support idmap mounts on volumes") Fixes:9c444070ec
("Open bind mount sources from the host userns") Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
204 lines
5.3 KiB
Go
204 lines
5.3 KiB
Go
package libcontainer
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"strconv"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/utils"
|
|
|
|
"github.com/sirupsen/logrus"
|
|
)
|
|
|
|
// syncType names a single message of the parent/child synchronisation
// protocol. It is the value carried in the "type" field of a serialised syncT.
type syncType string
|
|
|
|
// Constants that are used for synchronisation between the parent and child
// during container setup. They come in pairs, with the exception of procError,
// which is a generic failure response: its Arg carries a JSON-encoded initError
// that doReadSync decodes and returns as the error.
//
//	[ child ]            <-> [ parent ]
//
//	procMountPlease      --> [open(2) or open_tree(2) and configure mount]
//	  Arg: configs.Mount
//	                     <-- procMountFd
//	                           file: mountfd
//
//	procSeccomp          --> [forward fd to listenerPath]
//	  file: seccomp fd
//	                     --- no return synchronisation
//
//	procHooks            --> [run hooks]
//	                     <-- procHooksDone
//
//	procReady            --> [final setup]
//	                     <-- procRun
//
//	procSeccomp          --> [grab seccomp fd with pidfd_getfd()]
//	                     <-- procSeccompDone
//
// NOTE(review): procSeccomp is listed twice with conflicting descriptions.
// Since procSeccompDone is declared below, the first entry ("no return
// synchronisation") looks stale -- confirm against the seccomp notify code.
const (
	// Generic failure response; the message's Arg holds an initError.
	procError syncType = "procError"

	// Final-setup handshake: procReady is answered with procRun.
	procReady syncType = "procReady"
	procRun   syncType = "procRun"

	// Hook handshake: procHooks is answered with procHooksDone.
	procHooks     syncType = "procHooks"
	procHooksDone syncType = "procHooksDone"

	// Mount handshake: procMountPlease is answered with procMountFd, which
	// carries the mount file descriptor.
	procMountPlease syncType = "procMountPlease"
	procMountFd     syncType = "procMountFd"

	// Seccomp fd handshake (see the NOTE(review) above about the reply).
	procSeccomp     syncType = "procSeccomp"
	procSeccompDone syncType = "procSeccompDone"
)
|
|
|
|
// syncFlags is a bit set carried in the "flags" field of a serialised syncT.
type syncFlags int

const (
	// syncFlagHasFd marks a message that is followed by a file descriptor
	// sent separately on the same socket. doWriteSync sets or clears it to
	// match whether syncT.File is non-nil, and doReadSync uses it to decide
	// whether to receive a file after the JSON packet.
	syncFlagHasFd syncFlags = (1 << iota)
)
|
|
|
|
// syncT is one synchronisation message. Type, Flags and Arg are serialised as
// JSON; File is excluded from the JSON encoding and transferred separately.
type syncT struct {
	// Type identifies which step of the protocol this message represents.
	Type syncType `json:"type"`
	// Flags holds per-message bits; doWriteSync overwrites syncFlagHasFd
	// based on File, so callers do not need to set it themselves.
	Flags syncFlags `json:"flags"`
	// Arg is an optional, still-encoded JSON payload. It is omitted from the
	// encoded message when nil.
	Arg *json.RawMessage `json:"arg,omitempty"`
	// File is an optional file sent alongside the message.
	File *os.File `json:"-"` // passed oob through SCM_RIGHTS
}
|
|
|
|
func (s syncT) String() string {
|
|
str := "type:" + string(s.Type)
|
|
if s.Flags != 0 {
|
|
str += " flags:0b" + strconv.FormatInt(int64(s.Flags), 2)
|
|
}
|
|
if s.Arg != nil {
|
|
str += " arg:" + string(*s.Arg)
|
|
}
|
|
if s.File != nil {
|
|
str += " file:" + s.File.Name() + " (fd:" + strconv.Itoa(int(s.File.Fd())) + ")"
|
|
}
|
|
return str
|
|
}
|
|
|
|
// initError carries an error message across the sync socket. encoding/json
// cannot unmarshal into the error interface, so the text is wrapped in this
// concrete type, which itself satisfies error.
type initError struct {
	Message string `json:"message,omitempty"`
}

// Error implements the error interface by returning the wrapped message
// unchanged.
func (e initError) Error() string {
	return e.Message
}
|
|
|
|
func doWriteSync(pipe *syncSocket, sync syncT) error {
|
|
sync.Flags &= ^syncFlagHasFd
|
|
if sync.File != nil {
|
|
sync.Flags |= syncFlagHasFd
|
|
}
|
|
logrus.Debugf("writing sync %s", sync)
|
|
data, err := json.Marshal(sync)
|
|
if err != nil {
|
|
return fmt.Errorf("marshal sync %v: %w", sync.Type, err)
|
|
}
|
|
if _, err := pipe.WritePacket(data); err != nil {
|
|
return fmt.Errorf("writing sync %v: %w", sync.Type, err)
|
|
}
|
|
if sync.Flags&syncFlagHasFd != 0 {
|
|
logrus.Debugf("writing sync file %s", sync)
|
|
if err := utils.SendFile(pipe.File(), sync.File); err != nil {
|
|
return fmt.Errorf("sending file after sync %q: %w", sync.Type, err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func writeSync(pipe *syncSocket, sync syncType) error {
|
|
return doWriteSync(pipe, syncT{Type: sync})
|
|
}
|
|
|
|
func writeSyncArg(pipe *syncSocket, sync syncType, arg interface{}) error {
|
|
argJSON, err := json.Marshal(arg)
|
|
if err != nil {
|
|
return fmt.Errorf("writing sync %v: marshal argument failed: %w", sync, err)
|
|
}
|
|
argJSONMsg := json.RawMessage(argJSON)
|
|
return doWriteSync(pipe, syncT{Type: sync, Arg: &argJSONMsg})
|
|
}
|
|
|
|
func doReadSync(pipe *syncSocket) (syncT, error) {
|
|
var sync syncT
|
|
logrus.Debugf("reading sync")
|
|
packet, err := pipe.ReadPacket()
|
|
if err != nil {
|
|
if errors.Is(err, io.EOF) {
|
|
logrus.Debugf("sync pipe closed")
|
|
return sync, err
|
|
}
|
|
return sync, fmt.Errorf("reading from parent failed: %w", err)
|
|
}
|
|
if err := json.Unmarshal(packet, &sync); err != nil {
|
|
return sync, fmt.Errorf("unmarshal sync from parent failed: %w", err)
|
|
}
|
|
logrus.Debugf("read sync %s", sync)
|
|
if sync.Type == procError {
|
|
var ierr initError
|
|
if sync.Arg == nil {
|
|
return sync, errors.New("procError missing error payload")
|
|
}
|
|
if err := json.Unmarshal(*sync.Arg, &ierr); err != nil {
|
|
return sync, fmt.Errorf("unmarshal procError failed: %w", err)
|
|
}
|
|
return sync, &ierr
|
|
}
|
|
if sync.Flags&syncFlagHasFd != 0 {
|
|
logrus.Debugf("reading sync file %s", sync)
|
|
file, err := utils.RecvFile(pipe.File())
|
|
if err != nil {
|
|
return sync, fmt.Errorf("receiving fd from sync %v failed: %w", sync.Type, err)
|
|
}
|
|
sync.File = file
|
|
}
|
|
return sync, nil
|
|
}
|
|
|
|
func readSyncFull(pipe *syncSocket, expected syncType) (syncT, error) {
|
|
sync, err := doReadSync(pipe)
|
|
if err != nil {
|
|
return sync, err
|
|
}
|
|
if sync.Type != expected {
|
|
return sync, fmt.Errorf("unexpected synchronisation flag: got %q, expected %q", sync.Type, expected)
|
|
}
|
|
return sync, nil
|
|
}
|
|
|
|
func readSync(pipe *syncSocket, expected syncType) error {
|
|
sync, err := readSyncFull(pipe, expected)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if sync.Arg != nil {
|
|
return fmt.Errorf("sync %v had unexpected argument passed: %q", expected, string(*sync.Arg))
|
|
}
|
|
if sync.File != nil {
|
|
_ = sync.File.Close()
|
|
return fmt.Errorf("sync %v had unexpected file passed", sync.Type)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// parseSync runs the given callback function on each syncT received from the
|
|
// child. It will return once io.EOF is returned from the given pipe.
|
|
func parseSync(pipe *syncSocket, fn func(*syncT) error) error {
|
|
for {
|
|
sync, err := doReadSync(pipe)
|
|
if err != nil {
|
|
if errors.Is(err, io.EOF) {
|
|
break
|
|
}
|
|
return err
|
|
}
|
|
if err := fn(&sync); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|