mirror of
https://github.com/opencontainers/runc.git
synced 2025-09-27 03:46:19 +08:00

The original reasoning for this option was to avoid having mount options be overwritten by runc. However, adding command-line arguments has historically been a bad idea because it forces strict-runc-compatible OCI runtimes to copy out-of-spec features directly from runc and these flags are usually quite difficult for users to enable when using runc through several layers of engines and orchestrators. A far preferable solution is to have a heuristic which detects whether copying the original mount's mount options would override an explicit mount option specified by the user. In this case, we should return an error. You only end up in this path in the userns case, if you have a bind-mount source with locked flags. During the course of writing this patch, I discovered that several aspects of our handling of flags for bind-mounts left much to be desired. We have completely botched the handling of explicitly cleared flags since commit 97f5ee4e6a
("Only remount if requested flags differ from current"), with our behaviour only becoming increasingly weird with 50105de1d8
("Fix failure with rw bind mount of a ro fuse") and da780e4d27
("Fix bind mounts of filesystems with certain options set"). In short, we would only clear flags explicitly requested by the user purely by chance, in ways that it really should've been reported to us by now. The most egregious is that mounts explicitly marked "rw" were actually mounted "ro" if the bind-mount source was "ro" and no other special flags were included. In addition, our handling of atime was completely broken -- mostly due to how subtle the semantics of atime are on Linux. Unfortunately, while the runtime-spec requires us to implement mount(8)'s behaviour, several aspects of the util-linux mount(8)'s behaviour are broken and thus copying them makes little sense. Since the runtime-spec behaviour for this case (should mount options for a "bind" mount use the "mount --bind -o ..." or "mount --bind -o remount,..." semantics? Is the fallback code we have for userns actually spec-compliant?) and the mount(8) behaviour (see [1]) are not well-defined, this commit simply fixes the most obvious aspects of the behaviour that are broken while keeping the current spirit of the implementation. NOTE: The handling of atime in the base case is left for a future PR to deal with. This means that the atime of the source mount will be silently left alone unless the fallback path needs to be taken, and any flags not explicitly set will be cleared in the base case. Whether we should always be operating as "mount --bind -o remount,..." (where we default to the original mount source flags) is a topic for a separate PR and (probably) associated runtime-spec PR. So, to resolve this: * We store which flags were explicitly requested to be cleared by the user, so that we can detect whether the userns fallback path would end up setting a flag the user explicitly wished to clear. If so, we return an error because we couldn't fulfil the configuration settings. * Revert 97f5ee4e6a
("Only remount if requested flags differ from current"), as missing flags do not mean we can skip MS_REMOUNT (in fact, missing flags are how you indicate a flag needs to be cleared with mount(2)). The original purpose of the patch was to fix the userns issue, but as mentioned above the correct mechanism is to do a fallback mount that copies the lockable flags from statfs(2). * Improve handling of atime in the fallback case by: - Correctly handling the returned flags in statfs(2). - Implementing the MNT_LOCK_ATIME checks in our code to ensure we produce errors rather than silently producing incorrect atime mounts. * Improve the tests so we correctly detect all of these contingencies, including a general "bind-mount atime handling" test to ensure that the behaviour described here is accurate. This change also inlines the remount() function -- it was only ever used for the bind-mount remount case, and its behaviour is very bind-mount specific. [1]: https://github.com/util-linux/util-linux/issues/2433 Reverts: 97f5ee4e6a
("Only remount if requested flags differ from current") Fixes: 50105de1d8
("Fix failure with rw bind mount of a ro fuse") Fixes: da780e4d27
("Fix bind mounts of filesystems with certain options set") Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
401 lines
10 KiB
Go
401 lines
10 KiB
Go
package main
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"net"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
|
|
"github.com/coreos/go-systemd/v22/activation"
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
selinux "github.com/opencontainers/selinux/go-selinux"
|
|
"github.com/sirupsen/logrus"
|
|
"github.com/urfave/cli"
|
|
"golang.org/x/sys/unix"
|
|
|
|
"github.com/opencontainers/runc/libcontainer"
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
|
"github.com/opencontainers/runc/libcontainer/specconv"
|
|
"github.com/opencontainers/runc/libcontainer/utils"
|
|
)
|
|
|
|
// errEmptyID is returned when no container id was supplied as the first
// command-line argument.
var errEmptyID = errors.New("container id cannot be empty")
|
|
|
|
// getContainer returns the specified container instance by loading it from
|
|
// a state directory (root).
|
|
func getContainer(context *cli.Context) (*libcontainer.Container, error) {
|
|
id := context.Args().First()
|
|
if id == "" {
|
|
return nil, errEmptyID
|
|
}
|
|
root := context.GlobalString("root")
|
|
return libcontainer.Load(root, id)
|
|
}
|
|
|
|
// getDefaultImagePath returns the "checkpoint" directory path located under
// the current working directory. It panics if the working directory cannot
// be determined.
func getDefaultImagePath() string {
	workDir, err := os.Getwd()
	if err == nil {
		return filepath.Join(workDir, "checkpoint")
	}
	panic(err)
}
|
|
|
|
// newProcess returns a new libcontainer Process with the arguments from the
|
|
// spec and stdio from the current process.
|
|
func newProcess(p specs.Process) (*libcontainer.Process, error) {
|
|
lp := &libcontainer.Process{
|
|
Args: p.Args,
|
|
Env: p.Env,
|
|
// TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
|
|
User: fmt.Sprintf("%d:%d", p.User.UID, p.User.GID),
|
|
Cwd: p.Cwd,
|
|
Label: p.SelinuxLabel,
|
|
NoNewPrivileges: &p.NoNewPrivileges,
|
|
AppArmorProfile: p.ApparmorProfile,
|
|
}
|
|
|
|
if p.ConsoleSize != nil {
|
|
lp.ConsoleWidth = uint16(p.ConsoleSize.Width)
|
|
lp.ConsoleHeight = uint16(p.ConsoleSize.Height)
|
|
}
|
|
|
|
if p.Scheduler != nil {
|
|
s := *p.Scheduler
|
|
lp.Scheduler = &s
|
|
}
|
|
|
|
if p.Capabilities != nil {
|
|
lp.Capabilities = &configs.Capabilities{}
|
|
lp.Capabilities.Bounding = p.Capabilities.Bounding
|
|
lp.Capabilities.Effective = p.Capabilities.Effective
|
|
lp.Capabilities.Inheritable = p.Capabilities.Inheritable
|
|
lp.Capabilities.Permitted = p.Capabilities.Permitted
|
|
lp.Capabilities.Ambient = p.Capabilities.Ambient
|
|
}
|
|
for _, gid := range p.User.AdditionalGids {
|
|
lp.AdditionalGroups = append(lp.AdditionalGroups, strconv.FormatUint(uint64(gid), 10))
|
|
}
|
|
for _, rlimit := range p.Rlimits {
|
|
rl, err := createLibContainerRlimit(rlimit)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
lp.Rlimits = append(lp.Rlimits, rl)
|
|
}
|
|
return lp, nil
|
|
}
|
|
|
|
func destroy(container *libcontainer.Container) {
|
|
if err := container.Destroy(); err != nil {
|
|
logrus.Error(err)
|
|
}
|
|
}
|
|
|
|
// setupIO modifies the given process config according to the options.
|
|
func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, detach bool, sockpath string) (*tty, error) {
|
|
if createTTY {
|
|
process.Stdin = nil
|
|
process.Stdout = nil
|
|
process.Stderr = nil
|
|
t := &tty{}
|
|
if !detach {
|
|
if err := t.initHostConsole(); err != nil {
|
|
return nil, err
|
|
}
|
|
parent, child, err := utils.NewSockPair("console")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
process.ConsoleSocket = child
|
|
t.postStart = append(t.postStart, parent, child)
|
|
t.consoleC = make(chan error, 1)
|
|
go func() {
|
|
t.consoleC <- t.recvtty(parent)
|
|
}()
|
|
} else {
|
|
// the caller of runc will handle receiving the console master
|
|
conn, err := net.Dial("unix", sockpath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
uc, ok := conn.(*net.UnixConn)
|
|
if !ok {
|
|
return nil, errors.New("casting to UnixConn failed")
|
|
}
|
|
t.postStart = append(t.postStart, uc)
|
|
socket, err := uc.File()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
t.postStart = append(t.postStart, socket)
|
|
process.ConsoleSocket = socket
|
|
}
|
|
return t, nil
|
|
}
|
|
// when runc will detach the caller provides the stdio to runc via runc's 0,1,2
|
|
// and the container's process inherits runc's stdio.
|
|
if detach {
|
|
inheritStdio(process)
|
|
return &tty{}, nil
|
|
}
|
|
return setupProcessPipes(process, rootuid, rootgid)
|
|
}
|
|
|
|
// createPidFile creates a file with the processes pid inside it atomically
|
|
// it creates a temp file with the paths filename + '.' infront of it
|
|
// then renames the file
|
|
func createPidFile(path string, process *libcontainer.Process) error {
|
|
pid, err := process.Pid()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
var (
|
|
tmpDir = filepath.Dir(path)
|
|
tmpName = filepath.Join(tmpDir, "."+filepath.Base(path))
|
|
)
|
|
f, err := os.OpenFile(tmpName, os.O_RDWR|os.O_CREATE|os.O_EXCL|os.O_SYNC, 0o666)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
_, err = f.WriteString(strconv.Itoa(pid))
|
|
f.Close()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return os.Rename(tmpName, path)
|
|
}
|
|
|
|
func createContainer(context *cli.Context, id string, spec *specs.Spec) (*libcontainer.Container, error) {
|
|
rootlessCg, err := shouldUseRootlessCgroupManager(context)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
|
|
CgroupName: id,
|
|
UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
|
|
NoPivotRoot: context.Bool("no-pivot"),
|
|
NoNewKeyring: context.Bool("no-new-keyring"),
|
|
Spec: spec,
|
|
RootlessEUID: os.Geteuid() != 0,
|
|
RootlessCgroups: rootlessCg,
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
root := context.GlobalString("root")
|
|
return libcontainer.Create(root, id, config)
|
|
}
|
|
|
|
type runner struct {
|
|
init bool
|
|
enableSubreaper bool
|
|
shouldDestroy bool
|
|
detach bool
|
|
listenFDs []*os.File
|
|
preserveFDs int
|
|
pidFile string
|
|
consoleSocket string
|
|
container *libcontainer.Container
|
|
action CtAct
|
|
notifySocket *notifySocket
|
|
criuOpts *libcontainer.CriuOpts
|
|
subCgroupPaths map[string]string
|
|
}
|
|
|
|
func (r *runner) run(config *specs.Process) (int, error) {
|
|
var err error
|
|
defer func() {
|
|
if err != nil {
|
|
r.destroy()
|
|
}
|
|
}()
|
|
if err = r.checkTerminal(config); err != nil {
|
|
return -1, err
|
|
}
|
|
process, err := newProcess(*config)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
process.LogLevel = strconv.Itoa(int(logrus.GetLevel()))
|
|
// Populate the fields that come from runner.
|
|
process.Init = r.init
|
|
process.SubCgroupPaths = r.subCgroupPaths
|
|
if len(r.listenFDs) > 0 {
|
|
process.Env = append(process.Env, "LISTEN_FDS="+strconv.Itoa(len(r.listenFDs)), "LISTEN_PID=1")
|
|
process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
|
|
}
|
|
baseFd := 3 + len(process.ExtraFiles)
|
|
for i := baseFd; i < baseFd+r.preserveFDs; i++ {
|
|
_, err = os.Stat("/proc/self/fd/" + strconv.Itoa(i))
|
|
if err != nil {
|
|
return -1, fmt.Errorf("unable to stat preserved-fd %d (of %d): %w", i-baseFd, r.preserveFDs, err)
|
|
}
|
|
process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
|
|
}
|
|
rootuid, err := r.container.Config().HostRootUID()
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
rootgid, err := r.container.Config().HostRootGID()
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
detach := r.detach || (r.action == CT_ACT_CREATE)
|
|
// Setting up IO is a two stage process. We need to modify process to deal
|
|
// with detaching containers, and then we get a tty after the container has
|
|
// started.
|
|
handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
|
|
tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
defer tty.Close()
|
|
|
|
switch r.action {
|
|
case CT_ACT_CREATE:
|
|
err = r.container.Start(process)
|
|
case CT_ACT_RESTORE:
|
|
err = r.container.Restore(process, r.criuOpts)
|
|
case CT_ACT_RUN:
|
|
err = r.container.Run(process)
|
|
default:
|
|
panic("Unknown action")
|
|
}
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
if err = tty.waitConsole(); err != nil {
|
|
r.terminate(process)
|
|
return -1, err
|
|
}
|
|
tty.ClosePostStart()
|
|
if r.pidFile != "" {
|
|
if err = createPidFile(r.pidFile, process); err != nil {
|
|
r.terminate(process)
|
|
return -1, err
|
|
}
|
|
}
|
|
status, err := handler.forward(process, tty, detach)
|
|
if err != nil {
|
|
r.terminate(process)
|
|
}
|
|
if detach {
|
|
return 0, nil
|
|
}
|
|
if err == nil {
|
|
r.destroy()
|
|
}
|
|
return status, err
|
|
}
|
|
|
|
func (r *runner) destroy() {
|
|
if r.shouldDestroy {
|
|
destroy(r.container)
|
|
}
|
|
}
|
|
|
|
func (r *runner) terminate(p *libcontainer.Process) {
|
|
_ = p.Signal(unix.SIGKILL)
|
|
_, _ = p.Wait()
|
|
}
|
|
|
|
func (r *runner) checkTerminal(config *specs.Process) error {
|
|
detach := r.detach || (r.action == CT_ACT_CREATE)
|
|
// Check command-line for sanity.
|
|
if detach && config.Terminal && r.consoleSocket == "" {
|
|
return errors.New("cannot allocate tty if runc will detach without setting console socket")
|
|
}
|
|
if (!detach || !config.Terminal) && r.consoleSocket != "" {
|
|
return errors.New("cannot use console socket if runc will not detach or allocate tty")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func validateProcessSpec(spec *specs.Process) error {
|
|
if spec == nil {
|
|
return errors.New("process property must not be empty")
|
|
}
|
|
if spec.Cwd == "" {
|
|
return errors.New("Cwd property must not be empty")
|
|
}
|
|
if !filepath.IsAbs(spec.Cwd) {
|
|
return errors.New("Cwd must be an absolute path")
|
|
}
|
|
if len(spec.Args) == 0 {
|
|
return errors.New("args must not be empty")
|
|
}
|
|
if spec.SelinuxLabel != "" && !selinux.GetEnabled() {
|
|
return errors.New("selinux label is specified in config, but selinux is disabled or not supported")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// CtAct identifies which container action a runner performs.
type CtAct uint8

// Container actions. Values start at 1, so the zero value of CtAct does not
// correspond to any action.
const (
	CT_ACT_CREATE CtAct = 1 + iota
	CT_ACT_RUN
	CT_ACT_RESTORE
)
|
|
|
|
func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
|
|
if err := revisePidFile(context); err != nil {
|
|
return -1, err
|
|
}
|
|
spec, err := setupSpec(context)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
|
|
id := context.Args().First()
|
|
if id == "" {
|
|
return -1, errEmptyID
|
|
}
|
|
|
|
notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id)
|
|
if notifySocket != nil {
|
|
notifySocket.setupSpec(spec)
|
|
}
|
|
|
|
container, err := createContainer(context, id, spec)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
|
|
if notifySocket != nil {
|
|
if err := notifySocket.setupSocketDirectory(); err != nil {
|
|
return -1, err
|
|
}
|
|
if action == CT_ACT_RUN {
|
|
if err := notifySocket.bindSocket(); err != nil {
|
|
return -1, err
|
|
}
|
|
}
|
|
}
|
|
|
|
// Support on-demand socket activation by passing file descriptors into the container init process.
|
|
listenFDs := []*os.File{}
|
|
if os.Getenv("LISTEN_FDS") != "" {
|
|
listenFDs = activation.Files(false)
|
|
}
|
|
|
|
r := &runner{
|
|
enableSubreaper: !context.Bool("no-subreaper"),
|
|
shouldDestroy: !context.Bool("keep"),
|
|
container: container,
|
|
listenFDs: listenFDs,
|
|
notifySocket: notifySocket,
|
|
consoleSocket: context.String("console-socket"),
|
|
detach: context.Bool("detach"),
|
|
pidFile: context.String("pid-file"),
|
|
preserveFDs: context.Int("preserve-fds"),
|
|
action: action,
|
|
criuOpts: criuOpts,
|
|
init: true,
|
|
}
|
|
return r.run(spec.Process)
|
|
}
|