mirror of
https://github.com/opencontainers/runc.git
synced 2025-09-26 19:41:35 +08:00
contrib: memfd-bind: add helper for memfd-sealed-bind trick
This really isn't ideal but it can be used to avoid the largest issues with the memfd-based runc binary protection. There are several caveats with using this tool, see the help page for the new binary for details. Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
This commit is contained in:
9
.gitignore
vendored
9
.gitignore
vendored
@@ -1,10 +1,11 @@
|
||||
vendor/pkg
|
||||
/runc
|
||||
/runc-*
|
||||
contrib/cmd/recvtty/recvtty
|
||||
contrib/cmd/sd-helper/sd-helper
|
||||
contrib/cmd/seccompagent/seccompagent
|
||||
contrib/cmd/fs-idmap/fs-idmap
|
||||
/contrib/cmd/recvtty/recvtty
|
||||
/contrib/cmd/sd-helper/sd-helper
|
||||
/contrib/cmd/seccompagent/seccompagent
|
||||
/contrib/cmd/fs-idmap/fs-idmap
|
||||
/contrib/cmd/memfd-bind/memfd-bind
|
||||
man/man8
|
||||
release
|
||||
Vagrantfile
|
||||
|
7
Makefile
7
Makefile
@@ -67,9 +67,9 @@ runc: runc-dmz
|
||||
$(GO_BUILD) -o runc .
|
||||
make verify-dmz-arch
|
||||
|
||||
all: runc recvtty sd-helper seccompagent fs-idmap
|
||||
all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind
|
||||
|
||||
recvtty sd-helper seccompagent fs-idmap:
|
||||
recvtty sd-helper seccompagent fs-idmap memfd-bind:
|
||||
$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@
|
||||
|
||||
static: runc-dmz
|
||||
@@ -161,10 +161,11 @@ install-man: man
|
||||
|
||||
clean:
|
||||
rm -f runc runc-* libcontainer/dmz/runc-dmz
|
||||
rm -f contrib/cmd/fs-idmap/fs-idmap
|
||||
rm -f contrib/cmd/recvtty/recvtty
|
||||
rm -f contrib/cmd/sd-helper/sd-helper
|
||||
rm -f contrib/cmd/seccompagent/seccompagent
|
||||
rm -f contrib/cmd/fs-idmap/fs-idmap
|
||||
rm -f contrib/cmd/memfd-bind/memfd-bind
|
||||
sudo rm -rf release
|
||||
rm -rf man/man8
|
||||
|
||||
|
@@ -68,13 +68,15 @@ make BUILDTAGS=""
|
||||
| Build Tag | Feature | Enabled by Default | Dependencies |
|
||||
|---------------|---------------------------------------|--------------------|---------------------|
|
||||
| `seccomp` | Syscall filtering using `libseccomp`. | yes | `libseccomp` |
|
||||
| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increases memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||
|
||||
| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary, [see `memfd-bind` for more details][contrib-memfd-bind]. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increase memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||
|
||||
|
||||
The following build tags were used earlier, but are now obsoleted:
|
||||
- **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
|
||||
- **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
|
||||
- **selinux** (since runc v1.0.0-rc93 the feature is always enabled)
|
||||
|
||||
[contrib-memfd-bind]: /contrib/cmd/memfd-bind/README.md
|
||||
|
||||
### Running the test suite
|
||||
|
||||
`runc` currently supports running its test suite via Docker.
|
||||
|
67
contrib/cmd/memfd-bind/README.md
Normal file
67
contrib/cmd/memfd-bind/README.md
Normal file
@@ -0,0 +1,67 @@
|
||||
## memfd-bind ##
|
||||
|
||||
`runc` normally has to make a binary copy of itself (or of a smaller helper
|
||||
binary called `runc-dmz`) when constructing a container process in order to
|
||||
defend against certain container runtime attacks such as CVE-2019-5736.
|
||||
|
||||
This cloned binary only exists until the container process starts (this means
|
||||
for `runc run` and `runc exec`, it only exists for a few hundred milliseconds
|
||||
-- for `runc create` it exists until `runc start` is called). However, because
|
||||
the clone is done using a memfd (or by creating files in directories that are
|
||||
likely to be a `tmpfs`), this can lead to temporary increases in *host* memory
|
||||
usage. Unless you are running on a cgroupv1 system with the cgroupv1 memory
|
||||
controller enabled and the (deprecated) `memory.move_charge_at_immigrate`
|
||||
enabled, there is no effect on the container's memory.
|
||||
|
||||
However, for certain configurations this can still be undesirable. This daemon
|
||||
allows you to create a sealed memfd copy of the `runc` binary, which will cause
|
||||
`runc` to skip all binary copying, resulting in no additional memory usage for
|
||||
each container process (instead there is a single in-memory copy of the
|
||||
binary). It should be noted that (strictly speaking) this is slightly less
|
||||
secure if you are concerned about Dirty Cow-like 0-day kernel vulnerabilities,
|
||||
but for most users the security benefit is identical.
|
||||
|
||||
The provided `memfd-bind@.service` file can be used to get systemd to manage
|
||||
this daemon. You can supply the path like so:
|
||||
|
||||
```
|
||||
% systemctl start memfd-bind@/usr/bin/runc
|
||||
```
|
||||
|
||||
Thus, there are three ways of protecting against CVE-2019-5736, in order of how
|
||||
much memory they use:
|
||||
|
||||
* `memfd-bind` only creates a single in-memory copy of the `runc` binary (about
|
||||
10MB), regardless of how many containers are running.
|
||||
|
||||
* `runc-dmz` is (depending on which libc it was compiled with) between 10kB and
|
||||
1MB in size, and a copy is created once per process spawned inside a
|
||||
container by runc (both the pid1 and every `runc exec`). There are
|
||||
circumstances where using `runc-dmz` will fail in ways that runc cannot
|
||||
predict ahead of time (such as restrictive LSMs applied to containers), in
|
||||
which case users can disable it with the `RUNC_DMZ=legacy` setting.
|
||||
`runc-dmz` also requires an additional `execve` over the other options,
|
||||
though since the binary is so small the cost is probably not even noticeable.
|
||||
|
||||
* The classic method of making a copy of the entire `runc` binary during
|
||||
container process setup takes up about 10MB per process spawned inside the
|
||||
container by runc (both pid1 and `runc exec`).
|
||||
|
||||
### Caveats ###
|
||||
|
||||
There are several downsides with using `memfd-bind` on the `runc` binary:
|
||||
|
||||
* The `memfd-bind` process needs to continue to run indefinitely in order for
|
||||
the memfd reference to stay alive. If the process is forcefully killed, the
|
||||
bind-mount on top of the `runc` binary will become stale and nobody will be
|
||||
able to execute it (you can use `memfd-bind --cleanup` to clean up the stale
|
||||
mount).
|
||||
|
||||
* Only root can execute the cloned binary due to permission restrictions on
|
||||
accessing other processes' files. More specifically, only users with ptrace
|
||||
privileges over the memfd-bind daemon can access the file (but in practice
|
||||
this is usually only root).
|
||||
|
||||
* When updating `runc`, the daemon needs to be stopped before the update (so
|
||||
the package manager can access the underlying file) and then restarted after
|
||||
the update.
|
240
contrib/cmd/memfd-bind/memfd-bind.go
Normal file
240
contrib/cmd/memfd-bind/memfd-bind.go
Normal file
@@ -0,0 +1,240 @@
|
||||
/*
|
||||
* Copyright (c) 2023 SUSE LLC
|
||||
* Copyright (c) 2023 Aleksa Sarai <cyphar@cyphar.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/signal"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/dmz"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
"github.com/urfave/cli"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// Build metadata. Both values default to the empty string and are expected to
// be filled in by the Makefile at build time.
var (
	// version is read from the VERSION file of the source code.
	version = ""

	// gitCommit is the hash of the commit that the binary was built from.
	gitCommit = ""
)
|
||||
|
||||
const (
	// usage is the long-form help text for the memfd-bind command. It
	// explains why the tool exists, how to invoke it, how to clean up after
	// it, and the access restrictions it imposes on the bound binary.
	usage = `Open Container Initiative contrib/cmd/memfd-bind

In order to protect against certain container attacks, every runc invocation
that involves creating or joining a container will cause runc to make a copy of
the runc binary in memory (usually to a memfd). While "runc init" is very
short-lived, this extra memory usage can cause problems for containers with
very small memory limits (or containers that have many "runc exec" invocations
applied to them at the same time).

memfd-bind is a tool to create a persistent memfd-sealed-copy of the runc binary,
which will cause runc to not make its own copy. This means you can get the
benefits of using a sealed memfd as runc's binary (even in a container breakout
attack to get write access to the runc binary, neither the underlying binary
nor the memfd copy can be changed).

To use memfd-bind, just specify the path to the runc binary that you want to
bind-mount a sealed memfd copy on top of:

	$ sudo memfd-bind /usr/bin/runc

Note that (due to kernel restrictions on bind-mounts), this program must remain
running on the host in order for the binary to be readable (it is recommended
you use a systemd unit to keep this process around).

If this program dies, there will be a leftover mountpoint that always returns
-EINVAL when attempting to access it. You need to use memfd-bind --cleanup on the
path in order to unmount the path (regular umount(8) will not work):

	$ sudo memfd-bind --cleanup /usr/bin/runc

Note that (due to restrictions on /proc/$pid/fd/$fd magic-link resolution),
only privileged users (specifically, those that have ptrace privileges over the
memfd-bind daemon) can access the memfd bind-mount. This means that using this
tool to harden your /usr/bin/runc binary would result in unprivileged users
being unable to execute the binary. If this is an issue, you could make all
privileged processes use a different copy of runc (by making a copy in somewhere
like /usr/sbin/runc) and only use memfd-bind for the version used by
privileged users.
`
)
|
||||
|
||||
func cleanup(path string) error {
|
||||
file, err := os.OpenFile(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cleanup: failed to open runc binary path: %w", err)
|
||||
}
|
||||
defer file.Close()
|
||||
fdPath := fmt.Sprintf("/proc/self/fd/%d", file.Fd())
|
||||
|
||||
// Keep umounting until we hit a umount error.
|
||||
for unix.Unmount(fdPath, unix.MNT_DETACH) == nil {
|
||||
// loop...
|
||||
logrus.Debugf("memfd-bind: path %q unmount succeeded...", path)
|
||||
}
|
||||
logrus.Infof("memfd-bind: path %q has been cleared of all old bind-mounts", path)
|
||||
return nil
|
||||
}
|
||||
|
||||
// memfdClone is a memfd-only implementation of dmz.CloneBinary.
|
||||
func memfdClone(path string) (*os.File, error) {
|
||||
binFile, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open runc binary path: %w", err)
|
||||
}
|
||||
defer binFile.Close()
|
||||
stat, err := binFile.Stat()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("checking %s size: %w", path, err)
|
||||
}
|
||||
size := stat.Size()
|
||||
memfd, sealFn, err := dmz.Memfd("/proc/self/exe")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating memfd failed: %w", err)
|
||||
}
|
||||
copied, err := io.Copy(memfd, binFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("copy binary: %w", err)
|
||||
} else if copied != size {
|
||||
return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size)
|
||||
}
|
||||
if err := sealFn(&memfd); err != nil {
|
||||
return nil, fmt.Errorf("could not seal fd: %w", err)
|
||||
}
|
||||
if !dmz.IsCloned(memfd) {
|
||||
return nil, fmt.Errorf("cloned memfd is not properly sealed")
|
||||
}
|
||||
return memfd, nil
|
||||
}
|
||||
|
||||
func mount(path string) error {
|
||||
memfdFile, err := memfdClone(path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("memfd clone: %w", err)
|
||||
}
|
||||
defer memfdFile.Close()
|
||||
memfdPath := fmt.Sprintf("/proc/self/fd/%d", memfdFile.Fd())
|
||||
|
||||
// We have to open an O_NOFOLLOW|O_PATH to the memfd magic-link because we
|
||||
// cannot bind-mount the memfd itself (it's in the internal kernel mount
|
||||
// namespace and cross-mount-namespace bind-mounts are not allowed). This
|
||||
// also requires that this program stay alive continuously for the
|
||||
// magic-link to stay alive...
|
||||
memfdLink, err := os.OpenFile(memfdPath, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("mount: failed to /proc/self/fd magic-link for memfd: %w", err)
|
||||
}
|
||||
defer memfdLink.Close()
|
||||
memfdLinkFdPath := fmt.Sprintf("/proc/self/fd/%d", memfdLink.Fd())
|
||||
|
||||
exeFile, err := os.OpenFile(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("mount: failed to open target runc binary path: %w", err)
|
||||
}
|
||||
defer exeFile.Close()
|
||||
exeFdPath := fmt.Sprintf("/proc/self/fd/%d", exeFile.Fd())
|
||||
|
||||
err = unix.Mount(memfdLinkFdPath, exeFdPath, "", unix.MS_BIND, "")
|
||||
if err != nil {
|
||||
return fmt.Errorf("mount: failed to mount memfd on top of runc binary path target: %w", err)
|
||||
}
|
||||
|
||||
// If there is a signal we want to do cleanup.
|
||||
sigCh := make(chan os.Signal, 1)
|
||||
signal.Notify(sigCh, os.Interrupt, unix.SIGTERM, unix.SIGINT)
|
||||
go func() {
|
||||
<-sigCh
|
||||
logrus.Infof("memfd-bind: exit signal caught! cleaning up the bind-mount on %q...", path)
|
||||
_ = cleanup(path)
|
||||
os.Exit(0)
|
||||
}()
|
||||
|
||||
// Clean up things we don't need...
|
||||
_ = exeFile.Close()
|
||||
_ = memfdLink.Close()
|
||||
|
||||
// We now have to stay alive to keep the magic-link alive...
|
||||
logrus.Infof("memfd-bind: bind-mount of memfd over %q created -- looping forever!", path)
|
||||
for {
|
||||
// loop forever...
|
||||
time.Sleep(time.Duration(1<<63 - 1))
|
||||
// make sure the memfd isn't gc'd
|
||||
runtime.KeepAlive(memfdFile)
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
app := cli.NewApp()
|
||||
app.Name = "memfd-bind"
|
||||
app.Usage = usage
|
||||
|
||||
// Set version to be the same as runC.
|
||||
var v []string
|
||||
if version != "" {
|
||||
v = append(v, version)
|
||||
}
|
||||
if gitCommit != "" {
|
||||
v = append(v, "commit: "+gitCommit)
|
||||
}
|
||||
app.Version = strings.Join(v, "\n")
|
||||
|
||||
// Set the flags.
|
||||
app.Flags = []cli.Flag{
|
||||
cli.BoolFlag{
|
||||
Name: "cleanup",
|
||||
Usage: "Do not create a new memfd-sealed file, only clean up an existing one at <path>.",
|
||||
},
|
||||
cli.BoolFlag{
|
||||
Name: "debug",
|
||||
Usage: "Enable debug logging.",
|
||||
},
|
||||
}
|
||||
|
||||
app.Action = func(ctx *cli.Context) error {
|
||||
args := ctx.Args()
|
||||
if len(args) != 1 {
|
||||
return errors.New("need to specify a single path to the runc binary")
|
||||
}
|
||||
path := ctx.Args()[0]
|
||||
|
||||
if ctx.Bool("debug") {
|
||||
logrus.SetLevel(logrus.DebugLevel)
|
||||
}
|
||||
|
||||
err := cleanup(path)
|
||||
// We only care about cleanup errors when doing --cleanup.
|
||||
if ctx.Bool("cleanup") {
|
||||
return err
|
||||
}
|
||||
return mount(path)
|
||||
}
|
||||
if err := app.Run(os.Args); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "memfd-bind: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
11
contrib/cmd/memfd-bind/memfd-bind@.service
Normal file
11
contrib/cmd/memfd-bind/memfd-bind@.service
Normal file
@@ -0,0 +1,11 @@
|
||||
[Unit]
|
||||
Description=Manage memfd-bind of %I
|
||||
Documentation=https://github.com/opencontainers/runc
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=memfd-bind "%I"
|
||||
ExecStop=memfd-bind --cleanup "%I"
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
Reference in New Issue
Block a user