mirror of
https://github.com/opencontainers/runc.git
synced 2025-10-08 17:00:13 +08:00
runc-dmz: reduce memfd binary cloning cost with small C binary
The idea is to remove the need for cloning the entire runc binary by
replacing the final execve() call of the container process with an
execve() call to a clone of a small C binary which just does an execve()
of its arguments.
This provides similar protection against CVE-2019-5736 but without
requiring a >10MB binary copy for each "runc init". When compiled with
musl, runc-dmz is 13kB (though unfortunately with glibc, it is 1.1MB
which is still quite large).
It should be noted that there is still a window where the container
processes could get access to the host runc binary, but because we set
ourselves as non-dumpable the container would need CAP_SYS_PTRACE (which
is not enabled by default in Docker) in order to get around the
proc_fd_access_allowed() checks. In addition, since Linux 4.10[1] the
kernel blocks access entirely for user namespaced containers in this
scenario. For those cases we cannot use runc-dmz, but most containers
won't have this issue.
This new runc-dmz binary can be opted out of at compile time by setting
the "runc_nodmz" buildtag, and at runtime by setting the RUNC_DMZ=legacy
environment variable. In both cases, runc will fall back to the classic
/proc/self/exe-based cloning trick. If /proc/self/exe is already a
sealed memfd (namely if the user is using contrib/cmd/memfd-bind to
create a persistent sealed memfd for runc), neither runc-dmz nor
/proc/self/exe cloning will be used because they are not necessary.
[1]: bfedb58925
Co-authored-by: lifubang <lifubang@acmcoder.com>
Signed-off-by: lifubang <lifubang@acmcoder.com>
[cyphar: address various review nits]
[cyphar: fix runc-dmz cross-compilation]
[cyphar: embed runc-dmz into runc binary and clone in Go code]
[cyphar: make runc-dmz optional, with fallback to /proc/self/exe cloning]
[cyphar: do not use runc-dmz when the container has certain privs]
Co-authored-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
This commit is contained in:
17
.github/workflows/test.yml
vendored
17
.github/workflows/test.yml
vendored
@@ -28,6 +28,7 @@ jobs:
|
||||
rootless: ["rootless", ""]
|
||||
race: ["-race", ""]
|
||||
criu: ["", "criu-dev"]
|
||||
dmz: ["", "runc_nodmz"]
|
||||
exclude:
|
||||
- criu: criu-dev
|
||||
rootless: rootless
|
||||
@@ -35,6 +36,10 @@ jobs:
|
||||
go-version: 1.20.x
|
||||
- criu: criu-dev
|
||||
race: -race
|
||||
- dmz: runc_nodmz
|
||||
criu: criu-dev
|
||||
- dmz: runc_nodmz
|
||||
os: ubuntu-20.04
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
@@ -71,6 +76,8 @@ jobs:
|
||||
go-version: ${{ matrix.go-version }}
|
||||
|
||||
- name: build
|
||||
env:
|
||||
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
|
||||
run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all
|
||||
|
||||
- name: install bats
|
||||
@@ -80,6 +87,8 @@ jobs:
|
||||
|
||||
- name: unit test
|
||||
if: matrix.rootless != 'rootless'
|
||||
env:
|
||||
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
|
||||
run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest
|
||||
|
||||
- name: add rootless user
|
||||
@@ -113,8 +122,12 @@ jobs:
|
||||
# However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff.
|
||||
# We are not interested in providing official support for i386.
|
||||
cross-i386:
|
||||
runs-on: ubuntu-22.04
|
||||
timeout-minutes: 15
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
dmz: ["", "runc_nodmz"]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
|
||||
@@ -136,4 +149,6 @@ jobs:
|
||||
go-version: 1.x # Latest stable
|
||||
|
||||
- name: unit test
|
||||
env:
|
||||
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
|
||||
run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest
|
||||
|
@@ -7,6 +7,7 @@
|
||||
run:
|
||||
build-tags:
|
||||
- seccomp
|
||||
- runc_nodmz
|
||||
|
||||
linters:
|
||||
disable-all: true
|
||||
|
@@ -3,6 +3,7 @@
|
||||
run:
|
||||
build-tags:
|
||||
- seccomp
|
||||
- runc_nodmz
|
||||
|
||||
linters:
|
||||
enable:
|
||||
|
35
Makefile
35
Makefile
@@ -1,6 +1,11 @@
|
||||
SHELL = /bin/bash
|
||||
|
||||
CONTAINER_ENGINE := docker
|
||||
GO ?= go
|
||||
|
||||
# Get CC values for cross-compilation.
|
||||
include cc_platform.mk
|
||||
|
||||
PREFIX ?= /usr/local
|
||||
BINDIR := $(PREFIX)/sbin
|
||||
MANDIR := $(PREFIX)/share/man
|
||||
@@ -10,6 +15,7 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
|
||||
RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
|
||||
PROJECT := github.com/opencontainers/runc
|
||||
BUILDTAGS ?= seccomp urfave_cli_no_docs
|
||||
BUILDTAGS += $(EXTRA_BUILDTAGS)
|
||||
|
||||
COMMIT ?= $(shell git describe --dirty --long --always)
|
||||
VERSION := $(shell cat ./VERSION)
|
||||
@@ -57,16 +63,23 @@ endif
|
||||
|
||||
.DEFAULT: runc
|
||||
|
||||
runc:
|
||||
# Build the runc binary (after generating the embedded runc-dmz helper), then
# check that both binaries target the same architecture. The recursive call
# goes through $(MAKE) so that it uses the same make program and inherits the
# command-line flags/jobserver of the invoking make.
runc: runc-dmz
	$(GO_BUILD) -o runc .
	$(MAKE) verify-dmz-arch
|
||||
|
||||
all: runc recvtty sd-helper seccompagent fs-idmap
|
||||
|
||||
recvtty sd-helper seccompagent fs-idmap:
|
||||
$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@
|
||||
|
||||
static:
|
||||
# Build a statically linked runc binary (after generating the embedded
# runc-dmz helper), then check that both binaries target the same
# architecture. The recursive call goes through $(MAKE) so that it uses the
# same make program and inherits the flags/jobserver of the invoking make.
static: runc-dmz
	$(GO_BUILD_STATIC) -o runc .
	$(MAKE) verify-dmz-arch
|
||||
|
||||
.PHONY: runc-dmz
|
||||
runc-dmz:
|
||||
rm -f libcontainer/dmz/runc-dmz
|
||||
$(GO) generate -tags "$(BUILDTAGS)" ./libcontainer/dmz
|
||||
|
||||
releaseall: RELEASE_ARGS := "-a 386 -a amd64 -a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
|
||||
releaseall: release
|
||||
@@ -147,12 +160,12 @@ install-man: man
|
||||
install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8
|
||||
|
||||
clean:
|
||||
rm -f runc runc-*
|
||||
rm -f runc runc-* libcontainer/dmz/runc-dmz
|
||||
rm -f contrib/cmd/recvtty/recvtty
|
||||
rm -f contrib/cmd/sd-helper/sd-helper
|
||||
rm -f contrib/cmd/seccompagent/seccompagent
|
||||
rm -f contrib/cmd/fs-idmap/fs-idmap
|
||||
rm -rf release
|
||||
sudo rm -rf release
|
||||
rm -rf man/man8
|
||||
|
||||
cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
|
||||
@@ -188,6 +201,18 @@ verify-dependencies: vendor
|
||||
@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
|
||||
|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
|
||||
&& echo "all vendor files are up to date."
|
||||
verify-dmz-arch:
|
||||
@test -s libcontainer/dmz/runc-dmz || exit 0; \
|
||||
set -Eeuo pipefail; \
|
||||
export LC_ALL=C; \
|
||||
echo "readelf -h runc"; \
|
||||
readelf -h runc | grep -E "(Machine|Flags):"; \
|
||||
echo "readelf -h libcontainer/dmz/runc-dmz"; \
|
||||
readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):"; \
|
||||
diff -u \
|
||||
<(readelf -h runc | grep -E "(Machine|Flags):") \
|
||||
<(readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):") \
|
||||
&& echo "runc-dmz architecture matches runc binary."
|
||||
|
||||
validate-keyring:
|
||||
script/keyring_validate.sh
|
||||
@@ -197,4 +222,4 @@ validate-keyring:
|
||||
test localtest unittest localunittest integration localintegration \
|
||||
rootlessintegration localrootlessintegration shell install install-bash \
|
||||
install-man clean cfmt shfmt localshfmt shellcheck \
|
||||
vendor verify-changelog verify-dependencies validate-keyring
|
||||
vendor verify-changelog verify-dependencies verify-dmz-arch validate-keyring
|
||||
|
@@ -65,9 +65,10 @@ e.g. to disable seccomp:
|
||||
make BUILDTAGS=""
|
||||
```
|
||||
|
||||
| Build Tag | Feature | Enabled by default | Dependency |
|
||||
|-----------|------------------------------------|--------------------|------------|
|
||||
| seccomp | Syscall filtering | yes | libseccomp |
|
||||
| Build Tag | Feature | Enabled by Default | Dependencies |
|
||||
|---------------|---------------------------------------|--------------------|---------------------|
|
||||
| `seccomp` | Syscall filtering using `libseccomp`. | yes | `libseccomp` |
|
||||
| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increase memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||
|
||||
|
||||
The following build tags were used earlier, but are now obsoleted:
|
||||
- **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
|
||||
|
61
cc_platform.mk
Normal file
61
cc_platform.mk
Normal file
@@ -0,0 +1,61 @@
|
||||
# NOTE: Make sure you keep this file in sync with scripts/lib.sh.
|
||||
|
||||
GO ?= go
|
||||
GOARCH ?= $(shell $(GO) env GOARCH)
|
||||
|
||||
ifneq ($(shell grep -i "ID_LIKE=.*suse" /etc/os-release),)
|
||||
# openSUSE has a custom PLATFORM
|
||||
PLATFORM ?= suse-linux
|
||||
IS_SUSE := 1
|
||||
else
|
||||
PLATFORM ?= linux-gnu
|
||||
endif
|
||||
|
||||
ifeq ($(GOARCH),$(shell GOARCH= $(GO) env GOARCH))
|
||||
# use the native CC and STRIP
|
||||
HOST :=
|
||||
else ifeq ($(GOARCH),386)
|
||||
# Always use the 64-bit compiler to build the 386 binary, which works for
|
||||
# the more common cross-build method for x86 (namely, the equivalent of
|
||||
# dpkg --add-architecture).
|
||||
ifdef IS_SUSE
|
||||
# There is no x86_64-suse-linux-gcc, so use the native one.
|
||||
HOST :=
|
||||
CPU_TYPE := i586
|
||||
else
|
||||
HOST := x86_64-$(PLATFORM)-
|
||||
CPU_TYPE := i686
|
||||
endif
|
||||
CFLAGS := -m32 -march=$(CPU_TYPE) $(CFLAGS)
|
||||
else ifeq ($(GOARCH),amd64)
|
||||
ifdef IS_SUSE
|
||||
# There is no x86_64-suse-linux-gcc, so use the native one.
|
||||
HOST :=
|
||||
else
|
||||
HOST := x86_64-$(PLATFORM)-
|
||||
endif
|
||||
else ifeq ($(GOARCH),arm64)
|
||||
HOST := aarch64-$(PLATFORM)-
|
||||
else ifeq ($(GOARCH),arm)
|
||||
# HOST already configured by release_build.sh in this case.
|
||||
else ifeq ($(GOARCH),armel)
|
||||
HOST := arm-$(PLATFORM)eabi-
|
||||
else ifeq ($(GOARCH),armhf)
|
||||
HOST := arm-$(PLATFORM)eabihf-
|
||||
else ifeq ($(GOARCH),ppc64le)
|
||||
HOST := powerpc64le-$(PLATFORM)-
|
||||
else ifeq ($(GOARCH),riscv64)
|
||||
HOST := riscv64-$(PLATFORM)-
|
||||
else ifeq ($(GOARCH),s390x)
|
||||
HOST := s390x-$(PLATFORM)-
|
||||
else
|
||||
$(error Unsupported GOARCH $(GOARCH))
|
||||
endif
|
||||
|
||||
ifeq ($(origin CC),$(filter $(origin CC),undefined default))
|
||||
# Override CC if it's undefined or just the default value set by Make.
|
||||
CC := $(HOST)gcc
|
||||
export CC
|
||||
endif
|
||||
STRIP ?= $(HOST)strip
|
||||
export STRIP
|
@@ -27,6 +27,7 @@ import (
|
||||
"github.com/opencontainers/runc/libcontainer/dmz"
|
||||
"github.com/opencontainers/runc/libcontainer/intelrdt"
|
||||
"github.com/opencontainers/runc/libcontainer/system"
|
||||
"github.com/opencontainers/runc/libcontainer/system/kernelversion"
|
||||
"github.com/opencontainers/runc/libcontainer/utils"
|
||||
)
|
||||
|
||||
@@ -444,6 +445,48 @@ func (c *Container) includeExecFifo(cmd *exec.Cmd) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// slicesContains reports whether needle is equal to at least one element of
// slice. A nil or empty slice never contains anything.
//
// No longer needed in Go 1.21.
func slicesContains[S ~[]E, E comparable](slice S, needle E) bool {
	for i := 0; i < len(slice); i++ {
		if slice[i] == needle {
			return true
		}
	}
	return false
}
|
||||
|
||||
func isDmzBinarySafe(c *configs.Config) bool {
|
||||
// Because we set the dumpable flag in nsexec, the only time when it is
|
||||
// unsafe to use runc-dmz is when the container process would be able to
|
||||
// race against "runc init" and bypass the ptrace_may_access() checks.
|
||||
//
|
||||
// This is only the case if the container processes could have
|
||||
// CAP_SYS_PTRACE somehow (i.e. the capability is present in the bounding,
|
||||
// inheritable, or ambient sets). Luckily, most containers do not have this
|
||||
// capability.
|
||||
if c.Capabilities == nil ||
|
||||
(!slicesContains(c.Capabilities.Bounding, "CAP_SYS_PTRACE") &&
|
||||
!slicesContains(c.Capabilities.Inheritable, "CAP_SYS_PTRACE") &&
|
||||
!slicesContains(c.Capabilities.Ambient, "CAP_SYS_PTRACE")) {
|
||||
return true
|
||||
}
|
||||
|
||||
// Since Linux 4.10 (see bfedb589252c0) user namespaced containers cannot
|
||||
// access /proc/$pid/exe of runc after it joins the namespace (until it
|
||||
// does an exec), regardless of the capability set. This has been
|
||||
// backported to other distribution kernels, but there's no way of checking
|
||||
// this cheaply -- better to be safe than sorry here.
|
||||
linux410 := kernelversion.KernelVersion{Kernel: 4, Major: 10}
|
||||
if ok, err := kernelversion.GreaterEqualThan(linux410); ok && err == nil {
|
||||
if c.Namespaces.Contains(configs.NEWUSER) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// Assume it's unsafe otherwise.
|
||||
return false
|
||||
}
|
||||
|
||||
func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
|
||||
parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
|
||||
if err != nil {
|
||||
@@ -457,27 +500,54 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
|
||||
}
|
||||
logFilePair := filePair{parentLogPipe, childLogPipe}
|
||||
|
||||
// Make sure we use a new safe copy of /proc/self/exe each time this is
|
||||
// called, to make sure that if a container manages to overwrite the file
|
||||
// it cannot affect other containers on the system. For runc, this code
|
||||
// will only ever be called once, but libcontainer users might call this
|
||||
// more than once.
|
||||
// Make sure we use a new safe copy of /proc/self/exe or the runc-dmz
|
||||
// binary each time this is called, to make sure that if a container
|
||||
// manages to overwrite the file it cannot affect other containers on the
|
||||
// system. For runc, this code will only ever be called once, but
|
||||
// libcontainer users might call this more than once.
|
||||
p.closeClonedExes()
|
||||
var (
|
||||
exePath string
|
||||
safeExe *os.File
|
||||
// only one of dmzExe or safeExe are used at a time
|
||||
dmzExe, safeExe *os.File
|
||||
)
|
||||
if dmz.IsSelfExeCloned() {
|
||||
// /proc/self/exe is already a cloned binary -- no need to do anything
|
||||
logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
|
||||
exePath = "/proc/self/exe"
|
||||
} else {
|
||||
safeExe, err = dmz.CloneSelfExe(c.root)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
|
||||
var err error
|
||||
if isDmzBinarySafe(c.config) {
|
||||
dmzExe, err = dmz.Binary(c.root)
|
||||
if err == nil {
|
||||
// We can use our own executable without cloning if we are using
|
||||
// runc-dmz.
|
||||
exePath = "/proc/self/exe"
|
||||
p.clonedExes = append(p.clonedExes, dmzExe)
|
||||
} else if errors.Is(err, dmz.ErrNoDmzBinary) {
|
||||
logrus.Debug("runc-dmz binary not embedded in runc binary, falling back to /proc/self/exe clone")
|
||||
} else if err != nil {
|
||||
return nil, fmt.Errorf("failed to create runc-dmz binary clone: %w", err)
|
||||
}
|
||||
} else {
|
||||
// If the configuration makes it unsafe to use runc-dmz, pretend we
|
||||
// don't have it embedded so we do /proc/self/exe cloning.
|
||||
logrus.Debug("container configuration unsafe for runc-dmz, falling back to /proc/self/exe clone")
|
||||
err = dmz.ErrNoDmzBinary
|
||||
}
|
||||
if errors.Is(err, dmz.ErrNoDmzBinary) {
|
||||
safeExe, err = dmz.CloneSelfExe(c.root)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
|
||||
}
|
||||
exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
|
||||
p.clonedExes = append(p.clonedExes, safeExe)
|
||||
}
|
||||
// Just to make sure we don't run without protection.
|
||||
if dmzExe == nil && safeExe == nil {
|
||||
// This should never happen.
|
||||
return nil, fmt.Errorf("[internal error] attempted to spawn a container with no /proc/self/exe protection")
|
||||
}
|
||||
exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
|
||||
p.clonedExes = append(p.clonedExes, safeExe)
|
||||
}
|
||||
|
||||
cmd := exec.Command(exePath, "init")
|
||||
@@ -503,6 +573,12 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
|
||||
"_LIBCONTAINER_STATEDIR="+c.root,
|
||||
)
|
||||
|
||||
if dmzExe != nil {
|
||||
cmd.ExtraFiles = append(cmd.ExtraFiles, dmzExe)
|
||||
cmd.Env = append(cmd.Env,
|
||||
"_LIBCONTAINER_DMZEXEFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
|
||||
}
|
||||
|
||||
cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
|
||||
cmd.Env = append(cmd.Env,
|
||||
"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
|
||||
|
1
libcontainer/dmz/.gitignore
vendored
Normal file
1
libcontainer/dmz/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/runc-dmz
|
6
libcontainer/dmz/Makefile
Normal file
6
libcontainer/dmz/Makefile
Normal file
@@ -0,0 +1,6 @@
|
||||
# Get CC values for cross-compilation.
|
||||
include ../../cc_platform.mk
|
||||
|
||||
# Compile _dmz.c into the statically linked runc-dmz helper using the
# (possibly cross-compiling) $(CC) and $(STRIP) selected by cc_platform.mk,
# then strip the result to keep the embedded binary small.
runc-dmz: _dmz.c
	$(CC) $(CFLAGS) -static -o $@ $^
	$(STRIP) -gs $@
|
10
libcontainer/dmz/_dmz.c
Normal file
10
libcontainer/dmz/_dmz.c
Normal file
@@ -0,0 +1,10 @@
|
||||
#include <unistd.h>
|
||||
|
||||
extern char **environ;
|
||||
|
||||
/*
 * Minimal trampoline: re-execute the program named by argv[0], passing the
 * argument vector and the current environment through unchanged.
 *
 * Returns 127 if no argv[0] was supplied; otherwise returns whatever execve()
 * returns, which only happens if the exec itself failed.
 */
int main(int argc, char **argv)
{
	/* Without an argv[0] there is nothing to execute. */
	if (argc < 1)
		return 127;
	return execve(argv[0], argv, environ);
}
|
9
libcontainer/dmz/dmz.go
Normal file
9
libcontainer/dmz/dmz.go
Normal file
@@ -0,0 +1,9 @@
|
||||
package dmz
|
||||
|
||||
import (
|
||||
"errors"
|
||||
)
|
||||
|
||||
// ErrNoDmzBinary is returned by Binary when there is no runc-dmz binary
|
||||
// embedded in the runc program.
|
||||
var ErrNoDmzBinary = errors.New("runc-dmz binary not embedded in this program")
|
1
libcontainer/dmz/dmz_fallback_linux.go
Normal file
1
libcontainer/dmz/dmz_fallback_linux.go
Normal file
@@ -0,0 +1 @@
|
||||
package dmz
|
48
libcontainer/dmz/dmz_linux.go
Normal file
48
libcontainer/dmz/dmz_linux.go
Normal file
@@ -0,0 +1,48 @@
|
||||
//go:build !runc_nodmz
|
||||
// +build !runc_nodmz
|
||||
|
||||
package dmz
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"debug/elf"
|
||||
_ "embed"
|
||||
"os"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// Try to build the runc-dmz binary. If it fails, replace it with an empty file
|
||||
// (this will trigger us to fall back to a clone of /proc/self/exe). Yeah, this
|
||||
// is a bit ugly but it makes sure that weird cross-compilation setups don't
|
||||
// break because of runc-dmz.
|
||||
//
|
||||
//go:generate sh -c "make -B runc-dmz || echo -n >runc-dmz"
|
||||
//go:embed runc-dmz
|
||||
var runcDmzBinary []byte
|
||||
|
||||
// Binary returns a cloned copy (see CloneBinary) of a very minimal C program
|
||||
// that just does an execve() of its arguments. This is used in the final
|
||||
// execution step of the container execution as an intermediate process before
|
||||
// the container process is execve'd. This allows for protection against
|
||||
// CVE-2019-5736 without requiring a complete copy of the runc binary. Each
|
||||
// call to Binary will return a new copy.
|
||||
//
|
||||
// If the runc-dmz binary is not embedded into the runc binary, Binary will
|
||||
// return ErrNoDmzBinary as the error.
|
||||
func Binary(tmpDir string) (*os.File, error) {
|
||||
rdr := bytes.NewBuffer(runcDmzBinary)
|
||||
// Verify that our embedded binary has a standard ELF header.
|
||||
if !bytes.HasPrefix(rdr.Bytes(), []byte(elf.ELFMAG)) {
|
||||
if rdr.Len() != 0 {
|
||||
logrus.Infof("misconfigured build: embedded runc-dmz binary is non-empty but is missing a proper ELF header")
|
||||
}
|
||||
return nil, ErrNoDmzBinary
|
||||
}
|
||||
// Setting RUNC_DMZ=legacy disables this dmz method.
|
||||
if os.Getenv("RUNC_DMZ") == "legacy" {
|
||||
logrus.Debugf("RUNC_DMZ=legacy set -- switching back to classic /proc/self/exe cloning")
|
||||
return nil, ErrNoDmzBinary
|
||||
}
|
||||
return CloneBinary(rdr, int64(rdr.Len()), "runc-dmz", tmpDir)
|
||||
}
|
12
libcontainer/dmz/dmz_unsupported.go
Normal file
12
libcontainer/dmz/dmz_unsupported.go
Normal file
@@ -0,0 +1,12 @@
|
||||
//go:build !linux || runc_nodmz
|
||||
// +build !linux runc_nodmz
|
||||
|
||||
package dmz
|
||||
|
||||
import (
|
||||
"os"
|
||||
)
|
||||
|
||||
// Binary is the stub used when this file's build constraints apply (non-Linux
// builds, or builds with the runc_nodmz tag). It ignores its argument and
// always returns ErrNoDmzBinary.
func Binary(_ string) (*os.File, error) {
	return nil, ErrNoDmzBinary
}
|
@@ -182,6 +182,17 @@ func startInitialization() (retErr error) {
|
||||
return err
|
||||
}
|
||||
|
||||
// Get runc-dmz fds.
|
||||
var dmzExe *os.File
|
||||
if dmzFdStr := os.Getenv("_LIBCONTAINER_DMZEXEFD"); dmzFdStr != "" {
|
||||
dmzFd, err := strconv.Atoi(dmzFdStr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to convert _LIBCONTAINER_DMZEXEFD: %w", err)
|
||||
}
|
||||
unix.CloseOnExec(dmzFd)
|
||||
dmzExe = os.NewFile(uintptr(dmzFd), "runc-dmz")
|
||||
}
|
||||
|
||||
// clear the current process's environment to clean any libcontainer
|
||||
// specific env vars.
|
||||
os.Clearenv()
|
||||
@@ -197,10 +208,10 @@ func startInitialization() (retErr error) {
|
||||
}()
|
||||
|
||||
// If init succeeds, it will not return, hence none of the defers will be called.
|
||||
return containerInit(it, pipe, consoleSocket, fifofd, logFD, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
|
||||
return containerInit(it, pipe, consoleSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
|
||||
}
|
||||
|
||||
func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error {
|
||||
func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error {
|
||||
var config *initConfig
|
||||
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
|
||||
return err
|
||||
@@ -208,6 +219,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
|
||||
if err := populateProcessEnvironment(config.Env); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
switch t {
|
||||
case initSetns:
|
||||
// mount and idmap fds must be nil in this case. We don't mount while doing runc exec.
|
||||
@@ -220,6 +232,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
|
||||
consoleSocket: consoleSocket,
|
||||
config: config,
|
||||
logFd: logFd,
|
||||
dmzExe: dmzExe,
|
||||
}
|
||||
return i.Init()
|
||||
case initStandard:
|
||||
@@ -230,6 +243,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
|
||||
config: config,
|
||||
fifoFd: fifoFd,
|
||||
logFd: logFd,
|
||||
dmzExe: dmzExe,
|
||||
mountFds: mountFds,
|
||||
}
|
||||
return i.Init()
|
||||
|
@@ -4,6 +4,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
|
||||
"github.com/opencontainers/selinux/go-selinux"
|
||||
@@ -23,6 +24,7 @@ type linuxSetnsInit struct {
|
||||
consoleSocket *os.File
|
||||
config *initConfig
|
||||
logFd int
|
||||
dmzExe *os.File
|
||||
}
|
||||
|
||||
func (l *linuxSetnsInit) getSessionRingName() string {
|
||||
@@ -85,6 +87,18 @@ func (l *linuxSetnsInit) Init() error {
|
||||
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
|
||||
return err
|
||||
}
|
||||
// Check for the arg early to make sure it exists.
|
||||
name, err := exec.LookPath(l.config.Args[0])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// exec.LookPath in Go < 1.20 might return no error for an executable
|
||||
// residing on a file system mounted with noexec flag, so perform this
|
||||
// extra check now while we can still return a proper error.
|
||||
// TODO: remove this once go < 1.20 is not supported.
|
||||
if err := eaccess(name); err != nil {
|
||||
return &os.PathError{Op: "eaccess", Path: name, Err: err}
|
||||
}
|
||||
// Set seccomp as close to execve as possible, so as few syscalls take
|
||||
// place afterward (reducing the amount of syscalls that users need to
|
||||
// enable in their seccomp profiles).
|
||||
@@ -98,10 +112,15 @@ func (l *linuxSetnsInit) Init() error {
|
||||
}
|
||||
}
|
||||
logrus.Debugf("setns_init: about to exec")
|
||||
|
||||
// Close the log pipe fd so the parent's ForwardLogs can exit.
|
||||
if err := unix.Close(l.logFd); err != nil {
|
||||
return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
|
||||
}
|
||||
|
||||
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
|
||||
if l.dmzExe != nil {
|
||||
l.config.Args[0] = name
|
||||
return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ())
|
||||
}
|
||||
return system.Exec(name, l.config.Args, os.Environ())
|
||||
}
|
||||
|
@@ -25,6 +25,7 @@ type linuxStandardInit struct {
|
||||
parentPid int
|
||||
fifoFd int
|
||||
logFd int
|
||||
dmzExe *os.File
|
||||
mountFds mountFds
|
||||
config *initConfig
|
||||
}
|
||||
@@ -262,5 +263,9 @@ func (l *linuxStandardInit) Init() error {
|
||||
return err
|
||||
}
|
||||
|
||||
return system.Exec(name, l.config.Args[0:], os.Environ())
|
||||
if l.dmzExe != nil {
|
||||
l.config.Args[0] = name
|
||||
return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ())
|
||||
}
|
||||
return system.Exec(name, l.config.Args, os.Environ())
|
||||
}
|
||||
|
94
libcontainer/system/kernelversion/kernel_linux.go
Normal file
94
libcontainer/system/kernelversion/kernel_linux.go
Normal file
@@ -0,0 +1,94 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
File copied and customized based on
|
||||
https://github.com/moby/moby/tree/v20.10.14/profiles/seccomp/kernel_linux.go
|
||||
|
||||
File copied from
|
||||
https://github.com/containerd/containerd/blob/v1.7.5/contrib/seccomp/kernelversion/kernel_linux.go
|
||||
*/
|
||||
|
||||
package kernelversion
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// KernelVersion describes a kernel release by its first two numeric
// components.
type KernelVersion struct {
	Kernel uint64 // Version of the Kernel (i.e., the "4" in "4.1.2-generic")
	Major  uint64 // Major revision of the Kernel (i.e., the "1" in "4.1.2-generic")
}

// String formats the version as "<kernel>.<major>". The zero value (both
// components 0) is rendered as the empty string.
func (k *KernelVersion) String() string {
	if k.Kernel == 0 && k.Major == 0 {
		return ""
	}
	return fmt.Sprintf("%d.%d", k.Kernel, k.Major)
}
|
||||
|
||||
var (
|
||||
currentKernelVersion *KernelVersion
|
||||
kernelVersionError error
|
||||
once sync.Once
|
||||
)
|
||||
|
||||
// getKernelVersion gets the current kernel version.
|
||||
func getKernelVersion() (*KernelVersion, error) {
|
||||
once.Do(func() {
|
||||
var uts unix.Utsname
|
||||
if err := unix.Uname(&uts); err != nil {
|
||||
return
|
||||
}
|
||||
// Remove the \x00 from the release for Atoi to parse correctly
|
||||
currentKernelVersion, kernelVersionError = parseRelease(string(uts.Release[:bytes.IndexByte(uts.Release[:], 0)]))
|
||||
})
|
||||
return currentKernelVersion, kernelVersionError
|
||||
}
|
||||
|
||||
// parseRelease parses a string and creates a KernelVersion based on it.
|
||||
func parseRelease(release string) (*KernelVersion, error) {
|
||||
var version KernelVersion
|
||||
|
||||
// We're only make sure we get the "kernel" and "major revision". Sometimes we have
|
||||
// 3.12.25-gentoo, but sometimes we just have 3.12-1-amd64.
|
||||
_, err := fmt.Sscanf(release, "%d.%d", &version.Kernel, &version.Major)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse kernel version %q: %w", release, err)
|
||||
}
|
||||
return &version, nil
|
||||
}
|
||||
|
||||
// GreaterEqualThan checks if the host's kernel version is greater than, or
|
||||
// equal to the given kernel version v. Only "kernel version" and "major revision"
|
||||
// can be specified (e.g., "3.12") and will be taken into account, which means
|
||||
// that 3.12.25-gentoo and 3.12-1-amd64 are considered equal (kernel: 3, major: 12).
|
||||
func GreaterEqualThan(minVersion KernelVersion) (bool, error) {
|
||||
kv, err := getKernelVersion()
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if kv.Kernel > minVersion.Kernel {
|
||||
return true, nil
|
||||
}
|
||||
if kv.Kernel == minVersion.Kernel && kv.Major >= minVersion.Major {
|
||||
return true, nil
|
||||
}
|
||||
return false, nil
|
||||
}
|
140
libcontainer/system/kernelversion/kernel_linux_test.go
Normal file
140
libcontainer/system/kernelversion/kernel_linux_test.go
Normal file
@@ -0,0 +1,140 @@
|
||||
/*
|
||||
Copyright The containerd Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
File copied and customized based on
|
||||
https://github.com/moby/moby/tree/v20.10.14/profiles/seccomp/kernel_linux_test.go
|
||||
*/
|
||||
|
||||
package kernelversion
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestGetKernelVersion(t *testing.T) {
|
||||
version, err := getKernelVersion()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if version == nil {
|
||||
t.Fatal("version is nil")
|
||||
}
|
||||
if version.Kernel == 0 {
|
||||
t.Fatal("no kernel version")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseRelease(t *testing.T) {
|
||||
tests := []struct {
|
||||
in string
|
||||
out KernelVersion
|
||||
expectedErr error
|
||||
}{
|
||||
{in: "3.8", out: KernelVersion{Kernel: 3, Major: 8}},
|
||||
{in: "3.8.0", out: KernelVersion{Kernel: 3, Major: 8}},
|
||||
{in: "3.8.0-19-generic", out: KernelVersion{Kernel: 3, Major: 8}},
|
||||
{in: "3.4.54.longterm-1", out: KernelVersion{Kernel: 3, Major: 4}},
|
||||
{in: "3.10.0-862.2.3.el7.x86_64", out: KernelVersion{Kernel: 3, Major: 10}},
|
||||
{in: "3.12.8tag", out: KernelVersion{Kernel: 3, Major: 12}},
|
||||
{in: "3.12-1-amd64", out: KernelVersion{Kernel: 3, Major: 12}},
|
||||
{in: "3.12foobar", out: KernelVersion{Kernel: 3, Major: 12}},
|
||||
{in: "99.999.999-19-generic", out: KernelVersion{Kernel: 99, Major: 999}},
|
||||
{in: "", expectedErr: fmt.Errorf(`failed to parse kernel version "": EOF`)},
|
||||
{in: "3", expectedErr: fmt.Errorf(`failed to parse kernel version "3": unexpected EOF`)},
|
||||
{in: "3.", expectedErr: fmt.Errorf(`failed to parse kernel version "3.": EOF`)},
|
||||
{in: "3a", expectedErr: fmt.Errorf(`failed to parse kernel version "3a": input does not match format`)},
|
||||
{in: "3.a", expectedErr: fmt.Errorf(`failed to parse kernel version "3.a": expected integer`)},
|
||||
{in: "a", expectedErr: fmt.Errorf(`failed to parse kernel version "a": expected integer`)},
|
||||
{in: "a.a", expectedErr: fmt.Errorf(`failed to parse kernel version "a.a": expected integer`)},
|
||||
{in: "a.a.a-a", expectedErr: fmt.Errorf(`failed to parse kernel version "a.a.a-a": expected integer`)},
|
||||
{in: "-3", expectedErr: fmt.Errorf(`failed to parse kernel version "-3": expected integer`)},
|
||||
{in: "-3.", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.": expected integer`)},
|
||||
{in: "-3.8", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.8": expected integer`)},
|
||||
{in: "-3.-8", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.-8": expected integer`)},
|
||||
{in: "3.-8", expectedErr: fmt.Errorf(`failed to parse kernel version "3.-8": expected integer`)},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
tc := tc
|
||||
t.Run(tc.in, func(t *testing.T) {
|
||||
version, err := parseRelease(tc.in)
|
||||
if tc.expectedErr != nil {
|
||||
if err == nil {
|
||||
t.Fatal("expected an error")
|
||||
}
|
||||
if err.Error() != tc.expectedErr.Error() {
|
||||
t.Fatalf("expected: %s, got: %s", tc.expectedErr, err)
|
||||
}
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatal("unexpected error:", err)
|
||||
}
|
||||
if version == nil {
|
||||
t.Fatal("version is nil")
|
||||
}
|
||||
if version.Kernel != tc.out.Kernel || version.Major != tc.out.Major {
|
||||
t.Fatalf("expected: %d.%d, got: %d.%d", tc.out.Kernel, tc.out.Major, version.Kernel, version.Major)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGreaterEqualThan(t *testing.T) {
|
||||
// Get the current kernel version, so that we can make test relative to that
|
||||
v, err := getKernelVersion()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
doc string
|
||||
in KernelVersion
|
||||
expected bool
|
||||
}{
|
||||
{
|
||||
doc: "same version",
|
||||
in: KernelVersion{v.Kernel, v.Major},
|
||||
expected: true,
|
||||
},
|
||||
{
|
||||
doc: "kernel minus one",
|
||||
in: KernelVersion{v.Kernel - 1, v.Major},
|
||||
expected: true,
|
||||
},
|
||||
{
|
||||
doc: "kernel plus one",
|
||||
in: KernelVersion{v.Kernel + 1, v.Major},
|
||||
expected: false,
|
||||
},
|
||||
{
|
||||
doc: "major plus one",
|
||||
in: KernelVersion{v.Kernel, v.Major + 1},
|
||||
expected: false,
|
||||
},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
tc := tc
|
||||
t.Run(tc.doc+": "+tc.in.String(), func(t *testing.T) {
|
||||
ok, err := GreaterEqualThan(tc.in)
|
||||
if err != nil {
|
||||
t.Fatal("unexpected error:", err)
|
||||
}
|
||||
if ok != tc.expected {
|
||||
t.Fatalf("expected: %v, got: %v", tc.expected, ok)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
@@ -7,6 +7,8 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
@@ -38,7 +40,6 @@ func Execv(cmd string, args []string, env []string) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return Exec(name, args, env)
|
||||
}
|
||||
|
||||
@@ -51,6 +52,49 @@ func Exec(cmd string, args []string, env []string) error {
|
||||
}
|
||||
}
|
||||
|
||||
func execveat(fd uintptr, pathname string, args []string, env []string, flags int) error {
|
||||
pathnamep, err := syscall.BytePtrFromString(pathname)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
argvp, err := syscall.SlicePtrFromStrings(args)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
envp, err := syscall.SlicePtrFromStrings(env)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, _, errno := syscall.Syscall6(
|
||||
unix.SYS_EXECVEAT,
|
||||
fd,
|
||||
uintptr(unsafe.Pointer(pathnamep)),
|
||||
uintptr(unsafe.Pointer(&argvp[0])),
|
||||
uintptr(unsafe.Pointer(&envp[0])),
|
||||
uintptr(flags),
|
||||
0,
|
||||
)
|
||||
return errno
|
||||
}
|
||||
|
||||
func Fexecve(fd uintptr, args []string, env []string) error {
|
||||
var err error
|
||||
for {
|
||||
err = execveat(fd, "", args, env, unix.AT_EMPTY_PATH)
|
||||
if err != unix.EINTR { // nolint:errorlint // unix errors are bare
|
||||
break
|
||||
}
|
||||
}
|
||||
if err == unix.ENOSYS { // nolint:errorlint // unix errors are bare
|
||||
// Fallback to classic /proc/self/fd/... exec.
|
||||
return Exec("/proc/self/fd/"+strconv.Itoa(int(fd)), args, env)
|
||||
}
|
||||
return os.NewSyscallError("execveat", err)
|
||||
}
|
||||
|
||||
func SetParentDeathSignal(sig uintptr) error {
|
||||
if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
|
||||
return err
|
||||
|
Reference in New Issue
Block a user