runc-dmz: reduce memfd binary cloning cost with small C binary

The idea is to remove the need for cloning the entire runc binary by
replacing the final execve() call of the container process with an
execve() call to a clone of a small C binary which just does an execve()
of its arguments.

This provides similar protection against CVE-2019-5736 but without
requiring a >10MB binary copy for each "runc init". When compiled with
musl, runc-dmz is 13kB (though unfortunately with glibc, it is 1.1MB
which is still quite large).

It should be noted that there is still a window where the container
processes could get access to the host runc binary, but because we set
ourselves as non-dumpable the container would need CAP_SYS_PTRACE (which
is not enabled by default in Docker) in order to get around the
proc_fd_access_allowed() checks. In addition, since Linux 4.10[1] the
kernel blocks access entirely for user namespaced containers in this
scenario. In the remaining cases (a container that could have
CAP_SYS_PTRACE and is not covered by this kernel protection) we cannot
use runc-dmz, but most containers won't have this issue.

This new runc-dmz binary can be opted out of at compile time by setting
the "runc_nodmz" buildtag, and at runtime by setting the RUNC_DMZ=legacy
environment variable. In both cases, runc will fall back to the classic
/proc/self/exe-based cloning trick. If /proc/self/exe is already a
sealed memfd (namely if the user is using contrib/cmd/memfd-bind to
create a persistent sealed memfd for runc), neither runc-dmz nor
/proc/self/exe cloning will be used because they are not necessary.

[1]: bfedb58925

Co-authored-by: lifubang <lifubang@acmcoder.com>
Signed-off-by: lifubang <lifubang@acmcoder.com>
[cyphar: address various review nits]
[cyphar: fix runc-dmz cross-compilation]
[cyphar: embed runc-dmz into runc binary and clone in Go code]
[cyphar: make runc-dmz optional, with fallback to /proc/self/exe cloning]
[cyphar: do not use runc-dmz when the container has certain privs]
Co-authored-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
This commit is contained in:
lifubang
2023-08-15 17:00:22 +08:00
committed by Aleksa Sarai
parent e089db3b4a
commit dac4171746
20 changed files with 608 additions and 25 deletions

View File

@@ -28,6 +28,7 @@ jobs:
rootless: ["rootless", ""] rootless: ["rootless", ""]
race: ["-race", ""] race: ["-race", ""]
criu: ["", "criu-dev"] criu: ["", "criu-dev"]
dmz: ["", "runc_nodmz"]
exclude: exclude:
- criu: criu-dev - criu: criu-dev
rootless: rootless rootless: rootless
@@ -35,6 +36,10 @@ jobs:
go-version: 1.20.x go-version: 1.20.x
- criu: criu-dev - criu: criu-dev
race: -race race: -race
- dmz: runc_nodmz
criu: criu-dev
- dmz: runc_nodmz
os: ubuntu-20.04
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
steps: steps:
@@ -71,6 +76,8 @@ jobs:
go-version: ${{ matrix.go-version }} go-version: ${{ matrix.go-version }}
- name: build - name: build
env:
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all
- name: install bats - name: install bats
@@ -80,6 +87,8 @@ jobs:
- name: unit test - name: unit test
if: matrix.rootless != 'rootless' if: matrix.rootless != 'rootless'
env:
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest
- name: add rootless user - name: add rootless user
@@ -113,8 +122,12 @@ jobs:
# However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff. # However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff.
# We are not interested in providing official support for i386. # We are not interested in providing official support for i386.
cross-i386: cross-i386:
runs-on: ubuntu-22.04
timeout-minutes: 15 timeout-minutes: 15
strategy:
fail-fast: false
matrix:
dmz: ["", "runc_nodmz"]
runs-on: ubuntu-22.04
steps: steps:
@@ -136,4 +149,6 @@ jobs:
go-version: 1.x # Latest stable go-version: 1.x # Latest stable
- name: unit test - name: unit test
env:
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest

View File

@@ -7,6 +7,7 @@
run: run:
build-tags: build-tags:
- seccomp - seccomp
- runc_nodmz
linters: linters:
disable-all: true disable-all: true

View File

@@ -3,6 +3,7 @@
run: run:
build-tags: build-tags:
- seccomp - seccomp
- runc_nodmz
linters: linters:
enable: enable:

View File

@@ -1,6 +1,11 @@
SHELL = /bin/bash
CONTAINER_ENGINE := docker CONTAINER_ENGINE := docker
GO ?= go GO ?= go
# Get CC values for cross-compilation.
include cc_platform.mk
PREFIX ?= /usr/local PREFIX ?= /usr/local
BINDIR := $(PREFIX)/sbin BINDIR := $(PREFIX)/sbin
MANDIR := $(PREFIX)/share/man MANDIR := $(PREFIX)/share/man
@@ -10,6 +15,7 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN)) RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
PROJECT := github.com/opencontainers/runc PROJECT := github.com/opencontainers/runc
BUILDTAGS ?= seccomp urfave_cli_no_docs BUILDTAGS ?= seccomp urfave_cli_no_docs
BUILDTAGS += $(EXTRA_BUILDTAGS)
COMMIT ?= $(shell git describe --dirty --long --always) COMMIT ?= $(shell git describe --dirty --long --always)
VERSION := $(shell cat ./VERSION) VERSION := $(shell cat ./VERSION)
@@ -57,16 +63,23 @@ endif
.DEFAULT: runc .DEFAULT: runc
runc: runc: runc-dmz
$(GO_BUILD) -o runc . $(GO_BUILD) -o runc .
make verify-dmz-arch
all: runc recvtty sd-helper seccompagent fs-idmap all: runc recvtty sd-helper seccompagent fs-idmap
recvtty sd-helper seccompagent fs-idmap: recvtty sd-helper seccompagent fs-idmap:
$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@ $(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@
static: static: runc-dmz
$(GO_BUILD_STATIC) -o runc . $(GO_BUILD_STATIC) -o runc .
make verify-dmz-arch
.PHONY: runc-dmz
runc-dmz:
rm -f libcontainer/dmz/runc-dmz
$(GO) generate -tags "$(BUILDTAGS)" ./libcontainer/dmz
releaseall: RELEASE_ARGS := "-a 386 -a amd64 -a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x" releaseall: RELEASE_ARGS := "-a 386 -a amd64 -a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
releaseall: release releaseall: release
@@ -147,12 +160,12 @@ install-man: man
install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8 install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8
clean: clean:
rm -f runc runc-* rm -f runc runc-* libcontainer/dmz/runc-dmz
rm -f contrib/cmd/recvtty/recvtty rm -f contrib/cmd/recvtty/recvtty
rm -f contrib/cmd/sd-helper/sd-helper rm -f contrib/cmd/sd-helper/sd-helper
rm -f contrib/cmd/seccompagent/seccompagent rm -f contrib/cmd/seccompagent/seccompagent
rm -f contrib/cmd/fs-idmap/fs-idmap rm -f contrib/cmd/fs-idmap/fs-idmap
rm -rf release sudo rm -rf release
rm -rf man/man8 rm -rf man/man8
cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/') cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
@@ -188,6 +201,18 @@ verify-dependencies: vendor
@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \ @test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \ || (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
&& echo "all vendor files are up to date." && echo "all vendor files are up to date."
# verify-dmz-arch checks that the runc-dmz binary was built for the same
# architecture as the runc binary, by comparing the "Machine" and "Flags"
# fields that "readelf -h" reports for each of them.
#
# The check is skipped (successfully) when libcontainer/dmz/runc-dmz is
# missing or empty ("test -s" fails). Both readelf outputs are echoed first so
# that a mismatch is easy to read in the build log; the diff of the two decides
# the result. The process substitutions used by the diff need bash.
verify-dmz-arch:
@test -s libcontainer/dmz/runc-dmz || exit 0; \
set -Eeuo pipefail; \
export LC_ALL=C; \
echo "readelf -h runc"; \
readelf -h runc | grep -E "(Machine|Flags):"; \
echo "readelf -h libcontainer/dmz/runc-dmz"; \
readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):"; \
diff -u \
<(readelf -h runc | grep -E "(Machine|Flags):") \
<(readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):") \
&& echo "runc-dmz architecture matches runc binary."
validate-keyring: validate-keyring:
script/keyring_validate.sh script/keyring_validate.sh
@@ -197,4 +222,4 @@ validate-keyring:
test localtest unittest localunittest integration localintegration \ test localtest unittest localunittest integration localintegration \
rootlessintegration localrootlessintegration shell install install-bash \ rootlessintegration localrootlessintegration shell install install-bash \
install-man clean cfmt shfmt localshfmt shellcheck \ install-man clean cfmt shfmt localshfmt shellcheck \
vendor verify-changelog verify-dependencies validate-keyring vendor verify-changelog verify-dependencies verify-dmz-arch validate-keyring

View File

@@ -65,9 +65,10 @@ e.g. to disable seccomp:
make BUILDTAGS="" make BUILDTAGS=""
``` ```
| Build Tag | Feature | Enabled by default | Dependency | | Build Tag | Feature | Enabled by Default | Dependencies |
|-----------|------------------------------------|--------------------|------------| |---------------|---------------------------------------|--------------------|---------------------|
| seccomp | Syscall filtering | yes | libseccomp | | `seccomp` | Syscall filtering using `libseccomp`. | yes | `libseccomp` |
| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increase memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||
The following build tags were used earlier, but are now obsoleted: The following build tags were used earlier, but are now obsoleted:
- **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored) - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)

61
cc_platform.mk Normal file
View File

@@ -0,0 +1,61 @@
# NOTE: Make sure you keep this file in sync with scripts/lib.sh.
#
# This fragment derives the C toolchain (CC, STRIP and, for 386, CFLAGS) that
# matches the Go target architecture GOARCH, so that C code is built for the
# same architecture as the Go code. It exports CC and STRIP.

# Default to the Go toolchain's own idea of the target architecture.
GO ?= go
GOARCH ?= $(shell $(GO) env GOARCH)

# Pick the platform part of the toolchain triplet; SUSE-like distributions
# (detected through ID_LIKE in /etc/os-release) use a different one.
ifneq ($(shell grep -i "ID_LIKE=.*suse" /etc/os-release),)
# openSUSE has a custom PLATFORM
PLATFORM ?= suse-linux
IS_SUSE := 1
else
PLATFORM ?= linux-gnu
endif

# Select the toolchain prefix (HOST). The first branch compares GOARCH with
# what "go env GOARCH" reports when GOARCH is cleared from the environment;
# if they are equal, no prefix is used.
ifeq ($(GOARCH),$(shell GOARCH= $(GO) env GOARCH))
# use the native CC and STRIP
HOST :=
else ifeq ($(GOARCH),386)
# Always use the 64-bit compiler to build the 386 binary, which works for
# the more common cross-build method for x86 (namely, the equivalent of
# dpkg --add-architecture).
ifdef IS_SUSE
# There is no x86_64-suse-linux-gcc, so use the native one.
HOST :=
CPU_TYPE := i586
else
HOST := x86_64-$(PLATFORM)-
CPU_TYPE := i686
endif
# Prepend the 32-bit flags while keeping any CFLAGS that were already set.
CFLAGS := -m32 -march=$(CPU_TYPE) $(CFLAGS)
else ifeq ($(GOARCH),amd64)
ifdef IS_SUSE
# There is no x86_64-suse-linux-gcc, so use the native one.
HOST :=
else
HOST := x86_64-$(PLATFORM)-
endif
else ifeq ($(GOARCH),arm64)
HOST := aarch64-$(PLATFORM)-
else ifeq ($(GOARCH),arm)
# HOST already configured by release_build.sh in this case.
else ifeq ($(GOARCH),armel)
HOST := arm-$(PLATFORM)eabi-
else ifeq ($(GOARCH),armhf)
HOST := arm-$(PLATFORM)eabihf-
else ifeq ($(GOARCH),ppc64le)
HOST := powerpc64le-$(PLATFORM)-
else ifeq ($(GOARCH),riscv64)
HOST := riscv64-$(PLATFORM)-
else ifeq ($(GOARCH),s390x)
HOST := s390x-$(PLATFORM)-
else
# Any other architecture aborts the build immediately.
$(error Unsupported GOARCH $(GOARCH))
endif

# A CC given on the command line or in the environment is left untouched.
ifeq ($(origin CC),$(filter $(origin CC),undefined default))
# Override CC if it's undefined or just the default value set by Make.
CC := $(HOST)gcc
export CC
endif
# STRIP uses the same prefix as CC unless it was already set.
STRIP ?= $(HOST)strip
export STRIP

View File

@@ -27,6 +27,7 @@ import (
"github.com/opencontainers/runc/libcontainer/dmz" "github.com/opencontainers/runc/libcontainer/dmz"
"github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/system/kernelversion"
"github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runc/libcontainer/utils"
) )
@@ -444,6 +445,48 @@ func (c *Container) includeExecFifo(cmd *exec.Cmd) error {
return nil return nil
} }
// slicesContains reports whether needle is equal to at least one element of
// slice. A nil or empty slice never contains anything.
//
// No longer needed in Go 1.21.
func slicesContains[S ~[]E, E comparable](slice S, needle E) bool {
	for idx := 0; idx < len(slice); idx++ {
		if slice[idx] == needle {
			return true
		}
	}
	return false
}
// isDmzBinarySafe reports whether the container configuration c allows the
// final exec to go through the small runc-dmz binary instead of a full clone
// of the runc binary.
//
// "runc init" is made non-dumpable in nsexec, so the only way a container
// process could race against it and get past the ptrace_may_access() checks
// is by having CAP_SYS_PTRACE. runc-dmz is therefore considered safe when:
//
//   - the config has no capability sets at all, or CAP_SYS_PTRACE is in none
//     of the bounding, inheritable and ambient sets; or
//   - the kernel is known to be Linux 4.10 or newer and the container uses a
//     user namespace.
//
// Anything else (including a failure to determine the kernel version) is
// treated as unsafe.
func isDmzBinarySafe(c *configs.Config) bool {
	const ptraceCap = "CAP_SYS_PTRACE"

	caps := c.Capabilities
	if caps == nil {
		return true
	}
	// Luckily, most containers do not have this capability.
	couldPtrace := slicesContains(caps.Bounding, ptraceCap) ||
		slicesContains(caps.Inheritable, ptraceCap) ||
		slicesContains(caps.Ambient, ptraceCap)
	if !couldPtrace {
		return true
	}

	// Since Linux 4.10 (see bfedb589252c0) user namespaced containers cannot
	// access /proc/$pid/exe of runc after it joins the namespace (until it
	// does an exec), regardless of the capability set. The change has been
	// backported to some distribution kernels as well, but that cannot be
	// detected cheaply, so only the version number is trusted here.
	minVersion := kernelversion.KernelVersion{Kernel: 4, Major: 10}
	newEnough, err := kernelversion.GreaterEqualThan(minVersion)
	if err != nil || !newEnough {
		// Assume it's unsafe.
		return false
	}
	return c.Namespaces.Contains(configs.NEWUSER)
}
func (c *Container) newParentProcess(p *Process) (parentProcess, error) { func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
parentInitPipe, childInitPipe, err := utils.NewSockPair("init") parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
if err != nil { if err != nil {
@@ -457,21 +500,42 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
} }
logFilePair := filePair{parentLogPipe, childLogPipe} logFilePair := filePair{parentLogPipe, childLogPipe}
// Make sure we use a new safe copy of /proc/self/exe each time this is // Make sure we use a new safe copy of /proc/self/exe or the runc-dmz
// called, to make sure that if a container manages to overwrite the file // binary each time this is called, to make sure that if a container
// it cannot affect other containers on the system. For runc, this code // manages to overwrite the file it cannot affect other containers on the
// will only ever be called once, but libcontainer users might call this // system. For runc, this code will only ever be called once, but
// more than once. // libcontainer users might call this more than once.
p.closeClonedExes() p.closeClonedExes()
var ( var (
exePath string exePath string
safeExe *os.File // only one of dmzExe or safeExe are used at a time
dmzExe, safeExe *os.File
) )
if dmz.IsSelfExeCloned() { if dmz.IsSelfExeCloned() {
// /proc/self/exe is already a cloned binary -- no need to do anything // /proc/self/exe is already a cloned binary -- no need to do anything
logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!") logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
exePath = "/proc/self/exe" exePath = "/proc/self/exe"
} else { } else {
var err error
if isDmzBinarySafe(c.config) {
dmzExe, err = dmz.Binary(c.root)
if err == nil {
// We can use our own executable without cloning if we are using
// runc-dmz.
exePath = "/proc/self/exe"
p.clonedExes = append(p.clonedExes, dmzExe)
} else if errors.Is(err, dmz.ErrNoDmzBinary) {
logrus.Debug("runc-dmz binary not embedded in runc binary, falling back to /proc/self/exe clone")
} else if err != nil {
return nil, fmt.Errorf("failed to create runc-dmz binary clone: %w", err)
}
} else {
// If the configuration makes it unsafe to use runc-dmz, pretend we
// don't have it embedded so we do /proc/self/exe cloning.
logrus.Debug("container configuration unsafe for runc-dmz, falling back to /proc/self/exe clone")
err = dmz.ErrNoDmzBinary
}
if errors.Is(err, dmz.ErrNoDmzBinary) {
safeExe, err = dmz.CloneSelfExe(c.root) safeExe, err = dmz.CloneSelfExe(c.root)
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err) return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
@@ -479,6 +543,12 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd())) exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
p.clonedExes = append(p.clonedExes, safeExe) p.clonedExes = append(p.clonedExes, safeExe)
} }
// Just to make sure we don't run without protection.
if dmzExe == nil && safeExe == nil {
// This should never happen.
return nil, fmt.Errorf("[internal error] attempted to spawn a container with no /proc/self/exe protection")
}
}
cmd := exec.Command(exePath, "init") cmd := exec.Command(exePath, "init")
cmd.Args[0] = os.Args[0] cmd.Args[0] = os.Args[0]
@@ -503,6 +573,12 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
"_LIBCONTAINER_STATEDIR="+c.root, "_LIBCONTAINER_STATEDIR="+c.root,
) )
if dmzExe != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, dmzExe)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_DMZEXEFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
}
cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe) cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
cmd.Env = append(cmd.Env, cmd.Env = append(cmd.Env,
"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) "_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))

1
libcontainer/dmz/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/runc-dmz

View File

@@ -0,0 +1,6 @@
# Get CC values for cross-compilation.
include ../../cc_platform.mk
# Build runc-dmz from _dmz.c as a statically linked executable using the CC
# and CFLAGS provided by cc_platform.mk, then strip the result with $(STRIP).
runc-dmz: _dmz.c
$(CC) $(CFLAGS) -static -o $@ $^
$(STRIP) -gs $@

10
libcontainer/dmz/_dmz.c Normal file
View File

@@ -0,0 +1,10 @@
#include <unistd.h>

extern char **environ;

/*
 * runc-dmz: replace this process with the program named by argv[0], handing
 * over the argument vector and the environment unchanged.
 *
 * Returns 127 when there is no argv[0] to execute; otherwise returns whatever
 * execve() returns, which only happens if the exec itself failed.
 */
int main(int argc, char **argv)
{
	if (argc >= 1)
		return execve(argv[0], argv, environ);
	return 127;
}

9
libcontainer/dmz/dmz.go Normal file
View File

@@ -0,0 +1,9 @@
package dmz
import (
"errors"
)
// ErrNoDmzBinary is returned by Binary when there is no runc-dmz binary
// embedded in the runc program.
var ErrNoDmzBinary = errors.New("runc-dmz binary not embedded in this program")

View File

@@ -0,0 +1 @@
package dmz

View File

@@ -0,0 +1,48 @@
//go:build !runc_nodmz
// +build !runc_nodmz
package dmz
import (
"bytes"
"debug/elf"
_ "embed"
"os"
"github.com/sirupsen/logrus"
)
// Try to build the runc-dmz binary. If it fails, replace it with an empty file
// (this will trigger us to fall back to a clone of /proc/self/exe). Yeah, this
// is a bit ugly but it makes sure that weird cross-compilation setups don't
// break because of runc-dmz.
//
//go:generate sh -c "make -B runc-dmz || echo -n >runc-dmz"
//go:embed runc-dmz
var runcDmzBinary []byte
// Binary produces a fresh clone (see CloneBinary) of the embedded runc-dmz
// program, a very small C binary whose only job is to execve() its own
// arguments. It sits between "runc init" and the container process as the
// final exec step, which gives protection against CVE-2019-5736 without
// having to copy the whole runc binary. Every call yields a new copy; tmpDir
// is passed through to CloneBinary.
//
// ErrNoDmzBinary is returned when no usable runc-dmz binary is embedded in
// this program (the embedded data is empty or does not start with the ELF
// magic), or when the RUNC_DMZ environment variable is set to "legacy".
func Binary(tmpDir string) (*os.File, error) {
	// Verify that our embedded binary has a standard ELF header. An empty
	// payload is the expected "not built" case and stays quiet; anything else
	// without the ELF magic points at a broken build.
	if !bytes.HasPrefix(runcDmzBinary, []byte(elf.ELFMAG)) {
		if len(runcDmzBinary) != 0 {
			logrus.Infof("misconfigured build: embedded runc-dmz binary is non-empty but is missing a proper ELF header")
		}
		return nil, ErrNoDmzBinary
	}

	// Setting RUNC_DMZ=legacy disables this dmz method.
	if mode := os.Getenv("RUNC_DMZ"); mode == "legacy" {
		logrus.Debugf("RUNC_DMZ=legacy set -- switching back to classic /proc/self/exe cloning")
		return nil, ErrNoDmzBinary
	}

	dmzData := bytes.NewBuffer(runcDmzBinary)
	return CloneBinary(dmzData, int64(dmzData.Len()), "runc-dmz", tmpDir)
}

View File

@@ -0,0 +1,12 @@
//go:build !linux || runc_nodmz
// +build !linux runc_nodmz
package dmz
import (
"os"
)
// Binary is the variant used when runc-dmz support is not compiled in (per
// this file's build constraints: on non-Linux platforms, or when the
// runc_nodmz build tag is set). It ignores its argument and always returns
// ErrNoDmzBinary.
func Binary(_ string) (*os.File, error) {
return nil, ErrNoDmzBinary
}

View File

@@ -182,6 +182,17 @@ func startInitialization() (retErr error) {
return err return err
} }
// Get runc-dmz fds.
var dmzExe *os.File
if dmzFdStr := os.Getenv("_LIBCONTAINER_DMZEXEFD"); dmzFdStr != "" {
dmzFd, err := strconv.Atoi(dmzFdStr)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_DMZEXEFD: %w", err)
}
unix.CloseOnExec(dmzFd)
dmzExe = os.NewFile(uintptr(dmzFd), "runc-dmz")
}
// clear the current process's environment to clean any libcontainer // clear the current process's environment to clean any libcontainer
// specific env vars. // specific env vars.
os.Clearenv() os.Clearenv()
@@ -197,10 +208,10 @@ func startInitialization() (retErr error) {
}() }()
// If init succeeds, it will not return, hence none of the defers will be called. // If init succeeds, it will not return, hence none of the defers will be called.
return containerInit(it, pipe, consoleSocket, fifofd, logFD, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) return containerInit(it, pipe, consoleSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
} }
func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error { func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error {
var config *initConfig var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil { if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return err return err
@@ -208,6 +219,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
if err := populateProcessEnvironment(config.Env); err != nil { if err := populateProcessEnvironment(config.Env); err != nil {
return err return err
} }
switch t { switch t {
case initSetns: case initSetns:
// mount and idmap fds must be nil in this case. We don't mount while doing runc exec. // mount and idmap fds must be nil in this case. We don't mount while doing runc exec.
@@ -220,6 +232,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
consoleSocket: consoleSocket, consoleSocket: consoleSocket,
config: config, config: config,
logFd: logFd, logFd: logFd,
dmzExe: dmzExe,
} }
return i.Init() return i.Init()
case initStandard: case initStandard:
@@ -230,6 +243,7 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
config: config, config: config,
fifoFd: fifoFd, fifoFd: fifoFd,
logFd: logFd, logFd: logFd,
dmzExe: dmzExe,
mountFds: mountFds, mountFds: mountFds,
} }
return i.Init() return i.Init()

View File

@@ -4,6 +4,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"os" "os"
"os/exec"
"strconv" "strconv"
"github.com/opencontainers/selinux/go-selinux" "github.com/opencontainers/selinux/go-selinux"
@@ -23,6 +24,7 @@ type linuxSetnsInit struct {
consoleSocket *os.File consoleSocket *os.File
config *initConfig config *initConfig
logFd int logFd int
dmzExe *os.File
} }
func (l *linuxSetnsInit) getSessionRingName() string { func (l *linuxSetnsInit) getSessionRingName() string {
@@ -85,6 +87,18 @@ func (l *linuxSetnsInit) Init() error {
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err return err
} }
// Check for the arg early to make sure it exists.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// exec.LookPath in Go < 1.20 might return no error for an executable
// residing on a file system mounted with noexec flag, so perform this
// extra check now while we can still return a proper error.
// TODO: remove this once go < 1.20 is not supported.
if err := eaccess(name); err != nil {
return &os.PathError{Op: "eaccess", Path: name, Err: err}
}
// Set seccomp as close to execve as possible, so as few syscalls take // Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to // place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles). // enable in their seccomp profiles).
@@ -98,10 +112,15 @@ func (l *linuxSetnsInit) Init() error {
} }
} }
logrus.Debugf("setns_init: about to exec") logrus.Debugf("setns_init: about to exec")
// Close the log pipe fd so the parent's ForwardLogs can exit. // Close the log pipe fd so the parent's ForwardLogs can exit.
if err := unix.Close(l.logFd); err != nil { if err := unix.Close(l.logFd); err != nil {
return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
} }
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) if l.dmzExe != nil {
l.config.Args[0] = name
return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ())
}
return system.Exec(name, l.config.Args, os.Environ())
} }

View File

@@ -25,6 +25,7 @@ type linuxStandardInit struct {
parentPid int parentPid int
fifoFd int fifoFd int
logFd int logFd int
dmzExe *os.File
mountFds mountFds mountFds mountFds
config *initConfig config *initConfig
} }
@@ -262,5 +263,9 @@ func (l *linuxStandardInit) Init() error {
return err return err
} }
return system.Exec(name, l.config.Args[0:], os.Environ()) if l.dmzExe != nil {
l.config.Args[0] = name
return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ())
}
return system.Exec(name, l.config.Args, os.Environ())
} }

View File

@@ -0,0 +1,94 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
File copied and customized based on
https://github.com/moby/moby/tree/v20.10.14/profiles/seccomp/kernel_linux.go
File copied from
https://github.com/containerd/containerd/blob/v1.7.5/contrib/seccomp/kernelversion/kernel_linux.go
*/
package kernelversion
import (
"bytes"
"fmt"
"sync"
"golang.org/x/sys/unix"
)
// KernelVersion holds information about the kernel.
type KernelVersion struct {
	Kernel uint64 // Version of the Kernel (i.e., the "4" in "4.1.2-generic")
	Major  uint64 // Major revision of the Kernel (i.e., the "1" in "4.1.2-generic")
}

// String renders the version as "<Kernel>.<Major>". A version whose fields
// are both zero is rendered as the empty string.
func (k *KernelVersion) String() string {
	if k.Kernel == 0 && k.Major == 0 {
		return ""
	}
	return fmt.Sprintf("%d.%d", k.Kernel, k.Major)
}
var (
currentKernelVersion *KernelVersion
kernelVersionError error
once sync.Once
)
// getKernelVersion gets the current kernel version.
//
// The release string reported by uname(2) is parsed the first time this is
// called; the outcome (version or error) is cached and returned by every
// later call. When the error is non-nil the returned version is nil.
func getKernelVersion() (*KernelVersion, error) {
	once.Do(func() {
		var uts unix.Utsname
		if err := unix.Uname(&uts); err != nil {
			// Record the failure. Returning without setting the error would
			// hand callers a nil version together with a nil error, and
			// GreaterEqualThan would then dereference the nil version.
			kernelVersionError = fmt.Errorf("uname: %w", err)
			return
		}
		// Cut the release at its first \x00 so that only the text is parsed.
		// If there is no terminator, use the whole buffer rather than slicing
		// with the -1 that IndexByte returns, which would panic.
		release := uts.Release[:]
		if end := bytes.IndexByte(release, 0); end >= 0 {
			release = release[:end]
		}
		currentKernelVersion, kernelVersionError = parseRelease(string(release))
	})
	return currentKernelVersion, kernelVersionError
}
// parseRelease extracts the leading "<kernel>.<major>" pair from a kernel
// release string and returns it as a KernelVersion. Whatever follows the
// major revision is ignored, so both "3.12.25-gentoo" and "3.12-1-amd64"
// yield 3.12. A string that does not start with two dot-separated unsigned
// integers results in an error.
func parseRelease(release string) (*KernelVersion, error) {
	version := new(KernelVersion)
	// Only the "kernel" and "major revision" numbers are scanned; the rest of
	// the release string has no fixed format.
	if _, err := fmt.Sscanf(release, "%d.%d", &version.Kernel, &version.Major); err != nil {
		return nil, fmt.Errorf("failed to parse kernel version %q: %w", release, err)
	}
	return version, nil
}
// GreaterEqualThan reports whether the host's kernel version is at least
// minVersion. Only the "kernel version" and "major revision" are compared
// (e.g., "3.12"), which means that 3.12.25-gentoo and 3.12-1-amd64 are
// considered equal (kernel: 3, major: 12).
//
// If the host's kernel version cannot be determined, the error is returned
// together with false.
func GreaterEqualThan(minVersion KernelVersion) (bool, error) {
	host, err := getKernelVersion()
	if err != nil {
		return false, err
	}
	// A differing kernel number settles the comparison on its own; the major
	// revision only matters as a tie-breaker.
	if host.Kernel != minVersion.Kernel {
		return host.Kernel > minVersion.Kernel, nil
	}
	return host.Major >= minVersion.Major, nil
}

View File

@@ -0,0 +1,140 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
File copied and customized based on
https://github.com/moby/moby/tree/v20.10.14/profiles/seccomp/kernel_linux_test.go
*/
package kernelversion
import (
"fmt"
"testing"
)
// TestGetKernelVersion checks that the running kernel's version can be
// determined and that it has a non-zero kernel number.
func TestGetKernelVersion(t *testing.T) {
	version, err := getKernelVersion()
	switch {
	case err != nil:
		t.Fatal(err)
	case version == nil:
		t.Fatal("version is nil")
	case version.Kernel == 0:
		t.Fatal("no kernel version")
	}
}
// TestParseRelease runs parseRelease over a table of release strings. Each
// entry either names the KernelVersion that must be produced, or the exact
// error (compared by message text) that must be returned.
func TestParseRelease(t *testing.T) {
tests := []struct {
in string
out KernelVersion
expectedErr error
}{
// Well-formed releases: everything after "<kernel>.<major>" is ignored.
{in: "3.8", out: KernelVersion{Kernel: 3, Major: 8}},
{in: "3.8.0", out: KernelVersion{Kernel: 3, Major: 8}},
{in: "3.8.0-19-generic", out: KernelVersion{Kernel: 3, Major: 8}},
{in: "3.4.54.longterm-1", out: KernelVersion{Kernel: 3, Major: 4}},
{in: "3.10.0-862.2.3.el7.x86_64", out: KernelVersion{Kernel: 3, Major: 10}},
{in: "3.12.8tag", out: KernelVersion{Kernel: 3, Major: 12}},
{in: "3.12-1-amd64", out: KernelVersion{Kernel: 3, Major: 12}},
{in: "3.12foobar", out: KernelVersion{Kernel: 3, Major: 12}},
{in: "99.999.999-19-generic", out: KernelVersion{Kernel: 99, Major: 999}},
// Malformed releases: incomplete, non-numeric, or negative components.
{in: "", expectedErr: fmt.Errorf(`failed to parse kernel version "": EOF`)},
{in: "3", expectedErr: fmt.Errorf(`failed to parse kernel version "3": unexpected EOF`)},
{in: "3.", expectedErr: fmt.Errorf(`failed to parse kernel version "3.": EOF`)},
{in: "3a", expectedErr: fmt.Errorf(`failed to parse kernel version "3a": input does not match format`)},
{in: "3.a", expectedErr: fmt.Errorf(`failed to parse kernel version "3.a": expected integer`)},
{in: "a", expectedErr: fmt.Errorf(`failed to parse kernel version "a": expected integer`)},
{in: "a.a", expectedErr: fmt.Errorf(`failed to parse kernel version "a.a": expected integer`)},
{in: "a.a.a-a", expectedErr: fmt.Errorf(`failed to parse kernel version "a.a.a-a": expected integer`)},
{in: "-3", expectedErr: fmt.Errorf(`failed to parse kernel version "-3": expected integer`)},
{in: "-3.", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.": expected integer`)},
{in: "-3.8", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.8": expected integer`)},
{in: "-3.-8", expectedErr: fmt.Errorf(`failed to parse kernel version "-3.-8": expected integer`)},
{in: "3.-8", expectedErr: fmt.Errorf(`failed to parse kernel version "3.-8": expected integer`)},
}
for _, tc := range tests {
// Copy the loop variable so the subtest closure captures this entry.
tc := tc
t.Run(tc.in, func(t *testing.T) {
version, err := parseRelease(tc.in)
if tc.expectedErr != nil {
if err == nil {
t.Fatal("expected an error")
}
// Errors are compared by their message text.
if err.Error() != tc.expectedErr.Error() {
t.Fatalf("expected: %s, got: %s", tc.expectedErr, err)
}
return
}
if err != nil {
t.Fatal("unexpected error:", err)
}
if version == nil {
t.Fatal("version is nil")
}
if version.Kernel != tc.out.Kernel || version.Major != tc.out.Major {
t.Fatalf("expected: %d.%d, got: %d.%d", tc.out.Kernel, tc.out.Major, version.Kernel, version.Major)
}
})
}
}
// TestGreaterEqualThan exercises GreaterEqualThan with versions derived from
// the kernel the test is running on: the same version and an older kernel
// must be accepted, a newer kernel or a newer major revision must not.
func TestGreaterEqualThan(t *testing.T) {
	// All cases are expressed relative to the current kernel version.
	host, err := getKernelVersion()
	if err != nil {
		t.Fatal(err)
	}

	type testCase struct {
		doc      string
		in       KernelVersion
		expected bool
	}
	cases := []testCase{
		{doc: "same version", in: KernelVersion{Kernel: host.Kernel, Major: host.Major}, expected: true},
		{doc: "kernel minus one", in: KernelVersion{Kernel: host.Kernel - 1, Major: host.Major}, expected: true},
		{doc: "kernel plus one", in: KernelVersion{Kernel: host.Kernel + 1, Major: host.Major}, expected: false},
		{doc: "major plus one", in: KernelVersion{Kernel: host.Kernel, Major: host.Major + 1}, expected: false},
	}

	for i := range cases {
		tc := cases[i]
		name := tc.doc + ": " + tc.in.String()
		t.Run(name, func(t *testing.T) {
			got, err := GreaterEqualThan(tc.in)
			if err != nil {
				t.Fatal("unexpected error:", err)
			}
			if got != tc.expected {
				t.Fatalf("expected: %v, got: %v", tc.expected, got)
			}
		})
	}
}

View File

@@ -7,6 +7,8 @@ import (
"fmt" "fmt"
"os" "os"
"os/exec" "os/exec"
"strconv"
"syscall"
"unsafe" "unsafe"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
@@ -38,7 +40,6 @@ func Execv(cmd string, args []string, env []string) error {
if err != nil { if err != nil {
return err return err
} }
return Exec(name, args, env) return Exec(name, args, env)
} }
@@ -51,6 +52,49 @@ func Exec(cmd string, args []string, env []string) error {
} }
} }
// execveat invokes the execveat system call (unix.SYS_EXECVEAT) with the
// given directory/file descriptor, pathname, argument vector, environment
// and flags.
//
// pathname, args and env are first converted to the NUL-terminated forms the
// kernel expects; a conversion failure is returned before any system call is
// made. Otherwise the errno reported by the system call is returned as-is.
// As with any exec, the call is only expected to return if it failed.
func execveat(fd uintptr, pathname string, args []string, env []string, flags int) error {
pathnamep, err := syscall.BytePtrFromString(pathname)
if err != nil {
return err
}
argvp, err := syscall.SlicePtrFromStrings(args)
if err != nil {
return err
}
envp, err := syscall.SlicePtrFromStrings(env)
if err != nil {
return err
}
// The unsafe.Pointer -> uintptr conversions are kept inside the call's
// argument list, as in the original code.
_, _, errno := syscall.Syscall6(
unix.SYS_EXECVEAT,
fd,
uintptr(unsafe.Pointer(pathnamep)),
uintptr(unsafe.Pointer(&argvp[0])),
uintptr(unsafe.Pointer(&envp[0])),
uintptr(flags),
0,
)
return errno
}
// Fexecve executes the program referred to by the open file descriptor fd,
// with the given arguments and environment, using execveat with
// AT_EMPTY_PATH and an empty pathname.
//
// The call is retried for as long as it fails with EINTR. If execveat is not
// available (ENOSYS), it falls back to executing "/proc/self/fd/<fd>" through
// Exec. Any other failure is returned wrapped as an "execveat" syscall error.
func Fexecve(fd uintptr, args []string, env []string) error {
	err := execveat(fd, "", args, env, unix.AT_EMPTY_PATH)
	for err == unix.EINTR { // nolint:errorlint // unix errors are bare
		err = execveat(fd, "", args, env, unix.AT_EMPTY_PATH)
	}
	if err != unix.ENOSYS { // nolint:errorlint // unix errors are bare
		return os.NewSyscallError("execveat", err)
	}
	// Fallback to classic /proc/self/fd/... exec.
	return Exec("/proc/self/fd/"+strconv.Itoa(int(fd)), args, env)
}
func SetParentDeathSignal(sig uintptr) error { func SetParentDeathSignal(sig uintptr) error {
if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil { if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
return err return err