mirror of
				https://github.com/opencontainers/runc.git
				synced 2025-10-31 11:06:21 +08:00 
			
		
		
		
	 dac4171746
			
		
	
	dac4171746
	
	
	
		
			
			The idea is to remove the need for cloning the entire runc binary by
replacing the final execve() call of the container process with an
execve() call to a clone of a small C binary which just does an execve()
of its arguments.
This provides similar protection against CVE-2019-5736 but without
requiring a >10MB binary copy for each "runc init". When compiled with
musl, runc-dmz is 13kB (though unfortunately with glibc, it is 1.1MB
which is still quite large).
It should be noted that there is still a window where the container
processes could get access to the host runc binary, but because we set
ourselves as non-dumpable the container would need CAP_SYS_PTRACE (which
is not enabled by default in Docker) in order to get around the
proc_fd_access_allowed() checks. In addition, since Linux 4.10[1] the
kernel blocks access entirely for user namespaced containers in this
scenario. For those cases we cannot use runc-dmz, but most containers
won't have this issue.
This new runc-dmz binary can be opted out of at compile time by setting
the "runc_nodmz" buildtag, and at runtime by setting the RUNC_DMZ=legacy
environment variable. In both cases, runc will fall back to the classic
/proc/self/exe-based cloning trick. If /proc/self/exe is already a
sealed memfd (namely if the user is using contrib/cmd/memfd-bind to
create a persistent sealed memfd for runc), neither runc-dmz nor
/proc/self/exe cloning will be used because they are not necessary.
[1]: bfedb58925
Co-authored-by: lifubang <lifubang@acmcoder.com>
Signed-off-by: lifubang <lifubang@acmcoder.com>
[cyphar: address various review nits]
[cyphar: fix runc-dmz cross-compilation]
[cyphar: embed runc-dmz into runc binary and clone in Go code]
[cyphar: make runc-dmz optional, with fallback to /proc/self/exe cloning]
[cyphar: do not use runc-dmz when the container has certain privs]
Co-authored-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
		
	
		
			
				
	
	
		
			95 lines
		
	
	
		
			2.9 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			95 lines
		
	
	
		
			2.9 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| /*
 | |
|    Copyright The containerd Authors.
 | |
| 
 | |
|    Licensed under the Apache License, Version 2.0 (the "License");
 | |
|    you may not use this file except in compliance with the License.
 | |
|    You may obtain a copy of the License at
 | |
| 
 | |
|        http://www.apache.org/licenses/LICENSE-2.0
 | |
| 
 | |
|    Unless required by applicable law or agreed to in writing, software
 | |
|    distributed under the License is distributed on an "AS IS" BASIS,
 | |
|    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
|    See the License for the specific language governing permissions and
 | |
|    limitations under the License.
 | |
| 
 | |
|    File copied and customized based on
 | |
|    https://github.com/moby/moby/tree/v20.10.14/profiles/seccomp/kernel_linux.go
 | |
| 
 | |
|    File copied from
 | |
|    https://github.com/containerd/containerd/blob/v1.7.5/contrib/seccomp/kernelversion/kernel_linux.go
 | |
| */
 | |
| 
 | |
| package kernelversion
 | |
| 
 | |
| import (
 | |
| 	"bytes"
 | |
| 	"fmt"
 | |
| 	"sync"
 | |
| 
 | |
| 	"golang.org/x/sys/unix"
 | |
| )
 | |
| 
 | |
// KernelVersion identifies a kernel release by its first two numeric
// components. Anything after them (patch level, distribution suffix) is not
// stored.
type KernelVersion struct {
	// Kernel is the version of the kernel (e.g., the "4" in "4.1.2-generic").
	Kernel uint64
	// Major is the major revision of the kernel (e.g., the "1" in "4.1.2-generic").
	Major uint64
}

// String formats the version as "<kernel>.<major>". A version whose fields are
// both zero is rendered as the empty string.
func (k *KernelVersion) String() string {
	if k.Kernel == 0 && k.Major == 0 {
		return ""
	}
	return fmt.Sprintf("%d.%d", k.Kernel, k.Major)
}
 | |
| 
 | |
| var (
 | |
| 	currentKernelVersion *KernelVersion
 | |
| 	kernelVersionError   error
 | |
| 	once                 sync.Once
 | |
| )
 | |
| 
 | |
| // getKernelVersion gets the current kernel version.
 | |
| func getKernelVersion() (*KernelVersion, error) {
 | |
| 	once.Do(func() {
 | |
| 		var uts unix.Utsname
 | |
| 		if err := unix.Uname(&uts); err != nil {
 | |
| 			return
 | |
| 		}
 | |
| 		// Remove the \x00 from the release for Atoi to parse correctly
 | |
| 		currentKernelVersion, kernelVersionError = parseRelease(string(uts.Release[:bytes.IndexByte(uts.Release[:], 0)]))
 | |
| 	})
 | |
| 	return currentKernelVersion, kernelVersionError
 | |
| }
 | |
| 
 | |
| // parseRelease parses a string and creates a KernelVersion based on it.
 | |
| func parseRelease(release string) (*KernelVersion, error) {
 | |
| 	var version KernelVersion
 | |
| 
 | |
| 	// We're only make sure we get the "kernel" and "major revision". Sometimes we have
 | |
| 	// 3.12.25-gentoo, but sometimes we just have 3.12-1-amd64.
 | |
| 	_, err := fmt.Sscanf(release, "%d.%d", &version.Kernel, &version.Major)
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("failed to parse kernel version %q: %w", release, err)
 | |
| 	}
 | |
| 	return &version, nil
 | |
| }
 | |
| 
 | |
| // GreaterEqualThan checks if the host's kernel version is greater than, or
 | |
| // equal to the given kernel version v. Only "kernel version" and "major revision"
 | |
| // can be specified (e.g., "3.12") and will be taken into account, which means
 | |
| // that 3.12.25-gentoo and 3.12-1-amd64 are considered equal (kernel: 3, major: 12).
 | |
| func GreaterEqualThan(minVersion KernelVersion) (bool, error) {
 | |
| 	kv, err := getKernelVersion()
 | |
| 	if err != nil {
 | |
| 		return false, err
 | |
| 	}
 | |
| 	if kv.Kernel > minVersion.Kernel {
 | |
| 		return true, nil
 | |
| 	}
 | |
| 	if kv.Kernel == minVersion.Kernel && kv.Major >= minVersion.Major {
 | |
| 		return true, nil
 | |
| 	}
 | |
| 	return false, nil
 | |
| }
 |