mirror of
https://github.com/opencontainers/runc.git
synced 2025-09-29 12:52:34 +08:00
nsexec: migrate memfd /proc/self/exe logic to Go code
This allows us to reduce the amount of C code in runc quite substantially, as well as removing a whole execve(2) from the nsexec path, because we no longer spawn "runc init" only to re-exec "runc init" after doing the clone. Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
This commit is contained in:
@@ -24,6 +24,7 @@ import (
|
|||||||
|
|
||||||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||||
"github.com/opencontainers/runc/libcontainer/configs"
|
"github.com/opencontainers/runc/libcontainer/configs"
|
||||||
|
"github.com/opencontainers/runc/libcontainer/dmz"
|
||||||
"github.com/opencontainers/runc/libcontainer/intelrdt"
|
"github.com/opencontainers/runc/libcontainer/intelrdt"
|
||||||
"github.com/opencontainers/runc/libcontainer/system"
|
"github.com/opencontainers/runc/libcontainer/system"
|
||||||
"github.com/opencontainers/runc/libcontainer/utils"
|
"github.com/opencontainers/runc/libcontainer/utils"
|
||||||
@@ -316,6 +317,8 @@ func (c *Container) start(process *Process) (retErr error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("unable to create new parent process: %w", err)
|
return fmt.Errorf("unable to create new parent process: %w", err)
|
||||||
}
|
}
|
||||||
|
// We do not need the cloned binaries once the process is spawned.
|
||||||
|
defer process.closeClonedExes()
|
||||||
|
|
||||||
logsDone := parent.forwardChildLogs()
|
logsDone := parent.forwardChildLogs()
|
||||||
if logsDone != nil {
|
if logsDone != nil {
|
||||||
@@ -454,24 +457,30 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
|
|||||||
}
|
}
|
||||||
logFilePair := filePair{parentLogPipe, childLogPipe}
|
logFilePair := filePair{parentLogPipe, childLogPipe}
|
||||||
|
|
||||||
cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
|
// Make sure we use a new safe copy of /proc/self/exe each time this is
|
||||||
if !p.Init {
|
// called, to make sure that if a container manages to overwrite the file
|
||||||
return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
|
// it cannot affect other containers on the system. For runc, this code
|
||||||
|
// will only ever be called once, but libcontainer users might call this
|
||||||
|
// more than once.
|
||||||
|
p.closeClonedExes()
|
||||||
|
var (
|
||||||
|
exePath string
|
||||||
|
safeExe *os.File
|
||||||
|
)
|
||||||
|
if dmz.IsSelfExeCloned() {
|
||||||
|
// /proc/self/exe is already a cloned binary -- no need to do anything
|
||||||
|
logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
|
||||||
|
exePath = "/proc/self/exe"
|
||||||
|
} else {
|
||||||
|
safeExe, err = dmz.CloneSelfExe(c.root)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
|
||||||
|
}
|
||||||
|
exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
|
||||||
|
p.clonedExes = append(p.clonedExes, safeExe)
|
||||||
}
|
}
|
||||||
|
|
||||||
// We only set up fifoFd if we're not doing a `runc exec`. The historic
|
cmd := exec.Command(exePath, "init")
|
||||||
// reason for this is that previously we would pass a dirfd that allowed
|
|
||||||
// for container rootfs escape (and not doing it in `runc exec` avoided
|
|
||||||
// that problem), but we no longer do that. However, there's no need to do
|
|
||||||
// this for `runc exec` so we just keep it this way to be safe.
|
|
||||||
if err := c.includeExecFifo(cmd); err != nil {
|
|
||||||
return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
|
|
||||||
}
|
|
||||||
return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
|
|
||||||
cmd := exec.Command("/proc/self/exe", "init")
|
|
||||||
cmd.Args[0] = os.Args[0]
|
cmd.Args[0] = os.Args[0]
|
||||||
cmd.Stdin = p.Stdin
|
cmd.Stdin = p.Stdin
|
||||||
cmd.Stdout = p.Stdout
|
cmd.Stdout = p.Stdout
|
||||||
@@ -501,13 +510,38 @@ func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLog
|
|||||||
cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel)
|
cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel)
|
||||||
}
|
}
|
||||||
|
|
||||||
// NOTE: when running a container with no PID namespace and the parent process spawning the container is
|
if safeExe != nil {
|
||||||
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
|
// Due to a Go stdlib bug, we need to add safeExe to the set of
|
||||||
// even with the parent still running.
|
// ExtraFiles otherwise it is possible for the stdlib to clobber the fd
|
||||||
|
// during forkAndExecInChild1 and replace it with some other file that
|
||||||
|
// might be malicious. This is less than ideal (because the descriptor
|
||||||
|
// will be non-O_CLOEXEC) however we have protections in "runc init" to
|
||||||
|
// stop us from leaking extra file descriptors.
|
||||||
|
//
|
||||||
|
// See <https://github.com/golang/go/issues/61751>.
|
||||||
|
cmd.ExtraFiles = append(cmd.ExtraFiles, safeExe)
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE: when running a container with no PID namespace and the parent
|
||||||
|
// process spawning the container is PID1 the pdeathsig is being
|
||||||
|
// delivered to the container's init process by the kernel for some
|
||||||
|
// reason even with the parent still running.
|
||||||
if c.config.ParentDeathSignal > 0 {
|
if c.config.ParentDeathSignal > 0 {
|
||||||
cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
|
cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
|
||||||
}
|
}
|
||||||
return cmd
|
|
||||||
|
if p.Init {
|
||||||
|
// We only set up fifoFd if we're not doing a `runc exec`. The historic
|
||||||
|
// reason for this is that previously we would pass a dirfd that allowed
|
||||||
|
// for container rootfs escape (and not doing it in `runc exec` avoided
|
||||||
|
// that problem), but we no longer do that. However, there's no need to do
|
||||||
|
// this for `runc exec` so we just keep it this way to be safe.
|
||||||
|
if err := c.includeExecFifo(cmd); err != nil {
|
||||||
|
return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
|
||||||
|
}
|
||||||
|
return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
|
||||||
|
}
|
||||||
|
return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
|
||||||
}
|
}
|
||||||
|
|
||||||
// shouldSendMountSources says whether the child process must setup bind mounts with
|
// shouldSendMountSources says whether the child process must setup bind mounts with
|
||||||
|
192
libcontainer/dmz/cloned_binary_linux.go
Normal file
192
libcontainer/dmz/cloned_binary_linux.go
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
package dmz
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"github.com/sirupsen/logrus"
|
||||||
|
"golang.org/x/sys/unix"
|
||||||
|
|
||||||
|
"github.com/opencontainers/runc/libcontainer/system"
|
||||||
|
)
|
||||||
|
|
||||||
|
type SealFunc func(**os.File) error
|
||||||
|
|
||||||
|
var (
|
||||||
|
_ SealFunc = sealMemfd
|
||||||
|
_ SealFunc = sealFile
|
||||||
|
)
|
||||||
|
|
||||||
|
const baseMemfdSeals = unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE
|
||||||
|
|
||||||
|
func sealMemfd(f **os.File) error {
|
||||||
|
if err := (*f).Chmod(0o511); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Try to set the newer memfd sealing flags, but we ignore
|
||||||
|
// errors because they are not needed and we want to continue
|
||||||
|
// to work on older kernels.
|
||||||
|
fd := (*f).Fd()
|
||||||
|
// F_SEAL_FUTURE_WRITE -- Linux 5.1
|
||||||
|
_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_FUTURE_WRITE)
|
||||||
|
// F_SEAL_EXEC -- Linux 6.3
|
||||||
|
const F_SEAL_EXEC = 0x20 //nolint:revive // this matches the unix.* name
|
||||||
|
_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, F_SEAL_EXEC)
|
||||||
|
// Apply all original memfd seals.
|
||||||
|
_, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, baseMemfdSeals)
|
||||||
|
return os.NewSyscallError("fcntl(F_ADD_SEALS)", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Memfd creates a sealable executable memfd (supported since Linux 3.17).
|
||||||
|
func Memfd(comment string) (*os.File, SealFunc, error) {
|
||||||
|
file, err := system.ExecutableMemfd("runc_cloned:"+comment, unix.MFD_ALLOW_SEALING|unix.MFD_CLOEXEC)
|
||||||
|
return file, sealMemfd, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func sealFile(f **os.File) error {
|
||||||
|
if err := (*f).Chmod(0o511); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// When sealing an O_TMPFILE-style descriptor we need to
|
||||||
|
// re-open the path as O_PATH to clear the existing write
|
||||||
|
// handle we have.
|
||||||
|
opath, err := os.OpenFile(fmt.Sprintf("/proc/self/fd/%d", (*f).Fd()), unix.O_PATH|unix.O_CLOEXEC, 0)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("reopen tmpfile: %w", err)
|
||||||
|
}
|
||||||
|
_ = (*f).Close()
|
||||||
|
*f = opath
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// otmpfile creates an open(O_TMPFILE) file in the given directory (supported
|
||||||
|
// since Linux 3.11).
|
||||||
|
func otmpfile(dir string) (*os.File, SealFunc, error) {
|
||||||
|
file, err := os.OpenFile(dir, unix.O_TMPFILE|unix.O_RDWR|unix.O_EXCL|unix.O_CLOEXEC, 0o700)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("O_TMPFILE creation failed: %w", err)
|
||||||
|
}
|
||||||
|
// Make sure we actually got an unlinked O_TMPFILE descriptor.
|
||||||
|
var stat unix.Stat_t
|
||||||
|
if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
|
||||||
|
file.Close()
|
||||||
|
return nil, nil, fmt.Errorf("cannot fstat O_TMPFILE fd: %w", err)
|
||||||
|
} else if stat.Nlink != 0 {
|
||||||
|
file.Close()
|
||||||
|
return nil, nil, errors.New("O_TMPFILE has non-zero nlink")
|
||||||
|
}
|
||||||
|
return file, sealFile, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// mktemp creates a classic unlinked file in the given directory.
|
||||||
|
func mktemp(dir string) (*os.File, SealFunc, error) {
|
||||||
|
file, err := os.CreateTemp(dir, "runc.")
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
// Unlink the file and verify it was unlinked.
|
||||||
|
if err := os.Remove(file.Name()); err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("unlinking classic tmpfile: %w", err)
|
||||||
|
}
|
||||||
|
var stat unix.Stat_t
|
||||||
|
if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("cannot fstat classic tmpfile: %w", err)
|
||||||
|
} else if stat.Nlink != 0 {
|
||||||
|
return nil, nil, fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", file.Name())
|
||||||
|
}
|
||||||
|
return file, sealFile, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func getSealableFile(comment, tmpDir string) (file *os.File, sealFn SealFunc, err error) {
|
||||||
|
// First, try an executable memfd (supported since Linux 3.17).
|
||||||
|
file, sealFn, err = Memfd(comment)
|
||||||
|
if err == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err)
|
||||||
|
// Try to fallback to O_TMPFILE (supported since Linux 3.11).
|
||||||
|
file, sealFn, err = otmpfile(tmpDir)
|
||||||
|
if err == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err)
|
||||||
|
// Finally, try a classic unlinked temporary file.
|
||||||
|
file, sealFn, err = mktemp(tmpDir)
|
||||||
|
if err == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
return nil, nil, fmt.Errorf("could not create sealable file for cloned binary: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// CloneBinary creates a "sealed" clone of a given binary, which can be used to
|
||||||
|
// thwart attempts by the container process to gain access to host binaries
|
||||||
|
// through procfs magic-link shenanigans. For more details on why this is
|
||||||
|
// necessary, see CVE-2019-5736.
|
||||||
|
func CloneBinary(src io.Reader, size int64, name, tmpDir string) (*os.File, error) {
|
||||||
|
logrus.Debugf("cloning %s binary (%d bytes)", name, size)
|
||||||
|
file, sealFn, err := getSealableFile(name, tmpDir)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
copied, err := io.Copy(file, src)
|
||||||
|
if err != nil {
|
||||||
|
file.Close()
|
||||||
|
return nil, fmt.Errorf("copy binary: %w", err)
|
||||||
|
} else if copied != size {
|
||||||
|
file.Close()
|
||||||
|
return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size)
|
||||||
|
}
|
||||||
|
if err := sealFn(&file); err != nil {
|
||||||
|
file.Close()
|
||||||
|
return nil, fmt.Errorf("could not seal fd: %w", err)
|
||||||
|
}
|
||||||
|
return file, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsCloned returns whether the given file can be guaranteed to be a safe exe.
|
||||||
|
func IsCloned(exe *os.File) bool {
|
||||||
|
seals, err := unix.FcntlInt(exe.Fd(), unix.F_GET_SEALS, 0)
|
||||||
|
if err != nil {
|
||||||
|
// /proc/self/exe is probably not a memfd
|
||||||
|
logrus.Debugf("F_GET_SEALS on %s failed: %v", exe.Name(), err)
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
// The memfd must have all of the base seals applied.
|
||||||
|
logrus.Debugf("checking %s memfd seals: 0x%x", exe.Name(), seals)
|
||||||
|
return seals&baseMemfdSeals == baseMemfdSeals
|
||||||
|
}
|
||||||
|
|
||||||
|
// CloneSelfExe makes a clone of the current process's binary (through
|
||||||
|
// /proc/self/exe). This binary can then be used for "runc init" in order to
|
||||||
|
// make sure the container process can never resolve the original runc binary.
|
||||||
|
// For more details on why this is necessary, see CVE-2019-5736.
|
||||||
|
func CloneSelfExe(tmpDir string) (*os.File, error) {
|
||||||
|
selfExe, err := os.Open("/proc/self/exe")
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("opening current binary: %w", err)
|
||||||
|
}
|
||||||
|
defer selfExe.Close()
|
||||||
|
|
||||||
|
stat, err := selfExe.Stat()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("checking /proc/self/exe size: %w", err)
|
||||||
|
}
|
||||||
|
size := stat.Size()
|
||||||
|
|
||||||
|
return CloneBinary(selfExe, size, "/proc/self/exe", tmpDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsSelfExeCloned returns whether /proc/self/exe is a cloned binary that can
|
||||||
|
// be guaranteed to be safe. This means that it must be a sealed memfd. Other
|
||||||
|
// types of clones cannot be completely verified as safe.
|
||||||
|
func IsSelfExeCloned() bool {
|
||||||
|
selfExe, err := os.Open("/proc/self/exe")
|
||||||
|
if err != nil {
|
||||||
|
logrus.Debugf("open /proc/self/exe failed: %v", err)
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
defer selfExe.Close()
|
||||||
|
return IsCloned(selfExe)
|
||||||
|
}
|
@@ -1,567 +0,0 @@
|
|||||||
// SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
|
|
||||||
/*
|
|
||||||
* Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
|
|
||||||
* Copyright (C) 2019 SUSE LLC
|
|
||||||
*
|
|
||||||
* This work is dual licensed under the following licenses. You may use,
|
|
||||||
* redistribute, and/or modify the work under the conditions of either (or
|
|
||||||
* both) licenses.
|
|
||||||
*
|
|
||||||
* === Apache-2.0 ===
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*
|
|
||||||
* === LGPL-2.1-or-later ===
|
|
||||||
*
|
|
||||||
* This library is free software; you can redistribute it and/or
|
|
||||||
* modify it under the terms of the GNU Lesser General Public
|
|
||||||
* License as published by the Free Software Foundation; either
|
|
||||||
* version 2.1 of the License, or (at your option) any later version.
|
|
||||||
*
|
|
||||||
* This library is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
* Lesser General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU Lesser General Public
|
|
||||||
* License along with this library. If not, see
|
|
||||||
* <https://www.gnu.org/licenses/>.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
#define _GNU_SOURCE
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <limits.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
#include <errno.h>
|
|
||||||
|
|
||||||
#include <sched.h>
|
|
||||||
#include <sys/types.h>
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#include <sys/statfs.h>
|
|
||||||
#include <sys/vfs.h>
|
|
||||||
#include <sys/mman.h>
|
|
||||||
#include <sys/mount.h>
|
|
||||||
#include <sys/sendfile.h>
|
|
||||||
#include <sys/socket.h>
|
|
||||||
#include <sys/syscall.h>
|
|
||||||
#include <sys/wait.h>
|
|
||||||
|
|
||||||
#include "ipc.h"
|
|
||||||
#include "log.h"
|
|
||||||
|
|
||||||
/* Use our own wrapper for memfd_create. */
|
|
||||||
#ifndef SYS_memfd_create
|
|
||||||
# ifdef __NR_memfd_create
|
|
||||||
# define SYS_memfd_create __NR_memfd_create
|
|
||||||
# else
|
|
||||||
/* These values come from <https://fedora.juszkiewicz.com.pl/syscalls.html>. */
|
|
||||||
# warning "libc is outdated -- using hard-coded SYS_memfd_create"
|
|
||||||
# if defined(__x86_64__)
|
|
||||||
# define SYS_memfd_create 319
|
|
||||||
# elif defined(__i386__)
|
|
||||||
# define SYS_memfd_create 356
|
|
||||||
# elif defined(__ia64__)
|
|
||||||
# define SYS_memfd_create 1340
|
|
||||||
# elif defined(__arm__)
|
|
||||||
# define SYS_memfd_create 385
|
|
||||||
# elif defined(__aarch64__)
|
|
||||||
# define SYS_memfd_create 279
|
|
||||||
# elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__)
|
|
||||||
# define SYS_memfd_create 360
|
|
||||||
# elif defined(__s390__) || defined(__s390x__)
|
|
||||||
# define SYS_memfd_create 350
|
|
||||||
# else
|
|
||||||
# warning "unknown architecture -- cannot hard-code SYS_memfd_create"
|
|
||||||
# endif
|
|
||||||
# endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* memfd_create(2) flags -- copied from <linux/memfd.h>. */
|
|
||||||
#ifndef MFD_CLOEXEC
|
|
||||||
# define MFD_CLOEXEC 0x0001U
|
|
||||||
# define MFD_ALLOW_SEALING 0x0002U
|
|
||||||
#endif
|
|
||||||
#ifndef MFD_EXEC
|
|
||||||
# define MFD_EXEC 0x0010U
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
 * Our own wrapper for memfd_create(2): issues the raw system call when a
 * syscall number is known at build time, and otherwise fails with ENOSYS.
 * Returns the new file descriptor, or -1 with errno set.
 */
int memfd_create(const char *name, unsigned int flags)
{
#ifndef SYS_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(SYS_memfd_create, name, flags);
#endif
}
|
|
||||||
|
|
||||||
/* This comes directly from <linux/fcntl.h>. */
|
|
||||||
#ifndef F_LINUX_SPECIFIC_BASE
|
|
||||||
# define F_LINUX_SPECIFIC_BASE 1024
|
|
||||||
#endif
|
|
||||||
#ifndef F_ADD_SEALS
|
|
||||||
# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
|
|
||||||
# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
|
|
||||||
#endif
|
|
||||||
#ifndef F_SEAL_SEAL
|
|
||||||
# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
|
|
||||||
# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
|
|
||||||
# define F_SEAL_GROW 0x0004 /* prevent file from growing */
|
|
||||||
# define F_SEAL_WRITE 0x0008 /* prevent writes */
|
|
||||||
#endif
|
|
||||||
#ifndef F_SEAL_FUTURE_WRITE
|
|
||||||
# define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */
|
|
||||||
#endif
|
|
||||||
#ifndef F_SEAL_EXEC
|
|
||||||
# define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
|
|
||||||
#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
|
|
||||||
/*
|
|
||||||
* There are newer memfd seals (such as F_SEAL_FUTURE_WRITE and F_SEAL_EXEC),
|
|
||||||
* which we use opportunistically. However, this set is the original set of
|
|
||||||
* memfd seals, and we require them all to be set to trust our /proc/self/exe
|
|
||||||
* if it is a memfd.
|
|
||||||
*/
|
|
||||||
#define RUNC_MEMFD_MIN_SEALS \
|
|
||||||
(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
|
|
||||||
|
|
||||||
/*
 * realloc(3) that never reports failure: the call is retried with the same
 * pointer and size until it returns a non-NULL block, which is then returned.
 */
static void *must_realloc(void *ptr, size_t size)
{
	for (;;) {
		void *resized = realloc(ptr, size);

		if (resized)
			return resized;
	}
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Verify whether we are currently in a self-cloned program (namely, is
|
|
||||||
* /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
|
|
||||||
* for shmem files), and we want to be sure it's actually sealed.
|
|
||||||
*/
|
|
||||||
static int is_self_cloned(void)
|
|
||||||
{
|
|
||||||
int fd, seals = 0, is_cloned = false;
|
|
||||||
struct stat statbuf = { };
|
|
||||||
struct statfs fsbuf = { };
|
|
||||||
|
|
||||||
fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
|
|
||||||
if (fd < 0) {
|
|
||||||
write_log(ERROR, "cannot open runc binary for reading: open /proc/self/exe: %m");
|
|
||||||
return -ENOTRECOVERABLE;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
|
|
||||||
* this, because you cannot write to a sealed memfd no matter what.
|
|
||||||
*/
|
|
||||||
seals = fcntl(fd, F_GET_SEALS);
|
|
||||||
if (seals >= 0) {
|
|
||||||
write_log(DEBUG, "checking /proc/self/exe memfd seals: 0x%x", seals);
|
|
||||||
is_cloned = (seals & RUNC_MEMFD_MIN_SEALS) == RUNC_MEMFD_MIN_SEALS;
|
|
||||||
if (is_cloned)
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* All other forms require CLONED_BINARY_ENV, since they are potentially
|
|
||||||
* writeable (or we can't tell if they're fully safe) and thus we must
|
|
||||||
* check the environment as an extra layer of defence.
|
|
||||||
*/
|
|
||||||
if (!getenv(CLONED_BINARY_ENV)) {
|
|
||||||
is_cloned = false;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Is the binary on a read-only filesystem? We can't detect bind-mounts in
|
|
||||||
* particular (in-kernel they are identical to regular mounts) but we can
|
|
||||||
* at least be sure that it's read-only. In addition, to make sure that
|
|
||||||
* it's *our* bind-mount we check CLONED_BINARY_ENV.
|
|
||||||
*/
|
|
||||||
if (fstatfs(fd, &fsbuf) >= 0)
|
|
||||||
is_cloned |= (fsbuf.f_flags & MS_RDONLY);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
|
|
||||||
* which appears to have a borked backport of F_GET_SEALS. Either way,
|
|
||||||
* having a file which has no hardlinks indicates that we aren't using
|
|
||||||
* a host-side "runc" binary and this is something that a container
|
|
||||||
* cannot fake (because unlinking requires being able to resolve the
|
|
||||||
* path that you want to unlink).
|
|
||||||
*/
|
|
||||||
if (fstat(fd, &statbuf) >= 0)
|
|
||||||
is_cloned |= (statbuf.st_nlink == 0);
|
|
||||||
|
|
||||||
out:
|
|
||||||
close(fd);
|
|
||||||
return is_cloned;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Read the whole file at path into a newly allocated buffer and store the
 * number of bytes read in *length. Returns the buffer, which the caller must
 * free(3), or NULL if length is NULL, the file cannot be opened, or a read
 * fails. An empty file also yields NULL, with *length set to 0.
 */
static char *read_file(char *path, size_t *length)
{
	char chunk[4096], *contents = NULL;
	ssize_t got;
	int fd;

	if (!length)
		return NULL;

	fd = open(path, O_RDONLY | O_CLOEXEC);
	if (fd < 0)
		return NULL;

	*length = 0;
	/* Append one chunk at a time until EOF (0) or a read error (< 0). */
	while ((got = read(fd, chunk, sizeof(chunk))) > 0) {
		contents = must_realloc(contents, (*length + got) * sizeof(*contents));
		memcpy(contents + *length, chunk, got);
		*length += got;
	}
	close(fd);

	if (got < 0) {
		free(contents);
		return NULL;
	}
	return contents;
}
|
|
||||||
|
|
||||||
/*
 * A poor-man's version of "xargs -0". Basically parses a given block of
 * NUL-delimited data, within the given length and adds a pointer to each entry
 * to the array of pointers.
 *
 * data        - buffer holding the NUL-delimited entries; the entries in
 *               *output point into this buffer, so it must outlive them.
 *               NOTE(review): the final entry is measured with strlen(), so
 *               the buffer is assumed to end with a NUL byte -- confirm for
 *               any new caller.
 * data_length - number of bytes of data to parse.
 * output      - must point to a NULL array pointer; on return with a positive
 *               count it points to a NULL-terminated array of entries.
 *
 * Returns the number of entries found, or -1 if data or output is NULL or
 * *output is not NULL. If there are no entries (data_length <= 0), 0 is
 * returned and *output is left NULL.
 */
static int parse_xargs(char *data, int data_length, char ***output)
{
	int num = 0;
	char *cur = data;

	if (!data || !output || *output != NULL)
		return -1;

	while (cur < data + data_length) {
		num++;
		/* Always keep one spare slot for the terminating NULL entry. */
		*output = must_realloc(*output, (num + 1) * sizeof(**output));
		(*output)[num - 1] = cur;
		cur += strlen(cur) + 1;
	}

	/* With no entries nothing was allocated, so there is nothing to terminate. */
	if (num > 0)
		(*output)[num] = NULL;
	return num;
}
|
|
||||||
|
|
||||||
/*
 * "Parse" out argv from /proc/self/cmdline.
 * This is necessary because we are running in a context where we don't have a
 * main() that we can just get the arguments from.
 *
 * Returns 0 on success, with *argv filled in by parse_xargs(), or -EINVAL if
 * the command line could not be read or contained no entries.
 */
static int fetchve(char ***argv)
{
	size_t cmdline_len;
	char *cmdline = read_file("/proc/self/cmdline", &cmdline_len);

	/*
	 * On success the buffer is intentionally kept: the argv entries are
	 * pointers into it.
	 */
	if (cmdline && parse_xargs(cmdline, cmdline_len, argv) > 0)
		return 0;

	free(cmdline);
	return -EINVAL;
}
|
|
||||||
|
|
||||||
enum {
|
|
||||||
EFD_NONE = 0,
|
|
||||||
EFD_MEMFD,
|
|
||||||
EFD_FILE,
|
|
||||||
};
|
|
||||||
|
|
||||||
/*
|
|
||||||
* This comes from <linux/fcntl.h>. We can't hard-code __O_TMPFILE because it
|
|
||||||
* changes depending on the architecture. If we don't have O_TMPFILE we always
|
|
||||||
* have the mkostemp(3) fallback.
|
|
||||||
*/
|
|
||||||
#ifndef O_TMPFILE
|
|
||||||
# if defined(__O_TMPFILE) && defined(O_DIRECTORY)
|
|
||||||
# define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
|
|
||||||
# endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static inline bool is_memfd_unsupported_error(int err)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* - ENOSYS is obviously an "unsupported" error.
|
|
||||||
*
|
|
||||||
* - EINVAL could be hit if MFD_EXEC is not supported (pre-6.3 kernel),
|
|
||||||
* but it can also be hit if vm.memfd_noexec=2 (in kernels without
|
|
||||||
* [1] applied) and the flags does not contain MFD_EXEC. However,
|
|
||||||
* there was a bug in the original 6.3 implementation of
|
|
||||||
* vm.memfd_noexec=2, which meant that MFD_EXEC would work even in
|
|
||||||
* the "strict" mode. Because we try MFD_EXEC first, we won't get
|
|
||||||
* EINVAL in the vm.memfd_noexec=2 case (which means we don't need to
|
|
||||||
* figure out whether to log the message about memfd_create).
|
|
||||||
*
|
|
||||||
* - EACCES is returned in kernels that contain [1] in the
|
|
||||||
* vm.memfd_noexec=2 case.
|
|
||||||
*
|
|
||||||
* At time of writing, [1] is not in Linus's tree and it't not clear if
|
|
||||||
* it will be backported to stable, so what exact versions apply here
|
|
||||||
* is unclear. But the bug is present in 6.3-6.5 at the very least.
|
|
||||||
*
|
|
||||||
* [1]: https://lore.kernel.org/all/20230705063315.3680666-2-jeffxu@google.com/
|
|
||||||
*/
|
|
||||||
if (err == EACCES)
|
|
||||||
write_log(INFO,
|
|
||||||
"memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE");
|
|
||||||
return err == ENOSYS || err == EINVAL || err == EACCES;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int make_execfd(int *fdtype)
|
|
||||||
{
|
|
||||||
int fd = -1;
|
|
||||||
char template[PATH_MAX] = { 0 };
|
|
||||||
char *prefix = getenv("_LIBCONTAINER_STATEDIR");
|
|
||||||
|
|
||||||
if (!prefix || *prefix != '/')
|
|
||||||
prefix = "/tmp";
|
|
||||||
if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Now try memfd, it's much nicer than actually creating a file in STATEDIR
|
|
||||||
* since it's easily detected thanks to sealing and also doesn't require
|
|
||||||
* assumptions about STATEDIR.
|
|
||||||
*/
|
|
||||||
*fdtype = EFD_MEMFD;
|
|
||||||
/*
|
|
||||||
* On newer kernels we should set MFD_EXEC to indicate we need +x
|
|
||||||
* permissions. Otherwise an admin with vm.memfd_noexec=1 would subtly
|
|
||||||
* break runc. vm.memfd_noexec=2 is a little bit more complicated, see the
|
|
||||||
* comment in is_memfd_unsupported_error() -- the upshot is that doing it
|
|
||||||
* this way works, but only because of two overlapping bugs in the sysctl
|
|
||||||
* implementation.
|
|
||||||
*/
|
|
||||||
fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING);
|
|
||||||
if (fd < 0 && is_memfd_unsupported_error(errno))
|
|
||||||
fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
|
|
||||||
if (fd >= 0)
|
|
||||||
return fd;
|
|
||||||
if (!is_memfd_unsupported_error(errno))
|
|
||||||
goto error;
|
|
||||||
|
|
||||||
#ifdef O_TMPFILE
|
|
||||||
/*
|
|
||||||
* Try O_TMPFILE to avoid races where someone might snatch our file. Note
|
|
||||||
* that O_EXCL isn't actually a security measure here (since you can just
|
|
||||||
* fd re-open it and clear O_EXCL).
|
|
||||||
*/
|
|
||||||
*fdtype = EFD_FILE;
|
|
||||||
fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
|
|
||||||
if (fd >= 0) {
|
|
||||||
struct stat statbuf = { };
|
|
||||||
bool working_otmpfile = false;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* open(2) ignores unknown O_* flags -- yeah, I was surprised when I
|
|
||||||
* found this out too. As a result we can't check for EINVAL. However,
|
|
||||||
* if we get nlink != 0 (or EISDIR) then we know that this kernel
|
|
||||||
* doesn't support O_TMPFILE.
|
|
||||||
*/
|
|
||||||
if (fstat(fd, &statbuf) >= 0)
|
|
||||||
working_otmpfile = (statbuf.st_nlink == 0);
|
|
||||||
|
|
||||||
if (working_otmpfile)
|
|
||||||
return fd;
|
|
||||||
|
|
||||||
/* Pretend that we got EISDIR since O_TMPFILE failed. */
|
|
||||||
close(fd);
|
|
||||||
errno = EISDIR;
|
|
||||||
}
|
|
||||||
if (errno != EISDIR)
|
|
||||||
goto error;
|
|
||||||
#endif /* defined(O_TMPFILE) */
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Our final option is to create a temporary file the old-school way, and
|
|
||||||
* then unlink it so that nothing else sees it by accident.
|
|
||||||
*/
|
|
||||||
*fdtype = EFD_FILE;
|
|
||||||
fd = mkostemp(template, O_CLOEXEC);
|
|
||||||
if (fd >= 0) {
|
|
||||||
if (unlink(template) >= 0)
|
|
||||||
return fd;
|
|
||||||
close(fd);
|
|
||||||
}
|
|
||||||
|
|
||||||
error:
|
|
||||||
*fdtype = EFD_NONE;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int seal_execfd(int *fd, int fdtype)
|
|
||||||
{
|
|
||||||
switch (fdtype) {
|
|
||||||
case EFD_MEMFD:{
|
|
||||||
/*
|
|
||||||
* Try to seal with newer seals, but we ignore errors because older
|
|
||||||
* kernels don't support some of them. For container security only
|
|
||||||
* RUNC_MEMFD_MIN_SEALS are strictly required, but the rest are
|
|
||||||
* nice-to-haves. We apply RUNC_MEMFD_MIN_SEALS at the end because it
|
|
||||||
* contains F_SEAL_SEAL.
|
|
||||||
*/
|
|
||||||
int __attribute__((unused)) _err1 = fcntl(*fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE); // Linux 5.1
|
|
||||||
int __attribute__((unused)) _err2 = fcntl(*fd, F_ADD_SEALS, F_SEAL_EXEC); // Linux 6.3
|
|
||||||
return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_MIN_SEALS);
|
|
||||||
}
|
|
||||||
case EFD_FILE:{
|
|
||||||
/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
|
|
||||||
int newfd;
|
|
||||||
char fdpath[PATH_MAX] = { 0 };
|
|
||||||
|
|
||||||
if (fchmod(*fd, 0100) < 0)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
newfd = open(fdpath, O_PATH | O_CLOEXEC);
|
|
||||||
if (newfd < 0)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
close(*fd);
|
|
||||||
*fd = newfd;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Copy everything readable from infd into outfd using a user-space buffer.
 *
 * Reads until read(2) reports end-of-file, writing each chunk out in full
 * (short writes are continued). Reads and writes interrupted by a signal
 * (EINTR) are retried rather than treated as a failure.
 *
 * Returns the total number of bytes copied, or -1 if read(2) or write(2)
 * fails with any other error.
 */
static ssize_t fd_to_fd(int outfd, int infd)
{
	ssize_t total = 0;
	char buffer[4096];

	for (;;) {
		ssize_t nread, nwritten = 0;

		nread = read(infd, buffer, sizeof(buffer));
		if (nread < 0) {
			/* A signal arrived before any data was read; try again. */
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (nread == 0)
			break;

		/* write(2) may accept fewer bytes than asked; drain the chunk. */
		while (nwritten < nread) {
			ssize_t n = write(outfd, buffer + nwritten, nread - nwritten);
			if (n < 0) {
				if (errno == EINTR)
					continue;
				return -1;
			}
			nwritten += n;
		}

		total += nwritten;
	}

	return total;
}
|
|
||||||
|
|
||||||
static int clone_binary(void)
|
|
||||||
{
|
|
||||||
int binfd, execfd;
|
|
||||||
struct stat statbuf = { };
|
|
||||||
size_t sent = 0;
|
|
||||||
int fdtype = EFD_NONE;
|
|
||||||
|
|
||||||
execfd = make_execfd(&fdtype);
|
|
||||||
if (execfd < 0 || fdtype == EFD_NONE)
|
|
||||||
return -ENOTRECOVERABLE;
|
|
||||||
|
|
||||||
binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
|
|
||||||
if (binfd < 0)
|
|
||||||
goto error;
|
|
||||||
|
|
||||||
if (fstat(binfd, &statbuf) < 0)
|
|
||||||
goto error_binfd;
|
|
||||||
|
|
||||||
while (sent < statbuf.st_size) {
|
|
||||||
int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent);
|
|
||||||
if (n < 0) {
|
|
||||||
/* sendfile can fail so we fallback to a dumb user-space copy. */
|
|
||||||
n = fd_to_fd(execfd, binfd);
|
|
||||||
if (n < 0)
|
|
||||||
goto error_binfd;
|
|
||||||
}
|
|
||||||
sent += n;
|
|
||||||
}
|
|
||||||
close(binfd);
|
|
||||||
if (sent != statbuf.st_size)
|
|
||||||
goto error;
|
|
||||||
|
|
||||||
if (seal_execfd(&execfd, fdtype) < 0)
|
|
||||||
goto error;
|
|
||||||
|
|
||||||
return execfd;
|
|
||||||
|
|
||||||
error_binfd:
|
|
||||||
close(binfd);
|
|
||||||
error:
|
|
||||||
close(execfd);
|
|
||||||
return -EIO;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Get cheap access to the environment. */
|
|
||||||
extern char **environ;
|
|
||||||
|
|
||||||
int ensure_cloned_binary(void)
|
|
||||||
{
|
|
||||||
int execfd;
|
|
||||||
char **argv = NULL;
|
|
||||||
|
|
||||||
/* Check that we're not self-cloned, and if we are then bail. */
|
|
||||||
int cloned = is_self_cloned();
|
|
||||||
if (cloned > 0 || cloned == -ENOTRECOVERABLE)
|
|
||||||
return cloned;
|
|
||||||
|
|
||||||
if (fetchve(&argv) < 0)
|
|
||||||
return -EINVAL;
|
|
||||||
|
|
||||||
execfd = clone_binary();
|
|
||||||
if (execfd < 0)
|
|
||||||
return -EIO;
|
|
||||||
|
|
||||||
if (putenv(CLONED_BINARY_ENV "=1"))
|
|
||||||
goto error;
|
|
||||||
|
|
||||||
fexecve(execfd, argv, environ);
|
|
||||||
error:
|
|
||||||
close(execfd);
|
|
||||||
return -ENOEXEC;
|
|
||||||
}
|
|
@@ -536,9 +536,6 @@ void join_namespaces(char *nslist)
|
|||||||
free(namespaces);
|
free(namespaces);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Defined in cloned_binary.c. */
|
|
||||||
extern int ensure_cloned_binary(void);
|
|
||||||
|
|
||||||
static inline int sane_kill(pid_t pid, int signum)
|
static inline int sane_kill(pid_t pid, int signum)
|
||||||
{
|
{
|
||||||
if (pid > 0)
|
if (pid > 0)
|
||||||
@@ -791,14 +788,6 @@ void nsexec(void)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* We need to re-exec if we are not in a cloned binary. This is necessary
|
|
||||||
* to ensure that containers won't be able to access the host binary
|
|
||||||
* through /proc/self/exe. See CVE-2019-5736.
|
|
||||||
*/
|
|
||||||
if (ensure_cloned_binary() < 0)
|
|
||||||
bail("could not ensure we are a cloned binary");
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Inform the parent we're past initial setup.
|
* Inform the parent we're past initial setup.
|
||||||
* For the other side of this, see initWaiter.
|
* For the other side of this, see initWaiter.
|
||||||
|
@@ -49,6 +49,9 @@ type Process struct {
|
|||||||
// ExtraFiles specifies additional open files to be inherited by the container
|
// ExtraFiles specifies additional open files to be inherited by the container
|
||||||
ExtraFiles []*os.File
|
ExtraFiles []*os.File
|
||||||
|
|
||||||
|
// open handles to cloned binaries -- see dmz.ClonedBinary for more details
|
||||||
|
clonedExes []*os.File
|
||||||
|
|
||||||
// Initial sizings for the console
|
// Initial sizings for the console
|
||||||
ConsoleWidth uint16
|
ConsoleWidth uint16
|
||||||
ConsoleHeight uint16
|
ConsoleHeight uint16
|
||||||
@@ -121,6 +124,15 @@ func (p Process) Signal(sig os.Signal) error {
|
|||||||
return p.ops.signal(sig)
|
return p.ops.signal(sig)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// closeClonedExes cleans up any existing cloned binaries associated with the
|
||||||
|
// Process.
|
||||||
|
func (p *Process) closeClonedExes() {
|
||||||
|
for _, exe := range p.clonedExes {
|
||||||
|
_ = exe.Close()
|
||||||
|
}
|
||||||
|
p.clonedExes = nil
|
||||||
|
}
|
||||||
|
|
||||||
// IO holds the process's STDIO
|
// IO holds the process's STDIO
|
||||||
type IO struct {
|
type IO struct {
|
||||||
Stdin io.WriteCloser
|
Stdin io.WriteCloser
|
||||||
|
@@ -4,10 +4,12 @@
|
|||||||
package system
|
package system
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
|
||||||
|
"github.com/sirupsen/logrus"
|
||||||
"golang.org/x/sys/unix"
|
"golang.org/x/sys/unix"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -102,3 +104,29 @@ func GetSubreaper() (int, error) {
|
|||||||
|
|
||||||
return int(i), nil
|
return int(i), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ExecutableMemfd(comment string, flags int) (*os.File, error) {
|
||||||
|
// Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this
|
||||||
|
// flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an
|
||||||
|
// executable memfd. For vm.memfd_noexec=2 this is a bit more complicated.
|
||||||
|
// The original vm.memfd_noexec=2 implementation incorrectly silently
|
||||||
|
// allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer
|
||||||
|
// kernels, we will get -EACCES if we try to use MFD_EXEC with
|
||||||
|
// vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value).
|
||||||
|
//
|
||||||
|
// The upshot is we only need to retry without MFD_EXEC on -EINVAL because
|
||||||
|
// it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on
|
||||||
|
// kernels where -EINVAL is actually a security denial.
|
||||||
|
memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC)
|
||||||
|
if err == unix.EINVAL {
|
||||||
|
memfd, err = unix.MemfdCreate(comment, flags)
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
if err == unix.EACCES {
|
||||||
|
logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE")
|
||||||
|
}
|
||||||
|
err := os.NewSyscallError("memfd_create", err)
|
||||||
|
return nil, fmt.Errorf("failed to create executable memfd: %w", err)
|
||||||
|
}
|
||||||
|
return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user