Open bind mount sources from the host userns

The source of the bind mount might not be accessible in a different user
namespace because a component of the source path might not be traversable
by the users and groups mapped inside the user namespace. This caused
errors such as the following:

  # time="2020-06-22T13:48:26Z" level=error msg="container_linux.go:367:
  starting container process caused: process_linux.go:459:
  container init caused: rootfs_linux.go:58:
  mounting \"/tmp/busyboxtest/source-inaccessible/dir\"
  to rootfs at \"/tmp/inaccessible\" caused:
  stat /tmp/busyboxtest/source-inaccessible/dir: permission denied"

To solve this problem, this patch performs the following:

1. in nsexec.c, it opens the source path in the host userns (so we have
   the right permissions to open it) but in the container mntns (so the
   kernel's cross-mntns mount check lets us mount it later:
   https://github.com/torvalds/linux/blob/v5.8/fs/namespace.c#L2312).

2. in nsexec.c, it passes the file descriptors of the source to the
   child process with SCM_RIGHTS.

3. in runc init (Go code), it finishes the mounts while inside the
   userns even without access to some components of the source
   paths.

Passing the fds with SCM_RIGHTS is necessary because once the child
process is in the container mntns, it is already in the container userns
so it cannot temporarily join the host mntns.

This patch uses the existing mechanism with _LIBCONTAINER_* environment
variables to pass the file descriptors from runc to runc init.

This patch uses the existing mechanism with the Netlink-style bootstrap
to pass information about the list of source mounts to nsexec.c.

Rootless containers don't use this bind mount sources fdpassing
mechanism because we can't setns() to the target mntns in a rootless
container (we don't have the privileges when we are in the host userns).

This patch takes care of using O_CLOEXEC on mount fds, and closes them
early.

Fixes: #2484.

Signed-off-by: Alban Crequy <alban@kinvolk.io>
Signed-off-by: Rodrigo Campos <rodrigo@kinvolk.io>
Co-authored-by: Rodrigo Campos <rodrigo@kinvolk.io>
This commit is contained in:
Alban Crequy
2020-09-03 14:41:05 +02:00
committed by Rodrigo Campos
parent 2357eab8ca
commit 9c444070ec
8 changed files with 412 additions and 25 deletions

View File

@@ -1,5 +1,7 @@
package configs
import "golang.org/x/sys/unix"
const (
// EXT_COPYUP is a directive to copy up the contents of a directory when
// a tmpfs is mounted over it.
@@ -37,3 +39,7 @@ type Mount struct {
// Optional Command to be run after Source is mounted.
PostmountCmds []Command `json:"postmount_cmds"`
}
// IsBind reports whether the MS_BIND flag is set in the mount's Flags,
// i.e. whether this mount is a bind mount.
func (m *Mount) IsBind() bool {
	bindBit := m.Flags & unix.MS_BIND
	return bindBit != 0
}

View File

@@ -521,6 +521,33 @@ func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, chi
return cmd
}
// shouldSendMountSources reports whether the sources of bind mounts have to
// be pre-opened (O_PATH) in the host user namespace and handed over to the
// child process, instead of being opened by the child itself.
// See https://github.com/opencontainers/runc/issues/2484
//
// It returns true only when all of the following hold:
//   - the container config has both a user namespace and a mount namespace;
//   - the container is not running with a rootless EUID;
//   - at least one configured mount is a bind mount.
func (c *linuxContainer) shouldSendMountSources() bool {
	namespaces := c.config.Namespaces

	// Handing over the mount sources via SCM_RIGHTS only matters when the
	// userns and the mntns are both in use.
	if !(namespaces.Contains(configs.NEWUSER) && namespaces.Contains(configs.NEWNS)) {
		return false
	}

	// send_mountsources() in nsexec.c has to setns() into the mntns, which
	// needs CAP_SYS_CHROOT and CAP_SYS_ADMIN.
	if c.config.RootlessEUID {
		return false
	}

	// Without any bind mount there is nothing to send.
	for i := range c.config.Mounts {
		if c.config.Mounts[i].IsBind() {
			return true
		}
	}
	return false
}
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
@@ -530,10 +557,40 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPa
}
}
_, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
if err != nil {
return nil, err
}
if c.shouldSendMountSources() {
// Elements on this slice will be paired with mounts (see StartInitialization() and
// prepareRootfs()). This slice MUST have the same size as c.config.Mounts.
mountFds := make([]int, len(c.config.Mounts))
for i, m := range c.config.Mounts {
if !m.IsBind() {
// Non bind-mounts do not use an fd.
mountFds[i] = -1
continue
}
// The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need
// to allocate a fd so that we know the number to pass in the environment variable. The fd
// must not be closed before cmd.Start(), so we reuse messageSockPair.child because the
// lifecycle of that fd is already taken care of.
cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child)
mountFds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
}
mountFdsJson, err := json.Marshal(mountFds)
if err != nil {
return nil, fmt.Errorf("Error creating _LIBCONTAINER_MOUNT_FDS: %w", err)
}
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_MOUNT_FDS="+string(mountFdsJson),
)
}
init := &initProcess{
cmd: cmd,
messageSockPair: messageSockPair,
@@ -558,7 +615,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP
}
// for setns process, we don't have to set cloneflags as the process namespaces
// will only be set via setns syscall
data, err := c.bootstrapData(0, state.NamespacePaths)
data, err := c.bootstrapData(0, state.NamespacePaths, initSetns)
if err != nil {
return nil, err
}
@@ -1213,7 +1270,9 @@ func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
case "bind":
// The prepareBindMount() function checks if source
// exists. So it cannot be used for other filesystem types.
if err := prepareBindMount(m, c.config.Rootfs); err != nil {
// TODO: pass something else than nil? Not sure if criu is
// impacted by issue #2484
if err := prepareBindMount(m, c.config.Rootfs, nil); err != nil {
return err
}
default:
@@ -2050,7 +2109,7 @@ func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (io.Reader, error) {
// create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0)
@@ -2132,6 +2191,22 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
Value: c.config.RootlessEUID,
})
// Bind mount source to open.
if it == initStandard && c.shouldSendMountSources() {
var mounts []byte
for _, m := range c.config.Mounts {
if m.IsBind() {
mounts = append(mounts, []byte(m.Source)...)
}
mounts = append(mounts, byte(0))
}
r.AddData(&Bytemsg{
Type: MountSourcesAttr,
Value: mounts,
})
}
return bytes.NewReader(r.Serialize()), nil
}

View File

@@ -295,6 +295,12 @@ func (l *LinuxFactory) StartInitialization() (err error) {
return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err)
}
// Get mount files (O_PATH).
mountFds, err := parseMountFds()
if err != nil {
return err
}
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
@@ -305,7 +311,7 @@ func (l *LinuxFactory) StartInitialization() (err error) {
}
}()
i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd)
i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds)
if err != nil {
return err
}
@@ -359,3 +365,18 @@ func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error {
return nil
}
}
// parseMountFds reads the _LIBCONTAINER_MOUNT_FDS environment variable and
// decodes it as a JSON array of integers.
//
// When the variable is unset or empty, it returns a nil slice and no error.
// When the value is not valid JSON for a []int, it returns a nil slice and
// an error wrapping the decoding failure.
func parseMountFds() ([]int, error) {
	encoded := os.Getenv("_LIBCONTAINER_MOUNT_FDS")
	if len(encoded) == 0 {
		// No fds were passed: callers rely on getting the nil slice here.
		return nil, nil
	}

	var fds []int
	err := json.Unmarshal([]byte(encoded), &fds)
	if err != nil {
		return nil, fmt.Errorf("Error unmarshalling _LIBCONTAINER_MOUNT_FDS: %w", err)
	}
	return fds, nil
}

View File

@@ -76,7 +76,7 @@ type initer interface {
Init() error
}
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) (initer, error) {
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds []int) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
@@ -86,6 +86,11 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd,
}
switch t {
case initSetns:
// mountFds must be nil in this case. We don't mount while doing runc exec.
if mountFds != nil {
return nil, errors.New("mountFds must be nil. Can't mount while doing runc exec.")
}
return &linuxSetnsInit{
pipe: pipe,
consoleSocket: consoleSocket,
@@ -100,6 +105,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd,
config: config,
fifoFd: fifoFd,
logFd: logFd,
mountFds: mountFds,
}, nil
}
return nil, fmt.Errorf("unknown init type %q", t)

View File

@@ -18,6 +18,7 @@ const (
RootlessEUIDAttr uint16 = 27287
UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289
MountSourcesAttr uint16 = 27290
)
type Int32msg struct {

View File

@@ -4,6 +4,7 @@
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <limits.h>
#include <sched.h>
#include <setjmp.h>
#include <signal.h>
@@ -39,6 +40,8 @@ enum sync_t {
SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */
SYNC_MOUNTSOURCES_PLS = 0x46, /* Tell parent to send mount sources by SCM_RIGHTS. */
SYNC_MOUNTSOURCES_ACK = 0x47, /* All mount sources have been sent. */
};
#define STAGE_SETUP -1
@@ -87,6 +90,10 @@ struct nlconfig_t {
size_t uidmappath_len;
char *gidmappath;
size_t gidmappath_len;
/* Mount sources opened outside the container userns. */
char *mountsources;
size_t mountsources_len;
};
/*
@@ -119,6 +126,7 @@ static int loglevel = DEBUG;
#define ROOTLESS_EUID_ATTR 27287
#define UIDMAPPATH_ATTR 27288
#define GIDMAPPATH_ATTR 27289
#define MOUNT_SOURCES_ATTR 27290
/*
* Use the raw syscall for versions of glibc which don't include a function for
@@ -542,6 +550,10 @@ static void nl_parse(int fd, struct nlconfig_t *config)
case SETGROUP_ATTR:
config->is_setgroup = readint8(current);
break;
case MOUNT_SOURCES_ATTR:
config->mountsources = current;
config->mountsources_len = payload_len;
break;
default:
bail("unknown netlink message type %d", nlattr->nla_type);
}
@@ -633,6 +645,193 @@ static inline int sane_kill(pid_t pid, int signum)
return 0;
}
/*
 * receive_fd() receives exactly one file descriptor from the unix socket
 * sockfd and installs it as descriptor number new_fd with O_CLOEXEC set.
 *
 * The descriptor is expected as SCM_RIGHTS ancillary data attached to a
 * message carrying a single data byte (the format produced by send_fd()).
 * The temporary descriptor number the kernel picked for the received fd is
 * closed once it has been duplicated onto new_fd.
 *
 * Every failure (allocation, short read, truncated or unexpected control
 * message, dup3 or close error) is reported through bail().
 */
void receive_fd(int sockfd, int new_fd)
{
	int bytes_read;
	struct msghdr msg = { };
	struct cmsghdr *cmsg;
	struct iovec iov = { };
	char null_byte = '\0';
	int ret;
	int fd_count;
	int received_fd;

	iov.iov_base = &null_byte;
	iov.iov_len = 1;

	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	msg.msg_controllen = CMSG_SPACE(sizeof(int));
	msg.msg_control = malloc(msg.msg_controllen);
	if (msg.msg_control == NULL) {
		bail("Can't allocate memory to receive fd.");
	}

	memset(msg.msg_control, 0, msg.msg_controllen);

	bytes_read = recvmsg(sockfd, &msg, 0);
	if (bytes_read != 1)
		bail("failed to receive fd from unix socket %d", sockfd);
	if (msg.msg_flags & MSG_CTRUNC)
		bail("received truncated control message from unix socket %d", sockfd);

	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg)
		bail("received message from unix socket %d without control message", sockfd);

	if (cmsg->cmsg_level != SOL_SOCKET)
		bail("received unknown control message from unix socket %d: cmsg_level=%d", sockfd, cmsg->cmsg_level);

	if (cmsg->cmsg_type != SCM_RIGHTS)
		bail("received unknown control message from unix socket %d: cmsg_type=%d", sockfd, cmsg->cmsg_type);

	fd_count = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
	if (fd_count != 1)
		bail("received control message from unix socket %d with too many fds: %d", sockfd, fd_count);

	/*
	 * The fd lives inside the control buffer. Copy it out before the
	 * buffer is released so that nothing below reads freed memory.
	 */
	memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(received_fd));
	free(msg.msg_control);

	ret = dup3(received_fd, new_fd, O_CLOEXEC);
	if (ret < 0)
		bail("cannot dup3 fd %d to %d", received_fd, new_fd);

	/* new_fd now refers to the file; the temporary number is not needed. */
	ret = close(received_fd);
	if (ret < 0)
		bail("cannot close fd %d", received_fd);
}
/*
 * send_fd() transmits the file descriptor fd over the unix socket sockfd.
 *
 * The descriptor travels as SCM_RIGHTS ancillary data attached to a message
 * whose payload is a single NUL byte. bail() is called when the control
 * buffer cannot be allocated or when sendmsg() does not report exactly one
 * byte written.
 */
void send_fd(int sockfd, int fd)
{
	char payload = '\0';
	struct iovec data = { };
	struct msghdr message = { };
	struct cmsghdr *header;
	int sent;

	data.iov_base = &payload;
	data.iov_len = 1;

	message.msg_iov = &data;
	message.msg_iovlen = 1;

	/*
	 * CMSG_SPACE() may include alignment padding, so the buffer can be
	 * larger than strictly needed; cmsg_len below still describes a
	 * single fd only.
	 */
	message.msg_controllen = CMSG_SPACE(sizeof(int));
	message.msg_control = calloc(1, message.msg_controllen);
	if (message.msg_control == NULL) {
		bail("Can't allocate memory to send fd.");
	}

	header = CMSG_FIRSTHDR(&message);
	header->cmsg_level = SOL_SOCKET;
	header->cmsg_type = SCM_RIGHTS;
	header->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(header), &fd, sizeof(int));

	sent = sendmsg(sockfd, &message, 0);

	free(message.msg_control);

	if (sent != 1)
		bail("failed to send fd %d via unix socket %d", fd, sockfd);
}
/*
 * receive_mountsources() receives the bind mount source fds from sockfd and
 * installs each of them at the descriptor number listed in the
 * _LIBCONTAINER_MOUNT_FDS environment variable.
 *
 * The variable must hold a JSON array of integers such as "[5,-1,6]". An
 * entry of -1 means that no fd is associated with that position and nothing
 * is received for it; for every other entry one fd is read with
 * receive_fd(). A missing or malformed variable, or an entry outside the
 * range of a valid fd, is reported through bail().
 */
void receive_mountsources(int sockfd)
{
	char *mount_fds, *endp;
	long new_fd;

	// This env var must be a json array of ints.
	mount_fds = getenv("_LIBCONTAINER_MOUNT_FDS");
	// getenv() returns NULL when the variable is unset; fail with a clear
	// message instead of dereferencing a NULL pointer below.
	if (mount_fds == NULL) {
		bail("missing _LIBCONTAINER_MOUNT_FDS env var");
	}

	if (mount_fds[0] != '[') {
		bail("malformed _LIBCONTAINER_MOUNT_FDS env var: missing '['");
	}
	mount_fds++;

	// After each number endp points at the separator that follows it; the
	// next number starts one character later, and ']' ends the list.
	for (endp = mount_fds; *endp != ']'; mount_fds = endp + 1) {
		new_fd = strtol(mount_fds, &endp, 10);
		if (endp == mount_fds) {
			bail("malformed _LIBCONTAINER_MOUNT_FDS env var: not a number");
		}
		if (*endp == '\0') {
			bail("malformed _LIBCONTAINER_MOUNT_FDS env var: missing ]");
		}
		// The list contains -1 when no fd is needed. Ignore them.
		if (new_fd == -1) {
			continue;
		}

		if (new_fd == LONG_MAX || new_fd < 0 || new_fd > INT_MAX) {
			bail("malformed _LIBCONTAINER_MOUNT_FDS env var: fds out of range");
		}

		receive_fd(sockfd, new_fd);
	}
}
/*
 * send_mountsources() opens every bind mount source with O_PATH and sends
 * the resulting fds over sockfd with send_fd().
 *
 * mountsources is a buffer of mountsources_len bytes holding NUL-separated
 * paths; empty entries are skipped. The paths are opened while the calling
 * process is temporarily switched into the mount namespace of the process
 * `child`; the original mount namespace is restored before returning.
 *
 * When mountsources is NULL the function does nothing. Any failure is
 * reported through bail().
 */
void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mountsources_len)
{
	char ns_path[PATH_MAX];
	char *cursor, *end;
	int host_ns, container_ns;
	int source_fd;

	// container_linux.go shouldSendMountSources() decides if mount sources
	// should be pre-opened (O_PATH) and passed via SCM_RIGHTS
	if (mountsources == NULL)
		return;

	// Keep a handle on the current mntns so that we can come back to it.
	host_ns = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
	if (host_ns == -1)
		bail("failed to get current mount namespace");

	if (snprintf(ns_path, PATH_MAX, "/proc/%d/ns/mnt", child) < 0)
		bail("failed to get mount namespace path");

	container_ns = open(ns_path, O_RDONLY | O_CLOEXEC);
	if (container_ns == -1)
		bail("failed to get container mount namespace");

	if (setns(container_ns, CLONE_NEWNS) < 0)
		bail("failed to setns to container mntns");

	// Walk the NUL-separated list. An empty entry has length zero, so the
	// step below moves past its terminator only.
	end = mountsources + mountsources_len;
	for (cursor = mountsources; cursor < end; cursor += strlen(cursor) + 1) {
		if (*cursor == '\0')
			continue;

		source_fd = open(cursor, O_PATH | O_CLOEXEC);
		if (source_fd < 0)
			bail("failed to open mount source %s", cursor);

		send_fd(sockfd, source_fd);

		if (close(source_fd) != 0)
			bail("failed to close mount source fd %d", source_fd);
	}

	if (setns(host_ns, CLONE_NEWNS) < 0)
		bail("failed to setns to host mntns");

	if (close(host_ns) != 0)
		bail("failed to close host mount namespace fd %d", host_ns);

	if (close(container_ns) != 0)
		bail("failed to close container mount namespace fd %d", container_ns);
}
void nsexec(void)
{
int pipenum;
@@ -865,6 +1064,16 @@ void nsexec(void)
bail("failed to sync with runc: write(pid-JSON)");
}
break;
case SYNC_MOUNTSOURCES_PLS:
send_mountsources(syncfd, stage1_pid, config.mountsources,
config.mountsources_len);
s = SYNC_MOUNTSOURCES_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(stage1_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)");
}
break;
case SYNC_CHILD_FINISH:
write_log(DEBUG, "stage-1 complete");
stage1_complete = true;
@@ -1019,6 +1228,28 @@ void nsexec(void)
if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
bail("failed to unshare remaining namespaces (except cgroupns)");
/* Ask our parent to send the mount sources fds. */
if (config.mountsources) {
s = SYNC_MOUNTSOURCES_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(SYNC_MOUNTSOURCES_PLS)");
}
/* Receive and install all mount sources fds. */
receive_mountsources(syncfd);
/* Parent finished to send the mount sources fds. */
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: read(SYNC_MOUNTSOURCES_ACK)");
}
if (s != SYNC_MOUNTSOURCES_ACK) {
kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s);
}
}
/*
* TODO: What about non-namespace clone flags that we're dropping here?
*

View File

@@ -36,6 +36,7 @@ type mountConfig struct {
cgroup2Path string
rootlessCgroups bool
cgroupns bool
fd *int
}
// needsSetupDev returns true if /dev needs to be set up.
@@ -51,12 +52,16 @@ func needsSetupDev(config *configs.Config) bool {
// prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro. You must call
// finalizeRootfs after this function to finish setting up the rootfs.
func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds []int) (err error) {
config := iConfig.Config
if err := prepareRoot(config); err != nil {
return fmt.Errorf("error preparing rootfs: %w", err)
}
if mountFds != nil && len(mountFds) != len(config.Mounts) {
return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v. Slice: %v", len(config.Mounts), len(mountFds), mountFds)
}
mountConfig := &mountConfig{
root: config.Rootfs,
label: config.MountLabel,
@@ -65,12 +70,19 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
cgroupns: config.Namespaces.Contains(configs.NEWCGROUP),
}
setupDev := needsSetupDev(config)
for _, m := range config.Mounts {
for i, m := range config.Mounts {
for _, precmd := range m.PremountCmds {
if err := mountCmd(precmd); err != nil {
return fmt.Errorf("error running premount command: %w", err)
}
}
// Just before the loop we checked that if not empty, len(mountFds) == len(config.Mounts).
// Therefore, we can access mountFds[i] without any concerns.
if mountFds != nil && mountFds[i] != -1 {
mountConfig.fd = &mountFds[i]
}
if err := mountToRootfs(m, mountConfig); err != nil {
return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err)
}
@@ -210,8 +222,13 @@ func mountCmd(cmd configs.Command) error {
return nil
}
func prepareBindMount(m *configs.Mount, rootfs string) error {
stat, err := os.Stat(m.Source)
func prepareBindMount(m *configs.Mount, rootfs string, mountFd *int) error {
source := m.Source
if mountFd != nil {
source = "/proc/self/fd/" + strconv.Itoa(*mountFd)
}
stat, err := os.Stat(source)
if err != nil {
// error out if the source of a bind mount does not exist as we will be
// unable to bind anything to it.
@@ -225,7 +242,7 @@ func prepareBindMount(m *configs.Mount, rootfs string) error {
if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
return err
}
if err := checkProcMount(rootfs, dest, m.Source); err != nil {
if err := checkProcMount(rootfs, dest, source); err != nil {
return err
}
if err := createIfNotExists(dest, stat.IsDir()); err != nil {
@@ -255,9 +272,11 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
Data: "mode=755",
PropagationFlags: m.PropagationFlags,
}
if err := mountToRootfs(tmpfs, c); err != nil {
return err
}
for _, b := range binds {
if c.cgroupns {
subsystemPath := filepath.Join(c.root, b.Destination)
@@ -347,7 +366,7 @@ func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
// m.Destination since we are going to mount *on the host*.
oldDest := m.Destination
m.Destination = tmpDir
err = mountPropagate(m, "/", mountLabel)
err = mountPropagate(m, "/", mountLabel, nil)
m.Destination = oldDest
if err != nil {
return err
@@ -378,6 +397,7 @@ func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
func mountToRootfs(m *configs.Mount, c *mountConfig) error {
rootfs := c.root
mountLabel := c.label
mountFd := c.fd
dest, err := securejoin.SecureJoin(rootfs, m.Destination)
if err != nil {
return err
@@ -401,12 +421,12 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
return err
}
// Selinux kernels do not support labeling of /proc or /sys
return mountPropagate(m, rootfs, "")
return mountPropagate(m, rootfs, "", nil)
case "mqueue":
if err := os.MkdirAll(dest, 0o755); err != nil {
return err
}
if err := mountPropagate(m, rootfs, ""); err != nil {
if err := mountPropagate(m, rootfs, "", nil); err != nil {
return err
}
return label.SetFileLabel(dest, mountLabel)
@@ -421,11 +441,13 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
err = doTmpfsCopyUp(m, rootfs, mountLabel)
} else {
err = mountPropagate(m, rootfs, mountLabel)
err = mountPropagate(m, rootfs, mountLabel, nil)
}
if err != nil {
return err
}
if stat != nil {
if err = os.Chmod(dest, stat.Mode()); err != nil {
return err
@@ -433,23 +455,23 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
}
// Initially mounted rw in mountPropagate, remount to ro if flag set.
if m.Flags&unix.MS_RDONLY != 0 {
if err := remount(m, rootfs); err != nil {
if err := remount(m, rootfs, mountFd); err != nil {
return err
}
}
return nil
case "bind":
if err := prepareBindMount(m, rootfs); err != nil {
if err := prepareBindMount(m, rootfs, mountFd); err != nil {
return err
}
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
if err := mountPropagate(m, rootfs, mountLabel, mountFd); err != nil {
return err
}
// bind mount won't change mount options, we need remount to make mount options effective.
// first check that we have non-default options required before attempting a remount
if m.Flags&^(unix.MS_REC|unix.MS_REMOUNT|unix.MS_BIND) != 0 {
// only remount if unique mount options are set
if err := remount(m, rootfs); err != nil {
if err := remount(m, rootfs, mountFd); err != nil {
return err
}
}
@@ -475,7 +497,7 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
if err := os.MkdirAll(dest, 0o755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
return mountPropagate(m, rootfs, mountLabel, mountFd)
}
return nil
}
@@ -1037,15 +1059,20 @@ func writeSystemProperty(key, value string) error {
return ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0o644)
}
func remount(m *configs.Mount, rootfs string) error {
func remount(m *configs.Mount, rootfs string, mountFd *int) error {
source := m.Source
if mountFd != nil {
source = "/proc/self/fd/" + strconv.Itoa(*mountFd)
}
return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
return mount(m.Source, m.Destination, procfd, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
return mount(source, m.Destination, procfd, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
})
}
// Do the mount operation followed by additional mounts required to take care
// of propagation flags. This will always be scoped inside the container rootfs.
func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
func mountPropagate(m *configs.Mount, rootfs string, mountLabel string, mountFd *int) error {
var (
data = label.FormatMountLabel(m.Data, mountLabel)
flags = m.Flags
@@ -1062,8 +1089,13 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
// mutating underneath us, we verify that we are actually going to mount
// inside the container with WithProcfd() -- mounting through a procfd
// mounts on the target.
source := m.Source
if mountFd != nil {
source = "/proc/self/fd/" + strconv.Itoa(*mountFd)
}
if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
return mount(m.Source, m.Destination, procfd, m.Device, uintptr(flags), data)
return mount(source, m.Destination, procfd, m.Device, uintptr(flags), data)
}); err != nil {
return err
}

View File

@@ -26,6 +26,7 @@ type linuxStandardInit struct {
parentPid int
fifoFd int
logFd int
mountFds []int
config *initConfig
}
@@ -87,9 +88,23 @@ func (l *linuxStandardInit) Init() error {
// initialises the labeling system
selinux.GetEnabled()
if err := prepareRootfs(l.pipe, l.config); err != nil {
// We don't need the mountFds after prepareRootfs() nor if it fails.
err := prepareRootfs(l.pipe, l.config, l.mountFds)
for _, m := range l.mountFds {
if m == -1 {
continue
}
if err := unix.Close(m); err != nil {
return fmt.Errorf("Unable to close mountFds fds: %w", err)
}
}
if err != nil {
return err
}
// Set up the console. This has to be done *before* we finalize the rootfs,
// but *after* we've given the user the chance to set up all of the mounts
// they wanted.