mirror of
https://github.com/opencontainers/runc.git
synced 2025-10-21 14:39:36 +08:00

This addresses the following TODO in the code (added back in 2015 by
commit 845fc65e5):
> // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
Historically, libcontainer internally uses strings for user, group, and
additional (aka supplementary) groups.
Yet, runc receives those credentials as part of runtime-spec's process,
which uses integers for all of them (see [1], [2]).
What happens next is:
1. runc start/run/exec converts those credentials to strings (a User
string containing "UID:GID", and a []string for additional GIDs) and
passes those onto runc init.
2. runc init converts them back to int, in the most complicated way
possible (parsing container's /etc/passwd and /etc/group).
All this conversion and, especially, parsing is totally unnecessary,
but is performed on every container exec (and start).
The only benefit of all this is that a libcontainer user could use user and
group names instead of numeric IDs (but runc itself is not using this
feature, and we don't know if there are any other users of this).
Let's remove this back and forth translation, hopefully increasing
runc exec performance.
The only remaining need to parse /etc/passwd is to set HOME environment
variable for a specified UID, in case $HOME is not explicitly set in
process.Env. This can now be done right in prepareEnv, which simplifies
the code flow a lot. Alas, we cannot use the standard os/user.LookupId, as
it could cache host's /etc/passwd or the current user (even with the
osusergo tag).
PS Note that the structures being changed (initConfig and Process) are
never saved to disk as JSON by runc, so there is no compatibility issue
for runc users.
Still, this is a breaking change in libcontainer, but we never promised
that libcontainer API will be stable (and there's a special package
that can handle it -- github.com/moby/sys/user). Reflect this in
CHANGELOG.
For 3998.
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config.md#posix-platform-user
[2]: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/specs-go/config.go#L86
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
159 lines
5.0 KiB
Go
159 lines
5.0 KiB
Go
package libcontainer
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
|
|
"github.com/opencontainers/selinux/go-selinux"
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/unix"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/apparmor"
|
|
"github.com/opencontainers/runc/libcontainer/keys"
|
|
"github.com/opencontainers/runc/libcontainer/seccomp"
|
|
"github.com/opencontainers/runc/libcontainer/system"
|
|
"github.com/opencontainers/runc/libcontainer/utils"
|
|
)
|
|
|
|
// linuxSetnsInit performs the container's initialization for running a new process
|
|
// inside an existing container.
|
|
type linuxSetnsInit struct {
|
|
pipe *syncSocket
|
|
consoleSocket *os.File
|
|
pidfdSocket *os.File
|
|
config *initConfig
|
|
logPipe *os.File
|
|
}
|
|
|
|
func (l *linuxSetnsInit) getSessionRingName() string {
|
|
return "_ses." + l.config.ContainerID
|
|
}
|
|
|
|
func (l *linuxSetnsInit) Init() error {
|
|
if !l.config.Config.NoNewKeyring {
|
|
if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
|
|
return err
|
|
}
|
|
defer selinux.SetKeyLabel("") //nolint: errcheck
|
|
// Do not inherit the parent's session keyring.
|
|
if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
|
|
// Same justification as in standart_init_linux.go as to why we
|
|
// don't bail on ENOSYS.
|
|
//
|
|
// TODO(cyphar): And we should have logging here too.
|
|
if !errors.Is(err, unix.ENOSYS) {
|
|
return fmt.Errorf("unable to join session keyring: %w", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
if l.config.CreateConsole {
|
|
if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
|
|
return err
|
|
}
|
|
if err := system.Setctty(); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if l.pidfdSocket != nil {
|
|
if err := setupPidfd(l.pidfdSocket, "setns"); err != nil {
|
|
return fmt.Errorf("failed to setup pidfd: %w", err)
|
|
}
|
|
}
|
|
if l.config.NoNewPrivileges {
|
|
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if l.config.Config.Umask != nil {
|
|
unix.Umask(int(*l.config.Config.Umask))
|
|
}
|
|
|
|
if err := setupScheduler(l.config.Config); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := setupIOPriority(l.config.Config); err != nil {
|
|
return err
|
|
}
|
|
// Tell our parent that we're ready to exec. This must be done before the
|
|
// Seccomp rules have been applied, because we need to be able to read and
|
|
// write to a socket.
|
|
if err := syncParentReady(l.pipe); err != nil {
|
|
return fmt.Errorf("sync ready: %w", err)
|
|
}
|
|
|
|
if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
|
|
return err
|
|
}
|
|
defer selinux.SetExecLabel("") //nolint: errcheck
|
|
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
|
|
// do this before dropping capabilities; otherwise do it as late as possible
|
|
// just before execve so as few syscalls take place after it as possible.
|
|
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
|
|
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if err := finalizeNamespace(l.config); err != nil {
|
|
return err
|
|
}
|
|
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
|
|
return err
|
|
}
|
|
if l.config.Config.Personality != nil {
|
|
if err := setupPersonality(l.config.Config); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
// Check for the arg early to make sure it exists.
|
|
name, err := exec.LookPath(l.config.Args[0])
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Set seccomp as close to execve as possible, so as few syscalls take
|
|
// place afterward (reducing the amount of syscalls that users need to
|
|
// enable in their seccomp profiles).
|
|
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
|
|
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to init seccomp: %w", err)
|
|
}
|
|
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Close the pipe to signal that we have completed our init.
|
|
// Please keep this because we don't want to get a pipe write error if
|
|
// there is an error from `execve` after all fds closed.
|
|
_ = l.pipe.Close()
|
|
|
|
// Close the log pipe fd so the parent's ForwardLogs can exit.
|
|
logrus.Debugf("setns_init: about to exec")
|
|
if err := l.logPipe.Close(); err != nil {
|
|
return fmt.Errorf("close log pipe: %w", err)
|
|
}
|
|
|
|
// Close all file descriptors we are not passing to the container. This is
|
|
// necessary because the execve target could use internal runc fds as the
|
|
// execve path, potentially giving access to binary files from the host
|
|
// (which can then be opened by container processes, leading to container
|
|
// escapes). Note that because this operation will close any open file
|
|
// descriptors that are referenced by (*os.File) handles from underneath
|
|
// the Go runtime, we must not do any file operations after this point
|
|
// (otherwise the (*os.File) finaliser could close the wrong file). See
|
|
// CVE-2024-21626 for more information as to why this protection is
|
|
// necessary.
|
|
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
|
|
return err
|
|
}
|
|
return system.Exec(name, l.config.Args, l.config.Env)
|
|
}
|