Files
runc/libcontainer/setns_init_linux.go
lifubang f7dda6e6dc libct: setup personality before initializing seccomp
Set the process personality early to ensure it takes effect before
seccomp is initialized. If seccomp filters are applied first and they
block personality-related system calls (e.g., `personality(2)`),
subsequent attempts to set the personality will fail.

Signed-off-by: lifubang <lifubang@acmcoder.com>
2025-09-25 09:39:36 +00:00

165 lines
5.2 KiB
Go

package libcontainer
import (
"errors"
"fmt"
"os"
"os/exec"
"github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/internal/linux"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
type linuxSetnsInit struct {
pipe *syncSocket
consoleSocket *os.File
pidfdSocket *os.File
config *initConfig
logPipe *os.File
}
func (l *linuxSetnsInit) getSessionRingName() string {
return "_ses." + l.config.ContainerID
}
func (l *linuxSetnsInit) Init() error {
if !l.config.Config.NoNewKeyring {
if l.config.ProcessLabel != "" {
if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetKeyLabel("") //nolint: errcheck
}
// Do not inherit the parent's session keyring.
if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
logrus.Warnf("KeyctlJoinSessionKeyring: %v", err)
// Same justification as in standard_init_linux.go as to why we
// don't bail on ENOSYS.
if !errors.Is(err, unix.ENOSYS) {
return fmt.Errorf("unable to join session keyring: %w", err)
}
}
}
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return err
}
}
if l.pidfdSocket != nil {
if err := setupPidfd(l.pidfdSocket, "setns"); err != nil {
return fmt.Errorf("failed to setup pidfd: %w", err)
}
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
if l.config.Config.Umask != nil {
unix.Umask(int(*l.config.Config.Umask))
}
if err := setupScheduler(l.config); err != nil {
return err
}
if err := setupIOPriority(l.config); err != nil {
return err
}
// Set personality if specified.
if l.config.Config.Personality != nil {
if err := setupPersonality(l.config.Config); err != nil {
return err
}
}
// Tell our parent that we're ready to exec. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return fmt.Errorf("sync ready: %w", err)
}
if l.config.ProcessLabel != "" {
if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetExecLabel("") //nolint: errcheck
}
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return err
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
if err := finalizeNamespace(l.config); err != nil {
return err
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
// Check for the arg early to make sure it exists.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return fmt.Errorf("unable to init seccomp: %w", err)
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
// Close the pipe to signal that we have completed our init.
// Please keep this because we don't want to get a pipe write error if
// there is an error from `execve` after all fds closed.
_ = l.pipe.Close()
// Close the log pipe fd so the parent's ForwardLogs can exit.
logrus.Debugf("setns_init: about to exec")
if err := l.logPipe.Close(); err != nil {
return fmt.Errorf("close log pipe: %w", err)
}
// Close all file descriptors we are not passing to the container. This is
// necessary because the execve target could use internal runc fds as the
// execve path, potentially giving access to binary files from the host
// (which can then be opened by container processes, leading to container
// escapes). Note that because this operation will close any open file
// descriptors that are referenced by (*os.File) handles from underneath
// the Go runtime, we must not do any file operations after this point
// (otherwise the (*os.File) finaliser could close the wrong file). See
// CVE-2024-21626 for more information as to why this protection is
// necessary.
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
return err
}
return linux.Exec(name, l.config.Args, l.config.Env)
}