*: introduce pidfd-socket flag

The container manager like containerd-shim can't use cgroup.kill feature or
freeze all the processes in cgroup to terminate the exec init process.
It's unsafe to call kill(2) since the pid can be recycled. It's good to
provide the pidfd of init process through the pidfd-socket. It's similar to
the console-socket. With the pidfd, the container manager like containerd-shim
can send the signal to target process safely.

And for the standard init process, we can have polling support to get
exit event instead of blocking on wait4.

Signed-off-by: Wei Fu <fuweid89@gmail.com>
This commit is contained in:
Wei Fu
2023-10-01 11:09:49 +08:00
parent 27eb67a649
commit 94505a046a
14 changed files with 371 additions and 5 deletions

1
.gitignore vendored
View File

@@ -6,6 +6,7 @@ vendor/pkg
/contrib/cmd/seccompagent/seccompagent
/contrib/cmd/fs-idmap/fs-idmap
/contrib/cmd/memfd-bind/memfd-bind
/contrib/cmd/pidfd-kill/pidfd-kill
man/man8
release
Vagrantfile

View File

@@ -71,10 +71,10 @@ runc-bin: runc-dmz
$(GO_BUILD) -o runc .
.PHONY: all
all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind
all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind pidfd-kill
.PHONY: recvtty sd-helper seccompagent fs-idmap memfd-bind
recvtty sd-helper seccompagent fs-idmap memfd-bind:
.PHONY: recvtty sd-helper seccompagent fs-idmap memfd-bind pidfd-kill
recvtty sd-helper seccompagent fs-idmap memfd-bind pidfd-kill:
$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@
.PHONY: static
@@ -194,6 +194,7 @@ clean:
rm -f contrib/cmd/sd-helper/sd-helper
rm -f contrib/cmd/seccompagent/seccompagent
rm -f contrib/cmd/memfd-bind/memfd-bind
rm -f contrib/cmd/pidfd-kill/pidfd-kill
sudo rm -rf release
rm -rf man/man8

View File

@@ -0,0 +1,114 @@
package main
import (
"errors"
"fmt"
"net"
"os"
"os/signal"
"github.com/urfave/cli"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/utils"
)
const (
usage = `Open Container Initiative contrib/cmd/pidfd-kill
pidfd-kill is an implementation of a consumer of runC's --pidfd-socket API.
After received SIGTERM, pidfd-kill sends the given signal to init process by
pidfd received from --pidfd-socket.
To use pidfd-kill, just specify a socket path at which you want to receive
pidfd:
$ pidfd-kill [--signal KILL] socket.sock
`
)
func main() {
app := cli.NewApp()
app.Name = "pidfd-kill"
app.Usage = usage
app.Flags = []cli.Flag{
cli.StringFlag{
Name: "signal",
Value: "SIGKILL",
Usage: "Signal to send to the init process",
},
cli.StringFlag{
Name: "pid-file",
Value: "",
Usage: "Path to write the pidfd-kill process ID to",
},
}
app.Action = func(ctx *cli.Context) error {
args := ctx.Args()
if len(args) != 1 {
return errors.New("required a single socket path")
}
socketFile := ctx.Args()[0]
pidFile := ctx.String("pid-file")
if pidFile != "" {
pid := fmt.Sprintf("%d\n", os.Getpid())
if err := os.WriteFile(pidFile, []byte(pid), 0o644); err != nil {
return err
}
defer os.Remove(pidFile)
}
sigStr := ctx.String("signal")
if sigStr == "" {
sigStr = "SIGKILL"
}
sig := unix.SignalNum(sigStr)
pidfdFile, err := recvPidfd(socketFile)
if err != nil {
return err
}
defer pidfdFile.Close()
signalCh := make(chan os.Signal, 16)
signal.Notify(signalCh, unix.SIGTERM)
<-signalCh
return unix.PidfdSendSignal(int(pidfdFile.Fd()), sig, nil, 0)
}
if err := app.Run(os.Args); err != nil {
fmt.Fprintln(os.Stderr, "fatal error:", err)
os.Exit(1)
}
}
func recvPidfd(socketFile string) (*os.File, error) {
ln, err := net.Listen("unix", socketFile)
if err != nil {
return nil, err
}
defer ln.Close()
conn, err := ln.Accept()
if err != nil {
return nil, err
}
defer conn.Close()
unixconn, ok := conn.(*net.UnixConn)
if !ok {
return nil, errors.New("failed to cast to unixconn")
}
socket, err := unixconn.File()
if err != nil {
return nil, err
}
defer socket.Close()
return utils.RecvFile(socket)
}

View File

@@ -34,6 +34,10 @@ command(s) that get executed on start, edit the args parameter of the spec. See
Value: "",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
},
cli.StringFlag{
Name: "pidfd-socket",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the init process",
},
cli.StringFlag{
Name: "pid-file",
Value: "",

View File

@@ -33,6 +33,10 @@ following will output a list of processes running in the container:
Name: "console-socket",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
},
cli.StringFlag{
Name: "pidfd-socket",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the exec process",
},
cli.StringFlag{
Name: "cwd",
Usage: "current working directory in the container",
@@ -181,6 +185,7 @@ func execProcess(context *cli.Context) (int, error) {
shouldDestroy: false,
container: container,
consoleSocket: context.String("console-socket"),
pidfdSocket: context.String("pidfd-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
action: CT_ACT_RUN,

View File

@@ -586,6 +586,13 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel)
}
if p.PidfdSocket != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, p.PidfdSocket)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_PIDFD_SOCK="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)
}
if safeExe != nil {
// Due to a Go stdlib bug, we need to add safeExe to the set of
// ExtraFiles otherwise it is possible for the stdlib to clobber the fd

View File

@@ -179,6 +179,16 @@ func startInitialization() (retErr error) {
defer consoleSocket.Close()
}
var pidfdSocket *os.File
if envSockFd := os.Getenv("_LIBCONTAINER_PIDFD_SOCK"); envSockFd != "" {
sockFd, err := strconv.Atoi(envSockFd)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_PIDFD_SOCK: %w", err)
}
pidfdSocket = os.NewFile(uintptr(sockFd), "pidfd-socket")
defer pidfdSocket.Close()
}
// Get mount files (O_PATH).
mountSrcFds, err := parseFdsFromEnv("_LIBCONTAINER_MOUNT_FDS")
if err != nil {
@@ -222,10 +232,10 @@ func startInitialization() (retErr error) {
}
// If init succeeds, it will not return, hence none of the defers will be called.
return containerInit(it, &config, syncPipe, consoleSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
}
func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error {
func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error {
if err := populateProcessEnvironment(config.Env); err != nil {
return err
}
@@ -240,6 +250,7 @@ func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSock
i := &linuxSetnsInit{
pipe: pipe,
consoleSocket: consoleSocket,
pidfdSocket: pidfdSocket,
config: config,
logFd: logFd,
dmzExe: dmzExe,
@@ -249,6 +260,7 @@ func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSock
i := &linuxStandardInit{
pipe: pipe,
consoleSocket: consoleSocket,
pidfdSocket: pidfdSocket,
parentPid: unix.Getppid(),
config: config,
fifoFd: fifoFd,
@@ -690,3 +702,20 @@ func signalAllProcesses(m cgroups.Manager, s unix.Signal) error {
return nil
}
// setupPidfd opens a process file descriptor of init process, and sends the
// file descriptor back to the socket.
func setupPidfd(socket *os.File, initType string) error {
defer socket.Close()
pidFd, err := unix.PidfdOpen(os.Getpid(), 0)
if err != nil {
return fmt.Errorf("failed to pidfd_open: %w", err)
}
if err := utils.SendRawFd(socket, initType, uintptr(pidFd)); err != nil {
unix.Close(pidFd)
return fmt.Errorf("failed to send pidfd on socket: %w", err)
}
return unix.Close(pidFd)
}

View File

@@ -77,6 +77,9 @@ type Process struct {
// ConsoleSocket provides the masterfd console.
ConsoleSocket *os.File
// PidfdSocket provides process file descriptor of it own.
PidfdSocket *os.File
// Init specifies whether the process is the first process in the container.
Init bool

View File

@@ -22,6 +22,7 @@ import (
type linuxSetnsInit struct {
pipe *syncSocket
consoleSocket *os.File
pidfdSocket *os.File
config *initConfig
logFd int
dmzExe *os.File
@@ -56,6 +57,11 @@ func (l *linuxSetnsInit) Init() error {
return err
}
}
if l.pidfdSocket != nil {
if err := setupPidfd(l.pidfdSocket, "setns"); err != nil {
return fmt.Errorf("failed to setup pidfd: %w", err)
}
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err

View File

@@ -22,6 +22,7 @@ import (
type linuxStandardInit struct {
pipe *syncSocket
consoleSocket *os.File
pidfdSocket *os.File
parentPid int
fifoFd int
logFd int
@@ -114,6 +115,12 @@ func (l *linuxStandardInit) Init() error {
}
}
if l.pidfdSocket != nil {
if err := setupPidfd(l.pidfdSocket, "standard"); err != nil {
return fmt.Errorf("failed to setup pidfd: %w", err)
}
}
// Finish the rootfs setup.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := finalizeRootfs(l.config.Config); err != nil {

4
run.go
View File

@@ -35,6 +35,10 @@ command(s) that get executed on start, edit the args parameter of the spec. See
Value: "",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
},
cli.StringFlag{
Name: "pidfd-socket",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the init process",
},
cli.BoolFlag{
Name: "detach, d",
Usage: "detach from the container's process",

View File

@@ -17,6 +17,7 @@ RECVTTY="${INTEGRATION_ROOT}/../../contrib/cmd/recvtty/recvtty"
SD_HELPER="${INTEGRATION_ROOT}/../../contrib/cmd/sd-helper/sd-helper"
SECCOMP_AGENT="${INTEGRATION_ROOT}/../../contrib/cmd/seccompagent/seccompagent"
FS_IDMAP="${INTEGRATION_ROOT}/../../contrib/cmd/fs-idmap/fs-idmap"
PIDFD_KILL="${INTEGRATION_ROOT}/../../contrib/cmd/pidfd-kill/pidfd-kill"
# Some variables may not always be set. Set those to empty value,
# if unset, to avoid "unbound variable" error.
@@ -697,3 +698,44 @@ function requires_idmap_fs() {
esac
# If we have another error, the integration test will fail and report it.
}
# setup_pidfd_kill runs pidfd-kill process in background and receives the
# SIGTERM as signal to send the given signal to init process.
function setup_pidfd_kill() {
local signal=$1
[ ! -v ROOT ] && return 1
local dir="${ROOT}/pidfd"
mkdir "${dir}"
export PIDFD_SOCKET="${dir}/sock"
("${PIDFD_KILL}" --pid-file "${dir}/pid" --signal "${signal}" "${PIDFD_SOCKET}" &) &
# ensure socket is ready
retry 10 1 stat "${PIDFD_SOCKET}"
}
# teardown_pidfd_kill cleanups all the resources related to pidfd-kill.
function teardown_pidfd_kill() {
[ ! -v ROOT ] && return 0
local dir="${ROOT}/pidfd"
if [ -f "${dir}/pid" ]; then
kill -9 "$(cat "${dir}/pid")"
fi
rm -rf "${dir}"
}
# pidfd_kill sends the signal to init process.
function pidfd_kill() {
[ ! -v ROOT ] && return 0
local dir="${ROOT}/pidfd"
if [ -f "${dir}/pid" ]; then
kill "$(cat "${dir}/pid")"
fi
}

View File

@@ -0,0 +1,99 @@
#!/usr/bin/env bats
load helpers
function setup() {
requires root
requires_kernel 5.3
setup_busybox
update_config '.process.args = ["/bin/sleep", "1d"]'
}
function teardown() {
teardown_pidfd_kill
teardown_bundle
}
@test "runc create [ --pidfd-socket ] " {
setup_pidfd_kill "SIGTERM"
runc create --console-socket "$CONSOLE_SOCKET" --pidfd-socket "${PIDFD_SOCKET}" test_pidfd
[ "$status" -eq 0 ]
testcontainer test_pidfd created
pidfd_kill
wait_for_container 10 1 test_pidfd stopped
}
@test "runc run [ --pidfd-socket ] " {
setup_pidfd_kill "SIGKILL"
runc run -d --console-socket "$CONSOLE_SOCKET" --pidfd-socket "${PIDFD_SOCKET}" test_pidfd
[ "$status" -eq 0 ]
testcontainer test_pidfd running
pidfd_kill
wait_for_container 10 1 test_pidfd stopped
}
@test "runc exec [ --pidfd-socket ] [cgroups_v1] " {
requires cgroups_v1
set_cgroups_path
runc run -d --console-socket "$CONSOLE_SOCKET" test_pidfd
[ "$status" -eq 0 ]
testcontainer test_pidfd running
# Use sub-cgroup to ensure that exec process has been killed
test_pidfd_cgroup_path=$(get_cgroup_path "pids")
mkdir "${test_pidfd_cgroup_path}/exec_pidfd"
[ "$status" -eq 0 ]
setup_pidfd_kill "SIGKILL"
__runc exec -d --cgroup "pids:exec_pidfd" --pid-file "exec_pid.txt" --pidfd-socket "${PIDFD_SOCKET}" test_pidfd sleep 1d
[ "$status" -eq 0 ]
exec_pid=$(cat exec_pid.txt)
exec_pid_in_cgroup=$(cat "${test_pidfd_cgroup_path}/exec_pidfd/cgroup.procs")
[ "${exec_pid}" -eq "${exec_pid_in_cgroup}" ]
pidfd_kill
# ensure exec process has been reaped
retry 10 1 rmdir "${test_pidfd_cgroup_path}/exec_pidfd"
testcontainer test_pidfd running
}
@test "runc exec [ --pidfd-socket ] [cgroups_v2] " {
requires cgroups_v2
set_cgroups_path
runc run -d --console-socket "$CONSOLE_SOCKET" test_pidfd
[ "$status" -eq 0 ]
testcontainer test_pidfd running
# Use sub-cgroup to ensure that exec process has been killed
test_pidfd_cgroup_path=$(get_cgroup_path "pids")
mkdir "${test_pidfd_cgroup_path}/exec_pidfd"
[ "$status" -eq 0 ]
setup_pidfd_kill "SIGKILL"
__runc exec -d --cgroup "exec_pidfd" --pid-file "exec_pid.txt" --pidfd-socket "${PIDFD_SOCKET}" test_pidfd sleep 1d
[ "$status" -eq 0 ]
exec_pid=$(cat exec_pid.txt)
exec_pid_in_cgroup=$(cat "${test_pidfd_cgroup_path}/exec_pidfd/cgroup.procs")
[ "${exec_pid}" -eq "${exec_pid_in_cgroup}" ]
pidfd_kill
# ensure exec process has been reaped
retry 10 1 rmdir "${test_pidfd_cgroup_path}/exec_pidfd"
testcontainer test_pidfd running
}

View File

@@ -18,6 +18,7 @@ import (
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/specconv"
"github.com/opencontainers/runc/libcontainer/system/kernelversion"
"github.com/opencontainers/runc/libcontainer/utils"
)
@@ -199,6 +200,7 @@ type runner struct {
preserveFDs int
pidFile string
consoleSocket string
pidfdSocket string
container *libcontainer.Container
action CtAct
notifySocket *notifySocket
@@ -255,6 +257,14 @@ func (r *runner) run(config *specs.Process) (int, error) {
}
defer tty.Close()
if r.pidfdSocket != "" {
connClose, err := setupPidfdSocket(process, r.pidfdSocket)
if err != nil {
return -1, err
}
defer connClose()
}
switch r.action {
case CT_ACT_CREATE:
err = r.container.Start(process)
@@ -390,6 +400,7 @@ func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.C
listenFDs: listenFDs,
notifySocket: notifySocket,
consoleSocket: context.String("console-socket"),
pidfdSocket: context.String("pidfd-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
preserveFDs: context.Int("preserve-fds"),
@@ -399,3 +410,36 @@ func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.C
}
return r.run(spec.Process)
}
func setupPidfdSocket(process *libcontainer.Process, sockpath string) (_clean func(), _ error) {
linux530 := kernelversion.KernelVersion{Kernel: 5, Major: 3}
ok, err := kernelversion.GreaterEqualThan(linux530)
if err != nil {
return nil, err
}
if !ok {
return nil, fmt.Errorf("--pidfd-socket requires >= v5.3 kernel")
}
conn, err := net.Dial("unix", sockpath)
if err != nil {
return nil, fmt.Errorf("failed to dail %s: %w", sockpath, err)
}
uc, ok := conn.(*net.UnixConn)
if !ok {
conn.Close()
return nil, errors.New("failed to cast to UnixConn")
}
socket, err := uc.File()
if err != nil {
conn.Close()
return nil, fmt.Errorf("failed to dup socket: %w", err)
}
process.PidfdSocket = socket
return func() {
conn.Close()
}, nil
}