mirror of
https://github.com/opencontainers/runc.git
synced 2025-10-05 23:46:57 +08:00
libct: move killing logic to container.Signal
By default, the container has its own PID namespace, and killing (with SIGKILL) its init process from the parent PID namespace also kills all the other processes. Obviously, it does not work that way when the container is sharing its PID namespace with the host or another container, since init is no longer special (it's not PID 1). In this case, killing container's init will result in a bunch of other processes left running (and thus the inability to remove the cgroup). The solution to the above problem is killing all the container processes, not just init. The problem with the current implementation is, the killing logic is implemented in libcontainer's initProcess.wait, and thus only available to libcontainer users, but not the runc kill command (which uses nonChildProcess.kill and does not use wait at all). So, some workarounds exist: - func destroy(c *Container) calls signalAllProcesses; - runc kill implements -a flag. This code became very tangled over time. Let's simplify things by moving the killing all processes from initProcess.wait to container.Signal, and documents the new behavior. In essence, this also makes `runc kill` to automatically kill all container processes when the container does not have its own PID namespace. Document that as well. Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
@@ -358,9 +358,9 @@ func (c *Container) start(process *Process) (retErr error) {
|
||||
}
|
||||
|
||||
// Signal sends a specified signal to container's init, or, if all is true,
|
||||
// true, to all container's processes (as determined by container's cgroup).
|
||||
// to all container's processes (as determined by container's cgroup).
|
||||
//
|
||||
// Setting all=true is useful when s is SIGKILL and the container does not have
|
||||
// Note all=true is implied when s is SIGKILL and the container does not have
|
||||
// its own PID namespace. In this scenario, the libcontainer user may be required
|
||||
// to implement a proper child reaper.
|
||||
func (c *Container) Signal(s os.Signal, all bool) error {
|
||||
@@ -382,22 +382,36 @@ func (c *Container) Signal(s os.Signal, all bool) error {
|
||||
}
|
||||
return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, sig))
|
||||
}
|
||||
// to avoid a PID reuse attack
|
||||
if status == Running || status == Created || status == Paused {
|
||||
if err := c.initProcess.signal(s); err != nil {
|
||||
return fmt.Errorf("unable to signal init: %w", err)
|
||||
}
|
||||
if status == Paused {
|
||||
// For cgroup v1, killing a process in a frozen cgroup
|
||||
// does nothing until it's thawed. Only thaw the cgroup
|
||||
// for SIGKILL.
|
||||
if s, ok := s.(unix.Signal); ok && s == unix.SIGKILL {
|
||||
_ = c.cgroupManager.Freeze(configs.Thawed)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
||||
// To avoid a PID reuse attack, don't kill non-running container.
|
||||
switch status {
|
||||
case Running, Created, Paused:
|
||||
default:
|
||||
return ErrNotRunning
|
||||
}
|
||||
return ErrNotRunning
|
||||
|
||||
// When a container has its own PID namespace, inside it the init PID
|
||||
// is 1, and thus it is handled specially by the kernel. In particular,
|
||||
// killing init with SIGKILL from an ancestor namespace will also kill
|
||||
// all other processes in that PID namespace (see pid_namespaces(7)).
|
||||
//
|
||||
// OTOH, if PID namespace is shared, we should kill all pids to avoid
|
||||
// leftover processes.
|
||||
if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) {
|
||||
err = signalAllProcesses(c.cgroupManager, unix.SIGKILL)
|
||||
} else {
|
||||
err = c.initProcess.signal(s)
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to signal init: %w", err)
|
||||
}
|
||||
if status == Paused && s == unix.SIGKILL {
|
||||
// For cgroup v1, killing a process in a frozen cgroup
|
||||
// does nothing until it's thawed. Only thaw the cgroup
|
||||
// for SIGKILL.
|
||||
_ = c.cgroupManager.Freeze(configs.Thawed)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Container) createExecFifo() error {
|
||||
@@ -593,7 +607,6 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
|
||||
container: c,
|
||||
process: p,
|
||||
bootstrapData: data,
|
||||
sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID),
|
||||
}
|
||||
c.initProcess = init
|
||||
return init, nil
|
||||
@@ -696,8 +709,7 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
|
||||
return cfg
|
||||
}
|
||||
|
||||
// Destroy destroys the container, if its in a valid state, after killing any
|
||||
// remaining running processes.
|
||||
// Destroy destroys the container, if its in a valid state.
|
||||
//
|
||||
// Any event registrations are removed before the container is destroyed.
|
||||
// No error is returned if the container is already destroyed.
|
||||
|
@@ -1399,8 +1399,8 @@ func testPidnsInitKill(t *testing.T, config *configs.Config) {
|
||||
err = container.Run(process2)
|
||||
ok(t, err)
|
||||
|
||||
// Kill the init process and Wait for it.
|
||||
err = process1.Signal(syscall.SIGKILL)
|
||||
// Kill the container.
|
||||
err = container.Signal(syscall.SIGKILL, false)
|
||||
ok(t, err)
|
||||
_, err = process1.Wait()
|
||||
if err == nil {
|
||||
|
@@ -304,7 +304,6 @@ type initProcess struct {
|
||||
fds []string
|
||||
process *Process
|
||||
bootstrapData io.Reader
|
||||
sharePidns bool
|
||||
}
|
||||
|
||||
func (p *initProcess) pid() int {
|
||||
@@ -616,10 +615,6 @@ func (p *initProcess) start() (retErr error) {
|
||||
|
||||
func (p *initProcess) wait() (*os.ProcessState, error) {
|
||||
err := p.cmd.Wait()
|
||||
// we should kill all processes in cgroup when init is died if we use host PID namespace
|
||||
if p.sharePidns {
|
||||
_ = signalAllProcesses(p.manager, unix.SIGKILL)
|
||||
}
|
||||
return p.cmd.ProcessState, err
|
||||
}
|
||||
|
||||
|
@@ -7,7 +7,6 @@ import (
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
@@ -36,12 +35,6 @@ type containerState interface {
|
||||
}
|
||||
|
||||
func destroy(c *Container) error {
|
||||
if !c.config.Namespaces.Contains(configs.NEWPID) ||
|
||||
c.config.Namespaces.PathOf(configs.NEWPID) != "" {
|
||||
if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
|
||||
logrus.Warn(err)
|
||||
}
|
||||
}
|
||||
err := c.cgroupManager.Destroy()
|
||||
if c.intelRdtManager != nil {
|
||||
if ierr := c.intelRdtManager.Destroy(); err == nil {
|
||||
|
@@ -18,8 +18,8 @@ to list available signals.
|
||||
# OPTIONS
|
||||
**--all**|**-a**
|
||||
: Send the signal to all processes inside the container, rather than
|
||||
the container's init only. This option is useful when the container does not
|
||||
have its own PID namespace.
|
||||
the container's init only. This option is implied when the _signal_ is **KILL**
|
||||
and the container does not have its own PID namespace.
|
||||
|
||||
: When this option is set, no error is returned if the container is stopped
|
||||
or does not exist.
|
||||
|
Reference in New Issue
Block a user