runc start/run: report OOM

In some cases, container init fails to start because it is killed by
the kernel OOM killer. The errors returned by runc in such cases are
semi-random and rather cryptic. Below are a few examples.

On cgroup v1 + systemd cgroup driver:

> process_linux.go:348: copying bootstrap data to pipe caused: write init-p: broken pipe

> process_linux.go:352: getting the final child's pid from pipe caused: EOF

On cgroup v2:

> process_linux.go:495: container init caused: read init-p: connection reset by peer

> process_linux.go:484: writing syncT 'resume' caused: write init-p: broken pipe

This commits adds the OOM method to cgroup managers, which tells whether
the container was OOM-killed. In case that has happened, the original error
is discarded (unless --debug is set), and the new OOM error is reported
instead:

> ERRO[0000] container_linux.go:367: starting container process caused: container init was OOM-killed (memory limit too low?)

Also, fix the rootless test cases that are failing because they expect
an error in the first line, and we have an additional warning now:

> unable to get oom kill count" error="no directory specified for memory.oom_control

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
Kir Kolyshkin
2021-02-19 02:53:27 -08:00
parent 7e137b9044
commit 5d0ffbf9c8
8 changed files with 53 additions and 2 deletions

View File

@@ -48,4 +48,7 @@ type Manager interface {
// Whether the cgroup path exists or not
Exists() bool
// OOMKillCount reports OOM kill count for the cgroup.
OOMKillCount() (uint64, error)
}

View File

@@ -9,6 +9,7 @@ import (
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
@@ -421,3 +422,11 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) {
func (m *manager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
return OOMKillCount(m.Path("memory"))
}

View File

@@ -257,3 +257,11 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) {
func (m *manager) Exists() bool {
return cgroups.PathExists(m.dirPath)
}
func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.events", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
return OOMKillCount(m.dirPath)
}

View File

@@ -450,3 +450,7 @@ func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
func (m *legacyManager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
func (m *legacyManager) OOMKillCount() (uint64, error) {
return fs.OOMKillCount(m.Path("memory"))
}

View File

@@ -495,3 +495,7 @@ func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
func (m *unifiedManager) Exists() bool {
return cgroups.PathExists(m.path)
}
func (m *unifiedManager) OOMKillCount() (uint64, error) {
return fs2.OOMKillCount(m.path)
}

View File

@@ -55,6 +55,10 @@ func (m *mockCgroupManager) Exists() bool {
return err == nil
}
func (m *mockCgroupManager) OOMKillCount() (uint64, error) {
return 0, nil
}
func (m *mockCgroupManager) GetPaths() map[string]string {
return m.paths
}

View File

@@ -321,6 +321,24 @@ func (p *initProcess) start() (retErr error) {
}
defer func() {
if retErr != nil {
// init might be killed by the kernel's OOM killer.
oom, err := p.manager.OOMKillCount()
if err != nil {
logrus.WithError(err).Warn("unable to get oom kill count")
} else if oom > 0 {
// Does not matter what the particular error was,
// its cause is most probably OOM, so report that.
const oomError = "container init was OOM-killed (memory limit too low?)"
if logrus.GetLevel() >= logrus.DebugLevel {
// Only show the original error if debug is set,
// as it is not generally very useful.
retErr = newSystemErrorWithCause(retErr, oomError)
} else {
retErr = newSystemError(errors.New(oomError))
}
}
// terminate the process to ensure we can remove cgroups
if err := ignoreTerminateErrors(p.terminate()); err != nil {
logrus.WithError(err).Warn("unable to terminate initProcess")

View File

@@ -79,7 +79,7 @@ function setup() {
runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions
[ "$status" -eq 1 ]
[[ ${lines[0]} == *"permission denied"* ]]
[[ "$output" == *"applying cgroup configuration"*"permission denied"* ]]
}
@test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" {
@@ -92,7 +92,8 @@ function setup() {
runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions
[ "$status" -eq 1 ]
[[ ${lines[0]} == *"rootless needs no limits + no cgrouppath when no permission is granted for cgroups"* ]] || [[ ${lines[0]} == *"cannot set pids limit: container could not join or create cgroup"* ]]
[[ "$output" == *"rootless needs no limits + no cgrouppath when no permission is granted for cgroups"* ]] ||
[[ "$output" == *"cannot set pids limit: container could not join or create cgroup"* ]]
}
@test "runc create (limits + cgrouppath + permission on the cgroup dir) succeeds" {