mirror of
https://github.com/opencontainers/runc.git
synced 2025-10-01 13:52:27 +08:00
runc start/run: report OOM
In some cases, container init fails to start because it is killed by the kernel OOM killer. The errors returned by runc in such cases are semi-random and rather cryptic. Below are a few examples. On cgroup v1 + systemd cgroup driver: > process_linux.go:348: copying bootstrap data to pipe caused: write init-p: broken pipe > process_linux.go:352: getting the final child's pid from pipe caused: EOF On cgroup v2: > process_linux.go:495: container init caused: read init-p: connection reset by peer > process_linux.go:484: writing syncT 'resume' caused: write init-p: broken pipe This commit adds the OOM method to cgroup managers, which tells whether the container was OOM-killed. In case that has happened, the original error is discarded (unless --debug is set), and the new OOM error is reported instead: > ERRO[0000] container_linux.go:367: starting container process caused: container init was OOM-killed (memory limit too low?) Also, fix the rootless test cases that are failing because they expect an error in the first line, and we have an additional warning now: > unable to get oom kill count" error="no directory specified for memory.oom_control Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
@@ -48,4 +48,7 @@ type Manager interface {
|
||||
|
||||
// Whether the cgroup path exists or not
|
||||
Exists() bool
|
||||
|
||||
// OOMKillCount reports OOM kill count for the cgroup.
|
||||
OOMKillCount() (uint64, error)
|
||||
}
|
||||
|
@@ -9,6 +9,7 @@ import (
|
||||
"sync"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
|
||||
"github.com/pkg/errors"
|
||||
@@ -421,3 +422,11 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) {
|
||||
// Exists reports whether the path returned by m.Path("devices") exists,
// as determined by cgroups.PathExists.
func (m *manager) Exists() bool {
|
||||
return cgroups.PathExists(m.Path("devices"))
|
||||
}
|
||||
|
||||
// OOMKillCount passes path, the file name "memory.oom_control" and the key
// "oom_kill" to fscommon.GetValueByKey and returns its result unchanged,
// including any error.
//
// The callers visible in this change pass the manager's "memory" path.
func OOMKillCount(path string) (uint64, error) {
|
||||
return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
|
||||
}
|
||||
|
||||
// OOMKillCount reports the OOM kill count for the cgroup by calling the
// package-level OOMKillCount with m.Path("memory").
func (m *manager) OOMKillCount() (uint64, error) {
|
||||
return OOMKillCount(m.Path("memory"))
|
||||
}
|
||||
|
@@ -257,3 +257,11 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) {
|
||||
// Exists reports whether m.dirPath exists, as determined by
// cgroups.PathExists.
func (m *manager) Exists() bool {
|
||||
return cgroups.PathExists(m.dirPath)
|
||||
}
|
||||
|
||||
// OOMKillCount passes path, the file name "memory.events" and the key
// "oom_kill" to fscommon.GetValueByKey and returns its result unchanged,
// including any error.
func OOMKillCount(path string) (uint64, error) {
|
||||
return fscommon.GetValueByKey(path, "memory.events", "oom_kill")
|
||||
}
|
||||
|
||||
// OOMKillCount reports the OOM kill count for the cgroup by calling the
// package-level OOMKillCount with m.dirPath.
func (m *manager) OOMKillCount() (uint64, error) {
|
||||
return OOMKillCount(m.dirPath)
|
||||
}
|
||||
|
@@ -450,3 +450,7 @@ func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
|
||||
// Exists reports whether the path returned by m.Path("devices") exists,
// as determined by cgroups.PathExists.
func (m *legacyManager) Exists() bool {
|
||||
return cgroups.PathExists(m.Path("devices"))
|
||||
}
|
||||
|
||||
// OOMKillCount reports the OOM kill count for the cgroup by delegating to
// fs.OOMKillCount with m.Path("memory"); the result is returned unchanged.
func (m *legacyManager) OOMKillCount() (uint64, error) {
|
||||
return fs.OOMKillCount(m.Path("memory"))
|
||||
}
|
||||
|
@@ -495,3 +495,7 @@ func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
|
||||
// Exists reports whether m.path exists, as determined by
// cgroups.PathExists.
func (m *unifiedManager) Exists() bool {
|
||||
return cgroups.PathExists(m.path)
|
||||
}
|
||||
|
||||
// OOMKillCount reports the OOM kill count for the cgroup by delegating to
// fs2.OOMKillCount with m.path; the result is returned unchanged.
func (m *unifiedManager) OOMKillCount() (uint64, error) {
|
||||
return fs2.OOMKillCount(m.path)
|
||||
}
|
||||
|
@@ -55,6 +55,10 @@ func (m *mockCgroupManager) Exists() bool {
|
||||
return err == nil
|
||||
}
|
||||
|
||||
// OOMKillCount is a stub for the mock manager: it always returns a count
// of 0 and a nil error.
func (m *mockCgroupManager) OOMKillCount() (uint64, error) {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// GetPaths returns the mock's paths map as stored in m.paths. The map is
// returned directly, not copied.
func (m *mockCgroupManager) GetPaths() map[string]string {
|
||||
return m.paths
|
||||
}
|
||||
|
@@ -321,6 +321,24 @@ func (p *initProcess) start() (retErr error) {
|
||||
}
|
||||
defer func() {
|
||||
if retErr != nil {
|
||||
// init might be killed by the kernel's OOM killer.
|
||||
oom, err := p.manager.OOMKillCount()
|
||||
if err != nil {
|
||||
logrus.WithError(err).Warn("unable to get oom kill count")
|
||||
} else if oom > 0 {
|
||||
// Does not matter what the particular error was,
|
||||
// its cause is most probably OOM, so report that.
|
||||
const oomError = "container init was OOM-killed (memory limit too low?)"
|
||||
|
||||
if logrus.GetLevel() >= logrus.DebugLevel {
|
||||
// Only show the original error if debug is set,
|
||||
// as it is not generally very useful.
|
||||
retErr = newSystemErrorWithCause(retErr, oomError)
|
||||
} else {
|
||||
retErr = newSystemError(errors.New(oomError))
|
||||
}
|
||||
}
|
||||
|
||||
// terminate the process to ensure we can remove cgroups
|
||||
if err := ignoreTerminateErrors(p.terminate()); err != nil {
|
||||
logrus.WithError(err).Warn("unable to terminate initProcess")
|
||||
|
@@ -79,7 +79,7 @@ function setup() {
|
||||
|
||||
runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions
|
||||
[ "$status" -eq 1 ]
|
||||
[[ ${lines[0]} == *"permission denied"* ]]
|
||||
[[ "$output" == *"applying cgroup configuration"*"permission denied"* ]]
|
||||
}
|
||||
|
||||
@test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" {
|
||||
@@ -92,7 +92,8 @@ function setup() {
|
||||
|
||||
runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions
|
||||
[ "$status" -eq 1 ]
|
||||
[[ ${lines[0]} == *"rootless needs no limits + no cgrouppath when no permission is granted for cgroups"* ]] || [[ ${lines[0]} == *"cannot set pids limit: container could not join or create cgroup"* ]]
|
||||
[[ "$output" == *"rootless needs no limits + no cgrouppath when no permission is granted for cgroups"* ]] ||
|
||||
[[ "$output" == *"cannot set pids limit: container could not join or create cgroup"* ]]
|
||||
}
|
||||
|
||||
@test "runc create (limits + cgrouppath + permission on the cgroup dir) succeeds" {
|
||||
|
Reference in New Issue
Block a user