libct/cg: improve cgroup removal logic

The current code is only doing retries in RemovePaths, which is only
used for cgroup v1 (cgroup v2 uses RemovePath, which makes no retries).

Let's remove all retry logic and logging from RemovePaths, together
with:

 - os.Stat check from RemovePaths (its usage probably made sense before
   commit 19be8e5ba5 but not after);

 - error/warning logging from RemovePaths (this was added by commit
   19be8e5ba5 in 2020 and so far we've seen no errors other
   than EBUSY, so reporting the actual error proved to be useless).

Add the retry logic to rmdir, and the second retry bool argument.
Decrease the initial delay and increase the number of retries from the
old implementation so it can take up to ~1 sec before returning EBUSY
(was about 0.3 sec).

Hopefully, as a result, we'll have less "failed to remove cgroup paths"
errors.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
Kir Kolyshkin
2023-11-09 12:08:01 -08:00
parent 29283bb7db
commit d3d7f7d85a

View File

@@ -217,10 +217,26 @@ func PathExists(path string) bool {
return true
}
func rmdir(path string) error {
// rmdir tries to remove a directory, optionally retrying on EBUSY.
func rmdir(path string, retry bool) error {
delay := time.Millisecond
tries := 10
again:
err := unix.Rmdir(path)
if err == nil || err == unix.ENOENT {
switch err { // nolint:errorlint // unix errors are bare
case nil, unix.ENOENT:
return nil
case unix.EINTR:
goto again
case unix.EBUSY:
if retry && tries > 0 {
time.Sleep(delay)
delay *= 2
tries--
goto again
}
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}
}
@@ -228,68 +244,42 @@ func rmdir(path string) error {
// RemovePath aims to remove cgroup path. It does so recursively,
// by removing any subdirectories (sub-cgroups) first.
func RemovePath(path string) error {
// try the fast path first
if err := rmdir(path); err == nil {
// Try the fast path first.
if err := rmdir(path, false); err == nil {
return nil
}
infos, err := os.ReadDir(path)
if err != nil {
if os.IsNotExist(err) {
err = nil
}
if err != nil && !os.IsNotExist(err) {
return err
}
for _, info := range infos {
if info.IsDir() {
// We should remove subcgroups dir first
// We should remove subcgroup first.
if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
break
}
}
}
if err == nil {
err = rmdir(path)
err = rmdir(path, true)
}
return err
}
// RemovePaths iterates over the provided paths removing them.
// We trying to remove all paths five times with increasing delay between tries.
// If after all there are not removed cgroups - appropriate error will be
// returned.
func RemovePaths(paths map[string]string) (err error) {
const retries = 5
delay := 10 * time.Millisecond
for i := 0; i < retries; i++ {
if i != 0 {
time.Sleep(delay)
delay *= 2
}
for s, p := range paths {
if err := RemovePath(p); err != nil {
// do not log intermediate iterations
switch i {
case 0:
logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
case retries - 1:
logrus.WithError(err).Error("Failed to remove cgroup")
}
}
_, err := os.Stat(p)
// We need this strange way of checking cgroups existence because
// RemoveAll almost always returns error, even on already removed
// cgroups
if os.IsNotExist(err) {
delete(paths, s)
}
}
if len(paths) == 0 {
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
paths = make(map[string]string)
return nil
for s, p := range paths {
if err := RemovePath(p); err == nil {
delete(paths, s)
}
}
if len(paths) == 0 {
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
// TODO: switch to clear once Go < 1.21 is not supported.
paths = make(map[string]string)
return nil
}
return fmt.Errorf("Failed to remove paths: %v", paths)
}