mirror of
https://github.com/opencontainers/runc.git
synced 2025-10-12 10:50:35 +08:00
criu checkpoint/restore: print errors from criu log
When criu fails, it does not give us much context to understand what was the cause of an error -- for that, we need to take a look into its log file. This is somewhat complicated to do (as you can see in parts of checkpoint.bats removed by this commit), and not very user-friendly. Add a function to find and log errors from criu logs, together with some preceding context, in case either checkpoint or restore has failed. Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
@@ -1,6 +1,8 @@
|
|||||||
package libcontainer
|
package libcontainer
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
@@ -277,6 +279,7 @@ func (c *Container) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts,
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c *Container) Checkpoint(criuOpts *CriuOpts) error {
|
func (c *Container) Checkpoint(criuOpts *CriuOpts) error {
|
||||||
|
const logFile = "dump.log"
|
||||||
c.m.Lock()
|
c.m.Lock()
|
||||||
defer c.m.Unlock()
|
defer c.m.Unlock()
|
||||||
|
|
||||||
@@ -301,6 +304,7 @@ func (c *Container) Checkpoint(criuOpts *CriuOpts) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
logDir := criuOpts.ImagesDirectory
|
||||||
imageDir, err := os.Open(criuOpts.ImagesDirectory)
|
imageDir, err := os.Open(criuOpts.ImagesDirectory)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -310,7 +314,7 @@ func (c *Container) Checkpoint(criuOpts *CriuOpts) error {
|
|||||||
rpcOpts := criurpc.CriuOpts{
|
rpcOpts := criurpc.CriuOpts{
|
||||||
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
|
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
|
||||||
LogLevel: proto.Int32(4),
|
LogLevel: proto.Int32(4),
|
||||||
LogFile: proto.String("dump.log"),
|
LogFile: proto.String(logFile),
|
||||||
Root: proto.String(c.config.Rootfs),
|
Root: proto.String(c.config.Rootfs),
|
||||||
ManageCgroups: proto.Bool(true),
|
ManageCgroups: proto.Bool(true),
|
||||||
NotifyScripts: proto.Bool(true),
|
NotifyScripts: proto.Bool(true),
|
||||||
@@ -337,6 +341,7 @@ func (c *Container) Checkpoint(criuOpts *CriuOpts) error {
|
|||||||
}
|
}
|
||||||
defer workDir.Close()
|
defer workDir.Close()
|
||||||
rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
|
rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
|
||||||
|
logDir = criuOpts.WorkDirectory
|
||||||
}
|
}
|
||||||
|
|
||||||
c.handleCriuConfigurationFile(&rpcOpts)
|
c.handleCriuConfigurationFile(&rpcOpts)
|
||||||
@@ -479,6 +484,7 @@ func (c *Container) Checkpoint(criuOpts *CriuOpts) error {
|
|||||||
|
|
||||||
err = c.criuSwrk(nil, req, criuOpts, nil)
|
err = c.criuSwrk(nil, req, criuOpts, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
logCriuErrors(logDir, logFile)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
@@ -627,6 +633,7 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
|
|||||||
// Restore restores the checkpointed container to a running state using the
|
// Restore restores the checkpointed container to a running state using the
|
||||||
// criu(8) utility.
|
// criu(8) utility.
|
||||||
func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
|
func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
|
||||||
|
const logFile = "restore.log"
|
||||||
c.m.Lock()
|
c.m.Lock()
|
||||||
defer c.m.Unlock()
|
defer c.m.Unlock()
|
||||||
|
|
||||||
@@ -644,6 +651,7 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
|
|||||||
if criuOpts.ImagesDirectory == "" {
|
if criuOpts.ImagesDirectory == "" {
|
||||||
return errors.New("invalid directory to restore checkpoint")
|
return errors.New("invalid directory to restore checkpoint")
|
||||||
}
|
}
|
||||||
|
logDir := criuOpts.ImagesDirectory
|
||||||
imageDir, err := os.Open(criuOpts.ImagesDirectory)
|
imageDir, err := os.Open(criuOpts.ImagesDirectory)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -675,7 +683,7 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
|
|||||||
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
|
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
|
||||||
EvasiveDevices: proto.Bool(true),
|
EvasiveDevices: proto.Bool(true),
|
||||||
LogLevel: proto.Int32(4),
|
LogLevel: proto.Int32(4),
|
||||||
LogFile: proto.String("restore.log"),
|
LogFile: proto.String(logFile),
|
||||||
RstSibling: proto.Bool(true),
|
RstSibling: proto.Bool(true),
|
||||||
Root: proto.String(root),
|
Root: proto.String(root),
|
||||||
ManageCgroups: proto.Bool(true),
|
ManageCgroups: proto.Bool(true),
|
||||||
@@ -718,6 +726,7 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
|
|||||||
}
|
}
|
||||||
defer workDir.Close()
|
defer workDir.Close()
|
||||||
req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
|
req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
|
||||||
|
logDir = criuOpts.WorkDirectory
|
||||||
}
|
}
|
||||||
c.handleCriuConfigurationFile(req.Opts)
|
c.handleCriuConfigurationFile(req.Opts)
|
||||||
|
|
||||||
@@ -791,6 +800,9 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
err = c.criuSwrk(process, req, criuOpts, extraFiles)
|
err = c.criuSwrk(process, req, criuOpts, extraFiles)
|
||||||
|
if err != nil {
|
||||||
|
logCriuErrors(logDir, logFile)
|
||||||
|
}
|
||||||
|
|
||||||
// Now that CRIU is done let's close all opened FDs CRIU needed.
|
// Now that CRIU is done let's close all opened FDs CRIU needed.
|
||||||
for _, fd := range extraFiles {
|
for _, fd := range extraFiles {
|
||||||
@@ -800,6 +812,56 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// logCriuErrors tries to find and log errors from a criu log file.
|
||||||
|
// The output is similar to what "grep -n -B5 Error" does.
|
||||||
|
func logCriuErrors(dir, file string) {
|
||||||
|
lookFor := []byte("Error") // Print the line that contains this...
|
||||||
|
const max = 5 + 1 // ... and a few preceding lines.
|
||||||
|
|
||||||
|
logFile := filepath.Join(dir, file)
|
||||||
|
f, err := os.Open(logFile)
|
||||||
|
if err != nil {
|
||||||
|
logrus.Warn(err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
var lines [max][]byte
|
||||||
|
var idx, lineNo, printedLineNo int
|
||||||
|
s := bufio.NewScanner(f)
|
||||||
|
for s.Scan() {
|
||||||
|
lineNo++
|
||||||
|
lines[idx] = s.Bytes()
|
||||||
|
idx = (idx + 1) % max
|
||||||
|
if !bytes.Contains(s.Bytes(), lookFor) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Found an error.
|
||||||
|
if printedLineNo == 0 {
|
||||||
|
logrus.Warnf("--- Quoting %q", logFile)
|
||||||
|
} else if lineNo-max > printedLineNo {
|
||||||
|
// Mark the gap.
|
||||||
|
logrus.Warn("...")
|
||||||
|
}
|
||||||
|
// Print the last lines.
|
||||||
|
for add := 0; add < max; add++ {
|
||||||
|
i := (idx + add) % max
|
||||||
|
s := lines[i]
|
||||||
|
actLineNo := lineNo + add - max + 1
|
||||||
|
if len(s) > 0 && actLineNo > printedLineNo {
|
||||||
|
logrus.Warnf("%d:%s", actLineNo, s)
|
||||||
|
printedLineNo = actLineNo
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if printedLineNo != 0 {
|
||||||
|
logrus.Warn("---") // End of "Quoting ...".
|
||||||
|
}
|
||||||
|
if err := s.Err(); err != nil {
|
||||||
|
logrus.Warnf("read %q: %v", logFile, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (c *Container) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
|
func (c *Container) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
|
||||||
// need to apply cgroups only on restore
|
// need to apply cgroups only on restore
|
||||||
if req.GetType() != criurpc.CriuReqType_RESTORE {
|
if req.GetType() != criurpc.CriuReqType_RESTORE {
|
||||||
|
Reference in New Issue
Block a user