//go:build !runc_nocriu package libcontainer import ( "bufio" "bytes" "encoding/json" "errors" "fmt" "net" "os" "os/exec" "path/filepath" "reflect" "strings" "time" "github.com/checkpoint-restore/go-criu/v7" criurpc "github.com/checkpoint-restore/go-criu/v7/rpc" securejoin "github.com/cyphar/filepath-securejoin" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" "google.golang.org/protobuf/proto" "github.com/opencontainers/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/utils" ) var criuFeatures *criurpc.CriuFeatures var ErrCriuMissingFeatures = errors.New("criu is missing features") func (c *Container) checkCriuFeatures(criuOpts *CriuOpts, criuFeat *criurpc.CriuFeatures) error { t := criurpc.CriuReqType_FEATURE_CHECK // make sure the features we are looking for are really not from // some previous check criuFeatures = nil req := &criurpc.CriuReq{ Type: &t, Features: criuFeat, } err := c.criuSwrk(nil, req, criuOpts, nil) if err != nil { return fmt.Errorf("CRIU feature check failed: %w", err) } var missingFeatures []string // The outer if checks if the fields actually exist if (criuFeat.MemTrack != nil) && (criuFeatures.MemTrack != nil) { // The inner if checks if they are set to true if *criuFeat.MemTrack && !*criuFeatures.MemTrack { missingFeatures = append(missingFeatures, "MemTrack") logrus.Debugf("CRIU does not support MemTrack") } } // This needs to be repeated for every new feature check. // Is there a way to put this in a function. Reflection? if (criuFeat.LazyPages != nil) && (criuFeatures.LazyPages != nil) { if *criuFeat.LazyPages && !*criuFeatures.LazyPages { missingFeatures = append(missingFeatures, "LazyPages") logrus.Debugf("CRIU does not support LazyPages") } } if len(missingFeatures) != 0 { return fmt.Errorf("%w: %v", ErrCriuMissingFeatures, missingFeatures) } return nil } func compareCriuVersion(criuVersion int, minVersion int) error { // simple function to perform the actual version compare if criuVersion < minVersion { return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion) } return nil } // checkCriuVersion checks CRIU version greater than or equal to minVersion. func (c *Container) checkCriuVersion(minVersion int) error { // If the version of criu has already been determined there is no need // to ask criu for the version again. Use the value from c.criuVersion. if c.criuVersion != 0 { return compareCriuVersion(c.criuVersion, minVersion) } criu := criu.MakeCriu() var err error c.criuVersion, err = criu.GetCriuVersion() if err != nil { return fmt.Errorf("CRIU version check failed: %w", err) } return compareCriuVersion(c.criuVersion, minVersion) } const descriptorsFilename = "descriptors.json" func (c *Container) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) { mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs) if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil { mountDest = dest[len(c.config.Rootfs):] } extMnt := &criurpc.ExtMountMap{ Key: proto.String(mountDest), Val: proto.String(mountDest), } req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) } func (c *Container) addMaskPaths(req *criurpc.CriuReq) error { for _, path := range c.config.MaskPaths { fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path)) if err != nil { if os.IsNotExist(err) { continue } return err } if fi.IsDir() { continue } extMnt := &criurpc.ExtMountMap{ Key: proto.String(path), Val: proto.String("/dev/null"), } req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) } return nil } func (c *Container) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) { // CRIU will evaluate a configuration starting with release 3.11. // Settings in the configuration file will overwrite RPC settings. // Look for annotations. The annotation 'org.criu.config' // specifies if CRIU should use a different, container specific // configuration file. configFile, exists := utils.SearchLabels(c.config.Labels, "org.criu.config") if exists { // If the annotation 'org.criu.config' exists and is set // to a non-empty string, tell CRIU to use that as a // configuration file. If the file does not exist, CRIU // will just ignore it. if configFile != "" { rpcOpts.ConfigFile = proto.String(configFile) } // If 'org.criu.config' exists and is set to an empty // string, a runc specific CRIU configuration file will // be not set at all. } else { // If the mentioned annotation has not been found, specify // a default CRIU configuration file. rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf") } } func (c *Container) criuSupportsExtNS(t configs.NamespaceType) bool { var minVersion int switch t { case configs.NEWNET: // CRIU supports different external namespace with different released CRIU versions. // For network namespaces to work we need at least criu 3.11.0 => 31100. minVersion = 31100 case configs.NEWPID: // For PID namespaces criu 31500 is needed. minVersion = 31500 default: return false } return c.checkCriuVersion(minVersion) == nil } func criuNsToKey(t configs.NamespaceType) string { return "extRoot" + strings.Title(configs.NsName(t)) + "NS" //nolint:staticcheck // SA1019: strings.Title is deprecated } func (c *Container) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error { if !c.criuSupportsExtNS(t) { return fmt.Errorf("criu lacks support for external %s namespace during checkpointing process (old criu version?)", configs.NsName(t)) } nsPath := c.config.Namespaces.PathOf(t) if nsPath == "" { return nil } // CRIU expects the information about an external namespace // like this: --external []: // This is always 'extRootNS'. var ns unix.Stat_t if err := unix.Stat(nsPath, &ns); err != nil { return err } criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t)) rpcOpts.External = append(rpcOpts.External, criuExternal) return nil } func (c *Container) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error { for _, ns := range c.config.Namespaces { switch ns.Type { case configs.NEWNET, configs.NEWPID: // If the container is running in a network or PID namespace and has // a path to the network or PID namespace configured, we will dump // that network or PID namespace as an external namespace and we // will expect that the namespace exists during restore. // This basically means that CRIU will ignore the namespace // and expect it to be setup correctly. if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil { return err } default: // For all other namespaces except NET and PID CRIU has // a simpler way of joining the existing namespace if set nsPath := c.config.Namespaces.PathOf(ns.Type) if nsPath == "" { continue } if ns.Type == configs.NEWCGROUP { // CRIU has no code to handle NEWCGROUP return fmt.Errorf("Do not know how to handle namespace %v", ns.Type) } // CRIU has code to handle NEWTIME, but it does not seem to be defined in runc // CRIU will issue a warning for NEWUSER: // criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous' rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{ Ns: proto.String(configs.NsName(ns.Type)), NsFile: proto.String(nsPath), }) } } return nil } func (c *Container) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error { if !c.criuSupportsExtNS(t) { return fmt.Errorf("criu lacks support for external %s namespace during the restoration process (old criu version?)", configs.NsName(t)) } nsPath := c.config.Namespaces.PathOf(t) if nsPath == "" { return nil } // CRIU wants the information about an existing namespace // like this: --inherit-fd fd[]: // The needs to be the same as during checkpointing. // We are always using 'extRootNS' as the key in this. nsFd, err := os.Open(nsPath) if err != nil { logrus.Errorf("If a specific network namespace is defined it must exist: %s", err) return fmt.Errorf("Requested network namespace %v does not exist", nsPath) } inheritFd := &criurpc.InheritFd{ Key: proto.String(criuNsToKey(t)), // The offset of four is necessary because 0, 1, 2 and 3 are // already used by stdin, stdout, stderr, 'criu swrk' socket. Fd: proto.Int32(int32(4 + len(*extraFiles))), } rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd) // All open FDs need to be transferred to CRIU via extraFiles *extraFiles = append(*extraFiles, nsFd) return nil } func (c *Container) Checkpoint(criuOpts *CriuOpts) error { const logFile = "dump.log" c.m.Lock() defer c.m.Unlock() // Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). // (CLI prints a warning) // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has // support for doing unprivileged dumps, but the setup of // rootless containers might make this complicated. // We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0 if err := c.checkCriuVersion(30000); err != nil { return err } if criuOpts.ImagesDirectory == "" { return errors.New("invalid directory to save checkpoint") } cgMode, err := criuCgMode(criuOpts.ManageCgroupsMode) if err != nil { return err } // Since a container can be C/R'ed multiple times, // the checkpoint directory may already exist. if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) { return err } logDir := criuOpts.ImagesDirectory imageDir, err := os.Open(criuOpts.ImagesDirectory) if err != nil { return err } defer imageDir.Close() rpcOpts := criurpc.CriuOpts{ ImagesDirFd: proto.Int32(int32(imageDir.Fd())), LogLevel: proto.Int32(4), LogFile: proto.String(logFile), Root: proto.String(c.config.Rootfs), ManageCgroups: proto.Bool(true), // Obsoleted by ManageCgroupsMode. ManageCgroupsMode: &cgMode, NotifyScripts: proto.Bool(true), Pid: proto.Int32(int32(c.initProcess.pid())), ShellJob: proto.Bool(criuOpts.ShellJob), LeaveRunning: proto.Bool(criuOpts.LeaveRunning), TcpEstablished: proto.Bool(criuOpts.TcpEstablished), TcpSkipInFlight: proto.Bool(criuOpts.TcpSkipInFlight), LinkRemap: proto.Bool(criuOpts.LinkRemap), ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), FileLocks: proto.Bool(criuOpts.FileLocks), EmptyNs: proto.Uint32(criuOpts.EmptyNs), OrphanPtsMaster: proto.Bool(true), AutoDedup: proto.Bool(criuOpts.AutoDedup), LazyPages: proto.Bool(criuOpts.LazyPages), } // if criuOpts.WorkDirectory is not set, criu default is used. if criuOpts.WorkDirectory != "" { if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) { return err } workDir, err := os.Open(criuOpts.WorkDirectory) if err != nil { return err } defer workDir.Close() rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd())) logDir = criuOpts.WorkDirectory } c.handleCriuConfigurationFile(&rpcOpts) // If the container is running in a network namespace and has // a path to the network namespace configured, we will dump // that network namespace as an external namespace and we // will expect that the namespace exists during restore. // This basically means that CRIU will ignore the namespace // and expect to be setup correctly. if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil { return err } // Same for possible external PID namespaces if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil { return err } // CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup // is not set, CRIU uses ptrace() to pause the processes. // Note cgroup v2 freezer is only supported since CRIU release 3.14. if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil { if fcg := c.cgroupManager.Path("freezer"); fcg != "" { rpcOpts.FreezeCgroup = proto.String(fcg) } } // append optional criu opts, e.g., page-server and port if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 { rpcOpts.Ps = &criurpc.CriuPageServerInfo{ Address: proto.String(criuOpts.PageServer.Address), Port: proto.Int32(criuOpts.PageServer.Port), } } // pre-dump may need parentImage param to complete iterative migration if criuOpts.ParentImage != "" { rpcOpts.ParentImg = proto.String(criuOpts.ParentImage) rpcOpts.TrackMem = proto.Bool(true) } var t criurpc.CriuReqType if criuOpts.PreDump { feat := criurpc.CriuFeatures{ MemTrack: proto.Bool(true), } if err := c.checkCriuFeatures(criuOpts, &feat); err != nil { return err } t = criurpc.CriuReqType_PRE_DUMP } else { t = criurpc.CriuReqType_DUMP } if criuOpts.LazyPages { // lazy migration requested; check if criu supports it feat := criurpc.CriuFeatures{ LazyPages: proto.Bool(true), } if err := c.checkCriuFeatures(criuOpts, &feat); err != nil { return err } if fd := criuOpts.StatusFd; fd != -1 { // check that the FD is valid flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0) if err != nil { return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err) } // and writable if flags&unix.O_WRONLY == 0 { return fmt.Errorf("invalid --status-fd argument %d: not writable", fd) } if c.checkCriuVersion(31500) != nil { // For criu 3.15+, use notifications (see case "status-ready" // in criuNotifications). Otherwise, rely on criu status fd. rpcOpts.StatusFd = proto.Int32(int32(fd)) } } } req := &criurpc.CriuReq{ Type: &t, Opts: &rpcOpts, } // no need to dump all this in pre-dump if !criuOpts.PreDump { hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP) for _, m := range c.config.Mounts { switch m.Device { case "bind": c.addCriuDumpMount(req, m) case "cgroup": if cgroups.IsCgroup2UnifiedMode() || hasCgroupns { // real mount(s) continue } // a set of "external" bind mounts binds, err := getCgroupMounts(m) if err != nil { return err } for _, b := range binds { c.addCriuDumpMount(req, b) } } } if err := c.addMaskPaths(req); err != nil { return err } for _, node := range c.config.Devices { m := &configs.Mount{Destination: node.Path, Source: node.Path} c.addCriuDumpMount(req, m) } // Write the FD info to a file in the image directory fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors()) if err != nil { return err } err = os.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600) if err != nil { return err } } err = c.criuSwrk(nil, req, criuOpts, nil) if err != nil { logCriuErrors(logDir, logFile) return err } return nil } func (c *Container) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) { mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs) if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil { mountDest = dest[len(c.config.Rootfs):] } extMnt := &criurpc.ExtMountMap{ Key: proto.String(mountDest), Val: proto.String(m.Source), } req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) } func (c *Container) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) { for _, iface := range c.config.Networks { switch iface.Type { case "veth": veth := new(criurpc.CriuVethPair) veth.IfOut = proto.String(iface.HostInterfaceName) veth.IfIn = proto.String(iface.Name) req.Opts.Veths = append(req.Opts.Veths, veth) case "loopback": // Do nothing } } for _, i := range criuOpts.VethPairs { veth := new(criurpc.CriuVethPair) veth.IfOut = proto.String(i.HostInterfaceName) veth.IfIn = proto.String(i.ContainerInterfaceName) req.Opts.Veths = append(req.Opts.Veths, veth) } } func isOnTmpfs(path string, mounts []*configs.Mount) bool { for _, m := range mounts { if m.Device == "tmpfs" && strings.HasPrefix(path, m.Destination+"/") { return true } } return false } // prepareCriuRestoreMounts tries to set up the rootfs of the // container to be restored in the same way runc does it for // initial container creation. Even for a read-only rootfs container // runc modifies the rootfs to add mountpoints which do not exist. // This function also creates missing mountpoints as long as they // are not on top of a tmpfs, as CRIU will restore tmpfs content anyway. func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error { umounts := []string{} defer func() { for _, u := range umounts { _ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error { if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil { if e != unix.EINVAL { // Ignore EINVAL as it means 'target is not a mount point.' // It probably has already been unmounted. logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e) } } return nil }) } }() // Now go through all mounts and create the required mountpoints. for _, m := range mounts { // No cgroup mount point(s) need to be created: // * for v1, mount points are saved by CRIU because // /sys/fs/cgroup is a tmpfs mount; // * for v2, /sys/fs/cgroup is a real mount, but // the mountpoint appears as soon as /sys is mounted. if m.Device == "cgroup" { continue } // If the mountpoint is on a tmpfs, skip it as CRIU will // restore the complete tmpfs content from its checkpoint. if isOnTmpfs(m.Destination, mounts) { continue } if _, err := createMountpoint(c.config.Rootfs, mountEntry{Mount: m}); err != nil { return fmt.Errorf("create criu restore mountpoint for %s mount: %w", m.Destination, err) } // If the mount point is a bind mount, we need to mount // it now so that runc can create the necessary mount // points for mounts in bind mounts. // This also happens during initial container creation. // Without this CRIU restore will fail // See: https://github.com/opencontainers/runc/issues/2748 // It is also not necessary to order the mount points // because during initial container creation mounts are // set up in the order they are configured. if m.Device == "bind" { if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error { return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "") }); err != nil { return err } umounts = append(umounts, m.Destination) } } return nil } // Restore restores the checkpointed container to a running state using the // criu(8) utility. func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error { const logFile = "restore.log" c.m.Lock() defer c.m.Unlock() var extraFiles []*os.File // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). // (CLI prints a warning) // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have // support for unprivileged restore at the moment. // We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0 if err := c.checkCriuVersion(30000); err != nil { return err } if criuOpts.ImagesDirectory == "" { return errors.New("invalid directory to restore checkpoint") } cgMode, err := criuCgMode(criuOpts.ManageCgroupsMode) if err != nil { return err } logDir := criuOpts.ImagesDirectory imageDir, err := os.Open(criuOpts.ImagesDirectory) if err != nil { return err } defer imageDir.Close() // CRIU has a few requirements for a root directory: // * it must be a mount point // * its parent must not be overmounted // c.config.Rootfs is bind-mounted to a temporary directory // to satisfy these requirements. root := filepath.Join(c.stateDir, "criu-root") if err := os.Mkdir(root, 0o755); err != nil { return err } defer os.Remove(root) root, err = filepath.EvalSymlinks(root) if err != nil { return err } err = mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "") if err != nil { return err } defer unix.Unmount(root, unix.MNT_DETACH) //nolint: errcheck t := criurpc.CriuReqType_RESTORE req := &criurpc.CriuReq{ Type: &t, Opts: &criurpc.CriuOpts{ ImagesDirFd: proto.Int32(int32(imageDir.Fd())), EvasiveDevices: proto.Bool(true), LogLevel: proto.Int32(4), LogFile: proto.String(logFile), RstSibling: proto.Bool(true), Root: proto.String(root), ManageCgroups: proto.Bool(true), // Obsoleted by ManageCgroupsMode. ManageCgroupsMode: &cgMode, NotifyScripts: proto.Bool(true), ShellJob: proto.Bool(criuOpts.ShellJob), ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), TcpEstablished: proto.Bool(criuOpts.TcpEstablished), FileLocks: proto.Bool(criuOpts.FileLocks), EmptyNs: proto.Uint32(criuOpts.EmptyNs), OrphanPtsMaster: proto.Bool(true), AutoDedup: proto.Bool(criuOpts.AutoDedup), LazyPages: proto.Bool(criuOpts.LazyPages), }, } if criuOpts.LsmProfile != "" { // CRIU older than 3.16 has a bug which breaks the possibility // to set a different LSM profile. if err := c.checkCriuVersion(31600); err != nil { return errors.New("--lsm-profile requires at least CRIU 3.16") } req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile) } if criuOpts.LsmMountContext != "" { if err := c.checkCriuVersion(31600); err != nil { return errors.New("--lsm-mount-context requires at least CRIU 3.16") } req.Opts.LsmMountContext = proto.String(criuOpts.LsmMountContext) } if criuOpts.WorkDirectory != "" { // Since a container can be C/R'ed multiple times, // the work directory may already exist. if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) { return err } workDir, err := os.Open(criuOpts.WorkDirectory) if err != nil { return err } defer workDir.Close() req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd())) logDir = criuOpts.WorkDirectory } c.handleCriuConfigurationFile(req.Opts) if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil { return err } // This will modify the rootfs of the container in the same way runc // modifies the container during initial creation. if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil { return err } hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP) for _, m := range c.config.Mounts { switch m.Device { case "bind": c.addCriuRestoreMount(req, m) case "cgroup": if cgroups.IsCgroup2UnifiedMode() || hasCgroupns { continue } // cgroup v1 is a set of bind mounts, unless cgroupns is used binds, err := getCgroupMounts(m) if err != nil { return err } for _, b := range binds { c.addCriuRestoreMount(req, b) } } } if len(c.config.MaskPaths) > 0 { m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"} c.addCriuRestoreMount(req, m) } for _, node := range c.config.Devices { m := &configs.Mount{Destination: node.Path, Source: node.Path} c.addCriuRestoreMount(req, m) } if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 { c.restoreNetwork(req, criuOpts) } var ( fds []string fdJSON []byte ) if fdJSON, err = os.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { return err } if err := json.Unmarshal(fdJSON, &fds); err != nil { return err } for i := range fds { if s := fds[i]; strings.Contains(s, "pipe:") { inheritFd := new(criurpc.InheritFd) inheritFd.Key = proto.String(s) inheritFd.Fd = proto.Int32(int32(i)) req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) } } err = c.criuSwrk(process, req, criuOpts, extraFiles) if err != nil { logCriuErrors(logDir, logFile) } // Now that CRIU is done let's close all opened FDs CRIU needed. for _, fd := range extraFiles { fd.Close() } return err } // logCriuErrors tries to find and log errors from a criu log file. // The output is similar to what "grep -n -B5 Error" does. func logCriuErrors(dir, file string) { lookFor := []byte("Error") // Print the line that contains this... const max = 5 + 1 // ... and a few preceding lines. logFile := filepath.Join(dir, file) f, err := os.Open(logFile) if err != nil { logrus.Warn(err) return } defer f.Close() var lines [max][]byte var idx, lineNo, printedLineNo int s := bufio.NewScanner(f) for s.Scan() { lineNo++ lines[idx] = s.Bytes() idx = (idx + 1) % max if !bytes.Contains(s.Bytes(), lookFor) { continue } // Found an error. if printedLineNo == 0 { logrus.Warnf("--- Quoting %q", logFile) } else if lineNo-max > printedLineNo { // Mark the gap. logrus.Warn("...") } // Print the last lines. for add := range max { i := (idx + add) % max s := lines[i] actLineNo := lineNo + add - max + 1 if len(s) > 0 && actLineNo > printedLineNo { logrus.Warnf("%d:%s", actLineNo, s) printedLineNo = actLineNo } } } if printedLineNo != 0 { logrus.Warn("---") // End of "Quoting ...". } if err := s.Err(); err != nil { logrus.Warnf("read %q: %v", logFile, err) } } func (c *Container) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { // need to apply cgroups only on restore if req.GetType() != criurpc.CriuReqType_RESTORE { return nil } // XXX: Do we need to deal with this case? AFAIK criu still requires root. if err := c.cgroupManager.Apply(pid); err != nil { return err } if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil { return err } // TODO(@kolyshkin): should we use c.cgroupManager.GetPaths() // instead of reading /proc/pid/cgroup? path := fmt.Sprintf("/proc/%d/cgroup", pid) cgroupsPaths, err := cgroups.ParseCgroupFile(path) if err != nil { return err } for c, p := range cgroupsPaths { cgroupRoot := &criurpc.CgroupRoot{ Ctrl: proto.String(c), Path: proto.String(p), } req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot) } return nil } func (c *Container) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) if err != nil { return err } criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client") criuClientFileCon, err := net.FileConn(criuClient) criuClient.Close() if err != nil { return err } criuClientCon := criuClientFileCon.(*net.UnixConn) defer criuClientCon.Close() criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server") defer criuServer.Close() if c.criuVersion != 0 { // If the CRIU Version is still '0' then this is probably // the initial CRIU run to detect the version. Skip it. logrus.Debugf("Using CRIU %d", c.criuVersion) } cmd := exec.Command("criu", "swrk", "3") if process != nil { cmd.Stdin = process.Stdin cmd.Stdout = process.Stdout cmd.Stderr = process.Stderr } cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) if extraFiles != nil { cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...) } if err := cmd.Start(); err != nil { return err } // we close criuServer so that even if CRIU crashes or unexpectedly exits, runc will not hang. criuServer.Close() // cmd.Process will be replaced by a restored init. criuProcess := cmd.Process var criuProcessState *os.ProcessState defer func() { if criuProcessState == nil { criuClientCon.Close() _, err := criuProcess.Wait() if err != nil { logrus.Warnf("wait on criuProcess returned %v", err) } } }() if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil { return err } var extFds []string if process != nil { extFds, err = getPipeFds(criuProcess.Pid) if err != nil { return err } } logrus.Debugf("Using CRIU in %s mode", req.GetType().String()) // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts() // should be empty. For older CRIU versions it still will be // available but empty. criurpc.CriuReqType_VERSION actually // has no req.GetOpts(). if logrus.GetLevel() >= logrus.DebugLevel && (req.GetType() != criurpc.CriuReqType_FEATURE_CHECK && req.GetType() != criurpc.CriuReqType_VERSION) { val := reflect.ValueOf(req.GetOpts()) v := reflect.Indirect(val) for i := range v.NumField() { st := v.Type() name := st.Field(i).Name if 'A' <= name[0] && name[0] <= 'Z' { value := val.MethodByName("Get" + name).Call([]reflect.Value{}) logrus.Debugf("CRIU option %s with value %v", name, value[0]) } } } data, err := proto.Marshal(req) if err != nil { return err } _, err = criuClientCon.Write(data) if err != nil { return err } buf := make([]byte, 10*4096) oob := make([]byte, 4096) for { n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob) if req.Opts != nil && req.Opts.StatusFd != nil { // Close status_fd as soon as we got something back from criu, // assuming it has consumed (reopened) it by this time. // Otherwise it will might be left open forever and whoever // is waiting on it will wait forever. fd := int(*req.Opts.StatusFd) _ = unix.Close(fd) req.Opts.StatusFd = nil } if err != nil { return err } if n == 0 { return errors.New("unexpected EOF") } if n == len(buf) { return errors.New("buffer is too small") } resp := new(criurpc.CriuResp) err = proto.Unmarshal(buf[:n], resp) if err != nil { return err } t := resp.GetType() if !resp.GetSuccess() { return fmt.Errorf("criu failed: type %s errno %d", t, resp.GetCrErrno()) } switch t { case criurpc.CriuReqType_FEATURE_CHECK: logrus.Debugf("Feature check says: %s", resp) criuFeatures = resp.GetFeatures() case criurpc.CriuReqType_NOTIFY: if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil { return err } req = &criurpc.CriuReq{ Type: &t, NotifySuccess: proto.Bool(true), } data, err = proto.Marshal(req) if err != nil { return err } _, err = criuClientCon.Write(data) if err != nil { return err } continue case criurpc.CriuReqType_RESTORE: case criurpc.CriuReqType_DUMP: case criurpc.CriuReqType_PRE_DUMP: default: return fmt.Errorf("unable to parse the response %s", resp.String()) } break } _ = criuClientCon.CloseWrite() // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors. // Here we want to wait only the CRIU process. criuProcessState, err = criuProcess.Wait() if err != nil { return err } // In pre-dump mode CRIU is in a loop and waits for // the final DUMP command. // The current runc pre-dump approach, however, is // start criu in PRE_DUMP once for a single pre-dump // and not the whole series of pre-dump, pre-dump, ...m, dump // If we got the message CriuReqType_PRE_DUMP it means // CRIU was successful and we need to forcefully stop CRIU if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP { return fmt.Errorf("criu failed: %s", criuProcessState) } return nil } // lockNetwork blocks any external network activity. func lockNetwork(config *configs.Config) error { for _, config := range config.Networks { strategy, err := getStrategy(config.Type) if err != nil { return err } if err := strategy.detach(config); err != nil { return err } } return nil } func unlockNetwork(config *configs.Config) error { for _, config := range config.Networks { strategy, err := getStrategy(config.Type) if err != nil { return err } if err = strategy.attach(config); err != nil { return err } } return nil } func (c *Container) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error { notify := resp.GetNotify() if notify == nil { return fmt.Errorf("invalid response: %s", resp.String()) } script := notify.GetScript() logrus.Debugf("notify: %s\n", script) switch script { case "post-dump": f, err := os.Create(filepath.Join(c.stateDir, "checkpoint")) if err != nil { return err } f.Close() case "network-unlock": if err := unlockNetwork(c.config); err != nil { return err } case "network-lock": if err := lockNetwork(c.config); err != nil { return err } case "setup-namespaces": if c.config.HasHook(configs.Prestart, configs.CreateRuntime) { s, err := c.currentOCIState() if err != nil { return nil } s.Pid = int(notify.GetPid()) if err := c.config.Hooks.Run(configs.Prestart, s); err != nil { return err } if err := c.config.Hooks.Run(configs.CreateRuntime, s); err != nil { return err } } case "post-restore": pid := notify.GetPid() p, err := os.FindProcess(int(pid)) if err != nil { return err } cmd.Process = p r, err := newRestoredProcess(cmd, fds) if err != nil { return err } process.ops = r if err := c.state.transition(&restoredState{ imageDir: opts.ImagesDirectory, c: c, }); err != nil { return err } // create a timestamp indicating when the restored checkpoint was started c.created = time.Now().UTC() if !c.config.Namespaces.Contains(configs.NEWTIME) && configs.IsNamespaceSupported(configs.NEWTIME) && c.checkCriuVersion(31400) == nil { // CRIU restores processes into a time namespace. c.config.Namespaces = append(c.config.Namespaces, configs.Namespace{Type: configs.NEWTIME}) } if _, err := c.updateState(r); err != nil { return err } if err := os.Remove(filepath.Join(c.stateDir, "checkpoint")); err != nil { if !os.IsNotExist(err) { logrus.Error(err) } } case "orphan-pts-master": scm, err := unix.ParseSocketControlMessage(oob) if err != nil { return err } fds, err := unix.ParseUnixRights(&scm[0]) if err != nil { return err } master := os.NewFile(uintptr(fds[0]), "orphan-pts-master") defer master.Close() // While we can access console.master, using the API is a good idea. if err := utils.SendFile(process.ConsoleSocket, master); err != nil { return err } case "status-ready": if opts.StatusFd != -1 { // write \0 to status fd to notify that lazy page server is ready _, err := unix.Write(opts.StatusFd, []byte{0}) if err != nil { logrus.Warnf("can't write \\0 to status fd: %v", err) } _ = unix.Close(opts.StatusFd) opts.StatusFd = -1 } } return nil } func criuCgMode(mode string) (criurpc.CriuCgMode, error) { switch mode { case "": return criurpc.CriuCgMode_DEFAULT, nil case "soft": return criurpc.CriuCgMode_SOFT, nil case "full": return criurpc.CriuCgMode_FULL, nil case "strict": return criurpc.CriuCgMode_STRICT, nil case "ignore": return criurpc.CriuCgMode_IGNORE, nil default: return 0, errors.New("invalid manage-cgroups-mode value") } }