| package libcontainer |
| |
| import ( |
| "bufio" |
| "bytes" |
| "encoding/json" |
| "errors" |
| "fmt" |
| "net" |
| "os" |
| "os/exec" |
| "path/filepath" |
| "reflect" |
| "strings" |
| "time" |
| |
| "github.com/checkpoint-restore/go-criu/v6" |
| criurpc "github.com/checkpoint-restore/go-criu/v6/rpc" |
| securejoin "github.com/cyphar/filepath-securejoin" |
| "github.com/sirupsen/logrus" |
| "golang.org/x/sys/unix" |
| "google.golang.org/protobuf/proto" |
| |
| "github.com/opencontainers/runc/libcontainer/cgroups" |
| "github.com/opencontainers/runc/libcontainer/configs" |
| "github.com/opencontainers/runc/libcontainer/utils" |
| ) |
| |
// criuFeatures holds the feature set from the most recent CRIU
// FEATURE_CHECK response. It is reset by checkCriuFeatures before each
// check and filled in by criuSwrk when the response arrives.
var criuFeatures *criurpc.CriuFeatures

// ErrCriuMissingFeatures is the error wrapped by checkCriuFeatures when CRIU
// reports that a requested feature is not supported.
var ErrCriuMissingFeatures = errors.New("criu is missing features")
| |
| func (c *Container) checkCriuFeatures(criuOpts *CriuOpts, criuFeat *criurpc.CriuFeatures) error { |
| t := criurpc.CriuReqType_FEATURE_CHECK |
| |
| // make sure the features we are looking for are really not from |
| // some previous check |
| criuFeatures = nil |
| |
| req := &criurpc.CriuReq{ |
| Type: &t, |
| Features: criuFeat, |
| } |
| |
| err := c.criuSwrk(nil, req, criuOpts, nil) |
| if err != nil { |
| return fmt.Errorf("CRIU feature check failed: %w", err) |
| } |
| |
| var missingFeatures []string |
| |
| // The outer if checks if the fields actually exist |
| if (criuFeat.MemTrack != nil) && |
| (criuFeatures.MemTrack != nil) { |
| // The inner if checks if they are set to true |
| if *criuFeat.MemTrack && !*criuFeatures.MemTrack { |
| missingFeatures = append(missingFeatures, "MemTrack") |
| logrus.Debugf("CRIU does not support MemTrack") |
| } |
| } |
| |
| // This needs to be repeated for every new feature check. |
| // Is there a way to put this in a function. Reflection? |
| if (criuFeat.LazyPages != nil) && |
| (criuFeatures.LazyPages != nil) { |
| if *criuFeat.LazyPages && !*criuFeatures.LazyPages { |
| missingFeatures = append(missingFeatures, "LazyPages") |
| logrus.Debugf("CRIU does not support LazyPages") |
| } |
| } |
| |
| if len(missingFeatures) != 0 { |
| return fmt.Errorf("%w: %v", ErrCriuMissingFeatures, missingFeatures) |
| } |
| |
| return nil |
| } |
| |
// compareCriuVersion returns nil when criuVersion is at least minVersion and
// a descriptive error otherwise.
func compareCriuVersion(criuVersion int, minVersion int) error {
	if criuVersion >= minVersion {
		return nil
	}

	return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
}
| |
| // checkCriuVersion checks CRIU version greater than or equal to minVersion. |
| func (c *Container) checkCriuVersion(minVersion int) error { |
| // If the version of criu has already been determined there is no need |
| // to ask criu for the version again. Use the value from c.criuVersion. |
| if c.criuVersion != 0 { |
| return compareCriuVersion(c.criuVersion, minVersion) |
| } |
| |
| criu := criu.MakeCriu() |
| var err error |
| c.criuVersion, err = criu.GetCriuVersion() |
| if err != nil { |
| return fmt.Errorf("CRIU version check failed: %w", err) |
| } |
| |
| return compareCriuVersion(c.criuVersion, minVersion) |
| } |
| |
// descriptorsFilename is the name of the file inside the checkpoint images
// directory to which Checkpoint writes the JSON-encoded external descriptors
// of the init process and from which Restore reads them back.
const descriptorsFilename = "descriptors.json"
| |
| func (c *Container) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) { |
| mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs) |
| if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil { |
| mountDest = dest[len(c.config.Rootfs):] |
| } |
| extMnt := &criurpc.ExtMountMap{ |
| Key: proto.String(mountDest), |
| Val: proto.String(mountDest), |
| } |
| req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) |
| } |
| |
| func (c *Container) addMaskPaths(req *criurpc.CriuReq) error { |
| for _, path := range c.config.MaskPaths { |
| fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path)) |
| if err != nil { |
| if os.IsNotExist(err) { |
| continue |
| } |
| return err |
| } |
| if fi.IsDir() { |
| continue |
| } |
| |
| extMnt := &criurpc.ExtMountMap{ |
| Key: proto.String(path), |
| Val: proto.String("/dev/null"), |
| } |
| req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) |
| } |
| return nil |
| } |
| |
| func (c *Container) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) { |
| // CRIU will evaluate a configuration starting with release 3.11. |
| // Settings in the configuration file will overwrite RPC settings. |
| // Look for annotations. The annotation 'org.criu.config' |
| // specifies if CRIU should use a different, container specific |
| // configuration file. |
| configFile, exists := utils.SearchLabels(c.config.Labels, "org.criu.config") |
| if exists { |
| // If the annotation 'org.criu.config' exists and is set |
| // to a non-empty string, tell CRIU to use that as a |
| // configuration file. If the file does not exist, CRIU |
| // will just ignore it. |
| if configFile != "" { |
| rpcOpts.ConfigFile = proto.String(configFile) |
| } |
| // If 'org.criu.config' exists and is set to an empty |
| // string, a runc specific CRIU configuration file will |
| // be not set at all. |
| } else { |
| // If the mentioned annotation has not been found, specify |
| // a default CRIU configuration file. |
| rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf") |
| } |
| } |
| |
| func (c *Container) criuSupportsExtNS(t configs.NamespaceType) bool { |
| var minVersion int |
| switch t { |
| case configs.NEWNET: |
| // CRIU supports different external namespace with different released CRIU versions. |
| // For network namespaces to work we need at least criu 3.11.0 => 31100. |
| minVersion = 31100 |
| case configs.NEWPID: |
| // For PID namespaces criu 31500 is needed. |
| minVersion = 31500 |
| default: |
| return false |
| } |
| return c.checkCriuVersion(minVersion) == nil |
| } |
| |
| func criuNsToKey(t configs.NamespaceType) string { |
| return "extRoot" + strings.Title(configs.NsName(t)) + "NS" //nolint:staticcheck // SA1019: strings.Title is deprecated |
| } |
| |
| func (c *Container) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error { |
| if !c.criuSupportsExtNS(t) { |
| return nil |
| } |
| |
| nsPath := c.config.Namespaces.PathOf(t) |
| if nsPath == "" { |
| return nil |
| } |
| // CRIU expects the information about an external namespace |
| // like this: --external <TYPE>[<inode>]:<key> |
| // This <key> is always 'extRoot<TYPE>NS'. |
| var ns unix.Stat_t |
| if err := unix.Stat(nsPath, &ns); err != nil { |
| return err |
| } |
| criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t)) |
| rpcOpts.External = append(rpcOpts.External, criuExternal) |
| |
| return nil |
| } |
| |
| func (c *Container) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error { |
| for _, ns := range c.config.Namespaces { |
| switch ns.Type { |
| case configs.NEWNET, configs.NEWPID: |
| // If the container is running in a network or PID namespace and has |
| // a path to the network or PID namespace configured, we will dump |
| // that network or PID namespace as an external namespace and we |
| // will expect that the namespace exists during restore. |
| // This basically means that CRIU will ignore the namespace |
| // and expect it to be setup correctly. |
| if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil { |
| return err |
| } |
| default: |
| // For all other namespaces except NET and PID CRIU has |
| // a simpler way of joining the existing namespace if set |
| nsPath := c.config.Namespaces.PathOf(ns.Type) |
| if nsPath == "" { |
| continue |
| } |
| if ns.Type == configs.NEWCGROUP { |
| // CRIU has no code to handle NEWCGROUP |
| return fmt.Errorf("Do not know how to handle namespace %v", ns.Type) |
| } |
| // CRIU has code to handle NEWTIME, but it does not seem to be defined in runc |
| |
| // CRIU will issue a warning for NEWUSER: |
| // criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous' |
| rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{ |
| Ns: proto.String(configs.NsName(ns.Type)), |
| NsFile: proto.String(nsPath), |
| }) |
| } |
| } |
| |
| return nil |
| } |
| |
| func (c *Container) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error { |
| if !c.criuSupportsExtNS(t) { |
| return nil |
| } |
| |
| nsPath := c.config.Namespaces.PathOf(t) |
| if nsPath == "" { |
| return nil |
| } |
| // CRIU wants the information about an existing namespace |
| // like this: --inherit-fd fd[<fd>]:<key> |
| // The <key> needs to be the same as during checkpointing. |
| // We are always using 'extRoot<TYPE>NS' as the key in this. |
| nsFd, err := os.Open(nsPath) |
| if err != nil { |
| logrus.Errorf("If a specific network namespace is defined it must exist: %s", err) |
| return fmt.Errorf("Requested network namespace %v does not exist", nsPath) |
| } |
| inheritFd := &criurpc.InheritFd{ |
| Key: proto.String(criuNsToKey(t)), |
| // The offset of four is necessary because 0, 1, 2 and 3 are |
| // already used by stdin, stdout, stderr, 'criu swrk' socket. |
| Fd: proto.Int32(int32(4 + len(*extraFiles))), |
| } |
| rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd) |
| // All open FDs need to be transferred to CRIU via extraFiles |
| *extraFiles = append(*extraFiles, nsFd) |
| |
| return nil |
| } |
| |
| func (c *Container) Checkpoint(criuOpts *CriuOpts) error { |
| const logFile = "dump.log" |
| c.m.Lock() |
| defer c.m.Unlock() |
| |
| // Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). |
| // (CLI prints a warning) |
| // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has |
| // support for doing unprivileged dumps, but the setup of |
| // rootless containers might make this complicated. |
| |
| // We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0 |
| if err := c.checkCriuVersion(30000); err != nil { |
| return err |
| } |
| |
| if criuOpts.ImagesDirectory == "" { |
| return errors.New("invalid directory to save checkpoint") |
| } |
| |
| // Since a container can be C/R'ed multiple times, |
| // the checkpoint directory may already exist. |
| if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) { |
| return err |
| } |
| |
| logDir := criuOpts.ImagesDirectory |
| imageDir, err := os.Open(criuOpts.ImagesDirectory) |
| if err != nil { |
| return err |
| } |
| defer imageDir.Close() |
| |
| rpcOpts := criurpc.CriuOpts{ |
| ImagesDirFd: proto.Int32(int32(imageDir.Fd())), |
| LogLevel: proto.Int32(4), |
| LogFile: proto.String(logFile), |
| Root: proto.String(c.config.Rootfs), |
| ManageCgroups: proto.Bool(true), |
| NotifyScripts: proto.Bool(true), |
| Pid: proto.Int32(int32(c.initProcess.pid())), |
| ShellJob: proto.Bool(criuOpts.ShellJob), |
| LeaveRunning: proto.Bool(criuOpts.LeaveRunning), |
| TcpEstablished: proto.Bool(criuOpts.TcpEstablished), |
| ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), |
| FileLocks: proto.Bool(criuOpts.FileLocks), |
| EmptyNs: proto.Uint32(criuOpts.EmptyNs), |
| OrphanPtsMaster: proto.Bool(true), |
| AutoDedup: proto.Bool(criuOpts.AutoDedup), |
| LazyPages: proto.Bool(criuOpts.LazyPages), |
| } |
| |
| // if criuOpts.WorkDirectory is not set, criu default is used. |
| if criuOpts.WorkDirectory != "" { |
| if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) { |
| return err |
| } |
| workDir, err := os.Open(criuOpts.WorkDirectory) |
| if err != nil { |
| return err |
| } |
| defer workDir.Close() |
| rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd())) |
| logDir = criuOpts.WorkDirectory |
| } |
| |
| c.handleCriuConfigurationFile(&rpcOpts) |
| |
| // If the container is running in a network namespace and has |
| // a path to the network namespace configured, we will dump |
| // that network namespace as an external namespace and we |
| // will expect that the namespace exists during restore. |
| // This basically means that CRIU will ignore the namespace |
| // and expect to be setup correctly. |
| if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil { |
| return err |
| } |
| |
| // Same for possible external PID namespaces |
| if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil { |
| return err |
| } |
| |
| // CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup |
| // is not set, CRIU uses ptrace() to pause the processes. |
| // Note cgroup v2 freezer is only supported since CRIU release 3.14. |
| if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil { |
| if fcg := c.cgroupManager.Path("freezer"); fcg != "" { |
| rpcOpts.FreezeCgroup = proto.String(fcg) |
| } |
| } |
| |
| // append optional criu opts, e.g., page-server and port |
| if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 { |
| rpcOpts.Ps = &criurpc.CriuPageServerInfo{ |
| Address: proto.String(criuOpts.PageServer.Address), |
| Port: proto.Int32(criuOpts.PageServer.Port), |
| } |
| } |
| |
| // pre-dump may need parentImage param to complete iterative migration |
| if criuOpts.ParentImage != "" { |
| rpcOpts.ParentImg = proto.String(criuOpts.ParentImage) |
| rpcOpts.TrackMem = proto.Bool(true) |
| } |
| |
| // append optional manage cgroups mode |
| if criuOpts.ManageCgroupsMode != 0 { |
| mode := criuOpts.ManageCgroupsMode |
| rpcOpts.ManageCgroupsMode = &mode |
| } |
| |
| var t criurpc.CriuReqType |
| if criuOpts.PreDump { |
| feat := criurpc.CriuFeatures{ |
| MemTrack: proto.Bool(true), |
| } |
| |
| if err := c.checkCriuFeatures(criuOpts, &feat); err != nil { |
| return err |
| } |
| |
| t = criurpc.CriuReqType_PRE_DUMP |
| } else { |
| t = criurpc.CriuReqType_DUMP |
| } |
| |
| if criuOpts.LazyPages { |
| // lazy migration requested; check if criu supports it |
| feat := criurpc.CriuFeatures{ |
| LazyPages: proto.Bool(true), |
| } |
| if err := c.checkCriuFeatures(criuOpts, &feat); err != nil { |
| return err |
| } |
| |
| if fd := criuOpts.StatusFd; fd != -1 { |
| // check that the FD is valid |
| flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0) |
| if err != nil { |
| return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err) |
| } |
| // and writable |
| if flags&unix.O_WRONLY == 0 { |
| return fmt.Errorf("invalid --status-fd argument %d: not writable", fd) |
| } |
| |
| if c.checkCriuVersion(31500) != nil { |
| // For criu 3.15+, use notifications (see case "status-ready" |
| // in criuNotifications). Otherwise, rely on criu status fd. |
| rpcOpts.StatusFd = proto.Int32(int32(fd)) |
| } |
| } |
| } |
| |
| req := &criurpc.CriuReq{ |
| Type: &t, |
| Opts: &rpcOpts, |
| } |
| |
| // no need to dump all this in pre-dump |
| if !criuOpts.PreDump { |
| hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP) |
| for _, m := range c.config.Mounts { |
| switch m.Device { |
| case "bind": |
| c.addCriuDumpMount(req, m) |
| case "cgroup": |
| if cgroups.IsCgroup2UnifiedMode() || hasCgroupns { |
| // real mount(s) |
| continue |
| } |
| // a set of "external" bind mounts |
| binds, err := getCgroupMounts(m) |
| if err != nil { |
| return err |
| } |
| for _, b := range binds { |
| c.addCriuDumpMount(req, b) |
| } |
| } |
| } |
| |
| if err := c.addMaskPaths(req); err != nil { |
| return err |
| } |
| |
| for _, node := range c.config.Devices { |
| m := &configs.Mount{Destination: node.Path, Source: node.Path} |
| c.addCriuDumpMount(req, m) |
| } |
| |
| // Write the FD info to a file in the image directory |
| fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors()) |
| if err != nil { |
| return err |
| } |
| |
| err = os.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600) |
| if err != nil { |
| return err |
| } |
| } |
| |
| err = c.criuSwrk(nil, req, criuOpts, nil) |
| if err != nil { |
| logCriuErrors(logDir, logFile) |
| return err |
| } |
| return nil |
| } |
| |
| func (c *Container) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) { |
| mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs) |
| if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil { |
| mountDest = dest[len(c.config.Rootfs):] |
| } |
| extMnt := &criurpc.ExtMountMap{ |
| Key: proto.String(mountDest), |
| Val: proto.String(m.Source), |
| } |
| req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) |
| } |
| |
| func (c *Container) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) { |
| for _, iface := range c.config.Networks { |
| switch iface.Type { |
| case "veth": |
| veth := new(criurpc.CriuVethPair) |
| veth.IfOut = proto.String(iface.HostInterfaceName) |
| veth.IfIn = proto.String(iface.Name) |
| req.Opts.Veths = append(req.Opts.Veths, veth) |
| case "loopback": |
| // Do nothing |
| } |
| } |
| for _, i := range criuOpts.VethPairs { |
| veth := new(criurpc.CriuVethPair) |
| veth.IfOut = proto.String(i.HostInterfaceName) |
| veth.IfIn = proto.String(i.ContainerInterfaceName) |
| req.Opts.Veths = append(req.Opts.Veths, veth) |
| } |
| } |
| |
| func isOnTmpfs(path string, mounts []*configs.Mount) bool { |
| for _, m := range mounts { |
| if m.Device == "tmpfs" && strings.HasPrefix(path, m.Destination+"/") { |
| return true |
| } |
| } |
| return false |
| } |
| |
| // prepareCriuRestoreMounts tries to set up the rootfs of the |
| // container to be restored in the same way runc does it for |
| // initial container creation. Even for a read-only rootfs container |
| // runc modifies the rootfs to add mountpoints which do not exist. |
| // This function also creates missing mountpoints as long as they |
| // are not on top of a tmpfs, as CRIU will restore tmpfs content anyway. |
| func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error { |
| umounts := []string{} |
| defer func() { |
| for _, u := range umounts { |
| _ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error { |
| if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil { |
| if e != unix.EINVAL { |
| // Ignore EINVAL as it means 'target is not a mount point.' |
| // It probably has already been unmounted. |
| logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e) |
| } |
| } |
| return nil |
| }) |
| } |
| }() |
| // Now go through all mounts and create the required mountpoints. |
| for _, m := range mounts { |
| // No cgroup mount point(s) need to be created: |
| // * for v1, mount points are saved by CRIU because |
| // /sys/fs/cgroup is a tmpfs mount; |
| // * for v2, /sys/fs/cgroup is a real mount, but |
| // the mountpoint appears as soon as /sys is mounted. |
| if m.Device == "cgroup" { |
| continue |
| } |
| // If the mountpoint is on a tmpfs, skip it as CRIU will |
| // restore the complete tmpfs content from its checkpoint. |
| if isOnTmpfs(m.Destination, mounts) { |
| continue |
| } |
| me := mountEntry{Mount: m} |
| if err := me.createOpenMountpoint(c.config.Rootfs); err != nil { |
| return fmt.Errorf("create criu restore mountpoint for %s mount: %w", me.Destination, err) |
| } |
| if me.dstFile != nil { |
| defer me.dstFile.Close() |
| } |
| // If the mount point is a bind mount, we need to mount |
| // it now so that runc can create the necessary mount |
| // points for mounts in bind mounts. |
| // This also happens during initial container creation. |
| // Without this CRIU restore will fail |
| // See: https://github.com/opencontainers/runc/issues/2748 |
| // It is also not necessary to order the mount points |
| // because during initial container creation mounts are |
| // set up in the order they are configured. |
| if m.Device == "bind" { |
| if err := utils.WithProcfdFile(me.dstFile, func(dstFd string) error { |
| return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "") |
| }); err != nil { |
| return err |
| } |
| umounts = append(umounts, m.Destination) |
| } |
| if me.dstFile != nil { |
| // As this is being done in a loop, the defer earlier will be |
| // delayed until all mountpoints are handled -- for a config with |
| // many mountpoints this could result in a lot of open files. So we |
| // opportunistically close the file as well as deferring it. |
| _ = me.dstFile.Close() |
| } |
| } |
| return nil |
| } |
| |
| // Restore restores the checkpointed container to a running state using the |
| // criu(8) utility. |
| func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error { |
| const logFile = "restore.log" |
| c.m.Lock() |
| defer c.m.Unlock() |
| |
| var extraFiles []*os.File |
| |
| // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). |
| // (CLI prints a warning) |
| // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have |
| // support for unprivileged restore at the moment. |
| |
| // We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0 |
| if err := c.checkCriuVersion(30000); err != nil { |
| return err |
| } |
| if criuOpts.ImagesDirectory == "" { |
| return errors.New("invalid directory to restore checkpoint") |
| } |
| logDir := criuOpts.ImagesDirectory |
| imageDir, err := os.Open(criuOpts.ImagesDirectory) |
| if err != nil { |
| return err |
| } |
| defer imageDir.Close() |
| // CRIU has a few requirements for a root directory: |
| // * it must be a mount point |
| // * its parent must not be overmounted |
| // c.config.Rootfs is bind-mounted to a temporary directory |
| // to satisfy these requirements. |
| root := filepath.Join(c.stateDir, "criu-root") |
| if err := os.Mkdir(root, 0o755); err != nil { |
| return err |
| } |
| defer os.Remove(root) |
| root, err = filepath.EvalSymlinks(root) |
| if err != nil { |
| return err |
| } |
| err = mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "") |
| if err != nil { |
| return err |
| } |
| defer unix.Unmount(root, unix.MNT_DETACH) //nolint: errcheck |
| t := criurpc.CriuReqType_RESTORE |
| req := &criurpc.CriuReq{ |
| Type: &t, |
| Opts: &criurpc.CriuOpts{ |
| ImagesDirFd: proto.Int32(int32(imageDir.Fd())), |
| EvasiveDevices: proto.Bool(true), |
| LogLevel: proto.Int32(4), |
| LogFile: proto.String(logFile), |
| RstSibling: proto.Bool(true), |
| Root: proto.String(root), |
| ManageCgroups: proto.Bool(true), |
| NotifyScripts: proto.Bool(true), |
| ShellJob: proto.Bool(criuOpts.ShellJob), |
| ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), |
| TcpEstablished: proto.Bool(criuOpts.TcpEstablished), |
| FileLocks: proto.Bool(criuOpts.FileLocks), |
| EmptyNs: proto.Uint32(criuOpts.EmptyNs), |
| OrphanPtsMaster: proto.Bool(true), |
| AutoDedup: proto.Bool(criuOpts.AutoDedup), |
| LazyPages: proto.Bool(criuOpts.LazyPages), |
| }, |
| } |
| |
| if criuOpts.LsmProfile != "" { |
| // CRIU older than 3.16 has a bug which breaks the possibility |
| // to set a different LSM profile. |
| if err := c.checkCriuVersion(31600); err != nil { |
| return errors.New("--lsm-profile requires at least CRIU 3.16") |
| } |
| req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile) |
| } |
| if criuOpts.LsmMountContext != "" { |
| if err := c.checkCriuVersion(31600); err != nil { |
| return errors.New("--lsm-mount-context requires at least CRIU 3.16") |
| } |
| req.Opts.LsmMountContext = proto.String(criuOpts.LsmMountContext) |
| } |
| |
| if criuOpts.WorkDirectory != "" { |
| // Since a container can be C/R'ed multiple times, |
| // the work directory may already exist. |
| if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) { |
| return err |
| } |
| workDir, err := os.Open(criuOpts.WorkDirectory) |
| if err != nil { |
| return err |
| } |
| defer workDir.Close() |
| req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd())) |
| logDir = criuOpts.WorkDirectory |
| } |
| c.handleCriuConfigurationFile(req.Opts) |
| |
| if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil { |
| return err |
| } |
| |
| // This will modify the rootfs of the container in the same way runc |
| // modifies the container during initial creation. |
| if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil { |
| return err |
| } |
| |
| hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP) |
| for _, m := range c.config.Mounts { |
| switch m.Device { |
| case "bind": |
| c.addCriuRestoreMount(req, m) |
| case "cgroup": |
| if cgroups.IsCgroup2UnifiedMode() || hasCgroupns { |
| continue |
| } |
| // cgroup v1 is a set of bind mounts, unless cgroupns is used |
| binds, err := getCgroupMounts(m) |
| if err != nil { |
| return err |
| } |
| for _, b := range binds { |
| c.addCriuRestoreMount(req, b) |
| } |
| } |
| } |
| |
| if len(c.config.MaskPaths) > 0 { |
| m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"} |
| c.addCriuRestoreMount(req, m) |
| } |
| |
| for _, node := range c.config.Devices { |
| m := &configs.Mount{Destination: node.Path, Source: node.Path} |
| c.addCriuRestoreMount(req, m) |
| } |
| |
| if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 { |
| c.restoreNetwork(req, criuOpts) |
| } |
| |
| // append optional manage cgroups mode |
| if criuOpts.ManageCgroupsMode != 0 { |
| mode := criuOpts.ManageCgroupsMode |
| req.Opts.ManageCgroupsMode = &mode |
| } |
| |
| var ( |
| fds []string |
| fdJSON []byte |
| ) |
| if fdJSON, err = os.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { |
| return err |
| } |
| |
| if err := json.Unmarshal(fdJSON, &fds); err != nil { |
| return err |
| } |
| for i := range fds { |
| if s := fds[i]; strings.Contains(s, "pipe:") { |
| inheritFd := new(criurpc.InheritFd) |
| inheritFd.Key = proto.String(s) |
| inheritFd.Fd = proto.Int32(int32(i)) |
| req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) |
| } |
| } |
| err = c.criuSwrk(process, req, criuOpts, extraFiles) |
| if err != nil { |
| logCriuErrors(logDir, logFile) |
| } |
| |
| // Now that CRIU is done let's close all opened FDs CRIU needed. |
| for _, fd := range extraFiles { |
| fd.Close() |
| } |
| |
| return err |
| } |
| |
| // logCriuErrors tries to find and log errors from a criu log file. |
| // The output is similar to what "grep -n -B5 Error" does. |
| func logCriuErrors(dir, file string) { |
| lookFor := []byte("Error") // Print the line that contains this... |
| const max = 5 + 1 // ... and a few preceding lines. |
| |
| logFile := filepath.Join(dir, file) |
| f, err := os.Open(logFile) |
| if err != nil { |
| logrus.Warn(err) |
| return |
| } |
| defer f.Close() |
| |
| var lines [max][]byte |
| var idx, lineNo, printedLineNo int |
| s := bufio.NewScanner(f) |
| for s.Scan() { |
| lineNo++ |
| lines[idx] = s.Bytes() |
| idx = (idx + 1) % max |
| if !bytes.Contains(s.Bytes(), lookFor) { |
| continue |
| } |
| // Found an error. |
| if printedLineNo == 0 { |
| logrus.Warnf("--- Quoting %q", logFile) |
| } else if lineNo-max > printedLineNo { |
| // Mark the gap. |
| logrus.Warn("...") |
| } |
| // Print the last lines. |
| for add := 0; add < max; add++ { |
| i := (idx + add) % max |
| s := lines[i] |
| actLineNo := lineNo + add - max + 1 |
| if len(s) > 0 && actLineNo > printedLineNo { |
| logrus.Warnf("%d:%s", actLineNo, s) |
| printedLineNo = actLineNo |
| } |
| } |
| } |
| if printedLineNo != 0 { |
| logrus.Warn("---") // End of "Quoting ...". |
| } |
| if err := s.Err(); err != nil { |
| logrus.Warnf("read %q: %v", logFile, err) |
| } |
| } |
| |
| func (c *Container) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { |
| // need to apply cgroups only on restore |
| if req.GetType() != criurpc.CriuReqType_RESTORE { |
| return nil |
| } |
| |
| // XXX: Do we need to deal with this case? AFAIK criu still requires root. |
| if err := c.cgroupManager.Apply(pid); err != nil { |
| return err |
| } |
| |
| if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil { |
| return err |
| } |
| |
| // TODO(@kolyshkin): should we use c.cgroupManager.GetPaths() |
| // instead of reading /proc/pid/cgroup? |
| path := fmt.Sprintf("/proc/%d/cgroup", pid) |
| cgroupsPaths, err := cgroups.ParseCgroupFile(path) |
| if err != nil { |
| return err |
| } |
| |
| for c, p := range cgroupsPaths { |
| cgroupRoot := &criurpc.CgroupRoot{ |
| Ctrl: proto.String(c), |
| Path: proto.String(p), |
| } |
| req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot) |
| } |
| |
| return nil |
| } |
| |
// criuSwrk runs "criu swrk 3", sends req to it over one end of a
// SOCK_SEQPACKET socket pair and handles the responses until a final one
// arrives.
//
// If process is non-nil, its Stdin, Stdout and Stderr are given to the CRIU
// command. extraFiles are passed to CRIU after the server end of the socket
// pair. NOTIFY responses are handed to c.criuNotifications and then
// acknowledged; a FEATURE_CHECK response is stored in the package-level
// criuFeatures. After the final response the CRIU process is waited for; a
// non-successful exit is reported as an error unless the last request sent
// was of type PRE_DUMP.
func (c *Container) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error {
	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
	if err != nil {
		return err
	}

	criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
	criuClientFileCon, err := net.FileConn(criuClient)
	// criuClient is closed regardless of whether FileConn succeeded; from
	// here on only criuClientFileCon is used.
	criuClient.Close()
	if err != nil {
		return err
	}

	criuClientCon := criuClientFileCon.(*net.UnixConn)
	defer criuClientCon.Close()

	criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
	defer criuServer.Close()

	if c.criuVersion != 0 {
		// If the CRIU Version is still '0' then this is probably
		// the initial CRIU run to detect the version. Skip it.
		logrus.Debugf("Using CRIU %d", c.criuVersion)
	}
	cmd := exec.Command("criu", "swrk", "3")
	if process != nil {
		cmd.Stdin = process.Stdin
		cmd.Stdout = process.Stdout
		cmd.Stderr = process.Stderr
	}
	// criuServer is the first extra file, matching the "3" given on the
	// command line above.
	cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
	if extraFiles != nil {
		cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
	}

	if err := cmd.Start(); err != nil {
		return err
	}
	// we close criuServer so that even if CRIU crashes or unexpectedly exits, runc will not hang.
	criuServer.Close()
	// cmd.Process will be replaced by a restored init.
	criuProcess := cmd.Process

	var criuProcessState *os.ProcessState
	defer func() {
		// criuProcessState is still nil if we return before the
		// Wait at the end of this function: close our end of the
		// socket and reap the CRIU process here instead.
		if criuProcessState == nil {
			criuClientCon.Close()
			_, err := criuProcess.Wait()
			if err != nil {
				logrus.Warnf("wait on criuProcess returned %v", err)
			}
		}
	}()

	if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil {
		return err
	}

	var extFds []string
	if process != nil {
		extFds, err = getPipeFds(criuProcess.Pid)
		if err != nil {
			return err
		}
	}

	logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
	// In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
	// should be empty. For older CRIU versions it still will be
	// available but empty. criurpc.CriuReqType_VERSION actually
	// has no req.GetOpts().
	if logrus.GetLevel() >= logrus.DebugLevel &&
		!(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
			req.GetType() == criurpc.CriuReqType_VERSION) {

		// Log the value of every exported field of the options
		// through its Get<Field> method.
		val := reflect.ValueOf(req.GetOpts())
		v := reflect.Indirect(val)
		for i := 0; i < v.NumField(); i++ {
			st := v.Type()
			name := st.Field(i).Name
			if 'A' <= name[0] && name[0] <= 'Z' {
				value := val.MethodByName("Get" + name).Call([]reflect.Value{})
				logrus.Debugf("CRIU option %s with value %v", name, value[0])
			}
		}
	}
	data, err := proto.Marshal(req)
	if err != nil {
		return err
	}
	_, err = criuClientCon.Write(data)
	if err != nil {
		return err
	}

	buf := make([]byte, 10*4096)
	oob := make([]byte, 4096)
	// Read responses until one that is not a notification arrives.
	for {
		n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
		if req.Opts != nil && req.Opts.StatusFd != nil {
			// Close status_fd as soon as we got something back from criu,
			// assuming it has consumed (reopened) it by this time.
			// Otherwise it will might be left open forever and whoever
			// is waiting on it will wait forever.
			fd := int(*req.Opts.StatusFd)
			_ = unix.Close(fd)
			req.Opts.StatusFd = nil
		}
		if err != nil {
			return err
		}
		if n == 0 {
			return errors.New("unexpected EOF")
		}
		// A message that fills the whole buffer is treated as too
		// large for it.
		if n == len(buf) {
			return errors.New("buffer is too small")
		}

		resp := new(criurpc.CriuResp)
		err = proto.Unmarshal(buf[:n], resp)
		if err != nil {
			return err
		}
		t := resp.GetType()
		if !resp.GetSuccess() {
			return fmt.Errorf("criu failed: type %s errno %d", t, resp.GetCrErrno())
		}

		switch t {
		case criurpc.CriuReqType_FEATURE_CHECK:
			logrus.Debugf("Feature check says: %s", resp)
			criuFeatures = resp.GetFeatures()
		case criurpc.CriuReqType_NOTIFY:
			if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil {
				return err
			}
			// Acknowledge the notification and wait for the next
			// response. Note that req is replaced here.
			req = &criurpc.CriuReq{
				Type:          &t,
				NotifySuccess: proto.Bool(true),
			}
			data, err = proto.Marshal(req)
			if err != nil {
				return err
			}
			_, err = criuClientCon.Write(data)
			if err != nil {
				return err
			}
			continue
		case criurpc.CriuReqType_RESTORE:
		case criurpc.CriuReqType_DUMP:
		case criurpc.CriuReqType_PRE_DUMP:
		default:
			return fmt.Errorf("unable to parse the response %s", resp.String())
		}

		break
	}

	_ = criuClientCon.CloseWrite()
	// cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
	// Here we want to wait only the CRIU process.
	criuProcessState, err = criuProcess.Wait()
	if err != nil {
		return err
	}

	// In pre-dump mode CRIU is in a loop and waits for
	// the final DUMP command.
	// The current runc pre-dump approach, however, is
	// start criu in PRE_DUMP once for a single pre-dump
	// and not the whole series of pre-dump, pre-dump, ...m, dump
	// If we got the message CriuReqType_PRE_DUMP it means
	// CRIU was successful and we need to forcefully stop CRIU
	if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP {
		return fmt.Errorf("criu failed: %s", criuProcessState)
	}
	return nil
}
| |
| // lockNetwork blocks any external network activity. |
| func lockNetwork(config *configs.Config) error { |
| for _, config := range config.Networks { |
| strategy, err := getStrategy(config.Type) |
| if err != nil { |
| return err |
| } |
| |
| if err := strategy.detach(config); err != nil { |
| return err |
| } |
| } |
| return nil |
| } |
| |
| func unlockNetwork(config *configs.Config) error { |
| for _, config := range config.Networks { |
| strategy, err := getStrategy(config.Type) |
| if err != nil { |
| return err |
| } |
| if err = strategy.attach(config); err != nil { |
| return err |
| } |
| } |
| return nil |
| } |
| |
| func (c *Container) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error { |
| notify := resp.GetNotify() |
| if notify == nil { |
| return fmt.Errorf("invalid response: %s", resp.String()) |
| } |
| script := notify.GetScript() |
| logrus.Debugf("notify: %s\n", script) |
| switch script { |
| case "post-dump": |
| f, err := os.Create(filepath.Join(c.stateDir, "checkpoint")) |
| if err != nil { |
| return err |
| } |
| f.Close() |
| case "network-unlock": |
| if err := unlockNetwork(c.config); err != nil { |
| return err |
| } |
| case "network-lock": |
| if err := lockNetwork(c.config); err != nil { |
| return err |
| } |
| case "setup-namespaces": |
| if c.config.Hooks != nil { |
| s, err := c.currentOCIState() |
| if err != nil { |
| return nil |
| } |
| s.Pid = int(notify.GetPid()) |
| |
| if err := c.config.Hooks.Run(configs.Prestart, s); err != nil { |
| return err |
| } |
| if err := c.config.Hooks.Run(configs.CreateRuntime, s); err != nil { |
| return err |
| } |
| } |
| case "post-restore": |
| pid := notify.GetPid() |
| |
| p, err := os.FindProcess(int(pid)) |
| if err != nil { |
| return err |
| } |
| cmd.Process = p |
| |
| r, err := newRestoredProcess(cmd, fds) |
| if err != nil { |
| return err |
| } |
| process.ops = r |
| if err := c.state.transition(&restoredState{ |
| imageDir: opts.ImagesDirectory, |
| c: c, |
| }); err != nil { |
| return err |
| } |
| // create a timestamp indicating when the restored checkpoint was started |
| c.created = time.Now().UTC() |
| if !c.config.Namespaces.Contains(configs.NEWTIME) && |
| configs.IsNamespaceSupported(configs.NEWTIME) && |
| c.checkCriuVersion(31400) == nil { |
| // CRIU restores processes into a time namespace. |
| c.config.Namespaces = append(c.config.Namespaces, |
| configs.Namespace{Type: configs.NEWTIME}) |
| } |
| if _, err := c.updateState(r); err != nil { |
| return err |
| } |
| if err := os.Remove(filepath.Join(c.stateDir, "checkpoint")); err != nil { |
| if !os.IsNotExist(err) { |
| logrus.Error(err) |
| } |
| } |
| case "orphan-pts-master": |
| scm, err := unix.ParseSocketControlMessage(oob) |
| if err != nil { |
| return err |
| } |
| fds, err := unix.ParseUnixRights(&scm[0]) |
| if err != nil { |
| return err |
| } |
| |
| master := os.NewFile(uintptr(fds[0]), "orphan-pts-master") |
| defer master.Close() |
| |
| // While we can access console.master, using the API is a good idea. |
| if err := utils.SendFile(process.ConsoleSocket, master); err != nil { |
| return err |
| } |
| case "status-ready": |
| if opts.StatusFd != -1 { |
| // write \0 to status fd to notify that lazy page server is ready |
| _, err := unix.Write(opts.StatusFd, []byte{0}) |
| if err != nil { |
| logrus.Warnf("can't write \\0 to status fd: %v", err) |
| } |
| _ = unix.Close(opts.StatusFd) |
| opts.StatusFd = -1 |
| } |
| } |
| return nil |
| } |