diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index 2dad5c8b5..e0db8e017 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -31,10 +31,12 @@ type IDMap struct { // for syscalls. Additional architectures can be added by specifying them in // Architectures. type Seccomp struct { - DefaultAction Action `json:"default_action"` - Architectures []string `json:"architectures"` - Syscalls []*Syscall `json:"syscalls"` - DefaultErrnoRet *uint `json:"default_errno_ret"` + DefaultAction Action `json:"default_action"` + Architectures []string `json:"architectures"` + Syscalls []*Syscall `json:"syscalls"` + DefaultErrnoRet *uint `json:"default_errno_ret"` + ListenerPath string `json:"listener_path,omitempty"` + ListenerMetadata string `json:"listener_metadata,omitempty"` } // Action is taken upon rule match in Seccomp @@ -47,6 +49,7 @@ const ( Allow Trace Log + Notify ) // Operator is a comparison operator to be used when matching syscall arguments in Seccomp diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index 18d12916f..81bc2c297 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -357,7 +357,7 @@ func (l *LinuxFactory) StartInitialization() (err error) { defer func() { // We have an error during the initialization of the container's init, // send it back to the parent process in the form of an initError. - if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil { + if werr := writeSync(pipe, procError); werr != nil { fmt.Fprintln(os.Stderr, err) return } diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 5bbe29202..9f7dc75f2 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -271,6 +271,36 @@ func syncParentHooks(pipe io.ReadWriter) error { return readSync(pipe, procResume) } +// syncParentSeccomp sends to the given pipe a JSON payload which +// indicates that the parent should pick up the seccomp fd with pidfd_getfd() +// and send it to the seccomp agent over a unix socket. It then waits for +// the parent to indicate that it is cleared to resume and closes the seccompFd. +// If the seccompFd is -1, there isn't anything to sync with the parent, so it +// returns no error. +func syncParentSeccomp(pipe io.ReadWriter, seccompFd int) error { + if seccompFd == -1 { + return nil + } + + // Tell parent. + if err := writeSyncWithFd(pipe, procSeccomp, seccompFd); err != nil { + unix.Close(seccompFd) + return err + } + + // Wait for parent to give the all-clear. + if err := readSync(pipe, procSeccompDone); err != nil { + unix.Close(seccompFd) + return fmt.Errorf("sync parent seccomp: %w", err) + } + + if err := unix.Close(seccompFd); err != nil { + return fmt.Errorf("close seccomp fd: %w", err) + } + + return nil +} + // setupUser changes the groups, gid, and uid for the user inside the container func setupUser(config *initConfig) error { // Set up defaults. diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index ba0258123..13812e731 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "net" "os" "os/exec" "path/filepath" @@ -172,6 +173,42 @@ func (p *setnsProcess) start() (retErr error) { case procHooks: // This shouldn't happen. panic("unexpected procHooks in setns") + case procSeccomp: + if p.config.Config.Seccomp.ListenerPath == "" { + return errors.New("listenerPath is not set") + } + + seccompFd, err := recvSeccompFd(uintptr(p.pid()), uintptr(sync.Fd)) + if err != nil { + return err + } + defer unix.Close(seccompFd) + + bundle, annotations := utils.Annotations(p.config.Config.Labels) + containerProcessState := &specs.ContainerProcessState{ + Version: specs.Version, + Fds: []string{specs.SeccompFdName}, + Pid: p.cmd.Process.Pid, + Metadata: p.config.Config.Seccomp.ListenerMetadata, + State: specs.State{ + Version: specs.Version, + ID: p.config.ContainerId, + Status: specs.StateRunning, + Pid: p.initProcessPid, + Bundle: bundle, + Annotations: annotations, + }, + } + if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath, + containerProcessState, seccompFd); err != nil { + return err + } + + // Sync with child. + if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { + return err + } + return nil default: return errors.New("invalid JSON payload from child") } @@ -426,6 +463,41 @@ func (p *initProcess) start() (retErr error) { ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { switch sync.Type { + case procSeccomp: + if p.config.Config.Seccomp.ListenerPath == "" { + return errors.New("listenerPath is not set") + } + + seccompFd, err := recvSeccompFd(uintptr(childPid), uintptr(sync.Fd)) + if err != nil { + return err + } + defer unix.Close(seccompFd) + + s, err := p.container.currentOCIState() + if err != nil { + return err + } + + // initProcessStartTime hasn't been set yet. + s.Pid = p.cmd.Process.Pid + s.Status = specs.StateCreating + containerProcessState := &specs.ContainerProcessState{ + Version: specs.Version, + Fds: []string{specs.SeccompFdName}, + Pid: s.Pid, + Metadata: p.config.Config.Seccomp.ListenerMetadata, + State: *s, + } + if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath, + containerProcessState, seccompFd); err != nil { + return err + } + + // Sync with child. + if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { + return err + } case procReady: // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace @@ -486,7 +558,7 @@ func (p *initProcess) start() (retErr error) { // Sync with child. if err := writeSync(p.messageSockPair.parent, procRun); err != nil { - return fmt.Errorf("error writing syncT 'run': %w", err) + return err } sentRun = true case procHooks: @@ -518,7 +590,7 @@ func (p *initProcess) start() (retErr error) { } // Sync with child. if err := writeSync(p.messageSockPair.parent, procResume); err != nil { - return fmt.Errorf("error writing syncT 'resume': %w", err) + return err } sentResume = true default: @@ -621,6 +693,46 @@ func (p *initProcess) forwardChildLogs() chan error { return logs.ForwardLogs(p.logFilePair.parent) } +func recvSeccompFd(childPid, childFd uintptr) (int, error) { + pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, childPid, 0, 0) + if errno != 0 { + return -1, fmt.Errorf("performing SYS_PIDFD_OPEN syscall: %w", errno) + } + defer unix.Close(int(pidfd)) + + seccompFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, childFd, 0) + if errno != 0 { + return -1, fmt.Errorf("performing SYS_PIDFD_GETFD syscall: %w", errno) + } + + return int(seccompFd), nil +} + +func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, fd int) error { + conn, err := net.Dial("unix", listenerPath) + if err != nil { + return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err) + } + + socket, err := conn.(*net.UnixConn).File() + if err != nil { + return fmt.Errorf("cannot get seccomp socket: %w", err) + } + defer socket.Close() + + b, err := json.Marshal(state) + if err != nil { + return fmt.Errorf("cannot marshall seccomp state: %w", err) + } + + err = utils.SendFds(socket, b, fd) + if err != nil { + return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err) + } + + return nil +} + func getPipeFds(pid int) ([]string, error) { fds := make([]string, 3) diff --git a/libcontainer/seccomp/config.go b/libcontainer/seccomp/config.go index 2e5adfd3a..2c69a51c2 100644 --- a/libcontainer/seccomp/config.go +++ b/libcontainer/seccomp/config.go @@ -17,12 +17,13 @@ var operators = map[string]configs.Operator{ } var actions = map[string]configs.Action{ - "SCMP_ACT_KILL": configs.Kill, - "SCMP_ACT_ERRNO": configs.Errno, - "SCMP_ACT_TRAP": configs.Trap, - "SCMP_ACT_ALLOW": configs.Allow, - "SCMP_ACT_TRACE": configs.Trace, - "SCMP_ACT_LOG": configs.Log, + "SCMP_ACT_KILL": configs.Kill, + "SCMP_ACT_ERRNO": configs.Errno, + "SCMP_ACT_TRAP": configs.Trap, + "SCMP_ACT_ALLOW": configs.Allow, + "SCMP_ACT_TRACE": configs.Trace, + "SCMP_ACT_LOG": configs.Log, + "SCMP_ACT_NOTIFY": configs.Notify, } var archs = map[string]string{ diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go index bdeda4f23..dfb8a0a8e 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux.go @@ -43,6 +43,11 @@ const uintptr_t C_SET_MODE_FILTER = SECCOMP_SET_MODE_FILTER; #endif const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG; +#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER +# define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3) +#endif +const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER; + // We use the AUDIT_ARCH_* values because those are the ones used by the kernel // and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we // use so we get libseccomp's fallback definitions of AUDIT_ARCH_*. @@ -582,7 +587,7 @@ func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ( return fprog, nil } -func filterFlags(filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) { +func filterFlags(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) { // Ignore the error since pre-2.4 libseccomp is treated as API level 0. apiLevel, _ := libseccomp.GetAPI() @@ -600,26 +605,38 @@ func filterFlags(filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, er } // TODO: Support seccomp flags not yet added to libseccomp-golang... + + for _, call := range config.Syscalls { + if call.Action == configs.Notify { + flags |= uint(C.C_FILTER_FLAG_NEW_LISTENER) + break + } + } + return } -func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (err error) { +func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err error) { fprog := unix.SockFprog{ Len: uint16(len(filter)), Filter: &filter[0], } + fd = -1 // only return a valid fd when C_FILTER_FLAG_NEW_LISTENER is set // If no seccomp flags were requested we can use the old-school prctl(2). if flags == 0 { err = unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, uintptr(unsafe.Pointer(&fprog)), 0, 0) } else { - _, _, errno := unix.RawSyscall(unix.SYS_SECCOMP, + fdptr, _, errno := unix.RawSyscall(unix.SYS_SECCOMP, uintptr(C.C_SET_MODE_FILTER), uintptr(flags), uintptr(unsafe.Pointer(&fprog))) if errno != 0 { err = errno } + if flags&uint(C.C_FILTER_FLAG_NEW_LISTENER) != 0 { + fd = int(fdptr) + } } runtime.KeepAlive(filter) runtime.KeepAlive(fprog) @@ -631,17 +648,17 @@ func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (err error) { // patches said filter to handle -ENOSYS in a much nicer manner than the // default libseccomp default action behaviour, and loads the patched filter // into the kernel for the current process. -func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) error { +func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) { // Generate a patched filter. fprog, err := enosysPatchFilter(config, filter) if err != nil { - return fmt.Errorf("error patching filter: %w", err) + return -1, fmt.Errorf("error patching filter: %w", err) } // Get the set of libseccomp flags set. - seccompFlags, noNewPrivs, err := filterFlags(filter) + seccompFlags, noNewPrivs, err := filterFlags(config, filter) if err != nil { - return fmt.Errorf("unable to fetch seccomp filter flags: %w", err) + return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err) } // Set no_new_privs if it was requested, though in runc we handle @@ -649,13 +666,15 @@ func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) error if noNewPrivs { logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path") if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { - return fmt.Errorf("error enabling no_new_privs bit: %w", err) + return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err) } } // Finally, load the filter. - if err := sysSeccompSetFilter(seccompFlags, fprog); err != nil { - return fmt.Errorf("error loading seccomp filter: %w", err) + fd, err := sysSeccompSetFilter(seccompFlags, fprog) + if err != nil { + return -1, fmt.Errorf("error loading seccomp filter: %w", err) } - return nil + + return fd, nil } diff --git a/libcontainer/seccomp/seccomp_linux.go b/libcontainer/seccomp/seccomp_linux.go index f46250825..ad599e994 100644 --- a/libcontainer/seccomp/seccomp_linux.go +++ b/libcontainer/seccomp/seccomp_linux.go @@ -16,12 +16,13 @@ import ( ) var ( - actAllow = libseccomp.ActAllow - actTrap = libseccomp.ActTrap - actKill = libseccomp.ActKill - actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM)) - actLog = libseccomp.ActLog - actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)) + actAllow = libseccomp.ActAllow + actTrap = libseccomp.ActTrap + actKill = libseccomp.ActKill + actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM)) + actLog = libseccomp.ActLog + actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)) + actNotify = libseccomp.ActNotify ) const ( @@ -29,54 +30,89 @@ const ( syscallMaxArguments int = 6 ) -// Filters given syscalls in a container, preventing them from being used -// Started in the container init process, and carried over to all child processes -// Setns calls, however, require a separate invocation, as they are not children -// of the init until they join the namespace -func InitSeccomp(config *configs.Seccomp) error { +// InitSeccomp installs the seccomp filters to be used in the container as +// specified in config. +// Returns the seccomp file descriptor if any of the filters include a +// SCMP_ACT_NOTIFY action, otherwise returns -1. +func InitSeccomp(config *configs.Seccomp) (int, error) { if config == nil { - return errors.New("cannot initialize Seccomp - nil config passed") + return -1, errors.New("cannot initialize Seccomp - nil config passed") } defaultAction, err := getAction(config.DefaultAction, config.DefaultErrnoRet) if err != nil { - return errors.New("error initializing seccomp - invalid default action") + return -1, errors.New("error initializing seccomp - invalid default action") + } + + // Ignore the error since pre-2.4 libseccomp is treated as API level 0. + apiLevel, _ := libseccomp.GetAPI() + for _, call := range config.Syscalls { + if call.Action == configs.Notify { + if apiLevel < 6 { + return -1, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel) + } + + // We can't allow the write syscall to notify to the seccomp agent. + // After InitSeccomp() is called, we need to syncParentSeccomp() to write the seccomp fd plain + // number, so the parent sends it to the seccomp agent. If we use SCMP_ACT_NOTIFY on write, we + // never can write the seccomp fd to the parent and therefore the seccomp agent never receives + // the seccomp fd and runc is hang during initialization. + // + // Note that read()/close(), that are also used in syncParentSeccomp(), _can_ use SCMP_ACT_NOTIFY. + // Because we write the seccomp fd on the pipe to the parent, the parent is able to proceed and + // send the seccomp fd to the agent (it is another process and not subject to the seccomp + // filter). We will be blocked on read()/close() inside syncParentSeccomp() but if the seccomp + // agent allows those syscalls to proceed, initialization works just fine and the agent can + // handle future read()/close() syscalls as it wanted. + if call.Name == "write" { + return -1, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall") + } + } + } + + // See comment on why write is not allowed. The same reason applies, as this can mean handling write too. + if defaultAction == actNotify { + return -1, errors.New("SCMP_ACT_NOTIFY cannot be used as default action") } filter, err := libseccomp.NewFilter(defaultAction) if err != nil { - return fmt.Errorf("error creating filter: %w", err) + return -1, fmt.Errorf("error creating filter: %w", err) } // Add extra architectures for _, arch := range config.Architectures { scmpArch, err := libseccomp.GetArchFromString(arch) if err != nil { - return fmt.Errorf("error validating Seccomp architecture: %w", err) + return -1, fmt.Errorf("error validating Seccomp architecture: %w", err) } if err := filter.AddArch(scmpArch); err != nil { - return fmt.Errorf("error adding architecture to seccomp filter: %w", err) + return -1, fmt.Errorf("error adding architecture to seccomp filter: %w", err) } } // Unset no new privs bit if err := filter.SetNoNewPrivsBit(false); err != nil { - return fmt.Errorf("error setting no new privileges: %w", err) + return -1, fmt.Errorf("error setting no new privileges: %w", err) } // Add a rule for each syscall for _, call := range config.Syscalls { if call == nil { - return errors.New("encountered nil syscall while initializing Seccomp") + return -1, errors.New("encountered nil syscall while initializing Seccomp") } + if err := matchCall(filter, call, defaultAction); err != nil { - return err + return -1, err } } - if err := patchbpf.PatchAndLoad(config, filter); err != nil { - return fmt.Errorf("error loading seccomp filter into kernel: %w", err) + + seccompFd, err := patchbpf.PatchAndLoad(config, filter) + if err != nil { + return -1, fmt.Errorf("error loading seccomp filter into kernel: %w", err) } - return nil + + return seccompFd, nil } // Convert Libcontainer Action to Libseccomp ScmpAction @@ -100,6 +136,8 @@ func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error return actTrace, nil case configs.Log: return actLog, nil + case configs.Notify: + return actNotify, nil default: return libseccomp.ActInvalid, errors.New("invalid action, cannot use in rule") } diff --git a/libcontainer/seccomp/seccomp_unsupported.go b/libcontainer/seccomp/seccomp_unsupported.go index 6e593dbec..293e9c5cb 100644 --- a/libcontainer/seccomp/seccomp_unsupported.go +++ b/libcontainer/seccomp/seccomp_unsupported.go @@ -12,11 +12,11 @@ import ( var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") // InitSeccomp does nothing because seccomp is not supported. -func InitSeccomp(config *configs.Seccomp) error { +func InitSeccomp(config *configs.Seccomp) (int, error) { if config != nil { - return ErrSeccompNotEnabled + return -1, ErrSeccompNotEnabled } - return nil + return -1, nil } // Version returns major, minor, and micro. diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 09a7d8953..a2c9efc4f 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -71,7 +71,12 @@ func (l *linuxSetnsInit) Init() error { // do this before dropping capabilities; otherwise do it as late as possible // just before execve so as few syscalls take place after it as possible. if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { - if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp) + if err != nil { + return err + } + + if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { return err } } @@ -85,9 +90,14 @@ func (l *linuxSetnsInit) Init() error { // place afterward (reducing the amount of syscalls that users need to // enable in their seccomp profiles). if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { - if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp) + if err != nil { return fmt.Errorf("unable to init seccomp: %w", err) } + + if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { + return err + } } logrus.Debugf("setns_init: about to exec") // Close the log pipe fd so the parent's ForwardLogs can exit. diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index f0d3506e6..991c08a19 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -882,6 +882,9 @@ func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { newConfig.DefaultAction = newDefaultAction newConfig.DefaultErrnoRet = config.DefaultErrnoRet + newConfig.ListenerPath = config.ListenerPath + newConfig.ListenerMetadata = config.ListenerMetadata + // Loop through all syscall blocks and convert them to libcontainer format for _, call := range config.Syscalls { newAction, err := seccomp.ConvertStringToAction(string(call.Action)) diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index c02f0c45d..6dfea9998 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -156,7 +156,12 @@ func (l *linuxStandardInit) Init() error { // do this before dropping capabilities; otherwise do it as late as possible // just before execve so as few syscalls take place after it as possible. if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { - if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp) + if err != nil { + return err + } + + if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { return err } } @@ -181,6 +186,21 @@ func (l *linuxStandardInit) Init() error { if err != nil { return err } + // Set seccomp as close to execve as possible, so as few syscalls take + // place afterward (reducing the amount of syscalls that users need to + // enable in their seccomp profiles). However, this needs to be done + // before closing the pipe since we need it to pass the seccompFd to + // the parent. + if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { + seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp) + if err != nil { + return fmt.Errorf("unable to init seccomp: %w", err) + } + + if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { + return err + } + } // Close the pipe to signal that we have completed our init. logrus.Debugf("init: closing the pipe to signal completion") _ = l.pipe.Close() @@ -202,6 +222,7 @@ func (l *linuxStandardInit) Init() error { if _, err := unix.Write(fd, []byte("0")); err != nil { return &os.PathError{Op: "write exec fifo", Path: fifoPath, Err: err} } + // Close the O_PATH fifofd fd before exec because the kernel resets // dumpable in the wrong order. This has been fixed in newer kernels, but // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels. @@ -209,14 +230,6 @@ func (l *linuxStandardInit) Init() error { // since been resolved. // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 _ = unix.Close(l.fifoFd) - // Set seccomp as close to execve as possible, so as few syscalls take - // place afterward (reducing the amount of syscalls that users need to - // enable in their seccomp profiles). - if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { - if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { - return fmt.Errorf("unable to init seccomp: %w", err) - } - } s := l.config.SpecState s.Pid = unix.Getpid() diff --git a/libcontainer/sync.go b/libcontainer/sync.go index 906a8c2ca..c9a23ef3a 100644 --- a/libcontainer/sync.go +++ b/libcontainer/sync.go @@ -22,16 +22,22 @@ type syncType string // // procReady --> [final setup] // <-- procRun +// +// procSeccomp --> [pick up seccomp fd with pidfd_getfd()] +// <-- procSeccompDone const ( - procError syncType = "procError" - procReady syncType = "procReady" - procRun syncType = "procRun" - procHooks syncType = "procHooks" - procResume syncType = "procResume" + procError syncType = "procError" + procReady syncType = "procReady" + procRun syncType = "procRun" + procHooks syncType = "procHooks" + procResume syncType = "procResume" + procSeccomp syncType = "procSeccomp" + procSeccompDone syncType = "procSeccompDone" ) type syncT struct { Type syncType `json:"type"` + Fd int `json:"fd"` } // initError is used to wrap errors for passing them via JSON, @@ -47,7 +53,16 @@ func (i initError) Error() string { // writeSync is used to write to a synchronisation pipe. An error is returned // if there was a problem writing the payload. func writeSync(pipe io.Writer, sync syncType) error { - return utils.WriteJSON(pipe, syncT{sync}) + return writeSyncWithFd(pipe, sync, -1) +} + +// writeSyncWithFd is used to write to a synchronisation pipe. An error is +// returned if there was a problem writing the payload. +func writeSyncWithFd(pipe io.Writer, sync syncType, fd int) error { + if err := utils.WriteJSON(pipe, syncT{sync, fd}); err != nil { + return fmt.Errorf("writing syncT %q: %w", string(sync), err) + } + return nil } // readSync is used to read from a synchronisation pipe. An error is returned