mirror of
https://github.com/opencontainers/runc.git
synced 2025-10-07 16:31:38 +08:00

Having -EPERM as the default was a fairly significant mistake from a future-proofing standpoint in that it makes any new syscall return a non-ignorable error (from glibc's point of view). We need to correct this now because faccessat2(2) is something glibc critically needs to have support for, but they're blocked on container runtimes because we return -EPERM unconditionally (leading to confusion in glibc). This is also a problem we're probably going to keep running into in the future. Unfortunately there are several issues which stop us from having a clean solution to this problem: 1. libseccomp has several limitations which require us to emulate behaviour we want: a. We cannot do logic based on syscall number, meaning we cannot specify a "largest known syscall number"; b. libseccomp doesn't know in which kernel version a syscall was added, and has no API for "minimum kernel version" so we cannot simply ask libseccomp to generate sane -ENOSYS rules for us. c. Additional seccomp rules for the same syscall are not treated as distinct rules -- if rules overlap, seccomp will merge them. This means we cannot add per-syscall -EPERM fallbacks; d. There is no inverse operation for SCMP_CMP_MASKED_EQ; e. libseccomp does not allow you to specify multiple rules for a single argument, making it impossible to invert OR rules for arguments. 2. The runtime-spec does not have any way of specifying: a. The errno for the default action; b. The minimum kernel version or "newest syscall at time of profile creation"; nor c. Which syscalls were intentionally excluded from the allow list (weird syscalls that are no longer used were excluded entirely, but Docker et al expect those syscalls to get EPERM not ENOSYS). 3. Certain syscalls should not return -ENOSYS (especially only for certain argument combinations) because this could also trigger glibc confusion. This means we have to return -EPERM for certain syscalls but not as a global default. 4. 
There is not an obvious (and reasonable) upper limit to syscall numbers, so we cannot create a set of rules for each syscall above the largest syscall number in libseccomp. This means we must handle inverse rules as described below. 5. Any syscall can be specified multiple times, which can make generation of hotfix rules much harder. As a result, we have to work around all of these things by coming up with a heuristic to stop the bleeding. In the future we could hopefully improve the situation in the runtime-spec and libseccomp. The solution applied here is to prepend a "stub" filter which returns -ENOSYS if the requested syscall has a larger syscall number than any syscall mentioned in the filter. The reason for this specific rule is that syscall numbers are (roughly) allocated sequentially and thus newer syscalls will (usually) have a larger syscall number -- thus causing our filters to produce -ENOSYS if the filter was written before the syscall existed. Sadly this is not a perfect solution because syscalls can be added out-of-order and the syscall table can contain holes for several releases. Unfortunately we do not have a nicer solution at the moment because there is no library which provides information about which Linux version a syscall was introduced in. Until that exists, this workaround will have to be good enough. The above behaviour only happens if the default action is a blocking action (in other words it is not SCMP_ACT_LOG or SCMP_ACT_ALLOW). If the default action is permissive then we don't do any patching. Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
124 lines
3.3 KiB
Go
124 lines
3.3 KiB
Go
package utils
|
|
|
|
import (
|
|
"encoding/binary"
|
|
"encoding/json"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"unsafe"
|
|
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// exitSignalOffset is the value added to a signal number to form the exit
// status reported for a process that was terminated by that signal.
const exitSignalOffset = 128
|
|
|
|
// NativeEndian is the byte order used natively by the host system. It is
// assigned by this file's init function.
var NativeEndian binary.ByteOrder

func init() {
	// Same probing trick as <golang.org/x/net/internal/socket/sys.go>: store
	// the integer 1 and inspect which byte ends up first in memory.
	probe := uint32(1)
	firstByte := (*[4]byte)(unsafe.Pointer(&probe))[0]

	switch firstByte {
	case 1:
		// The least significant byte is stored first.
		NativeEndian = binary.LittleEndian
	default:
		NativeEndian = binary.BigEndian
	}
}
|
|
|
|
// ResolveRootfs converts uncleanRootfs into an absolute path and then
// evaluates any symlinks in it, returning the resulting path to the rootfs.
// An error from either step is returned to the caller.
func ResolveRootfs(uncleanRootfs string) (string, error) {
	absRootfs, absErr := filepath.Abs(uncleanRootfs)
	if absErr == nil {
		// Symlinks are only resolved once the path has been made absolute.
		return filepath.EvalSymlinks(absRootfs)
	}
	return "", absErr
}
|
|
|
|
// ExitStatus returns the correct exit status for a process based on if it
|
|
// was signaled or exited cleanly
|
|
func ExitStatus(status unix.WaitStatus) int {
|
|
if status.Signaled() {
|
|
return exitSignalOffset + int(status.Signal())
|
|
}
|
|
return status.ExitStatus()
|
|
}
|
|
|
|
// WriteJSON marshals v with the standard encoding/json package and writes the
// resulting bytes to w. It returns the marshaling error, if any, otherwise the
// error reported by the write. Nothing is written when marshaling fails.
func WriteJSON(w io.Writer, v interface{}) error {
	encoded, marshalErr := json.Marshal(v)
	if marshalErr != nil {
		return marshalErr
	}
	if _, writeErr := w.Write(encoded); writeErr != nil {
		return writeErr
	}
	return nil
}
|
|
|
|
// CleanPath makes a path safe for use with filepath.Join. Besides cleaning the
// path, a relative path is temporarily rooted at '/', cleaned, and then made
// relative again, which strips any leading ".." components. The result of
// prepending another path will therefore always be, lexically, a subdirectory
// of that prefix. Everything here is purely lexical, so paths that include
// symlinks are not made safe by CleanPath.
func CleanPath(path string) string {
	// An empty path stays empty.
	if path == "" {
		return ""
	}

	// Clean first so that problematic inputs such as "/../../../../../" are
	// collapsed before any further processing.
	cleaned := filepath.Clean(path)

	// Absolute paths must stay absolute; just clean once more for good measure.
	if filepath.IsAbs(cleaned) {
		return filepath.Clean(cleaned)
	}

	// Relative paths such as "../../../../<etc>/some/path" need the extra
	// round trip through the root directory.
	root := string(os.PathSeparator)
	rooted := filepath.Clean(root + cleaned)
	// This can't fail, as (by definition) all paths are relative to root.
	relative, _ := filepath.Rel(root, rooted)

	// Clean the path again for good measure.
	return filepath.Clean(relative)
}
|
|
|
|
// SearchLabels looks through labels, a list of "key=value" pairs, for the
// first entry whose key equals query and returns its value. Only the first
// '=' in an entry separates key from value; entries without any '=' are
// skipped. The empty string is returned when no entry matches.
func SearchLabels(labels []string, query string) string {
	for _, label := range labels {
		sep := strings.IndexByte(label, '=')
		if sep < 0 {
			// Not a key=value pair.
			continue
		}
		if label[:sep] != query {
			continue
		}
		return label[sep+1:]
	}
	return ""
}
|
|
|
|
// Annotations splits the libcontainer state labels into the bundle path and
// the user defined annotations. Each label is a "key=value" pair split at its
// first '='; labels without an '=' are ignored. The "bundle" label is added by
// libcontainer, so it is returned separately instead of being placed in the
// annotations map. The returned map is never nil.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
	userAnnotations = map[string]string{}
	for _, label := range labels {
		sep := strings.IndexByte(label, '=')
		if sep < 0 {
			continue
		}
		key, value := label[:sep], label[sep+1:]
		switch key {
		case "bundle":
			bundle = value
		default:
			userAnnotations[key] = value
		}
	}
	return bundle, userAnnotations
}
|