mirror of
https://github.com/opencontainers/runc.git
synced 2025-10-08 17:00:13 +08:00

With the idmap work, we will have a tainted Go thread in our thread-group that has a different mount namespace to the other threads. It seems that (due to some bad luck) the Go scheduler tends to make this thread the thread-group leader in our tests, which results in very baffling failures where /proc/self/mountinfo produces gibberish results.

In order to avoid this, switch to using /proc/thread-self for everything that is thread-local. This primarily includes switching all file descriptor paths (CLONE_FS), all of the places that check the current cgroup (technically we never will run a single runc thread in a separate cgroup, but better to be safe than sorry), and the aforementioned mountinfo code.

We don't need to do anything for the following because the results we need aren't thread-local:

* Checks that certain namespaces are supported by stat(2)ing /proc/self/ns/...
* /proc/self/exe and /proc/self/cmdline are not thread-local.
* While threads can be in different cgroups, we do not do this for the runc binary (or libcontainer) and thus we do not need to switch to the thread-local version of /proc/self/cgroup.
* All of the CLONE_NEWUSER files are not thread-local because you cannot set the user namespace of a single thread (setns(CLONE_NEWUSER) is blocked for multi-threaded programs).

Note that we have to use runtime.LockOSThread when we have an open handle to a tid-specific procfs file that we are operating on multiple times. Go can reschedule us such that we are running on a different thread and then kill the original thread (causing -ENOENT or similarly confusing errors). This is not strictly necessary for most usages of /proc/thread-self (such as using /proc/thread-self/fd/$n directly) since only operating on the actual inodes associated with the tid requires this locking, but because of the pre-3.17 fallback for CentOS, we have to do this in most cases.
In addition, CentOS's kernel is too old for /proc/thread-self, which requires us to emulate it -- however in rootfs_linux.go, we are in the container pid namespace but /proc is the host's procfs. This leads to the incredibly frustrating situation where there is no way (on pre-4.1 Linux) to figure out which /proc/self/task/... entry refers to the current tid. We can just use /proc/self in this case. Yes this is all pretty ugly. I also wish it wasn't necessary. Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
132 lines
3.8 KiB
Go
132 lines
3.8 KiB
Go
package utils
|
|
|
|
import (
|
|
"encoding/binary"
|
|
"encoding/json"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"unsafe"
|
|
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// exitSignalOffset is the value added to a signal number to form the exit
// status reported for a process that was terminated by that signal.
const exitSignalOffset = 128
|
|
|
|
// NativeEndian is the byte order used natively by the host system. It is
// assigned once, during package initialisation.
var NativeEndian binary.ByteOrder

// init detects the host byte order by storing the 32-bit value 1 and looking
// at the byte located at the lowest address: it holds 1 on a little-endian
// host and 0 on a big-endian one.
func init() {
	// Same technique as <golang.org/x/net/internal/socket/sys.go>.
	probe := uint32(1)
	lowest := *(*byte)(unsafe.Pointer(&probe))
	switch lowest {
	case 1:
		NativeEndian = binary.LittleEndian
	default:
		NativeEndian = binary.BigEndian
	}
}
|
|
|
|
// ExitStatus returns the correct exit status for a process based on if it
|
|
// was signaled or exited cleanly
|
|
func ExitStatus(status unix.WaitStatus) int {
|
|
if status.Signaled() {
|
|
return exitSignalOffset + int(status.Signal())
|
|
}
|
|
return status.ExitStatus()
|
|
}
|
|
|
|
// WriteJSON marshals v with the standard JSON encoder and writes the result
// to w in a single Write call, without appending a trailing newline.
// json.Encoder is deliberately avoided because the JSON decoder can
// misbehave in some cases, see:
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
//
// A marshalling error is returned before anything is written; otherwise the
// error (if any) reported by w.Write is returned.
func WriteJSON(w io.Writer, v interface{}) error {
	encoded, err := json.Marshal(v)
	if err == nil {
		_, err = w.Write(encoded)
	}
	return err
}
|
|
|
|
// CleanPath prepares a path for use with filepath.Join. Absolute paths are
// simply cleaned. Relative paths are additionally anchored at '/', cleaned
// there (which discards any leading ".." components) and then made relative
// again, so that joining the result onto another path always yields something
// that is lexically inside that other path.
//
// The processing is purely lexical: paths containing symlinks are not made
// safe by CleanPath. An empty path is returned unchanged.
func CleanPath(path string) string {
	// Keep the empty string as-is rather than turning it into ".".
	if path == "" {
		return ""
	}

	// Clean everything up front; inputs such as "/../../../../../" are
	// especially problematic otherwise.
	cleaned := filepath.Clean(path)

	// Absolute paths need no further work, and must stay absolute.
	if filepath.IsAbs(cleaned) {
		return cleaned
	}

	// Relative paths such as "../../../../<etc>/some/path" are resolved
	// against the root so that the leading ".." components are dropped.
	root := string(os.PathSeparator)
	rooted := filepath.Clean(root + cleaned)

	// This can't fail, as (by definition) all paths are relative to root.
	relative, _ := filepath.Rel(root, rooted)

	// One more clean for good measure.
	return filepath.Clean(relative)
}
|
|
|
|
// stripRoot returns the passed path, stripping the root path if it was
|
|
// (lexicially) inside it. Note that both passed paths will always be treated
|
|
// as absolute, and the returned path will also always be absolute. In
|
|
// addition, the paths are cleaned before stripping the root.
|
|
func stripRoot(root, path string) string {
|
|
// Make the paths clean and absolute.
|
|
root, path = CleanPath("/"+root), CleanPath("/"+path)
|
|
switch {
|
|
case path == root:
|
|
path = "/"
|
|
case root == "/":
|
|
// do nothing
|
|
case strings.HasPrefix(path, root+"/"):
|
|
path = strings.TrimPrefix(path, root+"/")
|
|
}
|
|
return CleanPath("/" + path)
|
|
}
|
|
|
|
// SearchLabels looks through a list of "key=value" entries for the first one
// whose key is exactly the given key. It returns that entry's value together
// with true, or an empty string and false when no entry matches.
func SearchLabels(labels []string, key string) (string, bool) {
	// Matching on "key=" ensures a key that is merely a prefix of another
	// key is not picked up.
	prefix := key + "="
	for i := range labels {
		if label := labels[i]; strings.HasPrefix(label, prefix) {
			return strings.TrimPrefix(label, prefix), true
		}
	}
	return "", false
}
|
|
|
|
// Annotations splits the libcontainer state labels into the bundle path and
// the user-defined annotations. Each label is expected to have the form
// "key=value" and is split at the first '='; labels without '=' are ignored.
// The "bundle" label is added by libcontainer itself, so it is returned
// separately instead of being included in the annotation map.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
	userAnnotations = make(map[string]string)
	for _, label := range labels {
		sep := strings.IndexByte(label, '=')
		if sep < 0 {
			// Not a key=value pair.
			continue
		}
		name, value := label[:sep], label[sep+1:]
		switch name {
		case "bundle":
			bundle = value
		default:
			userAnnotations[name] = value
		}
	}
	return bundle, userAnnotations
}
|