mirror of
				https://github.com/opencontainers/runc.git
				synced 2025-10-31 19:13:12 +08:00 
			
		
		
		
	 2f27649848
			
		
	
	2f27649848
	
	
	
		
			
			Today mounts in pre-start hooks get overridden by the default mounts. Moving the pre-start hooks to after the container mounts and before the pivot/move root gives better flexibility in the hooks. Signed-off-by: Mrunal Patel <mrunalp@gmail.com>
		
			
				
	
	
		
			379 lines
		
	
	
		
			9.7 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			379 lines
		
	
	
		
			9.7 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| // +build linux
 | |
| 
 | |
| package libcontainer
 | |
| 
 | |
| import (
 | |
| 	"encoding/json"
 | |
| 	"fmt"
 | |
| 	"io"
 | |
| 	"io/ioutil"
 | |
| 	"net"
 | |
| 	"os"
 | |
| 	"strconv"
 | |
| 	"strings"
 | |
| 	"syscall"
 | |
| 
 | |
| 	"github.com/Sirupsen/logrus"
 | |
| 	"github.com/opencontainers/runc/libcontainer/cgroups"
 | |
| 	"github.com/opencontainers/runc/libcontainer/configs"
 | |
| 	"github.com/opencontainers/runc/libcontainer/system"
 | |
| 	"github.com/opencontainers/runc/libcontainer/user"
 | |
| 	"github.com/opencontainers/runc/libcontainer/utils"
 | |
| 	"github.com/vishvananda/netlink"
 | |
| )
 | |
| 
 | |
// initType names the kind of init implementation that newContainerInit
// should construct.
type initType string

const (
	// initSetns makes newContainerInit return a linuxSetnsInit.
	initSetns = initType("setns")
	// initStandard makes newContainerInit return a linuxStandardInit.
	initStandard = initType("standard")
)
 | |
| 
 | |
// pid is a small JSON envelope carrying a single process id under the
// "pid" key.
type pid struct {
	Pid int `json:"pid"`
}
 | |
| 
 | |
| // network is an internal struct used to setup container networks.
 | |
| type network struct {
 | |
| 	configs.Network
 | |
| 
 | |
| 	// TempVethPeerName is a unique temporary veth peer name that was placed into
 | |
| 	// the container's namespace.
 | |
| 	TempVethPeerName string `json:"temp_veth_peer_name"`
 | |
| }
 | |
| 
 | |
| // initConfig is used for transferring parameters from Exec() to Init()
 | |
| type initConfig struct {
 | |
| 	Args             []string        `json:"args"`
 | |
| 	Env              []string        `json:"env"`
 | |
| 	Cwd              string          `json:"cwd"`
 | |
| 	Capabilities     []string        `json:"capabilities"`
 | |
| 	User             string          `json:"user"`
 | |
| 	Config           *configs.Config `json:"config"`
 | |
| 	Console          string          `json:"console"`
 | |
| 	Networks         []*network      `json:"network"`
 | |
| 	PassedFilesCount int             `json:"passed_files_count"`
 | |
| }
 | |
| 
 | |
// initer is the behaviour shared by every init implementation returned from
// newContainerInit: a single Init step that reports failure via its error.
type initer interface {
	Init() error
}
 | |
| 
 | |
| func newContainerInit(t initType, pipe *os.File) (initer, error) {
 | |
| 	var config *initConfig
 | |
| 	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	if err := populateProcessEnvironment(config.Env); err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	switch t {
 | |
| 	case initSetns:
 | |
| 		return &linuxSetnsInit{
 | |
| 			config: config,
 | |
| 		}, nil
 | |
| 	case initStandard:
 | |
| 		return &linuxStandardInit{
 | |
| 			pipe:      pipe,
 | |
| 			parentPid: syscall.Getppid(),
 | |
| 			config:    config,
 | |
| 		}, nil
 | |
| 	}
 | |
| 	return nil, fmt.Errorf("unknown init type %q", t)
 | |
| }
 | |
| 
 | |
// populateProcessEnvironment loads the provided environment variables into
// the current process's environment.
//
// Each entry must have the form KEY=VALUE; only the first "=" separates the
// key from the value, so values may themselves contain "=". An entry without
// any "=" yields an error, as does a failure of os.Setenv. Entries before the
// failing one have already been applied.
func populateProcessEnvironment(env []string) error {
	for _, entry := range env {
		sep := strings.Index(entry, "=")
		if sep < 0 {
			return fmt.Errorf("invalid environment '%v'", entry)
		}
		key, value := entry[:sep], entry[sep+1:]
		if err := os.Setenv(key, value); err != nil {
			return err
		}
	}
	return nil
}
 | |
| 
 | |
| // finalizeNamespace drops the caps, sets the correct user
 | |
| // and working dir, and closes any leaked file descriptors
 | |
| // before executing the command inside the namespace
 | |
| func finalizeNamespace(config *initConfig) error {
 | |
| 	// Ensure that all unwanted fds we may have accidentally
 | |
| 	// inherited are marked close-on-exec so they stay out of the
 | |
| 	// container
 | |
| 	if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	capabilities := config.Config.Capabilities
 | |
| 	if config.Capabilities != nil {
 | |
| 		capabilities = config.Capabilities
 | |
| 	}
 | |
| 	w, err := newCapWhitelist(capabilities)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	// drop capabilities in bounding set before changing user
 | |
| 	if err := w.dropBoundingSet(); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	// preserve existing capabilities while we change users
 | |
| 	if err := system.SetKeepCaps(); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if err := setupUser(config); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if err := system.ClearKeepCaps(); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	// drop all other capabilities
 | |
| 	if err := w.drop(); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if config.Cwd != "" {
 | |
| 		if err := syscall.Chdir(config.Cwd); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // syncParentReady sends to the given pipe a JSON payload which indicates that
 | |
| // the init is ready to Exec the child process. It then waits for the parent to
 | |
| // indicate that it is cleared to Exec.
 | |
| func syncParentReady(pipe io.ReadWriter) error {
 | |
| 	// Tell parent.
 | |
| 	if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	// Wait for parent to give the all-clear.
 | |
| 	var procSync syncT
 | |
| 	if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
 | |
| 		if err == io.EOF {
 | |
| 			return fmt.Errorf("parent closed synchronisation channel")
 | |
| 		}
 | |
| 		if procSync.Type != procRun {
 | |
| 			return fmt.Errorf("invalid synchronisation flag from parent")
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // syncParentHooks sends to the given pipe a JSON payload which indicates that
 | |
| // the parent should execute pre-start hooks. It then waits for the parent to
 | |
| // indicate that it is cleared to resume.
 | |
| func syncParentHooks(pipe io.ReadWriter) error {
 | |
| 	// Tell parent.
 | |
| 	if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	// Wait for parent to give the all-clear.
 | |
| 	var procSync syncT
 | |
| 	if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
 | |
| 		if err == io.EOF {
 | |
| 			return fmt.Errorf("parent closed synchronisation channel")
 | |
| 		}
 | |
| 		if procSync.Type != procResume {
 | |
| 			return fmt.Errorf("invalid synchronisation flag from parent")
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // joinExistingNamespaces gets all the namespace paths specified for the container and
 | |
| // does a setns on the namespace fd so that the current process joins the namespace.
 | |
| func joinExistingNamespaces(namespaces []configs.Namespace) error {
 | |
| 	for _, ns := range namespaces {
 | |
| 		if ns.Path != "" {
 | |
| 			f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0)
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 			err = system.Setns(f.Fd(), uintptr(ns.Syscall()))
 | |
| 			f.Close()
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // setupUser changes the groups, gid, and uid for the user inside the container
 | |
| func setupUser(config *initConfig) error {
 | |
| 	// Set up defaults.
 | |
| 	defaultExecUser := user.ExecUser{
 | |
| 		Uid:  syscall.Getuid(),
 | |
| 		Gid:  syscall.Getgid(),
 | |
| 		Home: "/",
 | |
| 	}
 | |
| 	passwdPath, err := user.GetPasswdPath()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	groupPath, err := user.GetGroupPath()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	var addGroups []int
 | |
| 	if len(config.Config.AdditionalGroups) > 0 {
 | |
| 		addGroups, err = user.GetAdditionalGroupsPath(config.Config.AdditionalGroups, groupPath)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	// before we change to the container's user make sure that the processes STDIO
 | |
| 	// is correctly owned by the user that we are switching to.
 | |
| 	if err := fixStdioPermissions(execUser); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	suppGroups := append(execUser.Sgids, addGroups...)
 | |
| 	if err := syscall.Setgroups(suppGroups); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	if err := system.Setgid(execUser.Gid); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if err := system.Setuid(execUser.Uid); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	// if we didn't get HOME already, set it based on the user's HOME
 | |
| 	if envHome := os.Getenv("HOME"); envHome == "" {
 | |
| 		if err := os.Setenv("HOME", execUser.Home); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
 | |
| // The ownership needs to match because it is created outside of the container and needs to be
 | |
| // localized.
 | |
| func fixStdioPermissions(u *user.ExecUser) error {
 | |
| 	var null syscall.Stat_t
 | |
| 	if err := syscall.Stat("/dev/null", &null); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	for _, fd := range []uintptr{
 | |
| 		os.Stdin.Fd(),
 | |
| 		os.Stderr.Fd(),
 | |
| 		os.Stdout.Fd(),
 | |
| 	} {
 | |
| 		var s syscall.Stat_t
 | |
| 		if err := syscall.Fstat(int(fd), &s); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		// skip chown of /dev/null if it was used as one of the STDIO fds.
 | |
| 		if s.Rdev == null.Rdev {
 | |
| 			continue
 | |
| 		}
 | |
| 		if err := syscall.Fchown(int(fd), u.Uid, u.Gid); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // setupNetwork sets up and initializes any network interface inside the container.
 | |
| func setupNetwork(config *initConfig) error {
 | |
| 	for _, config := range config.Networks {
 | |
| 		strategy, err := getStrategy(config.Type)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		if err := strategy.initialize(config); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func setupRoute(config *configs.Config) error {
 | |
| 	for _, config := range config.Routes {
 | |
| 		_, dst, err := net.ParseCIDR(config.Destination)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		src := net.ParseIP(config.Source)
 | |
| 		if src == nil {
 | |
| 			return fmt.Errorf("Invalid source for route: %s", config.Source)
 | |
| 		}
 | |
| 		gw := net.ParseIP(config.Gateway)
 | |
| 		if gw == nil {
 | |
| 			return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
 | |
| 		}
 | |
| 		l, err := netlink.LinkByName(config.InterfaceName)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		route := &netlink.Route{
 | |
| 			Scope:     netlink.SCOPE_UNIVERSE,
 | |
| 			Dst:       dst,
 | |
| 			Src:       src,
 | |
| 			Gw:        gw,
 | |
| 			LinkIndex: l.Attrs().Index,
 | |
| 		}
 | |
| 		if err := netlink.RouteAdd(route); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func setupRlimits(config *configs.Config) error {
 | |
| 	for _, rlimit := range config.Rlimits {
 | |
| 		l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}
 | |
| 		if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
 | |
| 			return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
// setOomScoreAdj writes oomScoreAdj, formatted as a decimal string, to
// /proc/self/oom_score_adj and returns any write error.
func setOomScoreAdj(oomScoreAdj int) error {
	const oomScoreAdjPath = "/proc/self/oom_score_adj"
	value := strconv.Itoa(oomScoreAdj)
	return ioutil.WriteFile(oomScoreAdjPath, []byte(value), 0700)
}
 | |
| 
 | |
| // killCgroupProcesses freezes then iterates over all the processes inside the
 | |
| // manager's cgroups sending a SIGKILL to each process then waiting for them to
 | |
| // exit.
 | |
| func killCgroupProcesses(m cgroups.Manager) error {
 | |
| 	var procs []*os.Process
 | |
| 	if err := m.Freeze(configs.Frozen); err != nil {
 | |
| 		logrus.Warn(err)
 | |
| 	}
 | |
| 	pids, err := m.GetAllPids()
 | |
| 	if err != nil {
 | |
| 		m.Freeze(configs.Thawed)
 | |
| 		return err
 | |
| 	}
 | |
| 	for _, pid := range pids {
 | |
| 		if p, err := os.FindProcess(pid); err == nil {
 | |
| 			procs = append(procs, p)
 | |
| 			if err := p.Kill(); err != nil {
 | |
| 				logrus.Warn(err)
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	if err := m.Freeze(configs.Thawed); err != nil {
 | |
| 		logrus.Warn(err)
 | |
| 	}
 | |
| 	for _, p := range procs {
 | |
| 		if _, err := p.Wait(); err != nil {
 | |
| 			logrus.Warn(err)
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 |