Files
runc/libcontainer/network_linux.go
Antonio Ojea 8d180e9658 Add support for Linux Network Devices
Implement support for passing Linux Network Devices to the container
network namespace.

The network device is passed during the creation of the container,
before the process is started.

It implements the logic defined in the OCI runtime specification.

Signed-off-by: Antonio Ojea <aojea@google.com>
2025-06-18 15:52:30 +01:00

233 lines
8.4 KiB
Go

package libcontainer
import (
"bytes"
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/types"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netlink/nl"
"github.com/vishvananda/netns"
"golang.org/x/sys/unix"
)
// strategies is the registry of known network strategies, keyed by the
// network type name that getStrategy looks up.
var strategies = map[string]networkStrategy{
	"loopback": new(loopback),
}
// networkStrategy represents a specific network configuration for
// a container's networking stack.
type networkStrategy interface {
	// create receives the network and an integer argument (named nspid in
	// the loopback implementation).
	create(n *network, nspid int) error
	// initialize receives the network to act on; the loopback
	// implementation uses this hook to bring the "lo" link up.
	initialize(config *network) error
	// detach operates on a network configuration.
	detach(n *configs.Network) error
	// attach operates on a network configuration.
	attach(n *configs.Network) error
}
// getStrategy looks up the network strategy registered under tpe.
// It returns an error naming the type when no such strategy is registered.
func getStrategy(tpe string) (networkStrategy, error) {
	if strategy, ok := strategies[tpe]; ok {
		return strategy, nil
	}
	return nil, fmt.Errorf("unknown strategy type %q", tpe)
}
// getNetworkInterfaceStats returns the statistics of the network interface
// named interfaceName, read from its sysfs counters.
//
// An empty interfaceName yields a result with every counter left at zero and
// no error; this can happen when the network runtime information is missing,
// which is possible if the container was created by an old version of
// libcontainer. If any counter cannot be read, nil and that error are returned.
func getNetworkInterfaceStats(interfaceName string) (*types.NetworkInterface, error) {
	stats := &types.NetworkInterface{Name: interfaceName}
	if interfaceName == "" {
		return stats, nil
	}

	// Ingress for the host veth is traffic coming from the container. Hence
	// the tx_* stats on the host veth are what the container received, and
	// the rx_* stats are what the container transmitted.
	counters := []struct {
		file string  // statistics file to read
		dst  *uint64 // field that receives the parsed value
	}{
		{"tx_bytes", &stats.RxBytes},
		{"tx_packets", &stats.RxPackets},
		{"tx_errors", &stats.RxErrors},
		{"tx_dropped", &stats.RxDropped},
		{"rx_bytes", &stats.TxBytes},
		{"rx_packets", &stats.TxPackets},
		{"rx_errors", &stats.TxErrors},
		{"rx_dropped", &stats.TxDropped},
	}
	for i := range counters {
		value, err := readSysfsNetworkStats(interfaceName, counters[i].file)
		if err != nil {
			return nil, err
		}
		*counters[i].dst = value
	}
	return stats, nil
}
// readSysfsNetworkStats reads the counter stored in
// /sys/class/net/<ethInterface>/statistics/<statsFile> and parses it as a
// base-10 unsigned 64-bit integer, ignoring surrounding whitespace.
// It returns 0 and the read error when the file cannot be read.
func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) {
	statPath := filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile)
	raw, err := os.ReadFile(statPath)
	if err != nil {
		return 0, err
	}
	text := string(bytes.TrimSpace(raw))
	return strconv.ParseUint(text, 10, 64)
}
// loopback is a network strategy that provides a basic loopback device.
type loopback struct{}

// create is a no-op for the loopback strategy.
func (*loopback) create(n *network, nspid int) error {
	return nil
}

// initialize brings the "lo" link up.
func (*loopback) initialize(config *network) error {
	lo := &netlink.Device{
		LinkAttrs: netlink.LinkAttrs{Name: "lo"},
	}
	return netlink.LinkSetUp(lo)
}

// attach is a no-op for the loopback strategy.
func (*loopback) attach(n *configs.Network) error {
	return nil
}

// detach is a no-op for the loopback strategy.
func (*loopback) detach(n *configs.Network) error {
	return nil
}
// devChangeNetNamespace moves the network device called name into the network
// namespace referenced by nsPath and optionally renames it.
//
// The device keeps its name if device.Name is the zero value. The move and
// the rename are sent in a single RTM_NEWLINK request so that they happen
// together. Permanent IP addresses with global scope that were configured on
// the device are re-added in the new namespace, and the device is finally
// brought up there.
//
// It returns an error if the device cannot be found, brought down, moved, or
// reconfigured in the target namespace.
func devChangeNetNamespace(name string, nsPath string, device configs.LinuxNetDevice) error {
	logrus.Debugf("attaching network device %s with attrs %+v to network namespace %s", name, device, nsPath)

	link, err := netlink.LinkByName(name)
	// recover same behavior on vishvananda/netlink@1.2.1 and do not fail when the kernel returns NLM_F_DUMP_INTR.
	if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) {
		return fmt.Errorf("link not found for interface %s on runtime namespace: %w", name, err)
	}

	// Get the existing IP addresses on the interface.
	// This is done before the link is set down: with the kernel's default
	// keep_addr_on_down setting, IPv6 addresses are removed on a link-down
	// event, so listing afterwards would miss them and they could not be
	// re-added in the new namespace.
	addresses, err := netlink.AddrList(link, netlink.FAMILY_ALL)
	// recover same behavior on vishvananda/netlink@1.2.1 and do not fail when the kernel returns NLM_F_DUMP_INTR.
	if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) {
		return fmt.Errorf("fail to get ip addresses: %w", err)
	}

	// Set the interface link state to DOWN before modifying attributes like namespace or name.
	// This prevents potential conflicts or disruptions on the host network during the transition,
	// particularly if other host components depend on this specific interface or its properties.
	err = netlink.LinkSetDown(link)
	if err != nil {
		return fmt.Errorf("fail to set link down: %w", err)
	}

	// Do interface rename and namespace change in the same operation to avoid
	// possible conflicts with the interface name.
	// NLM_F_REQUEST: "It must be set on all request messages."
	// NLM_F_ACK: "Request for an acknowledgment on success."
	// netlink(7) man page: https://man7.org/linux/man-pages/man7/netlink.7.html
	flags := unix.NLM_F_REQUEST | unix.NLM_F_ACK
	req := nl.NewNetlinkRequest(unix.RTM_NEWLINK, flags)

	// Get a netlink socket in current namespace
	nlSock, err := nl.GetNetlinkSocketAt(netns.None(), netns.None(), unix.NETLINK_ROUTE)
	if err != nil {
		return fmt.Errorf("could not get network namespace handle: %w", err)
	}
	defer nlSock.Close()

	req.Sockets = map[int]*nl.SocketHandle{
		unix.NETLINK_ROUTE: {Socket: nlSock},
	}

	// Set the interface index.
	msg := nl.NewIfInfomsg(unix.AF_UNSPEC)
	msg.Index = int32(link.Attrs().Index)
	req.AddData(msg)

	// Set the interface name, also rename if requested.
	newName := name
	if device.Name != "" {
		newName = device.Name
	}
	nameData := nl.NewRtAttr(unix.IFLA_IFNAME, nl.ZeroTerminated(newName))
	req.AddData(nameData)

	// Get the new network namespace.
	ns, err := netns.GetFromPath(nsPath)
	if err != nil {
		return fmt.Errorf("could not get network namespace from path %s for network device %s : %w", nsPath, name, err)
	}
	defer ns.Close()

	val := nl.Uint32Attr(uint32(ns))
	attr := nl.NewRtAttr(unix.IFLA_NET_NS_FD, val)
	req.AddData(attr)

	_, err = req.Execute(unix.NETLINK_ROUTE, 0)
	// recover same behavior on vishvananda/netlink@1.2.1 and do not fail when the kernel returns NLM_F_DUMP_INTR.
	if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) {
		return fmt.Errorf("fail to move network device %s to network namespace %s: %w", name, nsPath, err)
	}

	// To avoid the hassle with goroutines when joining a netns,
	// we let the library create the socket in the namespace for us.
	nhNs, err := netlink.NewHandleAt(ns)
	if err != nil {
		return fmt.Errorf("could not get netlink handle for network namespace %s: %w", nsPath, err)
	}
	defer nhNs.Close()

	nsLink, err := nhNs.LinkByName(newName)
	// recover same behavior on vishvananda/netlink@1.2.1 and do not fail when the kernel returns NLM_F_DUMP_INTR.
	if err != nil && !errors.Is(err, netlink.ErrDumpInterrupted) {
		return fmt.Errorf("link not found for interface %s on namespace %s : %w", newName, nsPath, err)
	}

	// Re-add the original IP addresses to the interface in the new namespace.
	// The kernel removes IP addresses when an interface is moved between network namespaces.
	for _, address := range addresses {
		logrus.Debugf("processing address %s from network device %s", address.String(), name)
		// Only move permanent IP addresses configured by the user, dynamic addresses are excluded because
		// their validity may rely on the original network namespace's context and they may have limited
		// lifetimes and are not guaranteed to be available in a new namespace.
		// Ref: https://www.ietf.org/rfc/rfc3549.txt
		if address.Flags&unix.IFA_F_PERMANENT == 0 {
			logrus.Debugf("skipping address %s from network device %s: not a permanent address", address.String(), name)
			continue
		}
		// Only move IP addresses with global scope because those are not host-specific, auto-configured,
		// or have limited network scope, making them unsuitable inside the container namespace.
		// Ref: https://www.ietf.org/rfc/rfc3549.txt
		if address.Scope != unix.RT_SCOPE_UNIVERSE {
			logrus.Debugf("skipping address %s from network device %s: not an address with global scope", address.String(), name)
			continue
		}
		// Add a fresh address that carries only the IPNet, dropping the
		// interface attribute of the original address to avoid issues
		// when the interface is renamed.
		err = nhNs.AddrAdd(nsLink, &netlink.Addr{IPNet: address.IPNet})
		if err != nil {
			return fmt.Errorf("fail to set up address %s on namespace %s: %w", address.String(), nsPath, err)
		}
	}

	err = nhNs.LinkSetUp(nsLink)
	if err != nil {
		return fmt.Errorf("fail to set up interface %s on namespace %s: %w", nsLink.Attrs().Name, nsPath, err)
	}

	return nil
}