diff --git a/internal/fdbased_darwin/endpoint.go b/internal/fdbased_darwin/endpoint.go index b542793..91a33aa 100644 --- a/internal/fdbased_darwin/endpoint.go +++ b/internal/fdbased_darwin/endpoint.go @@ -74,12 +74,29 @@ const ( // dispatch options but the one that is supported by all underlying FD // types. Readv PacketDispatchMode = iota + // RecvMMsg enables use of recvmmsg() syscall instead of readv() to + // read inbound packets. This reduces # of syscalls needed to process + // packets. + // + // NOTE: recvmmsg() is only supported for sockets, so if the underlying + // FD is not a socket then the code will still fall back to the readv() + // path. + RecvMMsg + // PacketMMap enables use of PACKET_RX_RING to receive packets from the + // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The + // primary use-case for this is runsc which uses an AF_PACKET FD to + // receive packets from the veth device. + PacketMMap ) func (p PacketDispatchMode) String() string { switch p { case Readv: return "Readv" + case RecvMMsg: + return "RecvMMsg" + case PacketMMap: + return "PacketMMap" default: return fmt.Sprintf("unknown packet dispatch mode '%d'", p) } @@ -283,14 +300,6 @@ func New(opts *Options) (stack.LinkEndpoint, error) { return e, nil } -func isSocketFD(fd int) (bool, error) { - var stat unix.Stat_t - if err := unix.Fstat(fd, &stat); err != nil { - return false, fmt.Errorf("unix.Fstat(%v,...) failed: %v", fd, err) - } - return (stat.Mode & unix.S_IFSOCK) == unix.S_IFSOCK, nil -} - // Attach launches the goroutine that reads packets from the file descriptor and // dispatches them via the provided dispatcher. If one is already attached, // then nothing happens. diff --git a/internal/fdbased_darwin/packet_dispatchers.go b/internal/fdbased_darwin/packet_dispatchers.go index 967f2a8..f66c438 100644 --- a/internal/fdbased_darwin/packet_dispatchers.go +++ b/internal/fdbased_darwin/packet_dispatchers.go @@ -25,51 +25,31 @@ import ( "golang.org/x/sys/unix" ) -// BufConfig defines the shape of the buffer used to read packets from the NIC. -var BufConfig = []int{4, 128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768} - -// +stateify savable type iovecBuffer struct { - // buffer is the actual buffer that holds the packet contents. Some contents - // are reused across calls to pullBuffer if number of requested bytes is - // smaller than the number of bytes allocated in the buffer. - views []*buffer.View - - // iovecs are initialized with base pointers/len of the corresponding - // entries in the views defined above, except when GSO is enabled - // (skipsVnetHdr) then the first iovec points to a buffer for the vnet header - // which is stripped before the views are passed up the stack for further - // processing. + mtu int + views []*buffer.View iovecs []unix.Iovec `state:"nosave"` - - // sizes is an array of buffer sizes for the underlying views. sizes is - // immutable. - sizes []int - - // pulledIndex is the index of the last []byte buffer pulled from the - // underlying buffer storage during a call to pullBuffers. It is -1 - // if no buffer is pulled. - pulledIndex int } -func newIovecBuffer(sizes []int) *iovecBuffer { +func newIovecBuffer(mtu uint32) *iovecBuffer { b := &iovecBuffer{ - views: make([]*buffer.View, len(sizes)), - iovecs: make([]unix.Iovec, len(sizes)), - sizes: sizes, + mtu: int(mtu), + views: make([]*buffer.View, 2), + iovecs: make([]unix.Iovec, 2), } return b } func (b *iovecBuffer) nextIovecs() []unix.Iovec { - for i := range b.views { - if b.views[i] != nil { - break - } - v := buffer.NewViewSize(b.sizes[i]) - b.views[i] = v - b.iovecs[i] = unix.Iovec{Base: v.BasePtr()} - b.iovecs[i].SetLen(v.Size()) + if b.views[0] == nil { + b.views[0] = buffer.NewViewSize(4) + b.iovecs[0] = unix.Iovec{Base: b.views[0].BasePtr()} + b.iovecs[0].SetLen(4) + } + if b.views[1] == nil { + b.views[1] = buffer.NewViewSize(b.mtu) + b.iovecs[1] = unix.Iovec{Base: b.views[1].BasePtr()} + b.iovecs[1].SetLen(b.mtu) } return b.iovecs } @@ -80,25 +60,13 @@ func (b *iovecBuffer) nextIovecs() []unix.Iovec { // of b.buffer's storage must be reallocated during the next call to // nextIovecs. func (b *iovecBuffer) pullBuffer(n int) buffer.Buffer { - var views []*buffer.View - c := 0 - // Remove the used views from the buffer. - for i, v := range b.views { - c += v.Size() - if c >= n { - b.views[i].CapLength(v.Size() - (c - n)) - views = append(views, b.views[:i+1]...) - break - } - } - for i := range views { - b.views[i] = nil - } pulled := buffer.Buffer{} - for _, v := range views { - pulled.Append(v) - } + pulled.Append(b.views[0]) + pulled.Append(b.views[1]) pulled.Truncate(int64(n)) + pulled.TrimFront(4) + b.views[0] = nil + b.views[1] = nil return pulled } @@ -147,7 +115,12 @@ func newRecvMMsgDispatcher(fd int, e *endpoint, opts *Options) (linkDispatcher, if err != nil { return nil, err } - batchSize := int((512*1024)/(opts.MTU)) + 1 + var batchSize int + if opts.MTU < 49152 { + batchSize = int((512*1024)/(opts.MTU)) + 1 + } else { + batchSize = 1 + } d := &recvMMsgDispatcher{ StopFD: stopFD, fd: fd, @@ -155,9 +128,8 @@ func newRecvMMsgDispatcher(fd int, e *endpoint, opts *Options) (linkDispatcher, bufs: make([]*iovecBuffer, batchSize), msgHdrs: make([]rawfile.MsgHdrX, batchSize), } - bufConfig := []int{4, int(opts.MTU)} for i := range d.bufs { - d.bufs[i] = newIovecBuffer(bufConfig) + d.bufs[i] = newIovecBuffer(opts.MTU) } d.gro.Init(false) d.mgr = newProcessorManager(opts, e) @@ -178,12 +150,11 @@ func (d *recvMMsgDispatcher) release() { func (d *recvMMsgDispatcher) dispatch() (bool, tcpip.Error) { // Fill message headers. for k := range d.msgHdrs { - if d.msgHdrs[k].Msg.Iovlen > 0 { - break - } iovecs := d.bufs[k].nextIovecs() iovLen := len(iovecs) - d.msgHdrs[k].DataLen = 0 + // Cannot clear only the length field. Older versions of the darwin kernel will check whether other data is empty. + // https://github.com/Darm64/XNU/blob/xnu-2782.40.9/bsd/kern/uipc_syscalls.c#L2026-L2048 + d.msgHdrs[k] = rawfile.MsgHdrX{} d.msgHdrs[k].Msg.Iov = &iovecs[0] d.msgHdrs[k].Msg.SetIovlen(iovLen) } @@ -209,7 +180,6 @@ func (d *recvMMsgDispatcher) dispatch() (bool, tcpip.Error) { for k := 0; k < nMsgs; k++ { n := int(d.msgHdrs[k].DataLen) payload := d.bufs[k].pullBuffer(n) - payload.TrimFront(4) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Payload: payload, }) diff --git a/tun_darwin.go b/tun_darwin.go index ed04234..80e5e94 100644 --- a/tun_darwin.go +++ b/tun_darwin.go @@ -111,14 +111,14 @@ func New(options Options) (Tun, error) { unix.Close(tunFd) return nil, err } - err = configure(tunFd, batchSize) + err = configure(tunFd, options.MTU, batchSize) if err != nil { unix.Close(tunFd) return nil, err } } else { tunFd = options.FileDescriptor - err := configure(tunFd, batchSize) + err := configure(tunFd, options.MTU, batchSize) if err != nil { return nil, err } @@ -332,15 +332,17 @@ func create(tunFd int, ifIndex int, name string, options Options) error { return nil } -func configure(tunFd int, batchSize int) error { +func configure(tunFd int, tunMTU uint32, batchSize int) error { err := unix.SetNonblock(tunFd, true) if err != nil { return os.NewSyscallError("SetNonblock", err) } - const UTUN_OPT_MAX_PENDING_PACKETS = 16 - err = unix.SetsockoptInt(tunFd, 2, UTUN_OPT_MAX_PENDING_PACKETS, batchSize) - if err != nil { - return os.NewSyscallError("SetsockoptInt UTUN_OPT_MAX_PENDING_PACKETS", err) + if tunMTU < 49152 { + const UTUN_OPT_MAX_PENDING_PACKETS = 16 + err = unix.SetsockoptInt(tunFd, 2, UTUN_OPT_MAX_PENDING_PACKETS, batchSize) + if err != nil { + return os.NewSyscallError("SetsockoptInt UTUN_OPT_MAX_PENDING_PACKETS", err) + } } return nil } @@ -352,12 +354,19 @@ func (t *NativeTun) BatchSize() int { func (t *NativeTun) BatchRead() ([]*buf.Buffer, error) { for i := 0; i < t.batchSize; i++ { iovecs := t.iovecs[i].nextIovecs() - t.msgHdrs[i].DataLen = 0 + // Cannot clear only the length field. Older versions of the darwin kernel will check whether other data is empty. + // https://github.com/Darm64/XNU/blob/xnu-2782.40.9/bsd/kern/uipc_syscalls.c#L2026-L2048 + t.msgHdrs[i] = rawfile.MsgHdrX{} t.msgHdrs[i].Msg.Iov = &iovecs[0] t.msgHdrs[i].Msg.Iovlen = 2 } n, errno := rawfile.BlockingRecvMMsgUntilStopped(t.stopFd.ReadFD, t.tunFd, t.msgHdrs) if errno != 0 { + for k := 0; k < n; k++ { + t.iovecs[k].buffer.Release() + t.iovecs[k].buffer = nil + } + t.buffers = t.buffers[:0] return nil, errno } if n < 1 { diff --git a/tun_darwin_gvisor.go b/tun_darwin_gvisor.go index 16ecbe7..6a2286d 100644 --- a/tun_darwin_gvisor.go +++ b/tun_darwin_gvisor.go @@ -12,14 +12,17 @@ var _ GVisorTun = (*NativeTun)(nil) func (t *NativeTun) NewEndpoint() (stack.LinkEndpoint, stack.NICOptions, error) { ep, err := fdbased.New(&fdbased.Options{ - FDs: []int{t.tunFd}, - MTU: t.options.MTU, - RXChecksumOffload: true, + FDs: []int{t.tunFd}, + MTU: t.options.MTU, + RXChecksumOffload: true, + PacketDispatchMode: fdbased.RecvMMsg, }) if err != nil { return nil, stack.NICOptions{}, err } - return ep, stack.NICOptions{ - QDisc: fifo.New(ep, 1, 1000), - }, nil + var nicOptions stack.NICOptions + if t.options.MTU < 49152 { + nicOptions.QDisc = fifo.New(ep, 1, 1000) + } + return ep, nicOptions, nil }