diff --git a/.gitignore b/.gitignore index 66fd13c..9b0d83c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,15 +1,15 @@ -# Binaries for programs and plugins -*.exe -*.exe~ -*.dll -*.so -*.dylib - -# Test binary, built with `go test -c` -*.test - -# Output of the go coverage tool, specifically when used with LiteIDE -*.out - -# Dependency directories (remove the comment below to include it) -# vendor/ +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, built with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Dependency directories (remove the comment below to include it) +# vendor/ diff --git a/LICENSE b/LICENSE index 0a18129..11fde47 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,21 @@ -MIT License - -Copyright (c) 2021 impact-eintr - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
+MIT License + +Copyright (c) 2021 impact-eintr + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md index aa2c950..0210d8e 100644 --- a/README.md +++ b/README.md @@ -1,733 +1,733 @@ -# netstack -用golang 实现tcp/ip协议 - -src_code 是用来学习的源码 - -# TCPIP 和开放系统互连(OSI)模型 - -本节先简单介绍互联网的发展史,然后讲解 TCPIP 和开放系统互连(OSI)模型,最后会介绍一下用户态协议栈的整体框架。 - -## 计算机互联网发展史 -最开始的时候计算机是单独运算的,一般有一个大型主机放在那里,然后可以多个终端连接一个主机进行操作。那时候美国国防部认为,如果仅有一个集中的军事指挥中心,万一这个中心被原苏联摧毁,全国的军事指挥将处于瘫痪状态,其后果将不堪设想,因此有必要设计这样一个分散的指挥系统,它由一个个分散的指挥点组成,当部分指挥点被摧毁后其它点仍能正常工作,而这些分散的点又能通过某种形式的通讯网取得联系,这个研究项目就是著名的 ARPANET(阿帕网),也就是互联网的前身。在 1969 年,ARPANET 真正把计算机第一次互联,使用 BBN 公司开发的接口消息处理器(IMP)建立节点。当时的详细节点信息如下: - -``` markdown -节点1:UCLA(8月30日,9月2日接入) -功能:网络测量中心 -主机、操作系统:SDS SIGMA 7 - -节点2:斯坦福研究院(SRI)(10月1日) -功能:网络信息中心(NIC) -主机、操作系统:SDS940 -Doug Engelbart有关Augmentation of Human Intellect的计划 - -节点3:加州大学圣巴巴拉分校(UCSB)(11月1日) -功能:Culler-Fried交互式数学 -主机、操作系统:IBM 360/75 - -节点4:Utah大学(12月) -功能:图形处理 -主机、操作系统:DEC PDP-10 -由Steve Crocker编写第一份RFC文件Host Software(1969年4月7日)。 -REC 4:Network Timetable -UCLA的Charley Kline试图登录到SRI上,发出了第一个数据包, -他的第一次尝试在键入LOGIN的G的时候引起了系统的崩溃。(1969年10月20日或29日,需查实) -``` - -可以看出,当时硬件和系统都不是统一的,当然通信的接口也是不统一的,他们需要 IMP 连接和处理才能彼此通信。 - -> ARPANET 的特点 - -- 可以共享硬件、软件和数据库资源。 -- 利用分散控制结构。 -- 应用分组交换技术(包交换技术)。 -- 运用高功能的通信处理机。 -- 采用分层的网络协议。 - -ARPANET 早期使用一种网络控制协议(Network Control Protocol,NCP)来达到主机与主机之间的通信,但是它无法和个别的计算机网络做交流,因为设备之间没有一个标准协议。1972 年,ARPANET 项目组的核心成员 Vinton Cerf 和 Bob Kahn 开始合作开展所谓的网络互联相互(Interneting Project)。他们希望连接不同的网络,使得一个网络上的主机能够与另一个主机网络上进行通信,需要克服的问题很多:不同的分组大小、不同的接口类型、不同的传输速率、以及不同的可靠性要求。Cerf 和 Kahn 提出利用被称为网关的一种设备作为中间的硬件,进行一个网络到另一个网络的数据传输。 - -之后 Cerf 和 Kahn 在 1974 年发表了里程碑式的文章 Protocol for Packet Network Interconnection,描述了实现端到端数据投递的协议,这是一个新版的 NCP,叫传输控制协议(TCP)。这篇文章包括了封装、数据报、网关的功能等概念,其中主要思想是把纠错功能从 IMP 移到了主机。同时该协议(TCP)被应用到 ARPANET 网络,但是此时依然没有形成一个网络标准,各种协议并存包括 NCP,TCP 等协议。 - -在 1977 年后,TCP 被拆分成两个网络协议:传输控制协议(TCP)和因特网协议(IP),IP 处理数据包的路由选择,TCP 负责高层次的功能,如分段、重组、检错。这个新的联合体就是人们熟知的 TCP/IP。 - -1980 年发表 UDP 协议。 - -1981 年 UNIX 系统集成了 TCP/IP 协议栈,包含网络软件的流行操作系统对网络的普及起了很大的作用。 - -1983 年原先的交流协议 NCP 被禁用,TCP/IP 协议变成了 ARPANET 
的正式协议,同时 ARPANET 分裂成两个网络:军用网(MILNET)和非军用的 ARPANET。之后,NCP 成为历史,TCP/IP 开始成为通用协议。 - -1984 年 ISO 发布了开放式系统互联模型(OSI)。 - -再之后,互联网极速发展,更多的主干网被搭建,更多的主机连接进来,直至组成了世界互联的巨大网络。 - -## OSI 模型和 TCPIP 模型 - -> OSI 模型 - -|层级|level| -|:------:|:------------:| -| 应用层 | Application | -| 表示层 | Presentation | -| 会话层 | Session | -| 传输层 | Transport | -| 网络层 | Network | -| 链路层 | Link | -| 物理层 | Physical | - -> TCPIP 模型 - -|层级|level| -|:------:|:------------:| -| 应用层 | Application| -| 传输层 | Transport| -| 网络层 | Network| -| 链路层 | Link| - -虽然现实中实现的协议栈都是 TCP/IP 模型,但是我们也需要了解 OSI 模型,它很有参考意义,我们平常交流讲到网络分层的时候都是用 OSI 模型来讲的,所以开发者一般实现的时候是 TCPIP 模型,但和别人讨论的时候是 OSI 模型。比如我们一般讲的二层网络,三层网络指的是 OSI 模型的链路层和网络层。下面介绍 TCP/IP 模型各层功能。 - -## TCPIP 各个层的主要功能 -### 链路层 -链路层也是将数据包发送到另一台主机,但是这两台主机一定是同个局域网的(不考虑广域网二层打通的情况),链路层负责将网络层交下来的 IP 数据报组装成帧,在两个相邻节点间的链路上传送帧。链路层的通信就像在一栋小楼里面互相讲话一下,小明想与小红讲话,只要在楼里喊一下,“小红你在吗?”,小红听到了就会回复说,“小明,我在啊”。小明在喊小红的时候,在这栋楼里的其他人也听得到,这种行为叫广播。链路层网络不适合大型网络,因为一旦主机多了,广播会比较占用资源,就像楼里大家都在喊别人一下,听起来会很乱。 - -## 网络层 -网络层负责将数据报从一台主机发送到一台目标主机上(注意:这两个主机可以不相邻),并给每个主机分配一个地址。最著名的就是 IP 协议了,每个主机都至少有一个 IP 地址,根据路由策略将收到数据报发往下一个主机,这个过程就叫路由转发,它是实现国际网的基础。对于网络层的通信,小明和小红就不是在一栋楼里了,他们可能隔了一个省,此时小明再怎么喊,小红也听不到,怎么办?那就寄信封吧,把信写好,交给邮差,邮差根据地址送给下一个驿站,驿站再根据地址送给下一站,知道送到小红那,这个过程就很像路由,根据目的地址选择下一跳地址。有时候小明有太多话想跟小红讲,导致一封信已经装不下了,那么就会用几张信封来装信件,这个过程就像分片,因上层数据包太大,将数据包切割。当然逆向过程就叫重组。 - -### 传输层 -**传输层最主要的目的就是给两个应用程序传输数据,注意是两个程序,不是两个主机。***主要的协议有 tcp 和 udp,tcp 为应用提供了虚拟连接的服务,也提供了数据的可靠性。udp 提供的是无连接服务,也不提供可靠服务,仅仅实现让两个程序之间交换数据。 - -### 应用层 -应用层是利用传输层的接口来实现用户自定义的网络应用,例如 HTTP 应用,SMTP(邮件传输)应用等。正因为应用层各色各样的应用,才让网络传输有了意义。比如微信,QQ,淘宝网等,这些我们常见的应用都离不开网络的传输。 - -### 为何要分层 -分层当然是有原因的,主要的目的是为了灵活性和方便实现。分层可以允许供应商进行独立开发,各层通过一个接口在相邻层通信。每层只要专注自己的事情,而不是关心其他层,这样方便软件或者硬件的实现,定义好每个层之间的接口,更改一层的内部实现,不会影响其他层,这样更灵活。比如,TCP 协议就不管下一层是 ipv4 还是 ipv6,它们都实现了网络层的接口,能寻址发送和接收数据包。 - -这种思想到处可见,我们要解决一个复杂的问题时,一般都是拆分层小问题,然后分别解决小问题,分层也是一样,它的本质就是为了分离关注点而让问题简单化或者更高效。 - - -## 封装与解封装 
-各个层需要加上自己的控制信息,就会为每层定义自己的控制信息,不同的协议层对数据包有不同的称谓,在传输层叫做段(segment),在网络层叫做数据报(datagram),链路层叫做帧(frame),物理层层叫比特,数据封装成帧后发到传输介质上,到达目的主机后每层协议再剥掉相应的首部,最后将应用层数据交给应用程序处理。 - -每层发送数据时加上自己的控制信息叫封装。 -![img](img/document-uid949121labid10418timestamp1555394988939.png) - -收到数据包剥掉相应的首部叫解封装,解封装的时候会得到上层的协议,去除本层首部信息后,将会把数据包分发给上层协议。 - -![img](img/document-uid949121labid10418timestamp1555395022259.png) - -## 你是如何冲浪的呢 -![img](img/document-uid949121labid10418timestamp1555395048260.png) -一般情况家里的上网流程如下,但不是一定是这样,请读者注意! - -首先你得购买互联网服务提供商(ISP,如:中国电信)提供的账号密码; -- 启动家用路由器,假设路由器内网地址为 192.168.1.1,接着配置账号密码,通过拨号和 ISP 建立连接,ISP 会返回一个公网 IP 地址,假如 IP 为 1.1.10.1; -- 然后再把电脑插到家用路由器的网口上,那么电脑就获取到了内网 IP 地址,假如为 192.168.1.2,这时候家用路由器就是电脑的默认网关,和家用路由器的相连的网卡假设为 en0; -- 当在浏览器访问 https://www.baidu.com 时,浏览器会发起 DNS 请求得到对应的 IP,假如为 180.97.33.18,DNS 请求的详细过程我们暂时忽略; -- 拿到 IP 后,浏览器会使用 tcp 连接系统调用和远端主机建立连接,系统调用会进入内核; -- 内核先通过路由最长匹配查询目标 IP 下一跳地址,也就是邻居地址,比如目的 180.97.33.18 会匹配下一跳地址 192.168.1.1; -- 内核接着查询 ARP 表,得知下一跳地址的网卡和物理 MAC 地址,如果没有查询到,则会发送广播 ARP 请求,得到 MAC 地址; -- 到目前为止发送 tcp 报文所需的信息都有了,目标 IP 和目标 MAC 地址,此时系统会给 tcp 的连接分配一个源端口,假如为 33306; -- 之后完成 tcp 三次握手,将 HTTP 请求报文封装在 tcp 数据段中发送给网卡 en0; -- 家用路由器接收到电脑的数据报,经过源地址转换(SNAT),将数据报文发送给 ISP; -- ISP 通过路由表选择发送给下一个路由,经过多个路由转发最终达到百度的服务主机; -- 百度服务器得到电脑发送的报文,返回 HTTP 响应,按原路返回给家用路由器; -- 家用路由器接收到 HTTP 响应报文后,经过目标地址转换(DNAT),将数据发送给电脑; -- 电脑上的浏览器接收到 HTTP 响应,渲染页面,呈现出网页; - -## 协议栈整体框架 -本课程参考netstack,基于 linux 的 tap 网卡,来实现一个用户态的 tcp/ip 协议栈。 - -协议栈的整体架构如下: -|协议|网络层级| -|:--:|:------:| -| tcp/udp | transport| -| arp/ipv4/ipv6 | network| -| vnic | nic_manager| -| tap | link| - -- 链路层我们用 tap 网卡实现,它是 linux 下的一种虚拟网卡,能提供链路层的功能,发送和接收以太网帧。 -- 协议栈还实现了对虚拟网卡的一定管理,就像 linux 对物理网卡的管理一样。 -- 网络层实现了 arp、ipv4 和 ipv6 协议,arp 协议虽然被划分在网络层,但是链路层的工作离不开它,所以第二章讲链路层通信的时候会介绍。ipv4 协议中的 icmp 和 ipv6 中的邻居协议也实现了,后面会讲解 ipv4 和 icmp 协议。 -- 传输层会实现了 tcp 和 udp,在讲传输层之前会先介绍端口的概念。传输层中的 tcp 实现应该是整个协议栈中最复杂的,会按功能拆分来讲解。 - -值得注意的是这里说的协议栈是主机上的协议栈,不是交换机,也不是路由器的协议栈。 - -整体来说,实现一个协议栈并没有想象中的那么复杂,如果排除了 tcp 
的各种机制实现,那么协议栈其实很简单,就是封装与解封装的过程,所以协议栈的代码有很大部分也是各层对数据包的封装与解封装。tcp 的实现会根据每个特性来拆分讲解和实现,以便更容易理解和实现。 - -# 链路层 -## 链路层的介绍和基本实现 -本节主要介绍链路层的基本实现,主要讲以太网网卡、虚拟网卡和 arp 协议。 - -### 链路层的目的 -数据链路层属于计算机网络的底层,使用的信道主要有点对点信道和广播信道两种类型。 在 TCP/IP 协议族中,数据链路层主要有以下几个目的: - -1. 接收和发送链路层数据,提供 io 的能力。 -2. 为 IP 模块发送和接收数据 -3. 为 ARP 模块发送 ARP 请求和接收 ARP 应答 -4. 为 RARP 模块发送 RARP 请求和接收 RARP 应答 - -**TCP/IP 支持多种不同的链路层协议,这取决于网络所使用的硬件。** -数据链路层的协议数据单元——`帧`:将 IP 层(网络层)的数据报添加首部和尾部封装成帧。 -数据链路层协议有许多种,都会解决三个基本问题,封装成帧,透明传输,差错检测。 - -### 以太网介绍 -我们这章讲的是链路层,为何要讲以太网,那是因为以太网实在应用太广了,以至于我们在现实生活中看到的链路层协议的数据封装都是以太网协议封装的,所以要实现链路层数据的处理,我们必须要了解以太网。 - -以太网(Ethernet)是一种计算机局域网技术。IEEE 组织的 IEEE 802.3 标准制定了以太网的技术标准,它规定了包括物理层的连线、电子信号和介质访问层协议的内容。以太网是目前应用最普遍的局域网技术,取代了其他局域网标准如令牌环、FDDI 和 ARCNET。以太网协议,是当今现有局域网采用的最通用的通信协议标准,故可认为以太网就是局域网。 - -### 链路层的寻址 -通信当然得知道发送者的地址和接受者的地址,这是最基础的。以太网规定,所有连入网络的设备,都必须具有“网卡”接口。然后**数据包是从一块网卡,传输到另一块网卡的**。网卡的地址,就是数据包的发送地址和接收地址,叫做 MAC 地址,也叫物理地址,这是最底层的地址。每块网卡出厂的时候,都有一个全世界独一无二的 MAC 地址,长度是 48 个二进制位,通常用 12 个十六进制数表示。有了这个地址,我们可以定位网卡和数据包的路径了。 - -### MTU(最大传输单元) -**MTU 表示在链路层最大的传输单元,也就是链路层一帧数据的数据内容最大长度,单位为字节**,MTU 是协议栈实现一个很重要的参数,请大家务必理解该参数。一般网卡默认 MTU 是 1500,当你往网卡写入的内容超过 1518bytes,就会报错,后面我们可以写代码试试。 - -![img](img/document-uid949121labid10418timestamp1555399038307.png ) -上面的图片是 linux 上链路层的实现,链路层的实现可以分为三层,真实的以太网卡,网卡驱动,网卡逻辑抽象。 - -真实的网卡我们不关心,因为那是硬件工程,我们只需要知道,它能接收和发送网络数据给网卡驱动就好了。网卡驱动我们也不关心,一般驱动都是网卡生产商就写好了,我们只需知道,它能接收协议栈的数据发送给网卡,接收网卡的数据发送给协议栈。网卡逻辑抽象表示,这个是我们关心的,我需要对真实的网卡进行抽象, - -在系统中表示,也需要对抽象的网卡进行管理。 - -> 注意:后面系统中网卡的逻辑抽象我们都描述为网卡。 - -比如在 linux 上,当你敲下 ifconfig 命令,会输出类似如下内容: - -``` bash -eth0 Link encap:Ethernet HWaddr 00:16:3e:08:a1:7a - inet addr:172.18.153.158 Bcast:172.18.159.255 Mask:255.255.240.0 - UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 - RX packets:285941546 errors:0 dropped:0 overruns:0 frame:0 - TX packets:281609568 errors:0 dropped:0 overruns:0 carrier:0 - collisions:0 txqueuelen:1000 - RX bytes:142994767953 (142.9 GB) TX bytes:44791940275 (44.7 GB) - -lo Link encap:Local Loopback - inet addr:127.0.0.1 
Mask:255.0.0.0 - UP LOOPBACK RUNNING MTU:65536 Metric:1 - RX packets:363350690 errors:0 dropped:0 overruns:0 frame:0 - TX packets:363350690 errors:0 dropped:0 overruns:0 carrier:0 - collisions:0 txqueuelen:1 - RX bytes:28099158493 (28.0 GB) TX bytes:28099158493 (28.0 GB) -``` - -示例里显示了两个网卡,一个 eth0 以太网网卡,一个 lo 本地回环网卡。还可以看到两个网卡的信息,当我们要表示一个网卡的时候,需要具备几个属性: - -1. 网卡的名字、类型和 MAC 地址 -- eth0 Link encap:Ethernet HWaddr 00:16:3e:08:a1:7a - - eth0是网卡名,方便表示一个网卡,网卡名在同个系统里不能重复 - - Link encap:Ethernet 表示该网卡类型为以太网网卡。 - - HWaddr 00:16:3e:08:a1:7a 表示 MAC 地址 00:16:3e:08:a1:7a,是链路层寻址的地址。 -2. 网卡的 IP 地址及掩码 -- inet addr:172.18.153.158 Bcast:172.18.159.255 Mask:255.255.240.0 - - inet addr:172.18.153.158 表示该网卡的 ipv4 地址是 172.18.153.158。 - - Bcast:172.18.159.255 表示该网卡 ip 层的广播地址。 - - 255.255.240.0 该网卡的子网掩码。 -3. 网卡的状态和 MTU -- UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 - - UP BROADCAST RUNNING MULTICAST都是表示网卡的状态 - - UP(代表网卡开启状态) - - BROADCAST (支持广播) - - RUNNING(代表网卡的网线被接上) - - MULTICAST(支持组播)。 - - MTU:1500 最大传输单元为 1500 字节。 - - Metric:1 接口度量值为 1,接口度量值表示在这个路径上发送一个分组的成本。 - -### linux的虚拟网卡介绍 -实现协议栈,我们需要一个网卡,因为这样我们才能接收和发送网络数据,但是一般情况下,我们电脑的操作系统已经帮我们管理好网卡了,我们想实现自由的控制网卡是不太方便的,还好 linux 系统还有另一个功能-虚拟网卡,它是操作系统虚拟出来的一个网卡,我们协议栈的实现都是基于虚拟网卡,用虚拟网卡的好处是: - -对于用户来说虚拟网卡和真实网卡几乎没有差别,而且我们控制或更改虚拟网卡大部分情况下不会影响到真实的网卡,也就不会影响到用户的网络。 -虚拟网卡的数据可以直接从用户态直接读取和写入,这样我们就可以直接在用户态编写协议栈。 -Linux 中虚拟网络设备 -TUN/TAP 设备、VETH 设备、Bridge 设备、Bond 设备、VLAN 设备、MACVTAP 设备,下面我们只讲 tun/tap 设备,其他虚拟设备感兴趣的同学可以去网上自行搜索。 - -TAP/TUN 设备是一种让用户态和内核之间进行数据交换的虚拟设备,TAP 工作在二层,TUN 工作在三层,TAP/TUN 网卡的两头分别是内核网络协议栈和用户层,其作用是将协议栈中的部分数据包转发给用户空间的应用程序,给用户空间的程序一个处理数据包的机会。 - -当我们想在 linux 中创建一个 TAP 设备时,其实很容易,像普通文件一样打开字符设备 /dev/net/tun 可以得到一个文件描述符,接着用系统调用 ioctl 将文件描述符和 kernel 的 tap 驱动绑定在一起,那么之后对该文件描述符的读写就是对虚拟网卡 TAP 的读写。详细的实现可以看 (tuntap)[https://www.kernel.org/doc/Documentation/networking/tuntap.txt] 所以最终我们实现的协议栈和 TAP 虚拟网卡的关系,如下图: - - `userland netstack` <- `tap` <- kernel` - -### tap网卡实验 -在 linux 中创建虚拟网卡,我们可以用 linux 自带的 ip 命令来实现,关于 ip 命令的更多用法请看 man ip。 - -创建 tap 网卡 - -#### 
创建一个tap模式的虚拟网卡tap0 - -``` bash - -sudo ip tuntap add mode tap tap0 -``` - -#### 开启该网卡 - -``` bash - -sudo ip link set tap0 up -``` - -#### 设置该网卡的ip及掩码 - -``` bash - -sudo ip addr add 192.168.1.1/24 dev tap0 -``` - -我们创建一个为名 tap0,ip 及掩码为 192.168.1.1/24 的虚拟网卡,执行 ifconfig 看看,会看到一个 tap0 的网卡: - -``` bash -tap0 Link encap:Ethernet HWaddr 22:e2:f2:93:ff:bf - inet addr:192.168.1.1 Bcast:0.0.0.0 Mask:255.255.255.0 - UP BROADCAST MULTICAST MTU:1500 Metric:1 - RX packets:0 errors:0 dropped:0 overruns:0 frame:0 - TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 - collisions:0 txqueuelen:1000 - RX bytes:0 (0.0 B) TX bytes:0 (0.0 B) - -``` - - -删除网卡可以使用如下命令: - -#### 删除虚拟网卡 - -``` bash - -sudo ip tuntap del mode tap tap0 -``` - -看起来和真实的网卡没有任何区别,接下来我们自己用 golang 来实现创建网卡。 - -golang 创建 tuntap 网卡的库实现,在 netstack/tcpip/link/tuntap 目录下可以查看源文件 tuntap.go 的代码: - -``` go -// +build linux - -package tuntap - -import ( - "errors" - "fmt" - "os/exec" - "syscall" - "unsafe" -) - -const ( - TUN = 1 - TAP = 2 -) - -var ( - ErrDeviceMode = errors.New("unsupport device mode") -) - -type rawSockaddr struct { - Family uint16 - Data [14]byte -} - -// 虚拟网卡设置的配置 -type Config struct { - Name string // 网卡名 - Mode int // 网卡模式,TUN or TAP -} - -// NewNetDev根据配置返回虚拟网卡的文件描述符 -func NewNetDev(c *Config) (fd int, err error) { - switch c.Mode { - case TUN: - fd, err = newTun(c.Name) - case TAP: - fd, err = newTAP(c.Name) - default: - err = ErrDeviceMode - return - } - if err != nil { - return - } - return -} - -// SetLinkUp 让系统启动该网卡 -func SetLinkUp(name string) (err error) { - // ip link set up - out, cmdErr := exec.Command("ip", "link", "set", name, "up").CombinedOutput() - if cmdErr != nil { - err = fmt.Errorf("%v:%v", cmdErr, string(out)) - return - } - return -} - -// SetRoute 通过ip命令添加路由 -func SetRoute(name, cidr string) (err error) { - // ip route add 192.168.1.0/24 dev tap0 - out, cmdErr := exec.Command("ip", "route", "add", cidr, "dev", name).CombinedOutput() - if cmdErr != nil { - err = 
fmt.Errorf("%v:%v", cmdErr, string(out)) - return - } - return -} - -// AddIP 通过ip命令添加IP地址 -func AddIP(name, ip string) (err error) { - // ip addr add 192.168.1.1 dev tap0 - out, cmdErr := exec.Command("ip", "addr", "add", ip, "dev", name).CombinedOutput() - if cmdErr != nil { - err = fmt.Errorf("%v:%v", cmdErr, string(out)) - return - } - return -} - -func GetHardwareAddr(name string) (string, error) { - fd, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0) - if err != nil { - return "", err - } - - defer syscall.Close(fd) - - var ifreq struct { - name [16]byte - addr rawSockaddr - _ [8]byte - } - - copy(ifreq.name[:], name) - _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.SIOCGIFHWADDR, uintptr(unsafe.Pointer(&ifreq))) - if errno != 0 { - return "", errno - } - - mac := ifreq.addr.Data[:6] - return string(mac[:]), nil -} - -// newTun新建一个tun模式的虚拟网卡,然后返回该网卡的文件描述符 -// IFF_NO_PI表示不需要包信息 -func newTun(name string) (int, error) { - return open(name, syscall.IFF_TUN|syscall.IFF_NO_PI) -} - -// newTAP新建一个tap模式的虚拟网卡,然后返回该网卡的文件描述符 -func newTAP(name string) (int, error) { - return open(name, syscall.IFF_TAP|syscall.IFF_NO_PI) -} - -// 先打开一个字符串设备,通过系统调用将虚拟网卡和字符串设备fd绑定在一起 -func open(name string, flags uint16) (int, error) { - // 打开tuntap的字符设备,得到字符设备的文件描述符 - fd, err := syscall.Open("/dev/net/tun", syscall.O_RDWR, 0) - if err != nil { - return -1, err - } - - var ifr struct { - name [16]byte - flags uint16 - _ [22]byte - } - - copy(ifr.name[:], name) - ifr.flags = flags - // 通过ioctl系统调用,将fd和虚拟网卡驱动绑定在一起 - _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.TUNSETIFF, uintptr(unsafe.Pointer(&ifr))) - if errno != 0 { - syscall.Close(fd) - return -1, errno - } - return fd, nil -} -``` - -根据这个库,我们写一个从网卡读取数据的程序,并打印读取到的字节数。新建文件 tcpip/lab/link/tap1/main.go,输入如下代码: - -``` go -package main - -import ( - "log" - "tcpip/netstack/tcpip/link/rawfile" - "tcpip/netstack/tcpip/link/tuntap" -) - -func main() { - tapName := "tap0" - c := 
&tuntap.Config{tapName, tuntap.TAP} - fd, err := tuntap.NewNetDev(c) - if err != nil { - panic(err) - } - - // 启动tap网卡 - _ = tuntap.SetLinkUp(tapName) - // 添加ip地址 - _ = tuntap.AddIP(tapName, "192.168.1.1/24") - - buf := make([]byte, 1<<16) - for { - rn, err := rawfile.BlockingRead(fd, buf) - if err != nil { - log.Println(err) - continue - } - log.Printf("read %d bytes", rn) - } -} -``` - - -copy -然后进入目录 tcpip/lab/link/tap1 编译代码。 - -``` bash - -cd ~/tcpip/lab/link/tap1/ -go build -``` - -会生成一个叫 tap1 的可执行文件,我们执行它 - -``` bash - -sudo ./tap1 -``` - -稍等一会再打开另一个终端,利用 tcpdump 抓取经过 tap0 网卡的数据,如果执行 tap1,立马就抓包,可能会抓到一些 ipv6 的组播包,我们这里先忽略。 - -``` bash - -sudo tcpdump -i tap0 -n -``` - -再打开另一个终端,我们试 ping 一下 192.168.1.1 - -``` bash - -ping 192.168.1.1 -``` - - -但是 tcpdump 抓取数据的终端和我们自己写的打印网卡数据的终端中没有任何 icmp 数据,这是为何?这是因为当给一个网卡添加 ip 地址的时候,系统会将相应的路由添加到“本地路由表”,正因为这样,即使看起来 192.168.1.1 是 tap0 网卡的地址,但实际上我们 ping 的数据并没有走到 tap0 网卡,而是在 lo 网卡上,我们可以试试在终端抓去 lo 网卡数据 - -``` bash - -sudo tcpdump src 192.168.1.1 -i lo -n -``` - -再 ping 一下 192.168.1.1 ,查看 tcpdump 的输出: - -``` bash -listening on lo, link-type EN10MB (Ethernet), capture size 262144 bytes -22:40:18.028585 IP 192.168.1.1 > 192.168.1.1: ICMP echo request, id 29728, seq 1, length 64 -22:40:18.028599 IP 192.168.1.1 > 192.168.1.1: ICMP echo reply, id 29728, seq 1, length 64 -22:40:19.029912 IP 192.168.1.1 > 192.168.1.1: ICMP echo request, id 29728, seq 2, length 64 -22:40:19.029925 IP 192.168.1.1 > 192.168.1.1: ICMP echo reply, id 29728, seq 2, length 64 -``` - -查看本地路由的信息,通过 ip route show table local 命令。 - -``` bash -broadcast 10.211.55.0 dev enp0s5 proto kernel scope link src 10.211.55.14 -broadcast 10.211.55.0 dev enp0s6 proto kernel scope link src 10.211.55.16 -local 10.211.55.14 dev enp0s5 proto kernel scope host src 10.211.55.14 -local 10.211.55.16 dev enp0s6 proto kernel scope host src 10.211.55.16 -broadcast 10.211.55.255 dev enp0s5 proto kernel scope link src 10.211.55.14 -broadcast 10.211.55.255 dev enp0s6 proto kernel scope link src 
10.211.55.16 -broadcast 127.0.0.0 dev lo proto kernel scope link src 127.0.0.1 -local 127.0.0.0/8 dev lo proto kernel scope host src 127.0.0.1 -local 127.0.0.1 dev lo proto kernel scope host src 127.0.0.1 -broadcast 127.255.255.255 dev lo proto kernel scope link src 127.0.0.1 -broadcast 192.168.1.0 dev tap0 proto kernel scope link src 192.168.1.1 -local 192.168.1.1 dev tap0 proto kernel scope host src 192.168.1.1 -broadcast 192.168.1.255 dev tap0 proto kernel scope link src 192.168.1.1 - -``` - -可以看到倒数第二行,表示了 192.168.1.1 这个地址,在 local 路由表里。同时路由表也显示,只有 192.168.1.1 这个地址在路由表里,该网段的其他地址不在本地路由,那么应该会进入 tap0 网卡,比如我们试试 192.168.1.2 这个地址,ping 一下 - -``` bash - -PING 192.168.1.2 (192.168.1.2) 56(84) bytes of data. -From 192.168.1.1 icmp_seq=1 Destination Host Unreachable -From 192.168.1.1 icmp_seq=2 Destination Host Unreachable -``` - -然后 tcpdump 在 tap0 网卡上的输出 - -``` bash - -listening on tap0, link-type EN10MB (Ethernet), capture size 262144 bytes -22:55:58.322022 ARP, Request who-has 192.168.1.2 tell 192.168.1.1, length 28 -22:55:59.320824 ARP, Request who-has 192.168.1.2 tell 192.168.1.1, length 28 -``` - -说明 tap0 网卡收到了 arp 请求,至于我们使用 ping 之后为何接收到的是 arp 请求报文而不是 icmp 报文,这是因为系统不知道 192.168.1.2 的 MAC 地址,后面会详细说明。 - -在上面的程序中,我们也可以看到上面的程序有打印: - -``` bash -2018/11/11 23:54:10 read 42 bytes -2018/11/11 23:54:11 read 42 bytes -2018/11/11 23:54:12 read 42 bytes -2018/11/11 23:54:13 read 42 bytes - -``` - -其实在链路层通信,是可以不需要 ip 地址的,我们可以手动配置路由,将数据导入虚拟网卡,现在更改我们的程序,代码存放在 tcpip/lab/link/tap2/main.go: - -``` go -package main - -import ( - "log" - "tcpip/netstack/tcpip/link/rawfile" - "tcpip/netstack/tcpip/link/tuntap" -) - -func main() { - tapName := "tap0" - c := &tuntap.Config{tapName, tuntap.TAP} - fd, err := tuntap.NewNetDev(c) - if err != nil { - panic(err) - } - - // 启动tap网卡 - _ = tuntap.SetLinkUp(tapName) - // 设置路由 - _ = tuntap.SetRoute(tapName, "192.168.1.0/24") - - buf := make([]byte, 1<<16) - for { - rn, err := rawfile.BlockingRead(fd, buf) - if err != nil { - log.Println(err) - continue 
- } - log.Printf("read %d bytes", rn) - } -} -``` - - -进入目录 tcpip/lab/link/tap2,然后编译代码。 - -``` bash -cd ~/tcpip/lab/link/tap2 - -go build -``` - -会生成一个叫tap2的可执行文件,我们执行它 - -``` bash - -sudo ./tap2 -``` - -稍等一会再打开另一个终端,利用 tcpdump 抓取经过 tap0 网卡的数据。 - -``` bash - -sudo tcpdump -i tap0 -n -``` - -再打开另一个终端,我们试 ping 一下 192.168.1.1 - -``` bash - -ping 192.168.1.1 -``` - -查看程序 tap2 的输出: - -``` bash -2019/04/10 11:12:57 read 42 bytes -2019/04/10 11:12:58 read 42 bytes -2019/04/10 11:12:59 read 42 bytes -2019/04/10 11:13:16 read 42 bytes -2019/04/10 11:13:17 read 42 bytes -2019/04/10 11:13:18 read 42 bytes - -``` - -这时候你 ping 192.168.1.0/24 网段的任何一个地址都是进入 tap0 网卡,这样我们就可以实验和处理 tap0 网上上的数据了。目前我们只看到了网卡有读取到数据,而且抓包显示我们现在接收到的数据都是 arp 请求,后面会实现对 arp 报文的处理,接下来我们开始处理网卡的数据并封装链路层,实现网卡的 io。 - -### 链路层数据帧 -数据在链路层传输都是一帧一帧传输的,就像发送邮件一样,将信放入信封中,接着把信封邮寄出去,这样可以把一段信息和另一段信息区分开来,下面先介绍数据帧格式。 - -![](../../img/链路层数据帧.png ) - -- 目的 MAC 地址:目的设备的 MAC 物理地址。 -- 源 MAC 地址:发送设备的 MAC 物理地址。 -- 类型:表示后面所跟数据包的协议类型,例如 Type 为 0x8000 时为 IPv4 协议包,Type 为 0x8060 时,后面为 ARP 协议包。 -- 数据:表示该帧的数据内容,长度为 46 ~ 1500 字节,包含网络层、传输层和应用层的数据。 - -既然前面我们已经知道了链路层数据帧格式,也知道了链路层协议头的详细信息,那么现在就根据这些信息来处理以太网数据。我们把处理头部数据的代码都放在 header 包中 - -``` go -package header - -import ( - "encoding/binary" - - "tcpip/netstack/tcpip" -) - -// 以太网帧头部信息的偏移量 -const ( - dstMAC = 0 - srcMAC = 6 - ethType = 12 -) - -// EthernetFields表示链路层以太网帧的头部 -type EthernetFields struct { - // 源地址 - SrcAddr tcpip.LinkAddress - - // 目的地址 - DstAddr tcpip.LinkAddress - - // 协议类型 - Type tcpip.NetworkProtocolNumber -} - -// Ethernet以太网数据包的封装 -type Ethernet []byte - -const ( - // EthernetMinimumSize以太网帧最小的长度 - EthernetMinimumSize = 14 - - // EthernetAddressSize以太网地址的长度 - EthernetAddressSize = 6 -) - -// SourceAddress从帧头部中得到源地址 -func (b Ethernet) SourceAddress() tcpip.LinkAddress { - return tcpip.LinkAddress(b[srcMAC:][:EthernetAddressSize]) -} - -// DestinationAddress从帧头部中得到目的地址 -func (b Ethernet) DestinationAddress() tcpip.LinkAddress { - return 
tcpip.LinkAddress(b[dstMAC:][:EthernetAddressSize]) -} - -// Type从帧头部中得到协议类型 -func (b Ethernet) Type() tcpip.NetworkProtocolNumber { - return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(b[ethType:])) -} - -// Encode根据传入的帧头部信息编码成Ethernet二进制形式,注意Ethernet应先分配好内存 -func (b Ethernet) Encode(e *EthernetFields) { - binary.BigEndian.PutUint16(b[ethType:], uint16(e.Type)) - copy(b[srcMAC:][:EthernetAddressSize], e.SrcAddr) - copy(b[dstMAC:][:EthernetAddressSize], e.DstAddr) -} -``` - -### 网卡IO的实现 -所谓 io 就是数据的输入输出,对于网卡来说就是接收或发送数据,接收意味着对以太网帧解封装和提交给网络层,发送意味着对上层数据的封装和写入网卡。协议栈定义了链路层的接口如下 +# netstack +用golang 实现tcp/ip协议 + +src_code 是用来学习的源码 + +# TCPIP 和开放系统互连(OSI)模型 + +本节先简单介绍互联网的发展史,然后讲解 TCPIP 和开放系统互连(OSI)模型,最后会介绍一下用户态协议栈的整体框架。 + +## 计算机互联网发展史 +最开始的时候计算机是单独运算的,一般有一个大型主机放在那里,然后可以多个终端连接一个主机进行操作。那时候美国国防部认为,如果仅有一个集中的军事指挥中心,万一这个中心被原苏联摧毁,全国的军事指挥将处于瘫痪状态,其后果将不堪设想,因此有必要设计这样一个分散的指挥系统,它由一个个分散的指挥点组成,当部分指挥点被摧毁后其它点仍能正常工作,而这些分散的点又能通过某种形式的通讯网取得联系,这个研究项目就是著名的 ARPANET(阿帕网),也就是互联网的前身。在 1969 年,ARPANET 真正把计算机第一次互联,使用 BBN 公司开发的接口消息处理器(IMP)建立节点。当时的详细节点信息如下: + +``` markdown +节点1:UCLA(8月30日,9月2日接入) +功能:网络测量中心 +主机、操作系统:SDS SIGMA 7 + +节点2:斯坦福研究院(SRI)(10月1日) +功能:网络信息中心(NIC) +主机、操作系统:SDS940 +Doug Engelbart有关Augmentation of Human Intellect的计划 + +节点3:加州大学圣巴巴拉分校(UCSB)(11月1日) +功能:Culler-Fried交互式数学 +主机、操作系统:IBM 360/75 + +节点4:Utah大学(12月) +功能:图形处理 +主机、操作系统:DEC PDP-10 +由Steve Crocker编写第一份RFC文件Host Software(1969年4月7日)。 +REC 4:Network Timetable +UCLA的Charley Kline试图登录到SRI上,发出了第一个数据包, +他的第一次尝试在键入LOGIN的G的时候引起了系统的崩溃。(1969年10月20日或29日,需查实) +``` + +可以看出,当时硬件和系统都不是统一的,当然通信的接口也是不统一的,他们需要 IMP 连接和处理才能彼此通信。 + +> ARPANET 的特点 + +- 可以共享硬件、软件和数据库资源。 +- 利用分散控制结构。 +- 应用分组交换技术(包交换技术)。 +- 运用高功能的通信处理机。 +- 采用分层的网络协议。 + +ARPANET 早期使用一种网络控制协议(Network Control Protocol,NCP)来达到主机与主机之间的通信,但是它无法和个别的计算机网络做交流,因为设备之间没有一个标准协议。1972 年,ARPANET 项目组的核心成员 Vinton Cerf 和 Bob Kahn 开始合作开展所谓的网络互联相互(Interneting Project)。他们希望连接不同的网络,使得一个网络上的主机能够与另一个主机网络上进行通信,需要克服的问题很多:不同的分组大小、不同的接口类型、不同的传输速率、以及不同的可靠性要求。Cerf 和 Kahn 提出利用被称为网关的一种设备作为中间的硬件,进行一个网络到另一个网络的数据传输。 
+ +之后 Cerf 和 Kahn 在 1974 年发表了里程碑式的文章 Protocol for Packet Network Interconnection,描述了实现端到端数据投递的协议,这是一个新版的 NCP,叫传输控制协议(TCP)。这篇文章包括了封装、数据报、网关的功能等概念,其中主要思想是把纠错功能从 IMP 移到了主机。同时该协议(TCP)被应用到 ARPANET 网络,但是此时依然没有形成一个网络标准,各种协议并存包括 NCP,TCP 等协议。 + +在 1977 年后,TCP 被拆分成两个网络协议:传输控制协议(TCP)和因特网协议(IP),IP 处理数据包的路由选择,TCP 负责高层次的功能,如分段、重组、检错。这个新的联合体就是人们熟知的 TCP/IP。 + +1980 年发表 UDP 协议。 + +1981 年 UNIX 系统集成了 TCP/IP 协议栈,包含网络软件的流行操作系统对网络的普及起了很大的作用。 + +1983 年原先的交流协议 NCP 被禁用,TCP/IP 协议变成了 ARPANET 的正式协议,同时 ARPANET 分裂成两个网络:军用网(MILNET)和非军用的 ARPANET。之后,NCP 成为历史,TCP/IP 开始成为通用协议。 + +1984 年 ISO 发布了开放式系统互联模型(OSI)。 + +再之后,互联网极速发展,更多的主干网被搭建,更多的主机连接进来,直至组成了世界互联的巨大网络。 + +## OSI 模型和 TCPIP 模型 + +> OSI 模型 + +|层级|level| +|:------:|:------------:| +| 应用层 | Application | +| 表示层 | Presentation | +| 会话层 | Session | +| 传输层 | Transport | +| 网络层 | Network | +| 链路层 | Link | +| 物理层 | Physical | + +> TCPIP 模型 + +|层级|level| +|:------:|:------------:| +| 应用层 | Application| +| 传输层 | Transport| +| 网络层 | Network| +| 链路层 | Link| + +虽然现实中实现的协议栈都是 TCP/IP 模型,但是我们也需要了解 OSI 模型,它很有参考意义,我们平常交流讲到网络分层的时候都是用 OSI 模型来讲的,所以开发者一般实现的时候是 TCPIP 模型,但和别人讨论的时候是 OSI 模型。比如我们一般讲的二层网络,三层网络指的是 OSI 模型的链路层和网络层。下面介绍 TCP/IP 模型各层功能。 + +## TCPIP 各个层的主要功能 +### 链路层 +链路层也是将数据包发送到另一台主机,但是这两台主机一定是同个局域网的(不考虑广域网二层打通的情况),链路层负责将网络层交下来的 IP 数据报组装成帧,在两个相邻节点间的链路上传送帧。链路层的通信就像在一栋小楼里面互相讲话一下,小明想与小红讲话,只要在楼里喊一下,“小红你在吗?”,小红听到了就会回复说,“小明,我在啊”。小明在喊小红的时候,在这栋楼里的其他人也听得到,这种行为叫广播。链路层网络不适合大型网络,因为一旦主机多了,广播会比较占用资源,就像楼里大家都在喊别人一下,听起来会很乱。 + +### 网络层 +网络层负责将数据报从一台主机发送到一台目标主机上(注意:这两个主机可以不相邻),并给每个主机分配一个地址。最著名的就是 IP 协议了,每个主机都至少有一个 IP 地址,根据路由策略将收到数据报发往下一个主机,这个过程就叫路由转发,它是实现国际网的基础。对于网络层的通信,小明和小红就不是在一栋楼里了,他们可能隔了一个省,此时小明再怎么喊,小红也听不到,怎么办?那就寄信封吧,把信写好,交给邮差,邮差根据地址送给下一个驿站,驿站再根据地址送给下一站,直到送到小红那,这个过程就很像路由,根据目的地址选择下一跳地址。有时候小明有太多话想跟小红讲,导致一封信已经装不下了,那么就会用几张信封来装信件,这个过程就像分片,因上层数据包太大,将数据包切割。当然逆向过程就叫重组。 + +### 传输层 +**传输层最主要的目的就是给两个应用程序传输数据,注意是两个程序,不是两个主机。**主要的协议有 tcp 和 udp,tcp 为应用提供了虚拟连接的服务,也提供了数据的可靠性。udp 提供的是无连接服务,也不提供可靠服务,仅仅实现让两个程序之间交换数据。 + +### 应用层 +应用层是利用传输层的接口来实现用户自定义的网络应用,例如 HTTP 
应用,SMTP(邮件传输)应用等。正因为应用层各色各样的应用,才让网络传输有了意义。比如微信,QQ,淘宝网等,这些我们常见的应用都离不开网络的传输。 + +### 为何要分层 +分层当然是有原因的,主要的目的是为了灵活性和方便实现。分层可以允许供应商进行独立开发,各层通过一个接口在相邻层通信。每层只要专注自己的事情,而不是关心其他层,这样方便软件或者硬件的实现,定义好每个层之间的接口,更改一层的内部实现,不会影响其他层,这样更灵活。比如,TCP 协议就不管下一层是 ipv4 还是 ipv6,它们都实现了网络层的接口,能寻址发送和接收数据包。 + +这种思想到处可见,我们要解决一个复杂的问题时,一般都是拆分成小问题,然后分别解决小问题,分层也是一样,它的本质就是为了分离关注点而让问题简单化或者更高效。 + + +## 封装与解封装 +各个层需要加上自己的控制信息,就会为每层定义自己的控制信息,不同的协议层对数据包有不同的称谓,在传输层叫做段(segment),在网络层叫做数据报(datagram),链路层叫做帧(frame),物理层叫比特,数据封装成帧后发到传输介质上,到达目的主机后每层协议再剥掉相应的首部,最后将应用层数据交给应用程序处理。 + +每层发送数据时加上自己的控制信息叫封装。 +![img](img/document-uid949121labid10418timestamp1555394988939.png) + +收到数据包剥掉相应的首部叫解封装,解封装的时候会得到上层的协议,去除本层首部信息后,将会把数据包分发给上层协议。 + +![img](img/document-uid949121labid10418timestamp1555395022259.png) + +## 你是如何冲浪的呢 +![img](img/document-uid949121labid10418timestamp1555395048260.png) +一般情况家里的上网流程如下,但不是一定是这样,请读者注意! + +首先你得购买互联网服务提供商(ISP,如:中国电信)提供的账号密码; +- 启动家用路由器,假设路由器内网地址为 192.168.1.1,接着配置账号密码,通过拨号和 ISP 建立连接,ISP 会返回一个公网 IP 地址,假如 IP 为 1.1.10.1; +- 然后再把电脑插到家用路由器的网口上,那么电脑就获取到了内网 IP 地址,假如为 192.168.1.2,这时候家用路由器就是电脑的默认网关,和家用路由器的相连的网卡假设为 en0; +- 当在浏览器访问 https://www.baidu.com 时,浏览器会发起 DNS 请求得到对应的 IP,假如为 180.97.33.18,DNS 请求的详细过程我们暂时忽略; +- 拿到 IP 后,浏览器会使用 tcp 连接系统调用和远端主机建立连接,系统调用会进入内核; +- 内核先通过路由最长匹配查询目标 IP 下一跳地址,也就是邻居地址,比如目的 180.97.33.18 会匹配下一跳地址 192.168.1.1; +- 内核接着查询 ARP 表,得知下一跳地址的网卡和物理 MAC 地址,如果没有查询到,则会发送广播 ARP 请求,得到 MAC 地址; +- 到目前为止发送 tcp 报文所需的信息都有了,目标 IP 和目标 MAC 地址,此时系统会给 tcp 的连接分配一个源端口,假如为 33306; +- 之后完成 tcp 三次握手,将 HTTP 请求报文封装在 tcp 数据段中发送给网卡 en0; +- 家用路由器接收到电脑的数据报,经过源地址转换(SNAT),将数据报文发送给 ISP; +- ISP 通过路由表选择发送给下一个路由,经过多个路由转发最终达到百度的服务主机; +- 百度服务器得到电脑发送的报文,返回 HTTP 响应,按原路返回给家用路由器; +- 家用路由器接收到 HTTP 响应报文后,经过目标地址转换(DNAT),将数据发送给电脑; +- 电脑上的浏览器接收到 HTTP 响应,渲染页面,呈现出网页; + +## 协议栈整体框架 +本课程参考netstack,基于 linux 的 tap 网卡,来实现一个用户态的 tcp/ip 协议栈。 + +协议栈的整体架构如下: +|协议|网络层级| +|:--:|:------:| +| tcp/udp | transport| +| arp/ipv4/ipv6 | network| +| vnic | nic_manager| +| tap | link| + +- 链路层我们用 tap 网卡实现,它是 linux 下的一种虚拟网卡,能提供链路层的功能,发送和接收以太网帧。 +- 
协议栈还实现了对虚拟网卡的一定管理,就像 linux 对物理网卡的管理一样。 +- 网络层实现了 arp、ipv4 和 ipv6 协议,arp 协议虽然被划分在网络层,但是链路层的工作离不开它,所以第二章讲链路层通信的时候会介绍。ipv4 协议中的 icmp 和 ipv6 中的邻居协议也实现了,后面会讲解 ipv4 和 icmp 协议。 +- 传输层会实现了 tcp 和 udp,在讲传输层之前会先介绍端口的概念。传输层中的 tcp 实现应该是整个协议栈中最复杂的,会按功能拆分来讲解。 + +值得注意的是这里说的协议栈是主机上的协议栈,不是交换机,也不是路由器的协议栈。 + +整体来说,实现一个协议栈并没有想象中的那么复杂,如果排除了 tcp 的各种机制实现,那么协议栈其实很简单,就是封装与解封装的过程,所以协议栈的代码有很大部分也是各层对数据包的封装与解封装。tcp 的实现会根据每个特性来拆分讲解和实现,以便更容易理解和实现。 + +# 链路层 +## 链路层的介绍和基本实现 +本节主要介绍链路层的基本实现,主要讲以太网网卡、虚拟网卡和 arp 协议。 + +### 链路层的目的 +数据链路层属于计算机网络的底层,使用的信道主要有点对点信道和广播信道两种类型。 在 TCP/IP 协议族中,数据链路层主要有以下几个目的: + +1. 接收和发送链路层数据,提供 io 的能力。 +2. 为 IP 模块发送和接收数据 +3. 为 ARP 模块发送 ARP 请求和接收 ARP 应答 +4. 为 RARP 模块发送 RARP 请求和接收 RARP 应答 + +**TCP/IP 支持多种不同的链路层协议,这取决于网络所使用的硬件。** +数据链路层的协议数据单元——`帧`:将 IP 层(网络层)的数据报添加首部和尾部封装成帧。 +数据链路层协议有许多种,都会解决三个基本问题,封装成帧,透明传输,差错检测。 + +### 以太网介绍 +我们这章讲的是链路层,为何要讲以太网,那是因为以太网实在应用太广了,以至于我们在现实生活中看到的链路层协议的数据封装都是以太网协议封装的,所以要实现链路层数据的处理,我们必须要了解以太网。 + +以太网(Ethernet)是一种计算机局域网技术。IEEE 组织的 IEEE 802.3 标准制定了以太网的技术标准,它规定了包括物理层的连线、电子信号和介质访问层协议的内容。以太网是目前应用最普遍的局域网技术,取代了其他局域网标准如令牌环、FDDI 和 ARCNET。以太网协议,是当今现有局域网采用的最通用的通信协议标准,故可认为以太网就是局域网。 + +### 链路层的寻址 +通信当然得知道发送者的地址和接受者的地址,这是最基础的。以太网规定,所有连入网络的设备,都必须具有“网卡”接口。然后**数据包是从一块网卡,传输到另一块网卡的**。网卡的地址,就是数据包的发送地址和接收地址,叫做 MAC 地址,也叫物理地址,这是最底层的地址。每块网卡出厂的时候,都有一个全世界独一无二的 MAC 地址,长度是 48 个二进制位,通常用 12 个十六进制数表示。有了这个地址,我们可以定位网卡和数据包的路径了。 + +### MTU(最大传输单元) +**MTU 表示在链路层最大的传输单元,也就是链路层一帧数据的数据内容最大长度,单位为字节**,MTU 是协议栈实现一个很重要的参数,请大家务必理解该参数。一般网卡默认 MTU 是 1500,当你往网卡写入的内容超过 1518bytes,就会报错,后面我们可以写代码试试。 + +![img](img/document-uid949121labid10418timestamp1555399038307.png ) +上面的图片是 linux 上链路层的实现,链路层的实现可以分为三层,真实的以太网卡,网卡驱动,网卡逻辑抽象。 + +真实的网卡我们不关心,因为那是硬件工程,我们只需要知道,它能接收和发送网络数据给网卡驱动就好了。网卡驱动我们也不关心,一般驱动都是网卡生产商就写好了,我们只需知道,它能接收协议栈的数据发送给网卡,接收网卡的数据发送给协议栈。网卡逻辑抽象表示,这个是我们关心的,我需要对真实的网卡进行抽象, + +在系统中表示,也需要对抽象的网卡进行管理。 + +> 注意:后面系统中网卡的逻辑抽象我们都描述为网卡。 + +比如在 linux 上,当你敲下 ifconfig 命令,会输出类似如下内容: + +``` bash +eth0 Link encap:Ethernet HWaddr 00:16:3e:08:a1:7a + inet addr:172.18.153.158 Bcast:172.18.159.255 Mask:255.255.240.0 + UP 
BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 + RX packets:285941546 errors:0 dropped:0 overruns:0 frame:0 + TX packets:281609568 errors:0 dropped:0 overruns:0 carrier:0 + collisions:0 txqueuelen:1000 + RX bytes:142994767953 (142.9 GB) TX bytes:44791940275 (44.7 GB) + +lo Link encap:Local Loopback + inet addr:127.0.0.1 Mask:255.0.0.0 + UP LOOPBACK RUNNING MTU:65536 Metric:1 + RX packets:363350690 errors:0 dropped:0 overruns:0 frame:0 + TX packets:363350690 errors:0 dropped:0 overruns:0 carrier:0 + collisions:0 txqueuelen:1 + RX bytes:28099158493 (28.0 GB) TX bytes:28099158493 (28.0 GB) +``` + +示例里显示了两个网卡,一个 eth0 以太网网卡,一个 lo 本地回环网卡。还可以看到两个网卡的信息,当我们要表示一个网卡的时候,需要具备几个属性: + +1. 网卡的名字、类型和 MAC 地址 +- eth0 Link encap:Ethernet HWaddr 00:16:3e:08:a1:7a + - eth0是网卡名,方便表示一个网卡,网卡名在同个系统里不能重复 + - Link encap:Ethernet 表示该网卡类型为以太网网卡。 + - HWaddr 00:16:3e:08:a1:7a 表示 MAC 地址 00:16:3e:08:a1:7a,是链路层寻址的地址。 +2. 网卡的 IP 地址及掩码 +- inet addr:172.18.153.158 Bcast:172.18.159.255 Mask:255.255.240.0 + - inet addr:172.18.153.158 表示该网卡的 ipv4 地址是 172.18.153.158。 + - Bcast:172.18.159.255 表示该网卡 ip 层的广播地址。 + - 255.255.240.0 该网卡的子网掩码。 +3. 
网卡的状态和 MTU +- UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 + - UP BROADCAST RUNNING MULTICAST都是表示网卡的状态 + - UP(代表网卡开启状态) + - BROADCAST (支持广播) + - RUNNING(代表网卡的网线被接上) + - MULTICAST(支持组播)。 + - MTU:1500 最大传输单元为 1500 字节。 + - Metric:1 接口度量值为 1,接口度量值表示在这个路径上发送一个分组的成本。 + +### linux的虚拟网卡介绍 +实现协议栈,我们需要一个网卡,因为这样我们才能接收和发送网络数据,但是一般情况下,我们电脑的操作系统已经帮我们管理好网卡了,我们想实现自由的控制网卡是不太方便的,还好 linux 系统还有另一个功能-虚拟网卡,它是操作系统虚拟出来的一个网卡,我们协议栈的实现都是基于虚拟网卡,用虚拟网卡的好处是: + +对于用户来说虚拟网卡和真实网卡几乎没有差别,而且我们控制或更改虚拟网卡大部分情况下不会影响到真实的网卡,也就不会影响到用户的网络。 +虚拟网卡的数据可以直接从用户态直接读取和写入,这样我们就可以直接在用户态编写协议栈。 +Linux 中虚拟网络设备 +TUN/TAP 设备、VETH 设备、Bridge 设备、Bond 设备、VLAN 设备、MACVTAP 设备,下面我们只讲 tun/tap 设备,其他虚拟设备感兴趣的同学可以去网上自行搜索。 + +TAP/TUN 设备是一种让用户态和内核之间进行数据交换的虚拟设备,TAP 工作在二层,TUN 工作在三层,TAP/TUN 网卡的两头分别是内核网络协议栈和用户层,其作用是将协议栈中的部分数据包转发给用户空间的应用程序,给用户空间的程序一个处理数据包的机会。 + +当我们想在 linux 中创建一个 TAP 设备时,其实很容易,像普通文件一样打开字符设备 /dev/net/tun 可以得到一个文件描述符,接着用系统调用 ioctl 将文件描述符和 kernel 的 tap 驱动绑定在一起,那么之后对该文件描述符的读写就是对虚拟网卡 TAP 的读写。详细的实现可以看 [tuntap](https://www.kernel.org/doc/Documentation/networking/tuntap.txt) 所以最终我们实现的协议栈和 TAP 虚拟网卡的关系,如下图: + + `userland netstack` <- `tap` <- `kernel` + +### tap网卡实验 +在 linux 中创建虚拟网卡,我们可以用 linux 自带的 ip 命令来实现,关于 ip 命令的更多用法请看 man ip。 + +创建 tap 网卡 + +#### 创建一个tap模式的虚拟网卡tap0 + +``` bash + +sudo ip tuntap add mode tap tap0 +``` + +#### 开启该网卡 + +``` bash + +sudo ip link set tap0 up +``` + +#### 设置该网卡的ip及掩码 + +``` bash + +sudo ip addr add 192.168.1.1/24 dev tap0 +``` + +我们创建一个名为 tap0,ip 及掩码为 192.168.1.1/24 的虚拟网卡,执行 ifconfig 看看,会看到一个 tap0 的网卡: + +``` bash +tap0 Link encap:Ethernet HWaddr 22:e2:f2:93:ff:bf + inet addr:192.168.1.1 Bcast:0.0.0.0 Mask:255.255.255.0 + UP BROADCAST MULTICAST MTU:1500 Metric:1 + RX packets:0 errors:0 dropped:0 overruns:0 frame:0 + TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 + collisions:0 txqueuelen:1000 + RX bytes:0 (0.0 B) TX bytes:0 (0.0 B) + +``` + + +删除网卡可以使用如下命令: + +#### 删除虚拟网卡 + +``` bash + +sudo ip tuntap del mode tap tap0 +``` + +看起来和真实的网卡没有任何区别,接下来我们自己用 golang 来实现创建网卡。 + +golang 创建 tuntap 网卡的库实现,在 
netstack/tcpip/link/tuntap 目录下可以查看源文件 tuntap.go 的代码: + +``` go +// +build linux + +package tuntap + +import ( + "errors" + "fmt" + "os/exec" + "syscall" + "unsafe" +) + +const ( + TUN = 1 + TAP = 2 +) + +var ( + ErrDeviceMode = errors.New("unsupport device mode") +) + +type rawSockaddr struct { + Family uint16 + Data [14]byte +} + +// 虚拟网卡设置的配置 +type Config struct { + Name string // 网卡名 + Mode int // 网卡模式,TUN or TAP +} + +// NewNetDev根据配置返回虚拟网卡的文件描述符 +func NewNetDev(c *Config) (fd int, err error) { + switch c.Mode { + case TUN: + fd, err = newTun(c.Name) + case TAP: + fd, err = newTAP(c.Name) + default: + err = ErrDeviceMode + return + } + if err != nil { + return + } + return +} + +// SetLinkUp 让系统启动该网卡 +func SetLinkUp(name string) (err error) { + // ip link set up + out, cmdErr := exec.Command("ip", "link", "set", name, "up").CombinedOutput() + if cmdErr != nil { + err = fmt.Errorf("%v:%v", cmdErr, string(out)) + return + } + return +} + +// SetRoute 通过ip命令添加路由 +func SetRoute(name, cidr string) (err error) { + // ip route add 192.168.1.0/24 dev tap0 + out, cmdErr := exec.Command("ip", "route", "add", cidr, "dev", name).CombinedOutput() + if cmdErr != nil { + err = fmt.Errorf("%v:%v", cmdErr, string(out)) + return + } + return +} + +// AddIP 通过ip命令添加IP地址 +func AddIP(name, ip string) (err error) { + // ip addr add 192.168.1.1 dev tap0 + out, cmdErr := exec.Command("ip", "addr", "add", ip, "dev", name).CombinedOutput() + if cmdErr != nil { + err = fmt.Errorf("%v:%v", cmdErr, string(out)) + return + } + return +} + +func GetHardwareAddr(name string) (string, error) { + fd, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0) + if err != nil { + return "", err + } + + defer syscall.Close(fd) + + var ifreq struct { + name [16]byte + addr rawSockaddr + _ [8]byte + } + + copy(ifreq.name[:], name) + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.SIOCGIFHWADDR, uintptr(unsafe.Pointer(&ifreq))) + if errno != 0 { + return "", errno + } + + 
mac := ifreq.addr.Data[:6] + return string(mac[:]), nil +} + +// newTun新建一个tun模式的虚拟网卡,然后返回该网卡的文件描述符 +// IFF_NO_PI表示不需要包信息 +func newTun(name string) (int, error) { + return open(name, syscall.IFF_TUN|syscall.IFF_NO_PI) +} + +// newTAP新建一个tap模式的虚拟网卡,然后返回该网卡的文件描述符 +func newTAP(name string) (int, error) { + return open(name, syscall.IFF_TAP|syscall.IFF_NO_PI) +} + +// 先打开一个字符串设备,通过系统调用将虚拟网卡和字符串设备fd绑定在一起 +func open(name string, flags uint16) (int, error) { + // 打开tuntap的字符设备,得到字符设备的文件描述符 + fd, err := syscall.Open("/dev/net/tun", syscall.O_RDWR, 0) + if err != nil { + return -1, err + } + + var ifr struct { + name [16]byte + flags uint16 + _ [22]byte + } + + copy(ifr.name[:], name) + ifr.flags = flags + // 通过ioctl系统调用,将fd和虚拟网卡驱动绑定在一起 + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.TUNSETIFF, uintptr(unsafe.Pointer(&ifr))) + if errno != 0 { + syscall.Close(fd) + return -1, errno + } + return fd, nil +} +``` + +根据这个库,我们写一个从网卡读取数据的程序,并打印读取到的字节数。新建文件 tcpip/lab/link/tap1/main.go,输入如下代码: + +``` go +package main + +import ( + "log" + "tcpip/netstack/tcpip/link/rawfile" + "tcpip/netstack/tcpip/link/tuntap" +) + +func main() { + tapName := "tap0" + c := &tuntap.Config{tapName, tuntap.TAP} + fd, err := tuntap.NewNetDev(c) + if err != nil { + panic(err) + } + + // 启动tap网卡 + _ = tuntap.SetLinkUp(tapName) + // 添加ip地址 + _ = tuntap.AddIP(tapName, "192.168.1.1/24") + + buf := make([]byte, 1<<16) + for { + rn, err := rawfile.BlockingRead(fd, buf) + if err != nil { + log.Println(err) + continue + } + log.Printf("read %d bytes", rn) + } +} +``` + + +copy +然后进入目录 tcpip/lab/link/tap1 编译代码。 + +``` bash + +cd ~/tcpip/lab/link/tap1/ +go build +``` + +会生成一个叫 tap1 的可执行文件,我们执行它 + +``` bash + +sudo ./tap1 +``` + +稍等一会再打开另一个终端,利用 tcpdump 抓取经过 tap0 网卡的数据,如果执行 tap1,立马就抓包,可能会抓到一些 ipv6 的组播包,我们这里先忽略。 + +``` bash + +sudo tcpdump -i tap0 -n +``` + +再打开另一个终端,我们试 ping 一下 192.168.1.1 + +``` bash + +ping 192.168.1.1 +``` + + +但是 tcpdump 抓取数据的终端和我们自己写的打印网卡数据的终端中没有任何 icmp 
数据,这是为何?这是因为当给一个网卡添加 ip 地址的时候,系统会将相应的路由添加到“本地路由表”,正因为这样,即使看起来 192.168.1.1 是 tap0 网卡的地址,但实际上我们 ping 的数据并没有走到 tap0 网卡,而是在 lo 网卡上,我们可以试试在终端抓取 lo 网卡数据 + +``` bash + +sudo tcpdump src 192.168.1.1 -i lo -n +``` + +再 ping 一下 192.168.1.1 ,查看 tcpdump 的输出: + +``` bash +listening on lo, link-type EN10MB (Ethernet), capture size 262144 bytes +22:40:18.028585 IP 192.168.1.1 > 192.168.1.1: ICMP echo request, id 29728, seq 1, length 64 +22:40:18.028599 IP 192.168.1.1 > 192.168.1.1: ICMP echo reply, id 29728, seq 1, length 64 +22:40:19.029912 IP 192.168.1.1 > 192.168.1.1: ICMP echo request, id 29728, seq 2, length 64 +22:40:19.029925 IP 192.168.1.1 > 192.168.1.1: ICMP echo reply, id 29728, seq 2, length 64 +``` + +查看本地路由的信息,通过 ip route show table local 命令。 + +``` bash +broadcast 10.211.55.0 dev enp0s5 proto kernel scope link src 10.211.55.14 +broadcast 10.211.55.0 dev enp0s6 proto kernel scope link src 10.211.55.16 +local 10.211.55.14 dev enp0s5 proto kernel scope host src 10.211.55.14 +local 10.211.55.16 dev enp0s6 proto kernel scope host src 10.211.55.16 +broadcast 10.211.55.255 dev enp0s5 proto kernel scope link src 10.211.55.14 +broadcast 10.211.55.255 dev enp0s6 proto kernel scope link src 10.211.55.16 +broadcast 127.0.0.0 dev lo proto kernel scope link src 127.0.0.1 +local 127.0.0.0/8 dev lo proto kernel scope host src 127.0.0.1 +local 127.0.0.1 dev lo proto kernel scope host src 127.0.0.1 +broadcast 127.255.255.255 dev lo proto kernel scope link src 127.0.0.1 +broadcast 192.168.1.0 dev tap0 proto kernel scope link src 192.168.1.1 +local 192.168.1.1 dev tap0 proto kernel scope host src 192.168.1.1 +broadcast 192.168.1.255 dev tap0 proto kernel scope link src 192.168.1.1 + +``` + +可以看到倒数第二行,表示了 192.168.1.1 这个地址,在 local 路由表里。同时路由表也显示,只有 192.168.1.1 这个地址在路由表里,该网段的其他地址不在本地路由,那么应该会进入 tap0 网卡,比如我们试试 192.168.1.2 这个地址,ping 一下 + +``` bash + +PING 192.168.1.2 (192.168.1.2) 56(84) bytes of data. 
+From 192.168.1.1 icmp_seq=1 Destination Host Unreachable +From 192.168.1.1 icmp_seq=2 Destination Host Unreachable +``` + +然后 tcpdump 在 tap0 网卡上的输出 + +``` bash + +listening on tap0, link-type EN10MB (Ethernet), capture size 262144 bytes +22:55:58.322022 ARP, Request who-has 192.168.1.2 tell 192.168.1.1, length 28 +22:55:59.320824 ARP, Request who-has 192.168.1.2 tell 192.168.1.1, length 28 +``` + +说明 tap0 网卡收到了 arp 请求,至于我们使用 ping 之后为何接收到的是 arp 请求报文而不是 icmp 报文,这是因为系统不知道 192.168.1.2 的 MAC 地址,后面会详细说明。 + +在上面的程序中,我们也可以看到上面的程序有打印: + +``` bash +2018/11/11 23:54:10 read 42 bytes +2018/11/11 23:54:11 read 42 bytes +2018/11/11 23:54:12 read 42 bytes +2018/11/11 23:54:13 read 42 bytes + +``` + +其实在链路层通信,是可以不需要 ip 地址的,我们可以手动配置路由,将数据导入虚拟网卡,现在更改我们的程序,代码存放在 tcpip/lab/link/tap2/main.go: + +``` go +package main + +import ( + "log" + "tcpip/netstack/tcpip/link/rawfile" + "tcpip/netstack/tcpip/link/tuntap" +) + +func main() { + tapName := "tap0" + c := &tuntap.Config{tapName, tuntap.TAP} + fd, err := tuntap.NewNetDev(c) + if err != nil { + panic(err) + } + + // 启动tap网卡 + _ = tuntap.SetLinkUp(tapName) + // 设置路由 + _ = tuntap.SetRoute(tapName, "192.168.1.0/24") + + buf := make([]byte, 1<<16) + for { + rn, err := rawfile.BlockingRead(fd, buf) + if err != nil { + log.Println(err) + continue + } + log.Printf("read %d bytes", rn) + } +} +``` + + +进入目录 tcpip/lab/link/tap2,然后编译代码。 + +``` bash +cd ~/tcpip/lab/link/tap2 + +go build +``` + +会生成一个叫tap2的可执行文件,我们执行它 + +``` bash + +sudo ./tap2 +``` + +稍等一会再打开另一个终端,利用 tcpdump 抓取经过 tap0 网卡的数据。 + +``` bash + +sudo tcpdump -i tap0 -n +``` + +再打开另一个终端,我们试 ping 一下 192.168.1.1 + +``` bash + +ping 192.168.1.1 +``` + +查看程序 tap2 的输出: + +``` bash +2019/04/10 11:12:57 read 42 bytes +2019/04/10 11:12:58 read 42 bytes +2019/04/10 11:12:59 read 42 bytes +2019/04/10 11:13:16 read 42 bytes +2019/04/10 11:13:17 read 42 bytes +2019/04/10 11:13:18 read 42 bytes + +``` + +这时候你 ping 192.168.1.0/24 网段的任何一个地址都是进入 tap0 网卡,这样我们就可以实验和处理 tap0 
网卡上的数据了。目前我们只看到了网卡有读取到数据,而且抓包显示我们现在接收到的数据都是 arp 请求,后面会实现对 arp 报文的处理,接下来我们开始处理网卡的数据并封装链路层,实现网卡的 io。 + +### 链路层数据帧 +数据在链路层传输都是一帧一帧传输的,就像发送邮件一样,将信放入信封中,接着把信封邮寄出去,这样可以把一段信息和另一段信息区分开来,下面先介绍数据帧格式。 + +![](../../img/链路层数据帧.png ) + +- 目的 MAC 地址:目的设备的 MAC 物理地址。 +- 源 MAC 地址:发送设备的 MAC 物理地址。 +- 类型:表示后面所跟数据包的协议类型,例如 Type 为 0x0800 时为 IPv4 协议包,Type 为 0x0806 时,后面为 ARP 协议包。 +- 数据:表示该帧的数据内容,长度为 46 ~ 1500 字节,包含网络层、传输层和应用层的数据。 + +既然前面我们已经知道了链路层数据帧格式,也知道了链路层协议头的详细信息,那么现在就根据这些信息来处理以太网数据。我们把处理头部数据的代码都放在 header 包中 + +``` go +package header + +import ( + "encoding/binary" + + "tcpip/netstack/tcpip" +) + +// 以太网帧头部信息的偏移量 +const ( + dstMAC = 0 + srcMAC = 6 + ethType = 12 +) + +// EthernetFields表示链路层以太网帧的头部 +type EthernetFields struct { + // 源地址 + SrcAddr tcpip.LinkAddress + + // 目的地址 + DstAddr tcpip.LinkAddress + + // 协议类型 + Type tcpip.NetworkProtocolNumber +} + +// Ethernet以太网数据包的封装 +type Ethernet []byte + +const ( + // EthernetMinimumSize以太网帧最小的长度 + EthernetMinimumSize = 14 + + // EthernetAddressSize以太网地址的长度 + EthernetAddressSize = 6 +) + +// SourceAddress从帧头部中得到源地址 +func (b Ethernet) SourceAddress() tcpip.LinkAddress { + return tcpip.LinkAddress(b[srcMAC:][:EthernetAddressSize]) +} + +// DestinationAddress从帧头部中得到目的地址 +func (b Ethernet) DestinationAddress() tcpip.LinkAddress { + return tcpip.LinkAddress(b[dstMAC:][:EthernetAddressSize]) +} + +// Type从帧头部中得到协议类型 +func (b Ethernet) Type() tcpip.NetworkProtocolNumber { + return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(b[ethType:])) +} + +// Encode根据传入的帧头部信息编码成Ethernet二进制形式,注意Ethernet应先分配好内存 +func (b Ethernet) Encode(e *EthernetFields) { + binary.BigEndian.PutUint16(b[ethType:], uint16(e.Type)) + copy(b[srcMAC:][:EthernetAddressSize], e.SrcAddr) + copy(b[dstMAC:][:EthernetAddressSize], e.DstAddr) +} +``` + +### 网卡IO的实现 +所谓 io 就是数据的输入输出,对于网卡来说就是接收或发送数据,接收意味着对以太网帧解封装和提交给网络层,发送意味着对上层数据的封装和写入网卡。协议栈定义了链路层的接口如下 diff --git a/cmd/arp/main.go b/cmd/arp/main.go index ad4302c..87a3e9e 100644 --- a/cmd/arp/main.go +++ b/cmd/arp/main.go @@ -1,104 
+1,104 @@ -package main - -import ( - "flag" - "log" - "net" - "os" - - "netstack/tcpip" - "netstack/tcpip/link/fdbased" - "netstack/tcpip/link/tuntap" - "netstack/tcpip/network/arp" - "netstack/tcpip/network/ipv4" - "netstack/tcpip/stack" -) - -// 链路层主要负责管理网卡和处理网卡数据, -// 包括新建网卡对象绑定真实网卡,更改网卡参数,接收网卡数据,去除以太网头部后分发给上层,接收上层数据,封装以太网头部写入网卡。 -// 需要注意的是主机与主机之间的二层通信,也需要主机有 ip 地址, -// 因为主机需要通过 arp 表来进行二层寻址,而 arp 表记录的是 ip 与 mac 地址的映射关系,所以主机的 ip 地址是必须的。 -// 经过上面的实验我们已经知道,只要配好路由,我们在系统发送的数据就都可以进入到 tap 网卡, -// 然后程序就可以读取到网卡数据,进行处理,实现对 arp 报文的处理,那如果我们继续处理 ip 报文、tcp 报文就可以实现整个协议栈了。 -func main() { - flag.Parse() - if len(flag.Args()) < 2 { - log.Fatal("Usage: ", os.Args[0], " ") - } - - log.SetFlags(log.Lshortfile | log.LstdFlags) - tapName := flag.Arg(0) - cidrName := flag.Arg(1) - - log.Printf("tap: %v, cidrName: %v", tapName, cidrName) - - parsedAddr, cidr, err := net.ParseCIDR(cidrName) - if err != nil { - log.Fatalf("Bad cidr: %v", cidrName) - } - - // 解析地址ip地址,ipv4或者ipv6地址都支持 - var addr tcpip.Address - var proto tcpip.NetworkProtocolNumber - if parsedAddr.To4() != nil { - addr = tcpip.Address(parsedAddr.To4()) - proto = ipv4.ProtocolNumber - } else if parsedAddr.To16() != nil { - addr = tcpip.Address(parsedAddr.To16()) - //proto = ipv6.ProtocolNumber - } else { - log.Fatalf("Unknown IP type: %v", parsedAddr) - } - - // 虚拟网卡配置 - conf := &tuntap.Config{ - Name: tapName, - Mode: tuntap.TAP, - } - - var fd int - // 新建虚拟网卡 - fd, err = tuntap.NewNetDev(conf) - if err != nil { - log.Fatal(err) - } - - // 启动tap网卡 - tuntap.SetLinkUp(tapName) - // 设置路由 - tuntap.SetRoute(tapName, cidr.String()) - - // 获取mac地址 - mac, err := tuntap.GetHardwareAddr(tapName) - if err != nil { - panic(err) - } - - // 抽象网卡的文件接口 - linkID := fdbased.New(&fdbased.Options{ - FD: fd, - MTU: 1500, - Address: tcpip.LinkAddress(mac), - }) - - // 新建相关协议的协议栈 - s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, - []string{}, stack.Options{}) - - // 新建抽象的网卡 - if err := s.CreateNamedNIC(1, "vnic1", linkID); err != 
nil { - log.Fatal(err) - } - - // 在该协议栈上添加和注册相应的网络层 - if err := s.AddAddress(1, proto, addr); err != nil { - log.Fatal(err) - } - - // 在该协议栈上添加和注册ARP协议 - if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { - log.Fatal(err) - } - - select {} -} +package main + +import ( + "flag" + "log" + "net" + "os" + + "netstack/tcpip" + "netstack/tcpip/link/fdbased" + "netstack/tcpip/link/tuntap" + "netstack/tcpip/network/arp" + "netstack/tcpip/network/ipv4" + "netstack/tcpip/stack" +) + +// 链路层主要负责管理网卡和处理网卡数据, +// 包括新建网卡对象绑定真实网卡,更改网卡参数,接收网卡数据,去除以太网头部后分发给上层,接收上层数据,封装以太网头部写入网卡。 +// 需要注意的是主机与主机之间的二层通信,也需要主机有 ip 地址, +// 因为主机需要通过 arp 表来进行二层寻址,而 arp 表记录的是 ip 与 mac 地址的映射关系,所以主机的 ip 地址是必须的。 +// 经过上面的实验我们已经知道,只要配好路由,我们在系统发送的数据就都可以进入到 tap 网卡, +// 然后程序就可以读取到网卡数据,进行处理,实现对 arp 报文的处理,那如果我们继续处理 ip 报文、tcp 报文就可以实现整个协议栈了。 +func main() { + flag.Parse() + if len(flag.Args()) < 2 { + log.Fatal("Usage: ", os.Args[0], " ") + } + + log.SetFlags(log.Lshortfile | log.LstdFlags) + tapName := flag.Arg(0) + cidrName := flag.Arg(1) + + log.Printf("tap: %v, cidrName: %v", tapName, cidrName) + + parsedAddr, cidr, err := net.ParseCIDR(cidrName) + if err != nil { + log.Fatalf("Bad cidr: %v", cidrName) + } + + // 解析地址ip地址,ipv4或者ipv6地址都支持 + var addr tcpip.Address + var proto tcpip.NetworkProtocolNumber + if parsedAddr.To4() != nil { + addr = tcpip.Address(parsedAddr.To4()) + proto = ipv4.ProtocolNumber + } else if parsedAddr.To16() != nil { + addr = tcpip.Address(parsedAddr.To16()) + //proto = ipv6.ProtocolNumber + } else { + log.Fatalf("Unknown IP type: %v", parsedAddr) + } + + // 虚拟网卡配置 + conf := &tuntap.Config{ + Name: tapName, + Mode: tuntap.TAP, + } + + var fd int + // 新建虚拟网卡 + fd, err = tuntap.NewNetDev(conf) + if err != nil { + log.Fatal(err) + } + + // 启动tap网卡 + tuntap.SetLinkUp(tapName) + // 设置路由 + tuntap.SetRoute(tapName, cidr.String()) + + // 获取mac地址 + mac, err := tuntap.GetHardwareAddr(tapName) + if err != nil { + panic(err) + } + + // 抽象网卡的文件接口 + linkID := 
fdbased.New(&fdbased.Options{ + FD: fd, + MTU: 1500, + Address: tcpip.LinkAddress(mac), + }) + + // 新建相关协议的协议栈 + s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, + []string{}, stack.Options{}) + + // 新建抽象的网卡 + if err := s.CreateNamedNIC(1, "vnic1", linkID); err != nil { + log.Fatal(err) + } + + // 在该协议栈上添加和注册相应的网络层 + if err := s.AddAddress(1, proto, addr); err != nil { + log.Fatal(err) + } + + // 在该协议栈上添加和注册ARP协议 + if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + log.Fatal(err) + } + + select {} +} diff --git a/cmd/port/main.go b/cmd/port/main.go index 5f6214a..c39586d 100644 --- a/cmd/port/main.go +++ b/cmd/port/main.go @@ -1,163 +1,163 @@ -package main - -import ( - "flag" - "log" - "net" - "netstack/tcpip" - "netstack/tcpip/link/fdbased" - "netstack/tcpip/link/tuntap" - "netstack/tcpip/network/arp" - "netstack/tcpip/network/ipv4" - "netstack/tcpip/network/ipv6" - "netstack/tcpip/stack" - "netstack/tcpip/transport/udp" - "netstack/waiter" - "os" - "strconv" - "strings" -) - -var mac = flag.String("mac", "01:01:01:01:01:01", "mac address to use in tap device") - -func main() { - flag.Parse() - if len(flag.Args()) != 3 { - log.Fatal("Usage: ", os.Args[0], " port") - } - - log.SetFlags(log.Lshortfile | log.LstdFlags) - tapName := flag.Arg(0) - listeAddr := flag.Arg(1) - portName := flag.Arg(2) - - log.Printf("tap: %v, listeAddr: %v, portName: %v", tapName, listeAddr, portName) - - // Parse the mac address. 
- maddr, err := net.ParseMAC(*mac) - if err != nil { - log.Fatalf("Bad MAC address: %v", *mac) - } - - parsedAddr := net.ParseIP(listeAddr) - - // 解析地址ip地址,ipv4或者ipv6地址都支持 - var addr tcpip.Address - var proto tcpip.NetworkProtocolNumber - if parsedAddr.To4() != nil { - addr = tcpip.Address(parsedAddr.To4()) - proto = ipv4.ProtocolNumber - } else if parsedAddr.To16() != nil { - addr = tcpip.Address(parsedAddr.To16()) - proto = ipv6.ProtocolNumber - } else { - log.Fatalf("Unknown IP type: %v", parsedAddr) - } - - localPort, err := strconv.Atoi(portName) - if err != nil { - log.Fatalf("Unable to convert port %v: %v", portName, err) - } - - // 虚拟网卡配置 - conf := &tuntap.Config{ - Name: tapName, - Mode: tuntap.TAP, - } - - var fd int - // 新建虚拟网卡 - fd, err = tuntap.NewNetDev(conf) - if err != nil { - log.Fatal(err) - } - - // 启动tap网卡 - _ = tuntap.SetLinkUp(tapName) - // 设置tap网卡IP地址 - _ = tuntap.AddIP(tapName, listeAddr) - - // 抽象网卡的文件接口 - linkID := fdbased.New(&fdbased.Options{ - FD: fd, - MTU: 1500, - Address: tcpip.LinkAddress(maddr), - }) - - // 新建相关协议的协议栈 - s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, - []string{ /*tcp.ProtocolName, */ udp.ProtocolName}, stack.Options{}) - - // 新建抽象的网卡 - if err := s.CreateNamedNIC(1, "vnic1", linkID); err != nil { - log.Fatal(err) - } - - // 在该协议栈上添加和注册相应的网络层 - if err := s.AddAddress(1, proto, addr); err != nil { - log.Fatal(err) - } - - // 在该协议栈上添加和注册ARP协议 - if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { - log.Fatal(err) - } - - // 添加默认路由 - s.SetRouteTable([]tcpip.Route{ - { - Destination: tcpip.Address(strings.Repeat("\x00", len(addr))), - Mask: tcpip.AddressMask(strings.Repeat("\x00", len(addr))), - Gateway: "", - NIC: 1, - }, - }) - - // 同时监听tcp和udp localPort端口 - //tcpEp := tcpListen(s, proto, localPort) - udpEp := udpListen(s, proto, localPort) - // 关闭监听服务,此时会释放端口 - //tcpEp.Close() - udpEp.Close() -} - -//func tcpListen(s *stack.Stack, proto tcpip.NetworkProtocolNumber, localPort 
int) tcpip.Endpoint { -// var wq waiter.Queue -// // 新建一个tcp端 -// ep, err := s.NewEndpoint(tcp.ProtocolNumber, proto, &wq) -// if err != nil { -// log.Fatal(err) -// } -// -// // 绑定IP和端口,这里的IP地址为空,表示绑定任何IP -// // 此时就会调用端口管理器 -// if err := ep.Bind(tcpip.FullAddress{0, "", uint16(localPort)}, nil); err != nil { -// log.Fatal("Bind failed: ", err) -// } -// -// // 开始监听 -// if err := ep.Listen(10); err != nil { -// log.Fatal("Listen failed: ", err) -// } -// -// return ep -//} - -func udpListen(s *stack.Stack, proto tcpip.NetworkProtocolNumber, localPort int) tcpip.Endpoint { - var wq waiter.Queue - // 新建一个udp端 - ep, err := s.NewEndpoint(udp.ProtocolNumber, proto, &wq) - if err != nil { - log.Fatal(err) - } - - // 绑定IP和端口,这里的IP地址为空,表示绑定任何IP - // 0.0.0.0:9999 这台机器上的所有ip的9999段端口数据都会使用该传输层实现 - // 此时就会调用端口管理器 - if err := ep.Bind(tcpip.FullAddress{NIC: 0, Addr: "", Port: uint16(localPort)}, nil); err != nil { - log.Fatal("Bind failed: ", err) - } - - // 注意UDP是无连接的,它不需要Listen - return ep -} +package main + +import ( + "flag" + "log" + "net" + "netstack/tcpip" + "netstack/tcpip/link/fdbased" + "netstack/tcpip/link/tuntap" + "netstack/tcpip/network/arp" + "netstack/tcpip/network/ipv4" + "netstack/tcpip/network/ipv6" + "netstack/tcpip/stack" + "netstack/tcpip/transport/udp" + "netstack/waiter" + "os" + "strconv" + "strings" +) + +var mac = flag.String("mac", "01:01:01:01:01:01", "mac address to use in tap device") + +func main() { + flag.Parse() + if len(flag.Args()) != 3 { + log.Fatal("Usage: ", os.Args[0], " port") + } + + log.SetFlags(log.Lshortfile | log.LstdFlags) + tapName := flag.Arg(0) + listeAddr := flag.Arg(1) + portName := flag.Arg(2) + + log.Printf("tap: %v, listeAddr: %v, portName: %v", tapName, listeAddr, portName) + + // Parse the mac address. 
+ maddr, err := net.ParseMAC(*mac) + if err != nil { + log.Fatalf("Bad MAC address: %v", *mac) + } + + parsedAddr := net.ParseIP(listeAddr) + + // 解析地址ip地址,ipv4或者ipv6地址都支持 + var addr tcpip.Address + var proto tcpip.NetworkProtocolNumber + if parsedAddr.To4() != nil { + addr = tcpip.Address(parsedAddr.To4()) + proto = ipv4.ProtocolNumber + } else if parsedAddr.To16() != nil { + addr = tcpip.Address(parsedAddr.To16()) + proto = ipv6.ProtocolNumber + } else { + log.Fatalf("Unknown IP type: %v", parsedAddr) + } + + localPort, err := strconv.Atoi(portName) + if err != nil { + log.Fatalf("Unable to convert port %v: %v", portName, err) + } + + // 虚拟网卡配置 + conf := &tuntap.Config{ + Name: tapName, + Mode: tuntap.TAP, + } + + var fd int + // 新建虚拟网卡 + fd, err = tuntap.NewNetDev(conf) + if err != nil { + log.Fatal(err) + } + + // 启动tap网卡 + _ = tuntap.SetLinkUp(tapName) + // 设置tap网卡IP地址 + _ = tuntap.AddIP(tapName, listeAddr) + + // 抽象网卡的文件接口 + linkID := fdbased.New(&fdbased.Options{ + FD: fd, + MTU: 1500, + Address: tcpip.LinkAddress(maddr), + }) + + // 新建相关协议的协议栈 + s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, + []string{ /*tcp.ProtocolName, */ udp.ProtocolName}, stack.Options{}) + + // 新建抽象的网卡 + if err := s.CreateNamedNIC(1, "vnic1", linkID); err != nil { + log.Fatal(err) + } + + // 在该协议栈上添加和注册相应的网络层 + if err := s.AddAddress(1, proto, addr); err != nil { + log.Fatal(err) + } + + // 在该协议栈上添加和注册ARP协议 + if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + log.Fatal(err) + } + + // 添加默认路由 + s.SetRouteTable([]tcpip.Route{ + { + Destination: tcpip.Address(strings.Repeat("\x00", len(addr))), + Mask: tcpip.AddressMask(strings.Repeat("\x00", len(addr))), + Gateway: "", + NIC: 1, + }, + }) + + // 同时监听tcp和udp localPort端口 + //tcpEp := tcpListen(s, proto, localPort) + udpEp := udpListen(s, proto, localPort) + // 关闭监听服务,此时会释放端口 + //tcpEp.Close() + udpEp.Close() +} + +//func tcpListen(s *stack.Stack, proto tcpip.NetworkProtocolNumber, localPort 
int) tcpip.Endpoint { +// var wq waiter.Queue +// // 新建一个tcp端 +// ep, err := s.NewEndpoint(tcp.ProtocolNumber, proto, &wq) +// if err != nil { +// log.Fatal(err) +// } +// +// // 绑定IP和端口,这里的IP地址为空,表示绑定任何IP +// // 此时就会调用端口管理器 +// if err := ep.Bind(tcpip.FullAddress{0, "", uint16(localPort)}, nil); err != nil { +// log.Fatal("Bind failed: ", err) +// } +// +// // 开始监听 +// if err := ep.Listen(10); err != nil { +// log.Fatal("Listen failed: ", err) +// } +// +// return ep +//} + +func udpListen(s *stack.Stack, proto tcpip.NetworkProtocolNumber, localPort int) tcpip.Endpoint { + var wq waiter.Queue + // 新建一个udp端 + ep, err := s.NewEndpoint(udp.ProtocolNumber, proto, &wq) + if err != nil { + log.Fatal(err) + } + + // 绑定IP和端口,这里的IP地址为空,表示绑定任何IP + // 0.0.0.0:9999 这台机器上的所有ip的9999段端口数据都会使用该传输层实现 + // 此时就会调用端口管理器 + if err := ep.Bind(tcpip.FullAddress{NIC: 0, Addr: "", Port: uint16(localPort)}, nil); err != nil { + log.Fatal("Bind failed: ", err) + } + + // 注意UDP是无连接的,它不需要Listen + return ep +} diff --git a/cmd/tap1/main.go b/cmd/tap1/main.go index be5eebe..7de7de1 100644 --- a/cmd/tap1/main.go +++ b/cmd/tap1/main.go @@ -1,32 +1,32 @@ -package main - -import ( - "log" - "netstack/tcpip/link/rawfile" - "netstack/tcpip/link/tuntap" -) - -func main() { - tapName := "tap0" - c := &tuntap.Config{Name: tapName, Mode: tuntap.TAP} - fd, err := tuntap.NewNetDev(c) - if err != nil { - panic(err) - } - - // 启动tap网卡 - _ = tuntap.SetLinkUp(tapName) - //_ = tuntap.AddIP(tapName, "192.168.1.1/24") - _ = tuntap.SetRoute(tapName, "192.168.1.0/24") // 其实在链路层通信,是可以不需要 ip 地址的 - log.Println("启动tap网卡", tapName, "192.169.1.1/24") - - buf := make([]byte, 1<<16) - for { - rn, err := rawfile.BlockingRead(fd, buf) - if err != nil { - log.Println(err) - continue - } - log.Printf("read %d bytes", rn) - } -} +package main + +import ( + "log" + "netstack/tcpip/link/rawfile" + "netstack/tcpip/link/tuntap" +) + +func main() { + tapName := "tap0" + c := &tuntap.Config{Name: tapName, Mode: tuntap.TAP} + fd, err 
:= tuntap.NewNetDev(c) + if err != nil { + panic(err) + } + + // 启动tap网卡 + _ = tuntap.SetLinkUp(tapName) + //_ = tuntap.AddIP(tapName, "192.168.1.1/24") + _ = tuntap.SetRoute(tapName, "192.168.1.0/24") // 其实在链路层通信,是可以不需要 ip 地址的 + log.Println("启动tap网卡", tapName, "192.169.1.1/24") + + buf := make([]byte, 1<<16) + for { + rn, err := rawfile.BlockingRead(fd, buf) + if err != nil { + log.Println(err) + continue + } + log.Printf("read %d bytes", rn) + } +} diff --git a/cmd/tcpclient/main.go b/cmd/tcpclient/main.go index fd1577a..0e78301 100644 --- a/cmd/tcpclient/main.go +++ b/cmd/tcpclient/main.go @@ -1,14 +1,14 @@ -package main - -import ( - "fmt" - "net" -) - -func main() { - _, err := net.Dial("tcp", "192.168.1.1:9999") - if err != nil { - fmt.Println("err : ", err) - return - } -} +package main + +import ( + "fmt" + "net" +) + +func main() { + _, err := net.Dial("tcp", "192.168.1.1:9999") + if err != nil { + fmt.Println("err : ", err) + return + } +} diff --git a/cmd/tcpserver/main.go b/cmd/tcpserver/main.go index 80749f1..0d2b3e0 100644 --- a/cmd/tcpserver/main.go +++ b/cmd/tcpserver/main.go @@ -1,15 +1,15 @@ -package main - -import ( - "fmt" - "net" - "os" -) - -func main() { - _, err := net.Listen("tcp", "192.168.1.1:9999") - if err != nil { - fmt.Println("Error listening:", err) - os.Exit(1) - } -} +package main + +import ( + "fmt" + "net" + "os" +) + +func main() { + _, err := net.Listen("tcp", "192.168.1.1:9999") + if err != nil { + fmt.Println("Error listening:", err) + os.Exit(1) + } +} diff --git a/cmd/udp/main.go b/cmd/udp/main.go index 479ccdd..a6f7c14 100644 --- a/cmd/udp/main.go +++ b/cmd/udp/main.go @@ -1,167 +1,167 @@ -package main - -import ( - "flag" - "log" - "net" - "netstack/tcpip" - "netstack/tcpip/link/fdbased" - "netstack/tcpip/link/tuntap" - "netstack/tcpip/network/arp" - "netstack/tcpip/network/ipv4" - "netstack/tcpip/network/ipv6" - "netstack/tcpip/stack" - "netstack/tcpip/transport/udp" - "netstack/waiter" - "os" - "strconv" - "strings" -) 
- -var mac = flag.String("mac", "01:01:01:01:01:01", "mac address to use in tap device") - -func main() { - flag.Parse() - if len(flag.Args()) != 3 { - log.Fatal("Usage: ", os.Args[0], " port") - } - - log.SetFlags(log.Lshortfile | log.LstdFlags) - tapName := flag.Arg(0) - listeAddr := flag.Arg(1) - portName := flag.Arg(2) - - log.Printf("tap: %v, listeAddr: %v, portName: %v", tapName, listeAddr, portName) - - // Parse the mac address. - maddr, err := net.ParseMAC(*mac) - if err != nil { - log.Fatalf("Bad MAC address: %v", *mac) - } - - parsedAddr := net.ParseIP(listeAddr) - - // 解析地址ip地址,ipv4或者ipv6地址都支持 - var addr tcpip.Address - var proto tcpip.NetworkProtocolNumber - if parsedAddr.To4() != nil { - addr = tcpip.Address(parsedAddr.To4()) - proto = ipv4.ProtocolNumber - } else if parsedAddr.To16() != nil { - addr = tcpip.Address(parsedAddr.To16()) - proto = ipv6.ProtocolNumber - } else { - log.Fatalf("Unknown IP type: %v", parsedAddr) - } - - localPort, err := strconv.Atoi(portName) - if err != nil { - log.Fatalf("Unable to convert port %v: %v", portName, err) - } - - // 虚拟网卡配置 - conf := &tuntap.Config{ - Name: tapName, - Mode: tuntap.TAP, - } - - var fd int - // 新建虚拟网卡 - fd, err = tuntap.NewNetDev(conf) - if err != nil { - log.Fatal(err) - } - - // 启动tap网卡 - _ = tuntap.SetLinkUp(tapName) - // 设置tap网卡IP地址 - _ = tuntap.AddIP(tapName, listeAddr) - - // 抽象网卡的文件接口 - linkID := fdbased.New(&fdbased.Options{ - FD: fd, - MTU: 1500, - Address: tcpip.LinkAddress(maddr), - }) - - // 新建相关协议的协议栈 - s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, - []string{ /*tcp.ProtocolName, */ udp.ProtocolName}, stack.Options{}) - - // 新建抽象的网卡 - if err := s.CreateNamedNIC(1, "vnic1", linkID); err != nil { - log.Fatal(err) - } - - // 在该协议栈上添加和注册相应的网络层 - if err := s.AddAddress(1, proto, addr); err != nil { - log.Fatal(err) - } - - // 在该协议栈上添加和注册ARP协议 - if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { - log.Fatal(err) - } - - // 添加默认路由 - 
s.SetRouteTable([]tcpip.Route{ - { - Destination: tcpip.Address(strings.Repeat("\x00", len(addr))), - Mask: tcpip.AddressMask(strings.Repeat("\x00", len(addr))), - Gateway: "", - NIC: 1, - }, - }) - - // 同时监听tcp和udp localPort端口 - //tcpEp := tcpListen(s, proto, localPort) - udpEp := udpListen(s, proto, localPort) - // 关闭监听服务,此时会释放端口 - //tcpEp.Close() - udpEp.Close() -} - -//func tcpListen(s *stack.Stack, proto tcpip.NetworkProtocolNumber, localPort int) tcpip.Endpoint { -// var wq waiter.Queue -// // 新建一个tcp端 -// ep, err := s.NewEndpoint(tcp.ProtocolNumber, proto, &wq) -// if err != nil { -// log.Fatal(err) -// } -// -// // 绑定IP和端口,这里的IP地址为空,表示绑定任何IP -// // 此时就会调用端口管理器 -// if err := ep.Bind(tcpip.FullAddress{0, "", uint16(localPort)}, nil); err != nil { -// log.Fatal("Bind failed: ", err) -// } -// -// // 开始监听 -// if err := ep.Listen(10); err != nil { -// log.Fatal("Listen failed: ", err) -// } -// -// return ep -//} - -func udpListen(s *stack.Stack, proto tcpip.NetworkProtocolNumber, localPort int) tcpip.Endpoint { - var wq waiter.Queue - // 新建一个udp端 - ep, err := s.NewEndpoint(udp.ProtocolNumber, proto, &wq) - if err != nil { - log.Fatal(err) - } - - // 绑定IP和端口,这里的IP地址为空,表示绑定任何IP - // 0.0.0.0:9999 这台机器上的所有ip的9999段端口数据都会使用该传输层实现 - // 此时就会调用端口管理器 - if err := ep.Bind(tcpip.FullAddress{NIC: 0, Addr: "", Port: uint16(localPort)}, nil); err != nil { - log.Fatal("Bind failed: ", err) - } - - if err := ep.Connect(tcpip.FullAddress{NIC: 0, Addr: "", Port: uint16(localPort)}); err != nil { - log.Fatal("Conn failed: ", err) - } - - // 注意UDP是无连接的,它不需要Listen - return ep -} +package main + +import ( + "flag" + "log" + "net" + "netstack/tcpip" + "netstack/tcpip/link/fdbased" + "netstack/tcpip/link/tuntap" + "netstack/tcpip/network/arp" + "netstack/tcpip/network/ipv4" + "netstack/tcpip/network/ipv6" + "netstack/tcpip/stack" + "netstack/tcpip/transport/udp" + "netstack/waiter" + "os" + "strconv" + "strings" +) + +var mac = flag.String("mac", "01:01:01:01:01:01", "mac address to use 
in tap device") + +func main() { + flag.Parse() + if len(flag.Args()) != 3 { + log.Fatal("Usage: ", os.Args[0], " port") + } + + log.SetFlags(log.Lshortfile | log.LstdFlags) + tapName := flag.Arg(0) + listeAddr := flag.Arg(1) + portName := flag.Arg(2) + + log.Printf("tap: %v, listeAddr: %v, portName: %v", tapName, listeAddr, portName) + + // Parse the mac address. + maddr, err := net.ParseMAC(*mac) + if err != nil { + log.Fatalf("Bad MAC address: %v", *mac) + } + + parsedAddr := net.ParseIP(listeAddr) + + // 解析地址ip地址,ipv4或者ipv6地址都支持 + var addr tcpip.Address + var proto tcpip.NetworkProtocolNumber + if parsedAddr.To4() != nil { + addr = tcpip.Address(parsedAddr.To4()) + proto = ipv4.ProtocolNumber + } else if parsedAddr.To16() != nil { + addr = tcpip.Address(parsedAddr.To16()) + proto = ipv6.ProtocolNumber + } else { + log.Fatalf("Unknown IP type: %v", parsedAddr) + } + + localPort, err := strconv.Atoi(portName) + if err != nil { + log.Fatalf("Unable to convert port %v: %v", portName, err) + } + + // 虚拟网卡配置 + conf := &tuntap.Config{ + Name: tapName, + Mode: tuntap.TAP, + } + + var fd int + // 新建虚拟网卡 + fd, err = tuntap.NewNetDev(conf) + if err != nil { + log.Fatal(err) + } + + // 启动tap网卡 + _ = tuntap.SetLinkUp(tapName) + // 设置tap网卡IP地址 + _ = tuntap.AddIP(tapName, listeAddr) + + // 抽象网卡的文件接口 + linkID := fdbased.New(&fdbased.Options{ + FD: fd, + MTU: 1500, + Address: tcpip.LinkAddress(maddr), + }) + + // 新建相关协议的协议栈 + s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, + []string{ /*tcp.ProtocolName, */ udp.ProtocolName}, stack.Options{}) + + // 新建抽象的网卡 + if err := s.CreateNamedNIC(1, "vnic1", linkID); err != nil { + log.Fatal(err) + } + + // 在该协议栈上添加和注册相应的网络层 + if err := s.AddAddress(1, proto, addr); err != nil { + log.Fatal(err) + } + + // 在该协议栈上添加和注册ARP协议 + if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + log.Fatal(err) + } + + // 添加默认路由 + s.SetRouteTable([]tcpip.Route{ + { + Destination: 
tcpip.Address(strings.Repeat("\x00", len(addr))), + Mask: tcpip.AddressMask(strings.Repeat("\x00", len(addr))), + Gateway: "", + NIC: 1, + }, + }) + + // 同时监听tcp和udp localPort端口 + //tcpEp := tcpListen(s, proto, localPort) + udpEp := udpListen(s, proto, localPort) + // 关闭监听服务,此时会释放端口 + //tcpEp.Close() + udpEp.Close() +} + +//func tcpListen(s *stack.Stack, proto tcpip.NetworkProtocolNumber, localPort int) tcpip.Endpoint { +// var wq waiter.Queue +// // 新建一个tcp端 +// ep, err := s.NewEndpoint(tcp.ProtocolNumber, proto, &wq) +// if err != nil { +// log.Fatal(err) +// } +// +// // 绑定IP和端口,这里的IP地址为空,表示绑定任何IP +// // 此时就会调用端口管理器 +// if err := ep.Bind(tcpip.FullAddress{0, "", uint16(localPort)}, nil); err != nil { +// log.Fatal("Bind failed: ", err) +// } +// +// // 开始监听 +// if err := ep.Listen(10); err != nil { +// log.Fatal("Listen failed: ", err) +// } +// +// return ep +//} + +func udpListen(s *stack.Stack, proto tcpip.NetworkProtocolNumber, localPort int) tcpip.Endpoint { + var wq waiter.Queue + // 新建一个udp端 + ep, err := s.NewEndpoint(udp.ProtocolNumber, proto, &wq) + if err != nil { + log.Fatal(err) + } + + // 绑定IP和端口,这里的IP地址为空,表示绑定任何IP + // 0.0.0.0:9999 这台机器上的所有ip的9999段端口数据都会使用该传输层实现 + // 此时就会调用端口管理器 + if err := ep.Bind(tcpip.FullAddress{NIC: 0, Addr: "", Port: uint16(localPort)}, nil); err != nil { + log.Fatal("Bind failed: ", err) + } + + if err := ep.Connect(tcpip.FullAddress{NIC: 0, Addr: "", Port: uint16(localPort)}); err != nil { + log.Fatal("Conn failed: ", err) + } + + // 注意UDP是无连接的,它不需要Listen + return ep +} diff --git a/example/tcp_server.go b/example/tcp_server.go index 82aedb3..0aaf2b1 100644 --- a/example/tcp_server.go +++ b/example/tcp_server.go @@ -1,46 +1,46 @@ -package main - -import ( - "fmt" - "log" - "net" - "runtime" - "strings" -) - -type TCPHandler interface { - Handle(net.Conn) -} - -func TCPServer(listener net.Listener, handler TCPHandler) error { - log.Printf("TCP: listening on %s", listener.Addr()) - - for { - clientConn, err := 
listener.Accept() - if err != nil { - if nerr, ok := err.(net.Error); ok && nerr.Temporary() { - log.Printf("temporary Accept() failure - %s", err) - runtime.Gosched() - continue - } - // theres no direct way to detect this error because it is not exposed - if !strings.Contains(err.Error(), "use of closed network connection") { - return fmt.Errorf("listener.Accept() error - %s", err) - } - break - } - go handler.Handle(clientConn) - } - - log.Printf("TCP: closing %s", listener.Addr()) - - return nil -} - -func main() { - _, err := net.Dial("tcp", "192.168.1.1:9999") - if err != nil { - fmt.Println("err : ", err) - return - } -} +package main + +import ( + "fmt" + "log" + "net" + "runtime" + "strings" +) + +type TCPHandler interface { + Handle(net.Conn) +} + +func TCPServer(listener net.Listener, handler TCPHandler) error { + log.Printf("TCP: listening on %s", listener.Addr()) + + for { + clientConn, err := listener.Accept() + if err != nil { + if nerr, ok := err.(net.Error); ok && nerr.Temporary() { + log.Printf("temporary Accept() failure - %s", err) + runtime.Gosched() + continue + } + // theres no direct way to detect this error because it is not exposed + if !strings.Contains(err.Error(), "use of closed network connection") { + return fmt.Errorf("listener.Accept() error - %s", err) + } + break + } + go handler.Handle(clientConn) + } + + log.Printf("TCP: closing %s", listener.Addr()) + + return nil +} + +func main() { + _, err := net.Dial("tcp", "192.168.1.1:9999") + if err != nil { + fmt.Println("err : ", err) + return + } +} diff --git a/ilist/list.go b/ilist/list.go index 88038aa..dcba992 100644 --- a/ilist/list.go +++ b/ilist/list.go @@ -1,141 +1,141 @@ -package ilist - -type Linker interface { - Next() Element - Prev() Element - SetNext(Element) - SetPrev(Element) -} - -type Element interface { - Linker -} - -type ElementMapper struct{} - -func (ElementMapper) linkerFor(elem Element) Linker { - return elem -} - -type List struct { - head Element - tail 
Element -} - -func (l *List) Reset() { - l.head = nil - l.tail = nil -} - -func (l *List) Empty() bool { - return l.head == nil -} - -func (l *List) Front() Element { - return l.head -} - -func (l *List) Back() Element { - return l.tail -} - -func (l *List) PushFront(e Element) { - ElementMapper{}.linkerFor(e).SetNext(l.head) - ElementMapper{}.linkerFor(e).SetPrev(nil) - - if l.head != nil { - ElementMapper{}.linkerFor(l.head).SetPrev(e) - } else { - l.tail = e - } - l.head = e -} - -func (l *List) PushBack(e Element) { - ElementMapper{}.linkerFor(e).SetNext(nil) - ElementMapper{}.linkerFor(e).SetPrev(l.tail) - - if l.tail != nil { - ElementMapper{}.linkerFor(l.tail).SetNext(e) - } else { - l.head = e - } - l.tail = e -} - -// list merge -func (l *List) PushBackList(m *List) { - if l.head == nil { - l.head = m.head - l.tail = m.tail - } else if m.head != nil { - ElementMapper{}.linkerFor(l.tail).SetNext(m.head) - ElementMapper{}.linkerFor(m.head).SetPrev(l.tail) - - l.tail = m.tail - } - m.head = nil - m.tail = nil -} - -func (l *List) InsertAfter(b, e Element) { - a := ElementMapper{}.linkerFor(b).Next() - ElementMapper{}.linkerFor(e).SetNext(a) - ElementMapper{}.linkerFor(e).SetPrev(b) - ElementMapper{}.linkerFor(b).SetNext(e) - if a != nil { - ElementMapper{}.linkerFor(a).SetPrev(e) - } else { - l.tail = e - } -} - -func (l *List) InsertBefore(a, e Element) { - b := ElementMapper{}.linkerFor(a).Prev() - ElementMapper{}.linkerFor(e).SetNext(a) - ElementMapper{}.linkerFor(e).SetPrev(b) - ElementMapper{}.linkerFor(a).SetPrev(e) - if a != nil { - ElementMapper{}.linkerFor(b).SetNext(e) - } else { - l.head = e - } -} - -func (l *List) Remove(e Element) { - prev := ElementMapper{}.linkerFor(e).Prev() - next := ElementMapper{}.linkerFor(e).Next() - - if prev != nil { - ElementMapper{}.linkerFor(prev).SetNext(next) - } else { - l.head = next - } - - if next != nil { - ElementMapper{}.linkerFor(next).SetPrev(prev) - } else { - l.tail = prev - } -} - -type Entry struct { 
- next Element - prev Element -} - -func (e *Entry) Next() Element { - return e.next -} - -func (e *Entry) Prev() Element { - return e.prev -} - -func (e *Entry) SetNext(elem Element) { - e.next = elem -} - -func (e *Entry) SetPrev(elem Element) { - e.prev = elem -} +package ilist + +type Linker interface { + Next() Element + Prev() Element + SetNext(Element) + SetPrev(Element) +} + +type Element interface { + Linker +} + +type ElementMapper struct{} + +func (ElementMapper) linkerFor(elem Element) Linker { + return elem +} + +type List struct { + head Element + tail Element +} + +func (l *List) Reset() { + l.head = nil + l.tail = nil +} + +func (l *List) Empty() bool { + return l.head == nil +} + +func (l *List) Front() Element { + return l.head +} + +func (l *List) Back() Element { + return l.tail +} + +func (l *List) PushFront(e Element) { + ElementMapper{}.linkerFor(e).SetNext(l.head) + ElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + ElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + l.head = e +} + +func (l *List) PushBack(e Element) { + ElementMapper{}.linkerFor(e).SetNext(nil) + ElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + ElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + l.tail = e +} + +// list merge +func (l *List) PushBackList(m *List) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + ElementMapper{}.linkerFor(l.tail).SetNext(m.head) + ElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + m.head = nil + m.tail = nil +} + +func (l *List) InsertAfter(b, e Element) { + a := ElementMapper{}.linkerFor(b).Next() + ElementMapper{}.linkerFor(e).SetNext(a) + ElementMapper{}.linkerFor(e).SetPrev(b) + ElementMapper{}.linkerFor(b).SetNext(e) + if a != nil { + ElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +func (l *List) InsertBefore(a, e Element) { + b := 
ElementMapper{}.linkerFor(a).Prev() + ElementMapper{}.linkerFor(e).SetNext(a) + ElementMapper{}.linkerFor(e).SetPrev(b) + ElementMapper{}.linkerFor(a).SetPrev(e) + if a != nil { + ElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +func (l *List) Remove(e Element) { + prev := ElementMapper{}.linkerFor(e).Prev() + next := ElementMapper{}.linkerFor(e).Next() + + if prev != nil { + ElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + ElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +type Entry struct { + next Element + prev Element +} + +func (e *Entry) Next() Element { + return e.next +} + +func (e *Entry) Prev() Element { + return e.prev +} + +func (e *Entry) SetNext(elem Element) { + e.next = elem +} + +func (e *Entry) SetPrev(elem Element) { + e.prev = elem +} diff --git a/rand/rand.go b/rand/rand.go index 6bcef3d..28a7548 100644 --- a/rand/rand.go +++ b/rand/rand.go @@ -1,11 +1,11 @@ -package rand - -import "crypto/rand" - -// Reader is the default reader. -var Reader = rand.Reader - -// Read implements io.Reader.Read. -func Read(b []byte) (int, error) { - return rand.Read(b) -} +package rand + +import "crypto/rand" + +// Reader is the default reader. +var Reader = rand.Reader + +// Read implements io.Reader.Read. 
+func Read(b []byte) (int, error) { + return rand.Read(b) +} diff --git a/tcpip/buffer/prependable.go b/tcpip/buffer/prependable.go index fa9e280..02bdef6 100644 --- a/tcpip/buffer/prependable.go +++ b/tcpip/buffer/prependable.go @@ -1,33 +1,33 @@ -package buffer - -// prependable 可预先考虑分配的 -type Prependable struct { - buf View - - usedIdx int -} - -func NewPrependable(size int) Prependable { - return Prependable{buf: NewView(size), usedIdx: size} -} - -func NewPrependableFromView(v View) Prependable { - return Prependable{buf: v, usedIdx: 0} -} - -func (p Prependable) View() View { - return p.buf[p.usedIdx:] -} - -func (p Prependable) UsedLength() int { - return len(p.buf) - p.usedIdx -} - -// 从内到外暴露报文头的协议 eth|ipv4|tcp -func (p *Prependable) Prepend(size int) []byte { - if size > p.usedIdx { - return nil - } - p.usedIdx -= size - return p.View()[:size:size] // p.buf[p.usedIdx:p.usedIdx+size:size] -} +package buffer + +// prependable 可预先考虑分配的 +type Prependable struct { + buf View + + usedIdx int +} + +func NewPrependable(size int) Prependable { + return Prependable{buf: NewView(size), usedIdx: size} +} + +func NewPrependableFromView(v View) Prependable { + return Prependable{buf: v, usedIdx: 0} +} + +func (p Prependable) View() View { + return p.buf[p.usedIdx:] +} + +func (p Prependable) UsedLength() int { + return len(p.buf) - p.usedIdx +} + +// 从内到外暴露报文头的协议 eth|ipv4|tcp +func (p *Prependable) Prepend(size int) []byte { + if size > p.usedIdx { + return nil + } + p.usedIdx -= size + return p.View()[:size:size] // p.buf[p.usedIdx:p.usedIdx+size:size] +} diff --git a/tcpip/buffer/view.go b/tcpip/buffer/view.go index e78aef3..339c452 100644 --- a/tcpip/buffer/view.go +++ b/tcpip/buffer/view.go @@ -1,107 +1,107 @@ -package buffer - -type View []byte - -func NewView(size int) View { - return make(View, size) -} - -func NewViewFromBytes(b []byte) View { - return append(View(nil), b...) 
// 没见过 🇰🇷了 -} - -// TrimFront 从缓冲区的可见部分中删除第一个“计数”字节 -func (v *View) TrimFront(count int) { - *v = (*v)[count:] -} - -// CapLength 不可逆地将缓冲区可见部分的长度减少到指定的值 -func (v *View) CapLength(length int) { - *v = (*v)[:length:length] -} - -func (v View) ToVectorisedView() VectorisedView { - return NewVectorisedView(len(v), []View{v}) -} - -// VectorisedView 是使用非连续内存的 View 的矢量化版本 -type VectorisedView struct { - views []View - size int -} - -func NewVectorisedView(size int, views []View) VectorisedView { - return VectorisedView{views: views, size: size} -} - -// 截掉count的长度 -func (vv *VectorisedView) TrimFront(count int) { - for count > 0 && len(vv.views) > 0 { - if count < len(vv.views[0]) { - vv.size -= count - vv.views[0].TrimFront(count) - return - } - count -= len(vv.views[0]) - vv.RemoveFirst() - } -} - -// 限制buffer总长度为length -func (vv *VectorisedView) CapLength(length int) { - if length < 0 { - length = 0 - } - if vv.size < length { - return // 不可缩减 - } - vv.size = length - for i := range vv.views { - v := &vv.views[i] - if len(*v) >= length { - if length == 0 { - vv.views = vv.views[:i] - } else { - v.CapLength(length) - vv.views = vv.views[:i+1] - } - return - } - length -= len(*v) - } -} - -func (vv VectorisedView) Clone(buffer []View) VectorisedView { - return VectorisedView{views: append(buffer[:0], vv.views...), size: vv.size} -} - -func (vv VectorisedView) First() View { - if len(vv.views) == 0 { - return nil - } - return vv.views[0] -} - -func (vv *VectorisedView) RemoveFirst() { - if len(vv.views) == 0 { - return - } - vv.size -= len(vv.views[0]) - vv.views = vv.views[1:] -} - -func (vv VectorisedView) Size() int { - return vv.size -} - -func (vv VectorisedView) ToView() View { - u := make([]byte, 0, vv.size) - for _, v := range vv.views { - u = append(u, v...) 
- } - return u -} - -func (vv VectorisedView) Views() []View { - return vv.views -} +package buffer + +type View []byte + +func NewView(size int) View { + return make(View, size) +} + +func NewViewFromBytes(b []byte) View { + return append(View(nil), b...) // 没见过 🇰🇷了 +} + +// TrimFront 从缓冲区的可见部分中删除第一个“计数”字节 +func (v *View) TrimFront(count int) { + *v = (*v)[count:] +} + +// CapLength 不可逆地将缓冲区可见部分的长度减少到指定的值 +func (v *View) CapLength(length int) { + *v = (*v)[:length:length] +} + +func (v View) ToVectorisedView() VectorisedView { + return NewVectorisedView(len(v), []View{v}) +} + +// VectorisedView 是使用非连续内存的 View 的矢量化版本 +type VectorisedView struct { + views []View + size int +} + +func NewVectorisedView(size int, views []View) VectorisedView { + return VectorisedView{views: views, size: size} +} + +// 截掉count的长度 +func (vv *VectorisedView) TrimFront(count int) { + for count > 0 && len(vv.views) > 0 { + if count < len(vv.views[0]) { + vv.size -= count + vv.views[0].TrimFront(count) + return + } + count -= len(vv.views[0]) + vv.RemoveFirst() + } +} + +// 限制buffer总长度为length +func (vv *VectorisedView) CapLength(length int) { + if length < 0 { + length = 0 + } + if vv.size < length { + return // 不可缩减 + } + vv.size = length + for i := range vv.views { + v := &vv.views[i] + if len(*v) >= length { + if length == 0 { + vv.views = vv.views[:i] + } else { + v.CapLength(length) + vv.views = vv.views[:i+1] + } + return + } + length -= len(*v) + } +} + +func (vv VectorisedView) Clone(buffer []View) VectorisedView { + return VectorisedView{views: append(buffer[:0], vv.views...), size: vv.size} +} + +func (vv VectorisedView) First() View { + if len(vv.views) == 0 { + return nil + } + return vv.views[0] +} + +func (vv *VectorisedView) RemoveFirst() { + if len(vv.views) == 0 { + return + } + vv.size -= len(vv.views[0]) + vv.views = vv.views[1:] +} + +func (vv VectorisedView) Size() int { + return vv.size +} + +func (vv VectorisedView) ToView() View { + u := make([]byte, 0, vv.size) + 
for _, v := range vv.views { + u = append(u, v...) + } + return u +} + +func (vv VectorisedView) Views() []View { + return vv.views +} diff --git a/tcpip/buffer/view_test.go b/tcpip/buffer/view_test.go index 12caba3..979123a 100644 --- a/tcpip/buffer/view_test.go +++ b/tcpip/buffer/view_test.go @@ -1,15 +1,15 @@ -package buffer - -import ( - "fmt" - "testing" -) - -func TestBaseView(t *testing.T) { - buffer1 := []byte("hello world") - buffer2 := []byte("test test test") - bv1 := NewViewFromBytes(buffer1) - bv2 := NewViewFromBytes(buffer2) - views := NewVectorisedView(2, []View{bv1, bv2}) - fmt.Println(string(views.ToView())) -} +package buffer + +import ( + "fmt" + "testing" +) + +func TestBaseView(t *testing.T) { + buffer1 := []byte("hello world") + buffer2 := []byte("test test test") + bv1 := NewViewFromBytes(buffer1) + bv2 := NewViewFromBytes(buffer2) + views := NewVectorisedView(2, []View{bv1, bv2}) + fmt.Println(string(views.ToView())) +} diff --git a/tcpip/header/arp.go b/tcpip/header/arp.go index faaef1e..57a281f 100644 --- a/tcpip/header/arp.go +++ b/tcpip/header/arp.go @@ -1,105 +1,105 @@ -package header - -import "netstack/tcpip" - -const ( - // ARPProtocolNumber是ARP协议号,为0x0806 - ARPProtocolNumber tcpip.NetworkProtocolNumber = 0x0806 - - // ARPSize是ARP报文在IPV4网络下的长度 - ARPSize = 2 + 2 + 1 + 1 + 2 + 2*6 + 2*4 // 28 Bytes -) - -// ARPOP 代表ARP的操作码 -type ARPOp uint16 - -// RFC 826 定义的操作码 -const ( - // arp 请求 - ARPRequest ARPOp = 1 - // arp应答 - ARPReply ARPOp = 2 -) - -/* -ARP报文的封装 -1. 2B 硬件类型(hard type) 硬件类型用来指代需要什么样的物理地址,如果硬件类型为 1,表示以太网地址 -2. 2B 协议类型 协议类型则是需要映射的协议地址类型,如果协议类型是 0x0800,表示 ipv4 协议。 -3. 1B 硬件地址长度 表示硬件地址的长度,单位字节,一般都是以太网地址的长度为 6 字节。 -4. 1B 协议地址长度: 表示协议地址的长度,单位字节,一般都是 ipv4 地址的长度为 4 字节。 -5. 2B 操作码 这些值用于区分具体操作类型,因为字段都相同,所以必须指明操作码,不然连请求还是应答都分不清。 - 1=>ARP 请求, 2=>ARP 应答,3=>RARP 请求,4=>RARP 应答。 -6. 6B 源硬件地址 源物理地址,如02:f2:02:f2:02:f2 -7. 4B 源协议地址 源协议地址,如192.168.0.1 -8. 6B 目标硬件地址 目标物理地址,如03:f2:03:f2:03:f2 -9. 
4B 目标协议地址 目标协议地址,如 192.168.0.2 -*/ -type ARP []byte - -// 从报文中得到硬件类型 -func (a ARP) hardwareAddressSpace() uint16 { return uint16(a[0])<<8 | uint16(a[1]) } - -// 从报文中得到协议类型 -func (a ARP) protocolAddressSpace() uint16 { return uint16(a[2])<<8 | uint16(a[3]) } - -// 从报文中得到硬件地址的长度 -func (a ARP) hardwareAddressSize() int { return int(a[4]) } - -// 从报文中得到协议的地址长度 -func (a ARP) protocolAddressSize() int { return int(a[5]) } - -// Op从报文中得到arp操作码. -func (a ARP) Op() ARPOp { return ARPOp(a[6])<<8 | ARPOp(a[7]) } - -// SetOp设置arp操作码. -func (a ARP) SetOp(op ARPOp) { - a[6] = uint8(op >> 8) - a[7] = uint8(op) -} - -// SetIPv4OverEthernet设置IPV4网络在以太网中arp报文的硬件和协议信息. -func (a ARP) SetIPv4OverEthernet() { - a[0], a[1] = 0, 1 // htypeEthernet - a[2], a[3] = 0x08, 0x00 // IPv4ProtocolNumber - a[4] = 6 // macSize - a[5] = uint8(IPv4AddressSize) -} - -// HardwareAddressSender从报文中得到arp发送方的硬件地址 -func (a ARP) HardwareAddressSender() []byte { - const s = 8 - return a[s : s+6] -} - -// ProtocolAddressSender从报文中得到arp发送方的协议地址,为ipv4地址 -func (a ARP) ProtocolAddressSender() []byte { - const s = 8 + 6 // 8 是arp的协议头部 6是本机MAC - return a[s : s+4] // 本机IP -} - -// HardwareAddressTarget从报文中得到arp目的方的硬件地址 -func (a ARP) HardwareAddressTarget() []byte { - const s = 8 + 6 + 4 // 8是arp协议头部 6 是本机MAC 4是本机ip - return a[s : s+6] // 目标MAC -} - -// ProtocolAddressTarget从报文中得到arp目的方的协议地址,为ipv4地址 -func (a ARP) ProtocolAddressTarget() []byte { - const s = 8 + 6 + 4 + 6 // 8是arp协议头部 6 是本机MAC 4是本机ip 6是目标MAC - return a[s : s+4] // 目标IP -} - -// IsValid检查arp报文是否有效 -func (a ARP) IsValid() bool { - // 比arp报文的长度小,返回无效 - if len(a) < ARPSize { - return false - } - const htypeEthernet = 1 - const macSize = 6 - // 是否以太网、ipv4、硬件和协议长度都对 - return a.hardwareAddressSpace() == htypeEthernet && - a.protocolAddressSpace() == uint16(IPv4ProtocolNumber) && - a.hardwareAddressSize() == macSize && - a.protocolAddressSize() == IPv4AddressSize -} +package header + +import "netstack/tcpip" + +const ( + // ARPProtocolNumber是ARP协议号,为0x0806 + 
ARPProtocolNumber tcpip.NetworkProtocolNumber = 0x0806 + + // ARPSize是ARP报文在IPV4网络下的长度 + ARPSize = 2 + 2 + 1 + 1 + 2 + 2*6 + 2*4 // 28 Bytes +) + +// ARPOP 代表ARP的操作码 +type ARPOp uint16 + +// RFC 826 定义的操作码 +const ( + // arp 请求 + ARPRequest ARPOp = 1 + // arp应答 + ARPReply ARPOp = 2 +) + +/* +ARP报文的封装 +1. 2B 硬件类型(hard type) 硬件类型用来指代需要什么样的物理地址,如果硬件类型为 1,表示以太网地址 +2. 2B 协议类型 协议类型则是需要映射的协议地址类型,如果协议类型是 0x0800,表示 ipv4 协议。 +3. 1B 硬件地址长度 表示硬件地址的长度,单位字节,一般都是以太网地址的长度为 6 字节。 +4. 1B 协议地址长度: 表示协议地址的长度,单位字节,一般都是 ipv4 地址的长度为 4 字节。 +5. 2B 操作码 这些值用于区分具体操作类型,因为字段都相同,所以必须指明操作码,不然连请求还是应答都分不清。 + 1=>ARP 请求, 2=>ARP 应答,3=>RARP 请求,4=>RARP 应答。 +6. 6B 源硬件地址 源物理地址,如02:f2:02:f2:02:f2 +7. 4B 源协议地址 源协议地址,如192.168.0.1 +8. 6B 目标硬件地址 目标物理地址,如03:f2:03:f2:03:f2 +9. 4B 目标协议地址 目标协议地址,如 192.168.0.2 +*/ +type ARP []byte + +// 从报文中得到硬件类型 +func (a ARP) hardwareAddressSpace() uint16 { return uint16(a[0])<<8 | uint16(a[1]) } + +// 从报文中得到协议类型 +func (a ARP) protocolAddressSpace() uint16 { return uint16(a[2])<<8 | uint16(a[3]) } + +// 从报文中得到硬件地址的长度 +func (a ARP) hardwareAddressSize() int { return int(a[4]) } + +// 从报文中得到协议的地址长度 +func (a ARP) protocolAddressSize() int { return int(a[5]) } + +// Op从报文中得到arp操作码. +func (a ARP) Op() ARPOp { return ARPOp(a[6])<<8 | ARPOp(a[7]) } + +// SetOp设置arp操作码. +func (a ARP) SetOp(op ARPOp) { + a[6] = uint8(op >> 8) + a[7] = uint8(op) +} + +// SetIPv4OverEthernet设置IPV4网络在以太网中arp报文的硬件和协议信息. 
+func (a ARP) SetIPv4OverEthernet() { + a[0], a[1] = 0, 1 // htypeEthernet + a[2], a[3] = 0x08, 0x00 // IPv4ProtocolNumber + a[4] = 6 // macSize + a[5] = uint8(IPv4AddressSize) +} + +// HardwareAddressSender从报文中得到arp发送方的硬件地址 +func (a ARP) HardwareAddressSender() []byte { + const s = 8 + return a[s : s+6] +} + +// ProtocolAddressSender从报文中得到arp发送方的协议地址,为ipv4地址 +func (a ARP) ProtocolAddressSender() []byte { + const s = 8 + 6 // 8 是arp的协议头部 6是本机MAC + return a[s : s+4] // 本机IP +} + +// HardwareAddressTarget从报文中得到arp目的方的硬件地址 +func (a ARP) HardwareAddressTarget() []byte { + const s = 8 + 6 + 4 // 8是arp协议头部 6 是本机MAC 4是本机ip + return a[s : s+6] // 目标MAC +} + +// ProtocolAddressTarget从报文中得到arp目的方的协议地址,为ipv4地址 +func (a ARP) ProtocolAddressTarget() []byte { + const s = 8 + 6 + 4 + 6 // 8是arp协议头部 6 是本机MAC 4是本机ip 6是目标MAC + return a[s : s+4] // 目标IP +} + +// IsValid检查arp报文是否有效 +func (a ARP) IsValid() bool { + // 比arp报文的长度小,返回无效 + if len(a) < ARPSize { + return false + } + const htypeEthernet = 1 + const macSize = 6 + // 是否以太网、ipv4、硬件和协议长度都对 + return a.hardwareAddressSpace() == htypeEthernet && + a.protocolAddressSpace() == uint16(IPv4ProtocolNumber) && + a.hardwareAddressSize() == macSize && + a.protocolAddressSize() == IPv4AddressSize +} diff --git a/tcpip/header/checksum.go b/tcpip/header/checksum.go index 59054ee..2876224 100644 --- a/tcpip/header/checksum.go +++ b/tcpip/header/checksum.go @@ -1,37 +1,37 @@ -package header - -import "netstack/tcpip" - -// 校验和的计算 -func Checksum(buf []byte, initial uint16) uint16 { - v := uint32(initial) - - l := len(buf) - if l&1 != 0 { - l-- - v += uint32(buf[l]) << 8 - } - - for i := 0; i < l; i += 2 { - v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) - } - - return ChecksumCombine(uint16(v), uint16(v>>16)) -} - -// ChecksumCombine combines the two uint16 to form their checksum. This is done -// by adding them and the carry. 
-func ChecksumCombine(a, b uint16) uint16 { - v := uint32(a) + uint32(b) - return uint16(v + v>>16) -} - -// PseudoHeaderChecksum calculates the pseudo-header checksum for the -// given destination protocol and network address, ignoring the length -// field. Pseudo-headers are needed by transport layers when calculating -// their own checksum. -func PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, srcAddr tcpip.Address, dstAddr tcpip.Address) uint16 { - xsum := Checksum([]byte(srcAddr), 0) - xsum = Checksum([]byte(dstAddr), xsum) - return Checksum([]byte{0, uint8(protocol)}, xsum) -} +package header + +import "netstack/tcpip" + +// 校验和的计算 +func Checksum(buf []byte, initial uint16) uint16 { + v := uint32(initial) + + l := len(buf) + if l&1 != 0 { + l-- + v += uint32(buf[l]) << 8 + } + + for i := 0; i < l; i += 2 { + v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) + } + + return ChecksumCombine(uint16(v), uint16(v>>16)) +} + +// ChecksumCombine combines the two uint16 to form their checksum. This is done +// by adding them and the carry. +func ChecksumCombine(a, b uint16) uint16 { + v := uint32(a) + uint32(b) + return uint16(v + v>>16) +} + +// PseudoHeaderChecksum calculates the pseudo-header checksum for the +// given destination protocol and network address, ignoring the length +// field. Pseudo-headers are needed by transport layers when calculating +// their own checksum. 
+func PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, srcAddr tcpip.Address, dstAddr tcpip.Address) uint16 { + xsum := Checksum([]byte(srcAddr), 0) + xsum = Checksum([]byte(dstAddr), xsum) + return Checksum([]byte{0, uint8(protocol)}, xsum) +} diff --git a/tcpip/header/checksum_test.go b/tcpip/header/checksum_test.go index 637fe94..020a345 100644 --- a/tcpip/header/checksum_test.go +++ b/tcpip/header/checksum_test.go @@ -1,19 +1,19 @@ -package header_test - -import ( - "log" - "math/rand" - "netstack/tcpip/header" - "testing" - "time" -) - -func TestChecksum(t *testing.T) { - buf := make([]byte, 1024) - rand.Seed(time.Now().Unix()) - for i := range buf { - buf[i] = uint8(rand.Intn(255)) - } - sum := header.Checksum(buf, 0) - log.Println(sum) -} +package header_test + +import ( + "log" + "math/rand" + "netstack/tcpip/header" + "testing" + "time" +) + +func TestChecksum(t *testing.T) { + buf := make([]byte, 1024) + rand.Seed(time.Now().Unix()) + for i := range buf { + buf[i] = uint8(rand.Intn(255)) + } + sum := header.Checksum(buf, 0) + log.Println(sum) +} diff --git a/tcpip/header/eth.go b/tcpip/header/eth.go index 698a131..ba182ac 100644 --- a/tcpip/header/eth.go +++ b/tcpip/header/eth.go @@ -1,58 +1,58 @@ -package header - -import ( - "encoding/binary" - "netstack/tcpip" -) - -const ( - dstMAC = 0 - srcMAC = 6 - ethType = 12 -) - -type EthernetFields struct { - // 源地址 - SrcAddr tcpip.LinkAddress - - // 目标地址 - DstAddr tcpip.LinkAddress - - // 协议类型 - // Type = 0x8000 IPv4 Type = 0x8060 = ARP - Type tcpip.NetworkProtocolNumber -} - -// Ethernet以太网数据包的封装 -type Ethernet []byte - -const ( - // EthernetMinimumSize以太网帧最小的长度 - EthernetMinimumSize = 14 // 6 + 6 + 2 - - // EthernetAddressSize以太网地址的长度 - EthernetAddressSize = 6 -) - -// SourceAddress从帧头部中得到源地址 -func (b Ethernet) SourceAddress() tcpip.LinkAddress { - return tcpip.LinkAddress(b[srcMAC:][:EthernetAddressSize]) -} - -// DestinationAddress从帧头部中得到目的地址 -func (b Ethernet) DestinationAddress() 
tcpip.LinkAddress { - return tcpip.LinkAddress(b[dstMAC:][:EthernetAddressSize]) -} - -// Type从帧头部中得到协议类型 -func (b Ethernet) Type() tcpip.NetworkProtocolNumber { - return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(b[ethType:])) -} - -// Encode根据传入的帧头部信息编码成Ethernet二进制形式,注意Ethernet应先分配好内存 -func (b Ethernet) Encode(e *EthernetFields) { - // [6]byte{dst}[6]byte{src}[2]byte{type} - binary.BigEndian.PutUint16(b[ethType:], uint16(e.Type)) - copy(b[srcMAC:][:EthernetAddressSize], e.SrcAddr) - copy(b[dstMAC:][:EthernetAddressSize], e.DstAddr) -} +package header + +import ( + "encoding/binary" + "netstack/tcpip" +) + +const ( + dstMAC = 0 + srcMAC = 6 + ethType = 12 +) + +type EthernetFields struct { + // 源地址 + SrcAddr tcpip.LinkAddress + + // 目标地址 + DstAddr tcpip.LinkAddress + + // 协议类型 + // Type = 0x8000 IPv4 Type = 0x8060 = ARP + Type tcpip.NetworkProtocolNumber +} + +// Ethernet以太网数据包的封装 +type Ethernet []byte + +const ( + // EthernetMinimumSize以太网帧最小的长度 + EthernetMinimumSize = 14 // 6 + 6 + 2 + + // EthernetAddressSize以太网地址的长度 + EthernetAddressSize = 6 +) + +// SourceAddress从帧头部中得到源地址 +func (b Ethernet) SourceAddress() tcpip.LinkAddress { + return tcpip.LinkAddress(b[srcMAC:][:EthernetAddressSize]) +} + +// DestinationAddress从帧头部中得到目的地址 +func (b Ethernet) DestinationAddress() tcpip.LinkAddress { + return tcpip.LinkAddress(b[dstMAC:][:EthernetAddressSize]) +} + +// Type从帧头部中得到协议类型 +func (b Ethernet) Type() tcpip.NetworkProtocolNumber { + return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(b[ethType:])) +} + +// Encode根据传入的帧头部信息编码成Ethernet二进制形式,注意Ethernet应先分配好内存 +func (b Ethernet) Encode(e *EthernetFields) { + // [6]byte{dst}[6]byte{src}[2]byte{type} + binary.BigEndian.PutUint16(b[ethType:], uint16(e.Type)) + copy(b[srcMAC:][:EthernetAddressSize], e.SrcAddr) + copy(b[dstMAC:][:EthernetAddressSize], e.DstAddr) +} diff --git a/tcpip/header/icmpv4.go b/tcpip/header/icmpv4.go index bddc80d..36fa052 100644 --- a/tcpip/header/icmpv4.go +++ 
b/tcpip/header/icmpv4.go @@ -1,108 +1,108 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package header - -import ( - "encoding/binary" - - "netstack/tcpip" -) - -// ICMPv4 represents an ICMPv4 header stored in a byte array. -type ICMPv4 []byte - -const ( - // ICMPv4MinimumSize is the minimum size of a valid ICMP packet. - ICMPv4MinimumSize = 4 - - // ICMPv4EchoMinimumSize is the minimum size of a valid ICMP echo packet. - ICMPv4EchoMinimumSize = 6 - - // ICMPv4DstUnreachableMinimumSize is the minimum size of a valid ICMP - // destination unreachable packet. - ICMPv4DstUnreachableMinimumSize = ICMPv4MinimumSize + 4 - - // ICMPv4ProtocolNumber is the ICMP transport protocol number. - ICMPv4ProtocolNumber tcpip.TransportProtocolNumber = 1 -) - -// ICMPv4Type is the ICMP type field described in RFC 792. -type ICMPv4Type byte - -// Typical values of ICMPv4Type defined in RFC 792. -const ( - ICMPv4EchoReply ICMPv4Type = 0 - ICMPv4DstUnreachable ICMPv4Type = 3 - ICMPv4SrcQuench ICMPv4Type = 4 - ICMPv4Redirect ICMPv4Type = 5 - ICMPv4Echo ICMPv4Type = 8 - ICMPv4TimeExceeded ICMPv4Type = 11 - ICMPv4ParamProblem ICMPv4Type = 12 - ICMPv4Timestamp ICMPv4Type = 13 - ICMPv4TimestampReply ICMPv4Type = 14 - ICMPv4InfoRequest ICMPv4Type = 15 - ICMPv4InfoReply ICMPv4Type = 16 -) - -// Values for ICMP code as defined in RFC 792. -const ( - ICMPv4PortUnreachable = 3 - ICMPv4FragmentationNeeded = 4 -) - -// Type is the ICMP type field. 
-func (b ICMPv4) Type() ICMPv4Type { return ICMPv4Type(b[0]) } - -// SetType sets the ICMP type field. -func (b ICMPv4) SetType(t ICMPv4Type) { b[0] = byte(t) } - -// Code is the ICMP code field. Its meaning depends on the value of Type. -func (b ICMPv4) Code() byte { return b[1] } - -// SetCode sets the ICMP code field. -func (b ICMPv4) SetCode(c byte) { b[1] = c } - -// Checksum is the ICMP checksum field. -func (b ICMPv4) Checksum() uint16 { - return binary.BigEndian.Uint16(b[2:]) -} - -// SetChecksum sets the ICMP checksum field. -func (b ICMPv4) SetChecksum(checksum uint16) { - binary.BigEndian.PutUint16(b[2:], checksum) -} - -// SourcePort implements Transport.SourcePort. -func (ICMPv4) SourcePort() uint16 { - return 0 -} - -// DestinationPort implements Transport.DestinationPort. -func (ICMPv4) DestinationPort() uint16 { - return 0 -} - -// SetSourcePort implements Transport.SetSourcePort. -func (ICMPv4) SetSourcePort(uint16) { -} - -// SetDestinationPort implements Transport.SetDestinationPort. -func (ICMPv4) SetDestinationPort(uint16) { -} - -// Payload implements Transport.Payload. -func (b ICMPv4) Payload() []byte { - return b[ICMPv4MinimumSize:] -} +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package header + +import ( + "encoding/binary" + + "netstack/tcpip" +) + +// ICMPv4 represents an ICMPv4 header stored in a byte array. 
+type ICMPv4 []byte + +const ( + // ICMPv4MinimumSize is the minimum size of a valid ICMP packet. + ICMPv4MinimumSize = 4 + + // ICMPv4EchoMinimumSize is the minimum size of a valid ICMP echo packet. + ICMPv4EchoMinimumSize = 6 + + // ICMPv4DstUnreachableMinimumSize is the minimum size of a valid ICMP + // destination unreachable packet. + ICMPv4DstUnreachableMinimumSize = ICMPv4MinimumSize + 4 + + // ICMPv4ProtocolNumber is the ICMP transport protocol number. + ICMPv4ProtocolNumber tcpip.TransportProtocolNumber = 1 +) + +// ICMPv4Type is the ICMP type field described in RFC 792. +type ICMPv4Type byte + +// Typical values of ICMPv4Type defined in RFC 792. +const ( + ICMPv4EchoReply ICMPv4Type = 0 + ICMPv4DstUnreachable ICMPv4Type = 3 + ICMPv4SrcQuench ICMPv4Type = 4 + ICMPv4Redirect ICMPv4Type = 5 + ICMPv4Echo ICMPv4Type = 8 + ICMPv4TimeExceeded ICMPv4Type = 11 + ICMPv4ParamProblem ICMPv4Type = 12 + ICMPv4Timestamp ICMPv4Type = 13 + ICMPv4TimestampReply ICMPv4Type = 14 + ICMPv4InfoRequest ICMPv4Type = 15 + ICMPv4InfoReply ICMPv4Type = 16 +) + +// Values for ICMP code as defined in RFC 792. +const ( + ICMPv4PortUnreachable = 3 + ICMPv4FragmentationNeeded = 4 +) + +// Type is the ICMP type field. +func (b ICMPv4) Type() ICMPv4Type { return ICMPv4Type(b[0]) } + +// SetType sets the ICMP type field. +func (b ICMPv4) SetType(t ICMPv4Type) { b[0] = byte(t) } + +// Code is the ICMP code field. Its meaning depends on the value of Type. +func (b ICMPv4) Code() byte { return b[1] } + +// SetCode sets the ICMP code field. +func (b ICMPv4) SetCode(c byte) { b[1] = c } + +// Checksum is the ICMP checksum field. +func (b ICMPv4) Checksum() uint16 { + return binary.BigEndian.Uint16(b[2:]) +} + +// SetChecksum sets the ICMP checksum field. +func (b ICMPv4) SetChecksum(checksum uint16) { + binary.BigEndian.PutUint16(b[2:], checksum) +} + +// SourcePort implements Transport.SourcePort. 
+func (ICMPv4) SourcePort() uint16 { + return 0 +} + +// DestinationPort implements Transport.DestinationPort. +func (ICMPv4) DestinationPort() uint16 { + return 0 +} + +// SetSourcePort implements Transport.SetSourcePort. +func (ICMPv4) SetSourcePort(uint16) { +} + +// SetDestinationPort implements Transport.SetDestinationPort. +func (ICMPv4) SetDestinationPort(uint16) { +} + +// Payload implements Transport.Payload. +func (b ICMPv4) Payload() []byte { + return b[ICMPv4MinimumSize:] +} diff --git a/tcpip/header/icmpv6.go b/tcpip/header/icmpv6.go index 2344b88..264b12b 100644 --- a/tcpip/header/icmpv6.go +++ b/tcpip/header/icmpv6.go @@ -1,121 +1,121 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package header - -import ( - "encoding/binary" - - "netstack/tcpip" -) - -// ICMPv6 represents an ICMPv6 header stored in a byte array. -type ICMPv6 []byte - -const ( - // ICMPv6MinimumSize is the minimum size of a valid ICMP packet. - ICMPv6MinimumSize = 4 - - // ICMPv6ProtocolNumber is the ICMP transport protocol number. - ICMPv6ProtocolNumber tcpip.TransportProtocolNumber = 58 - - // ICMPv6NeighborSolicitMinimumSize is the minimum size of a - // neighbor solicitation packet. - ICMPv6NeighborSolicitMinimumSize = ICMPv6MinimumSize + 4 + 16 - - // ICMPv6NeighborAdvertSize is size of a neighbor advertisement. - ICMPv6NeighborAdvertSize = 32 - - // ICMPv6EchoMinimumSize is the minimum size of a valid ICMP echo packet. 
- ICMPv6EchoMinimumSize = 8 - - // ICMPv6DstUnreachableMinimumSize is the minimum size of a valid ICMP - // destination unreachable packet. - ICMPv6DstUnreachableMinimumSize = ICMPv6MinimumSize + 4 - - // ICMPv6PacketTooBigMinimumSize is the minimum size of a valid ICMP - // packet-too-big packet. - ICMPv6PacketTooBigMinimumSize = ICMPv6MinimumSize + 4 -) - -// ICMPv6Type is the ICMP type field described in RFC 4443 and friends. -type ICMPv6Type byte - -// Typical values of ICMPv6Type defined in RFC 4443. -const ( - ICMPv6DstUnreachable ICMPv6Type = 1 - ICMPv6PacketTooBig ICMPv6Type = 2 - ICMPv6TimeExceeded ICMPv6Type = 3 - ICMPv6ParamProblem ICMPv6Type = 4 - ICMPv6EchoRequest ICMPv6Type = 128 - ICMPv6EchoReply ICMPv6Type = 129 - - // Neighbor Discovery Protocol (NDP) messages, see RFC 4861. - - ICMPv6RouterSolicit ICMPv6Type = 133 - ICMPv6RouterAdvert ICMPv6Type = 134 - ICMPv6NeighborSolicit ICMPv6Type = 135 - ICMPv6NeighborAdvert ICMPv6Type = 136 - ICMPv6RedirectMsg ICMPv6Type = 137 -) - -// Values for ICMP code as defined in RFC 4443. -const ( - ICMPv6PortUnreachable = 4 -) - -// Type is the ICMP type field. -func (b ICMPv6) Type() ICMPv6Type { return ICMPv6Type(b[0]) } - -// SetType sets the ICMP type field. -func (b ICMPv6) SetType(t ICMPv6Type) { b[0] = byte(t) } - -// Code is the ICMP code field. Its meaning depends on the value of Type. -func (b ICMPv6) Code() byte { return b[1] } - -// SetCode sets the ICMP code field. -func (b ICMPv6) SetCode(c byte) { b[1] = c } - -// Checksum is the ICMP checksum field. -func (b ICMPv6) Checksum() uint16 { - return binary.BigEndian.Uint16(b[2:]) -} - -// SetChecksum calculates and sets the ICMP checksum field. -func (b ICMPv6) SetChecksum(checksum uint16) { - binary.BigEndian.PutUint16(b[2:], checksum) -} - -// SourcePort implements Transport.SourcePort. -func (ICMPv6) SourcePort() uint16 { - return 0 -} - -// DestinationPort implements Transport.DestinationPort. 
-func (ICMPv6) DestinationPort() uint16 { - return 0 -} - -// SetSourcePort implements Transport.SetSourcePort. -func (ICMPv6) SetSourcePort(uint16) { -} - -// SetDestinationPort implements Transport.SetDestinationPort. -func (ICMPv6) SetDestinationPort(uint16) { -} - -// Payload implements Transport.Payload. -func (b ICMPv6) Payload() []byte { - return b[ICMPv6MinimumSize:] -} +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package header + +import ( + "encoding/binary" + + "netstack/tcpip" +) + +// ICMPv6 represents an ICMPv6 header stored in a byte array. +type ICMPv6 []byte + +const ( + // ICMPv6MinimumSize is the minimum size of a valid ICMP packet. + ICMPv6MinimumSize = 4 + + // ICMPv6ProtocolNumber is the ICMP transport protocol number. + ICMPv6ProtocolNumber tcpip.TransportProtocolNumber = 58 + + // ICMPv6NeighborSolicitMinimumSize is the minimum size of a + // neighbor solicitation packet. + ICMPv6NeighborSolicitMinimumSize = ICMPv6MinimumSize + 4 + 16 + + // ICMPv6NeighborAdvertSize is size of a neighbor advertisement. + ICMPv6NeighborAdvertSize = 32 + + // ICMPv6EchoMinimumSize is the minimum size of a valid ICMP echo packet. + ICMPv6EchoMinimumSize = 8 + + // ICMPv6DstUnreachableMinimumSize is the minimum size of a valid ICMP + // destination unreachable packet. 
+ ICMPv6DstUnreachableMinimumSize = ICMPv6MinimumSize + 4 + + // ICMPv6PacketTooBigMinimumSize is the minimum size of a valid ICMP + // packet-too-big packet. + ICMPv6PacketTooBigMinimumSize = ICMPv6MinimumSize + 4 +) + +// ICMPv6Type is the ICMP type field described in RFC 4443 and friends. +type ICMPv6Type byte + +// Typical values of ICMPv6Type defined in RFC 4443. +const ( + ICMPv6DstUnreachable ICMPv6Type = 1 + ICMPv6PacketTooBig ICMPv6Type = 2 + ICMPv6TimeExceeded ICMPv6Type = 3 + ICMPv6ParamProblem ICMPv6Type = 4 + ICMPv6EchoRequest ICMPv6Type = 128 + ICMPv6EchoReply ICMPv6Type = 129 + + // Neighbor Discovery Protocol (NDP) messages, see RFC 4861. + + ICMPv6RouterSolicit ICMPv6Type = 133 + ICMPv6RouterAdvert ICMPv6Type = 134 + ICMPv6NeighborSolicit ICMPv6Type = 135 + ICMPv6NeighborAdvert ICMPv6Type = 136 + ICMPv6RedirectMsg ICMPv6Type = 137 +) + +// Values for ICMP code as defined in RFC 4443. +const ( + ICMPv6PortUnreachable = 4 +) + +// Type is the ICMP type field. +func (b ICMPv6) Type() ICMPv6Type { return ICMPv6Type(b[0]) } + +// SetType sets the ICMP type field. +func (b ICMPv6) SetType(t ICMPv6Type) { b[0] = byte(t) } + +// Code is the ICMP code field. Its meaning depends on the value of Type. +func (b ICMPv6) Code() byte { return b[1] } + +// SetCode sets the ICMP code field. +func (b ICMPv6) SetCode(c byte) { b[1] = c } + +// Checksum is the ICMP checksum field. +func (b ICMPv6) Checksum() uint16 { + return binary.BigEndian.Uint16(b[2:]) +} + +// SetChecksum calculates and sets the ICMP checksum field. +func (b ICMPv6) SetChecksum(checksum uint16) { + binary.BigEndian.PutUint16(b[2:], checksum) +} + +// SourcePort implements Transport.SourcePort. +func (ICMPv6) SourcePort() uint16 { + return 0 +} + +// DestinationPort implements Transport.DestinationPort. +func (ICMPv6) DestinationPort() uint16 { + return 0 +} + +// SetSourcePort implements Transport.SetSourcePort. 
+func (ICMPv6) SetSourcePort(uint16) { +} + +// SetDestinationPort implements Transport.SetDestinationPort. +func (ICMPv6) SetDestinationPort(uint16) { +} + +// Payload implements Transport.Payload. +func (b ICMPv6) Payload() []byte { + return b[ICMPv6MinimumSize:] +} diff --git a/tcpip/header/ipv4.go b/tcpip/header/ipv4.go index 863577c..4dc249e 100644 --- a/tcpip/header/ipv4.go +++ b/tcpip/header/ipv4.go @@ -1,313 +1,313 @@ -package header - -import ( - "encoding/binary" - "fmt" - "netstack/tcpip" -) - -/* _ -|Version 4b|IHL 4b|Type of Service 8b| Total Length 16b | - ---------------------------------------------------------------- -| fragment ID 16b |R|DF|MF|Fragment Offset 13b| - ---------------------------------------------------------------- -| TTL 8b | Protocol 8b | Header Checksum 16b | 20 bytes - ---------------------------------------------------------------- -| Sorece IP Address 32b | - ---------------------------------------------------------------- -| Destination IP Address 32b | _ - ---------------------------------------------------------------- -| Options | Padding | -*/ - -const ( - versIHL = 0 - tos = 1 - totalLen = 2 - id = 4 - flagsFO = 6 - ttl = 8 - protocol = 9 - checksum = 10 - srcAddr = 12 - dstAddr = 16 -) - -// 表示IPv4头部信息的结构体 -type IPv4Fields struct { - // IHL is the "internet header length" field of an IPv4 packet. - // 头部长度 - IHL uint8 - - // TOS is the "type of service" field of an IPv4 packet. - // 服务区分的表示 - TOS uint8 - - // TotalLength is the "total length" field of an IPv4 packet. - // 数据报文总长 - TotalLength uint16 - - // ID is the "identification" field of an IPv4 packet. - // 标识符 注意这个ID对于每个IP报文来说是唯一的 它的每个分片共享这个ID来标识它们同属一个报文 - ID uint16 - - // Flags is the "flags" field of an IPv4 packet. - // 标签 - Flags uint8 - - // FragmentOffset is the "fragment offset" field of an IPv4 packet. - // 分片偏移 - FragmentOffset uint16 - - // TTL is the "time to live" field of an IPv4 packet. 
- // 存活时间 - TTL uint8 - - // Protocol is the "protocol" field of an IPv4 packet. - // 表示的传输层协议 - Protocol uint8 - - // Checksum is the "checksum" field of an IPv4 packet. - // 首部校验和 - Checksum uint16 - - // SrcAddr is the "source ip address" of an IPv4 packet. - // 源IP地址 - SrcAddr tcpip.Address - - // DstAddr is the "destination ip address" of an IPv4 packet. - // 目的IP地址 - DstAddr tcpip.Address -} - -type IPv4 []byte - -const ( - // IPv4MinimumSize is the minimum size of a valid IPv4 packet. - IPv4MinimumSize = 20 - - // IPv4MaximumHeaderSize is the maximum size of an IPv4 header. Given - // that there are only 4 bits to represents the header length in 32-bit - // units, the header cannot exceed 15*4 = 60 bytes. - IPv4MaximumHeaderSize = 60 - - // IPv4AddressSize is the size, in bytes, of an IPv4 address. - IPv4AddressSize = 4 - - // IPv4ProtocolNumber is IPv4's network protocol number. - IPv4ProtocolNumber tcpip.NetworkProtocolNumber = 0x0800 - - // IPv4Version is the version of the ipv4 protocol. - IPv4Version = 4 - - // IPv4Broadcast is the broadcast address of the IPv4 procotol. - IPv4Broadcast tcpip.Address = "\xff\xff\xff\xff" - - // IPv4Any is the non-routable IPv4 "any" meta address. - IPv4Any tcpip.Address = "\x00\x00\x00\x00" -) - -// Flags that may be set in an IPv4 packet. -const ( - IPv4FlagMoreFragments = 1 << iota - IPv4FlagDontFragment -) - -func IPVersion(b []byte) int { - if len(b) < versIHL+1 { - return -1 - } - return int(b[versIHL] >> 4) -} - -// 首部长度说明首部有多少 32 位字(4 字节) 这个函数返回其实际占用的字节数 -func (b IPv4) HeaderLength() uint8 { - return (b[versIHL] & 0xf) * 4 -} - -func (b IPv4) ID() uint16 { - return binary.BigEndian.Uint16(b[id:]) -} - -// Protocol returns the value of the protocol field of the ipv4 header. -func (b IPv4) Protocol() uint8 { - return b[protocol] -} - -// Flags returns the "flags" field of the ipv4 header. 
-func (b IPv4) Flags() uint8 { - return uint8(binary.BigEndian.Uint16(b[flagsFO:]) >> 13) -} - -// TTL returns the "TTL" field of the ipv4 header. -func (b IPv4) TTL() uint8 { - return b[ttl] -} - -// FragmentOffset returns the "fragment offset" field of the ipv4 header. -func (b IPv4) FragmentOffset() uint16 { - return binary.BigEndian.Uint16(b[flagsFO:]) << 3 -} - -// TotalLength returns the "total length" field of the ipv4 header. -func (b IPv4) TotalLength() uint16 { - return binary.BigEndian.Uint16(b[totalLen:]) -} - -// Checksum returns the checksum field of the ipv4 header. -func (b IPv4) Checksum() uint16 { - return binary.BigEndian.Uint16(b[checksum:]) -} - -// SourceAddress returns the "source address" field of the ipv4 header. -func (b IPv4) SourceAddress() tcpip.Address { - return tcpip.Address(b[srcAddr : srcAddr+IPv4AddressSize]) -} - -// DestinationAddress returns the "destination address" field of the ipv4 -// header. -func (b IPv4) DestinationAddress() tcpip.Address { - return tcpip.Address(b[dstAddr : dstAddr+IPv4AddressSize]) -} - -// TransportProtocol implements Network.TransportProtocol. -func (b IPv4) TransportProtocol() tcpip.TransportProtocolNumber { - return tcpip.TransportProtocolNumber(b.Protocol()) -} - -// Payload implements Network.Payload. -func (b IPv4) Payload() []byte { - return b[b.HeaderLength():][:b.PayloadLength()] -} - -// PayloadLength returns the length of the payload portion of the ipv4 packet. -func (b IPv4) PayloadLength() uint16 { - return b.TotalLength() - uint16(b.HeaderLength()) -} - -// TOS returns the "type of service" field of the ipv4 header. -func (b IPv4) TOS() (uint8, uint32) { - return b[tos], 0 -} - -// SetTOS sets the "type of service" field of the ipv4 header. -func (b IPv4) SetTOS(v uint8, _ uint32) { - b[tos] = v -} - -// SetTotalLength sets the "total length" field of the ipv4 header. 
-func (b IPv4) SetTotalLength(totalLength uint16) { - binary.BigEndian.PutUint16(b[totalLen:], totalLength) -} - -// SetChecksum sets the checksum field of the ipv4 header. -func (b IPv4) SetChecksum(v uint16) { - binary.BigEndian.PutUint16(b[checksum:], v) -} - -// SetFlagsFragmentOffset sets the "flags" and "fragment offset" fields of the -// ipv4 header. -func (b IPv4) SetFlagsFragmentOffset(flags uint8, offset uint16) { - v := (uint16(flags) << 13) | (offset >> 3) - binary.BigEndian.PutUint16(b[flagsFO:], v) -} - -// SetSourceAddress sets the "source address" field of the ipv4 header. -func (b IPv4) SetSourceAddress(addr tcpip.Address) { - copy(b[srcAddr:srcAddr+IPv4AddressSize], addr) -} - -// SetDestinationAddress sets the "destination address" field of the ipv4 -// header. -func (b IPv4) SetDestinationAddress(addr tcpip.Address) { - copy(b[dstAddr:dstAddr+IPv4AddressSize], addr) -} - -// CalculateChecksum calculates the checksum of the ipv4 header. -func (b IPv4) CalculateChecksum() uint16 { - return Checksum(b[:b.HeaderLength()], 0) -} - -// Encode encodes all the fields of the ipv4 header. -func (b IPv4) Encode(i *IPv4Fields) { - b[versIHL] = (4 << 4) | ((i.IHL / 4) & 0xf) - b[tos] = i.TOS - b.SetTotalLength(i.TotalLength) - binary.BigEndian.PutUint16(b[id:], i.ID) - b.SetFlagsFragmentOffset(i.Flags, i.FragmentOffset) - b[ttl] = i.TTL - b[protocol] = i.Protocol - b.SetChecksum(i.Checksum) - copy(b[srcAddr:srcAddr+IPv4AddressSize], i.SrcAddr) - copy(b[dstAddr:dstAddr+IPv4AddressSize], i.DstAddr) -} - -// EncodePartial updates the total length and checksum fields of ipv4 header, -// taking in the partial checksum, which is the checksum of the header without -// the total length and checksum fields. It is useful in cases when similar -// packets are produced. 
-func (b IPv4) EncodePartial(partialChecksum, totalLength uint16) { - b.SetTotalLength(totalLength) - checksum := Checksum(b[totalLen:totalLen+2], partialChecksum) - b.SetChecksum(^checksum) -} - -// IsValid performs basic validation on the packet. -func (b IPv4) IsValid(pktSize int) bool { - if len(b) < IPv4MinimumSize { - return false - } - - hlen := int(b.HeaderLength()) - tlen := int(b.TotalLength()) - if hlen > tlen || tlen > pktSize { - return false - } - - return true -} - -// IsV4MulticastAddress determines if the provided address is an IPv4 multicast -// address (range 224.0.0.0 to 239.255.255.255). The four most significant bits -// will be 1110 = 0xe0. -func IsV4MulticastAddress(addr tcpip.Address) bool { - if len(addr) != IPv4AddressSize { - return false - } - return (addr[0] & 0xf0) == 0xe0 -} - -var ipv4Fmt string = ` -|% 4s|% 4s|% 8s| % 16s| -| % 16s|%s|%s|%s|% 11s| -| % 8s|% 8s|% 16s | -|% 32s | -|% 32s | -| Options | Padding | -%v -` - -type Types []struct{} - -func atoi[T int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32](i T) string { - return fmt.Sprintf("%d", i) -} - -func (b IPv4) String() string { - for i := range b.Payload() { - if i != int(b.PayloadLength()-1) && b.Payload()[i]^b.Payload()[i+1] != 0 { - return fmt.Sprintf(ipv4Fmt, atoi(IPVersion(b)), atoi(b.HeaderLength()), atoi(0), atoi(b.TotalLength()), - atoi(b.ID()), atoi(b.Flags()>>2), atoi((b.Flags()&2)>>1), atoi(b.Flags()&1), atoi(b.FragmentOffset()), - atoi(b.TTL()), atoi(b.Protocol()), atoi(b.Checksum()), - b.SourceAddress().String(), - b.DestinationAddress().String(), - b.Payload()) - } - } - return fmt.Sprintf(ipv4Fmt, atoi(IPVersion(b)), atoi(b.HeaderLength()), atoi(0), atoi(b.TotalLength()), - atoi(b.ID()), atoi(b.Flags()>>2), atoi((b.Flags()&2)>>1), atoi(b.Flags()&1), atoi(b.FragmentOffset()), - atoi(b.TTL()), atoi(b.Protocol()), atoi(b.Checksum()), - b.SourceAddress().String(), - b.DestinationAddress().String(), - fmt.Sprintf("%v x %d", b.Payload()[0], 
b.PayloadLength())) -} +package header + +import ( + "encoding/binary" + "fmt" + "netstack/tcpip" +) + +/* _ +|Version 4b|IHL 4b|Type of Service 8b| Total Length 16b | + ---------------------------------------------------------------- +| fragment ID 16b |R|DF|MF|Fragment Offset 13b| + ---------------------------------------------------------------- +| TTL 8b | Protocol 8b | Header Checksum 16b | 20 bytes + ---------------------------------------------------------------- +| Sorece IP Address 32b | + ---------------------------------------------------------------- +| Destination IP Address 32b | _ + ---------------------------------------------------------------- +| Options | Padding | +*/ + +const ( + versIHL = 0 + tos = 1 + totalLen = 2 + id = 4 + flagsFO = 6 + ttl = 8 + protocol = 9 + checksum = 10 + srcAddr = 12 + dstAddr = 16 +) + +// 表示IPv4头部信息的结构体 +type IPv4Fields struct { + // IHL is the "internet header length" field of an IPv4 packet. + // 头部长度 + IHL uint8 + + // TOS is the "type of service" field of an IPv4 packet. + // 服务区分的表示 + TOS uint8 + + // TotalLength is the "total length" field of an IPv4 packet. + // 数据报文总长 + TotalLength uint16 + + // ID is the "identification" field of an IPv4 packet. + // 标识符 注意这个ID对于每个IP报文来说是唯一的 它的每个分片共享这个ID来标识它们同属一个报文 + ID uint16 + + // Flags is the "flags" field of an IPv4 packet. + // 标签 + Flags uint8 + + // FragmentOffset is the "fragment offset" field of an IPv4 packet. + // 分片偏移 + FragmentOffset uint16 + + // TTL is the "time to live" field of an IPv4 packet. + // 存活时间 + TTL uint8 + + // Protocol is the "protocol" field of an IPv4 packet. + // 表示的传输层协议 + Protocol uint8 + + // Checksum is the "checksum" field of an IPv4 packet. + // 首部校验和 + Checksum uint16 + + // SrcAddr is the "source ip address" of an IPv4 packet. + // 源IP地址 + SrcAddr tcpip.Address + + // DstAddr is the "destination ip address" of an IPv4 packet. 
+ // 目的IP地址 + DstAddr tcpip.Address +} + +type IPv4 []byte + +const ( + // IPv4MinimumSize is the minimum size of a valid IPv4 packet. + IPv4MinimumSize = 20 + + // IPv4MaximumHeaderSize is the maximum size of an IPv4 header. Given + // that there are only 4 bits to represents the header length in 32-bit + // units, the header cannot exceed 15*4 = 60 bytes. + IPv4MaximumHeaderSize = 60 + + // IPv4AddressSize is the size, in bytes, of an IPv4 address. + IPv4AddressSize = 4 + + // IPv4ProtocolNumber is IPv4's network protocol number. + IPv4ProtocolNumber tcpip.NetworkProtocolNumber = 0x0800 + + // IPv4Version is the version of the ipv4 protocol. + IPv4Version = 4 + + // IPv4Broadcast is the broadcast address of the IPv4 procotol. + IPv4Broadcast tcpip.Address = "\xff\xff\xff\xff" + + // IPv4Any is the non-routable IPv4 "any" meta address. + IPv4Any tcpip.Address = "\x00\x00\x00\x00" +) + +// Flags that may be set in an IPv4 packet. +const ( + IPv4FlagMoreFragments = 1 << iota + IPv4FlagDontFragment +) + +func IPVersion(b []byte) int { + if len(b) < versIHL+1 { + return -1 + } + return int(b[versIHL] >> 4) +} + +// 首部长度说明首部有多少 32 位字(4 字节) 这个函数返回其实际占用的字节数 +func (b IPv4) HeaderLength() uint8 { + return (b[versIHL] & 0xf) * 4 +} + +func (b IPv4) ID() uint16 { + return binary.BigEndian.Uint16(b[id:]) +} + +// Protocol returns the value of the protocol field of the ipv4 header. +func (b IPv4) Protocol() uint8 { + return b[protocol] +} + +// Flags returns the "flags" field of the ipv4 header. +func (b IPv4) Flags() uint8 { + return uint8(binary.BigEndian.Uint16(b[flagsFO:]) >> 13) +} + +// TTL returns the "TTL" field of the ipv4 header. +func (b IPv4) TTL() uint8 { + return b[ttl] +} + +// FragmentOffset returns the "fragment offset" field of the ipv4 header. +func (b IPv4) FragmentOffset() uint16 { + return binary.BigEndian.Uint16(b[flagsFO:]) << 3 +} + +// TotalLength returns the "total length" field of the ipv4 header. 
+func (b IPv4) TotalLength() uint16 { + return binary.BigEndian.Uint16(b[totalLen:]) +} + +// Checksum returns the checksum field of the ipv4 header. +func (b IPv4) Checksum() uint16 { + return binary.BigEndian.Uint16(b[checksum:]) +} + +// SourceAddress returns the "source address" field of the ipv4 header. +func (b IPv4) SourceAddress() tcpip.Address { + return tcpip.Address(b[srcAddr : srcAddr+IPv4AddressSize]) +} + +// DestinationAddress returns the "destination address" field of the ipv4 +// header. +func (b IPv4) DestinationAddress() tcpip.Address { + return tcpip.Address(b[dstAddr : dstAddr+IPv4AddressSize]) +} + +// TransportProtocol implements Network.TransportProtocol. +func (b IPv4) TransportProtocol() tcpip.TransportProtocolNumber { + return tcpip.TransportProtocolNumber(b.Protocol()) +} + +// Payload implements Network.Payload. +func (b IPv4) Payload() []byte { + return b[b.HeaderLength():][:b.PayloadLength()] +} + +// PayloadLength returns the length of the payload portion of the ipv4 packet. +func (b IPv4) PayloadLength() uint16 { + return b.TotalLength() - uint16(b.HeaderLength()) +} + +// TOS returns the "type of service" field of the ipv4 header. +func (b IPv4) TOS() (uint8, uint32) { + return b[tos], 0 +} + +// SetTOS sets the "type of service" field of the ipv4 header. +func (b IPv4) SetTOS(v uint8, _ uint32) { + b[tos] = v +} + +// SetTotalLength sets the "total length" field of the ipv4 header. +func (b IPv4) SetTotalLength(totalLength uint16) { + binary.BigEndian.PutUint16(b[totalLen:], totalLength) +} + +// SetChecksum sets the checksum field of the ipv4 header. +func (b IPv4) SetChecksum(v uint16) { + binary.BigEndian.PutUint16(b[checksum:], v) +} + +// SetFlagsFragmentOffset sets the "flags" and "fragment offset" fields of the +// ipv4 header. 
+func (b IPv4) SetFlagsFragmentOffset(flags uint8, offset uint16) { + v := (uint16(flags) << 13) | (offset >> 3) + binary.BigEndian.PutUint16(b[flagsFO:], v) +} + +// SetSourceAddress sets the "source address" field of the ipv4 header. +func (b IPv4) SetSourceAddress(addr tcpip.Address) { + copy(b[srcAddr:srcAddr+IPv4AddressSize], addr) +} + +// SetDestinationAddress sets the "destination address" field of the ipv4 +// header. +func (b IPv4) SetDestinationAddress(addr tcpip.Address) { + copy(b[dstAddr:dstAddr+IPv4AddressSize], addr) +} + +// CalculateChecksum calculates the checksum of the ipv4 header. +func (b IPv4) CalculateChecksum() uint16 { + return Checksum(b[:b.HeaderLength()], 0) +} + +// Encode encodes all the fields of the ipv4 header. +func (b IPv4) Encode(i *IPv4Fields) { + b[versIHL] = (4 << 4) | ((i.IHL / 4) & 0xf) + b[tos] = i.TOS + b.SetTotalLength(i.TotalLength) + binary.BigEndian.PutUint16(b[id:], i.ID) + b.SetFlagsFragmentOffset(i.Flags, i.FragmentOffset) + b[ttl] = i.TTL + b[protocol] = i.Protocol + b.SetChecksum(i.Checksum) + copy(b[srcAddr:srcAddr+IPv4AddressSize], i.SrcAddr) + copy(b[dstAddr:dstAddr+IPv4AddressSize], i.DstAddr) +} + +// EncodePartial updates the total length and checksum fields of ipv4 header, +// taking in the partial checksum, which is the checksum of the header without +// the total length and checksum fields. It is useful in cases when similar +// packets are produced. +func (b IPv4) EncodePartial(partialChecksum, totalLength uint16) { + b.SetTotalLength(totalLength) + checksum := Checksum(b[totalLen:totalLen+2], partialChecksum) + b.SetChecksum(^checksum) +} + +// IsValid performs basic validation on the packet. 
+func (b IPv4) IsValid(pktSize int) bool { + if len(b) < IPv4MinimumSize { + return false + } + + hlen := int(b.HeaderLength()) + tlen := int(b.TotalLength()) + if hlen > tlen || tlen > pktSize { + return false + } + + return true +} + +// IsV4MulticastAddress determines if the provided address is an IPv4 multicast +// address (range 224.0.0.0 to 239.255.255.255). The four most significant bits +// will be 1110 = 0xe0. +func IsV4MulticastAddress(addr tcpip.Address) bool { + if len(addr) != IPv4AddressSize { + return false + } + return (addr[0] & 0xf0) == 0xe0 +} + +var ipv4Fmt string = ` +|% 4s|% 4s|% 8s| % 16s| +| % 16s|%s|%s|%s|% 11s| +| % 8s|% 8s|% 16s | +|% 32s | +|% 32s | +| Options | Padding | +%v +` + +type Types []struct{} + +func atoi[T int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32](i T) string { + return fmt.Sprintf("%d", i) +} + +func (b IPv4) String() string { + for i := range b.Payload() { + if i != int(b.PayloadLength()-1) && b.Payload()[i]^b.Payload()[i+1] != 0 { + return fmt.Sprintf(ipv4Fmt, atoi(IPVersion(b)), atoi(b.HeaderLength()), atoi(0), atoi(b.TotalLength()), + atoi(b.ID()), atoi(b.Flags()>>2), atoi((b.Flags()&2)>>1), atoi(b.Flags()&1), atoi(b.FragmentOffset()), + atoi(b.TTL()), atoi(b.Protocol()), atoi(b.Checksum()), + b.SourceAddress().String(), + b.DestinationAddress().String(), + b.Payload()) + } + } + return fmt.Sprintf(ipv4Fmt, atoi(IPVersion(b)), atoi(b.HeaderLength()), atoi(0), atoi(b.TotalLength()), + atoi(b.ID()), atoi(b.Flags()>>2), atoi((b.Flags()&2)>>1), atoi(b.Flags()&1), atoi(b.FragmentOffset()), + atoi(b.TTL()), atoi(b.Protocol()), atoi(b.Checksum()), + b.SourceAddress().String(), + b.DestinationAddress().String(), + fmt.Sprintf("%v x %d", b.Payload()[0], b.PayloadLength())) +} diff --git a/tcpip/header/ipv6.go b/tcpip/header/ipv6.go index d2fa4b2..9ab6752 100644 --- a/tcpip/header/ipv6.go +++ b/tcpip/header/ipv6.go @@ -1,218 +1,218 @@ -package header - -import ( - "encoding/binary" - "netstack/tcpip" - 
"strings" -) - -const ( - versTCFL = 0 - payloadLen = 4 - nextHdr = 6 - hopLimit = 7 - v6SrcAddr = 8 - v6DstAddr = 24 -) - -// IPv6Fields contains the fields of an IPv6 packet. It is used to describe the -// fields of a packet that needs to be encoded. -type IPv6Fields struct { - // TrafficClass is the "traffic class" field of an IPv6 packet. - TrafficClass uint8 - - // FlowLabel is the "flow label" field of an IPv6 packet. - FlowLabel uint32 - - // PayloadLength is the "payload length" field of an IPv6 packet. - PayloadLength uint16 - - // NextHeader is the "next header" field of an IPv6 packet. - NextHeader uint8 - - // HopLimit is the "hop limit" field of an IPv6 packet. - HopLimit uint8 - - // SrcAddr is the "source ip address" of an IPv6 packet. - SrcAddr tcpip.Address - - // DstAddr is the "destination ip address" of an IPv6 packet. - DstAddr tcpip.Address -} - -// IPv6 represents an ipv6 header stored in a byte array. -// Most of the methods of IPv6 access to the underlying slice without -// checking the boundaries and could panic because of 'index out of range'. -// Always call IsValid() to validate an instance of IPv6 before using other methods. -type IPv6 []byte - -const ( - // IPv6MinimumSize is the minimum size of a valid IPv6 packet. - IPv6MinimumSize = 40 - - // IPv6AddressSize is the size, in bytes, of an IPv6 address. - IPv6AddressSize = 16 - - // IPv6ProtocolNumber is IPv6's network protocol number. - IPv6ProtocolNumber tcpip.NetworkProtocolNumber = 0x86dd - - // IPv6Version is the version of the ipv6 protocol. - IPv6Version = 6 - - // IPv6MinimumMTU is the minimum MTU required by IPv6, per RFC 2460, - // section 5. - IPv6MinimumMTU = 1280 -) - -// PayloadLength returns the value of the "payload length" field of the ipv6 -// header. -func (b IPv6) PayloadLength() uint16 { - return binary.BigEndian.Uint16(b[payloadLen:]) -} - -// HopLimit returns the value of the "hop limit" field of the ipv6 header. 
-func (b IPv6) HopLimit() uint8 { - return b[hopLimit] -} - -// NextHeader returns the value of the "next header" field of the ipv6 header. -func (b IPv6) NextHeader() uint8 { - return b[nextHdr] -} - -// TransportProtocol implements Network.TransportProtocol. -func (b IPv6) TransportProtocol() tcpip.TransportProtocolNumber { - return tcpip.TransportProtocolNumber(b.NextHeader()) -} - -// Payload implements Network.Payload. -func (b IPv6) Payload() []byte { - return b[IPv6MinimumSize:][:b.PayloadLength()] -} - -// SourceAddress returns the "source address" field of the ipv6 header. -func (b IPv6) SourceAddress() tcpip.Address { - return tcpip.Address(b[v6SrcAddr : v6SrcAddr+IPv6AddressSize]) -} - -// DestinationAddress returns the "destination address" field of the ipv6 -// header. -func (b IPv6) DestinationAddress() tcpip.Address { - return tcpip.Address(b[v6DstAddr : v6DstAddr+IPv6AddressSize]) -} - -// Checksum implements Network.Checksum. Given that IPv6 doesn't have a -// checksum, it just returns 0. -func (IPv6) Checksum() uint16 { - return 0 -} - -// TOS returns the "traffic class" and "flow label" fields of the ipv6 header. -func (b IPv6) TOS() (uint8, uint32) { - v := binary.BigEndian.Uint32(b[versTCFL:]) - return uint8(v >> 20), v & 0xfffff -} - -// SetTOS sets the "traffic class" and "flow label" fields of the ipv6 header. -func (b IPv6) SetTOS(t uint8, l uint32) { - vtf := (6 << 28) | (uint32(t) << 20) | (l & 0xfffff) - binary.BigEndian.PutUint32(b[versTCFL:], vtf) -} - -// SetPayloadLength sets the "payload length" field of the ipv6 header. -func (b IPv6) SetPayloadLength(payloadLength uint16) { - binary.BigEndian.PutUint16(b[payloadLen:], payloadLength) -} - -// SetSourceAddress sets the "source address" field of the ipv6 header. -func (b IPv6) SetSourceAddress(addr tcpip.Address) { - copy(b[v6SrcAddr:v6SrcAddr+IPv6AddressSize], addr) -} - -// SetDestinationAddress sets the "destination address" field of the ipv6 -// header. 
-func (b IPv6) SetDestinationAddress(addr tcpip.Address) { - copy(b[v6DstAddr:v6DstAddr+IPv6AddressSize], addr) -} - -// SetNextHeader sets the value of the "next header" field of the ipv6 header. -func (b IPv6) SetNextHeader(v uint8) { - b[nextHdr] = v -} - -// SetChecksum implements Network.SetChecksum. Given that IPv6 doesn't have a -// checksum, it is empty. -func (IPv6) SetChecksum(uint16) { -} - -// Encode encodes all the fields of the ipv6 header. -func (b IPv6) Encode(i *IPv6Fields) { - b.SetTOS(i.TrafficClass, i.FlowLabel) - b.SetPayloadLength(i.PayloadLength) - b[nextHdr] = i.NextHeader - b[hopLimit] = i.HopLimit - copy(b[v6SrcAddr:v6SrcAddr+IPv6AddressSize], i.SrcAddr) - copy(b[v6DstAddr:v6DstAddr+IPv6AddressSize], i.DstAddr) -} - -// IsValid performs basic validation on the packet. -func (b IPv6) IsValid(pktSize int) bool { - if len(b) < IPv6MinimumSize { - return false - } - - dlen := int(b.PayloadLength()) - - return dlen <= pktSize-IPv6MinimumSize -} - -// IsV4MappedAddress determines if the provided address is an IPv4 mapped -// address by checking if its prefix is 0:0:0:0:0:ffff::/96. -func IsV4MappedAddress(addr tcpip.Address) bool { - if len(addr) != IPv6AddressSize { - return false - } - - return strings.HasPrefix(string(addr), "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff") -} - -// IsV6MulticastAddress determines if the provided address is an IPv6 -// multicast address (anything starting with FF). -func IsV6MulticastAddress(addr tcpip.Address) bool { - if len(addr) != IPv6AddressSize { - return false - } - return addr[0] == 0xff -} - -// SolicitedNodeAddr computes the solicited-node multicast address. This is -// used for NDP. Described in RFC 4291. The argument must be a full-length IPv6 -// address. 
-func SolicitedNodeAddr(addr tcpip.Address) tcpip.Address { - const solicitedNodeMulticastPrefix = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff" - return solicitedNodeMulticastPrefix + addr[len(addr)-3:] -} - -// LinkLocalAddr computes the default IPv6 link-local address from a link-layer -// (MAC) address. -func LinkLocalAddr(linkAddr tcpip.LinkAddress) tcpip.Address { - // Convert a 48-bit MAC to an EUI-64 and then prepend the link-local - // header, FE80::. - // - // The conversion is very nearly: - // aa:bb:cc:dd:ee:ff => FE80::Aabb:ccFF:FEdd:eeff - // Note the capital A. The conversion aa->Aa involves a bit flip. - lladdrb := [16]byte{ - 0: 0xFE, - 1: 0x80, - 8: linkAddr[0] ^ 2, - 9: linkAddr[1], - 10: linkAddr[2], - 11: 0xFF, - 12: 0xFE, - 13: linkAddr[3], - 14: linkAddr[4], - 15: linkAddr[5], - } - return tcpip.Address(lladdrb[:]) -} +package header + +import ( + "encoding/binary" + "netstack/tcpip" + "strings" +) + +const ( + versTCFL = 0 + payloadLen = 4 + nextHdr = 6 + hopLimit = 7 + v6SrcAddr = 8 + v6DstAddr = 24 +) + +// IPv6Fields contains the fields of an IPv6 packet. It is used to describe the +// fields of a packet that needs to be encoded. +type IPv6Fields struct { + // TrafficClass is the "traffic class" field of an IPv6 packet. + TrafficClass uint8 + + // FlowLabel is the "flow label" field of an IPv6 packet. + FlowLabel uint32 + + // PayloadLength is the "payload length" field of an IPv6 packet. + PayloadLength uint16 + + // NextHeader is the "next header" field of an IPv6 packet. + NextHeader uint8 + + // HopLimit is the "hop limit" field of an IPv6 packet. + HopLimit uint8 + + // SrcAddr is the "source ip address" of an IPv6 packet. + SrcAddr tcpip.Address + + // DstAddr is the "destination ip address" of an IPv6 packet. + DstAddr tcpip.Address +} + +// IPv6 represents an ipv6 header stored in a byte array. 
+// Most of the methods of IPv6 access to the underlying slice without +// checking the boundaries and could panic because of 'index out of range'. +// Always call IsValid() to validate an instance of IPv6 before using other methods. +type IPv6 []byte + +const ( + // IPv6MinimumSize is the minimum size of a valid IPv6 packet. + IPv6MinimumSize = 40 + + // IPv6AddressSize is the size, in bytes, of an IPv6 address. + IPv6AddressSize = 16 + + // IPv6ProtocolNumber is IPv6's network protocol number. + IPv6ProtocolNumber tcpip.NetworkProtocolNumber = 0x86dd + + // IPv6Version is the version of the ipv6 protocol. + IPv6Version = 6 + + // IPv6MinimumMTU is the minimum MTU required by IPv6, per RFC 2460, + // section 5. + IPv6MinimumMTU = 1280 +) + +// PayloadLength returns the value of the "payload length" field of the ipv6 +// header. +func (b IPv6) PayloadLength() uint16 { + return binary.BigEndian.Uint16(b[payloadLen:]) +} + +// HopLimit returns the value of the "hop limit" field of the ipv6 header. +func (b IPv6) HopLimit() uint8 { + return b[hopLimit] +} + +// NextHeader returns the value of the "next header" field of the ipv6 header. +func (b IPv6) NextHeader() uint8 { + return b[nextHdr] +} + +// TransportProtocol implements Network.TransportProtocol. +func (b IPv6) TransportProtocol() tcpip.TransportProtocolNumber { + return tcpip.TransportProtocolNumber(b.NextHeader()) +} + +// Payload implements Network.Payload. +func (b IPv6) Payload() []byte { + return b[IPv6MinimumSize:][:b.PayloadLength()] +} + +// SourceAddress returns the "source address" field of the ipv6 header. +func (b IPv6) SourceAddress() tcpip.Address { + return tcpip.Address(b[v6SrcAddr : v6SrcAddr+IPv6AddressSize]) +} + +// DestinationAddress returns the "destination address" field of the ipv6 +// header. +func (b IPv6) DestinationAddress() tcpip.Address { + return tcpip.Address(b[v6DstAddr : v6DstAddr+IPv6AddressSize]) +} + +// Checksum implements Network.Checksum. 
Given that IPv6 doesn't have a +// checksum, it just returns 0. +func (IPv6) Checksum() uint16 { + return 0 +} + +// TOS returns the "traffic class" and "flow label" fields of the ipv6 header. +func (b IPv6) TOS() (uint8, uint32) { + v := binary.BigEndian.Uint32(b[versTCFL:]) + return uint8(v >> 20), v & 0xfffff +} + +// SetTOS sets the "traffic class" and "flow label" fields of the ipv6 header. +func (b IPv6) SetTOS(t uint8, l uint32) { + vtf := (6 << 28) | (uint32(t) << 20) | (l & 0xfffff) + binary.BigEndian.PutUint32(b[versTCFL:], vtf) +} + +// SetPayloadLength sets the "payload length" field of the ipv6 header. +func (b IPv6) SetPayloadLength(payloadLength uint16) { + binary.BigEndian.PutUint16(b[payloadLen:], payloadLength) +} + +// SetSourceAddress sets the "source address" field of the ipv6 header. +func (b IPv6) SetSourceAddress(addr tcpip.Address) { + copy(b[v6SrcAddr:v6SrcAddr+IPv6AddressSize], addr) +} + +// SetDestinationAddress sets the "destination address" field of the ipv6 +// header. +func (b IPv6) SetDestinationAddress(addr tcpip.Address) { + copy(b[v6DstAddr:v6DstAddr+IPv6AddressSize], addr) +} + +// SetNextHeader sets the value of the "next header" field of the ipv6 header. +func (b IPv6) SetNextHeader(v uint8) { + b[nextHdr] = v +} + +// SetChecksum implements Network.SetChecksum. Given that IPv6 doesn't have a +// checksum, it is empty. +func (IPv6) SetChecksum(uint16) { +} + +// Encode encodes all the fields of the ipv6 header. +func (b IPv6) Encode(i *IPv6Fields) { + b.SetTOS(i.TrafficClass, i.FlowLabel) + b.SetPayloadLength(i.PayloadLength) + b[nextHdr] = i.NextHeader + b[hopLimit] = i.HopLimit + copy(b[v6SrcAddr:v6SrcAddr+IPv6AddressSize], i.SrcAddr) + copy(b[v6DstAddr:v6DstAddr+IPv6AddressSize], i.DstAddr) +} + +// IsValid performs basic validation on the packet. 
+func (b IPv6) IsValid(pktSize int) bool { + if len(b) < IPv6MinimumSize { + return false + } + + dlen := int(b.PayloadLength()) + + return dlen <= pktSize-IPv6MinimumSize +} + +// IsV4MappedAddress determines if the provided address is an IPv4 mapped +// address by checking if its prefix is 0:0:0:0:0:ffff::/96. +func IsV4MappedAddress(addr tcpip.Address) bool { + if len(addr) != IPv6AddressSize { + return false + } + + return strings.HasPrefix(string(addr), "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff") +} + +// IsV6MulticastAddress determines if the provided address is an IPv6 +// multicast address (anything starting with FF). +func IsV6MulticastAddress(addr tcpip.Address) bool { + if len(addr) != IPv6AddressSize { + return false + } + return addr[0] == 0xff +} + +// SolicitedNodeAddr computes the solicited-node multicast address. This is +// used for NDP. Described in RFC 4291. The argument must be a full-length IPv6 +// address. +func SolicitedNodeAddr(addr tcpip.Address) tcpip.Address { + const solicitedNodeMulticastPrefix = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff" + return solicitedNodeMulticastPrefix + addr[len(addr)-3:] +} + +// LinkLocalAddr computes the default IPv6 link-local address from a link-layer +// (MAC) address. +func LinkLocalAddr(linkAddr tcpip.LinkAddress) tcpip.Address { + // Convert a 48-bit MAC to an EUI-64 and then prepend the link-local + // header, FE80::. + // + // The conversion is very nearly: + // aa:bb:cc:dd:ee:ff => FE80::Aabb:ccFF:FEdd:eeff + // Note the capital A. The conversion aa->Aa involves a bit flip. 
+ lladdrb := [16]byte{ + 0: 0xFE, + 1: 0x80, + 8: linkAddr[0] ^ 2, + 9: linkAddr[1], + 10: linkAddr[2], + 11: 0xFF, + 12: 0xFE, + 13: linkAddr[3], + 14: linkAddr[4], + 15: linkAddr[5], + } + return tcpip.Address(lladdrb[:]) +} diff --git a/tcpip/header/ipv6_fragment.go b/tcpip/header/ipv6_fragment.go index 1b4362c..b9f739d 100644 --- a/tcpip/header/ipv6_fragment.go +++ b/tcpip/header/ipv6_fragment.go @@ -1,146 +1,146 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package header - -import ( - "encoding/binary" - - "netstack/tcpip" -) - -const ( - nextHdrFrag = 0 - fragOff = 2 - more = 3 - idV6 = 4 -) - -// IPv6FragmentFields contains the fields of an IPv6 fragment. It is used to describe the -// fields of a packet that needs to be encoded. -type IPv6FragmentFields struct { - // NextHeader is the "next header" field of an IPv6 fragment. - NextHeader uint8 - - // FragmentOffset is the "fragment offset" field of an IPv6 fragment. - FragmentOffset uint16 - - // M is the "more" field of an IPv6 fragment. - M bool - - // Identification is the "identification" field of an IPv6 fragment. - Identification uint32 -} - -// IPv6Fragment represents an ipv6 fragment header stored in a byte array. -// Most of the methods of IPv6Fragment access to the underlying slice without -// checking the boundaries and could panic because of 'index out of range'. 
-// Always call IsValid() to validate an instance of IPv6Fragment before using other methods. -type IPv6Fragment []byte - -const ( - // IPv6FragmentHeader header is the number used to specify that the next - // header is a fragment header, per RFC 2460. - IPv6FragmentHeader = 44 - - // IPv6FragmentHeaderSize is the size of the fragment header. - IPv6FragmentHeaderSize = 8 -) - -// Encode encodes all the fields of the ipv6 fragment. -func (b IPv6Fragment) Encode(i *IPv6FragmentFields) { - b[nextHdrFrag] = i.NextHeader - binary.BigEndian.PutUint16(b[fragOff:], i.FragmentOffset<<3) - if i.M { - b[more] |= 1 - } - binary.BigEndian.PutUint32(b[idV6:], i.Identification) -} - -// IsValid performs basic validation on the fragment header. -func (b IPv6Fragment) IsValid() bool { - return len(b) >= IPv6FragmentHeaderSize -} - -// NextHeader returns the value of the "next header" field of the ipv6 fragment. -func (b IPv6Fragment) NextHeader() uint8 { - return b[nextHdrFrag] -} - -// FragmentOffset returns the "fragment offset" field of the ipv6 fragment. -func (b IPv6Fragment) FragmentOffset() uint16 { - return binary.BigEndian.Uint16(b[fragOff:]) >> 3 -} - -// More returns the "more" field of the ipv6 fragment. -func (b IPv6Fragment) More() bool { - return b[more]&1 > 0 -} - -// Payload implements Network.Payload. -func (b IPv6Fragment) Payload() []byte { - return b[IPv6FragmentHeaderSize:] -} - -// ID returns the value of the identifier field of the ipv6 fragment. -func (b IPv6Fragment) ID() uint32 { - return binary.BigEndian.Uint32(b[idV6:]) -} - -// TransportProtocol implements Network.TransportProtocol. -func (b IPv6Fragment) TransportProtocol() tcpip.TransportProtocolNumber { - return tcpip.TransportProtocolNumber(b.NextHeader()) -} - -// The functions below have been added only to satisfy the Network interface. - -// Checksum is not supported by IPv6Fragment. 
-func (b IPv6Fragment) Checksum() uint16 { - panic("not supported") -} - -// SourceAddress is not supported by IPv6Fragment. -func (b IPv6Fragment) SourceAddress() tcpip.Address { - panic("not supported") -} - -// DestinationAddress is not supported by IPv6Fragment. -func (b IPv6Fragment) DestinationAddress() tcpip.Address { - panic("not supported") -} - -// SetSourceAddress is not supported by IPv6Fragment. -func (b IPv6Fragment) SetSourceAddress(tcpip.Address) { - panic("not supported") -} - -// SetDestinationAddress is not supported by IPv6Fragment. -func (b IPv6Fragment) SetDestinationAddress(tcpip.Address) { - panic("not supported") -} - -// SetChecksum is not supported by IPv6Fragment. -func (b IPv6Fragment) SetChecksum(uint16) { - panic("not supported") -} - -// TOS is not supported by IPv6Fragment. -func (b IPv6Fragment) TOS() (uint8, uint32) { - panic("not supported") -} - -// SetTOS is not supported by IPv6Fragment. -func (b IPv6Fragment) SetTOS(t uint8, l uint32) { - panic("not supported") -} +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package header + +import ( + "encoding/binary" + + "netstack/tcpip" +) + +const ( + nextHdrFrag = 0 + fragOff = 2 + more = 3 + idV6 = 4 +) + +// IPv6FragmentFields contains the fields of an IPv6 fragment. It is used to describe the +// fields of a packet that needs to be encoded. +type IPv6FragmentFields struct { + // NextHeader is the "next header" field of an IPv6 fragment. 
+ NextHeader uint8 + + // FragmentOffset is the "fragment offset" field of an IPv6 fragment. + FragmentOffset uint16 + + // M is the "more" field of an IPv6 fragment. + M bool + + // Identification is the "identification" field of an IPv6 fragment. + Identification uint32 +} + +// IPv6Fragment represents an ipv6 fragment header stored in a byte array. +// Most of the methods of IPv6Fragment access to the underlying slice without +// checking the boundaries and could panic because of 'index out of range'. +// Always call IsValid() to validate an instance of IPv6Fragment before using other methods. +type IPv6Fragment []byte + +const ( + // IPv6FragmentHeader header is the number used to specify that the next + // header is a fragment header, per RFC 2460. + IPv6FragmentHeader = 44 + + // IPv6FragmentHeaderSize is the size of the fragment header. + IPv6FragmentHeaderSize = 8 +) + +// Encode encodes all the fields of the ipv6 fragment. +func (b IPv6Fragment) Encode(i *IPv6FragmentFields) { + b[nextHdrFrag] = i.NextHeader + binary.BigEndian.PutUint16(b[fragOff:], i.FragmentOffset<<3) + if i.M { + b[more] |= 1 + } + binary.BigEndian.PutUint32(b[idV6:], i.Identification) +} + +// IsValid performs basic validation on the fragment header. +func (b IPv6Fragment) IsValid() bool { + return len(b) >= IPv6FragmentHeaderSize +} + +// NextHeader returns the value of the "next header" field of the ipv6 fragment. +func (b IPv6Fragment) NextHeader() uint8 { + return b[nextHdrFrag] +} + +// FragmentOffset returns the "fragment offset" field of the ipv6 fragment. +func (b IPv6Fragment) FragmentOffset() uint16 { + return binary.BigEndian.Uint16(b[fragOff:]) >> 3 +} + +// More returns the "more" field of the ipv6 fragment. +func (b IPv6Fragment) More() bool { + return b[more]&1 > 0 +} + +// Payload implements Network.Payload. +func (b IPv6Fragment) Payload() []byte { + return b[IPv6FragmentHeaderSize:] +} + +// ID returns the value of the identifier field of the ipv6 fragment. 
+func (b IPv6Fragment) ID() uint32 { + return binary.BigEndian.Uint32(b[idV6:]) +} + +// TransportProtocol implements Network.TransportProtocol. +func (b IPv6Fragment) TransportProtocol() tcpip.TransportProtocolNumber { + return tcpip.TransportProtocolNumber(b.NextHeader()) +} + +// The functions below have been added only to satisfy the Network interface. + +// Checksum is not supported by IPv6Fragment. +func (b IPv6Fragment) Checksum() uint16 { + panic("not supported") +} + +// SourceAddress is not supported by IPv6Fragment. +func (b IPv6Fragment) SourceAddress() tcpip.Address { + panic("not supported") +} + +// DestinationAddress is not supported by IPv6Fragment. +func (b IPv6Fragment) DestinationAddress() tcpip.Address { + panic("not supported") +} + +// SetSourceAddress is not supported by IPv6Fragment. +func (b IPv6Fragment) SetSourceAddress(tcpip.Address) { + panic("not supported") +} + +// SetDestinationAddress is not supported by IPv6Fragment. +func (b IPv6Fragment) SetDestinationAddress(tcpip.Address) { + panic("not supported") +} + +// SetChecksum is not supported by IPv6Fragment. +func (b IPv6Fragment) SetChecksum(uint16) { + panic("not supported") +} + +// TOS is not supported by IPv6Fragment. +func (b IPv6Fragment) TOS() (uint8, uint32) { + panic("not supported") +} + +// SetTOS is not supported by IPv6Fragment. +func (b IPv6Fragment) SetTOS(t uint8, l uint32) { + panic("not supported") +} diff --git a/tcpip/header/udp.go b/tcpip/header/udp.go index 5a936e6..4df3682 100644 --- a/tcpip/header/udp.go +++ b/tcpip/header/udp.go @@ -1,38 +1,38 @@ -package header - -import "netstack/tcpip" - -const ( - udpSrcPort = 0 - udpDstPort = 2 - udpLength = 4 - udpChecksum = 6 -) - -// UDPFields contains the fields of a UDP packet. It is used to describe the -// fields of a packet that needs to be encoded. -// udp 首部字段 -type UDPFields struct { - // SrcPort is the "source port" field of a UDP packet. 
- SrcPort uint16 - - // DstPort is the "destination port" field of a UDP packet. - DstPort uint16 - - // Length is the "length" field of a UDP packet. - Length uint16 - - // Checksum is the "checksum" field of a UDP packet. - Checksum uint16 -} - -// UDP represents a UDP header stored in a byte array. -type UDP []byte - -const ( - // UDPMinimumSize is the minimum size of a valid UDP packet. - UDPMinimumSize = 8 - - // UDPProtocolNumber is UDP's transport protocol number. - UDPProtocolNumber tcpip.TransportProtocolNumber = 17 -) +package header + +import "netstack/tcpip" + +const ( + udpSrcPort = 0 + udpDstPort = 2 + udpLength = 4 + udpChecksum = 6 +) + +// UDPFields contains the fields of a UDP packet. It is used to describe the +// fields of a packet that needs to be encoded. +// udp 首部字段 +type UDPFields struct { + // SrcPort is the "source port" field of a UDP packet. + SrcPort uint16 + + // DstPort is the "destination port" field of a UDP packet. + DstPort uint16 + + // Length is the "length" field of a UDP packet. + Length uint16 + + // Checksum is the "checksum" field of a UDP packet. + Checksum uint16 +} + +// UDP represents a UDP header stored in a byte array. +type UDP []byte + +const ( + // UDPMinimumSize is the minimum size of a valid UDP packet. + UDPMinimumSize = 8 + + // UDPProtocolNumber is UDP's transport protocol number. + UDPProtocolNumber tcpip.TransportProtocolNumber = 17 +) diff --git a/tcpip/link/README.md b/tcpip/link/README.md index 42b893d..169c612 100644 --- a/tcpip/link/README.md +++ b/tcpip/link/README.md @@ -1,107 +1,107 @@ -# 链路层的介绍和基本实现 - -## 链路层的目的 - -数据链路层属于计算机网络的底层,使用的信道主要有点对点信道和广播信道两种类型。 在 TCP/IP 协议族中,数据链路层主要有以下几个目的: - -1. 接收和发送链路层数据,提供 io 的能力。 -2. 为 IP 模块发送和接收数据 -3. 为 ARP 模块发送 ARP 请求和接收 ARP 应答 -4. 
为 RARP 模块发送 RARP 请求和接收 RARP 应答 - - TCP/IP 支持多种不同的链路层协议,这取决于网络所使用的硬件。 数据链路层的协议数据单元—帧:将 IP 层(网络层)的数据报添加首部和尾部封装成帧。 数据链路层协议有许多种,都会解决三个基本问题,封装成帧,透明传输,差错检测。 - -## 以太网介绍 - -我们这章讲的是链路层,为何要讲以太网,那是因为以太网实在应用太广了,以至于我们在现实生活中看到的链路层协议的数据封装都是以太网协议封装的,所以要实现链路层数据的处理,我们必须要了解以太网。 - -以太网(Ethernet)是一种计算机局域网技术。IEEE 组织的 IEEE 802.3 标准制定了以太网的技术标准,它规定了包括物理层的连线、电子信号和介质访问层协议的内容。以太网是目前应用最普遍的局域网技术,取代了其他局域网标准如令牌环、FDDI 和 ARCNET。以太网协议,是当今现有局域网采用的最通用的通信协议标准,故可认为以太网就是局域网。 - - -## 链路层的寻址 - -通信当然得知道发送者的地址和接受者的地址,这是最基础的。以太网规定,所有连入网络的设备,都必须具有“网卡”接口。然后数据包是从一块网卡,传输到另一块网卡的。网卡的地址,就是数据包的发送地址和接收地址,叫做 MAC 地址,也叫物理地址,这是最底层的地址。每块网卡出厂的时候,都有一个全世界独一无二的 MAC 地址,长度是 48 个二进制位,通常用 12 个十六进制数表示。有了这个地址,我们可以定位网卡和数据包的路径了。 - - -## MTU(最大传输单元) - -MTU 表示在链路层最大的传输单元,也就是链路层一帧数据的数据内容最大长度,单位为字节,MTU 是协议栈实现一个很重要的参数,请大家务必理解该参数。一般网卡默认 MTU 是 1500,当你往网卡写入的内容超过 1518bytes,就会报错,后面我们可以写代码试试。 - - -## 链路实现的分层 - -链路层的实现可以分为三层,真实的以太网卡,网卡驱动,网卡逻辑抽象。 - -真实的网卡我们不关心,因为那是硬件工程,我们只需要知道,它能接收和发送网络数据给网卡驱动就好了。网卡驱动我们也不关心,一般驱动都是网卡生产商就写好了,我们只需知道,它能接收协议栈的数据发送给网卡,接收网卡的数据发送给协议栈。网卡逻辑抽象表示,这个是我们关心的,我需要对真实的网卡进行抽象, - -一个 eth0 以太网网卡,一个 lo 本地回环网卡。还可以看到两个网卡的信息,当我们要表示一个网卡的时候,需要具备几个属性: - -1. 网卡的名字、类型和 MAC 地址 -- eth0 Link encap:Ethernet HWaddr 00:16:3e:08:a1:7a - - eth0是网卡名,方便表示一个网卡,网卡名在同个系统里不能重复。 - - Link encap:Ethernet 表示该网卡类型为以太网网卡。 - - HWaddr 00:16:3e:08:a1:7a 表示 MAC 地址 00:16:3e:08:a1:7a,是链路层寻址的地址。 - -2. 网卡的 IP 地址及掩码 -- inet addr:172.18.153.158 Bcast:172.18.159.255 Mask:255.255.240.0 - - inet addr:172.18.153.158 表示该网卡的 ipv4 地址是 172.18.153.158。 - - Bcast:172.18.159.255 表示该网卡 ip 层的广播地址。 - - 255.255.240.0 该网卡的子网掩码。 - -3. 网卡的状态和 MTU -- UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 - - UP BROADCAST RUNNING MULTICAST 都是表示网卡的状态,UP(代表网卡开启状态) BROADCAST (支持广播) RUNNING(代表网卡的网线被接上)MULTICAST(支持组播)。 - - MTU:1500 最大传输单元为 1500 字节。 - - Metric:1 接口度量值为 1,接口度量值表示在这个路径上发送一个分组的成本。 - - -**实现协议栈,我们需要一个网卡,因为这样我们才能接收和发送网络数据,但是一般情况下,我们电脑的操作系统已经帮我们管理好网卡了,我们想实现自由的控制网卡是不太方便的,还好 linux 系统还有另一个功能-虚拟网卡,它是操作系统虚拟出来的一个网卡,我们协议栈的实现都是基于虚拟网卡** - - -## 虚拟网卡的好处 - -1. 
对于用户来说虚拟网卡和真实网卡几乎没有差别,而且我们控制或更改虚拟网卡大部分情况下不会影响到真实的网卡,也就不会影响到用户的网络。 -2. 虚拟网卡的数据可以直接从用户态直接读取和写入,这样我们就可以直接在用户态编写协议栈。 - - -## Linux 中虚拟网络设备 - -TUN/TAP 设备、VETH 设备、Bridge 设备、Bond 设备、VLAN 设备、MACVTAP 设备,下面我们只讲 tun/tap 设备,其他虚拟设备感兴趣的同学可以去网上自行搜索。 - -TAP/TUN 设备是一种让用户态和内核之间进行数据交换的虚拟设备,TAP 工作在二层,TUN 工作在三层,TAP/TUN 网卡的两头分别是内核网络协议栈和用户层,其作用是将协议栈中的部分数据包转发给用户空间的应用程序,给用户空间的程序一个处理数据包的机会。 - -当我们想在 linux 中创建一个 TAP 设备时,其实很容易,像普通文件一样打开字符设备 /dev/net/tun 可以得到一个文件描述符,接着用系统调用 ioctl 将文件描述符和 kernel 的 tap 驱动绑定在一起,那么之后对该文件描述符的读写就是对虚拟网卡 TAP 的读写。 - -``` sh -# 创建一个tap模式的虚拟网卡tap0 -sudo ip tuntap add mode tap tap0 -# 开启该网卡 -sudo ip link set tap0 up -# 设置该网卡的ip及掩码 -sudo ip addr add 192.168.1.1/24 dev tap0 - -tap0 Link encap:Ethernet HWaddr 22:e2:f2:93:ff:bf - inet addr:192.168.1.1 Bcast:0.0.0.0 Mask:255.255.255.0 - UP BROADCAST MULTICAST MTU:1500 Metric:1 - RX packets:0 errors:0 dropped:0 overruns:0 frame:0 - TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 - collisions:0 txqueuelen:1000 - RX bytes:0 (0.0 B) TX bytes:0 (0.0 B) - -sudo ip tuntap del mode tap tap0 -``` - - -## 链路层数据帧 - -|dst MAC(6B)|src MAC(6B)|type(2B)|data(46B - 1500B)| - -1. 目的 MAC 地址:目的设备的 MAC 物理地址。 -2. 源 MAC 地址:发送设备的 MAC 物理地址。 -3. 类型:表示后面所跟数据包的协议类型,例如 Type 为 0x8000 时为 IPv4 协议包,Type 为 0x8060 时,后面为 ARP 协议包。 -4. 数据:表示该帧的数据内容,长度为 46 ~ 1500 字节,包含网络层、传输层和应用层的数据。 - - - - - +# 链路层的介绍和基本实现 + +## 链路层的目的 + +数据链路层属于计算机网络的底层,使用的信道主要有点对点信道和广播信道两种类型。 在 TCP/IP 协议族中,数据链路层主要有以下几个目的: + +1. 接收和发送链路层数据,提供 io 的能力。 +2. 为 IP 模块发送和接收数据 +3. 为 ARP 模块发送 ARP 请求和接收 ARP 应答 +4. 
为 RARP 模块发送 RARP 请求和接收 RARP 应答 + + TCP/IP 支持多种不同的链路层协议,这取决于网络所使用的硬件。 数据链路层的协议数据单元—帧:将 IP 层(网络层)的数据报添加首部和尾部封装成帧。 数据链路层协议有许多种,都会解决三个基本问题,封装成帧,透明传输,差错检测。 + +## 以太网介绍 + +我们这章讲的是链路层,为何要讲以太网,那是因为以太网实在应用太广了,以至于我们在现实生活中看到的链路层协议的数据封装都是以太网协议封装的,所以要实现链路层数据的处理,我们必须要了解以太网。 + +以太网(Ethernet)是一种计算机局域网技术。IEEE 组织的 IEEE 802.3 标准制定了以太网的技术标准,它规定了包括物理层的连线、电子信号和介质访问层协议的内容。以太网是目前应用最普遍的局域网技术,取代了其他局域网标准如令牌环、FDDI 和 ARCNET。以太网协议,是当今现有局域网采用的最通用的通信协议标准,故可认为以太网就是局域网。 + + +## 链路层的寻址 + +通信当然得知道发送者的地址和接受者的地址,这是最基础的。以太网规定,所有连入网络的设备,都必须具有“网卡”接口。然后数据包是从一块网卡,传输到另一块网卡的。网卡的地址,就是数据包的发送地址和接收地址,叫做 MAC 地址,也叫物理地址,这是最底层的地址。每块网卡出厂的时候,都有一个全世界独一无二的 MAC 地址,长度是 48 个二进制位,通常用 12 个十六进制数表示。有了这个地址,我们可以定位网卡和数据包的路径了。 + + +## MTU(最大传输单元) + +MTU 表示在链路层最大的传输单元,也就是链路层一帧数据的数据内容最大长度,单位为字节,MTU 是协议栈实现一个很重要的参数,请大家务必理解该参数。一般网卡默认 MTU 是 1500,当你往网卡写入的内容超过 1518bytes,就会报错,后面我们可以写代码试试。 + + +## 链路实现的分层 + +链路层的实现可以分为三层,真实的以太网卡,网卡驱动,网卡逻辑抽象。 + +真实的网卡我们不关心,因为那是硬件工程,我们只需要知道,它能接收和发送网络数据给网卡驱动就好了。网卡驱动我们也不关心,一般驱动都是网卡生产商就写好了,我们只需知道,它能接收协议栈的数据发送给网卡,接收网卡的数据发送给协议栈。网卡逻辑抽象表示,这个是我们关心的,我需要对真实的网卡进行抽象, + +一个 eth0 以太网网卡,一个 lo 本地回环网卡。还可以看到两个网卡的信息,当我们要表示一个网卡的时候,需要具备几个属性: + +1. 网卡的名字、类型和 MAC 地址 +- eth0 Link encap:Ethernet HWaddr 00:16:3e:08:a1:7a + - eth0是网卡名,方便表示一个网卡,网卡名在同个系统里不能重复。 + - Link encap:Ethernet 表示该网卡类型为以太网网卡。 + - HWaddr 00:16:3e:08:a1:7a 表示 MAC 地址 00:16:3e:08:a1:7a,是链路层寻址的地址。 + +2. 网卡的 IP 地址及掩码 +- inet addr:172.18.153.158 Bcast:172.18.159.255 Mask:255.255.240.0 + - inet addr:172.18.153.158 表示该网卡的 ipv4 地址是 172.18.153.158。 + - Bcast:172.18.159.255 表示该网卡 ip 层的广播地址。 + - 255.255.240.0 该网卡的子网掩码。 + +3. 网卡的状态和 MTU +- UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 + - UP BROADCAST RUNNING MULTICAST 都是表示网卡的状态,UP(代表网卡开启状态) BROADCAST (支持广播) RUNNING(代表网卡的网线被接上)MULTICAST(支持组播)。 + - MTU:1500 最大传输单元为 1500 字节。 + - Metric:1 接口度量值为 1,接口度量值表示在这个路径上发送一个分组的成本。 + + +**实现协议栈,我们需要一个网卡,因为这样我们才能接收和发送网络数据,但是一般情况下,我们电脑的操作系统已经帮我们管理好网卡了,我们想实现自由的控制网卡是不太方便的,还好 linux 系统还有另一个功能-虚拟网卡,它是操作系统虚拟出来的一个网卡,我们协议栈的实现都是基于虚拟网卡** + + +## 虚拟网卡的好处 + +1. 
对于用户来说虚拟网卡和真实网卡几乎没有差别,而且我们控制或更改虚拟网卡大部分情况下不会影响到真实的网卡,也就不会影响到用户的网络。 +2. 虚拟网卡的数据可以直接从用户态直接读取和写入,这样我们就可以直接在用户态编写协议栈。 + + +## Linux 中虚拟网络设备 + +TUN/TAP 设备、VETH 设备、Bridge 设备、Bond 设备、VLAN 设备、MACVTAP 设备,下面我们只讲 tun/tap 设备,其他虚拟设备感兴趣的同学可以去网上自行搜索。 + +TAP/TUN 设备是一种让用户态和内核之间进行数据交换的虚拟设备,TAP 工作在二层,TUN 工作在三层,TAP/TUN 网卡的两头分别是内核网络协议栈和用户层,其作用是将协议栈中的部分数据包转发给用户空间的应用程序,给用户空间的程序一个处理数据包的机会。 + +当我们想在 linux 中创建一个 TAP 设备时,其实很容易,像普通文件一样打开字符设备 /dev/net/tun 可以得到一个文件描述符,接着用系统调用 ioctl 将文件描述符和 kernel 的 tap 驱动绑定在一起,那么之后对该文件描述符的读写就是对虚拟网卡 TAP 的读写。 + +``` sh +# 创建一个tap模式的虚拟网卡tap0 +sudo ip tuntap add mode tap tap0 +# 开启该网卡 +sudo ip link set tap0 up +# 设置该网卡的ip及掩码 +sudo ip addr add 192.168.1.1/24 dev tap0 + +tap0 Link encap:Ethernet HWaddr 22:e2:f2:93:ff:bf + inet addr:192.168.1.1 Bcast:0.0.0.0 Mask:255.255.255.0 + UP BROADCAST MULTICAST MTU:1500 Metric:1 + RX packets:0 errors:0 dropped:0 overruns:0 frame:0 + TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 + collisions:0 txqueuelen:1000 + RX bytes:0 (0.0 B) TX bytes:0 (0.0 B) + +sudo ip tuntap del mode tap tap0 +``` + + +## 链路层数据帧 + +|dst MAC(6B)|src MAC(6B)|type(2B)|data(46B - 1500B)| + +1. 目的 MAC 地址:目的设备的 MAC 物理地址。 +2. 源 MAC 地址:发送设备的 MAC 物理地址。 +3. 类型:表示后面所跟数据包的协议类型,例如 Type 为 0x8000 时为 IPv4 协议包,Type 为 0x8060 时,后面为 ARP 协议包。 +4. 
数据:表示该帧的数据内容,长度为 46 ~ 1500 字节,包含网络层、传输层和应用层的数据。 + + + + + diff --git a/tcpip/link/channel/channel.go b/tcpip/link/channel/channel.go index aa5c2c0..5084d67 100644 --- a/tcpip/link/channel/channel.go +++ b/tcpip/link/channel/channel.go @@ -1,101 +1,101 @@ -package channel - -import ( - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/stack" -) - -type PacketInfo struct { - Header buffer.View - Payload buffer.View - Proto tcpip.NetworkProtocolNumber -} - -type Endpoint struct { - dispatcher stack.NetworkDispatcher - mtu uint32 - linkAddr tcpip.LinkAddress // MAC地址 - C chan PacketInfo -} - -//创建一个新的抽象cahnnel Endpoint 可以接受数据 也可以外发数据 -func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) (tcpip.LinkEndpointID, *Endpoint) { - e := &Endpoint{ - C: make(chan PacketInfo, size), - mtu: mtu, - linkAddr: linkAddr, - } - return stack.RegisterLinkEndpoint(e), e -} - -// Drain 流走 释放channel中的数据 -func (e *Endpoint) Drain() int { - c := 0 - for { - select { - case <-e.C: - c++ - default: - return c - } - } -} - -// Inject 注入 -func (e *Endpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { - e.InjectLinkAddr(protocol, "", vv) -} - -// InjectLinkAddr injects an inbound packet with a remote link address. 
-func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, vv buffer.VectorisedView) { - // 这里的实现在NIC.go中 由 网卡对象进行数据分发 - e.dispatcher.DeliverNetworkPacket(e, remoteLinkAddr, "" /* localLinkAddr */, protocol, vv.Clone(nil)) -} - -func (e *Endpoint) MTU() uint32 { - return e.mtu -} - -// Capabilities返回链路层端点支持的功能集。 -func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities { - return 0 -} - -// MaxHeaderLength 返回数据链接(和较低级别的图层组合)标头可以具有的最大大小。 -// 较高级别使用此信息来保留它们正在构建的数据包前面预留空间。 -func (e *Endpoint) MaxHeaderLength() uint16 { - return 0 -} - -// 本地链路层地址 -func (e *Endpoint) LinkAddress() tcpip.LinkAddress { - return e.linkAddr -} - -// channel 向外写数据 -func (e *Endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, - protocol tcpip.NetworkProtocolNumber) *tcpip.Error { - p := PacketInfo{ - Header: hdr.View(), - Proto: protocol, - Payload: payload.ToView(), - } - - select { - case e.C <- p: - default: - } - - return nil -} - -// Attach 将数据链路层端点附加到协议栈的网络层调度程序。 -func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { - e.dispatcher = dispatcher -} - -// 是否已经添加了网络层调度器 -func (e *Endpoint) IsAttached() bool { - return e.dispatcher != nil -} +package channel + +import ( + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/stack" +) + +type PacketInfo struct { + Header buffer.View + Payload buffer.View + Proto tcpip.NetworkProtocolNumber +} + +type Endpoint struct { + dispatcher stack.NetworkDispatcher + mtu uint32 + linkAddr tcpip.LinkAddress // MAC地址 + C chan PacketInfo +} + +//创建一个新的抽象cahnnel Endpoint 可以接受数据 也可以外发数据 +func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) (tcpip.LinkEndpointID, *Endpoint) { + e := &Endpoint{ + C: make(chan PacketInfo, size), + mtu: mtu, + linkAddr: linkAddr, + } + return stack.RegisterLinkEndpoint(e), e +} + +// Drain 流走 释放channel中的数据 +func (e *Endpoint) Drain() int { + c := 0 + for { + select { + case <-e.C: + c++ + default: + return c 
+ } + } +} + +// Inject 注入 +func (e *Endpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { + e.InjectLinkAddr(protocol, "", vv) +} + +// InjectLinkAddr injects an inbound packet with a remote link address. +func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, vv buffer.VectorisedView) { + // 这里的实现在NIC.go中 由 网卡对象进行数据分发 + e.dispatcher.DeliverNetworkPacket(e, remoteLinkAddr, "" /* localLinkAddr */, protocol, vv.Clone(nil)) +} + +func (e *Endpoint) MTU() uint32 { + return e.mtu +} + +// Capabilities返回链路层端点支持的功能集。 +func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities { + return 0 +} + +// MaxHeaderLength 返回数据链接(和较低级别的图层组合)标头可以具有的最大大小。 +// 较高级别使用此信息来保留它们正在构建的数据包前面预留空间。 +func (e *Endpoint) MaxHeaderLength() uint16 { + return 0 +} + +// 本地链路层地址 +func (e *Endpoint) LinkAddress() tcpip.LinkAddress { + return e.linkAddr +} + +// channel 向外写数据 +func (e *Endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, + protocol tcpip.NetworkProtocolNumber) *tcpip.Error { + p := PacketInfo{ + Header: hdr.View(), + Proto: protocol, + Payload: payload.ToView(), + } + + select { + case e.C <- p: + default: + } + + return nil +} + +// Attach 将数据链路层端点附加到协议栈的网络层调度程序。 +func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { + e.dispatcher = dispatcher +} + +// 是否已经添加了网络层调度器 +func (e *Endpoint) IsAttached() bool { + return e.dispatcher != nil +} diff --git a/tcpip/link/fdbased/endpoint.go b/tcpip/link/fdbased/endpoint.go index 23a2628..c8f4762 100644 --- a/tcpip/link/fdbased/endpoint.go +++ b/tcpip/link/fdbased/endpoint.go @@ -1,226 +1,226 @@ -package fdbased - -import ( - "log" - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/header" - "netstack/tcpip/link/rawfile" - "netstack/tcpip/stack" - "syscall" -) - -// 从NIC读取数据的多级缓存配置 -var BufConfig = []int{1 << 7, 1 << 8, 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15} - -// 
负责底层网卡的io读写以及数据分发 -type endpoint struct { - // 发送和接收数据的文件描述符 - fd int - // 单个帧的最大长度 - mtu uint32 - // 以太网头部长度 - hdrSize int - // 网卡地址 - addr tcpip.LinkAddress - // 网卡的能力 - caps stack.LinkEndpointCapabilities - - closed func(*tcpip.Error) - - iovecs []syscall.Iovec - views []buffer.View - dispatcher stack.NetworkDispatcher - - // handleLocal指示发往自身的数据包是由内部netstack处理(true)还是转发到FD端点(false) - handleLocal bool -} - -type Options struct { - FD int - MTU uint32 - ClosedFunc func(*tcpip.Error) - Address tcpip.LinkAddress - ResolutionRequired bool - SaveRestore bool - ChecksumOffload bool - DisconnectOk bool - HandleLocal bool - TestLossPacket func(data []byte) bool -} - -// 根据选项参数创建一个链路层的endpoint,并返回该endpoint的id -func New(opts *Options) tcpip.LinkEndpointID { - syscall.SetNonblock(opts.FD, true) - caps := stack.LinkEndpointCapabilities(0) // 初始化 - if opts.ResolutionRequired { - caps |= stack.CapabilityResolutionRequired - } - if opts.ChecksumOffload { - caps |= stack.CapabilityChecksumOffload - } - if opts.SaveRestore { - caps |= stack.CapabilitySaveRestore - } - if opts.DisconnectOk { - caps |= stack.CapabilityDisconnectOK - } - - e := &endpoint{ - fd: opts.FD, - mtu: opts.MTU, - caps: caps, - closed: opts.ClosedFunc, - addr: opts.Address, - hdrSize: header.EthernetMinimumSize, - views: make([]buffer.View, len(BufConfig)), - iovecs: make([]syscall.Iovec, len(BufConfig)), - handleLocal: opts.HandleLocal, - } - - // 全局注册链路层设备 - return stack.RegisterLinkEndpoint(e) -} - -func (e *endpoint) MTU() uint32 { - return e.mtu -} - -func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { - return e.caps -} - -// 返回当前以太网头部信息长度 -func (e *endpoint) MaxHeaderLength() uint16 { - return uint16(e.hdrSize) -} - -// 返回当前MAC地址 -func (e *endpoint) LinkAddress() tcpip.LinkAddress { - return e.addr -} - -// 将上层的报文经过链路层封装,写入网卡中,如果写入失败则丢弃该报文 -func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, - payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) 
*tcpip.Error { - // 如果目标地址是设备自己 那么将报文重新返回给协议栈 - if e.handleLocal && r.LocalAddress != "" && r.LocalAddress == r.RemoteAddress { - views := make([]buffer.View, 1, 1+len(payload.Views())) - views[0] = hdr.View() - views = append(views, payload.Views()...) - vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views) // 添加报文头 - e.dispatcher.DeliverNetworkPacket(e, r.RemoteLinkAddress, r.LocalLinkAddress, - protocol, vv) // 分发数据报 - return nil - } - // 封装增加以太网头部 - eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize)) // 分配14B的内存 - ethHdr := &header.EthernetFields{ // 配置以太帧信息 - DstAddr: r.RemoteLinkAddress, - Type: protocol, - } - // 如果路由信息中有配置源MAC地址,那么使用该地址 - // 如果没有,则使用本网卡的地址 - if r.LocalLinkAddress != "" { - ethHdr.SrcAddr = r.LocalLinkAddress // 源网卡地址 说明这是一个转发报文 - } else { - ethHdr.SrcAddr = e.addr // 说明这是一个原始报文 - } - eth.Encode(ethHdr) // 将以太帧信息作为报文头编入 - log.Println("链路层写回报文") - // 写入网卡中 - if payload.Size() == 0 { - return rawfile.NonBlockingWrite(e.fd, hdr.View()) - } - return rawfile.NonBlockingWrite2(e.fd, hdr.View(), payload.ToView()) -} - -// Attach 启动从文件描述符中读取数据包的goroutine,并通过提供的分发函数来分发数据报 -func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { - e.dispatcher = dispatcher - // 链接端点不可靠。保存传输端点后,它们将停止发送传出数据包,并拒绝所有传入数据包。 - go e.dispatchLoop() -} - -func (e *endpoint) IsAttached() bool { - return e.dispatcher != nil -} - -// 截取需要的内容 -func (e *endpoint) capViews(n int, buffers []int) int { - c := 0 - for i, s := range buffers { - c += s - if c >= n { - e.views[i].CapLength(s - (c - n)) - return i + 1 - } - } - return len(buffers) -} - -// 按照bufConfig的长度分配内存大小 -// 注意e.views 和 e.iovecs共用相同的内存块 -func (e *endpoint) allocateViews(bufConfig []int) { - for i, v := range e.views { - if v != nil { - break - } - b := buffer.NewView(bufConfig[i]) // 分配内存 - e.views[i] = b - e.iovecs[i] = syscall.Iovec{ - Base: &b[0], - Len: uint64(len(b)), - } - } -} - -func (e *endpoint) dispatch() (bool, *tcpip.Error) { - // 读取数据缓存的分配 - e.allocateViews(BufConfig) - 
- // 从网卡读取数据 - n, err := rawfile.BlockingReadv(e.fd, e.iovecs) // 读到ioves中相当于读到views中 - if err != nil { - return false, err - } - if n <= e.hdrSize { - return false, nil // 读到的数据比头部还小 直接丢弃 - } - - var ( - p tcpip.NetworkProtocolNumber - remoteLinkAddr, localLinkAddr tcpip.LinkAddress // 目标MAC 源MAC - ) - // 获取以太网头部信息 - eth := header.Ethernet(e.views[0]) - p = eth.Type() - remoteLinkAddr = eth.SourceAddress() - localLinkAddr = eth.DestinationAddress() - - used := e.capViews(n, BufConfig) // 从缓存中截有效的内容 - vv := buffer.NewVectorisedView(n, e.views[:used]) // 用这些有效的内容构建vv - vv.TrimFront(e.hdrSize) // 将数据内容删除以太网头部信息 将网络层作为数据头 - - e.dispatcher.DeliverNetworkPacket(e, remoteLinkAddr, localLinkAddr, p, vv) - - // 将分发后的数据无效化(设置nil可以让gc回收这些内存) - for i := 0; i < used; i++ { - e.views[i] = nil - } - - return true, nil -} - -// 循环地从fd中读取数据 然后将数据报分发给协议栈 -func (e *endpoint) dispatchLoop() *tcpip.Error { - for { - cont, err := e.dispatch() - if err != nil || !cont { - if e.closed != nil { - e.closed(err) // 阻塞中 - } - return err - } - } -} +package fdbased + +import ( + "log" + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/header" + "netstack/tcpip/link/rawfile" + "netstack/tcpip/stack" + "syscall" +) + +// 从NIC读取数据的多级缓存配置 +var BufConfig = []int{1 << 7, 1 << 8, 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15} + +// 负责底层网卡的io读写以及数据分发 +type endpoint struct { + // 发送和接收数据的文件描述符 + fd int + // 单个帧的最大长度 + mtu uint32 + // 以太网头部长度 + hdrSize int + // 网卡地址 + addr tcpip.LinkAddress + // 网卡的能力 + caps stack.LinkEndpointCapabilities + + closed func(*tcpip.Error) + + iovecs []syscall.Iovec + views []buffer.View + dispatcher stack.NetworkDispatcher + + // handleLocal指示发往自身的数据包是由内部netstack处理(true)还是转发到FD端点(false) + handleLocal bool +} + +type Options struct { + FD int + MTU uint32 + ClosedFunc func(*tcpip.Error) + Address tcpip.LinkAddress + ResolutionRequired bool + SaveRestore bool + ChecksumOffload bool + DisconnectOk bool + HandleLocal bool + TestLossPacket 
func(data []byte) bool +} + +// 根据选项参数创建一个链路层的endpoint,并返回该endpoint的id +func New(opts *Options) tcpip.LinkEndpointID { + syscall.SetNonblock(opts.FD, true) + caps := stack.LinkEndpointCapabilities(0) // 初始化 + if opts.ResolutionRequired { + caps |= stack.CapabilityResolutionRequired + } + if opts.ChecksumOffload { + caps |= stack.CapabilityChecksumOffload + } + if opts.SaveRestore { + caps |= stack.CapabilitySaveRestore + } + if opts.DisconnectOk { + caps |= stack.CapabilityDisconnectOK + } + + e := &endpoint{ + fd: opts.FD, + mtu: opts.MTU, + caps: caps, + closed: opts.ClosedFunc, + addr: opts.Address, + hdrSize: header.EthernetMinimumSize, + views: make([]buffer.View, len(BufConfig)), + iovecs: make([]syscall.Iovec, len(BufConfig)), + handleLocal: opts.HandleLocal, + } + + // 全局注册链路层设备 + return stack.RegisterLinkEndpoint(e) +} + +func (e *endpoint) MTU() uint32 { + return e.mtu +} + +func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { + return e.caps +} + +// 返回当前以太网头部信息长度 +func (e *endpoint) MaxHeaderLength() uint16 { + return uint16(e.hdrSize) +} + +// 返回当前MAC地址 +func (e *endpoint) LinkAddress() tcpip.LinkAddress { + return e.addr +} + +// 将上层的报文经过链路层封装,写入网卡中,如果写入失败则丢弃该报文 +func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, + payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { + // 如果目标地址是设备自己 那么将报文重新返回给协议栈 + if e.handleLocal && r.LocalAddress != "" && r.LocalAddress == r.RemoteAddress { + views := make([]buffer.View, 1, 1+len(payload.Views())) + views[0] = hdr.View() + views = append(views, payload.Views()...) 
+ vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views) // 添加报文头 + e.dispatcher.DeliverNetworkPacket(e, r.RemoteLinkAddress, r.LocalLinkAddress, + protocol, vv) // 分发数据报 + return nil + } + // 封装增加以太网头部 + eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize)) // 分配14B的内存 + ethHdr := &header.EthernetFields{ // 配置以太帧信息 + DstAddr: r.RemoteLinkAddress, + Type: protocol, + } + // 如果路由信息中有配置源MAC地址,那么使用该地址 + // 如果没有,则使用本网卡的地址 + if r.LocalLinkAddress != "" { + ethHdr.SrcAddr = r.LocalLinkAddress // 源网卡地址 说明这是一个转发报文 + } else { + ethHdr.SrcAddr = e.addr // 说明这是一个原始报文 + } + eth.Encode(ethHdr) // 将以太帧信息作为报文头编入 + log.Println("链路层写回报文") + // 写入网卡中 + if payload.Size() == 0 { + return rawfile.NonBlockingWrite(e.fd, hdr.View()) + } + return rawfile.NonBlockingWrite2(e.fd, hdr.View(), payload.ToView()) +} + +// Attach 启动从文件描述符中读取数据包的goroutine,并通过提供的分发函数来分发数据报 +func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { + e.dispatcher = dispatcher + // 链接端点不可靠。保存传输端点后,它们将停止发送传出数据包,并拒绝所有传入数据包。 + go e.dispatchLoop() +} + +func (e *endpoint) IsAttached() bool { + return e.dispatcher != nil +} + +// 截取需要的内容 +func (e *endpoint) capViews(n int, buffers []int) int { + c := 0 + for i, s := range buffers { + c += s + if c >= n { + e.views[i].CapLength(s - (c - n)) + return i + 1 + } + } + return len(buffers) +} + +// 按照bufConfig的长度分配内存大小 +// 注意e.views 和 e.iovecs共用相同的内存块 +func (e *endpoint) allocateViews(bufConfig []int) { + for i, v := range e.views { + if v != nil { + break + } + b := buffer.NewView(bufConfig[i]) // 分配内存 + e.views[i] = b + e.iovecs[i] = syscall.Iovec{ + Base: &b[0], + Len: uint64(len(b)), + } + } +} + +func (e *endpoint) dispatch() (bool, *tcpip.Error) { + // 读取数据缓存的分配 + e.allocateViews(BufConfig) + + // 从网卡读取数据 + n, err := rawfile.BlockingReadv(e.fd, e.iovecs) // 读到ioves中相当于读到views中 + if err != nil { + return false, err + } + if n <= e.hdrSize { + return false, nil // 读到的数据比头部还小 直接丢弃 + } + + var ( + p tcpip.NetworkProtocolNumber + remoteLinkAddr, 
localLinkAddr tcpip.LinkAddress // 目标MAC 源MAC + ) + // 获取以太网头部信息 + eth := header.Ethernet(e.views[0]) + p = eth.Type() + remoteLinkAddr = eth.SourceAddress() + localLinkAddr = eth.DestinationAddress() + + used := e.capViews(n, BufConfig) // 从缓存中截有效的内容 + vv := buffer.NewVectorisedView(n, e.views[:used]) // 用这些有效的内容构建vv + vv.TrimFront(e.hdrSize) // 将数据内容删除以太网头部信息 将网络层作为数据头 + + e.dispatcher.DeliverNetworkPacket(e, remoteLinkAddr, localLinkAddr, p, vv) + + // 将分发后的数据无效化(设置nil可以让gc回收这些内存) + for i := 0; i < used; i++ { + e.views[i] = nil + } + + return true, nil +} + +// 循环地从fd中读取数据 然后将数据报分发给协议栈 +func (e *endpoint) dispatchLoop() *tcpip.Error { + for { + cont, err := e.dispatch() + if err != nil || !cont { + if e.closed != nil { + e.closed(err) // 阻塞中 + } + return err + } + } +} diff --git a/tcpip/link/fdbased/endpoint_test.go b/tcpip/link/fdbased/endpoint_test.go index a1f5180..32d6b0d 100644 --- a/tcpip/link/fdbased/endpoint_test.go +++ b/tcpip/link/fdbased/endpoint_test.go @@ -1,273 +1,273 @@ -package fdbased - -import ( - "fmt" - "math/rand" - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/header" - "netstack/tcpip/stack" - "reflect" - "syscall" - "testing" - "time" -) - -const ( - mtu = 1500 - laddr = tcpip.LinkAddress("\x65\x66\x67\x68\x69\x70") - raddr = tcpip.LinkAddress("\x71\x72\x73\x74\x75\x76") - proto = 10 -) - -type packetInfo struct { - raddr tcpip.LinkAddress - proto tcpip.NetworkProtocolNumber - contents buffer.View -} - -type context struct { - t *testing.T - fds [2]int - ep stack.LinkEndpoint - ch chan packetInfo // 信道 - done chan struct{} // 通知退出 -} - -func newContext(t *testing.T, opt *Options) *context { - fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0) - if err != nil { - t.Fatalf("Socketpair failed: %v", err) - } - - done := make(chan struct{}, 1) - opt.ClosedFunc = func(*tcpip.Error) { - done <- struct{}{} - } - - opt.FD = fds[1] - ep := stack.FindLinkEndpoint(New(opt)).(*endpoint) // 找到端口实现 - - c := 
&context{ - t: t, - fds: fds, - ep: ep, - ch: make(chan packetInfo, 100), - done: done, - } - - ep.Attach(c) // 启动端口 后台阻塞等待 - - return c -} - -func (c *context) cleanup() { - syscall.Close(c.fds[0]) - <-c.done - syscall.Close(c.fds[1]) -} - -func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, - dstLinkAddr, srcLinkAddr tcpip.LinkAddress, - protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { - c.ch <- packetInfo{dstLinkAddr, protocol, vv.ToView()} -} - -func TestFdbased(t *testing.T) { - c := newContext(t, &Options{MTU: mtu, Address: tcpip.LinkAddress(laddr)}) - defer c.cleanup() - - // Build header - hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()) + 100) // 114 - b := hdr.Prepend(100) // payload - for i := range b { - b[i] = uint8(rand.Intn(256)) - } - - // Build payload and write - payload := make(buffer.View, 1024) // payload len = 1024 - for i := range payload { - payload[i] = uint8(rand.Intn(256)) - } - - if err := c.ep.WritePacket(&stack.Route{RemoteLinkAddress: raddr}, hdr, - payload.ToVectorisedView(), proto); err != nil { - panic(err) - } - - b = make([]byte, mtu) - n, err := syscall.Read(c.fds[0], b) - if err != nil { - panic(err) - } - b = b[:n] - h := header.Ethernet(b) - if h.DestinationAddress() != raddr || h.SourceAddress() != laddr { - panic("diff Err") - } -} - -func TestPreserveSrcAddress(t *testing.T) { - baddr := tcpip.LinkAddress("\xcc\xbb\xaa\x77\x88\x99") - - c := newContext(t, &Options{Address: laddr, MTU: mtu}) - defer c.cleanup() - - // Set LocalLinkAddress in route to the value of the bridged address. - r := &stack.Route{ - RemoteLinkAddress: raddr, - LocalLinkAddress: baddr, - } - - // WritePacket panics given a prependable with anything less than - // the minimum size of the ethernet header. 
- hdr := buffer.NewPrependable(header.EthernetMinimumSize) - if err := c.ep.WritePacket(r, hdr, buffer.VectorisedView{}, proto); err != nil { - t.Fatalf("WritePacket failed: %v", err) - } - - // Read from the FD, then compare with what we wrote. - b := make([]byte, mtu) - n, err := syscall.Read(c.fds[0], b) - if err != nil { - t.Fatalf("Read failed: %v", err) - } - b = b[:n] - h := header.Ethernet(b) - - if a := h.SourceAddress(); a != baddr { - t.Fatalf("SourceAddress() = %v, want %v", a, baddr) - } -} - -func TestDeliverPacket(t *testing.T) { - lengths := []int{100, 1000} - for _, plen := range lengths { - t.Run(fmt.Sprintf("PayloadLen=%v", plen), func(t *testing.T) { - c := newContext(t, &Options{Address: laddr, MTU: mtu}) - defer c.cleanup() - - // Build packet. - b := make([]byte, plen) - all := b - for i := range b { - b[i] = uint8(rand.Intn(256)) - } - - hdr := make(header.Ethernet, header.EthernetMinimumSize) - hdr.Encode(&header.EthernetFields{ - SrcAddr: raddr, - DstAddr: laddr, - Type: proto, - }) - all = append(hdr, b...) - - // Write packet via the file descriptor. - if _, err := syscall.Write(c.fds[0], all); err != nil { - t.Fatalf("Write failed: %v", err) - } - - // Receive packet through the endpoint. - select { - case pi := <-c.ch: - want := packetInfo{ - raddr: raddr, - proto: proto, - contents: b, - } - - if !reflect.DeepEqual(want, pi) { - t.Fatalf("Unexpected received packet: %+v, want %+v", pi, want) - } - case <-time.After(10 * time.Second): - t.Fatalf("Timed out waiting for packet") - } - }) - } -} - -//func TestBufConfigMaxLength(t *testing.T) { -// got := 0 -// for _, i := range BufConfig { -// got += i -// } -// want := header.MaxIPPacketSize // maximum TCP packet size -// if got < want { -// t.Errorf("total buffer size is invalid: got %d, want >= %d", got, want) -// } -//} - -func TestBufConfigFirst(t *testing.T) { - // The stack assumes that the TCP/IP header is enterily contained in the first view. 
- // Therefore, the first view needs to be large enough to contain the maximum TCP/IP - // header, which is 120 bytes (60 bytes for IP + 60 bytes for TCP). - want := 120 - got := BufConfig[0] - if got < want { - t.Errorf("first view has an invalid size: got %d, want >= %d", got, want) - } -} - -func build(bufConfig []int) *endpoint { - e := &endpoint{ - views: make([]buffer.View, len(bufConfig)), - iovecs: make([]syscall.Iovec, len(bufConfig)), - } - e.allocateViews(bufConfig) - return e -} - -var capLengthTestCases = []struct { - comment string - config []int - n int - wantUsed int - wantLengths []int -}{ - { - comment: "Single slice", - config: []int{2}, - n: 1, - wantUsed: 1, - wantLengths: []int{1}, - }, - { - comment: "Multiple slices", - config: []int{1, 2}, - n: 2, - wantUsed: 2, - wantLengths: []int{1, 1}, - }, - { - comment: "Entire buffer", - config: []int{1, 2}, - n: 3, - wantUsed: 2, - wantLengths: []int{1, 2}, - }, - { - comment: "Entire buffer but not on the last slice", - config: []int{1, 2, 3}, - n: 3, - wantUsed: 2, - wantLengths: []int{1, 2, 3}, - }, -} - -func TestCapLength(t *testing.T) { - for _, c := range capLengthTestCases { - e := build(c.config) - used := e.capViews(c.n, c.config) - if used != c.wantUsed { - t.Errorf("Test \"%s\" failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed) - } - lengths := make([]int, len(e.views)) - for i, v := range e.views { - lengths[i] = len(v) - } - if !reflect.DeepEqual(lengths, c.wantLengths) { - t.Errorf("Test \"%s\" failed when calling capViews(%d, %v). Got %v. 
Want %v", c.comment, c.n, c.config, lengths, c.wantLengths) - } - - } -} +package fdbased + +import ( + "fmt" + "math/rand" + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/header" + "netstack/tcpip/stack" + "reflect" + "syscall" + "testing" + "time" +) + +const ( + mtu = 1500 + laddr = tcpip.LinkAddress("\x65\x66\x67\x68\x69\x70") + raddr = tcpip.LinkAddress("\x71\x72\x73\x74\x75\x76") + proto = 10 +) + +type packetInfo struct { + raddr tcpip.LinkAddress + proto tcpip.NetworkProtocolNumber + contents buffer.View +} + +type context struct { + t *testing.T + fds [2]int + ep stack.LinkEndpoint + ch chan packetInfo // 信道 + done chan struct{} // 通知退出 +} + +func newContext(t *testing.T, opt *Options) *context { + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0) + if err != nil { + t.Fatalf("Socketpair failed: %v", err) + } + + done := make(chan struct{}, 1) + opt.ClosedFunc = func(*tcpip.Error) { + done <- struct{}{} + } + + opt.FD = fds[1] + ep := stack.FindLinkEndpoint(New(opt)).(*endpoint) // 找到端口实现 + + c := &context{ + t: t, + fds: fds, + ep: ep, + ch: make(chan packetInfo, 100), + done: done, + } + + ep.Attach(c) // 启动端口 后台阻塞等待 + + return c +} + +func (c *context) cleanup() { + syscall.Close(c.fds[0]) + <-c.done + syscall.Close(c.fds[1]) +} + +func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, + dstLinkAddr, srcLinkAddr tcpip.LinkAddress, + protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { + c.ch <- packetInfo{dstLinkAddr, protocol, vv.ToView()} +} + +func TestFdbased(t *testing.T) { + c := newContext(t, &Options{MTU: mtu, Address: tcpip.LinkAddress(laddr)}) + defer c.cleanup() + + // Build header + hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()) + 100) // 114 + b := hdr.Prepend(100) // payload + for i := range b { + b[i] = uint8(rand.Intn(256)) + } + + // Build payload and write + payload := make(buffer.View, 1024) // payload len = 1024 + for i := range payload { + payload[i] = 
uint8(rand.Intn(256)) + } + + if err := c.ep.WritePacket(&stack.Route{RemoteLinkAddress: raddr}, hdr, + payload.ToVectorisedView(), proto); err != nil { + panic(err) + } + + b = make([]byte, mtu) + n, err := syscall.Read(c.fds[0], b) + if err != nil { + panic(err) + } + b = b[:n] + h := header.Ethernet(b) + if h.DestinationAddress() != raddr || h.SourceAddress() != laddr { + panic("diff Err") + } +} + +func TestPreserveSrcAddress(t *testing.T) { + baddr := tcpip.LinkAddress("\xcc\xbb\xaa\x77\x88\x99") + + c := newContext(t, &Options{Address: laddr, MTU: mtu}) + defer c.cleanup() + + // Set LocalLinkAddress in route to the value of the bridged address. + r := &stack.Route{ + RemoteLinkAddress: raddr, + LocalLinkAddress: baddr, + } + + // WritePacket panics given a prependable with anything less than + // the minimum size of the ethernet header. + hdr := buffer.NewPrependable(header.EthernetMinimumSize) + if err := c.ep.WritePacket(r, hdr, buffer.VectorisedView{}, proto); err != nil { + t.Fatalf("WritePacket failed: %v", err) + } + + // Read from the FD, then compare with what we wrote. + b := make([]byte, mtu) + n, err := syscall.Read(c.fds[0], b) + if err != nil { + t.Fatalf("Read failed: %v", err) + } + b = b[:n] + h := header.Ethernet(b) + + if a := h.SourceAddress(); a != baddr { + t.Fatalf("SourceAddress() = %v, want %v", a, baddr) + } +} + +func TestDeliverPacket(t *testing.T) { + lengths := []int{100, 1000} + for _, plen := range lengths { + t.Run(fmt.Sprintf("PayloadLen=%v", plen), func(t *testing.T) { + c := newContext(t, &Options{Address: laddr, MTU: mtu}) + defer c.cleanup() + + // Build packet. + b := make([]byte, plen) + all := b + for i := range b { + b[i] = uint8(rand.Intn(256)) + } + + hdr := make(header.Ethernet, header.EthernetMinimumSize) + hdr.Encode(&header.EthernetFields{ + SrcAddr: raddr, + DstAddr: laddr, + Type: proto, + }) + all = append(hdr, b...) + + // Write packet via the file descriptor. 
+ if _, err := syscall.Write(c.fds[0], all); err != nil { + t.Fatalf("Write failed: %v", err) + } + + // Receive packet through the endpoint. + select { + case pi := <-c.ch: + want := packetInfo{ + raddr: raddr, + proto: proto, + contents: b, + } + + if !reflect.DeepEqual(want, pi) { + t.Fatalf("Unexpected received packet: %+v, want %+v", pi, want) + } + case <-time.After(10 * time.Second): + t.Fatalf("Timed out waiting for packet") + } + }) + } +} + +//func TestBufConfigMaxLength(t *testing.T) { +// got := 0 +// for _, i := range BufConfig { +// got += i +// } +// want := header.MaxIPPacketSize // maximum TCP packet size +// if got < want { +// t.Errorf("total buffer size is invalid: got %d, want >= %d", got, want) +// } +//} + +func TestBufConfigFirst(t *testing.T) { + // The stack assumes that the TCP/IP header is enterily contained in the first view. + // Therefore, the first view needs to be large enough to contain the maximum TCP/IP + // header, which is 120 bytes (60 bytes for IP + 60 bytes for TCP). 
+ want := 120 + got := BufConfig[0] + if got < want { + t.Errorf("first view has an invalid size: got %d, want >= %d", got, want) + } +} + +func build(bufConfig []int) *endpoint { + e := &endpoint{ + views: make([]buffer.View, len(bufConfig)), + iovecs: make([]syscall.Iovec, len(bufConfig)), + } + e.allocateViews(bufConfig) + return e +} + +var capLengthTestCases = []struct { + comment string + config []int + n int + wantUsed int + wantLengths []int +}{ + { + comment: "Single slice", + config: []int{2}, + n: 1, + wantUsed: 1, + wantLengths: []int{1}, + }, + { + comment: "Multiple slices", + config: []int{1, 2}, + n: 2, + wantUsed: 2, + wantLengths: []int{1, 1}, + }, + { + comment: "Entire buffer", + config: []int{1, 2}, + n: 3, + wantUsed: 2, + wantLengths: []int{1, 2}, + }, + { + comment: "Entire buffer but not on the last slice", + config: []int{1, 2, 3}, + n: 3, + wantUsed: 2, + wantLengths: []int{1, 2, 3}, + }, +} + +func TestCapLength(t *testing.T) { + for _, c := range capLengthTestCases { + e := build(c.config) + used := e.capViews(c.n, c.config) + if used != c.wantUsed { + t.Errorf("Test \"%s\" failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed) + } + lengths := make([]int, len(e.views)) + for i, v := range e.views { + lengths[i] = len(v) + } + if !reflect.DeepEqual(lengths, c.wantLengths) { + t.Errorf("Test \"%s\" failed when calling capViews(%d, %v). Got %v. 
Want %v", c.comment, c.n, c.config, lengths, c.wantLengths) + } + + } +} diff --git a/tcpip/link/rawfile/blockingpoll_unsafe.go b/tcpip/link/rawfile/blockingpoll_unsafe.go index 486a3e7..9619c73 100644 --- a/tcpip/link/rawfile/blockingpoll_unsafe.go +++ b/tcpip/link/rawfile/blockingpoll_unsafe.go @@ -1,130 +1,130 @@ -package rawfile - -import ( - "syscall" - "netstack/tcpip" - "unsafe" -) - -// GetMTU 确定网络接口设备的 MTU -func GetMTU(name string) (uint32, error) { - fd, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0) - if err != nil { - return 0, err - } - - defer syscall.Close(fd) - - var ifreq struct { - name [16]byte - mtu int32 - _ [20]byte - } - - copy(ifreq.name[:], name) - _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.SIOCGIFMTU, uintptr(unsafe.Pointer(&ifreq))) - if errno != 0 { - return 0, errno - } - - return uint32(ifreq.mtu), nil -} - -type pollEvent struct { - fd int32 - events int16 - revents int16 -} - -func NonBlockingWrite(fd int, buf []byte) *tcpip.Error { - var ptr unsafe.Pointer - if len(buf) > 0 { - ptr = unsafe.Pointer(&buf[0]) - } - - _, _, e := syscall.RawSyscall(syscall.SYS_WRITE, uintptr(fd), - uintptr(ptr), uintptr(len(buf))) - if e != 0 { - return TranslateErrno(e) - } - return nil -} - -func NonBlockingWrite2(fd int, b1, b2 []byte) *tcpip.Error { - if len(b2) == 0 { - return NonBlockingWrite(fd, b1) - } - /* - #include - - struct iovec { - void *iov_base; - size_t iov_len; - }; - **/ - iovec := [...]syscall.Iovec{ - { - Base: &b1[0], - Len: uint64(len(b1)), - }, - { - Base: &b2[0], - Len: uint64(len(b2)), - }, - } - - // ssize_t writev(int fildes, const struct iovec *iov, int iovcnt); - _, _, e := syscall.RawSyscall(syscall.SYS_WRITEV, uintptr(fd), - uintptr(unsafe.Pointer(&iovec[0])), uintptr(len(iovec))) - if e != 0 { - return TranslateErrno(e) - } - - return nil -} - -func BlockingRead(fd int, b []byte) (int, *tcpip.Error) { - for { - n, _, e := syscall.RawSyscall(syscall.SYS_READ, uintptr(fd), - 
uintptr(unsafe.Pointer(&b[0])), uintptr(len(b))) // read(fd,buf,len) - if e == 0 { - return int(n), nil - } - - event := pollEvent{ - fd: int32(fd), - events: 1, // POLLIN - } - - _, e = blockingPoll(&event, 1, -1) - if e != 0 && e != syscall.EINTR { - return 0, TranslateErrno(e) - } - } -} - -func BlockingReadv(fd int, iovecs []syscall.Iovec) (int, *tcpip.Error) { - for { - n, _, e := syscall.RawSyscall(syscall.SYS_READV, uintptr(fd), - uintptr(unsafe.Pointer(&iovecs[0])), uintptr(len(iovecs))) - if e == 0 { - return int(n), nil - } - - event := pollEvent{ - fd: int32(fd), - events: 1, // POLLIN - } - - _, e = blockingPoll(&event, 1, -1) - if e != 0 && e != syscall.EINTR { - return 0, TranslateErrno(e) - } - } -} - -func blockingPoll(fds *pollEvent, nfds int, timeout int64) (int, syscall.Errno) { - n, _, e := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(fds)), - uintptr(nfds), uintptr(timeout)) - return int(n), e -} +package rawfile + +import ( + "syscall" + "netstack/tcpip" + "unsafe" +) + +// GetMTU 确定网络接口设备的 MTU +func GetMTU(name string) (uint32, error) { + fd, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0) + if err != nil { + return 0, err + } + + defer syscall.Close(fd) + + var ifreq struct { + name [16]byte + mtu int32 + _ [20]byte + } + + copy(ifreq.name[:], name) + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.SIOCGIFMTU, uintptr(unsafe.Pointer(&ifreq))) + if errno != 0 { + return 0, errno + } + + return uint32(ifreq.mtu), nil +} + +type pollEvent struct { + fd int32 + events int16 + revents int16 +} + +func NonBlockingWrite(fd int, buf []byte) *tcpip.Error { + var ptr unsafe.Pointer + if len(buf) > 0 { + ptr = unsafe.Pointer(&buf[0]) + } + + _, _, e := syscall.RawSyscall(syscall.SYS_WRITE, uintptr(fd), + uintptr(ptr), uintptr(len(buf))) + if e != 0 { + return TranslateErrno(e) + } + return nil +} + +func NonBlockingWrite2(fd int, b1, b2 []byte) *tcpip.Error { + if len(b2) == 0 { + return 
NonBlockingWrite(fd, b1) + } + /* + #include + + struct iovec { + void *iov_base; + size_t iov_len; + }; + **/ + iovec := [...]syscall.Iovec{ + { + Base: &b1[0], + Len: uint64(len(b1)), + }, + { + Base: &b2[0], + Len: uint64(len(b2)), + }, + } + + // ssize_t writev(int fildes, const struct iovec *iov, int iovcnt); + _, _, e := syscall.RawSyscall(syscall.SYS_WRITEV, uintptr(fd), + uintptr(unsafe.Pointer(&iovec[0])), uintptr(len(iovec))) + if e != 0 { + return TranslateErrno(e) + } + + return nil +} + +func BlockingRead(fd int, b []byte) (int, *tcpip.Error) { + for { + n, _, e := syscall.RawSyscall(syscall.SYS_READ, uintptr(fd), + uintptr(unsafe.Pointer(&b[0])), uintptr(len(b))) // read(fd,buf,len) + if e == 0 { + return int(n), nil + } + + event := pollEvent{ + fd: int32(fd), + events: 1, // POLLIN + } + + _, e = blockingPoll(&event, 1, -1) + if e != 0 && e != syscall.EINTR { + return 0, TranslateErrno(e) + } + } +} + +func BlockingReadv(fd int, iovecs []syscall.Iovec) (int, *tcpip.Error) { + for { + n, _, e := syscall.RawSyscall(syscall.SYS_READV, uintptr(fd), + uintptr(unsafe.Pointer(&iovecs[0])), uintptr(len(iovecs))) + if e == 0 { + return int(n), nil + } + + event := pollEvent{ + fd: int32(fd), + events: 1, // POLLIN + } + + _, e = blockingPoll(&event, 1, -1) + if e != 0 && e != syscall.EINTR { + return 0, TranslateErrno(e) + } + } +} + +func blockingPoll(fds *pollEvent, nfds int, timeout int64) (int, syscall.Errno) { + n, _, e := syscall.Syscall(syscall.SYS_POLL, uintptr(unsafe.Pointer(fds)), + uintptr(nfds), uintptr(timeout)) + return int(n), e +} diff --git a/tcpip/link/tuntap/tuntap.go b/tcpip/link/tuntap/tuntap.go index 3fb1a17..aeb0c44 100644 --- a/tcpip/link/tuntap/tuntap.go +++ b/tcpip/link/tuntap/tuntap.go @@ -1,138 +1,138 @@ -package tuntap - -import ( - "errors" - "fmt" - "os/exec" - "syscall" - "unsafe" -) - -const ( - TUN = 1 - TAP = 2 -) - -var ( - ErrDeviceMode = errors.New("unsupport device mode") -) - -type rawSockaddr struct { - Family uint16 
- Data [14]byte -} - -type Config struct { - Name string // 网卡名 - Mode int // 网卡模式 TUN or TAP -} - -// NewNetDev根据配置返回虚拟网卡的文件描述符 -func NewNetDev(c *Config) (fd int, err error) { - switch c.Mode { - case TUN: - fd, err = newTun(c.Name) - case TAP: - fd, err = newTap(c.Name) - default: - err = ErrDeviceMode - return - } - if err != nil { - return - } - return -} - -// TUN 工作在第二层 -func newTun(name string) (int, error) { - return open(name, syscall.IFF_TUN|syscall.IFF_NO_PI) -} - -// TAP工作在第三层 -func newTap(name string) (int, error){ - return open(name, syscall.IFF_TAP|syscall.IFF_NO_PI) -} - -func open(name string, flags uint16) (int, error) { - // 打开tuntap 设备 - fd, err := syscall.Open("/dev/net/tun", syscall.O_RDWR, 0) - if err != nil { - return -1, err - } - - var ifr struct { - name [16]byte - flags uint16 - _ [22]byte - } - - copy(ifr.name[:], name) - ifr.flags = flags - // 通过ioctl系统调用 将fd和虚拟网卡驱动绑定在一起 - _, _ , errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), - syscall.TUNSETIFF, uintptr(unsafe.Pointer(&ifr))) - if errno != 0 { - syscall.Close(fd) - return -1, errno - } - return fd, nil -} - -// SetLinkUp 让系统启动该网卡 ip link set tap0 up -func SetLinkUp(name string) (err error) { - // ip link set up - out, cmdErr := exec.Command("ip", "link", "set", name, "up").CombinedOutput() - if cmdErr != nil { - err = fmt.Errorf("%v:%v", cmdErr, string(out)) - return - } - return -} - -// SetRoute 通过ip命令添加路由 -func SetRoute(name, cidr string) (err error) { - // ip route add 192.168.1.0/24 dev tap0 - out, cmdErr := exec.Command("ip", "route", "add", cidr, "dev", name).CombinedOutput() - if cmdErr != nil { - err = fmt.Errorf("%v:%v", cmdErr, string(out)) - return - } - return -} - -// AddIP 通过ip命令添加IP地址 -func AddIP(name, ip string) (err error) { - // ip addr add 192.168.1.1 dev tap0 - out, cmdErr := exec.Command("ip", "addr", "add", ip, "dev", name).CombinedOutput() - if cmdErr != nil { - err = fmt.Errorf("%v:%v",cmdErr, string(out)) - return - } - return -} - -func 
GetHardwareAddr(name string) (string, error) { - fd, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0) // 新建socket文件 - if err != nil { - return "", nil - } - - defer syscall.Close(fd) - - var ifreq struct { - name [16]byte - addr rawSockaddr - _ [8]byte - } - - copy(ifreq.name[:], name) - _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.SIOCGIFHWADDR, - uintptr(unsafe.Pointer(&ifreq))) // 获取硬件地址 - if errno != 0 { - return "", errno - } - - mac := ifreq.addr.Data[:6] - return string(mac[:]), nil -} +package tuntap + +import ( + "errors" + "fmt" + "os/exec" + "syscall" + "unsafe" +) + +const ( + TUN = 1 + TAP = 2 +) + +var ( + ErrDeviceMode = errors.New("unsupport device mode") +) + +type rawSockaddr struct { + Family uint16 + Data [14]byte +} + +type Config struct { + Name string // 网卡名 + Mode int // 网卡模式 TUN or TAP +} + +// NewNetDev根据配置返回虚拟网卡的文件描述符 +func NewNetDev(c *Config) (fd int, err error) { + switch c.Mode { + case TUN: + fd, err = newTun(c.Name) + case TAP: + fd, err = newTap(c.Name) + default: + err = ErrDeviceMode + return + } + if err != nil { + return + } + return +} + +// TUN 工作在第二层 +func newTun(name string) (int, error) { + return open(name, syscall.IFF_TUN|syscall.IFF_NO_PI) +} + +// TAP工作在第三层 +func newTap(name string) (int, error){ + return open(name, syscall.IFF_TAP|syscall.IFF_NO_PI) +} + +func open(name string, flags uint16) (int, error) { + // 打开tuntap 设备 + fd, err := syscall.Open("/dev/net/tun", syscall.O_RDWR, 0) + if err != nil { + return -1, err + } + + var ifr struct { + name [16]byte + flags uint16 + _ [22]byte + } + + copy(ifr.name[:], name) + ifr.flags = flags + // 通过ioctl系统调用 将fd和虚拟网卡驱动绑定在一起 + _, _ , errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), + syscall.TUNSETIFF, uintptr(unsafe.Pointer(&ifr))) + if errno != 0 { + syscall.Close(fd) + return -1, errno + } + return fd, nil +} + +// SetLinkUp 让系统启动该网卡 ip link set tap0 up +func SetLinkUp(name string) (err error) { + // ip link set up + out, 
cmdErr := exec.Command("ip", "link", "set", name, "up").CombinedOutput() + if cmdErr != nil { + err = fmt.Errorf("%v:%v", cmdErr, string(out)) + return + } + return +} + +// SetRoute 通过ip命令添加路由 +func SetRoute(name, cidr string) (err error) { + // ip route add 192.168.1.0/24 dev tap0 + out, cmdErr := exec.Command("ip", "route", "add", cidr, "dev", name).CombinedOutput() + if cmdErr != nil { + err = fmt.Errorf("%v:%v", cmdErr, string(out)) + return + } + return +} + +// AddIP 通过ip命令添加IP地址 +func AddIP(name, ip string) (err error) { + // ip addr add 192.168.1.1 dev tap0 + out, cmdErr := exec.Command("ip", "addr", "add", ip, "dev", name).CombinedOutput() + if cmdErr != nil { + err = fmt.Errorf("%v:%v",cmdErr, string(out)) + return + } + return +} + +func GetHardwareAddr(name string) (string, error) { + fd, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0) // 新建socket文件 + if err != nil { + return "", nil + } + + defer syscall.Close(fd) + + var ifreq struct { + name [16]byte + addr rawSockaddr + _ [8]byte + } + + copy(ifreq.name[:], name) + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.SIOCGIFHWADDR, + uintptr(unsafe.Pointer(&ifreq))) // 获取硬件地址 + if errno != 0 { + return "", errno + } + + mac := ifreq.addr.Data[:6] + return string(mac[:]), nil +} diff --git a/tcpip/network/READMD.md b/tcpip/network/READMD.md index 24adee5..5606252 100644 --- a/tcpip/network/READMD.md +++ b/tcpip/network/READMD.md @@ -1,172 +1,172 @@ -# 网络层的基本实现 - -本章介绍网络层的实现,网络层又称网际层、ip 层,它是 tcpip 架构中核心的实现,全球计算机的互联很大部分归功于网络层,核心网络(路由器)都跑在网络层,为网络提供路由交换的功能,将数据包分发到相应的主机。虽然网络层在路由器上的实现比较复杂,因为要实现各种路由协议,但主机协议栈中的网络层并不复杂,因为它没有实现各种路由协议,路由表也很简单。下面介绍网络层提供的服务和实现网络层的 ip 协议-ipv4。 - -## 网络层提供的服务 -在计算机网络领域,曾经为网络层应该提供怎样的服务(面向连接还是无连接)引起了长时间的争论。最终因特网采用的设计思路是:网络层向上提供简单灵活的、无连接的、尽最大努力交付的数据报服务。所谓的数据报服务具有以下几个特点: - -1. 无需建立连接 -2. 不保证可靠性 -3. 每个分组都有终点的完整地址 -4. 每个分组独立选择路由进行转发 -5. 
可靠通信应该有上层负责 网络层的目的是实现两个主机之间的数据透明传送,具体功能包括寻址和路由选择等。它提供的服务使传输层不需要了解网络中的数据传输和交换技术。对网络层而言使用一种逻辑地址来唯一标识互联网上的设备,网络层依靠逻辑地址进行相互通信(类似于数据链路层的 MAC 地址),逻辑地址编址方案现主要有两种,IPv4 和 IPv6,我们主要讲协议栈对 IPv4 协议的处理。一般我们说 IP 地址,指的是 ipv4 地址。 - -## 网络层和链路层的功能区别 -之前讲过链路层也可以实现主机到主机的数据透明传输,那为何还需要网络层实现主机到主机的数据传输? - -因为链路层的数据交换是在同个局域网实现的,链路层的交换也就是二层交换,它依赖二层广播 ARP 报文,来学习 MAC 地址和端口的对应关系。当交换机从某个端口收到一个数据包,它会先读取包中的源 MAC 地址,再去读取包中的目的 MAC 地址,并在地址表中查找对应的端口,如表中有和目的 MAC 地址对应的端口,就把数据包直接复制到这个端口上。链路层其最基本的服务是将源自网络层来的数据可靠地传输到相邻节点的目标机网络层。 - -而网络层的数据交换是不限于局域网的,网络层连接着因特网中各局域网、广域网的设备,是互联网络的枢纽。网络层的数据交换(路由交换)是根据目的 IP,查找路由表找到下一跳的 IP 地址,再根据这个下一跳 IP 地址,查找转发表,将数据包转发给相应的端口。简单的说链路层的寻址关心 MAC 地址而不管数据包中的 IP 地址,而网络层的寻址关心 IP 地址,而不关心 MAC 地址,链路层和网络层的结合实现了世界上两台主机的数据互相传输。 - -## ipv4简介 - -IPv4,是互联网协议(Internet Protocol,IP)的第四版,也是第一个被广泛使用,构成现今互联网技术的基础的协议。IPv4 是一种无连接的协议,操作在使用分组交换的链路层(如以太网)上。此协议会尽最大努力交付数据包,意即它不保证任何数据包均能送达目的地,也不保证所有数据包均按照正确的顺序无重复地到达。这些方面是由上层的传输协议(如传输控制协议)处理的。 - -## ip报文 - -- IPv4,是互联网协议(Internet Protocol,IP)的第四版,也是第一个被广泛使用,构成现今互联网技术的基础的协议。IPv4 是一种无连接的协议,操作在使用分组交换的链路层(如以太网)上。此协议会尽最大努力交付数据包,意即它不保证任何数据包均能送达目的地,也不保证所有数据包均按照正确的顺序无重复地到达。这些方面是由上层的传输协议(如传输控制协议)处理的。 - -- 版本(Version) 版本字段占 4bit,通信双方使用的版本必须一致。对于 IPv4,字段的值是 4。 - -- 首部长度(Internet Header Length, IHL) 占 4bit,首部长度说明首部有多少 32 位字(4 字节)。由于 IPv4 首部可能包含数目不定的选项,这个字段也用来确定数据的偏移量。这个字段的最小值是 5(二进制 0101),相当于 5*4=20 字节(RFC 791),最大十进制值是 15。 - -- 区分服务(Differentiated Services,DS) 占 8bit,最初被定义为服务类型字段,实际上并未使用,但 1998 年被 IETF 重定义为区分服务 RFC 2474。只有在使用区分服务时,这个字段才起作用,在一般的情况 下都不使用这个字段。例如需要实时数据流的技术会应用这个字段,一个例子是 VoIP。 - -- 显式拥塞通告( Explicit Congestion Notification,ECN) 在 RFC 3168 中定义,允许在不丢弃报文的同时通知对方网络拥塞的发生。ECN 是一种可选的功能,仅当两端都支持并希望使用,且底层网络支持时才被使用。 - -- 全长(Total Length) 这个 16 位字段定义了报文总长,包含首部和数据,单位为字节。这个字段的最小值是 20(20 字节首部+0 字节数据),最大值是 2^16-1=65,535。IP 规定所有主机都必须支持最小 576 字节的报文,这是假定上层数据长度 512 字节,加上最长 IP 首部 60 字节,加上 4 字节富裕量,得出 576 字节,但大多数现代主机支持更大的报文。**当下层的数据链路协议的最大传输单元(MTU)字段的值小于 IP 报文长度时,报文就必须被分片,详细见下个标题。** - -- 标识符(Identification) 占 16 位,这个字段主要被用来唯一地标识一个报文的所有分片,因为分片不一定按序到达,所以在重组时需要知道分片所属的报文。每产生一个数据报,计数器加 
1,并赋值给此字段。一些实验性的工作建议将此字段用于其它目的,例如增加报文跟踪信息以协助探测伪造的源地址。 - -- 标志 (Flags) 这个 3 位字段用于控制和识别分片,它们是: 位 0:保留,必须为 0; 位 1:禁止分片(Don’t Fragment,DF),当 DF=0 时才允许分片; 位 2:更多分片(More Fragment,MF),MF=1 代表后面还有分片,MF=0 代表已经是最后一个分片。 如果 DF 标志被设置为 1,但路由要求必须分片报文,此报文会被丢弃。这个标志可被用于发往没有能力组装分片的主机。当一个报文被分片,除了最后一片外的所有分片都设置 MF 为 1。最后一个片段具有非零片段偏移字段,将其与未分片数据包区分开,未分片的偏移字段为 0。 - -- 分片偏移 (Fragment Offset) 这个 13 位字段指明了每个分片相对于原始报文开头的偏移量,以 8 字节作单位。 - -- 存活时间(Time To Live,TTL) 这个 8 位字段避免报文在互联网中永远存在(例如陷入路由环路)。存活时间以秒为单位,但小于一秒的时间均向上取整到一秒。在现实中,这实际上成了一个跳数计数器:报文经过的每个路由器都将此字段减 1,当此字段等于 0 时,报文不再向下一跳传送并被丢弃,最大值是 255。常规地,一份 ICMP 报文被发回报文发送端说明其发送的报文已被丢弃。这也是 traceroute 的核心原理。 - -- 协议 (Protocol) 占 8bit,这个字段定义了该报文数据区使用的协议。IANA 维护着一份协议列表(最初由 RFC 790 定义),详细参见 IP 协议号列表。 - -- 首部检验和 (Header Checksum) 这个 16 位检验和字段只对首部查错,不包括数据部分。在每一跳,路由器都要重新计算出首部检验和并与此字段进行比对,如果不一致,此报文将会被丢弃。重新计算的必要性是因为每一跳的一些首部字段(如 TTL、Flag、Offset 等)都有可能发生变化,不检查数据部分是为了减少工作量。数据区的错误留待上层协议处理——用户数据报协议(UDP)和传输控制协议(TCP)都有检验和字段。此处的检验计算方法不使用 CRC。 - -- 源地址 一个 IPv4 地址由四个字节共 32 位构成,此字段的值是将每个字节转为二进制并拼在一起所得到的 32 位值。例如,10.9.8.7 是 00001010000010010000100000000111。但请注意,因为 NAT 的存在,这个地址并不总是报文的真实发送端,因此发往此地址的报文会被送往 NAT 设备,并由它被翻译为真实的地址。 - -- 目的地址 与源地址格式相同,但指出报文的接收端。 - -- 选项 附加的首部字段可能跟在目的地址之后,但这并不被经常使用,从 1 到 40 个字节不等。请注意首部长度字段必须包括足够的 32 位字来放下所有的选项(包括任何必须的填充以使首部长度能够被 32 位整除)。当选项列表的结尾不是首部的结尾时,EOL(选项列表结束,0x00)选项被插入列表末尾。下表列出了可能。 - -|字段|长度(位)|描述| -|----|---------|----| -|备份| 1 |当此选项需要被备份到所有分片中时,设为 1。| -| 类 | 2 |常规的选项类别,0 为“控制”,2 为“查错和措施”,1 和 3 保留。| -|数字| 5 |指明一个选项。| -|长度| 8 |指明整个选项的长度,对于简单的选项此字段可能不存在。| -|数据| 可变|选项相关数据,对于简单的选项此字段可能不存在。| - -**注:如果首部长度大于 5,那么选项字段必然存在并必须被考虑。 注:备份、类和数字经常被一并称呼为“类型”。** - -- 数据 数据字段不是首部的一部分,因此并不被包含在首部检验和中。数据的格式在协议首部字段中被指明,并可以是任意的传输层协议。一些常见协议的协议字段值被列在下面: - -|协议字段值| 协议名 |缩写| -|---------|------------|----| -|1 |互联网控制消息协议|ICMP| -|2 |互联网组管理协议 |IGMP| -|6 |传输控制协议 |TCP| -|17 |用户数据报协议 |UDP| -|41 |IPv6 封装 |ENCAP| -|89 |开放式最短路径优先 |OSPF| -|132|流控制传输协议 |SCTP| - -## ipv4地址 -IPv4 使用 32 位(4 字节)地址,因此地址空间中只有 4,294,967,296(2^32)个地址。不过,一些地址是为特殊用途所保留的,如专用网络(约 1800 万 个地址)和多播地址(约 2.7 
亿个地址),这减少了可在互联网上路由的地址数量。随着地址不断被分配给最终用户,IPv4 地址枯竭问题也在随之产生。基于分类网络、无类别域间路由和网络地址转换的地址结构重构显著地减少了地址枯竭的速度。但在 2011 年 2 月 3 日,在最后 5 个地址块被分配给 5 个区域互联网注册管理机构之后,IANA 的主要地址池已经用尽。 - -IPv4 地址可被写作任何表示一个 32 位整数值的形式,但为了方便人类阅读和分析,它通常被写作点分十进制的形式,即四个字节被分开用十进制写出,中间用点分隔,如 192.168.1.1。ip 地址的编址方法一共经历过三个阶段: - -### 分类的 IP 地址 -- A 类网络地址占有 1 个字节(8 位),定义最高位为 0 来标识此类网络,余下 7 位为真正的网络地址。后面 3 个字节(24)为主机地址。A 类网络地址第一个字节的十进制值为:001~127.通常用于大型网络。 -- B 类网络地址占 2 个字节,使用最高两位为“10”来标识此类地址,其余 14 位为真正的网络地址,主机地址占后面的 2 个字节(16 位)。B 类网络地址第一个字节的十进制值为:128~191.通常用于中型网络。 -- C 类网络地址占 3 个字节,它是最通用的 Internet 地址。使用最高三位为“110”来标识此类地址。其余 21 位为真正的网络地址。主机地址占最后 1 个字节。C 类网络地址第一个字节的十进制值为:192~223。通常用于小型网络。 -- D 类地址是相当新的。它的识别头是 1110,用于组播,例如用于路由器修改。D 类网络地址第一个字节的十进制值为:224~239。 -- E 类地址为实验保留,其识别头是 1111。E 类网络地址第一个字节的十进制值为:240~255。 - -**但要注意得是,上面得这些地址分类已成为了历史,现在用的都是无分类 IP 地址进行路由选择。** - -### 子网的划分 - -由于上面固定分类的 IP 地址有不少的缺陷,比如,IP 地址空间的利用率很低、固定就意味着不够灵活、使路由表太大而影响性能,为了解决上述的问题,在 IP 地址概念中,又增加了一个“子网字段”,这样的话,一个 IP 地址可以用下面的方式表示 - -``` sh -IP地址 = (网络号,子网号,主机号) -``` - -### 无分类编址(CIDR) - -为了提高 ip 地址资源的利用率,提出了变长子网掩码(VLSM),而在 VLSM 的研究基础上又提出了“无分类编址”方法,也叫无分类域间路由选择-CIDR。 CIDR 最主要有两个以下特点: - -- 消除传统的 A,B,C 地址和划分子网的概念,更有效的分配 IPv4 的地址空间,CIDR 使 IP 地址又回到无分类的两级编码。记法:IP 地址::={<<网络前缀>,<<主机号>}。CIDR 还使用“斜线记法”即在 IP 地址后面加上“/”然后写网络前缀所占的位数。 -- CIDR 把网络前缀都相同的连续 IP 地址组成一个“CIDR 地址块”,即强化路由聚合(构成超网)。 其表示方法 - -``` sh -IP地址 = (网络前缀,主机号) -``` - -CIDR 还使用“斜线记法”,在 IP 地址后面加个“/”,紧跟着网络前缀所占的位数。例如:192.168.1.0/24,这种表示方式其实我们在上一章就用了,也是我们最常用的编址方式。 - -#### CIDR地址的计算方法 -CIDR无类域间路由,打破了原本的ABC类地址的规划限定,使用地址段分配更加灵活,日常工作中也经常使用,也正是因为其灵活的特点使我们无法一眼辨认出网络号、广播地址、网络中的第一台主机等信息,本文主要针对这些信息的获得介绍一些计算方法。 - -当给定一个IP地址,比如18.232.133.86/22,需要求一下这个IP所在网络的 网络地址、子网掩码、广播i地址、这个网络的第一台主机的IP地址: - -斜线后是22并不是8的整数倍,直接很难看出结果,所以需要通过一系列的计算。 - -1. 先用8的整数倍对22进行切割:22 = 16+6 ,所以这个IP地址的前16位保持不动即18.232. - -2. 发现问题出在了第三个8位上,这8位中前面6位被拿来做了网络号,后面2位被拿去做了主机号,所以将这8位转化为二进制得到10000101,拿出前6位为<100001>。这是得到了全部的网络号为 18.232.<100001> - -3. 将主机号全部置0便是网络地址,18.232.<100001><00>.<00000000>即网络地址为18.232.132.0 - -4. 
同时也得到了这个网络的第一台主机的ip地址,18.232.<100001><00>.<00000001>即18.232.132.1 - -5. 将主机位全部置1便是广播地址,18.232.<100001><11>.<11111111>即18.232.135.255 - -6. 子网掩码可以直接使用22计算即可,即前22位都为1,其余为0,即255.255.252.0 - - -| TYPE | CODE | Description | -| ---- | ---- | ------------| -| 0 | 0 | Echo Reply——回显应答(Ping 应答)   | -| 3 | 0 | Network Unreachable——网络不可达   | -| 3 | 1 | Host Unreachable——主机不可达   | -| 3 | 2 | Protocol Unreachable——协议不可达   | -| 3 | 3 | Port Unreachable——端口不可达   | -| 3 | 4 | Fragmentation needed but no frag. bit set——需要进行分片但设置不分片标志   | -| 3 | 5 | Source routing failed——源站选路失败   | -| 3 | 6 | Destination network unknown——目的网络未知   | -| 3 | 7 | Destination host unknown——目的主机未知   | -| 3 | 8 | Source host isolated (obsolete)——源主机被隔离(作废不用)   | -| 3 | 9 | Destination network administratively prohibited——目的网络被强制禁止   | -| 3 | 10 | Destination host administratively prohibited——目的主机被强制禁止   | -| 3 | 11 | Network unreachable for TOS——由于服务类型 TOS,网络不可达   | -| 3 | 12 | Host unreachable for TOS——由于服务类型 TOS,主机不可达   | -| 3 | 13 | Communication administratively prohibited by filtering——由于过滤,通信被强制禁止   | -| 3 | 14 | Host precedence violation——主机越权   | -| 3 | 15 | Precedence cutoff in effect——优先中止生效   | -| 4 | 0 | Source quench——源端被关闭(基本流控制)     | -| 5 | 0 | Redirect for network——对网络重定向     | -| 5 | 1 | Redirect for host——对主机重定向     | -| 5 | 2 | Redirect for TOS and network——对服务类型和网络重定向     | -| 5 | 3 | Redirect for TOS and host——对服务类型和主机重定向     | -| 8 | 0 | Echo request——回显请求(Ping 请求)   | -| 9 | 0 | Router advertisement——路由器通告     | -| 10 | 0 | Route solicitation——路由器请求     | -| 11 | 0 | TTL equals 0 during transit——传输期间生存时间为 0   | -| 11 | 1 | TTL equals 0 during reassembly——在数据报组装期间生存时间为 0   | -| 12 | 0 | IP header bad (catchall error)——坏的 IP 首部(包括各种差错)   | -| 12 | 1 | Required options missing——缺少必需的选项   | -| 13 | 0 | Timestamp request (obsolete)——时间戳请求(作废不用)   | -| 14 |   | Timestamp reply (obsolete)——时间戳应答(作废不用)   | -| 15 | 0 | Information request (obsolete)——信息请求(作废不用)   | -| 16 | 0 | Information 
reply (obsolete)——信息应答(作废不用)   | -| 17 | 0 | Address mask request——地址掩码请求   | -| 18 | 0 | Address mask | reply——地址掩码应答 | - +# 网络层的基本实现 + +本章介绍网络层的实现,网络层又称网际层、ip 层,它是 tcpip 架构中核心的实现,全球计算机的互联很大部分归功于网络层,核心网络(路由器)都跑在网络层,为网络提供路由交换的功能,将数据包分发到相应的主机。虽然网络层在路由器上的实现比较复杂,因为要实现各种路由协议,但主机协议栈中的网络层并不复杂,因为它没有实现各种路由协议,路由表也很简单。下面介绍网络层提供的服务和实现网络层的 ip 协议-ipv4。 + +## 网络层提供的服务 +在计算机网络领域,曾经为网络层应该提供怎样的服务(面向连接还是无连接)引起了长时间的争论。最终因特网采用的设计思路是:网络层向上提供简单灵活的、无连接的、尽最大努力交付的数据报服务。所谓的数据报服务具有以下几个特点: + +1. 无需建立连接 +2. 不保证可靠性 +3. 每个分组都有终点的完整地址 +4. 每个分组独立选择路由进行转发 +5. 可靠通信应该有上层负责 网络层的目的是实现两个主机之间的数据透明传送,具体功能包括寻址和路由选择等。它提供的服务使传输层不需要了解网络中的数据传输和交换技术。对网络层而言使用一种逻辑地址来唯一标识互联网上的设备,网络层依靠逻辑地址进行相互通信(类似于数据链路层的 MAC 地址),逻辑地址编址方案现主要有两种,IPv4 和 IPv6,我们主要讲协议栈对 IPv4 协议的处理。一般我们说 IP 地址,指的是 ipv4 地址。 + +## 网络层和链路层的功能区别 +之前讲过链路层也可以实现主机到主机的数据透明传输,那为何还需要网络层实现主机到主机的数据传输? + +因为链路层的数据交换是在同个局域网实现的,链路层的交换也就是二层交换,它依赖二层广播 ARP 报文,来学习 MAC 地址和端口的对应关系。当交换机从某个端口收到一个数据包,它会先读取包中的源 MAC 地址,再去读取包中的目的 MAC 地址,并在地址表中查找对应的端口,如表中有和目的 MAC 地址对应的端口,就把数据包直接复制到这个端口上。链路层其最基本的服务是将源自网络层来的数据可靠地传输到相邻节点的目标机网络层。 + +而网络层的数据交换是不限于局域网的,网络层连接着因特网中各局域网、广域网的设备,是互联网络的枢纽。网络层的数据交换(路由交换)是根据目的 IP,查找路由表找到下一跳的 IP 地址,再根据这个下一跳 IP 地址,查找转发表,将数据包转发给相应的端口。简单的说链路层的寻址关心 MAC 地址而不管数据包中的 IP 地址,而网络层的寻址关心 IP 地址,而不关心 MAC 地址,链路层和网络层的结合实现了世界上两台主机的数据互相传输。 + +## ipv4简介 + +IPv4,是互联网协议(Internet Protocol,IP)的第四版,也是第一个被广泛使用,构成现今互联网技术的基础的协议。IPv4 是一种无连接的协议,操作在使用分组交换的链路层(如以太网)上。此协议会尽最大努力交付数据包,意即它不保证任何数据包均能送达目的地,也不保证所有数据包均按照正确的顺序无重复地到达。这些方面是由上层的传输协议(如传输控制协议)处理的。 + +## ip报文 + +- IPv4,是互联网协议(Internet Protocol,IP)的第四版,也是第一个被广泛使用,构成现今互联网技术的基础的协议。IPv4 是一种无连接的协议,操作在使用分组交换的链路层(如以太网)上。此协议会尽最大努力交付数据包,意即它不保证任何数据包均能送达目的地,也不保证所有数据包均按照正确的顺序无重复地到达。这些方面是由上层的传输协议(如传输控制协议)处理的。 + +- 版本(Version) 版本字段占 4bit,通信双方使用的版本必须一致。对于 IPv4,字段的值是 4。 + +- 首部长度(Internet Header Length, IHL) 占 4bit,首部长度说明首部有多少 32 位字(4 字节)。由于 IPv4 首部可能包含数目不定的选项,这个字段也用来确定数据的偏移量。这个字段的最小值是 5(二进制 0101),相当于 5*4=20 字节(RFC 791),最大十进制值是 15。 + +- 区分服务(Differentiated Services,DS) 占 8bit,最初被定义为服务类型字段,实际上并未使用,但 1998 年被 IETF 重定义为区分服务 RFC 2474。只有在使用区分服务时,这个字段才起作用,在一般的情况 
下都不使用这个字段。例如需要实时数据流的技术会应用这个字段,一个例子是 VoIP。 + +- 显式拥塞通告( Explicit Congestion Notification,ECN) 在 RFC 3168 中定义,允许在不丢弃报文的同时通知对方网络拥塞的发生。ECN 是一种可选的功能,仅当两端都支持并希望使用,且底层网络支持时才被使用。 + +- 全长(Total Length) 这个 16 位字段定义了报文总长,包含首部和数据,单位为字节。这个字段的最小值是 20(20 字节首部+0 字节数据),最大值是 2^16-1=65,535。IP 规定所有主机都必须支持最小 576 字节的报文,这是假定上层数据长度 512 字节,加上最长 IP 首部 60 字节,加上 4 字节富裕量,得出 576 字节,但大多数现代主机支持更大的报文。**当下层的数据链路协议的最大传输单元(MTU)字段的值小于 IP 报文长度时,报文就必须被分片,详细见下个标题。** + +- 标识符(Identification) 占 16 位,这个字段主要被用来唯一地标识一个报文的所有分片,因为分片不一定按序到达,所以在重组时需要知道分片所属的报文。每产生一个数据报,计数器加 1,并赋值给此字段。一些实验性的工作建议将此字段用于其它目的,例如增加报文跟踪信息以协助探测伪造的源地址。 + +- 标志 (Flags) 这个 3 位字段用于控制和识别分片,它们是: 位 0:保留,必须为 0; 位 1:禁止分片(Don’t Fragment,DF),当 DF=0 时才允许分片; 位 2:更多分片(More Fragment,MF),MF=1 代表后面还有分片,MF=0 代表已经是最后一个分片。 如果 DF 标志被设置为 1,但路由要求必须分片报文,此报文会被丢弃。这个标志可被用于发往没有能力组装分片的主机。当一个报文被分片,除了最后一片外的所有分片都设置 MF 为 1。最后一个片段具有非零片段偏移字段,将其与未分片数据包区分开,未分片的偏移字段为 0。 + +- 分片偏移 (Fragment Offset) 这个 13 位字段指明了每个分片相对于原始报文开头的偏移量,以 8 字节作单位。 + +- 存活时间(Time To Live,TTL) 这个 8 位字段避免报文在互联网中永远存在(例如陷入路由环路)。存活时间以秒为单位,但小于一秒的时间均向上取整到一秒。在现实中,这实际上成了一个跳数计数器:报文经过的每个路由器都将此字段减 1,当此字段等于 0 时,报文不再向下一跳传送并被丢弃,最大值是 255。常规地,一份 ICMP 报文被发回报文发送端说明其发送的报文已被丢弃。这也是 traceroute 的核心原理。 + +- 协议 (Protocol) 占 8bit,这个字段定义了该报文数据区使用的协议。IANA 维护着一份协议列表(最初由 RFC 790 定义),详细参见 IP 协议号列表。 + +- 首部检验和 (Header Checksum) 这个 16 位检验和字段只对首部查错,不包括数据部分。在每一跳,路由器都要重新计算出首部检验和并与此字段进行比对,如果不一致,此报文将会被丢弃。重新计算的必要性是因为每一跳的一些首部字段(如 TTL、Flag、Offset 等)都有可能发生变化,不检查数据部分是为了减少工作量。数据区的错误留待上层协议处理——用户数据报协议(UDP)和传输控制协议(TCP)都有检验和字段。此处的检验计算方法不使用 CRC。 + +- 源地址 一个 IPv4 地址由四个字节共 32 位构成,此字段的值是将每个字节转为二进制并拼在一起所得到的 32 位值。例如,10.9.8.7 是 00001010000010010000100000000111。但请注意,因为 NAT 的存在,这个地址并不总是报文的真实发送端,因此发往此地址的报文会被送往 NAT 设备,并由它被翻译为真实的地址。 + +- 目的地址 与源地址格式相同,但指出报文的接收端。 + +- 选项 附加的首部字段可能跟在目的地址之后,但这并不被经常使用,从 1 到 40 个字节不等。请注意首部长度字段必须包括足够的 32 位字来放下所有的选项(包括任何必须的填充以使首部长度能够被 32 位整除)。当选项列表的结尾不是首部的结尾时,EOL(选项列表结束,0x00)选项被插入列表末尾。下表列出了可能。 + +|字段|长度(位)|描述| +|----|---------|----| +|备份| 1 |当此选项需要被备份到所有分片中时,设为 1。| +| 类 | 2 |常规的选项类别,0 为“控制”,2 为“查错和措施”,1 和 3 保留。| +|数字| 5 |指明一个选项。| +|长度| 
8 |指明整个选项的长度,对于简单的选项此字段可能不存在。| +|数据| 可变|选项相关数据,对于简单的选项此字段可能不存在。| + +**注:如果首部长度大于 5,那么选项字段必然存在并必须被考虑。 注:备份、类和数字经常被一并称呼为“类型”。** + +- 数据 数据字段不是首部的一部分,因此并不被包含在首部检验和中。数据的格式在协议首部字段中被指明,并可以是任意的传输层协议。一些常见协议的协议字段值被列在下面: + +|协议字段值| 协议名 |缩写| +|---------|------------|----| +|1 |互联网控制消息协议|ICMP| +|2 |互联网组管理协议 |IGMP| +|6 |传输控制协议 |TCP| +|17 |用户数据报协议 |UDP| +|41 |IPv6 封装 |ENCAP| +|89 |开放式最短路径优先 |OSPF| +|132|流控制传输协议 |SCTP| + +## ipv4地址 +IPv4 使用 32 位(4 字节)地址,因此地址空间中只有 4,294,967,296(2^32)个地址。不过,一些地址是为特殊用途所保留的,如专用网络(约 1800 万 个地址)和多播地址(约 2.7 亿个地址),这减少了可在互联网上路由的地址数量。随着地址不断被分配给最终用户,IPv4 地址枯竭问题也在随之产生。基于分类网络、无类别域间路由和网络地址转换的地址结构重构显著地减少了地址枯竭的速度。但在 2011 年 2 月 3 日,在最后 5 个地址块被分配给 5 个区域互联网注册管理机构之后,IANA 的主要地址池已经用尽。 + +IPv4 地址可被写作任何表示一个 32 位整数值的形式,但为了方便人类阅读和分析,它通常被写作点分十进制的形式,即四个字节被分开用十进制写出,中间用点分隔,如 192.168.1.1。ip 地址的编址方法一共经历过三个阶段: + +### 分类的 IP 地址 +- A 类网络地址占有 1 个字节(8 位),定义最高位为 0 来标识此类网络,余下 7 位为真正的网络地址。后面 3 个字节(24)为主机地址。A 类网络地址第一个字节的十进制值为:001~127.通常用于大型网络。 +- B 类网络地址占 2 个字节,使用最高两位为“10”来标识此类地址,其余 14 位为真正的网络地址,主机地址占后面的 2 个字节(16 位)。B 类网络地址第一个字节的十进制值为:128~191.通常用于中型网络。 +- C 类网络地址占 3 个字节,它是最通用的 Internet 地址。使用最高三位为“110”来标识此类地址。其余 21 位为真正的网络地址。主机地址占最后 1 个字节。C 类网络地址第一个字节的十进制值为:192~223。通常用于小型网络。 +- D 类地址是相当新的。它的识别头是 1110,用于组播,例如用于路由器修改。D 类网络地址第一个字节的十进制值为:224~239。 +- E 类地址为实验保留,其识别头是 1111。E 类网络地址第一个字节的十进制值为:240~255。 + +**但要注意得是,上面得这些地址分类已成为了历史,现在用的都是无分类 IP 地址进行路由选择。** + +### 子网的划分 + +由于上面固定分类的 IP 地址有不少的缺陷,比如,IP 地址空间的利用率很低、固定就意味着不够灵活、使路由表太大而影响性能,为了解决上述的问题,在 IP 地址概念中,又增加了一个“子网字段”,这样的话,一个 IP 地址可以用下面的方式表示 + +``` sh +IP地址 = (网络号,子网号,主机号) +``` + +### 无分类编址(CIDR) + +为了提高 ip 地址资源的利用率,提出了变长子网掩码(VLSM),而在 VLSM 的研究基础上又提出了“无分类编址”方法,也叫无分类域间路由选择-CIDR。 CIDR 最主要有两个以下特点: + +- 消除传统的 A,B,C 地址和划分子网的概念,更有效的分配 IPv4 的地址空间,CIDR 使 IP 地址又回到无分类的两级编码。记法:IP 地址::={<<网络前缀>,<<主机号>}。CIDR 还使用“斜线记法”即在 IP 地址后面加上“/”然后写网络前缀所占的位数。 +- CIDR 把网络前缀都相同的连续 IP 地址组成一个“CIDR 地址块”,即强化路由聚合(构成超网)。 其表示方法 + +``` sh +IP地址 = (网络前缀,主机号) +``` + +CIDR 还使用“斜线记法”,在 IP 地址后面加个“/”,紧跟着网络前缀所占的位数。例如:192.168.1.0/24,这种表示方式其实我们在上一章就用了,也是我们最常用的编址方式。 + +#### CIDR地址的计算方法 
+CIDR无类域间路由,打破了原本的ABC类地址的规划限定,使用地址段分配更加灵活,日常工作中也经常使用,也正是因为其灵活的特点使我们无法一眼辨认出网络号、广播地址、网络中的第一台主机等信息,本文主要针对这些信息的获得介绍一些计算方法。 + +当给定一个IP地址,比如18.232.133.86/22,需要求一下这个IP所在网络的 网络地址、子网掩码、广播i地址、这个网络的第一台主机的IP地址: + +斜线后是22并不是8的整数倍,直接很难看出结果,所以需要通过一系列的计算。 + +1. 先用8的整数倍对22进行切割:22 = 16+6 ,所以这个IP地址的前16位保持不动即18.232. + +2. 发现问题出在了第三个8位上,这8位中前面6位被拿来做了网络号,后面2位被拿去做了主机号,所以将这8位转化为二进制得到10000101,拿出前6位为<100001>。这是得到了全部的网络号为 18.232.<100001> + +3. 将主机号全部置0便是网络地址,18.232.<100001><00>.<00000000>即网络地址为18.232.132.0 + +4. 同时也得到了这个网络的第一台主机的ip地址,18.232.<100001><00>.<00000001>即18.232.132.1 + +5. 将主机位全部置1便是广播地址,18.232.<100001><11>.<11111111>即18.232.135.255 + +6. 子网掩码可以直接使用22计算即可,即前22位都为1,其余为0,即255.255.252.0 + + +| TYPE | CODE | Description | +| ---- | ---- | ------------| +| 0 | 0 | Echo Reply——回显应答(Ping 应答)   | +| 3 | 0 | Network Unreachable——网络不可达   | +| 3 | 1 | Host Unreachable——主机不可达   | +| 3 | 2 | Protocol Unreachable——协议不可达   | +| 3 | 3 | Port Unreachable——端口不可达   | +| 3 | 4 | Fragmentation needed but no frag. bit set——需要进行分片但设置不分片标志   | +| 3 | 5 | Source routing failed——源站选路失败   | +| 3 | 6 | Destination network unknown——目的网络未知   | +| 3 | 7 | Destination host unknown——目的主机未知   | +| 3 | 8 | Source host isolated (obsolete)——源主机被隔离(作废不用)   | +| 3 | 9 | Destination network administratively prohibited——目的网络被强制禁止   | +| 3 | 10 | Destination host administratively prohibited——目的主机被强制禁止   | +| 3 | 11 | Network unreachable for TOS——由于服务类型 TOS,网络不可达   | +| 3 | 12 | Host unreachable for TOS——由于服务类型 TOS,主机不可达   | +| 3 | 13 | Communication administratively prohibited by filtering——由于过滤,通信被强制禁止   | +| 3 | 14 | Host precedence violation——主机越权   | +| 3 | 15 | Precedence cutoff in effect——优先中止生效   | +| 4 | 0 | Source quench——源端被关闭(基本流控制)     | +| 5 | 0 | Redirect for network——对网络重定向     | +| 5 | 1 | Redirect for host——对主机重定向     | +| 5 | 2 | Redirect for TOS and network——对服务类型和网络重定向     | +| 5 | 3 | Redirect for TOS and host——对服务类型和主机重定向     | +| 8 | 0 | Echo request——回显请求(Ping 请求)   | +| 9 | 0 | Router 
advertisement——路由器通告     | +| 10 | 0 | Route solicitation——路由器请求     | +| 11 | 0 | TTL equals 0 during transit——传输期间生存时间为 0   | +| 11 | 1 | TTL equals 0 during reassembly——在数据报组装期间生存时间为 0   | +| 12 | 0 | IP header bad (catchall error)——坏的 IP 首部(包括各种差错)   | +| 12 | 1 | Required options missing——缺少必需的选项   | +| 13 | 0 | Timestamp request (obsolete)——时间戳请求(作废不用)   | +| 14 |   | Timestamp reply (obsolete)——时间戳应答(作废不用)   | +| 15 | 0 | Information request (obsolete)——信息请求(作废不用)   | +| 16 | 0 | Information reply (obsolete)——信息应答(作废不用)   | +| 17 | 0 | Address mask request——地址掩码请求   | +| 18 | 0 | Address mask | reply——地址掩码应答 | + IP 层最重要的目的是让两个主机之间通信,无论他们相隔多远。IP 协议理论上允许的最大 IP 数据报为 65535 字节(16 位来表示包总长)。但是因为协议栈网络层下面的数据链路层一般允许的帧长远远小于这个值,例如以太网的 MTU 通常在 1500 字节左右。所以较大的 IP 数据包会被分片传递给数据链路层发送,分片的 IP 数据报可能会以不同的路径传输到接收主机,接收主机通过一系列的重组,将其还原为一个完整的 IP 数据报,再提交给上层协议处理。IP 分片会带来一定的问题,分片和重组会消耗发送方、接收方一定的 CPU 等资源,如果存在大量的分片报文的话,可能会造成较为严重的资源消耗;分片丢包导致的重传问题;分片攻击。 \ No newline at end of file diff --git a/tcpip/network/arp/README.md b/tcpip/network/arp/README.md index b5e6957..57e9d26 100644 --- a/tcpip/network/arp/README.md +++ b/tcpip/network/arp/README.md @@ -1,24 +1,24 @@ -# arp协议介绍 - -在以太网协议中规定,同一局域网中的一台主机要和另一台主机进行直接通信,必须要知道目标主机的 MAC 地址。而在 TCP/IP 协议中,网络层和传输层只关心目标主机的 IP 地址。这就导致在以太网中使用 IP 协议时,数据链路层的以太网协议接到上层 IP 协议提供的数据中,只包含目的主机的 IP 地址。于是需要一种方法,根据目的主机的 IP 地址,获得其 MAC 地址。这就是 ARP 协议要做的事情。所谓地址解析(address resolution)就是主机在发送帧前将目标 IP 地址转换成目标 MAC 地址的过程。 - -当发送主机和目的主机不在同一个局域网中时,即便知道目的主机的 MAC 地址,两者也不能直接通信,必须经过路由转发才可以。所以此时,发送主机通过 ARP 协议获得的将不是目的主机的真实 MAC 地址,而是一台可以通往局域网外的路由器的 MAC 地址。于是此后发送主机发往目的主机的所有帧,都将发往该路由器,通过它向外发送。这种情况称为委托 ARP 或 ARP 代理(ARP Proxy)。 - - -还有一种免费 ARP(gratuitous ARP),它是指主机发送 ARP 查询(广播)自己的 IP 地址,当 ARP 功能被开启或者是端口初始配置完成,主机向网络发送免费 ARP 来查询自己的 IP 地址确认地址唯一可用。用来确定网络中是否有其他主机使用了 IP 地址,如果有应答则产生错误消息。免费 ARP 也可以做更新 ARP 缓存用,网络中的其他主机收到该广播则在缓存中更新条目,收到该广播的主机无论是否存在与 IP 地址相关的条目都会强制更新,如果存在旧条目则会将 MAC 更新为广播包中的 MAC。 - -## arp报文组成 - -1. 硬件类型(hard type) 硬件类型用来指代需要什么样的物理地址,如果硬件类型为 1,表示以太网地址 -2. 
协议类型 协议类型则是需要映射的协议地址类型,如果协议类型是 0x0800,表示 ipv4 协议。 -3. 硬件地址长度 表示硬件地址的长度,单位字节,一般都是以太网地址的长度为 6 字节。 -4. 协议地址长度: 表示协议地址的长度,单位字节,一般都是 ipv4 地址的长度为 4 字节。 -5. 操作码 这些值用于区分具体操作类型,因为字段都相同,所以必须指明操作码,不然连请求还是应答都分不清。 1=>ARP 请求, 2=>ARP 应答,3=>RARP 请求,4=>RARP 应答。 -6. 源硬件地址 源物理地址,如02:f2:02:f2:02:f2 -7. 源协议地址 源协议地址,如192.168.0.1 -8. 目标硬件地址 目标物理地址,如03:f2:03:f2:03:f2 -9. 目标协议地址。 目标协议地址,如 192.168.0.2 - -## ARP 高速缓存 - +# arp协议介绍 + +在以太网协议中规定,同一局域网中的一台主机要和另一台主机进行直接通信,必须要知道目标主机的 MAC 地址。而在 TCP/IP 协议中,网络层和传输层只关心目标主机的 IP 地址。这就导致在以太网中使用 IP 协议时,数据链路层的以太网协议接到上层 IP 协议提供的数据中,只包含目的主机的 IP 地址。于是需要一种方法,根据目的主机的 IP 地址,获得其 MAC 地址。这就是 ARP 协议要做的事情。所谓地址解析(address resolution)就是主机在发送帧前将目标 IP 地址转换成目标 MAC 地址的过程。 + +当发送主机和目的主机不在同一个局域网中时,即便知道目的主机的 MAC 地址,两者也不能直接通信,必须经过路由转发才可以。所以此时,发送主机通过 ARP 协议获得的将不是目的主机的真实 MAC 地址,而是一台可以通往局域网外的路由器的 MAC 地址。于是此后发送主机发往目的主机的所有帧,都将发往该路由器,通过它向外发送。这种情况称为委托 ARP 或 ARP 代理(ARP Proxy)。 + + +还有一种免费 ARP(gratuitous ARP),它是指主机发送 ARP 查询(广播)自己的 IP 地址,当 ARP 功能被开启或者是端口初始配置完成,主机向网络发送免费 ARP 来查询自己的 IP 地址确认地址唯一可用。用来确定网络中是否有其他主机使用了 IP 地址,如果有应答则产生错误消息。免费 ARP 也可以做更新 ARP 缓存用,网络中的其他主机收到该广播则在缓存中更新条目,收到该广播的主机无论是否存在与 IP 地址相关的条目都会强制更新,如果存在旧条目则会将 MAC 更新为广播包中的 MAC。 + +## arp报文组成 + +1. 硬件类型(hard type) 硬件类型用来指代需要什么样的物理地址,如果硬件类型为 1,表示以太网地址 +2. 协议类型 协议类型则是需要映射的协议地址类型,如果协议类型是 0x0800,表示 ipv4 协议。 +3. 硬件地址长度 表示硬件地址的长度,单位字节,一般都是以太网地址的长度为 6 字节。 +4. 协议地址长度: 表示协议地址的长度,单位字节,一般都是 ipv4 地址的长度为 4 字节。 +5. 操作码 这些值用于区分具体操作类型,因为字段都相同,所以必须指明操作码,不然连请求还是应答都分不清。 1=>ARP 请求, 2=>ARP 应答,3=>RARP 请求,4=>RARP 应答。 +6. 源硬件地址 源物理地址,如02:f2:02:f2:02:f2 +7. 源协议地址 源协议地址,如192.168.0.1 +8. 目标硬件地址 目标物理地址,如03:f2:03:f2:03:f2 +9. 
目标协议地址。 目标协议地址,如 192.168.0.2 + +## ARP 高速缓存 + 知道了 ARP 发送的原理后,我们不禁疑惑,如果每次发之前都要发送 ARP 请求硬件地址会不会太慢,但是实际上 ARP 的运行是非常高效的。那是因为每一个主机上都有一个 ARP 高速缓存,我们可以在命令行键入 arp -a 获取本机 ARP 高速缓存的所有内容。 \ No newline at end of file diff --git a/tcpip/network/arp/arp.go b/tcpip/network/arp/arp.go index bb09fbd..73c8b48 100644 --- a/tcpip/network/arp/arp.go +++ b/tcpip/network/arp/arp.go @@ -1,172 +1,172 @@ -// 主机的链路层寻址是通过 arp 表来实现的 -package arp - -import ( - "log" - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/header" - "netstack/tcpip/stack" -) - -const ( - ProtocolName = "arp" - ProtocolNumber = header.ARPProtocolNumber - ProtocolAddress = tcpip.Address("arp") -) - -// arp endpoint 一个网络层的实现 Implement stack.NetworkEndpoint -type endpoint struct { - nicid tcpip.NICID // arp报文使用的网卡 - addr tcpip.Address // 网络层地址 - linkEP stack.LinkEndpoint // MAC - linkAddrCache stack.LinkAddressCache // 链路高速缓存 -} - -func (e *endpoint) DefaultTTL() uint8 { - return 0 -} - -func (e *endpoint) MTU() uint32 { - lmtu := e.linkEP.MTU() - return lmtu - uint32(e.MaxHeaderLength()) -} - -func (e *endpoint) NICID() tcpip.NICID { - return e.nicid -} - -func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { - return e.linkEP.Capabilities() -} - -func (e *endpoint) ID() *stack.NetworkEndpointID { - return &stack.NetworkEndpointID{LocalAddress: ProtocolAddress} -} - -func (e *endpoint) MaxHeaderLength() uint16 { - return e.linkEP.MaxHeaderLength() + header.ARPSize -} - -// arp不支持写包 -func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { - return tcpip.ErrNotSupported -} - -// arp数据包的处理,包括arp请求和响应 -func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { - v := vv.First() - h := header.ARP(v) - if !h.IsValid() { - return - } - - // 判断操作码类型 - switch h.Op() { - case header.ARPRequest: - // 如果是ARP请求 - localAddr := tcpip.Address(h.ProtocolAddressTarget()) - if 
e.linkAddrCache.CheckLocalAddress(e.nicid, header.IPv4ProtocolNumber, localAddr) == 0 { - return // 无效的ARP请求 - } - - // arp报文所在的网卡绑定了这个地址 - hdr := buffer.NewPrependable(int(e.linkEP.MaxHeaderLength()) + header.ARPSize) // 以太 + ARP - pkt := header.ARP(hdr.Prepend(header.ARPSize)) // 取出 ARP - pkt.SetIPv4OverEthernet() - pkt.SetOp(header.ARPReply) - copy(pkt.HardwareAddressSender(), r.LocalLinkAddress[:]) // 写入本机MAC作为响应 NOTE - // 倒置目标与源 作为回应 - copy(pkt.ProtocolAddressSender(), h.ProtocolAddressTarget()) - copy(pkt.ProtocolAddressTarget(), h.ProtocolAddressSender()) - log.Println("处理注入的ARP请求 这里将返回一个ARP报文作为响应") - e.linkEP.WritePacket(r, hdr, buffer.VectorisedView{}, ProtocolNumber) // 往链路层写回消息 - // 注意这里的 fallthrough 表示需要继续执行下面分支的代码 - // 当收到 arp 请求需要添加到链路地址缓存中 - fallthrough // also fill the cache from requests - case header.ARPReply: - // 这里记录ip和mac对应关系,也就是arp表 - addr := tcpip.Address(h.ProtocolAddressSender()) - linkAddr := tcpip.LinkAddress(h.HardwareAddressSender()) - e.linkAddrCache.AddLinkAddress(e.nicid, addr, linkAddr) - default: - panic(tcpip.ErrUnknownProtocol) - } -} - -func (e *endpoint) Close() {} - -// 实现了 stack.NetworkProtocol 和 stack.LinkAddressResolver 两个接口 -type protocol struct{} - -func (p *protocol) Number() tcpip.NetworkProtocolNumber { - return ProtocolNumber -} - -func (p *protocol) NewEndpoint(nicid tcpip.NICID, addr tcpip.Address, linkAddrCache stack.LinkAddressCache, - dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) { - if addr != ProtocolAddress { - return nil, tcpip.ErrBadLocalAddress - } - return &endpoint{ - nicid: nicid, - addr: addr, - linkEP: linkEP, - linkAddrCache: linkAddrCache, - }, nil -} - -func (p *protocol) MinimumPacketSize() int { - return header.ARPSize -} - -func (p *protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { - h := header.ARP(v) - return tcpip.Address(h.ProtocolAddressSender()), ProtocolAddress -} - -func (p *protocol) SetOption(option interface{}) 
*tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -func (p *protocol) Option(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -// LinkAddressProtocol implements stack.LinkAddressResolver. -func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { - return header.IPv4ProtocolNumber -} - -// LinkAddressRequest implements stack.LinkAddressResolver. -func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error { - r := &stack.Route{ - RemoteLinkAddress: broadcastMAC, - } - - hdr := buffer.NewPrependable(int(linkEP.MaxHeaderLength()) + header.ARPSize) - h := header.ARP(hdr.Prepend(header.ARPSize)) - h.SetIPv4OverEthernet() - h.SetOp(header.ARPRequest) - copy(h.HardwareAddressSender(), linkEP.LinkAddress()) - copy(h.ProtocolAddressSender(), localAddr) - copy(h.ProtocolAddressTarget(), addr) - - return linkEP.WritePacket(r, hdr, buffer.VectorisedView{}, ProtocolNumber) -} - -// ResolveStaticAddress implements stack.LinkAddressResolver. 
-func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { - if addr == "\xff\xff\xff\xff" { - return broadcastMAC, true - } - return "", false -} - -var broadcastMAC = tcpip.LinkAddress([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}) - -func init() { - stack.RegisterNetworkProtocolFactory(ProtocolName, func() stack.NetworkProtocol { - return &protocol{} - }) -} +// 主机的链路层寻址是通过 arp 表来实现的 +package arp + +import ( + "log" + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/header" + "netstack/tcpip/stack" +) + +const ( + ProtocolName = "arp" + ProtocolNumber = header.ARPProtocolNumber + ProtocolAddress = tcpip.Address("arp") +) + +// arp endpoint 一个网络层的实现 Implement stack.NetworkEndpoint +type endpoint struct { + nicid tcpip.NICID // arp报文使用的网卡 + addr tcpip.Address // 网络层地址 + linkEP stack.LinkEndpoint // MAC + linkAddrCache stack.LinkAddressCache // 链路高速缓存 +} + +func (e *endpoint) DefaultTTL() uint8 { + return 0 +} + +func (e *endpoint) MTU() uint32 { + lmtu := e.linkEP.MTU() + return lmtu - uint32(e.MaxHeaderLength()) +} + +func (e *endpoint) NICID() tcpip.NICID { + return e.nicid +} + +func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { + return e.linkEP.Capabilities() +} + +func (e *endpoint) ID() *stack.NetworkEndpointID { + return &stack.NetworkEndpointID{LocalAddress: ProtocolAddress} +} + +func (e *endpoint) MaxHeaderLength() uint16 { + return e.linkEP.MaxHeaderLength() + header.ARPSize +} + +// arp不支持写包 +func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { + return tcpip.ErrNotSupported +} + +// arp数据包的处理,包括arp请求和响应 +func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { + v := vv.First() + h := header.ARP(v) + if !h.IsValid() { + return + } + + // 判断操作码类型 + switch h.Op() { + case header.ARPRequest: + // 如果是ARP请求 + localAddr := tcpip.Address(h.ProtocolAddressTarget()) + if 
e.linkAddrCache.CheckLocalAddress(e.nicid, header.IPv4ProtocolNumber, localAddr) == 0 { + return // 无效的ARP请求 + } + + // arp报文所在的网卡绑定了这个地址 + hdr := buffer.NewPrependable(int(e.linkEP.MaxHeaderLength()) + header.ARPSize) // 以太 + ARP + pkt := header.ARP(hdr.Prepend(header.ARPSize)) // 取出 ARP + pkt.SetIPv4OverEthernet() + pkt.SetOp(header.ARPReply) + copy(pkt.HardwareAddressSender(), r.LocalLinkAddress[:]) // 写入本机MAC作为响应 NOTE + // 倒置目标与源 作为回应 + copy(pkt.ProtocolAddressSender(), h.ProtocolAddressTarget()) + copy(pkt.ProtocolAddressTarget(), h.ProtocolAddressSender()) + log.Println("处理注入的ARP请求 这里将返回一个ARP报文作为响应") + e.linkEP.WritePacket(r, hdr, buffer.VectorisedView{}, ProtocolNumber) // 往链路层写回消息 + // 注意这里的 fallthrough 表示需要继续执行下面分支的代码 + // 当收到 arp 请求需要添加到链路地址缓存中 + fallthrough // also fill the cache from requests + case header.ARPReply: + // 这里记录ip和mac对应关系,也就是arp表 + addr := tcpip.Address(h.ProtocolAddressSender()) + linkAddr := tcpip.LinkAddress(h.HardwareAddressSender()) + e.linkAddrCache.AddLinkAddress(e.nicid, addr, linkAddr) + default: + panic(tcpip.ErrUnknownProtocol) + } +} + +func (e *endpoint) Close() {} + +// 实现了 stack.NetworkProtocol 和 stack.LinkAddressResolver 两个接口 +type protocol struct{} + +func (p *protocol) Number() tcpip.NetworkProtocolNumber { + return ProtocolNumber +} + +func (p *protocol) NewEndpoint(nicid tcpip.NICID, addr tcpip.Address, linkAddrCache stack.LinkAddressCache, + dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) { + if addr != ProtocolAddress { + return nil, tcpip.ErrBadLocalAddress + } + return &endpoint{ + nicid: nicid, + addr: addr, + linkEP: linkEP, + linkAddrCache: linkAddrCache, + }, nil +} + +func (p *protocol) MinimumPacketSize() int { + return header.ARPSize +} + +func (p *protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { + h := header.ARP(v) + return tcpip.Address(h.ProtocolAddressSender()), ProtocolAddress +} + +func (p *protocol) SetOption(option interface{}) 
*tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +func (p *protocol) Option(option interface{}) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +// LinkAddressProtocol implements stack.LinkAddressResolver. +func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { + return header.IPv4ProtocolNumber +} + +// LinkAddressRequest implements stack.LinkAddressResolver. +func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error { + r := &stack.Route{ + RemoteLinkAddress: broadcastMAC, + } + + hdr := buffer.NewPrependable(int(linkEP.MaxHeaderLength()) + header.ARPSize) + h := header.ARP(hdr.Prepend(header.ARPSize)) + h.SetIPv4OverEthernet() + h.SetOp(header.ARPRequest) + copy(h.HardwareAddressSender(), linkEP.LinkAddress()) + copy(h.ProtocolAddressSender(), localAddr) + copy(h.ProtocolAddressTarget(), addr) + + return linkEP.WritePacket(r, hdr, buffer.VectorisedView{}, ProtocolNumber) +} + +// ResolveStaticAddress implements stack.LinkAddressResolver. 
+func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { + if addr == "\xff\xff\xff\xff" { + return broadcastMAC, true + } + return "", false +} + +var broadcastMAC = tcpip.LinkAddress([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}) + +func init() { + stack.RegisterNetworkProtocolFactory(ProtocolName, func() stack.NetworkProtocol { + return &protocol{} + }) +} diff --git a/tcpip/network/arp/arp_test.go b/tcpip/network/arp/arp_test.go index 71ec148..66f66d5 100644 --- a/tcpip/network/arp/arp_test.go +++ b/tcpip/network/arp/arp_test.go @@ -1,134 +1,134 @@ -package arp_test - -import ( - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/header" - "netstack/tcpip/link/channel" - "netstack/tcpip/network/arp" - "netstack/tcpip/network/ipv4" - "netstack/tcpip/stack" - "testing" - "time" -) - -const ( - stackLinkAddr = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c") // 0a:0a:0b:0b:0c:0c - stackAddr1 = tcpip.Address("\x0a\x00\x00\x01") // 10.0.0.1 - stackAddr2 = tcpip.Address("\x0a\x00\x00\x02") // 10.0.0.2 - stackAddrBad = tcpip.Address("\x0a\x00\x00\x03") // 10.0.0.3 -) - -type testContext struct { - t *testing.T - linkEP *channel.Endpoint - s *stack.Stack -} - -func newTestContext(t *testing.T) *testContext { - s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, nil, stack.Options{}) - - const defaultMTU = 65536 - id, linkEP := channel.New(256, defaultMTU, stackLinkAddr) - if err := s.CreateNIC(1, id); err != nil { - t.Fatalf("CreateNIC failed: %v", err) - } - - if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr1); err != nil { - t.Fatalf("AddAddress for ipv4 failed: %v", err) - } - if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr2); err != nil { - t.Fatalf("AddAddress for ipv4 failed: %v", err) - } - if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { - t.Fatalf("AddAddress for arp failed: %v", err) - } - - s.SetRouteTable([]tcpip.Route{{ - Destination: "\x00\x00\x00\x00", - Mask: 
"\x00\x00\x00\x00", - Gateway: "", - NIC: 1, - }}) - - return &testContext{ - t: t, - s: s, - linkEP: linkEP, - } -} - -func (c *testContext) cleanup() { - close(c.linkEP.C) -} - -func TestArpBase(t *testing.T) { - c := newTestContext(t) - defer c.cleanup() - - const senderMAC = "\x01\x02\x03\x04\x05\x06" - const senderIPv4 = "\x0a\x00\x00\x02" - - v := make(buffer.View, header.ARPSize) - h := header.ARP(v) - h.SetIPv4OverEthernet() - h.SetOp(header.ARPRequest) // 一个ARP请求 - copy(h.HardwareAddressSender(), senderMAC) // Local MAC - copy(h.ProtocolAddressSender(), senderIPv4) // Local IP - - inject := func(addr tcpip.Address) { - copy(h.ProtocolAddressTarget(), addr) - c.linkEP.Inject(arp.ProtocolNumber, v.ToVectorisedView()) // 往链路层注入一个arp报文 链路层将会自动分发它 - } - - inject(stackAddr1) // target IP 10.0.0.1 - select { - case pkt := <-c.linkEP.C: - if pkt.Proto != arp.ProtocolNumber { - t.Fatalf("stackAddr1: expected ARP response, got network protocol number %v", pkt.Proto) - } - rep := header.ARP(pkt.Header) - if !rep.IsValid() { - t.Fatalf("stackAddr1: invalid ARP response len(pkt.Header)=%d", len(pkt.Header)) - } - if tcpip.Address(rep.ProtocolAddressSender()) != stackAddr1 { - t.Errorf("stackAddr1: expected sender to be set") - } - if got := tcpip.LinkAddress(rep.HardwareAddressSender()); got != stackLinkAddr { - t.Errorf("stackAddr1: expected sender to be stackLinkAddr, got %q", got) - } - case <-time.After(100 * time.Millisecond): - t.Fatalf("Case #1 Time Out\n") - } - - inject(stackAddr2) - select { - case pkt := <-c.linkEP.C: - if pkt.Proto != arp.ProtocolNumber { - t.Fatalf("stackAddr2: expected ARP response, got network protocol number %v", pkt.Proto) - } - rep := header.ARP(pkt.Header) - if !rep.IsValid() { - t.Fatalf("stackAddr2: invalid ARP response len(pkt.Header)=%d", len(pkt.Header)) - } - if tcpip.Address(rep.ProtocolAddressSender()) != stackAddr2 { - t.Errorf("stackAddr2: expected sender to be set") - } - if got := 
tcpip.LinkAddress(rep.HardwareAddressSender()); got != stackLinkAddr { - t.Errorf("stackAddr2: expected sender to be stackLinkAddr, got %q", got) - } - - case <-time.After(100 * time.Millisecond): - t.Fatalf("Case #2 Time Out\n") - } - - inject(stackAddrBad) - select { - case pkt := <-c.linkEP.C: - t.Errorf("stackAddrBad: unexpected packet sent, Proto=%v", pkt.Proto) - case <-time.After(100 * time.Millisecond): - // Sleep tests are gross, but this will only potentially flake - // if there's a bug. If there is no bug this will reliably - // succeed. - } -} +package arp_test + +import ( + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/header" + "netstack/tcpip/link/channel" + "netstack/tcpip/network/arp" + "netstack/tcpip/network/ipv4" + "netstack/tcpip/stack" + "testing" + "time" +) + +const ( + stackLinkAddr = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c") // 0a:0a:0b:0b:0c:0c + stackAddr1 = tcpip.Address("\x0a\x00\x00\x01") // 10.0.0.1 + stackAddr2 = tcpip.Address("\x0a\x00\x00\x02") // 10.0.0.2 + stackAddrBad = tcpip.Address("\x0a\x00\x00\x03") // 10.0.0.3 +) + +type testContext struct { + t *testing.T + linkEP *channel.Endpoint + s *stack.Stack +} + +func newTestContext(t *testing.T) *testContext { + s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, nil, stack.Options{}) + + const defaultMTU = 65536 + id, linkEP := channel.New(256, defaultMTU, stackLinkAddr) + if err := s.CreateNIC(1, id); err != nil { + t.Fatalf("CreateNIC failed: %v", err) + } + + if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr1); err != nil { + t.Fatalf("AddAddress for ipv4 failed: %v", err) + } + if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr2); err != nil { + t.Fatalf("AddAddress for ipv4 failed: %v", err) + } + if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + t.Fatalf("AddAddress for arp failed: %v", err) + } + + s.SetRouteTable([]tcpip.Route{{ + Destination: "\x00\x00\x00\x00", + Mask: "\x00\x00\x00\x00", + 
Gateway: "", + NIC: 1, + }}) + + return &testContext{ + t: t, + s: s, + linkEP: linkEP, + } +} + +func (c *testContext) cleanup() { + close(c.linkEP.C) +} + +func TestArpBase(t *testing.T) { + c := newTestContext(t) + defer c.cleanup() + + const senderMAC = "\x01\x02\x03\x04\x05\x06" + const senderIPv4 = "\x0a\x00\x00\x02" + + v := make(buffer.View, header.ARPSize) + h := header.ARP(v) + h.SetIPv4OverEthernet() + h.SetOp(header.ARPRequest) // 一个ARP请求 + copy(h.HardwareAddressSender(), senderMAC) // Local MAC + copy(h.ProtocolAddressSender(), senderIPv4) // Local IP + + inject := func(addr tcpip.Address) { + copy(h.ProtocolAddressTarget(), addr) + c.linkEP.Inject(arp.ProtocolNumber, v.ToVectorisedView()) // 往链路层注入一个arp报文 链路层将会自动分发它 + } + + inject(stackAddr1) // target IP 10.0.0.1 + select { + case pkt := <-c.linkEP.C: + if pkt.Proto != arp.ProtocolNumber { + t.Fatalf("stackAddr1: expected ARP response, got network protocol number %v", pkt.Proto) + } + rep := header.ARP(pkt.Header) + if !rep.IsValid() { + t.Fatalf("stackAddr1: invalid ARP response len(pkt.Header)=%d", len(pkt.Header)) + } + if tcpip.Address(rep.ProtocolAddressSender()) != stackAddr1 { + t.Errorf("stackAddr1: expected sender to be set") + } + if got := tcpip.LinkAddress(rep.HardwareAddressSender()); got != stackLinkAddr { + t.Errorf("stackAddr1: expected sender to be stackLinkAddr, got %q", got) + } + case <-time.After(100 * time.Millisecond): + t.Fatalf("Case #1 Time Out\n") + } + + inject(stackAddr2) + select { + case pkt := <-c.linkEP.C: + if pkt.Proto != arp.ProtocolNumber { + t.Fatalf("stackAddr2: expected ARP response, got network protocol number %v", pkt.Proto) + } + rep := header.ARP(pkt.Header) + if !rep.IsValid() { + t.Fatalf("stackAddr2: invalid ARP response len(pkt.Header)=%d", len(pkt.Header)) + } + if tcpip.Address(rep.ProtocolAddressSender()) != stackAddr2 { + t.Errorf("stackAddr2: expected sender to be set") + } + if got := tcpip.LinkAddress(rep.HardwareAddressSender()); got != 
stackLinkAddr { + t.Errorf("stackAddr2: expected sender to be stackLinkAddr, got %q", got) + } + + case <-time.After(100 * time.Millisecond): + t.Fatalf("Case #2 Time Out\n") + } + + inject(stackAddrBad) + select { + case pkt := <-c.linkEP.C: + t.Errorf("stackAddrBad: unexpected packet sent, Proto=%v", pkt.Proto) + case <-time.After(100 * time.Millisecond): + // Sleep tests are gross, but this will only potentially flake + // if there's a bug. If there is no bug this will reliably + // succeed. + } +} diff --git a/tcpip/network/fragmentation/frag_heap.go b/tcpip/network/fragmentation/frag_heap.go index 5e29b60..138c35c 100644 --- a/tcpip/network/fragmentation/frag_heap.go +++ b/tcpip/network/fragmentation/frag_heap.go @@ -1,77 +1,77 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package fragmentation - -import ( - "container/heap" - "fmt" - - "netstack/tcpip/buffer" -) - -type fragment struct { - offset uint16 - vv buffer.VectorisedView -} - -type fragHeap []fragment - -func (h *fragHeap) Len() int { - return len(*h) -} - -func (h *fragHeap) Less(i, j int) bool { - return (*h)[i].offset < (*h)[j].offset -} - -func (h *fragHeap) Swap(i, j int) { - (*h)[i], (*h)[j] = (*h)[j], (*h)[i] -} - -func (h *fragHeap) Push(x interface{}) { - *h = append(*h, x.(fragment)) -} - -func (h *fragHeap) Pop() interface{} { - old := *h - n := len(old) - x := old[n-1] - *h = old[:n-1] - return x -} - -// reassamble empties the heap and returns a VectorisedView -// containing a reassambled version of the fragments inside the heap. -func (h *fragHeap) reassemble() (buffer.VectorisedView, error) { - curr := heap.Pop(h).(fragment) - views := curr.vv.Views() - size := curr.vv.Size() - - if curr.offset != 0 { - return buffer.VectorisedView{}, fmt.Errorf("offset of the first packet is != 0 (%d)", curr.offset) - } - - for h.Len() > 0 { - curr := heap.Pop(h).(fragment) - if int(curr.offset) < size { - curr.vv.TrimFront(size - int(curr.offset)) - } else if int(curr.offset) > size { - return buffer.VectorisedView{}, fmt.Errorf("packet has a hole, expected offset %d, got %d", size, curr.offset) - } - size += curr.vv.Size() - views = append(views, curr.vv.Views()...) - } - return buffer.NewVectorisedView(size, views), nil -} +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package fragmentation + +import ( + "container/heap" + "fmt" + + "netstack/tcpip/buffer" +) + +type fragment struct { + offset uint16 + vv buffer.VectorisedView +} + +type fragHeap []fragment + +func (h *fragHeap) Len() int { + return len(*h) +} + +func (h *fragHeap) Less(i, j int) bool { + return (*h)[i].offset < (*h)[j].offset +} + +func (h *fragHeap) Swap(i, j int) { + (*h)[i], (*h)[j] = (*h)[j], (*h)[i] +} + +func (h *fragHeap) Push(x interface{}) { + *h = append(*h, x.(fragment)) +} + +func (h *fragHeap) Pop() interface{} { + old := *h + n := len(old) + x := old[n-1] + *h = old[:n-1] + return x +} + +// reassamble empties the heap and returns a VectorisedView +// containing a reassambled version of the fragments inside the heap. +func (h *fragHeap) reassemble() (buffer.VectorisedView, error) { + curr := heap.Pop(h).(fragment) + views := curr.vv.Views() + size := curr.vv.Size() + + if curr.offset != 0 { + return buffer.VectorisedView{}, fmt.Errorf("offset of the first packet is != 0 (%d)", curr.offset) + } + + for h.Len() > 0 { + curr := heap.Pop(h).(fragment) + if int(curr.offset) < size { + curr.vv.TrimFront(size - int(curr.offset)) + } else if int(curr.offset) > size { + return buffer.VectorisedView{}, fmt.Errorf("packet has a hole, expected offset %d, got %d", size, curr.offset) + } + size += curr.vv.Size() + views = append(views, curr.vv.Views()...) + } + return buffer.NewVectorisedView(size, views), nil +} diff --git a/tcpip/network/fragmentation/fragmentation.go b/tcpip/network/fragmentation/fragmentation.go index beaf7fe..1de42de 100644 --- a/tcpip/network/fragmentation/fragmentation.go +++ b/tcpip/network/fragmentation/fragmentation.go @@ -1,105 +1,105 @@ -package fragmentation - -import ( - "log" - "netstack/tcpip/buffer" - "sync" - "time" -) - -// DefaultReassembleTimeout is based on the linux stack: net.ipv4.ipfrag_time. 
-const DefaultReassembleTimeout = 30 * time.Second - -// HighFragThreshold is the threshold at which we start trimming old -// fragmented packets. Linux uses a default value of 4 MB. See -// net.ipv4.ipfrag_high_thresh for more information. -const HighFragThreshold = 4 << 20 // 4MB - -// LowFragThreshold is the threshold we reach to when we start dropping -// older fragmented packets. It's important that we keep enough room for newer -// packets to be re-assembled. Hence, this needs to be lower than -// HighFragThreshold enough. Linux uses a default value of 3 MB. See -// net.ipv4.ipfrag_low_thresh for more information. -const LowFragThreshold = 3 << 20 // 3MB - -// Fragmentation 分片处理器对象 -type Fragmentation struct { - mu sync.Mutex - highLimit int - lowLimit int - reassemblers map[uint32]*reassembler // IP报文hash:重组器 - rList reassemblerList - size int - timeout time.Duration -} - -// NewFragmentation 新建一个分片处理器 -func NewFragmentation(highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration) *Fragmentation { - if lowMemoryLimit >= highMemoryLimit { - lowMemoryLimit = highMemoryLimit - } - - if lowMemoryLimit < 0 { - lowMemoryLimit = 0 - } - - return &Fragmentation{ - reassemblers: make(map[uint32]*reassembler), - highLimit: highMemoryLimit, - lowLimit: lowMemoryLimit, - timeout: reassemblingTimeout, - } -} - -// Process 处理ip报文分片 -func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool) { - f.mu.Lock() - r, ok := f.reassemblers[id] - if ok && r.tooOld(f.timeout) { // 检测一个分片是否存在超过了30s - // This is very likely to be an id-collision or someone performing a slow-rate attack. 
- f.release(r) - ok = false - } - if !ok { // 首次注册该报文的分片 - r = newReassembler(id) - f.reassemblers[id] = r - f.rList.PushFront(r) - } - f.mu.Unlock() - - res, done, consumed := r.process(first, last, more, vv) - - f.mu.Lock() - f.size += consumed - log.Printf("[%d]的分片 [%d,%d] 合并中\n", id, first, last) - if done { - f.release(r) - } - // Evict reassemblers if we are consuming more memory than highLimit until - // we reach lowLimit. - if f.size > f.highLimit { - tail := f.rList.Back() - for f.size > f.lowLimit && tail != nil { - f.release(tail) - tail = tail.Prev() - } - } - f.mu.Unlock() - return res, done -} - -func (f *Fragmentation) release(r *reassembler) { - // Before releasing a fragment we need to check if r is already marked as done. - // Otherwise, we would delete it twice. - if r.checkDoneOrMark() { - return - } - - delete(f.reassemblers, r.id) - f.rList.Remove(r) - f.size -= r.size - if f.size < 0 { - log.Printf("memory counter < 0 (%d), this is an accounting bug that requires investigation", f.size) - f.size = 0 - } -} +package fragmentation + +import ( + "log" + "netstack/tcpip/buffer" + "sync" + "time" +) + +// DefaultReassembleTimeout is based on the linux stack: net.ipv4.ipfrag_time. +const DefaultReassembleTimeout = 30 * time.Second + +// HighFragThreshold is the threshold at which we start trimming old +// fragmented packets. Linux uses a default value of 4 MB. See +// net.ipv4.ipfrag_high_thresh for more information. +const HighFragThreshold = 4 << 20 // 4MB + +// LowFragThreshold is the threshold we reach to when we start dropping +// older fragmented packets. It's important that we keep enough room for newer +// packets to be re-assembled. Hence, this needs to be lower than +// HighFragThreshold enough. Linux uses a default value of 3 MB. See +// net.ipv4.ipfrag_low_thresh for more information. 
+const LowFragThreshold = 3 << 20 // 3MB + +// Fragmentation 分片处理器对象 +type Fragmentation struct { + mu sync.Mutex + highLimit int + lowLimit int + reassemblers map[uint32]*reassembler // IP报文hash:重组器 + rList reassemblerList + size int + timeout time.Duration +} + +// NewFragmentation 新建一个分片处理器 +func NewFragmentation(highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration) *Fragmentation { + if lowMemoryLimit >= highMemoryLimit { + lowMemoryLimit = highMemoryLimit + } + + if lowMemoryLimit < 0 { + lowMemoryLimit = 0 + } + + return &Fragmentation{ + reassemblers: make(map[uint32]*reassembler), + highLimit: highMemoryLimit, + lowLimit: lowMemoryLimit, + timeout: reassemblingTimeout, + } +} + +// Process 处理ip报文分片 +func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool) { + f.mu.Lock() + r, ok := f.reassemblers[id] + if ok && r.tooOld(f.timeout) { // 检测一个分片是否存在超过了30s + // This is very likely to be an id-collision or someone performing a slow-rate attack. + f.release(r) + ok = false + } + if !ok { // 首次注册该报文的分片 + r = newReassembler(id) + f.reassemblers[id] = r + f.rList.PushFront(r) + } + f.mu.Unlock() + + res, done, consumed := r.process(first, last, more, vv) + + f.mu.Lock() + f.size += consumed + log.Printf("[%d]的分片 [%d,%d] 合并中\n", id, first, last) + if done { + f.release(r) + } + // Evict reassemblers if we are consuming more memory than highLimit until + // we reach lowLimit. + if f.size > f.highLimit { + tail := f.rList.Back() + for f.size > f.lowLimit && tail != nil { + f.release(tail) + tail = tail.Prev() + } + } + f.mu.Unlock() + return res, done +} + +func (f *Fragmentation) release(r *reassembler) { + // Before releasing a fragment we need to check if r is already marked as done. + // Otherwise, we would delete it twice. 
+ if r.checkDoneOrMark() { + return + } + + delete(f.reassemblers, r.id) + f.rList.Remove(r) + f.size -= r.size + if f.size < 0 { + log.Printf("memory counter < 0 (%d), this is an accounting bug that requires investigation", f.size) + f.size = 0 + } +} diff --git a/tcpip/network/fragmentation/fragmentation_test.go b/tcpip/network/fragmentation/fragmentation_test.go index 70634dc..263316d 100644 --- a/tcpip/network/fragmentation/fragmentation_test.go +++ b/tcpip/network/fragmentation/fragmentation_test.go @@ -1,171 +1,171 @@ -package fragmentation_test - -import ( - "log" - "math" - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/header" - "netstack/tcpip/link/channel" - "netstack/tcpip/network/arp" - "netstack/tcpip/network/ipv4" - "netstack/tcpip/stack" - "testing" - "time" -) - -const ( - stackLinkAddr = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c") // 0a:0a:0b:0b:0c:0c - stackAddr1 = tcpip.Address("\x0a\x00\x00\x01") // 10.0.0.1 - stackAddr2 = tcpip.Address("\x0a\x00\x00\x02") // 10.0.0.2 - stackAddrBad = tcpip.Address("\x0a\x00\x00\x03") // 10.0.0.3 -) - -type testContext struct { - t *testing.T - linkEP *channel.Endpoint - s *stack.Stack - id uint16 -} - -func newTestContext(t *testing.T) *testContext { - s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, nil, stack.Options{}) - - const defaultMTU = 65536 - id, linkEP := channel.New(256, defaultMTU, stackLinkAddr) - if err := s.CreateNIC(1, id); err != nil { - t.Fatalf("CreateNIC failed: %v", err) - } - - if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr1); err != nil { - t.Fatalf("AddAddress for ipv4 failed: %v", err) - } - if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr2); err != nil { - t.Fatalf("AddAddress for ipv4 failed: %v", err) - } - if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { - t.Fatalf("AddAddress for arp failed: %v", err) - } - - s.SetRouteTable([]tcpip.Route{{ - Destination: "\x00\x00\x00\x00", - Mask: "\x00\x00\x00\x00", 
- Gateway: "", - NIC: 1, - }}) - - return &testContext{ - t: t, - s: s, - linkEP: linkEP, - id: uint16(time.Now().Unix() % math.MaxUint16), - } -} - -func (c *testContext) cleanup() { - close(c.linkEP.C) -} - -func TestFragmentationBase(t *testing.T) { - c := newTestContext(t) - defer c.cleanup() - - const senderMAC = "\x01\x02\x03\x04\x05\x06" - const senderIPv4 = "\x0a\x00\x00\x02" - - v := make(buffer.View, header.ARPSize) - h := header.ARP(v) - h.SetIPv4OverEthernet() - h.SetOp(header.ARPRequest) // 一个ARP请求 - copy(h.HardwareAddressSender(), senderMAC) // Local MAC - copy(h.ProtocolAddressSender(), senderIPv4) // Local IP - - inject := func(addr tcpip.Address) { - copy(h.ProtocolAddressTarget(), addr) - c.linkEP.Inject(arp.ProtocolNumber, v.ToVectorisedView()) // 往链路层注入一个arp报文 链路层将会自动分发它 - } - - inject(stackAddr1) // target IP 10.0.0.1 - select { - case pkt := <-c.linkEP.C: - if pkt.Proto != arp.ProtocolNumber { - t.Fatalf("stackAddr1: expected ARP response, got network protocol number %v", pkt.Proto) - } - rep := header.ARP(pkt.Header) - if !rep.IsValid() { - t.Fatalf("stackAddr1: invalid ARP response len(pkt.Header)=%d", len(pkt.Header)) - } - if tcpip.Address(rep.ProtocolAddressSender()) != stackAddr1 { - t.Errorf("stackAddr1: expected sender to be set") - } - if got := tcpip.LinkAddress(rep.HardwareAddressSender()); got != stackLinkAddr { - t.Errorf("stackAddr1: expected sender to be stackLinkAddr, got %q", got) - } - case <-time.After(100 * time.Millisecond): - t.Fatalf("Case #1 Time Out\n") - } - - // 一个纯粹的IP报文 Part1 - pLen := ((1500 - header.EthernetMinimumSize - header.IPv4MinimumSize) >> 3) << 3 - v = make(buffer.View, header.IPv4MinimumSize+pLen) - hdr := buffer.NewPrependable(header.IPv4MinimumSize) - ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) - buf := make(buffer.View, pLen) - for i := range buf { - buf[i] = 1 - } - payload := buffer.NewVectorisedView(pLen, buf.ToVectorisedView().Views()) - length := uint16(hdr.UsedLength() + 
payload.Size()) - // ip首部编码 - ip.Encode(&header.IPv4Fields{ - IHL: header.IPv4MinimumSize, - TotalLength: length, - ID: c.id, - Flags: 0x1, - FragmentOffset: 0, - TTL: 255, - Protocol: uint8(0x6), // tcp 伪装报文 - SrcAddr: senderIPv4, - DstAddr: stackAddr1, - }) - //ip.SetFlagsFragmentOffset() - // 计算校验和和设置校验和 - ip.SetChecksum(^ip.CalculateChecksum()) - copy(v, ip) - copy(v[header.IPv4MinimumSize:], payload.First()) - - inject = func(addr tcpip.Address) { - copy(h.ProtocolAddressTarget(), addr) - c.linkEP.Inject(ipv4.ProtocolNumber, v.ToVectorisedView()) // 往链路层注入一个arp报文 链路层将会自动分发它 - } - - inject(stackAddr1) - - // 一个纯粹的IP报文 Part2 - pLen = 256 - v = make(buffer.View, header.IPv4MinimumSize+pLen) - payload = buffer.NewVectorisedView(pLen, buf.ToVectorisedView().Views()) - length = uint16(hdr.UsedLength() + payload.Size()) - // ip首部编码 - ip.Encode(&header.IPv4Fields{ - IHL: header.IPv4MinimumSize, - TotalLength: length, - ID: c.id, - FragmentOffset: 1464, - TTL: 255, - Protocol: uint8(0x6), // tcp 伪装报文 - SrcAddr: senderIPv4, - DstAddr: stackAddr1, - }) - //ip.SetFlagsFragmentOffset() - // 计算校验和和设置校验和 - ip.SetChecksum(^ip.CalculateChecksum()) - copy(v, ip) - copy(v[header.IPv4MinimumSize:], payload.First()) - - inject(stackAddr1) - - msg := <-c.linkEP.C - log.Println(msg.Header) - -} +package fragmentation_test + +import ( + "log" + "math" + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/header" + "netstack/tcpip/link/channel" + "netstack/tcpip/network/arp" + "netstack/tcpip/network/ipv4" + "netstack/tcpip/stack" + "testing" + "time" +) + +const ( + stackLinkAddr = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c") // 0a:0a:0b:0b:0c:0c + stackAddr1 = tcpip.Address("\x0a\x00\x00\x01") // 10.0.0.1 + stackAddr2 = tcpip.Address("\x0a\x00\x00\x02") // 10.0.0.2 + stackAddrBad = tcpip.Address("\x0a\x00\x00\x03") // 10.0.0.3 +) + +type testContext struct { + t *testing.T + linkEP *channel.Endpoint + s *stack.Stack + id uint16 +} + +func newTestContext(t *testing.T) 
*testContext { + s := stack.New([]string{ipv4.ProtocolName, arp.ProtocolName}, nil, stack.Options{}) + + const defaultMTU = 65536 + id, linkEP := channel.New(256, defaultMTU, stackLinkAddr) + if err := s.CreateNIC(1, id); err != nil { + t.Fatalf("CreateNIC failed: %v", err) + } + + if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr1); err != nil { + t.Fatalf("AddAddress for ipv4 failed: %v", err) + } + if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr2); err != nil { + t.Fatalf("AddAddress for ipv4 failed: %v", err) + } + if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil { + t.Fatalf("AddAddress for arp failed: %v", err) + } + + s.SetRouteTable([]tcpip.Route{{ + Destination: "\x00\x00\x00\x00", + Mask: "\x00\x00\x00\x00", + Gateway: "", + NIC: 1, + }}) + + return &testContext{ + t: t, + s: s, + linkEP: linkEP, + id: uint16(time.Now().Unix() % math.MaxUint16), + } +} + +func (c *testContext) cleanup() { + close(c.linkEP.C) +} + +func TestFragmentationBase(t *testing.T) { + c := newTestContext(t) + defer c.cleanup() + + const senderMAC = "\x01\x02\x03\x04\x05\x06" + const senderIPv4 = "\x0a\x00\x00\x02" + + v := make(buffer.View, header.ARPSize) + h := header.ARP(v) + h.SetIPv4OverEthernet() + h.SetOp(header.ARPRequest) // 一个ARP请求 + copy(h.HardwareAddressSender(), senderMAC) // Local MAC + copy(h.ProtocolAddressSender(), senderIPv4) // Local IP + + inject := func(addr tcpip.Address) { + copy(h.ProtocolAddressTarget(), addr) + c.linkEP.Inject(arp.ProtocolNumber, v.ToVectorisedView()) // 往链路层注入一个arp报文 链路层将会自动分发它 + } + + inject(stackAddr1) // target IP 10.0.0.1 + select { + case pkt := <-c.linkEP.C: + if pkt.Proto != arp.ProtocolNumber { + t.Fatalf("stackAddr1: expected ARP response, got network protocol number %v", pkt.Proto) + } + rep := header.ARP(pkt.Header) + if !rep.IsValid() { + t.Fatalf("stackAddr1: invalid ARP response len(pkt.Header)=%d", len(pkt.Header)) + } + if tcpip.Address(rep.ProtocolAddressSender()) != 
stackAddr1 { + t.Errorf("stackAddr1: expected sender to be set") + } + if got := tcpip.LinkAddress(rep.HardwareAddressSender()); got != stackLinkAddr { + t.Errorf("stackAddr1: expected sender to be stackLinkAddr, got %q", got) + } + case <-time.After(100 * time.Millisecond): + t.Fatalf("Case #1 Time Out\n") + } + + // 一个纯粹的IP报文 Part1 + pLen := ((1500 - header.EthernetMinimumSize - header.IPv4MinimumSize) >> 3) << 3 + v = make(buffer.View, header.IPv4MinimumSize+pLen) + hdr := buffer.NewPrependable(header.IPv4MinimumSize) + ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) + buf := make(buffer.View, pLen) + for i := range buf { + buf[i] = 1 + } + payload := buffer.NewVectorisedView(pLen, buf.ToVectorisedView().Views()) + length := uint16(hdr.UsedLength() + payload.Size()) + // ip首部编码 + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TotalLength: length, + ID: c.id, + Flags: 0x1, + FragmentOffset: 0, + TTL: 255, + Protocol: uint8(0x6), // tcp 伪装报文 + SrcAddr: senderIPv4, + DstAddr: stackAddr1, + }) + //ip.SetFlagsFragmentOffset() + // 计算校验和和设置校验和 + ip.SetChecksum(^ip.CalculateChecksum()) + copy(v, ip) + copy(v[header.IPv4MinimumSize:], payload.First()) + + inject = func(addr tcpip.Address) { + copy(h.ProtocolAddressTarget(), addr) + c.linkEP.Inject(ipv4.ProtocolNumber, v.ToVectorisedView()) // 往链路层注入一个arp报文 链路层将会自动分发它 + } + + inject(stackAddr1) + + // 一个纯粹的IP报文 Part2 + pLen = 256 + v = make(buffer.View, header.IPv4MinimumSize+pLen) + payload = buffer.NewVectorisedView(pLen, buf.ToVectorisedView().Views()) + length = uint16(hdr.UsedLength() + payload.Size()) + // ip首部编码 + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TotalLength: length, + ID: c.id, + FragmentOffset: 1464, + TTL: 255, + Protocol: uint8(0x6), // tcp 伪装报文 + SrcAddr: senderIPv4, + DstAddr: stackAddr1, + }) + //ip.SetFlagsFragmentOffset() + // 计算校验和和设置校验和 + ip.SetChecksum(^ip.CalculateChecksum()) + copy(v, ip) + copy(v[header.IPv4MinimumSize:], payload.First()) + + 
inject(stackAddr1) + + msg := <-c.linkEP.C + log.Println(msg.Header) + +} diff --git a/tcpip/network/fragmentation/reassembler.go b/tcpip/network/fragmentation/reassembler.go index 2b66f78..0539267 100644 --- a/tcpip/network/fragmentation/reassembler.go +++ b/tcpip/network/fragmentation/reassembler.go @@ -1,119 +1,119 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fragmentation - -import ( - "container/heap" - "fmt" - "math" - "sync" - "time" - - "netstack/tcpip/buffer" -) - -type hole struct { - first uint16 - last uint16 - deleted bool -} - -// 重组器对象 -type reassembler struct { - reassemblerEntry - id uint32 - size int - mu sync.Mutex - holes []hole // 每个临时ip报文的缓冲区 最大是65535 - deleted int - heap fragHeap // 小根堆用来自动排序 - done bool - creationTime time.Time -} - -func newReassembler(id uint32) *reassembler { - r := &reassembler{ - id: id, - holes: make([]hole, 0, 16), - deleted: 0, - heap: make(fragHeap, 0, 8), - creationTime: time.Now(), - } - r.holes = append(r.holes, hole{ - first: 0, - last: math.MaxUint16, - deleted: false}) - return r -} - -// updateHoles updates the list of holes for an incoming fragment and -// returns true iff the fragment filled at least part of an existing hole. 
-func (r *reassembler) updateHoles(first, last uint16, more bool) bool { - used := false - for i := range r.holes { - if r.holes[i].deleted || first > r.holes[i].last || last < r.holes[i].first { - continue - } - used = true - r.deleted++ - r.holes[i].deleted = true // 当前位置被占用 - if first > r.holes[i].first { - r.holes = append(r.holes, hole{r.holes[i].first, first - 1, false}) - } - if last < r.holes[i].last && more { - r.holes = append(r.holes, hole{last + 1, r.holes[i].last, false}) - } - } - return used -} - -func (r *reassembler) process(first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, int) { - r.mu.Lock() - defer r.mu.Unlock() - consumed := 0 - if r.done { - // A concurrent goroutine might have already reassembled - // the packet and emptied the heap while this goroutine - // was waiting on the mutex. We don't have to do anything in this case. - return buffer.VectorisedView{}, false, consumed - } - if r.updateHoles(first, last, more) { - // We store the incoming packet only if it filled some holes. - heap.Push(&r.heap, fragment{offset: first, vv: vv.Clone(nil)}) - consumed = vv.Size() - r.size += consumed - } - // Check if all the holes have been deleted and we are ready to reassamble. - if r.deleted < len(r.holes) { - return buffer.VectorisedView{}, false, consumed - } - res, err := r.heap.reassemble() - if err != nil { - panic(fmt.Sprintf("reassemble failed with: %v. There is probably a bug in the code handling the holes.", err)) - } - return res, true, consumed -} - -func (r *reassembler) tooOld(timeout time.Duration) bool { - return time.Now().Sub(r.creationTime) > timeout -} - -func (r *reassembler) checkDoneOrMark() bool { - r.mu.Lock() - prev := r.done - r.done = true - r.mu.Unlock() - return prev -} +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fragmentation + +import ( + "container/heap" + "fmt" + "math" + "sync" + "time" + + "netstack/tcpip/buffer" +) + +type hole struct { + first uint16 + last uint16 + deleted bool +} + +// 重组器对象 +type reassembler struct { + reassemblerEntry + id uint32 + size int + mu sync.Mutex + holes []hole // 每个临时ip报文的缓冲区 最大是65535 + deleted int + heap fragHeap // 小根堆用来自动排序 + done bool + creationTime time.Time +} + +func newReassembler(id uint32) *reassembler { + r := &reassembler{ + id: id, + holes: make([]hole, 0, 16), + deleted: 0, + heap: make(fragHeap, 0, 8), + creationTime: time.Now(), + } + r.holes = append(r.holes, hole{ + first: 0, + last: math.MaxUint16, + deleted: false}) + return r +} + +// updateHoles updates the list of holes for an incoming fragment and +// returns true iff the fragment filled at least part of an existing hole. 
+func (r *reassembler) updateHoles(first, last uint16, more bool) bool { + used := false + for i := range r.holes { + if r.holes[i].deleted || first > r.holes[i].last || last < r.holes[i].first { + continue + } + used = true + r.deleted++ + r.holes[i].deleted = true // 当前位置被占用 + if first > r.holes[i].first { + r.holes = append(r.holes, hole{r.holes[i].first, first - 1, false}) + } + if last < r.holes[i].last && more { + r.holes = append(r.holes, hole{last + 1, r.holes[i].last, false}) + } + } + return used +} + +func (r *reassembler) process(first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, int) { + r.mu.Lock() + defer r.mu.Unlock() + consumed := 0 + if r.done { + // A concurrent goroutine might have already reassembled + // the packet and emptied the heap while this goroutine + // was waiting on the mutex. We don't have to do anything in this case. + return buffer.VectorisedView{}, false, consumed + } + if r.updateHoles(first, last, more) { + // We store the incoming packet only if it filled some holes. + heap.Push(&r.heap, fragment{offset: first, vv: vv.Clone(nil)}) + consumed = vv.Size() + r.size += consumed + } + // Check if all the holes have been deleted and we are ready to reassamble. + if r.deleted < len(r.holes) { + return buffer.VectorisedView{}, false, consumed + } + res, err := r.heap.reassemble() + if err != nil { + panic(fmt.Sprintf("reassemble failed with: %v. 
There is probably a bug in the code handling the holes.", err)) + } + return res, true, consumed +} + +func (r *reassembler) tooOld(timeout time.Duration) bool { + return time.Now().Sub(r.creationTime) > timeout +} + +func (r *reassembler) checkDoneOrMark() bool { + r.mu.Lock() + prev := r.done + r.done = true + r.mu.Unlock() + return prev +} diff --git a/tcpip/network/fragmentation/reassembler_list.go b/tcpip/network/fragmentation/reassembler_list.go index 3189cae..a659ed1 100644 --- a/tcpip/network/fragmentation/reassembler_list.go +++ b/tcpip/network/fragmentation/reassembler_list.go @@ -1,173 +1,173 @@ -package fragmentation - -// ElementMapper provides an identity mapping by default. -// -// This can be replaced to provide a struct that maps elements to linker -// objects, if they are not the same. An ElementMapper is not typically -// required if: Linker is left as is, Element is left as is, or Linker and -// Element are the same type. -type reassemblerElementMapper struct{} - -// linkerFor maps an Element to a Linker. -// -// This default implementation should be inlined. -// -//go:nosplit -func (reassemblerElementMapper) linkerFor(elem *reassembler) *reassembler { return elem } - -// List is an intrusive list. Entries can be added to or removed from the list -// in O(1) time and with no additional memory allocations. -// -// The zero value for List is an empty list ready to use. -// -// To iterate over a list (where l is a List): -// for e := l.Front(); e != nil; e = e.Next() { -// // do something with e. -// } -// -// +stateify savable -type reassemblerList struct { - head *reassembler - tail *reassembler -} - -// Reset resets list l to the empty state. -func (l *reassemblerList) Reset() { - l.head = nil - l.tail = nil -} - -// Empty returns true iff the list is empty. -func (l *reassemblerList) Empty() bool { - return l.head == nil -} - -// Front returns the first element of list l or nil. 
-func (l *reassemblerList) Front() *reassembler { - return l.head -} - -// Back returns the last element of list l or nil. -func (l *reassemblerList) Back() *reassembler { - return l.tail -} - -// PushFront inserts the element e at the front of list l. -func (l *reassemblerList) PushFront(e *reassembler) { - reassemblerElementMapper{}.linkerFor(e).SetNext(l.head) - reassemblerElementMapper{}.linkerFor(e).SetPrev(nil) - - if l.head != nil { - reassemblerElementMapper{}.linkerFor(l.head).SetPrev(e) - } else { - l.tail = e - } - - l.head = e -} - -// PushBack inserts the element e at the back of list l. -func (l *reassemblerList) PushBack(e *reassembler) { - reassemblerElementMapper{}.linkerFor(e).SetNext(nil) - reassemblerElementMapper{}.linkerFor(e).SetPrev(l.tail) - - if l.tail != nil { - reassemblerElementMapper{}.linkerFor(l.tail).SetNext(e) - } else { - l.head = e - } - - l.tail = e -} - -// PushBackList inserts list m at the end of list l, emptying m. -func (l *reassemblerList) PushBackList(m *reassemblerList) { - if l.head == nil { - l.head = m.head - l.tail = m.tail - } else if m.head != nil { - reassemblerElementMapper{}.linkerFor(l.tail).SetNext(m.head) - reassemblerElementMapper{}.linkerFor(m.head).SetPrev(l.tail) - - l.tail = m.tail - } - - m.head = nil - m.tail = nil -} - -// InsertAfter inserts e after b. -func (l *reassemblerList) InsertAfter(b, e *reassembler) { - a := reassemblerElementMapper{}.linkerFor(b).Next() - reassemblerElementMapper{}.linkerFor(e).SetNext(a) - reassemblerElementMapper{}.linkerFor(e).SetPrev(b) - reassemblerElementMapper{}.linkerFor(b).SetNext(e) - - if a != nil { - reassemblerElementMapper{}.linkerFor(a).SetPrev(e) - } else { - l.tail = e - } -} - -// InsertBefore inserts e before a. 
-func (l *reassemblerList) InsertBefore(a, e *reassembler) { - b := reassemblerElementMapper{}.linkerFor(a).Prev() - reassemblerElementMapper{}.linkerFor(e).SetNext(a) - reassemblerElementMapper{}.linkerFor(e).SetPrev(b) - reassemblerElementMapper{}.linkerFor(a).SetPrev(e) - - if b != nil { - reassemblerElementMapper{}.linkerFor(b).SetNext(e) - } else { - l.head = e - } -} - -// Remove removes e from l. -func (l *reassemblerList) Remove(e *reassembler) { - prev := reassemblerElementMapper{}.linkerFor(e).Prev() - next := reassemblerElementMapper{}.linkerFor(e).Next() - - if prev != nil { - reassemblerElementMapper{}.linkerFor(prev).SetNext(next) - } else { - l.head = next - } - - if next != nil { - reassemblerElementMapper{}.linkerFor(next).SetPrev(prev) - } else { - l.tail = prev - } -} - -// Entry is a default implementation of Linker. Users can add anonymous fields -// of this type to their structs to make them automatically implement the -// methods needed by List. -// -// +stateify savable -type reassemblerEntry struct { - next *reassembler - prev *reassembler -} - -// Next returns the entry that follows e in the list. -func (e *reassemblerEntry) Next() *reassembler { - return e.next -} - -// Prev returns the entry that precedes e in the list. -func (e *reassemblerEntry) Prev() *reassembler { - return e.prev -} - -// SetNext assigns 'entry' as the entry that follows e in the list. -func (e *reassemblerEntry) SetNext(elem *reassembler) { - e.next = elem -} - -// SetPrev assigns 'entry' as the entry that precedes e in the list. -func (e *reassemblerEntry) SetPrev(elem *reassembler) { - e.prev = elem -} +package fragmentation + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. 
+type reassemblerElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (reassemblerElementMapper) linkerFor(elem *reassembler) *reassembler { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type reassemblerList struct { + head *reassembler + tail *reassembler +} + +// Reset resets list l to the empty state. +func (l *reassemblerList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *reassemblerList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *reassemblerList) Front() *reassembler { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *reassemblerList) Back() *reassembler { + return l.tail +} + +// PushFront inserts the element e at the front of list l. +func (l *reassemblerList) PushFront(e *reassembler) { + reassemblerElementMapper{}.linkerFor(e).SetNext(l.head) + reassemblerElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + reassemblerElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *reassemblerList) PushBack(e *reassembler) { + reassemblerElementMapper{}.linkerFor(e).SetNext(nil) + reassemblerElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + reassemblerElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. 
+func (l *reassemblerList) PushBackList(m *reassemblerList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + reassemblerElementMapper{}.linkerFor(l.tail).SetNext(m.head) + reassemblerElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *reassemblerList) InsertAfter(b, e *reassembler) { + a := reassemblerElementMapper{}.linkerFor(b).Next() + reassemblerElementMapper{}.linkerFor(e).SetNext(a) + reassemblerElementMapper{}.linkerFor(e).SetPrev(b) + reassemblerElementMapper{}.linkerFor(b).SetNext(e) + + if a != nil { + reassemblerElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *reassemblerList) InsertBefore(a, e *reassembler) { + b := reassemblerElementMapper{}.linkerFor(a).Prev() + reassemblerElementMapper{}.linkerFor(e).SetNext(a) + reassemblerElementMapper{}.linkerFor(e).SetPrev(b) + reassemblerElementMapper{}.linkerFor(a).SetPrev(e) + + if b != nil { + reassemblerElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *reassemblerList) Remove(e *reassembler) { + prev := reassemblerElementMapper{}.linkerFor(e).Prev() + next := reassemblerElementMapper{}.linkerFor(e).Next() + + if prev != nil { + reassemblerElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + reassemblerElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type reassemblerEntry struct { + next *reassembler + prev *reassembler +} + +// Next returns the entry that follows e in the list. 
+func (e *reassemblerEntry) Next() *reassembler { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *reassemblerEntry) Prev() *reassembler { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *reassemblerEntry) SetNext(elem *reassembler) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *reassemblerEntry) SetPrev(elem *reassembler) { + e.prev = elem +} diff --git a/tcpip/network/hash/hash.go b/tcpip/network/hash/hash.go index 21f3838..8919b3e 100644 --- a/tcpip/network/hash/hash.go +++ b/tcpip/network/hash/hash.go @@ -1,70 +1,70 @@ -package hash - -import ( - "crypto/rand" - "encoding/binary" - "netstack/tcpip/header" -) - -var hashIV = RandN32(1)[0] - -// RandN32 生成 n 个加密随机 32 位数字的切片 -func RandN32(n int) []uint32 { - b := make([]byte, 4*n) - if _, err := rand.Read(b); err != nil { - panic("unable to get random numbers: " + err.Error()) - } - r := make([]uint32, n) - for i := range r { - r[i] = binary.LittleEndian.Uint32(b[4*i : (4*i + 4)]) - } - return r -} - -func Hash3Words(a, b, c, initval uint32) uint32 { - const iv = 0xdeadbeef + (3 << 2) - initval += iv - - a += initval - b += initval - c += initval - - c ^= b - c -= rol32(b, 14) - a ^= c - a -= rol32(c, 11) - b ^= a - b -= rol32(a, 25) - c ^= b - c -= rol32(b, 16) - a ^= c - a -= rol32(c, 4) - b ^= a - b -= rol32(a, 14) - c ^= b - c -= rol32(b, 24) - - return c -} - -// 根据id,源ip,目的ip和协议类型得到hash值 -func IPv4FragmentHash(h header.IPv4) uint32 { - x := uint32(h.ID())<<16 | uint32(h.Protocol()) - t := h.SourceAddress() - y := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 - t = h.DestinationAddress() - z := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 - return Hash3Words(x, y, z, hashIV) -} - -func IPv6FragmentHash(h header.IPv6, f header.IPv6Fragment) uint32 { - t := h.SourceAddress() - y := uint32(t[0]) | uint32(t[1])<<8 | 
uint32(t[2])<<16 | uint32(t[3])<<24 - t = h.DestinationAddress() - z := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 - return Hash3Words(f.ID(), y, z, hashIV) -} - -func rol32(v, shift uint32) uint32 { - return (v << shift) | (v >> ((-shift) & 31)) -} +package hash + +import ( + "crypto/rand" + "encoding/binary" + "netstack/tcpip/header" +) + +var hashIV = RandN32(1)[0] + +// RandN32 生成 n 个加密随机 32 位数字的切片 +func RandN32(n int) []uint32 { + b := make([]byte, 4*n) + if _, err := rand.Read(b); err != nil { + panic("unable to get random numbers: " + err.Error()) + } + r := make([]uint32, n) + for i := range r { + r[i] = binary.LittleEndian.Uint32(b[4*i : (4*i + 4)]) + } + return r +} + +func Hash3Words(a, b, c, initval uint32) uint32 { + const iv = 0xdeadbeef + (3 << 2) + initval += iv + + a += initval + b += initval + c += initval + + c ^= b + c -= rol32(b, 14) + a ^= c + a -= rol32(c, 11) + b ^= a + b -= rol32(a, 25) + c ^= b + c -= rol32(b, 16) + a ^= c + a -= rol32(c, 4) + b ^= a + b -= rol32(a, 14) + c ^= b + c -= rol32(b, 24) + + return c +} + +// 根据id,源ip,目的ip和协议类型得到hash值 +func IPv4FragmentHash(h header.IPv4) uint32 { + x := uint32(h.ID())<<16 | uint32(h.Protocol()) + t := h.SourceAddress() + y := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 + t = h.DestinationAddress() + z := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 + return Hash3Words(x, y, z, hashIV) +} + +func IPv6FragmentHash(h header.IPv6, f header.IPv6Fragment) uint32 { + t := h.SourceAddress() + y := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 + t = h.DestinationAddress() + z := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 + return Hash3Words(f.ID(), y, z, hashIV) +} + +func rol32(v, shift uint32) uint32 { + return (v << shift) | (v >> ((-shift) & 31)) +} diff --git a/tcpip/network/ipv4/icmp.go b/tcpip/network/ipv4/icmp.go index 638b528..5407bef 100644 --- 
a/tcpip/network/ipv4/icmp.go +++ b/tcpip/network/ipv4/icmp.go @@ -1,128 +1,128 @@ -package ipv4 - -import ( - "encoding/binary" - "log" - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/header" - "netstack/tcpip/stack" -) - -/* - ICMP 的全称是 Internet Control Message Protocol 。与 IP 协议一样同属 TCP/IP 模型中的网络层,并且 ICMP 数据包是包裹在 IP 数据包中的 - - 0 1 2 3 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| Type | Code | Checksum | -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| | -| 不同的Type和Code有不同的内容 | -| | -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -*/ - -type echoRequest struct { - r stack.Route - v buffer.View -} - -// handleControl处理ICMP数据包包含导致ICMP发送的原始数据包的标头的情况。 -// 此信息用于确定必须通知哪个传输端点有关ICMP数据包。 -func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.VectorisedView) { - h := header.IPv4(vv.First()) - - // We don't use IsValid() here because ICMP only requires that the IP - // header plus 8 bytes of the transport header be included. So it's - // likely that it is truncated, which would cause IsValid to return - // false. - // - // Drop packet if it doesn't have the basic IPv4 header or if the - // original source address doesn't match the endpoint's address. - if len(h) < header.IPv4MinimumSize || h.SourceAddress() != e.id.LocalAddress { - return - } - - hlen := int(h.HeaderLength()) - if vv.Size() < hlen || h.FragmentOffset() != 0 { - // We won't be able to handle this if it doesn't contain the - // full IPv4 header, or if it's a fragment not at offset 0 - // (because it won't have the transport header). - return - } - - // Skip the ip header, then deliver control message. 
- vv.TrimFront(hlen) - p := h.TransportProtocol() - e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, vv) -} - -// 处理ICMP报文 -func (e *endpoint) handleICMP(r *stack.Route, vv buffer.VectorisedView) { - v := vv.First() - if len(v) < header.ICMPv4MinimumSize { - return - } - h := header.ICMPv4(v) - - // 更具icmp的类型来进行相应的处理 - switch h.Type() { - case header.ICMPv4Echo: // icmp echo请求 - if len(v) < header.ICMPv4EchoMinimumSize { - return - } - log.Printf("ICMP echo") - vv.TrimFront(header.ICMPv4MinimumSize) // 去掉头部 - req := echoRequest{r: r.Clone(), v: vv.ToView()} - select { - case e.echoRequests <- req: // 发送给echoReplier处理 在那里会重新组一个头部 - default: - req.r.Release() - } - - case header.ICMPv4EchoReply: // icmp echo响应 - if len(v) < header.ICMPv4EchoMinimumSize { - return - } - e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, vv) - - case header.ICMPv4DstUnreachable: // 目标不可达 - if len(v) < header.ICMPv4DstUnreachableMinimumSize { - return - } - vv.TrimFront(header.ICMPv4DstUnreachableMinimumSize) - switch h.Code() { - case header.ICMPv4PortUnreachable: // 端口不可达 - e.handleControl(stack.ControlPortUnreachable, 0, vv) - - case header.ICMPv4FragmentationNeeded: // 需要进行分片但设置不分片标志 - mtu := uint32(binary.BigEndian.Uint16(v[header.ICMPv4DstUnreachableMinimumSize-2:])) - e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), vv) - } - } -} - -// 处理icmp echo请求的goroutine -func (e *endpoint) echoReplier() { - for req := range e.echoRequests { - sendPing4(&req.r, 0, req.v) - req.r.Release() - } -} - -// 根据icmp echo请求,封装icmp echo响应报文,并传给ip层处理 -func sendPing4(r *stack.Route, code byte, data buffer.View) *tcpip.Error { - hdr := buffer.NewPrependable(header.ICMPv4EchoMinimumSize + int(r.MaxHeaderLength())) - - icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4EchoMinimumSize)) - icmpv4.SetType(header.ICMPv4EchoReply) - icmpv4.SetCode(code) - copy(icmpv4[header.ICMPv4MinimumSize:], data) - data = 
data[header.ICMPv4EchoMinimumSize-header.ICMPv4MinimumSize:] - icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(data, 0))) - - log.Printf("ICMP 回应报文组完 再次包装到IP报文") - // 传给ip层处理 - return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL()) -} +package ipv4 + +import ( + "encoding/binary" + "log" + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/header" + "netstack/tcpip/stack" +) + +/* + ICMP 的全称是 Internet Control Message Protocol 。与 IP 协议一样同属 TCP/IP 模型中的网络层,并且 ICMP 数据包是包裹在 IP 数据包中的 + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Type | Code | Checksum | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| | +| 不同的Type和Code有不同的内容 | +| | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +type echoRequest struct { + r stack.Route + v buffer.View +} + +// handleControl处理ICMP数据包包含导致ICMP发送的原始数据包的标头的情况。 +// 此信息用于确定必须通知哪个传输端点有关ICMP数据包。 +func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.VectorisedView) { + h := header.IPv4(vv.First()) + + // We don't use IsValid() here because ICMP only requires that the IP + // header plus 8 bytes of the transport header be included. So it's + // likely that it is truncated, which would cause IsValid to return + // false. + // + // Drop packet if it doesn't have the basic IPv4 header or if the + // original source address doesn't match the endpoint's address. + if len(h) < header.IPv4MinimumSize || h.SourceAddress() != e.id.LocalAddress { + return + } + + hlen := int(h.HeaderLength()) + if vv.Size() < hlen || h.FragmentOffset() != 0 { + // We won't be able to handle this if it doesn't contain the + // full IPv4 header, or if it's a fragment not at offset 0 + // (because it won't have the transport header). + return + } + + // Skip the ip header, then deliver control message. 
+ vv.TrimFront(hlen) + p := h.TransportProtocol() + e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, vv) +} + +// 处理ICMP报文 +func (e *endpoint) handleICMP(r *stack.Route, vv buffer.VectorisedView) { + v := vv.First() + if len(v) < header.ICMPv4MinimumSize { + return + } + h := header.ICMPv4(v) + + // 更具icmp的类型来进行相应的处理 + switch h.Type() { + case header.ICMPv4Echo: // icmp echo请求 + if len(v) < header.ICMPv4EchoMinimumSize { + return + } + log.Printf("ICMP echo") + vv.TrimFront(header.ICMPv4MinimumSize) // 去掉头部 + req := echoRequest{r: r.Clone(), v: vv.ToView()} + select { + case e.echoRequests <- req: // 发送给echoReplier处理 在那里会重新组一个头部 + default: + req.r.Release() + } + + case header.ICMPv4EchoReply: // icmp echo响应 + if len(v) < header.ICMPv4EchoMinimumSize { + return + } + e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, vv) + + case header.ICMPv4DstUnreachable: // 目标不可达 + if len(v) < header.ICMPv4DstUnreachableMinimumSize { + return + } + vv.TrimFront(header.ICMPv4DstUnreachableMinimumSize) + switch h.Code() { + case header.ICMPv4PortUnreachable: // 端口不可达 + e.handleControl(stack.ControlPortUnreachable, 0, vv) + + case header.ICMPv4FragmentationNeeded: // 需要进行分片但设置不分片标志 + mtu := uint32(binary.BigEndian.Uint16(v[header.ICMPv4DstUnreachableMinimumSize-2:])) + e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), vv) + } + } +} + +// 处理icmp echo请求的goroutine +func (e *endpoint) echoReplier() { + for req := range e.echoRequests { + sendPing4(&req.r, 0, req.v) + req.r.Release() + } +} + +// 根据icmp echo请求,封装icmp echo响应报文,并传给ip层处理 +func sendPing4(r *stack.Route, code byte, data buffer.View) *tcpip.Error { + hdr := buffer.NewPrependable(header.ICMPv4EchoMinimumSize + int(r.MaxHeaderLength())) + + icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4EchoMinimumSize)) + icmpv4.SetType(header.ICMPv4EchoReply) + icmpv4.SetCode(code) + copy(icmpv4[header.ICMPv4MinimumSize:], data) + data = 
data[header.ICMPv4EchoMinimumSize-header.ICMPv4MinimumSize:] + icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(data, 0))) + + log.Printf("ICMP 回应报文组完 再次包装到IP报文") + // 传给ip层处理 + return r.WritePacket(hdr, data.ToVectorisedView(), header.ICMPv4ProtocolNumber, r.DefaultTTL()) +} diff --git a/tcpip/network/ipv4/ipv4.go b/tcpip/network/ipv4/ipv4.go index 9ab9df6..13759d1 100644 --- a/tcpip/network/ipv4/ipv4.go +++ b/tcpip/network/ipv4/ipv4.go @@ -1,259 +1,259 @@ -package ipv4 - -import ( - "log" - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/header" - "netstack/tcpip/network/fragmentation" - "netstack/tcpip/network/hash" - "netstack/tcpip/stack" - "sync/atomic" -) - -const ( - // ProtocolName is the string representation of the ipv4 protocol name. - ProtocolName = "ipv4" - - // ProtocolNumber is the ipv4 protocol number. - ProtocolNumber = header.IPv4ProtocolNumber - - // maxTotalSize is maximum size that can be encoded in the 16-bit - // TotalLength field of the ipv4 header. - maxTotalSize = 0xffff - - // buckets is the number of identifier buckets. - buckets = 2048 -) - -// IPv4 实现 -type endpoint struct { - // 网卡id - nicid tcpip.NICID - // 表示该endpoint的id,也是ip地址 - id stack.NetworkEndpointID - // 链路端的表示 - linkEP stack.LinkEndpoint - // 报文分发器 - dispatcher stack.TransportDispatcher - // ping请求报文接收队列 - echoRequests chan echoRequest - // ip报文分片处理器 - fragmentation *fragmentation.Fragmentation -} - -// DefaultTTL is the default time-to-live value for this endpoint. -// 默认的TTL值,TTL每经过路由转发一次就会减1 -func (e *endpoint) DefaultTTL() uint8 { - return 255 -} - -// MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus -// the network layer max header length. -// 获取去除ipv4头部后的最大报文长度 -func (e *endpoint) MTU() uint32 { - return calculateMTU(e.linkEP.MTU()) -} - -// Capabilities implements stack.NetworkEndpoint.Capabilities. 
-func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { - return e.linkEP.Capabilities() -} - -// NICID returns the ID of the NIC this endpoint belongs to. -func (e *endpoint) NICID() tcpip.NICID { - return e.nicid -} - -// ID returns the ipv4 endpoint ID. -// 获取该网络层端的id,也就是ip地址 -func (e *endpoint) ID() *stack.NetworkEndpointID { - return &e.id -} - -// MaxHeaderLength returns the maximum length needed by ipv4 headers (and -// underlying protocols). -// 链路层和网络层的头部长度 -func (e *endpoint) MaxHeaderLength() uint16 { - return e.linkEP.MaxHeaderLength() + header.IPv4MinimumSize -} - -// WritePacket writes a packet to the given destination address and protocol. -// 将传输层的数据封装加上IP头,并调用网卡的写入接口,写入IP报文 -func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, - protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { - // 预留ip报文的空间 - ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) - length := uint16(hdr.UsedLength() + payload.Size()) - id := uint32(0) - // 如果报文长度大于68 - if length > header.IPv4MaximumHeaderSize+8 { - // Packets of 68 bytes or less are required by RFC 791 to not be - // fragmented, so we only assign ids to larger packets. 
- id = atomic.AddUint32(&ids[hashRoute(r, protocol)%buckets], 1) - } - // ip首部编码 - ip.Encode(&header.IPv4Fields{ - IHL: header.IPv4MinimumSize, - TotalLength: length, - ID: uint16(id), - TTL: ttl, - Protocol: uint8(protocol), - SrcAddr: r.LocalAddress, - DstAddr: r.RemoteAddress, - }) - // 计算校验和和设置校验和 - ip.SetChecksum(^ip.CalculateChecksum()) - r.Stats().IP.PacketsSent.Increment() - - // 写入网卡接口 - if protocol == header.ICMPv4ProtocolNumber { - log.Printf("IP 写回ICMP报文 长度: %d\n", hdr.UsedLength()+payload.Size()) - } else { - //log.Printf("send ipv4 packet %d bytes, proto: 0x%x", hdr.UsedLength()+payload.Size(), protocol) - log.Println(header.IPv4(append(ip, payload.ToView()...))) - } - return e.linkEP.WritePacket(r, hdr, payload, ProtocolNumber) -} - -// HandlePacket is called by the link layer when new ipv4 packets arrive for -// this endpoint. -// 收到ip包的处理 -func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { - // 得到ip报文 - h := header.IPv4(vv.First()) - // 检查报文是否有效 - if !h.IsValid(vv.Size()) { - return - } - log.Println(h) - - hlen := int(h.HeaderLength()) - tlen := int(h.TotalLength()) - vv.TrimFront(hlen) - vv.CapLength(tlen - hlen) - - // 报文重组 - more := (h.Flags() & header.IPv4FlagMoreFragments) != 0 - // 是否需要ip重组 - if more || h.FragmentOffset() != 0 { - // The packet is a fragment, let's try to reassemble it. - last := h.FragmentOffset() + uint16(vv.Size()) - 1 - var ready bool - // ip分片重组 - vv, ready = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, vv) - if !ready { - return - } - } - - // 得到传输层的协议 - p := h.TransportProtocol() - // 如果时ICMP协议,则进入ICMP处理函数 - if p == header.ICMPv4ProtocolNumber { - e.handleICMP(r, vv) - return - } - r.Stats().IP.PacketsDelivered.Increment() - // 根据协议分发到不同处理函数,比如协议时TCP,会进入tcp.HandlePacket - log.Printf("recv ipv4 packet %d bytes, proto: 0x%x", tlen, p) - e.dispatcher.DeliverTransportPacket(r, p, vv) -} - -// Close cleans up resources associated with the endpoint. 
-func (e *endpoint) Close() { - close(e.echoRequests) -} - -// 实现NetworkProtocol接口 -type protocol struct{} - -// NewEndpoint creates a new ipv4 endpoint. -// 根据参数,新建一个ipv4端 -func (p *protocol) NewEndpoint(nicid tcpip.NICID, addr tcpip.Address, linkAddrCache stack.LinkAddressCache, - dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) { - e := &endpoint{ - nicid: nicid, - id: stack.NetworkEndpointID{LocalAddress: addr}, - linkEP: linkEP, - dispatcher: dispatcher, - echoRequests: make(chan echoRequest, 10), - fragmentation: fragmentation.NewFragmentation(fragmentation.HighFragThreshold, - fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout), - } - - go e.echoReplier() - - return e, nil -} - -// NewProtocol creates a new protocol ipv4 protocol descriptor. This is exported -// only for tests that short-circuit the stack. Regular use of the protocol is -// done via the stack, which gets a protocol descriptor from the init() function -// below. -func NewProtocol() stack.NetworkProtocol { - return &protocol{} -} - -// Number returns the ipv4 protocol number. -func (p *protocol) Number() tcpip.NetworkProtocolNumber { - return ProtocolNumber -} - -// MinimumPacketSize returns the minimum valid ipv4 packet size. -func (p *protocol) MinimumPacketSize() int { - return header.IPv4MinimumSize -} - -// ParseAddresses implements NetworkProtocol.ParseAddresses. -func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { - h := header.IPv4(v) - return h.SourceAddress(), h.DestinationAddress() -} - -// SetOption implements NetworkProtocol.SetOption. -func (p *protocol) SetOption(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -// Option implements NetworkProtocol.Option. 
-func (p *protocol) Option(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -// calculateMTU calculates the network-layer payload MTU based on the link-layer -// payload mtu. -func calculateMTU(mtu uint32) uint32 { - if mtu > maxTotalSize { - mtu = maxTotalSize - } - return mtu - header.IPv4MinimumSize -} - -// 用 源地址 目标地址 和 传输层协议号 进行一个哈希 -func hashRoute(r *stack.Route, protocol tcpip.TransportProtocolNumber) uint32 { - t := r.LocalAddress - a := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 - t = r.RemoteAddress - b := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 - return hash.Hash3Words(a, b, uint32(protocol), hashIV) -} - -var ( - ids []uint32 - hashIV uint32 -) - -func init() { - ids = make([]uint32, buckets) - - r := hash.RandN32(1 + buckets) - for i := range ids { - ids[i] = r[i] // 初始化ids - } - hashIV = r[buckets] - - stack.RegisterNetworkProtocolFactory(ProtocolName, func() stack.NetworkProtocol { - return &protocol{} - }) -} +package ipv4 + +import ( + "log" + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/header" + "netstack/tcpip/network/fragmentation" + "netstack/tcpip/network/hash" + "netstack/tcpip/stack" + "sync/atomic" +) + +const ( + // ProtocolName is the string representation of the ipv4 protocol name. + ProtocolName = "ipv4" + + // ProtocolNumber is the ipv4 protocol number. + ProtocolNumber = header.IPv4ProtocolNumber + + // maxTotalSize is maximum size that can be encoded in the 16-bit + // TotalLength field of the ipv4 header. + maxTotalSize = 0xffff + + // buckets is the number of identifier buckets. 
+ buckets = 2048 +) + +// IPv4 实现 +type endpoint struct { + // 网卡id + nicid tcpip.NICID + // 表示该endpoint的id,也是ip地址 + id stack.NetworkEndpointID + // 链路端的表示 + linkEP stack.LinkEndpoint + // 报文分发器 + dispatcher stack.TransportDispatcher + // ping请求报文接收队列 + echoRequests chan echoRequest + // ip报文分片处理器 + fragmentation *fragmentation.Fragmentation +} + +// DefaultTTL is the default time-to-live value for this endpoint. +// 默认的TTL值,TTL每经过路由转发一次就会减1 +func (e *endpoint) DefaultTTL() uint8 { + return 255 +} + +// MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus +// the network layer max header length. +// 获取去除ipv4头部后的最大报文长度 +func (e *endpoint) MTU() uint32 { + return calculateMTU(e.linkEP.MTU()) +} + +// Capabilities implements stack.NetworkEndpoint.Capabilities. +func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { + return e.linkEP.Capabilities() +} + +// NICID returns the ID of the NIC this endpoint belongs to. +func (e *endpoint) NICID() tcpip.NICID { + return e.nicid +} + +// ID returns the ipv4 endpoint ID. +// 获取该网络层端的id,也就是ip地址 +func (e *endpoint) ID() *stack.NetworkEndpointID { + return &e.id +} + +// MaxHeaderLength returns the maximum length needed by ipv4 headers (and +// underlying protocols). +// 链路层和网络层的头部长度 +func (e *endpoint) MaxHeaderLength() uint16 { + return e.linkEP.MaxHeaderLength() + header.IPv4MinimumSize +} + +// WritePacket writes a packet to the given destination address and protocol. 
+// 将传输层的数据封装加上IP头,并调用网卡的写入接口,写入IP报文 +func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, + protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { + // 预留ip报文的空间 + ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) + length := uint16(hdr.UsedLength() + payload.Size()) + id := uint32(0) + // 如果报文长度大于68 + if length > header.IPv4MaximumHeaderSize+8 { + // Packets of 68 bytes or less are required by RFC 791 to not be + // fragmented, so we only assign ids to larger packets. + id = atomic.AddUint32(&ids[hashRoute(r, protocol)%buckets], 1) + } + // ip首部编码 + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TotalLength: length, + ID: uint16(id), + TTL: ttl, + Protocol: uint8(protocol), + SrcAddr: r.LocalAddress, + DstAddr: r.RemoteAddress, + }) + // 计算校验和和设置校验和 + ip.SetChecksum(^ip.CalculateChecksum()) + r.Stats().IP.PacketsSent.Increment() + + // 写入网卡接口 + if protocol == header.ICMPv4ProtocolNumber { + log.Printf("IP 写回ICMP报文 长度: %d\n", hdr.UsedLength()+payload.Size()) + } else { + //log.Printf("send ipv4 packet %d bytes, proto: 0x%x", hdr.UsedLength()+payload.Size(), protocol) + log.Println(header.IPv4(append(ip, payload.ToView()...))) + } + return e.linkEP.WritePacket(r, hdr, payload, ProtocolNumber) +} + +// HandlePacket is called by the link layer when new ipv4 packets arrive for +// this endpoint. +// 收到ip包的处理 +func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { + // 得到ip报文 + h := header.IPv4(vv.First()) + // 检查报文是否有效 + if !h.IsValid(vv.Size()) { + return + } + log.Println(h) + + hlen := int(h.HeaderLength()) + tlen := int(h.TotalLength()) + vv.TrimFront(hlen) + vv.CapLength(tlen - hlen) + + // 报文重组 + more := (h.Flags() & header.IPv4FlagMoreFragments) != 0 + // 是否需要ip重组 + if more || h.FragmentOffset() != 0 { + // The packet is a fragment, let's try to reassemble it. 
+ last := h.FragmentOffset() + uint16(vv.Size()) - 1 + var ready bool + // ip分片重组 + vv, ready = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, vv) + if !ready { + return + } + } + + // 得到传输层的协议 + p := h.TransportProtocol() + // 如果时ICMP协议,则进入ICMP处理函数 + if p == header.ICMPv4ProtocolNumber { + e.handleICMP(r, vv) + return + } + r.Stats().IP.PacketsDelivered.Increment() + // 根据协议分发到不同处理函数,比如协议时TCP,会进入tcp.HandlePacket + log.Printf("recv ipv4 packet %d bytes, proto: 0x%x", tlen, p) + e.dispatcher.DeliverTransportPacket(r, p, vv) +} + +// Close cleans up resources associated with the endpoint. +func (e *endpoint) Close() { + close(e.echoRequests) +} + +// 实现NetworkProtocol接口 +type protocol struct{} + +// NewEndpoint creates a new ipv4 endpoint. +// 根据参数,新建一个ipv4端 +func (p *protocol) NewEndpoint(nicid tcpip.NICID, addr tcpip.Address, linkAddrCache stack.LinkAddressCache, + dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) { + e := &endpoint{ + nicid: nicid, + id: stack.NetworkEndpointID{LocalAddress: addr}, + linkEP: linkEP, + dispatcher: dispatcher, + echoRequests: make(chan echoRequest, 10), + fragmentation: fragmentation.NewFragmentation(fragmentation.HighFragThreshold, + fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout), + } + + go e.echoReplier() + + return e, nil +} + +// NewProtocol creates a new protocol ipv4 protocol descriptor. This is exported +// only for tests that short-circuit the stack. Regular use of the protocol is +// done via the stack, which gets a protocol descriptor from the init() function +// below. +func NewProtocol() stack.NetworkProtocol { + return &protocol{} +} + +// Number returns the ipv4 protocol number. +func (p *protocol) Number() tcpip.NetworkProtocolNumber { + return ProtocolNumber +} + +// MinimumPacketSize returns the minimum valid ipv4 packet size. 
+func (p *protocol) MinimumPacketSize() int { + return header.IPv4MinimumSize +} + +// ParseAddresses implements NetworkProtocol.ParseAddresses. +func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { + h := header.IPv4(v) + return h.SourceAddress(), h.DestinationAddress() +} + +// SetOption implements NetworkProtocol.SetOption. +func (p *protocol) SetOption(option interface{}) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +// Option implements NetworkProtocol.Option. +func (p *protocol) Option(option interface{}) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +// calculateMTU calculates the network-layer payload MTU based on the link-layer +// payload mtu. +func calculateMTU(mtu uint32) uint32 { + if mtu > maxTotalSize { + mtu = maxTotalSize + } + return mtu - header.IPv4MinimumSize +} + +// 用 源地址 目标地址 和 传输层协议号 进行一个哈希 +func hashRoute(r *stack.Route, protocol tcpip.TransportProtocolNumber) uint32 { + t := r.LocalAddress + a := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 + t = r.RemoteAddress + b := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24 + return hash.Hash3Words(a, b, uint32(protocol), hashIV) +} + +var ( + ids []uint32 + hashIV uint32 +) + +func init() { + ids = make([]uint32, buckets) + + r := hash.RandN32(1 + buckets) + for i := range ids { + ids[i] = r[i] // 初始化ids + } + hashIV = r[buckets] + + stack.RegisterNetworkProtocolFactory(ProtocolName, func() stack.NetworkProtocol { + return &protocol{} + }) +} diff --git a/tcpip/network/ipv4/ipv4_test.go b/tcpip/network/ipv4/ipv4_test.go index d4c85ff..2df0658 100644 --- a/tcpip/network/ipv4/ipv4_test.go +++ b/tcpip/network/ipv4/ipv4_test.go @@ -1,7 +1,7 @@ -package ipv4_test - -import "testing" - -func TestIPv4Base(t *testing.T) { - -} +package ipv4_test + +import "testing" + +func TestIPv4Base(t *testing.T) { + +} diff --git a/tcpip/network/ipv6/icmp.go b/tcpip/network/ipv6/icmp.go index 74029ec..a39e127 100644 --- 
a/tcpip/network/ipv6/icmp.go +++ b/tcpip/network/ipv6/icmp.go @@ -1,231 +1,231 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ipv6 - -import ( - "encoding/binary" - - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/header" - "netstack/tcpip/stack" -) - -// handleControl handles the case when an ICMP packet contains the headers of -// the original packet that caused the ICMP one to be sent. This information is -// used to find out which transport endpoint must be notified about the ICMP -// packet. -func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.VectorisedView) { - h := header.IPv6(vv.First()) - - // We don't use IsValid() here because ICMP only requires that up to - // 1280 bytes of the original packet be included. So it's likely that it - // is truncated, which would cause IsValid to return false. - // - // Drop packet if it doesn't have the basic IPv6 header or if the - // original source address doesn't match the endpoint's address. - if len(h) < header.IPv6MinimumSize || h.SourceAddress() != e.id.LocalAddress { - return - } - - // Skip the IP header, then handle the fragmentation header if there - // is one. 
- vv.TrimFront(header.IPv6MinimumSize) - p := h.TransportProtocol() - if p == header.IPv6FragmentHeader { - f := header.IPv6Fragment(vv.First()) - if !f.IsValid() || f.FragmentOffset() != 0 { - // We can't handle fragments that aren't at offset 0 - // because they don't have the transport headers. - return - } - - // Skip fragmentation header and find out the actual protocol - // number. - vv.TrimFront(header.IPv6FragmentHeaderSize) - p = f.TransportProtocol() - } - - // Deliver the control packet to the transport endpoint. - e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, vv) -} - -func (e *endpoint) handleICMP(r *stack.Route, vv buffer.VectorisedView) { - v := vv.First() - if len(v) < header.ICMPv6MinimumSize { - return - } - h := header.ICMPv6(v) - - switch h.Type() { - case header.ICMPv6PacketTooBig: - if len(v) < header.ICMPv6PacketTooBigMinimumSize { - return - } - vv.TrimFront(header.ICMPv6PacketTooBigMinimumSize) - mtu := binary.BigEndian.Uint32(v[header.ICMPv6MinimumSize:]) - e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), vv) - - case header.ICMPv6DstUnreachable: - if len(v) < header.ICMPv6DstUnreachableMinimumSize { - return - } - vv.TrimFront(header.ICMPv6DstUnreachableMinimumSize) - switch h.Code() { - case header.ICMPv6PortUnreachable: - e.handleControl(stack.ControlPortUnreachable, 0, vv) - } - - case header.ICMPv6NeighborSolicit: - if len(v) < header.ICMPv6NeighborSolicitMinimumSize { - return - } - targetAddr := tcpip.Address(v[8 : 8+16]) - if e.linkAddrCache.CheckLocalAddress(e.nicid, ProtocolNumber, targetAddr) == 0 { - // We don't have a useful answer; the best we can do is ignore the request. 
- return - } - hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize) - pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize)) - pkt.SetType(header.ICMPv6NeighborAdvert) - pkt[icmpV6FlagOffset] = ndpSolicitedFlag | ndpOverrideFlag - copy(pkt[icmpV6OptOffset-len(targetAddr):], targetAddr) - pkt[icmpV6OptOffset] = ndpOptDstLinkAddr - pkt[icmpV6LengthOffset] = 1 - copy(pkt[icmpV6LengthOffset+1:], r.LocalLinkAddress[:]) - - // ICMPv6 Neighbor Solicit messages are always sent to - // specially crafted IPv6 multicast addresses. As a result, the - // route we end up with here has as its LocalAddress such a - // multicast address. It would be nonsense to claim that our - // source address is a multicast address, so we manually set - // the source address to the target address requested in the - // solicit message. Since that requires mutating the route, we - // must first clone it. - r := r.Clone() - defer r.Release() - r.LocalAddress = targetAddr - pkt.SetChecksum(icmpChecksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) - r.WritePacket(hdr, buffer.VectorisedView{}, header.ICMPv6ProtocolNumber, r.DefaultTTL()) - - e.linkAddrCache.AddLinkAddress(e.nicid, r.RemoteAddress, r.RemoteLinkAddress) - - case header.ICMPv6NeighborAdvert: - if len(v) < header.ICMPv6NeighborAdvertSize { - return - } - targetAddr := tcpip.Address(v[8 : 8+16]) - e.linkAddrCache.AddLinkAddress(e.nicid, targetAddr, r.RemoteLinkAddress) - if targetAddr != r.RemoteAddress { - e.linkAddrCache.AddLinkAddress(e.nicid, r.RemoteAddress, r.RemoteLinkAddress) - } - - case header.ICMPv6EchoRequest: - if len(v) < header.ICMPv6EchoMinimumSize { - return - } - vv.TrimFront(header.ICMPv6EchoMinimumSize) - hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6EchoMinimumSize) - pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize)) - copy(pkt, h) - 
pkt.SetType(header.ICMPv6EchoReply) - pkt.SetChecksum(icmpChecksum(pkt, r.LocalAddress, r.RemoteAddress, vv)) - r.WritePacket(hdr, vv, header.ICMPv6ProtocolNumber, r.DefaultTTL()) - - case header.ICMPv6EchoReply: - if len(v) < header.ICMPv6EchoMinimumSize { - return - } - e.dispatcher.DeliverTransportPacket(r, header.ICMPv6ProtocolNumber, vv) - - } -} - -const ( - ndpSolicitedFlag = 1 << 6 - ndpOverrideFlag = 1 << 5 - - ndpOptSrcLinkAddr = 1 - ndpOptDstLinkAddr = 2 - - icmpV6FlagOffset = 4 - icmpV6OptOffset = 24 - icmpV6LengthOffset = 25 -) - -var broadcastMAC = tcpip.LinkAddress([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}) - -var _ stack.LinkAddressResolver = (*protocol)(nil) - -// LinkAddressProtocol implements stack.LinkAddressResolver. -func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { - return header.IPv6ProtocolNumber -} - -// LinkAddressRequest implements stack.LinkAddressResolver. -func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error { - snaddr := header.SolicitedNodeAddr(addr) - r := &stack.Route{ - LocalAddress: localAddr, - RemoteAddress: snaddr, - RemoteLinkAddress: broadcastMAC, - } - hdr := buffer.NewPrependable(int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize) - pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize)) - pkt.SetType(header.ICMPv6NeighborSolicit) - copy(pkt[icmpV6OptOffset-len(addr):], addr) - pkt[icmpV6OptOffset] = ndpOptSrcLinkAddr - pkt[icmpV6LengthOffset] = 1 - copy(pkt[icmpV6LengthOffset+1:], linkEP.LinkAddress()) - pkt.SetChecksum(icmpChecksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) - - length := uint16(hdr.UsedLength()) - ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) - ip.Encode(&header.IPv6Fields{ - PayloadLength: length, - NextHeader: uint8(header.ICMPv6ProtocolNumber), - HopLimit: defaultIPv6HopLimit, - SrcAddr: r.LocalAddress, - DstAddr: r.RemoteAddress, - }) - - return 
linkEP.WritePacket(r, hdr, buffer.VectorisedView{}, ProtocolNumber) -} - -// ResolveStaticAddress implements stack.LinkAddressResolver. -func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { - return "", false -} - -func icmpChecksum(h header.ICMPv6, src, dst tcpip.Address, vv buffer.VectorisedView) uint16 { - // Calculate the IPv6 pseudo-header upper-layer checksum. - xsum := header.Checksum([]byte(src), 0) - xsum = header.Checksum([]byte(dst), xsum) - var upperLayerLength [4]byte - binary.BigEndian.PutUint32(upperLayerLength[:], uint32(len(h)+vv.Size())) - xsum = header.Checksum(upperLayerLength[:], xsum) - xsum = header.Checksum([]byte{0, 0, 0, uint8(header.ICMPv6ProtocolNumber)}, xsum) - for _, v := range vv.Views() { - xsum = header.Checksum(v, xsum) - } - - // h[2:4] is the checksum itself, set it aside to avoid checksumming the checksum. - h2, h3 := h[2], h[3] - h[2], h[3] = 0, 0 - xsum = ^header.Checksum(h, xsum) - h[2], h[3] = h2, h3 - - return xsum -} +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ipv6 + +import ( + "encoding/binary" + + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/header" + "netstack/tcpip/stack" +) + +// handleControl handles the case when an ICMP packet contains the headers of +// the original packet that caused the ICMP one to be sent. 
This information is +// used to find out which transport endpoint must be notified about the ICMP +// packet. +func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, vv buffer.VectorisedView) { + h := header.IPv6(vv.First()) + + // We don't use IsValid() here because ICMP only requires that up to + // 1280 bytes of the original packet be included. So it's likely that it + // is truncated, which would cause IsValid to return false. + // + // Drop packet if it doesn't have the basic IPv6 header or if the + // original source address doesn't match the endpoint's address. + if len(h) < header.IPv6MinimumSize || h.SourceAddress() != e.id.LocalAddress { + return + } + + // Skip the IP header, then handle the fragmentation header if there + // is one. + vv.TrimFront(header.IPv6MinimumSize) + p := h.TransportProtocol() + if p == header.IPv6FragmentHeader { + f := header.IPv6Fragment(vv.First()) + if !f.IsValid() || f.FragmentOffset() != 0 { + // We can't handle fragments that aren't at offset 0 + // because they don't have the transport headers. + return + } + + // Skip fragmentation header and find out the actual protocol + // number. + vv.TrimFront(header.IPv6FragmentHeaderSize) + p = f.TransportProtocol() + } + + // Deliver the control packet to the transport endpoint. 
+ e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, h.DestinationAddress(), ProtocolNumber, p, typ, extra, vv) +} + +func (e *endpoint) handleICMP(r *stack.Route, vv buffer.VectorisedView) { + v := vv.First() + if len(v) < header.ICMPv6MinimumSize { + return + } + h := header.ICMPv6(v) + + switch h.Type() { + case header.ICMPv6PacketTooBig: + if len(v) < header.ICMPv6PacketTooBigMinimumSize { + return + } + vv.TrimFront(header.ICMPv6PacketTooBigMinimumSize) + mtu := binary.BigEndian.Uint32(v[header.ICMPv6MinimumSize:]) + e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), vv) + + case header.ICMPv6DstUnreachable: + if len(v) < header.ICMPv6DstUnreachableMinimumSize { + return + } + vv.TrimFront(header.ICMPv6DstUnreachableMinimumSize) + switch h.Code() { + case header.ICMPv6PortUnreachable: + e.handleControl(stack.ControlPortUnreachable, 0, vv) + } + + case header.ICMPv6NeighborSolicit: + if len(v) < header.ICMPv6NeighborSolicitMinimumSize { + return + } + targetAddr := tcpip.Address(v[8 : 8+16]) + if e.linkAddrCache.CheckLocalAddress(e.nicid, ProtocolNumber, targetAddr) == 0 { + // We don't have a useful answer; the best we can do is ignore the request. + return + } + hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize) + pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize)) + pkt.SetType(header.ICMPv6NeighborAdvert) + pkt[icmpV6FlagOffset] = ndpSolicitedFlag | ndpOverrideFlag + copy(pkt[icmpV6OptOffset-len(targetAddr):], targetAddr) + pkt[icmpV6OptOffset] = ndpOptDstLinkAddr + pkt[icmpV6LengthOffset] = 1 + copy(pkt[icmpV6LengthOffset+1:], r.LocalLinkAddress[:]) + + // ICMPv6 Neighbor Solicit messages are always sent to + // specially crafted IPv6 multicast addresses. As a result, the + // route we end up with here has as its LocalAddress such a + // multicast address. 
It would be nonsense to claim that our + // source address is a multicast address, so we manually set + // the source address to the target address requested in the + // solicit message. Since that requires mutating the route, we + // must first clone it. + r := r.Clone() + defer r.Release() + r.LocalAddress = targetAddr + pkt.SetChecksum(icmpChecksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) + r.WritePacket(hdr, buffer.VectorisedView{}, header.ICMPv6ProtocolNumber, r.DefaultTTL()) + + e.linkAddrCache.AddLinkAddress(e.nicid, r.RemoteAddress, r.RemoteLinkAddress) + + case header.ICMPv6NeighborAdvert: + if len(v) < header.ICMPv6NeighborAdvertSize { + return + } + targetAddr := tcpip.Address(v[8 : 8+16]) + e.linkAddrCache.AddLinkAddress(e.nicid, targetAddr, r.RemoteLinkAddress) + if targetAddr != r.RemoteAddress { + e.linkAddrCache.AddLinkAddress(e.nicid, r.RemoteAddress, r.RemoteLinkAddress) + } + + case header.ICMPv6EchoRequest: + if len(v) < header.ICMPv6EchoMinimumSize { + return + } + vv.TrimFront(header.ICMPv6EchoMinimumSize) + hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6EchoMinimumSize) + pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize)) + copy(pkt, h) + pkt.SetType(header.ICMPv6EchoReply) + pkt.SetChecksum(icmpChecksum(pkt, r.LocalAddress, r.RemoteAddress, vv)) + r.WritePacket(hdr, vv, header.ICMPv6ProtocolNumber, r.DefaultTTL()) + + case header.ICMPv6EchoReply: + if len(v) < header.ICMPv6EchoMinimumSize { + return + } + e.dispatcher.DeliverTransportPacket(r, header.ICMPv6ProtocolNumber, vv) + + } +} + +const ( + ndpSolicitedFlag = 1 << 6 + ndpOverrideFlag = 1 << 5 + + ndpOptSrcLinkAddr = 1 + ndpOptDstLinkAddr = 2 + + icmpV6FlagOffset = 4 + icmpV6OptOffset = 24 + icmpV6LengthOffset = 25 +) + +var broadcastMAC = tcpip.LinkAddress([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}) + +var _ stack.LinkAddressResolver = (*protocol)(nil) + +// LinkAddressProtocol implements 
stack.LinkAddressResolver. +func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { + return header.IPv6ProtocolNumber +} + +// LinkAddressRequest implements stack.LinkAddressResolver. +func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error { + snaddr := header.SolicitedNodeAddr(addr) + r := &stack.Route{ + LocalAddress: localAddr, + RemoteAddress: snaddr, + RemoteLinkAddress: broadcastMAC, + } + hdr := buffer.NewPrependable(int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize) + pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize)) + pkt.SetType(header.ICMPv6NeighborSolicit) + copy(pkt[icmpV6OptOffset-len(addr):], addr) + pkt[icmpV6OptOffset] = ndpOptSrcLinkAddr + pkt[icmpV6LengthOffset] = 1 + copy(pkt[icmpV6LengthOffset+1:], linkEP.LinkAddress()) + pkt.SetChecksum(icmpChecksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) + + length := uint16(hdr.UsedLength()) + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: length, + NextHeader: uint8(header.ICMPv6ProtocolNumber), + HopLimit: defaultIPv6HopLimit, + SrcAddr: r.LocalAddress, + DstAddr: r.RemoteAddress, + }) + + return linkEP.WritePacket(r, hdr, buffer.VectorisedView{}, ProtocolNumber) +} + +// ResolveStaticAddress implements stack.LinkAddressResolver. +func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { + return "", false +} + +func icmpChecksum(h header.ICMPv6, src, dst tcpip.Address, vv buffer.VectorisedView) uint16 { + // Calculate the IPv6 pseudo-header upper-layer checksum. 
+ xsum := header.Checksum([]byte(src), 0) + xsum = header.Checksum([]byte(dst), xsum) + var upperLayerLength [4]byte + binary.BigEndian.PutUint32(upperLayerLength[:], uint32(len(h)+vv.Size())) + xsum = header.Checksum(upperLayerLength[:], xsum) + xsum = header.Checksum([]byte{0, 0, 0, uint8(header.ICMPv6ProtocolNumber)}, xsum) + for _, v := range vv.Views() { + xsum = header.Checksum(v, xsum) + } + + // h[2:4] is the checksum itself, set it aside to avoid checksumming the checksum. + h2, h3 := h[2], h[3] + h[2], h[3] = 0, 0 + xsum = ^header.Checksum(h, xsum) + h[2], h[3] = h2, h3 + + return xsum +} diff --git a/tcpip/network/ipv6/ipv6.go b/tcpip/network/ipv6/ipv6.go index ac458a0..eef9798 100644 --- a/tcpip/network/ipv6/ipv6.go +++ b/tcpip/network/ipv6/ipv6.go @@ -1,187 +1,187 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package ipv6 contains the implementation of the ipv6 network protocol. To use -// it in the networking stack, this package must be added to the project, and -// activated on the stack by passing ipv6.ProtocolName (or "ipv6") as one of the -// network protocols when calling stack.New(). Then endpoints can be created -// by passing ipv6.ProtocolNumber as the network protocol number when calling -// Stack.NewEndpoint(). 
-package ipv6 - -import ( - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/header" - "netstack/tcpip/stack" -) - -const ( - // ProtocolName is the string representation of the ipv6 protocol name. - ProtocolName = "ipv6" - - // ProtocolNumber is the ipv6 protocol number. - ProtocolNumber = header.IPv6ProtocolNumber - - // maxTotalSize is maximum size that can be encoded in the 16-bit - // PayloadLength field of the ipv6 header. - maxPayloadSize = 0xffff - - // defaultIPv6HopLimit is the default hop limit for IPv6 Packets - // egressed by Netstack. - defaultIPv6HopLimit = 255 -) - -type endpoint struct { - nicid tcpip.NICID - id stack.NetworkEndpointID - linkEP stack.LinkEndpoint - linkAddrCache stack.LinkAddressCache - dispatcher stack.TransportDispatcher -} - -// DefaultTTL is the default hop limit for this endpoint. -func (e *endpoint) DefaultTTL() uint8 { - return 255 -} - -// MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus -// the network layer max header length. -func (e *endpoint) MTU() uint32 { - return calculateMTU(e.linkEP.MTU()) -} - -// NICID returns the ID of the NIC this endpoint belongs to. -func (e *endpoint) NICID() tcpip.NICID { - return e.nicid -} - -// ID returns the ipv6 endpoint ID. -func (e *endpoint) ID() *stack.NetworkEndpointID { - return &e.id -} - -// Capabilities implements stack.NetworkEndpoint.Capabilities. -func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { - return e.linkEP.Capabilities() -} - -// MaxHeaderLength returns the maximum length needed by ipv6 headers (and -// underlying protocols). -func (e *endpoint) MaxHeaderLength() uint16 { - return e.linkEP.MaxHeaderLength() + header.IPv6MinimumSize -} - -// WritePacket writes a packet to the given destination address and protocol. 
-func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { - length := uint16(hdr.UsedLength() + payload.Size()) - ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) - ip.Encode(&header.IPv6Fields{ - PayloadLength: length, - NextHeader: uint8(protocol), - HopLimit: ttl, - SrcAddr: r.LocalAddress, - DstAddr: r.RemoteAddress, - }) - r.Stats().IP.PacketsSent.Increment() - - return e.linkEP.WritePacket(r, hdr, payload, ProtocolNumber) -} - -// HandlePacket is called by the link layer when new ipv6 packets arrive for -// this endpoint. -func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { - h := header.IPv6(vv.First()) - if !h.IsValid(vv.Size()) { - return - } - - vv.TrimFront(header.IPv6MinimumSize) - vv.CapLength(int(h.PayloadLength())) - - p := h.TransportProtocol() - if p == header.ICMPv6ProtocolNumber { - e.handleICMP(r, vv) - return - } - - r.Stats().IP.PacketsDelivered.Increment() - e.dispatcher.DeliverTransportPacket(r, p, vv) -} - -// Close cleans up resources associated with the endpoint. -func (*endpoint) Close() {} - -type protocol struct{} - -// NewProtocol creates a new protocol ipv6 protocol descriptor. This is exported -// only for tests that short-circuit the stack. Regular use of the protocol is -// done via the stack, which gets a protocol descriptor from the init() function -// below. -func NewProtocol() stack.NetworkProtocol { - return &protocol{} -} - -// Number returns the ipv6 protocol number. -func (p *protocol) Number() tcpip.NetworkProtocolNumber { - return ProtocolNumber -} - -// MinimumPacketSize returns the minimum valid ipv6 packet size. -func (p *protocol) MinimumPacketSize() int { - return header.IPv6MinimumSize -} - -// ParseAddresses implements NetworkProtocol.ParseAddresses. 
-func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { - h := header.IPv6(v) - return h.SourceAddress(), h.DestinationAddress() -} - -// NewEndpoint creates a new ipv6 endpoint. -func (p *protocol) NewEndpoint(nicid tcpip.NICID, addr tcpip.Address, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) { - return &endpoint{ - nicid: nicid, - id: stack.NetworkEndpointID{LocalAddress: addr}, - linkEP: linkEP, - linkAddrCache: linkAddrCache, - dispatcher: dispatcher, - }, nil -} - -// SetOption implements NetworkProtocol.SetOption. -func (p *protocol) SetOption(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -// Option implements NetworkProtocol.Option. -func (p *protocol) Option(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -// calculateMTU calculates the network-layer payload MTU based on the link-layer -// payload mtu. -func calculateMTU(mtu uint32) uint32 { - mtu -= header.IPv6MinimumSize - if mtu <= maxPayloadSize { - return mtu - } - return maxPayloadSize -} - -func init() { - stack.RegisterNetworkProtocolFactory(ProtocolName, func() stack.NetworkProtocol { - return &protocol{} - }) -} +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ipv6 contains the implementation of the ipv6 network protocol. 
To use +// it in the networking stack, this package must be added to the project, and +// activated on the stack by passing ipv6.ProtocolName (or "ipv6") as one of the +// network protocols when calling stack.New(). Then endpoints can be created +// by passing ipv6.ProtocolNumber as the network protocol number when calling +// Stack.NewEndpoint(). +package ipv6 + +import ( + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/header" + "netstack/tcpip/stack" +) + +const ( + // ProtocolName is the string representation of the ipv6 protocol name. + ProtocolName = "ipv6" + + // ProtocolNumber is the ipv6 protocol number. + ProtocolNumber = header.IPv6ProtocolNumber + + // maxTotalSize is maximum size that can be encoded in the 16-bit + // PayloadLength field of the ipv6 header. + maxPayloadSize = 0xffff + + // defaultIPv6HopLimit is the default hop limit for IPv6 Packets + // egressed by Netstack. + defaultIPv6HopLimit = 255 +) + +type endpoint struct { + nicid tcpip.NICID + id stack.NetworkEndpointID + linkEP stack.LinkEndpoint + linkAddrCache stack.LinkAddressCache + dispatcher stack.TransportDispatcher +} + +// DefaultTTL is the default hop limit for this endpoint. +func (e *endpoint) DefaultTTL() uint8 { + return 255 +} + +// MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus +// the network layer max header length. +func (e *endpoint) MTU() uint32 { + return calculateMTU(e.linkEP.MTU()) +} + +// NICID returns the ID of the NIC this endpoint belongs to. +func (e *endpoint) NICID() tcpip.NICID { + return e.nicid +} + +// ID returns the ipv6 endpoint ID. +func (e *endpoint) ID() *stack.NetworkEndpointID { + return &e.id +} + +// Capabilities implements stack.NetworkEndpoint.Capabilities. +func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { + return e.linkEP.Capabilities() +} + +// MaxHeaderLength returns the maximum length needed by ipv6 headers (and +// underlying protocols). 
+func (e *endpoint) MaxHeaderLength() uint16 { + return e.linkEP.MaxHeaderLength() + header.IPv6MinimumSize +} + +// WritePacket writes a packet to the given destination address and protocol. +func (e *endpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { + length := uint16(hdr.UsedLength() + payload.Size()) + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: length, + NextHeader: uint8(protocol), + HopLimit: ttl, + SrcAddr: r.LocalAddress, + DstAddr: r.RemoteAddress, + }) + r.Stats().IP.PacketsSent.Increment() + + return e.linkEP.WritePacket(r, hdr, payload, ProtocolNumber) +} + +// HandlePacket is called by the link layer when new ipv6 packets arrive for +// this endpoint. +func (e *endpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { + h := header.IPv6(vv.First()) + if !h.IsValid(vv.Size()) { + return + } + + vv.TrimFront(header.IPv6MinimumSize) + vv.CapLength(int(h.PayloadLength())) + + p := h.TransportProtocol() + if p == header.ICMPv6ProtocolNumber { + e.handleICMP(r, vv) + return + } + + r.Stats().IP.PacketsDelivered.Increment() + e.dispatcher.DeliverTransportPacket(r, p, vv) +} + +// Close cleans up resources associated with the endpoint. +func (*endpoint) Close() {} + +type protocol struct{} + +// NewProtocol creates a new protocol ipv6 protocol descriptor. This is exported +// only for tests that short-circuit the stack. Regular use of the protocol is +// done via the stack, which gets a protocol descriptor from the init() function +// below. +func NewProtocol() stack.NetworkProtocol { + return &protocol{} +} + +// Number returns the ipv6 protocol number. +func (p *protocol) Number() tcpip.NetworkProtocolNumber { + return ProtocolNumber +} + +// MinimumPacketSize returns the minimum valid ipv6 packet size. 
+func (p *protocol) MinimumPacketSize() int { + return header.IPv6MinimumSize +} + +// ParseAddresses implements NetworkProtocol.ParseAddresses. +func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { + h := header.IPv6(v) + return h.SourceAddress(), h.DestinationAddress() +} + +// NewEndpoint creates a new ipv6 endpoint. +func (p *protocol) NewEndpoint(nicid tcpip.NICID, addr tcpip.Address, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) { + return &endpoint{ + nicid: nicid, + id: stack.NetworkEndpointID{LocalAddress: addr}, + linkEP: linkEP, + linkAddrCache: linkAddrCache, + dispatcher: dispatcher, + }, nil +} + +// SetOption implements NetworkProtocol.SetOption. +func (p *protocol) SetOption(option interface{}) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +// Option implements NetworkProtocol.Option. +func (p *protocol) Option(option interface{}) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +// calculateMTU calculates the network-layer payload MTU based on the link-layer +// payload mtu. 
+func calculateMTU(mtu uint32) uint32 { + mtu -= header.IPv6MinimumSize + if mtu <= maxPayloadSize { + return mtu + } + return maxPayloadSize +} + +func init() { + stack.RegisterNetworkProtocolFactory(ProtocolName, func() stack.NetworkProtocol { + return &protocol{} + }) +} diff --git a/tcpip/ports/README.md b/tcpip/ports/README.md index 1013c32..e3312b0 100644 --- a/tcpip/ports/README.md +++ b/tcpip/ports/README.md @@ -1,45 +1,45 @@ -# 端口 - -## 概念 -在互联网上,各主机间通过 TCP/IP 协议发送和接收数据包,各个数据包根据其目的主机的 ip 地址来进行互联网络中的路由选择,把数据包顺利的传送到目的主机。大多数操作系统都支持多程序(进程)同时运行,那么目的主机应该把接收到的数据包传送给众多同时运行的进程中的哪一个呢?显然这个问题有待解决。 - -运行在计算机中的进程是用进程标识符来标志的。一开始我们可能会想到根据进程标识符来区分数据包给哪个进程,但是因为在因特网上使用的计算机的操作系统种类很多,而不同的操作系统又使用不同格式的进程标识符,因此发送方非常可能无法识别其他机器上的进程。为了使运行不同操作系统的计算机的应用进程能够互相通信,就必须用统一的方法对 TCP/IP 体系的应用进程进行标志,因此 TCP/IP 体系的传输层端口被提了出来。 - -![img](https://doc.shiyanlou.com/document-uid949121labid10418timestamp1555484076771.png) - -TCP/IP 协议在运输层使用协议端口号(protocol port number),或通常简称为端口(port),端口统一用一个 16 位端口号进行标志。端口号只具有本地意义,即端口号只是为了标志本计算机应用层中的各进程。在因特网中不同计算机的相同端口号是没有联系的。虽然通信的终点是应用进程,但我们可以把端口想象是通信的终点,因为我们只要把要传送的报文交到目的主机的某一个合适的目的端口,剩下的工作(即最后交付目的进程)就由 TCP 来完成。 - -如果把 IP 地址比作一栋楼房,端口号就是这栋楼房里各个房子的房间号。数据包来到主机这栋大楼,会查看是个房间号,再把数据发给相应的房间。端口号只有整数,范围是从 0 到 65535(2^16-1),其中 0 一般作为保留端口,表示让系统自动分配端口。 - -最常见的是 TCP 端口和 UDP 端口。由于 TCP 和 UDP 两个协议是独立的,因此各自的端口号也相互独立,比如 TCP 有 235 端口,UDP 也可以有 235 端口,两者并不冲突。 - -TCP 和 UDP 协议首部的前四个字节都是用来表示端口的,分别表示源端口和目的端口,各占 2 个字节,详细的 TCP、UDP 协议头部会在下面的文章中讲到。 - -![img](https://doc.shiyanlou.com/document-uid949121labid10418timestamp1555484120164.png) - -1. 周知端口(Well Known Ports) 周知端口是众所周知的端口号,范围从 0 到 1023,其中 80 端口分配给 WWW 服务,21 端口分配给 FTP 服务等。我们在 IE 的地址栏里输入一个网址的时候是不必指定端口号的,因为在默认情况下 WWW 服务的端口是"80"。网络服务是可以使用其他端口号的,如果不是默认的端口号则应该在 地址栏上指定端口号,方法是在地址后面加上冒号":",再加上端口号。比如使用"8080"作为 WWW 服务的端口,则需要在地址栏里输入"网址:8080"。但是有些系统协议使用固定的端口号,它是不能被改变的,比如 139 端口专门用于 NetBIOS 与 TCP/IP 之间的通信,不能手动改变。 - -2. 注册端口(Registered Ports) 端口 1024 到 49151,分配给用户进程或应用程序。这些进程主要是用户选择安装的一些应用程序,而不是已经分配好了公认端口的常用程序。这些端口在没有被服务器资源占用的时候,可以用用户端动态选用为源端口。 - -3. 
动态端口(Dynamic Ports) 动态端口的范围是从 49152 到 65535。之所以称为动态端口,是因为它一般不固定分配某种服务,而是动态分配。比如本地想和远端建立 TCP 连接,如果没有指定本地源端口,系统就会给你自动分配一个未占用的源端口,这个端口值就是动态的,当你断开再次建立连接的时候,很有可能你的源端口和上次得到的端口不一样。 - -### 一些常见的端口号及其用途如下: - -1. TCP21 端口:FTP 文件传输服务 -2. TCP22 端口:SSH 安全外壳协议 -3. TCP23 端口:TELNET 终端仿真服务 -4. TCP25 端口:SMTP 简单邮件传输服务 -5. UDP53 端口:DNS 域名解析服务 -6. UDP67 端口:DHCP 的服务端端口 -7. UDP68 端口:DHCP 的客户端端口 -8. TCP80 端口:HTTP 超文本传输服务 -9. TCP110 端口:POP3“邮局协议版本 3”使用的端口 -10. TCP443 端口:HTTPS 加密的超文本传输服务 - -端口在 tcpip 协议栈中算是比较简单的概念,提出端口的本质需求是希望能将数据包准确的发给某台主机上的进程,实现进程与进程之间的通信。 - -协议栈全局管理端口,一个端口被分配以后,不允许给其他进程使用,但是要注意的是端口是网络层协议地址+传输层协议号+端口号来区分的,比如: - -1. ipv4 的 tcp 80 端口和 ipv4 的 udp 80 端口不会冲突。 -2. 如果你主机有两个 ip 地址 ip1 和 ip2,那么你同时监听 ip1:80 和 ip2:80 不会冲突。 +# 端口 + +## 概念 +在互联网上,各主机间通过 TCP/IP 协议发送和接收数据包,各个数据包根据其目的主机的 ip 地址来进行互联网络中的路由选择,把数据包顺利的传送到目的主机。大多数操作系统都支持多程序(进程)同时运行,那么目的主机应该把接收到的数据包传送给众多同时运行的进程中的哪一个呢?显然这个问题有待解决。 + +运行在计算机中的进程是用进程标识符来标志的。一开始我们可能会想到根据进程标识符来区分数据包给哪个进程,但是因为在因特网上使用的计算机的操作系统种类很多,而不同的操作系统又使用不同格式的进程标识符,因此发送方非常可能无法识别其他机器上的进程。为了使运行不同操作系统的计算机的应用进程能够互相通信,就必须用统一的方法对 TCP/IP 体系的应用进程进行标志,因此 TCP/IP 体系的传输层端口被提了出来。 + +![img](https://doc.shiyanlou.com/document-uid949121labid10418timestamp1555484076771.png) + +TCP/IP 协议在运输层使用协议端口号(protocol port number),或通常简称为端口(port),端口统一用一个 16 位端口号进行标志。端口号只具有本地意义,即端口号只是为了标志本计算机应用层中的各进程。在因特网中不同计算机的相同端口号是没有联系的。虽然通信的终点是应用进程,但我们可以把端口想象是通信的终点,因为我们只要把要传送的报文交到目的主机的某一个合适的目的端口,剩下的工作(即最后交付目的进程)就由 TCP 来完成。 + +如果把 IP 地址比作一栋楼房,端口号就是这栋楼房里各个房子的房间号。数据包来到主机这栋大楼,会查看是个房间号,再把数据发给相应的房间。端口号只有整数,范围是从 0 到 65535(2^16-1),其中 0 一般作为保留端口,表示让系统自动分配端口。 + +最常见的是 TCP 端口和 UDP 端口。由于 TCP 和 UDP 两个协议是独立的,因此各自的端口号也相互独立,比如 TCP 有 235 端口,UDP 也可以有 235 端口,两者并不冲突。 + +TCP 和 UDP 协议首部的前四个字节都是用来表示端口的,分别表示源端口和目的端口,各占 2 个字节,详细的 TCP、UDP 协议头部会在下面的文章中讲到。 + +![img](https://doc.shiyanlou.com/document-uid949121labid10418timestamp1555484120164.png) + +1. 
周知端口(Well Known Ports) 周知端口是众所周知的端口号,范围从 0 到 1023,其中 80 端口分配给 WWW 服务,21 端口分配给 FTP 服务等。我们在 IE 的地址栏里输入一个网址的时候是不必指定端口号的,因为在默认情况下 WWW 服务的端口是"80"。网络服务是可以使用其他端口号的,如果不是默认的端口号则应该在 地址栏上指定端口号,方法是在地址后面加上冒号":",再加上端口号。比如使用"8080"作为 WWW 服务的端口,则需要在地址栏里输入"网址:8080"。但是有些系统协议使用固定的端口号,它是不能被改变的,比如 139 端口专门用于 NetBIOS 与 TCP/IP 之间的通信,不能手动改变。 + +2. 注册端口(Registered Ports) 端口 1024 到 49151,分配给用户进程或应用程序。这些进程主要是用户选择安装的一些应用程序,而不是已经分配好了公认端口的常用程序。这些端口在没有被服务器资源占用的时候,可以用用户端动态选用为源端口。 + +3. 动态端口(Dynamic Ports) 动态端口的范围是从 49152 到 65535。之所以称为动态端口,是因为它一般不固定分配某种服务,而是动态分配。比如本地想和远端建立 TCP 连接,如果没有指定本地源端口,系统就会给你自动分配一个未占用的源端口,这个端口值就是动态的,当你断开再次建立连接的时候,很有可能你的源端口和上次得到的端口不一样。 + +### 一些常见的端口号及其用途如下: + +1. TCP21 端口:FTP 文件传输服务 +2. TCP22 端口:SSH 安全外壳协议 +3. TCP23 端口:TELNET 终端仿真服务 +4. TCP25 端口:SMTP 简单邮件传输服务 +5. UDP53 端口:DNS 域名解析服务 +6. UDP67 端口:DHCP 的服务端端口 +7. UDP68 端口:DHCP 的客户端端口 +8. TCP80 端口:HTTP 超文本传输服务 +9. TCP110 端口:POP3“邮局协议版本 3”使用的端口 +10. TCP443 端口:HTTPS 加密的超文本传输服务 + +端口在 tcpip 协议栈中算是比较简单的概念,提出端口的本质需求是希望能将数据包准确的发给某台主机上的进程,实现进程与进程之间的通信。 + +协议栈全局管理端口,一个端口被分配以后,不允许给其他进程使用,但是要注意的是端口是网络层协议地址+传输层协议号+端口号来区分的,比如: + +1. ipv4 的 tcp 80 端口和 ipv4 的 udp 80 端口不会冲突。 +2. 如果你主机有两个 ip 地址 ip1 和 ip2,那么你同时监听 ip1:80 和 ip2:80 不会冲突。 3. 
ipv4 的 tcp 80 端口和 ipv6 的 tcp 80 端口不会冲突。 \ No newline at end of file diff --git a/tcpip/ports/ports.go b/tcpip/ports/ports.go index 571b6e1..47ad919 100644 --- a/tcpip/ports/ports.go +++ b/tcpip/ports/ports.go @@ -1,165 +1,165 @@ -package ports - -import ( - "log" - "math" - "math/rand" - "netstack/tcpip" - "sync" -) - -const ( - // 临时端口的最小值 - FirstEphemeral = 16000 - - anyIPAddress tcpip.Address = "" -) - -// 端口的唯一标识 : 网络层协议-传输层协议-端口号 -type portDescriptor struct { - network tcpip.NetworkProtocolNumber - transport tcpip.TransportProtocolNumber - port uint16 -} - -// PortManager 管理端口的对象 由他来保留和释放端口 -type PortManager struct { - mu sync.RWMutex - // 用一个map接口来保存被占用的端口 - // port:ips ipv4-tcp-80:[192.168.1.1, 192.168.1.2] - // ipv4-udp-9999:[192.168.10.1, 192.168.10.2] - allocatedPorts map[portDescriptor]bindAddresses -} - -// 一个IP地址的集合 -type bindAddresses map[tcpip.Address]struct{} - -func (b bindAddresses) isAvailable(addr tcpip.Address) bool { - if addr == anyIPAddress { - return len(b) == 0 - } - - if _, ok := b[anyIPAddress]; ok { - return false - } - - if _, ok := b[addr]; ok { - return false - } - return true -} - -// NewPortManager 新建一个端口管理器 -func NewPortManager() *PortManager { - return &PortManager{ - allocatedPorts: make(map[portDescriptor]bindAddresses), - } -} - -// PickEphemeralPort 从端口管理器中随机分配一个端口,并调用testPort来检测是否可用 -func (s *PortManager) PickEphemeralPort(testPort func(p uint16) (bool, *tcpip.Error)) (port uint16, err *tcpip.Error) { - count := uint16(math.MaxUint16 - FirstEphemeral + 1) - offset := uint16(rand.Int31n(int32(count))) - - for i := uint16(0); i < count; i++ { - port = FirstEphemeral + (offset+i)%count - ok, err := testPort(port) - if err != nil { - return 0, nil - } - if ok { - return port, nil - } - } - return 0, tcpip.ErrNoPortAvailable -} - -// IsPortAvailable 根据参数判断该端口号是否已经被占用了 -func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, - transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16) bool { - 
s.mu.Lock() - defer s.mu.Unlock() - return s.isPortAvailableLocked(networks, transport, addr, port) -} - -// 根据参数判断该端口号是否被占用 -func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, - transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16) bool { - for _, network := range networks { // 遍历网络协议 - desc := portDescriptor{network: network, transport: transport, port: port} // 构造端口描述符 - if addrs, ok := s.allocatedPorts[desc]; ok { // 检查端口描述符绑定的ip集合 - if !addrs.isAvailable(addr) { // 该集合中已经有这个ip 或者是"" 也就是 0.0.0.0 - return false - } - } - } - return true -} - -// ReservePort 将端口和IP地址绑定在一起,这样别的程序就无法使用已经被绑定的端口。 -// 如果传入的端口不为0,那么会尝试绑定该端口,若该端口没有被占用,那么绑定成功。 -// 如果传人的端口等于0,那么就是告诉协议栈自己分配端口,端口管理器就会随机返回一个端口。 -func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, - transport tcpip.TransportProtocolNumber, - addr tcpip.Address, port uint16) (reservedPort uint16, err *tcpip.Error) { - s.mu.Lock() - defer s.mu.Unlock() - // defer log.Println(transport, "成功分配端口", *(&reservedPort)) TODO 这样写就有问题 defer给直接取值了? 
- defer func() { - log.Println(transport, "成功分配端口", *(&reservedPort)) - }() - - // 指定端口进行绑定 - if port != 0 { - if !s.reserveSpecificPort(networks, transport, addr, port) { - return 0, tcpip.ErrPortInUse // 已经被占用 - } - reservedPort = port - return - } - // 随机分配 - reservedPort, err = s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) { - return s.reserveSpecificPort(networks, transport, addr, p), nil - }) - return reservedPort, nil -} - -// reserveSpecificPort 尝试根据协议号和IP地址绑定一个端口 -func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, - addr tcpip.Address, port uint16) bool { - if !s.isPortAvailableLocked(networks, transport, addr, port) { - return false - } - - // 根据给定的网络层协议号绑定端口 - for _, network := range networks { - desc := portDescriptor{network: network, transport: transport, port: port} // ipv4-udp-9999 - m, ok := s.allocatedPorts[desc] - if !ok { - m = make(bindAddresses) // Set of IP - s.allocatedPorts[desc] = m - } - // 注册该地址被绑定了 - m[addr] = struct{}{} - } - return true -} - -// ReleasePort 释放绑定的端口,以便别的程序复用。 -func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, - addr tcpip.Address, port uint16) { - s.mu.Lock() - defer s.mu.Unlock() - - // 删除绑定关系 - for _, network := range networks { - desc := portDescriptor{network, transport, port} - if m, ok := s.allocatedPorts[desc]; ok { - log.Println(transport, "释放", port) - delete(m, addr) - if len(m) == 0 { - delete(s.allocatedPorts, desc) - } - } - } -} +package ports + +import ( + "log" + "math" + "math/rand" + "netstack/tcpip" + "sync" +) + +const ( + // 临时端口的最小值 + FirstEphemeral = 16000 + + anyIPAddress tcpip.Address = "" +) + +// 端口的唯一标识 : 网络层协议-传输层协议-端口号 +type portDescriptor struct { + network tcpip.NetworkProtocolNumber + transport tcpip.TransportProtocolNumber + port uint16 +} + +// PortManager 管理端口的对象 由他来保留和释放端口 +type PortManager struct { + mu sync.RWMutex + // 
用一个map接口来保存被占用的端口 + // port:ips ipv4-tcp-80:[192.168.1.1, 192.168.1.2] + // ipv4-udp-9999:[192.168.10.1, 192.168.10.2] + allocatedPorts map[portDescriptor]bindAddresses +} + +// 一个IP地址的集合 +type bindAddresses map[tcpip.Address]struct{} + +func (b bindAddresses) isAvailable(addr tcpip.Address) bool { + if addr == anyIPAddress { + return len(b) == 0 + } + + if _, ok := b[anyIPAddress]; ok { + return false + } + + if _, ok := b[addr]; ok { + return false + } + return true +} + +// NewPortManager 新建一个端口管理器 +func NewPortManager() *PortManager { + return &PortManager{ + allocatedPorts: make(map[portDescriptor]bindAddresses), + } +} + +// PickEphemeralPort 从端口管理器中随机分配一个端口,并调用testPort来检测是否可用 +func (s *PortManager) PickEphemeralPort(testPort func(p uint16) (bool, *tcpip.Error)) (port uint16, err *tcpip.Error) { + count := uint16(math.MaxUint16 - FirstEphemeral + 1) + offset := uint16(rand.Int31n(int32(count))) + + for i := uint16(0); i < count; i++ { + port = FirstEphemeral + (offset+i)%count + ok, err := testPort(port) + if err != nil { + return 0, nil + } + if ok { + return port, nil + } + } + return 0, tcpip.ErrNoPortAvailable +} + +// IsPortAvailable 根据参数判断该端口号是否已经被占用了 +func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, + transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16) bool { + s.mu.Lock() + defer s.mu.Unlock() + return s.isPortAvailableLocked(networks, transport, addr, port) +} + +// 根据参数判断该端口号是否被占用 +func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, + transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16) bool { + for _, network := range networks { // 遍历网络协议 + desc := portDescriptor{network: network, transport: transport, port: port} // 构造端口描述符 + if addrs, ok := s.allocatedPorts[desc]; ok { // 检查端口描述符绑定的ip集合 + if !addrs.isAvailable(addr) { // 该集合中已经有这个ip 或者是"" 也就是 0.0.0.0 + return false + } + } + } + return true +} + +// ReservePort 
将端口和IP地址绑定在一起,这样别的程序就无法使用已经被绑定的端口。 +// 如果传入的端口不为0,那么会尝试绑定该端口,若该端口没有被占用,那么绑定成功。 +// 如果传人的端口等于0,那么就是告诉协议栈自己分配端口,端口管理器就会随机返回一个端口。 +func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, + transport tcpip.TransportProtocolNumber, + addr tcpip.Address, port uint16) (reservedPort uint16, err *tcpip.Error) { + s.mu.Lock() + defer s.mu.Unlock() + // defer log.Println(transport, "成功分配端口", *(&reservedPort)) TODO 这样写就有问题 defer给直接取值了? + defer func() { + log.Println(transport, "成功分配端口", *(&reservedPort)) + }() + + // 指定端口进行绑定 + if port != 0 { + if !s.reserveSpecificPort(networks, transport, addr, port) { + return 0, tcpip.ErrPortInUse // 已经被占用 + } + reservedPort = port + return + } + // 随机分配 + reservedPort, err = s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) { + return s.reserveSpecificPort(networks, transport, addr, p), nil + }) + return reservedPort, nil +} + +// reserveSpecificPort 尝试根据协议号和IP地址绑定一个端口 +func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, + addr tcpip.Address, port uint16) bool { + if !s.isPortAvailableLocked(networks, transport, addr, port) { + return false + } + + // 根据给定的网络层协议号绑定端口 + for _, network := range networks { + desc := portDescriptor{network: network, transport: transport, port: port} // ipv4-udp-9999 + m, ok := s.allocatedPorts[desc] + if !ok { + m = make(bindAddresses) // Set of IP + s.allocatedPorts[desc] = m + } + // 注册该地址被绑定了 + m[addr] = struct{}{} + } + return true +} + +// ReleasePort 释放绑定的端口,以便别的程序复用。 +func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, + addr tcpip.Address, port uint16) { + s.mu.Lock() + defer s.mu.Unlock() + + // 删除绑定关系 + for _, network := range networks { + desc := portDescriptor{network, transport, port} + if m, ok := s.allocatedPorts[desc]; ok { + log.Println(transport, "释放", port) + delete(m, addr) + if len(m) == 0 { + delete(s.allocatedPorts, desc) + } + } + 
} +} diff --git a/tcpip/stack/linkaddrcache.go b/tcpip/stack/linkaddrcache.go index a4df7e5..15bd4c6 100644 --- a/tcpip/stack/linkaddrcache.go +++ b/tcpip/stack/linkaddrcache.go @@ -1,308 +1,308 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package stack - -import ( - "fmt" - "log" - "sync" - "time" - - "netstack/sleep" - "netstack/tcpip" -) - -const linkAddrCacheSize = 512 // max cache entries - -// linkAddrCache is a fixed-sized cache mapping IP addresses to link addresses. -// -// The entries are stored in a ring buffer, oldest entry replaced first. -// -// This struct is safe for concurrent use. -type linkAddrCache struct { - // ageLimit is how long a cache entry is valid for. - ageLimit time.Duration - - // resolutionTimeout is the amount of time to wait for a link request to - // resolve an address. - resolutionTimeout time.Duration - - // resolutionAttempts is the number of times an address is attempted to be - // resolved before failing. - resolutionAttempts int - - mu sync.Mutex - cache map[tcpip.FullAddress]*linkAddrEntry - next int // array index of next available entry - entries [linkAddrCacheSize]linkAddrEntry -} - -// entryState controls the state of a single entry in the cache. -type entryState int - -const ( - // incomplete means that there is an outstanding request to resolve the - // address. This is the initial state. 
- incomplete entryState = iota - // ready means that the address has been resolved and can be used. - ready - // failed means that address resolution timed out and the address - // could not be resolved. - failed - // expired means that the cache entry has expired and the address must be - // resolved again. - expired -) - -// String implements Stringer. -func (s entryState) String() string { - switch s { - case incomplete: - return "incomplete" - case ready: - return "ready" - case failed: - return "failed" - case expired: - return "expired" - default: - return fmt.Sprintf("unknown(%d)", s) - } -} - -// A linkAddrEntry is an entry in the linkAddrCache. -// This struct is thread-compatible. -type linkAddrEntry struct { - addr tcpip.FullAddress - linkAddr tcpip.LinkAddress - expiration time.Time - s entryState - - // wakers is a set of waiters for address resolution result. Anytime - // state transitions out of 'incomplete' these waiters are notified. - wakers map[*sleep.Waker]struct{} - - done chan struct{} -} - -func (e *linkAddrEntry) state() entryState { - if e.s != expired && time.Now().After(e.expiration) { - // Force the transition to ensure waiters are notified. - e.changeState(expired) - } - return e.s -} - -func (e *linkAddrEntry) changeState(ns entryState) { - if e.s == ns { - return - } - - // Validate state transition. - switch e.s { - case incomplete: - // All transitions are valid. - case ready, failed: - if ns != expired { - panic(fmt.Sprintf("invalid state transition from %s to %s", e.s, ns)) - } - case expired: - // Terminal state. - panic(fmt.Sprintf("invalid state transition from %s to %s", e.s, ns)) - default: - panic(fmt.Sprintf("invalid state: %s", e.s)) - } - - // Notify whoever is waiting on address resolution when transitioning - // out of 'incomplete'. 
- if e.s == incomplete { - for w := range e.wakers { - w.Assert() - } - e.wakers = nil - if e.done != nil { - close(e.done) - } - } - e.s = ns -} - -func (e *linkAddrEntry) addWaker(w *sleep.Waker) { - e.wakers[w] = struct{}{} -} - -func (e *linkAddrEntry) removeWaker(w *sleep.Waker) { - delete(e.wakers, w) -} - -// add adds a k -> v mapping to the cache. -func (c *linkAddrCache) add(k tcpip.FullAddress, v tcpip.LinkAddress) { - log.Printf("add link cache: %v-%v", k, v) - c.mu.Lock() - defer c.mu.Unlock() - - entry, ok := c.cache[k] - if ok { - s := entry.state() - if s != expired && entry.linkAddr == v { - // Disregard repeated calls. - return - } - // Check if entry is waiting for address resolution. - if s == incomplete { - entry.linkAddr = v - } else { - // Otherwise create a new entry to replace it. - entry = c.makeAndAddEntry(k, v) - } - } else { - entry = c.makeAndAddEntry(k, v) - } - - entry.changeState(ready) -} - -// makeAndAddEntry is a helper function to create and add a new -// entry to the cache map and evict older entry as needed. -func (c *linkAddrCache) makeAndAddEntry(k tcpip.FullAddress, v tcpip.LinkAddress) *linkAddrEntry { - // Take over the next entry. - entry := &c.entries[c.next] - if c.cache[entry.addr] == entry { - delete(c.cache, entry.addr) - } - - // Mark the soon-to-be-replaced entry as expired, just in case there is - // someone waiting for address resolution on it. - entry.changeState(expired) - - *entry = linkAddrEntry{ - addr: k, - linkAddr: v, - expiration: time.Now().Add(c.ageLimit), - wakers: make(map[*sleep.Waker]struct{}), - done: make(chan struct{}), - } - - c.cache[k] = entry - c.next = (c.next + 1) % len(c.entries) - return entry -} - -// get reports any known link address for k. 
-func (c *linkAddrCache) get(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) { - log.Printf("link addr get linkRes: %#v, addr: %+v", linkRes, k) - if linkRes != nil { - if addr, ok := linkRes.ResolveStaticAddress(k.Addr); ok { - return addr, nil, nil - } - } - - c.mu.Lock() - defer c.mu.Unlock() - // 尝试从缓存中得到MAC地址 - if entry, ok := c.cache[k]; ok { - switch s := entry.state(); s { - case expired: - case ready: - return entry.linkAddr, nil, nil - case failed: - return "", nil, tcpip.ErrNoLinkAddress - case incomplete: - // Address resolution is still in progress. - entry.addWaker(waker) - return "", entry.done, tcpip.ErrWouldBlock - default: - panic(fmt.Sprintf("invalid cache entry state: %s", s)) - } - } - - if linkRes == nil { - return "", nil, tcpip.ErrNoLinkAddress - } - - // Add 'incomplete' entry in the cache to mark that resolution is in progress. - e := c.makeAndAddEntry(k, "") - e.addWaker(waker) - - go c.startAddressResolution(k, linkRes, localAddr, linkEP, e.done) - - return "", e.done, tcpip.ErrWouldBlock -} - -// removeWaker removes a waker previously added through get(). -func (c *linkAddrCache) removeWaker(k tcpip.FullAddress, waker *sleep.Waker) { - c.mu.Lock() - defer c.mu.Unlock() - - if entry, ok := c.cache[k]; ok { - entry.removeWaker(waker) - } -} - -func (c *linkAddrCache) startAddressResolution(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, done <-chan struct{}) { - for i := 0; ; i++ { - // Send link request, then wait for the timeout limit and check - // whether the request succeeded. 
- linkRes.LinkAddressRequest(k.Addr, localAddr, linkEP) - - select { - case <-time.After(c.resolutionTimeout): - if stop := c.checkLinkRequest(k, i); stop { - return - } - case <-done: - return - } - } -} - -// checkLinkRequest checks whether previous attempt to resolve address has succeeded -// and mark the entry accordingly, e.g. ready, failed, etc. Return true if request -// can stop, false if another request should be sent. -func (c *linkAddrCache) checkLinkRequest(k tcpip.FullAddress, attempt int) bool { - c.mu.Lock() - defer c.mu.Unlock() - - entry, ok := c.cache[k] - if !ok { - // Entry was evicted from the cache. - return true - } - - switch s := entry.state(); s { - case ready, failed, expired: - // Entry was made ready by resolver or failed. Either way we're done. - return true - case incomplete: - if attempt+1 >= c.resolutionAttempts { - // Max number of retries reached, mark entry as failed. - entry.changeState(failed) - return true - } - // No response yet, need to send another ARP request. - return false - default: - panic(fmt.Sprintf("invalid cache entry state: %s", s)) - } -} - -func newLinkAddrCache(ageLimit, resolutionTimeout time.Duration, resolutionAttempts int) *linkAddrCache { - return &linkAddrCache{ - ageLimit: ageLimit, - resolutionTimeout: resolutionTimeout, - resolutionAttempts: resolutionAttempts, - cache: make(map[tcpip.FullAddress]*linkAddrEntry, linkAddrCacheSize), - } -} +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "fmt" + "log" + "sync" + "time" + + "netstack/sleep" + "netstack/tcpip" +) + +const linkAddrCacheSize = 512 // max cache entries + +// linkAddrCache is a fixed-sized cache mapping IP addresses to link addresses. +// +// The entries are stored in a ring buffer, oldest entry replaced first. +// +// This struct is safe for concurrent use. +type linkAddrCache struct { + // ageLimit is how long a cache entry is valid for. + ageLimit time.Duration + + // resolutionTimeout is the amount of time to wait for a link request to + // resolve an address. + resolutionTimeout time.Duration + + // resolutionAttempts is the number of times an address is attempted to be + // resolved before failing. + resolutionAttempts int + + mu sync.Mutex + cache map[tcpip.FullAddress]*linkAddrEntry + next int // array index of next available entry + entries [linkAddrCacheSize]linkAddrEntry +} + +// entryState controls the state of a single entry in the cache. +type entryState int + +const ( + // incomplete means that there is an outstanding request to resolve the + // address. This is the initial state. + incomplete entryState = iota + // ready means that the address has been resolved and can be used. + ready + // failed means that address resolution timed out and the address + // could not be resolved. + failed + // expired means that the cache entry has expired and the address must be + // resolved again. + expired +) + +// String implements Stringer. +func (s entryState) String() string { + switch s { + case incomplete: + return "incomplete" + case ready: + return "ready" + case failed: + return "failed" + case expired: + return "expired" + default: + return fmt.Sprintf("unknown(%d)", s) + } +} + +// A linkAddrEntry is an entry in the linkAddrCache. +// This struct is thread-compatible. 
+type linkAddrEntry struct { + addr tcpip.FullAddress + linkAddr tcpip.LinkAddress + expiration time.Time + s entryState + + // wakers is a set of waiters for address resolution result. Anytime + // state transitions out of 'incomplete' these waiters are notified. + wakers map[*sleep.Waker]struct{} + + done chan struct{} +} + +func (e *linkAddrEntry) state() entryState { + if e.s != expired && time.Now().After(e.expiration) { + // Force the transition to ensure waiters are notified. + e.changeState(expired) + } + return e.s +} + +func (e *linkAddrEntry) changeState(ns entryState) { + if e.s == ns { + return + } + + // Validate state transition. + switch e.s { + case incomplete: + // All transitions are valid. + case ready, failed: + if ns != expired { + panic(fmt.Sprintf("invalid state transition from %s to %s", e.s, ns)) + } + case expired: + // Terminal state. + panic(fmt.Sprintf("invalid state transition from %s to %s", e.s, ns)) + default: + panic(fmt.Sprintf("invalid state: %s", e.s)) + } + + // Notify whoever is waiting on address resolution when transitioning + // out of 'incomplete'. + if e.s == incomplete { + for w := range e.wakers { + w.Assert() + } + e.wakers = nil + if e.done != nil { + close(e.done) + } + } + e.s = ns +} + +func (e *linkAddrEntry) addWaker(w *sleep.Waker) { + e.wakers[w] = struct{}{} +} + +func (e *linkAddrEntry) removeWaker(w *sleep.Waker) { + delete(e.wakers, w) +} + +// add adds a k -> v mapping to the cache. +func (c *linkAddrCache) add(k tcpip.FullAddress, v tcpip.LinkAddress) { + log.Printf("add link cache: %v-%v", k, v) + c.mu.Lock() + defer c.mu.Unlock() + + entry, ok := c.cache[k] + if ok { + s := entry.state() + if s != expired && entry.linkAddr == v { + // Disregard repeated calls. + return + } + // Check if entry is waiting for address resolution. + if s == incomplete { + entry.linkAddr = v + } else { + // Otherwise create a new entry to replace it. 
+ entry = c.makeAndAddEntry(k, v) + } + } else { + entry = c.makeAndAddEntry(k, v) + } + + entry.changeState(ready) +} + +// makeAndAddEntry is a helper function to create and add a new +// entry to the cache map and evict older entry as needed. +func (c *linkAddrCache) makeAndAddEntry(k tcpip.FullAddress, v tcpip.LinkAddress) *linkAddrEntry { + // Take over the next entry. + entry := &c.entries[c.next] + if c.cache[entry.addr] == entry { + delete(c.cache, entry.addr) + } + + // Mark the soon-to-be-replaced entry as expired, just in case there is + // someone waiting for address resolution on it. + entry.changeState(expired) + + *entry = linkAddrEntry{ + addr: k, + linkAddr: v, + expiration: time.Now().Add(c.ageLimit), + wakers: make(map[*sleep.Waker]struct{}), + done: make(chan struct{}), + } + + c.cache[k] = entry + c.next = (c.next + 1) % len(c.entries) + return entry +} + +// get reports any known link address for k. +func (c *linkAddrCache) get(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) { + log.Printf("link addr get linkRes: %#v, addr: %+v", linkRes, k) + if linkRes != nil { + if addr, ok := linkRes.ResolveStaticAddress(k.Addr); ok { + return addr, nil, nil + } + } + + c.mu.Lock() + defer c.mu.Unlock() + // 尝试从缓存中得到MAC地址 + if entry, ok := c.cache[k]; ok { + switch s := entry.state(); s { + case expired: + case ready: + return entry.linkAddr, nil, nil + case failed: + return "", nil, tcpip.ErrNoLinkAddress + case incomplete: + // Address resolution is still in progress. + entry.addWaker(waker) + return "", entry.done, tcpip.ErrWouldBlock + default: + panic(fmt.Sprintf("invalid cache entry state: %s", s)) + } + } + + if linkRes == nil { + return "", nil, tcpip.ErrNoLinkAddress + } + + // Add 'incomplete' entry in the cache to mark that resolution is in progress. 
+ e := c.makeAndAddEntry(k, "") + e.addWaker(waker) + + go c.startAddressResolution(k, linkRes, localAddr, linkEP, e.done) + + return "", e.done, tcpip.ErrWouldBlock +} + +// removeWaker removes a waker previously added through get(). +func (c *linkAddrCache) removeWaker(k tcpip.FullAddress, waker *sleep.Waker) { + c.mu.Lock() + defer c.mu.Unlock() + + if entry, ok := c.cache[k]; ok { + entry.removeWaker(waker) + } +} + +func (c *linkAddrCache) startAddressResolution(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, done <-chan struct{}) { + for i := 0; ; i++ { + // Send link request, then wait for the timeout limit and check + // whether the request succeeded. + linkRes.LinkAddressRequest(k.Addr, localAddr, linkEP) + + select { + case <-time.After(c.resolutionTimeout): + if stop := c.checkLinkRequest(k, i); stop { + return + } + case <-done: + return + } + } +} + +// checkLinkRequest checks whether previous attempt to resolve address has succeeded +// and mark the entry accordingly, e.g. ready, failed, etc. Return true if request +// can stop, false if another request should be sent. +func (c *linkAddrCache) checkLinkRequest(k tcpip.FullAddress, attempt int) bool { + c.mu.Lock() + defer c.mu.Unlock() + + entry, ok := c.cache[k] + if !ok { + // Entry was evicted from the cache. + return true + } + + switch s := entry.state(); s { + case ready, failed, expired: + // Entry was made ready by resolver or failed. Either way we're done. + return true + case incomplete: + if attempt+1 >= c.resolutionAttempts { + // Max number of retries reached, mark entry as failed. + entry.changeState(failed) + return true + } + // No response yet, need to send another ARP request. 
+ return false + default: + panic(fmt.Sprintf("invalid cache entry state: %s", s)) + } +} + +func newLinkAddrCache(ageLimit, resolutionTimeout time.Duration, resolutionAttempts int) *linkAddrCache { + return &linkAddrCache{ + ageLimit: ageLimit, + resolutionTimeout: resolutionTimeout, + resolutionAttempts: resolutionAttempts, + cache: make(map[tcpip.FullAddress]*linkAddrEntry, linkAddrCacheSize), + } +} diff --git a/tcpip/stack/nic.go b/tcpip/stack/nic.go index 3fe526d..6c661c1 100644 --- a/tcpip/stack/nic.go +++ b/tcpip/stack/nic.go @@ -1,452 +1,452 @@ -package stack - -import ( - "log" - "netstack/ilist" - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/header" - "strings" - "sync" - "sync/atomic" -) - -// PrimaryEndpointBehavior 是端点首要行为的枚举 -type PrimaryEndpointBehavior int - -const ( - // CanBePrimaryEndpoint indicates the endpoint can be used as a primary - // endpoint for new connections with no local address. This is the - // default when calling NIC.AddAddress. - CanBePrimaryEndpoint PrimaryEndpointBehavior = iota - - // FirstPrimaryEndpoint indicates the endpoint should be the first - // primary endpoint considered. If there are multiple endpoints with - // this behavior, the most recently-added one will be first. - FirstPrimaryEndpoint - - // NeverPrimaryEndpoint indicates the endpoint should never be a - // primary endpoint. 
- NeverPrimaryEndpoint -) - -// 代表一个网卡对象 当我们创建好tap网卡对象后 我们使用NIC来代表它在我们自己的协议栈中的网卡对象 -type NIC struct { - stack *Stack - // 每个网卡的惟一标识号 - id tcpip.NICID - // 网卡名,可有可无 - name string - // 链路层端 - linkEP LinkEndpoint // 在链路层 这就是 fdbased.endpoint - - // 传输层的解复用 - demux *transportDemuxer - - mu sync.RWMutex - spoofing bool // 欺骗 - promiscuous bool // 混杂模式 - primary map[tcpip.NetworkProtocolNumber]*ilist.List // 网络协议号:网络端实现 - // 网络层端的记录 IP:网络端实现 - endpoints map[NetworkEndpointID]*referencedNetworkEndpoint - // 子网的记录 - subnets []tcpip.Subnet -} - -// 创建新的网卡对象 -func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint) *NIC { - return &NIC{ - stack: stack, - id: id, - name: name, - linkEP: ep, - demux: nil, // TODO 需要处理 - primary: make(map[tcpip.NetworkProtocolNumber]*ilist.List), - endpoints: make(map[NetworkEndpointID]*referencedNetworkEndpoint), - } -} - -func (n *NIC) attachLinkEndpoint() { - n.linkEP.Attach(n) -} - -// setPromiscuousMode enables or disables promiscuous mode. -// 设备网卡为混杂模式 -func (n *NIC) setPromiscuousMode(enable bool) { - n.mu.Lock() - n.promiscuous = enable - n.mu.Unlock() -} - -// 判断网卡是否开启混杂模式 -func (n *NIC) isPromiscuousMode() bool { - n.mu.RLock() - rv := n.promiscuous - n.mu.RUnlock() - return rv -} - -// 在NIC上添加addr地址,注册和初始化网络层协议 -// 相当于给网卡添加ip地址 -func (n *NIC) addAddressLocked(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, - peb PrimaryEndpointBehavior, replace bool) (*referencedNetworkEndpoint, *tcpip.Error) { - netProto, ok := n.stack.networkProtocols[protocol] - if !ok { - log.Println("添加失败") - return nil, tcpip.ErrUnknownProtocol - } - - // 比如netProto是ipv4 会调用ipv4.NewEndpoint 新建一个网络端 - ep, err := netProto.NewEndpoint(n.id, addr, n.stack, n, n.linkEP) - if err != nil { - return nil, err - } - log.Printf("基于[%d]协议 为 #%d 网卡 添加网络层实现 并绑定地址到: %s\n", netProto.Number(), n.id, ep.ID().LocalAddress) - - // 获取网络层端的id 其实就是ip地址 - id := *ep.ID() - if ref, ok := n.endpoints[id]; ok { - // 不是替换 且该id已经存在 - if !replace { - return nil, 
tcpip.ErrDuplicateAddress - } - n.removeEndpointLocked(ref) // 这里被调用的时候已经上过锁了 NOTE - } - - ref := &referencedNetworkEndpoint{ - refs: 1, - ep: ep, - nic: n, - protocol: protocol, - holdsInsertRef: true, - } - - // Set up cache if link address resolution exists for this protocol. - if n.linkEP.Capabilities()&CapabilityResolutionRequired != 0 { - if _, ok := n.stack.linkAddrResolvers[protocol]; ok { - ref.linkCache = n.stack - } - } - - // 注册该网络端 - n.endpoints[id] = ref - - l, ok := n.primary[protocol] - if !ok { - l = &ilist.List{} - n.primary[protocol] = l - } - - switch peb { - case CanBePrimaryEndpoint: - l.PushBack(ref) // 目前走这一支 - case FirstPrimaryEndpoint: - l.PushFront(ref) - } - return ref, nil -} - -func (n *NIC) AddAddress(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error { - return n.AddAddressWithOptions(protocol, addr, CanBePrimaryEndpoint) -} - -func (n *NIC) AddAddressWithOptions(protocol tcpip.NetworkProtocolNumber, - addr tcpip.Address, peb PrimaryEndpointBehavior) *tcpip.Error { - n.mu.Lock() - _, err := n.addAddressLocked(protocol, addr, peb, false) - n.mu.Unlock() - - return err -} - -// 删除一个网络端 -func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) { - id := *r.ep.ID() - - // Nothing to do if the reference has already been replaced with a - // different one. - if n.endpoints[id] != r { - return - } - - if r.holdsInsertRef { - panic("Reference count dropped to zero before being removed") - } - - delete(n.endpoints, id) - wasInList := r.Next() != nil || r.Prev() != nil || r == n.primary[r.protocol].Front() - if wasInList { - n.primary[r.protocol].Remove(r) - } - - r.ep.Close() -} - -func (n *NIC) removeEndpoint(r *referencedNetworkEndpoint) { - n.mu.Lock() - n.removeEndpointLocked(r) - n.mu.Unlock() -} - -// primaryEndpoint returns the primary endpoint of n for the given network -// protocol. 
-// 根据网络层协议号找到对应的网络层端 -func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedNetworkEndpoint { - n.mu.RLock() - defer n.mu.RUnlock() - - list := n.primary[protocol] - if list == nil { - return nil - } - - for e := list.Front(); e != nil; e = e.Next() { - r := e.(*referencedNetworkEndpoint) - // TODO: allow broadcast address when SO_BROADCAST is set. - switch r.ep.ID().LocalAddress { - case header.IPv4Broadcast, header.IPv4Any: - continue - } - if r.tryIncRef() { - return r - } - } - - return nil -} - -// 根据address参数找到对应的网络层端 -func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, - peb PrimaryEndpointBehavior) *referencedNetworkEndpoint { - id := NetworkEndpointID{address} - - n.mu.RLock() - ref := n.endpoints[id] - if ref != nil && !ref.tryIncRef() { // 尝试去使用这个网络端实现 - ref = nil - } - spoofing := n.spoofing - n.mu.RUnlock() - - if ref != nil || !spoofing { - return ref - } - - // Try again with the lock in exclusive mode. If we still can't get the - // endpoint, create a new "temporary" endpoint. It will only exist while - // there's a route through it. - n.mu.Lock() - ref = n.endpoints[id] - if ref == nil || !ref.tryIncRef() { - ref, _ = n.addAddressLocked(protocol, address, peb, true) - if ref != nil { - ref.holdsInsertRef = false - } - } - n.mu.Unlock() - return ref -} - -// AddSubnet adds a new subnet to n, so that it starts accepting packets -// targeted at the given address and network protocol. -// AddSubnet向n添加一个新子网,以便它开始接受针对给定地址和网络协议的数据包。 -func (n *NIC) AddSubnet(protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) { - n.mu.Lock() - n.subnets = append(n.subnets, subnet) - n.mu.Unlock() -} - -// RemoveSubnet removes the given subnet from n. -// 从n中删除一个子网 -func (n *NIC) RemoveSubnet(subnet tcpip.Subnet) { - n.mu.Lock() - - // Use the same underlying array. 
- tmp := n.subnets[:0] - for _, sub := range n.subnets { - if sub != subnet { - tmp = append(tmp, sub) - } - } - n.subnets = tmp - - n.mu.Unlock() -} - -// ContainsSubnet reports whether this NIC contains the given subnet. -// 判断 subnet 这个子网是否在该网卡下 -func (n *NIC) ContainsSubnet(subnet tcpip.Subnet) bool { - for _, s := range n.Subnets() { - if s == subnet { - return true - } - } - return false -} - -// Subnets returns the Subnets associated with this NIC. -// 获取该网卡的所有子网 -func (n *NIC) Subnets() []tcpip.Subnet { - n.mu.RLock() - defer n.mu.RUnlock() - sns := make([]tcpip.Subnet, 0, len(n.subnets)+len(n.endpoints)) - for nid := range n.endpoints { - sn, err := tcpip.NewSubnet(nid.LocalAddress, tcpip.AddressMask(strings.Repeat("\xff", len(nid.LocalAddress)))) - if err != nil { - // This should never happen as the mask has been carefully crafted to - // match the address. - panic("Invalid endpoint subnet: " + err.Error()) - } - sns = append(sns, sn) - } - return append(sns, n.subnets...) -} - -// 根据协议类型和目标地址,找出关联的Endpoint -func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *referencedNetworkEndpoint { - id := NetworkEndpointID{dst} - - n.mu.RLock() - if ref, ok := n.endpoints[id]; ok && ref.tryIncRef() { - log.Println("找到了目标网络层实现: ", id.LocalAddress) - n.mu.RUnlock() - return ref - } - - promiscuous := n.promiscuous - // Check if the packet is for a subnet this NIC cares about. - if !promiscuous { - for _, sn := range n.subnets { - if sn.Contains(dst) { - promiscuous = true - break - } - } - } - n.mu.RUnlock() - if promiscuous { - // Try again with the lock in exclusive mode. If we still can't - // get the endpoint, create a new "temporary" one. It will only - // exist while there's a route through it. 
- n.mu.Lock() - if ref, ok := n.endpoints[id]; ok && ref.tryIncRef() { - n.mu.Unlock() - return ref - } - ref, err := n.addAddressLocked(protocol, dst, CanBePrimaryEndpoint, true) - n.mu.Unlock() - if err == nil { - ref.holdsInsertRef = false - return ref - } - } - - return nil -} - -// DeliverNetworkPacket 当 NIC 从物理接口接收数据包时,将调用函数 DeliverNetworkPacket,用来分发网络层数据包。 -// 比如 protocol 是 arp 协议号,那么会找到arp.HandlePacket来处理数据报。 -// 简单来说就是根据网络层协议和目的地址来找到相应的网络层端,将网络层数据发给它, -// 当前实现的网络层协议有 arp、ipv4 和 ipv6。 -func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, - protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { - netProto, ok := n.stack.networkProtocols[protocol] - if !ok { - n.stack.stats.UnknownProtocolRcvdPackets.Increment() - return - } - - if netProto.Number() == header.IPv4ProtocolNumber || netProto.Number() == header.IPv6ProtocolNumber { - n.stack.stats.IP.PacketsReceived.Increment() - } - - if len(vv.First()) < netProto.MinimumPacketSize() { - n.stack.stats.MalformedRcvdPackets.Increment() - return - } - src, dst := netProto.ParseAddresses(vv.First()) - log.Printf("设备[%v]准备从 [%s] 向 [%s] 分发数据: %v\n", linkEP.LinkAddress(), src, dst, func() []byte { - if len(vv.ToView()) > 64 { - return vv.ToView()[:64] - } - return vv.ToView() - }()) - // 根据网络协议和数据包的目的地址,找到网络端 - // 然后将数据包分发给网络层 - if ref := n.getRef(protocol, dst); ref != nil { - r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref) - r.RemoteLinkAddress = remoteLinkAddr - ref.ep.HandlePacket(&r, vv) - ref.decRef() - - return - } - n.stack.stats.IP.InvalidAddressesReceived.Increment() -} - -// DeliverTransportPacket delivers packets to the appropriate -// transport protocol endpoint. 
-func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, vv buffer.VectorisedView) { - // 先查找协议栈是否注册了该传输层协议 - _, ok := n.stack.transportProtocols[protocol] - if !ok { - n.stack.stats.UnknownProtocolRcvdPackets.Increment() - return - } - log.Println("准备分发传输层数据报", n.stack.transportProtocols) - -} - -// DeliverTransportControlPacket delivers control packets to the -// appropriate transport protocol endpoint. -func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, - trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView) { - -} - -func (n *NIC) ID() tcpip.NICID { - return n.id -} - -// 网络端引用 -type referencedNetworkEndpoint struct { - ilist.Entry - refs int32 // 引用计数 - ep NetworkEndpoint // 网络端实现 - nic *NIC - protocol tcpip.NetworkProtocolNumber - - // linkCache is set if link address resolution is enabled for this - // protocol. Set to nil otherwise. - linkCache LinkAddressCache - linkAddrCache - - // holdsInsertRef is protected by the NIC's mutex. It indicates whether - // the reference count is biased by 1 due to the insertion of the - // endpoint. It is reset to false when RemoveAddress is called on the - // NIC. 
- holdsInsertRef bool -} - -func (r *referencedNetworkEndpoint) decRef() { - if atomic.AddInt32(&r.refs, -1) == 0 { - r.nic.removeEndpoint(r) - } -} - -func (r *referencedNetworkEndpoint) incRef() { - atomic.AddInt32(&r.refs, 1) -} - -func (r *referencedNetworkEndpoint) tryIncRef() bool { - for { - v := atomic.LoadInt32(&r.refs) - if v == 0 { - return false - } - - if atomic.CompareAndSwapInt32(&r.refs, v, v+1) { - return true - } - } -} +package stack + +import ( + "log" + "netstack/ilist" + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/header" + "strings" + "sync" + "sync/atomic" +) + +// PrimaryEndpointBehavior 是端点首要行为的枚举 +type PrimaryEndpointBehavior int + +const ( + // CanBePrimaryEndpoint indicates the endpoint can be used as a primary + // endpoint for new connections with no local address. This is the + // default when calling NIC.AddAddress. + CanBePrimaryEndpoint PrimaryEndpointBehavior = iota + + // FirstPrimaryEndpoint indicates the endpoint should be the first + // primary endpoint considered. If there are multiple endpoints with + // this behavior, the most recently-added one will be first. + FirstPrimaryEndpoint + + // NeverPrimaryEndpoint indicates the endpoint should never be a + // primary endpoint. 
+ NeverPrimaryEndpoint +) + +// 代表一个网卡对象 当我们创建好tap网卡对象后 我们使用NIC来代表它在我们自己的协议栈中的网卡对象 +type NIC struct { + stack *Stack + // 每个网卡的惟一标识号 + id tcpip.NICID + // 网卡名,可有可无 + name string + // 链路层端 + linkEP LinkEndpoint // 在链路层 这就是 fdbased.endpoint + + // 传输层的解复用 + demux *transportDemuxer + + mu sync.RWMutex + spoofing bool // 欺骗 + promiscuous bool // 混杂模式 + primary map[tcpip.NetworkProtocolNumber]*ilist.List // 网络协议号:网络端实现 + // 网络层端的记录 IP:网络端实现 + endpoints map[NetworkEndpointID]*referencedNetworkEndpoint + // 子网的记录 + subnets []tcpip.Subnet +} + +// 创建新的网卡对象 +func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint) *NIC { + return &NIC{ + stack: stack, + id: id, + name: name, + linkEP: ep, + demux: nil, // TODO 需要处理 + primary: make(map[tcpip.NetworkProtocolNumber]*ilist.List), + endpoints: make(map[NetworkEndpointID]*referencedNetworkEndpoint), + } +} + +func (n *NIC) attachLinkEndpoint() { + n.linkEP.Attach(n) +} + +// setPromiscuousMode enables or disables promiscuous mode. +// 设备网卡为混杂模式 +func (n *NIC) setPromiscuousMode(enable bool) { + n.mu.Lock() + n.promiscuous = enable + n.mu.Unlock() +} + +// 判断网卡是否开启混杂模式 +func (n *NIC) isPromiscuousMode() bool { + n.mu.RLock() + rv := n.promiscuous + n.mu.RUnlock() + return rv +} + +// 在NIC上添加addr地址,注册和初始化网络层协议 +// 相当于给网卡添加ip地址 +func (n *NIC) addAddressLocked(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, + peb PrimaryEndpointBehavior, replace bool) (*referencedNetworkEndpoint, *tcpip.Error) { + netProto, ok := n.stack.networkProtocols[protocol] + if !ok { + log.Println("添加失败") + return nil, tcpip.ErrUnknownProtocol + } + + // 比如netProto是ipv4 会调用ipv4.NewEndpoint 新建一个网络端 + ep, err := netProto.NewEndpoint(n.id, addr, n.stack, n, n.linkEP) + if err != nil { + return nil, err + } + log.Printf("基于[%d]协议 为 #%d 网卡 添加网络层实现 并绑定地址到: %s\n", netProto.Number(), n.id, ep.ID().LocalAddress) + + // 获取网络层端的id 其实就是ip地址 + id := *ep.ID() + if ref, ok := n.endpoints[id]; ok { + // 不是替换 且该id已经存在 + if !replace { + return nil, 
tcpip.ErrDuplicateAddress + } + n.removeEndpointLocked(ref) // 这里被调用的时候已经上过锁了 NOTE + } + + ref := &referencedNetworkEndpoint{ + refs: 1, + ep: ep, + nic: n, + protocol: protocol, + holdsInsertRef: true, + } + + // Set up cache if link address resolution exists for this protocol. + if n.linkEP.Capabilities()&CapabilityResolutionRequired != 0 { + if _, ok := n.stack.linkAddrResolvers[protocol]; ok { + ref.linkCache = n.stack + } + } + + // 注册该网络端 + n.endpoints[id] = ref + + l, ok := n.primary[protocol] + if !ok { + l = &ilist.List{} + n.primary[protocol] = l + } + + switch peb { + case CanBePrimaryEndpoint: + l.PushBack(ref) // 目前走这一支 + case FirstPrimaryEndpoint: + l.PushFront(ref) + } + return ref, nil +} + +func (n *NIC) AddAddress(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error { + return n.AddAddressWithOptions(protocol, addr, CanBePrimaryEndpoint) +} + +func (n *NIC) AddAddressWithOptions(protocol tcpip.NetworkProtocolNumber, + addr tcpip.Address, peb PrimaryEndpointBehavior) *tcpip.Error { + n.mu.Lock() + _, err := n.addAddressLocked(protocol, addr, peb, false) + n.mu.Unlock() + + return err +} + +// 删除一个网络端 +func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) { + id := *r.ep.ID() + + // Nothing to do if the reference has already been replaced with a + // different one. + if n.endpoints[id] != r { + return + } + + if r.holdsInsertRef { + panic("Reference count dropped to zero before being removed") + } + + delete(n.endpoints, id) + wasInList := r.Next() != nil || r.Prev() != nil || r == n.primary[r.protocol].Front() + if wasInList { + n.primary[r.protocol].Remove(r) + } + + r.ep.Close() +} + +func (n *NIC) removeEndpoint(r *referencedNetworkEndpoint) { + n.mu.Lock() + n.removeEndpointLocked(r) + n.mu.Unlock() +} + +// primaryEndpoint returns the primary endpoint of n for the given network +// protocol. 
+// 根据网络层协议号找到对应的网络层端 +func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedNetworkEndpoint { + n.mu.RLock() + defer n.mu.RUnlock() + + list := n.primary[protocol] + if list == nil { + return nil + } + + for e := list.Front(); e != nil; e = e.Next() { + r := e.(*referencedNetworkEndpoint) + // TODO: allow broadcast address when SO_BROADCAST is set. + switch r.ep.ID().LocalAddress { + case header.IPv4Broadcast, header.IPv4Any: + continue + } + if r.tryIncRef() { + return r + } + } + + return nil +} + +// 根据address参数找到对应的网络层端 +func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, + peb PrimaryEndpointBehavior) *referencedNetworkEndpoint { + id := NetworkEndpointID{address} + + n.mu.RLock() + ref := n.endpoints[id] + if ref != nil && !ref.tryIncRef() { // 尝试去使用这个网络端实现 + ref = nil + } + spoofing := n.spoofing + n.mu.RUnlock() + + if ref != nil || !spoofing { + return ref + } + + // Try again with the lock in exclusive mode. If we still can't get the + // endpoint, create a new "temporary" endpoint. It will only exist while + // there's a route through it. + n.mu.Lock() + ref = n.endpoints[id] + if ref == nil || !ref.tryIncRef() { + ref, _ = n.addAddressLocked(protocol, address, peb, true) + if ref != nil { + ref.holdsInsertRef = false + } + } + n.mu.Unlock() + return ref +} + +// AddSubnet adds a new subnet to n, so that it starts accepting packets +// targeted at the given address and network protocol. +// AddSubnet向n添加一个新子网,以便它开始接受针对给定地址和网络协议的数据包。 +func (n *NIC) AddSubnet(protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) { + n.mu.Lock() + n.subnets = append(n.subnets, subnet) + n.mu.Unlock() +} + +// RemoveSubnet removes the given subnet from n. +// 从n中删除一个子网 +func (n *NIC) RemoveSubnet(subnet tcpip.Subnet) { + n.mu.Lock() + + // Use the same underlying array. 
+ tmp := n.subnets[:0] + for _, sub := range n.subnets { + if sub != subnet { + tmp = append(tmp, sub) + } + } + n.subnets = tmp + + n.mu.Unlock() +} + +// ContainsSubnet reports whether this NIC contains the given subnet. +// 判断 subnet 这个子网是否在该网卡下 +func (n *NIC) ContainsSubnet(subnet tcpip.Subnet) bool { + for _, s := range n.Subnets() { + if s == subnet { + return true + } + } + return false +} + +// Subnets returns the Subnets associated with this NIC. +// 获取该网卡的所有子网 +func (n *NIC) Subnets() []tcpip.Subnet { + n.mu.RLock() + defer n.mu.RUnlock() + sns := make([]tcpip.Subnet, 0, len(n.subnets)+len(n.endpoints)) + for nid := range n.endpoints { + sn, err := tcpip.NewSubnet(nid.LocalAddress, tcpip.AddressMask(strings.Repeat("\xff", len(nid.LocalAddress)))) + if err != nil { + // This should never happen as the mask has been carefully crafted to + // match the address. + panic("Invalid endpoint subnet: " + err.Error()) + } + sns = append(sns, sn) + } + return append(sns, n.subnets...) +} + +// 根据协议类型和目标地址,找出关联的Endpoint +func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *referencedNetworkEndpoint { + id := NetworkEndpointID{dst} + + n.mu.RLock() + if ref, ok := n.endpoints[id]; ok && ref.tryIncRef() { + log.Println("找到了目标网络层实现: ", id.LocalAddress) + n.mu.RUnlock() + return ref + } + + promiscuous := n.promiscuous + // Check if the packet is for a subnet this NIC cares about. + if !promiscuous { + for _, sn := range n.subnets { + if sn.Contains(dst) { + promiscuous = true + break + } + } + } + n.mu.RUnlock() + if promiscuous { + // Try again with the lock in exclusive mode. If we still can't + // get the endpoint, create a new "temporary" one. It will only + // exist while there's a route through it. 
+ n.mu.Lock() + if ref, ok := n.endpoints[id]; ok && ref.tryIncRef() { + n.mu.Unlock() + return ref + } + ref, err := n.addAddressLocked(protocol, dst, CanBePrimaryEndpoint, true) + n.mu.Unlock() + if err == nil { + ref.holdsInsertRef = false + return ref + } + } + + return nil +} + +// DeliverNetworkPacket 当 NIC 从物理接口接收数据包时,将调用函数 DeliverNetworkPacket,用来分发网络层数据包。 +// 比如 protocol 是 arp 协议号,那么会找到arp.HandlePacket来处理数据报。 +// 简单来说就是根据网络层协议和目的地址来找到相应的网络层端,将网络层数据发给它, +// 当前实现的网络层协议有 arp、ipv4 和 ipv6。 +func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, + protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { + netProto, ok := n.stack.networkProtocols[protocol] + if !ok { + n.stack.stats.UnknownProtocolRcvdPackets.Increment() + return + } + + if netProto.Number() == header.IPv4ProtocolNumber || netProto.Number() == header.IPv6ProtocolNumber { + n.stack.stats.IP.PacketsReceived.Increment() + } + + if len(vv.First()) < netProto.MinimumPacketSize() { + n.stack.stats.MalformedRcvdPackets.Increment() + return + } + src, dst := netProto.ParseAddresses(vv.First()) + log.Printf("设备[%v]准备从 [%s] 向 [%s] 分发数据: %v\n", linkEP.LinkAddress(), src, dst, func() []byte { + if len(vv.ToView()) > 64 { + return vv.ToView()[:64] + } + return vv.ToView() + }()) + // 根据网络协议和数据包的目的地址,找到网络端 + // 然后将数据包分发给网络层 + if ref := n.getRef(protocol, dst); ref != nil { + r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref) + r.RemoteLinkAddress = remoteLinkAddr + ref.ep.HandlePacket(&r, vv) + ref.decRef() + + return + } + n.stack.stats.IP.InvalidAddressesReceived.Increment() +} + +// DeliverTransportPacket delivers packets to the appropriate +// transport protocol endpoint. 
+func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, vv buffer.VectorisedView) { + // 先查找协议栈是否注册了该传输层协议 + _, ok := n.stack.transportProtocols[protocol] + if !ok { + n.stack.stats.UnknownProtocolRcvdPackets.Increment() + return + } + log.Println("准备分发传输层数据报", n.stack.transportProtocols) + +} + +// DeliverTransportControlPacket delivers control packets to the +// appropriate transport protocol endpoint. +func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, + trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView) { + +} + +func (n *NIC) ID() tcpip.NICID { + return n.id +} + +// 网络端引用 +type referencedNetworkEndpoint struct { + ilist.Entry + refs int32 // 引用计数 + ep NetworkEndpoint // 网络端实现 + nic *NIC + protocol tcpip.NetworkProtocolNumber + + // linkCache is set if link address resolution is enabled for this + // protocol. Set to nil otherwise. + linkCache LinkAddressCache + linkAddrCache + + // holdsInsertRef is protected by the NIC's mutex. It indicates whether + // the reference count is biased by 1 due to the insertion of the + // endpoint. It is reset to false when RemoveAddress is called on the + // NIC. 
+ holdsInsertRef bool +} + +func (r *referencedNetworkEndpoint) decRef() { + if atomic.AddInt32(&r.refs, -1) == 0 { + r.nic.removeEndpoint(r) + } +} + +func (r *referencedNetworkEndpoint) incRef() { + atomic.AddInt32(&r.refs, 1) +} + +func (r *referencedNetworkEndpoint) tryIncRef() bool { + for { + v := atomic.LoadInt32(&r.refs) + if v == 0 { + return false + } + + if atomic.CompareAndSwapInt32(&r.refs, v, v+1) { + return true + } + } +} diff --git a/tcpip/stack/registration.go b/tcpip/stack/registration.go index ed2f2b3..71f4bcb 100644 --- a/tcpip/stack/registration.go +++ b/tcpip/stack/registration.go @@ -1,283 +1,283 @@ -package stack - -import ( - "netstack/sleep" - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/waiter" - "sync" -) - -const ( - CapabilityChecksumOffload LinkEndpointCapabilities = 1 << iota - CapabilityResolutionRequired - CapabilitySaveRestore - CapabilityDisconnectOK - CapabilityLoopback -) - -// ====================链路层相关============================== - -// 所谓 io 就是数据的输入输出,对于网卡来说就是接收或发送数据, -// 接收意味着对以太网帧解封装和提交给网络层,发送意味着对上层数据的封装和写入网卡 - -// 链路层接口 -type LinkEndpoint interface { - // MTU是此端点的最大传输单位。这通常由支持物理网络决定; - // 当这种物理网络不存在时,限制通常为64k,其中包括IP数据包的最大大小。 - MTU() uint32 - - // Capabilities返回链路层端点支持的功能集。 - Capabilities() LinkEndpointCapabilities - - // MaxHeaderLength 返回数据链接(和较低级别的图层组合)标头可以具有的最大大小。 - // 较高级别使用此信息来保留它们正在构建的数据包前面预留空间。 - MaxHeaderLength() uint16 - - // 本地链路层地址 - LinkAddress() tcpip.LinkAddress - - // 要参与透明桥接,LinkEndpoint实现应调用eth.Encode, - // 并将header.EthernetFields.SrcAddr设置为r.LocalLinkAddress(如果已提供)。 - WritePacket(r *Route, hdr buffer.Prependable, payload buffer.VectorisedView, - protocol tcpip.NetworkProtocolNumber) *tcpip.Error - - // Attach 将数据链路层端点附加到协议栈的网络层调度程序。 - Attach(dispatcher NetworkDispatcher) - - // 是否已经添加了网络层调度器 - IsAttached() bool -} - -// LinkAddressResolver 是对可以解析链接地址的 NetworkProtocol 的扩展 TODO 需要解读 -type LinkAddressResolver interface { - LinkAddressRequest(addr, localAddr tcpip.Address, linkEP LinkEndpoint) 
*tcpip.Error - - ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) - - LinkAddressProtocol() tcpip.NetworkProtocolNumber -} - -// A LinkAddressCache caches link addresses. -type LinkAddressCache interface { - // CheckLocalAddress determines if the given local address exists, and if it - // does not exist. - CheckLocalAddress(nicid tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID - - // AddLinkAddress adds a link address to the cache. - AddLinkAddress(nicid tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) - - // GetLinkAddress looks up the cache to translate address to link address (e.g. IP -> MAC). - // If the LinkEndpoint requests address resolution and there is a LinkAddressResolver - // registered with the network protocol, the cache attempts to resolve the address - // and returns ErrWouldBlock. Waker is notified when address resolution is - // complete (success or not). - // - // If address resolution is required, ErrNoLinkAddress and a notification channel is - // returned for the top level caller to block. Channel is closed once address resolution - // is complete (success or not). - GetLinkAddress(nicid tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, w *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) - - // RemoveWaker removes a waker that has been added in GetLinkAddress(). 
- RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) -} - -type NetworkDispatcher interface { - DeliverNetworkPacket(linkEP LinkEndpoint, dstLinkAddr, srcLinkAddr tcpip.LinkAddress, - protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) -} - -type LinkEndpointCapabilities uint - -// TransportProtocolFactory 传输层实现工厂 -type TransportProtocolFactory func() TransportProtocol - -// NetworkProtocolFactory 网络层实现工厂 -type NetworkProtocolFactory func() NetworkProtocol - -var ( - // 以下两个map需要在init函数中注册 - // 传输层协议的注册存储结构 - transportProtocols = make(map[string]TransportProtocolFactory) - // 网络层协议的注册存储结构 - networkProtocols = make(map[string]NetworkProtocolFactory) - - linkEPMu sync.RWMutex - nextLinkEndpointID tcpip.LinkEndpointID = 1 - linkEndpoints = make(map[tcpip.LinkEndpointID]LinkEndpoint) // 设备注册表 设备号:设备实现 -) - -// ==============================网络层相关============================== -type NetworkProtocol interface { - // 网络协议版本号 - Number() tcpip.NetworkProtocolNumber - - // MinimumPacketSize returns the minimum valid packet size of this - // network protocol. The stack automatically drops any packets smaller - // than this targeted at this protocol. - MinimumPacketSize() int - - // ParsePorts returns the source and destination addresses stored in a - // packet of this protocol. - ParseAddresses(v buffer.View) (src, dst tcpip.Address) - - // 新建一个网络终端 比如 ipv4 或者 ipv6 的一个实现 - NewEndpoint(nicid tcpip.NICID, addr tcpip.Address, linkAddrCache LinkAddressCache, - dispatcher TransportDispatcher, sender LinkEndpoint) (NetworkEndpoint, *tcpip.Error) - - // SetOption allows enabling/disabling protocol specific features. - // SetOption returns an error if the option is not supported or the - // provided option value is invalid. - SetOption(option interface{}) *tcpip.Error - - // Option allows retrieving protocol specific option values. - // Option returns an error if the option is not supported or the - // provided option value is invalid. 
- Option(option interface{}) *tcpip.Error -} - -// NetworkEndpoint是需要由网络层协议(例如,ipv4,ipv6)的端点实现的接口 -type NetworkEndpoint interface { - // DefaultTTL is the default time-to-live value (or hop limit, in ipv6) - // for this endpoint. - DefaultTTL() uint8 - - // MTU is the maximum transmission unit for this endpoint. This is - // generally calculated as the MTU of the underlying data link endpoint - // minus the network endpoint max header length. - MTU() uint32 - - // Capabilities returns the set of capabilities supported by the - // underlying link-layer endpoint. - Capabilities() LinkEndpointCapabilities - - // MaxHeaderLength returns the maximum size the network (and lower - // level layers combined) headers can have. Higher levels use this - // information to reserve space in the front of the packets they're - // building. - MaxHeaderLength() uint16 - - // WritePacket writes a packet to the given destination address and - // protocol. - WritePacket(r *Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error - - // ID returns the network protocol endpoint ID. - ID() *NetworkEndpointID - - // NICID returns the id of the NIC this endpoint belongs to. - NICID() tcpip.NICID - - // HandlePacket is called by the link layer when new packets arrive to - // this network endpoint. - HandlePacket(r *Route, vv buffer.VectorisedView) - - // Close is called when the endpoint is reomved from a stack. 
- Close() -} - -type NetworkEndpointID struct { - LocalAddress tcpip.Address -} - -// ==============================传输层相关============================== - -// TransportEndpointID 是某个传输层实现的标识 -type TransportEndpointID struct { - LocalPort uint16 - LocalAddress tcpip.Address - remotePort uint16 - RemoteAddress tcpip.Address -} - -// ControlType 是网络层控制消息的类型 -type ControlType int - -const ( - ControlPacketTooBig ControlType = iota - ControlPortUnreachable - ControlUnknown -) - -// TransportEndpoint 传输层实现接口 -type TransportEndpoint interface { - HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) - HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView) -} - -// TransportProtocol 传输层协议 TCP OR UDP -type TransportProtocol interface { - // Number returns the transport protocol number. - Number() tcpip.TransportProtocolNumber - - // NewEndpoint creates a new endpoint of the transport protocol. - NewEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) - - // MinimumPacketSize returns the minimum valid packet size of this - // transport protocol. The stack automatically drops any packets smaller - // than this targeted at this protocol. - MinimumPacketSize() int - - // ParsePorts returns the source and destination ports stored in a - // packet of this protocol. - ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) - - // HandleUnknownDestinationPacket handles packets targeted at this - // protocol but that don't match any existing endpoint. For example, - // it is targeted at a port that have no listeners. - // - // The return value indicates whether the packet was well-formed (for - // stats purposes only). - HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) bool - - // SetOption allows enabling/disabling protocol specific features. 
- // SetOption returns an error if the option is not supported or the - // provided option value is invalid. - SetOption(option interface{}) *tcpip.Error - - // Option allows retrieving protocol specific option values. - // Option returns an error if the option is not supported or the - // provided option value is invalid. - Option(option interface{}) *tcpip.Error -} - -// TransportDispatcher 传输层调度器 -type TransportDispatcher interface { - // DeliverTransportPacket delivers packets to the appropriate - // transport protocol endpoint. - DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, vv buffer.VectorisedView) - - // DeliverTransportControlPacket delivers control packets to the - // appropriate transport protocol endpoint. - DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, - trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView) -} - -// RegisterTransportProtocolFactory 注册一个新的传输层协议工厂 -func RegisterTransportProtocolFactory(name string, p TransportProtocolFactory) { - transportProtocols[name] = p -} - -// RegisterNetworkProtocolFactory 注册一个新的网络协议工厂 -func RegisterNetworkProtocolFactory(name string, p NetworkProtocolFactory) { - networkProtocols[name] = p -} - -// RegisterLinkEndpoint 注册一个链路层设备 -func RegisterLinkEndpoint(linkEP LinkEndpoint) tcpip.LinkEndpointID { - linkEPMu.Lock() - defer linkEPMu.Unlock() - - v := nextLinkEndpointID - nextLinkEndpointID++ - - linkEndpoints[v] = linkEP - - return v -} - -func FindLinkEndpoint(id tcpip.LinkEndpointID) LinkEndpoint { - linkEPMu.RLock() - defer linkEPMu.RUnlock() - - return linkEndpoints[id] -} +package stack + +import ( + "netstack/sleep" + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/waiter" + "sync" +) + +const ( + CapabilityChecksumOffload LinkEndpointCapabilities = 1 << iota + CapabilityResolutionRequired + CapabilitySaveRestore + CapabilityDisconnectOK + CapabilityLoopback +) + +// 
====================链路层相关============================== + +// 所谓 io 就是数据的输入输出,对于网卡来说就是接收或发送数据, +// 接收意味着对以太网帧解封装和提交给网络层,发送意味着对上层数据的封装和写入网卡 + +// 链路层接口 +type LinkEndpoint interface { + // MTU是此端点的最大传输单位。这通常由支持物理网络决定; + // 当这种物理网络不存在时,限制通常为64k,其中包括IP数据包的最大大小。 + MTU() uint32 + + // Capabilities返回链路层端点支持的功能集。 + Capabilities() LinkEndpointCapabilities + + // MaxHeaderLength 返回数据链接(和较低级别的图层组合)标头可以具有的最大大小。 + // 较高级别使用此信息来保留它们正在构建的数据包前面预留空间。 + MaxHeaderLength() uint16 + + // 本地链路层地址 + LinkAddress() tcpip.LinkAddress + + // 要参与透明桥接,LinkEndpoint实现应调用eth.Encode, + // 并将header.EthernetFields.SrcAddr设置为r.LocalLinkAddress(如果已提供)。 + WritePacket(r *Route, hdr buffer.Prependable, payload buffer.VectorisedView, + protocol tcpip.NetworkProtocolNumber) *tcpip.Error + + // Attach 将数据链路层端点附加到协议栈的网络层调度程序。 + Attach(dispatcher NetworkDispatcher) + + // 是否已经添加了网络层调度器 + IsAttached() bool +} + +// LinkAddressResolver 是对可以解析链接地址的 NetworkProtocol 的扩展 TODO 需要解读 +type LinkAddressResolver interface { + LinkAddressRequest(addr, localAddr tcpip.Address, linkEP LinkEndpoint) *tcpip.Error + + ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) + + LinkAddressProtocol() tcpip.NetworkProtocolNumber +} + +// A LinkAddressCache caches link addresses. +type LinkAddressCache interface { + // CheckLocalAddress determines if the given local address exists, and if it + // does not exist. + CheckLocalAddress(nicid tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID + + // AddLinkAddress adds a link address to the cache. + AddLinkAddress(nicid tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) + + // GetLinkAddress looks up the cache to translate address to link address (e.g. IP -> MAC). + // If the LinkEndpoint requests address resolution and there is a LinkAddressResolver + // registered with the network protocol, the cache attempts to resolve the address + // and returns ErrWouldBlock. 
Waker is notified when address resolution is + // complete (success or not). + // + // If address resolution is required, ErrNoLinkAddress and a notification channel is + // returned for the top level caller to block. Channel is closed once address resolution + // is complete (success or not). + GetLinkAddress(nicid tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, w *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) + + // RemoveWaker removes a waker that has been added in GetLinkAddress(). + RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) +} + +type NetworkDispatcher interface { + DeliverNetworkPacket(linkEP LinkEndpoint, dstLinkAddr, srcLinkAddr tcpip.LinkAddress, + protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) +} + +type LinkEndpointCapabilities uint + +// TransportProtocolFactory 传输层实现工厂 +type TransportProtocolFactory func() TransportProtocol + +// NetworkProtocolFactory 网络层实现工厂 +type NetworkProtocolFactory func() NetworkProtocol + +var ( + // 以下两个map需要在init函数中注册 + // 传输层协议的注册存储结构 + transportProtocols = make(map[string]TransportProtocolFactory) + // 网络层协议的注册存储结构 + networkProtocols = make(map[string]NetworkProtocolFactory) + + linkEPMu sync.RWMutex + nextLinkEndpointID tcpip.LinkEndpointID = 1 + linkEndpoints = make(map[tcpip.LinkEndpointID]LinkEndpoint) // 设备注册表 设备号:设备实现 +) + +// ==============================网络层相关============================== +type NetworkProtocol interface { + // 网络协议版本号 + Number() tcpip.NetworkProtocolNumber + + // MinimumPacketSize returns the minimum valid packet size of this + // network protocol. The stack automatically drops any packets smaller + // than this targeted at this protocol. + MinimumPacketSize() int + + // ParsePorts returns the source and destination addresses stored in a + // packet of this protocol. 
+ ParseAddresses(v buffer.View) (src, dst tcpip.Address) + + // 新建一个网络终端 比如 ipv4 或者 ipv6 的一个实现 + NewEndpoint(nicid tcpip.NICID, addr tcpip.Address, linkAddrCache LinkAddressCache, + dispatcher TransportDispatcher, sender LinkEndpoint) (NetworkEndpoint, *tcpip.Error) + + // SetOption allows enabling/disabling protocol specific features. + // SetOption returns an error if the option is not supported or the + // provided option value is invalid. + SetOption(option interface{}) *tcpip.Error + + // Option allows retrieving protocol specific option values. + // Option returns an error if the option is not supported or the + // provided option value is invalid. + Option(option interface{}) *tcpip.Error +} + +// NetworkEndpoint是需要由网络层协议(例如,ipv4,ipv6)的端点实现的接口 +type NetworkEndpoint interface { + // DefaultTTL is the default time-to-live value (or hop limit, in ipv6) + // for this endpoint. + DefaultTTL() uint8 + + // MTU is the maximum transmission unit for this endpoint. This is + // generally calculated as the MTU of the underlying data link endpoint + // minus the network endpoint max header length. + MTU() uint32 + + // Capabilities returns the set of capabilities supported by the + // underlying link-layer endpoint. + Capabilities() LinkEndpointCapabilities + + // MaxHeaderLength returns the maximum size the network (and lower + // level layers combined) headers can have. Higher levels use this + // information to reserve space in the front of the packets they're + // building. + MaxHeaderLength() uint16 + + // WritePacket writes a packet to the given destination address and + // protocol. + WritePacket(r *Route, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error + + // ID returns the network protocol endpoint ID. + ID() *NetworkEndpointID + + // NICID returns the id of the NIC this endpoint belongs to. 
+ NICID() tcpip.NICID + + // HandlePacket is called by the link layer when new packets arrive to + // this network endpoint. + HandlePacket(r *Route, vv buffer.VectorisedView) + + // Close is called when the endpoint is reomved from a stack. + Close() +} + +type NetworkEndpointID struct { + LocalAddress tcpip.Address +} + +// ==============================传输层相关============================== + +// TransportEndpointID 是某个传输层实现的标识 +type TransportEndpointID struct { + LocalPort uint16 + LocalAddress tcpip.Address + remotePort uint16 + RemoteAddress tcpip.Address +} + +// ControlType 是网络层控制消息的类型 +type ControlType int + +const ( + ControlPacketTooBig ControlType = iota + ControlPortUnreachable + ControlUnknown +) + +// TransportEndpoint 传输层实现接口 +type TransportEndpoint interface { + HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) + HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView) +} + +// TransportProtocol 传输层协议 TCP OR UDP +type TransportProtocol interface { + // Number returns the transport protocol number. + Number() tcpip.TransportProtocolNumber + + // NewEndpoint creates a new endpoint of the transport protocol. + NewEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) + + // MinimumPacketSize returns the minimum valid packet size of this + // transport protocol. The stack automatically drops any packets smaller + // than this targeted at this protocol. + MinimumPacketSize() int + + // ParsePorts returns the source and destination ports stored in a + // packet of this protocol. + ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) + + // HandleUnknownDestinationPacket handles packets targeted at this + // protocol but that don't match any existing endpoint. For example, + // it is targeted at a port that have no listeners. 
+ // + // The return value indicates whether the packet was well-formed (for + // stats purposes only). + HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) bool + + // SetOption allows enabling/disabling protocol specific features. + // SetOption returns an error if the option is not supported or the + // provided option value is invalid. + SetOption(option interface{}) *tcpip.Error + + // Option allows retrieving protocol specific option values. + // Option returns an error if the option is not supported or the + // provided option value is invalid. + Option(option interface{}) *tcpip.Error +} + +// TransportDispatcher 传输层调度器 +type TransportDispatcher interface { + // DeliverTransportPacket delivers packets to the appropriate + // transport protocol endpoint. + DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, vv buffer.VectorisedView) + + // DeliverTransportControlPacket delivers control packets to the + // appropriate transport protocol endpoint. 
+ DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, + trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView) +} + +// RegisterTransportProtocolFactory 注册一个新的传输层协议工厂 +func RegisterTransportProtocolFactory(name string, p TransportProtocolFactory) { + transportProtocols[name] = p +} + +// RegisterNetworkProtocolFactory 注册一个新的网络协议工厂 +func RegisterNetworkProtocolFactory(name string, p NetworkProtocolFactory) { + networkProtocols[name] = p +} + +// RegisterLinkEndpoint 注册一个链路层设备 +func RegisterLinkEndpoint(linkEP LinkEndpoint) tcpip.LinkEndpointID { + linkEPMu.Lock() + defer linkEPMu.Unlock() + + v := nextLinkEndpointID + nextLinkEndpointID++ + + linkEndpoints[v] = linkEP + + return v +} + +func FindLinkEndpoint(id tcpip.LinkEndpointID) LinkEndpoint { + linkEPMu.RLock() + defer linkEPMu.RUnlock() + + return linkEndpoints[id] +} diff --git a/tcpip/stack/route.go b/tcpip/stack/route.go index d75c854..b4ead51 100644 --- a/tcpip/stack/route.go +++ b/tcpip/stack/route.go @@ -1,113 +1,113 @@ -package stack - -import ( - "netstack/sleep" - "netstack/tcpip" - "netstack/tcpip/buffer" -) - -// 贯穿整个协议栈的路由,也就是在链路层和网络层都可以路由 -// 如果目标地址是链路层地址,那么在链路层路由, -// 如果目标地址是网络层地址,那么在网络层路由。 -type Route struct { - // 远端网络层地址 ipv4 or ipv6 地址 - RemoteAddress tcpip.Address - // 远端网卡MAC地址 - RemoteLinkAddress tcpip.LinkAddress - - // 本地网络层地址 ipv4 or ipv6 地址 - LocalAddress tcpip.Address - // 本地网卡MAC地址 - LocalLinkAddress tcpip.LinkAddress - - // 下一跳网络层地址 - NextHop tcpip.Address - - // 网络层协议号 - NetProto tcpip.NetworkProtocolNumber - - // 相关的网络终端 - ref *referencedNetworkEndpoint -} - -// 根据参数新建一个路由,并关联一个网络层端 -func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, - localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint) Route { - return Route{ - NetProto: netProto, - LocalAddress: localAddr, - LocalLinkAddress: localLinkAddr, - RemoteAddress: remoteAddr, - ref: ref, - } -} - -// NICID 
returns the id of the NIC from which this route originates. -func (r *Route) NICID() tcpip.NICID { - return r.ref.ep.NICID() -} - -// MaxHeaderLength forwards the call to the network endpoint's implementation. -func (r *Route) MaxHeaderLength() uint16 { - return r.ref.ep.MaxHeaderLength() -} - -// Stats returns a mutable copy of current stats. -func (r *Route) Stats() tcpip.Stats { - return r.ref.nic.stack.Stats() -} - -// Capabilities returns the link-layer capabilities of the route. -func (r *Route) Capabilities() LinkEndpointCapabilities { - return r.ref.ep.Capabilities() -} - -// RemoveWaker removes a waker that has been added in Resolve(). -func (r *Route) RemoveWaker(waker *sleep.Waker) { - nextAddr := r.NextHop - if nextAddr == "" { - nextAddr = r.RemoteAddress - } - r.ref.linkCache.RemoveWaker(r.ref.nic.ID(), nextAddr, waker) -} - -// IsResolutionRequired returns true if Resolve() must be called to resolve -// the link address before the this route can be written to. -func (r *Route) IsResolutionRequired() bool { - return r.ref.linkCache != nil && r.RemoteLinkAddress == "" -} - -// WritePacket writes the packet through the given route. -func (r *Route) WritePacket(hdr buffer.Prependable, payload buffer.VectorisedView, - protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { - err := r.ref.ep.WritePacket(r, hdr, payload, protocol, ttl) - if err == tcpip.ErrNoRoute { - r.Stats().IP.OutgoingPacketErrors.Increment() - } - return err -} - -// DefaultTTL returns the default TTL of the underlying network endpoint. -func (r *Route) DefaultTTL() uint8 { - return r.ref.ep.DefaultTTL() -} - -// MTU returns the MTU of the underlying network endpoint. -func (r *Route) MTU() uint32 { - return r.ref.ep.MTU() -} - -// Release frees all resources associated with the route. 
-func (r *Route) Release() { - if r.ref != nil { - r.ref.decRef() - r.ref = nil - } -} - -// Clone Clone a route such that the original one can be released and the new -// one will remain valid. -func (r *Route) Clone() Route { - r.ref.incRef() - return *r -} +package stack + +import ( + "netstack/sleep" + "netstack/tcpip" + "netstack/tcpip/buffer" +) + +// 贯穿整个协议栈的路由,也就是在链路层和网络层都可以路由 +// 如果目标地址是链路层地址,那么在链路层路由, +// 如果目标地址是网络层地址,那么在网络层路由。 +type Route struct { + // 远端网络层地址 ipv4 or ipv6 地址 + RemoteAddress tcpip.Address + // 远端网卡MAC地址 + RemoteLinkAddress tcpip.LinkAddress + + // 本地网络层地址 ipv4 or ipv6 地址 + LocalAddress tcpip.Address + // 本地网卡MAC地址 + LocalLinkAddress tcpip.LinkAddress + + // 下一跳网络层地址 + NextHop tcpip.Address + + // 网络层协议号 + NetProto tcpip.NetworkProtocolNumber + + // 相关的网络终端 + ref *referencedNetworkEndpoint +} + +// 根据参数新建一个路由,并关联一个网络层端 +func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, + localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint) Route { + return Route{ + NetProto: netProto, + LocalAddress: localAddr, + LocalLinkAddress: localLinkAddr, + RemoteAddress: remoteAddr, + ref: ref, + } +} + +// NICID returns the id of the NIC from which this route originates. +func (r *Route) NICID() tcpip.NICID { + return r.ref.ep.NICID() +} + +// MaxHeaderLength forwards the call to the network endpoint's implementation. +func (r *Route) MaxHeaderLength() uint16 { + return r.ref.ep.MaxHeaderLength() +} + +// Stats returns a mutable copy of current stats. +func (r *Route) Stats() tcpip.Stats { + return r.ref.nic.stack.Stats() +} + +// Capabilities returns the link-layer capabilities of the route. +func (r *Route) Capabilities() LinkEndpointCapabilities { + return r.ref.ep.Capabilities() +} + +// RemoveWaker removes a waker that has been added in Resolve(). 
+func (r *Route) RemoveWaker(waker *sleep.Waker) { + nextAddr := r.NextHop + if nextAddr == "" { + nextAddr = r.RemoteAddress + } + r.ref.linkCache.RemoveWaker(r.ref.nic.ID(), nextAddr, waker) +} + +// IsResolutionRequired returns true if Resolve() must be called to resolve +// the link address before the this route can be written to. +func (r *Route) IsResolutionRequired() bool { + return r.ref.linkCache != nil && r.RemoteLinkAddress == "" +} + +// WritePacket writes the packet through the given route. +func (r *Route) WritePacket(hdr buffer.Prependable, payload buffer.VectorisedView, + protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { + err := r.ref.ep.WritePacket(r, hdr, payload, protocol, ttl) + if err == tcpip.ErrNoRoute { + r.Stats().IP.OutgoingPacketErrors.Increment() + } + return err +} + +// DefaultTTL returns the default TTL of the underlying network endpoint. +func (r *Route) DefaultTTL() uint8 { + return r.ref.ep.DefaultTTL() +} + +// MTU returns the MTU of the underlying network endpoint. +func (r *Route) MTU() uint32 { + return r.ref.ep.MTU() +} + +// Release frees all resources associated with the route. +func (r *Route) Release() { + if r.ref != nil { + r.ref.decRef() + r.ref = nil + } +} + +// Clone Clone a route such that the original one can be released and the new +// one will remain valid. +func (r *Route) Clone() Route { + r.ref.incRef() + return *r +} diff --git a/tcpip/stack/stack.go b/tcpip/stack/stack.go index d9ac26c..b2e4b17 100644 --- a/tcpip/stack/stack.go +++ b/tcpip/stack/stack.go @@ -1,401 +1,401 @@ -package stack - -import ( - "log" - "netstack/sleep" - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/ports" - "netstack/waiter" - "sync" - "time" -) - -const ( - // ageLimit is set to the same cache stale time used in Linux. - ageLimit = 1 * time.Minute - // resolutionTimeout is set to the same ARP timeout used in Linux. 
- resolutionTimeout = 1 * time.Second - // resolutionAttempts is set to the same ARP retries used in Linux. - resolutionAttempts = 3 -) - -// TODO 需要解读 -type TCPProbeFunc func(s TcpEndpointState) - -// TODO 需要解读 -type TcpEndpointState struct { - // TODO 需要添加 -} - -// 传输层协议状态机 包含传输层协议以及默认处理方法 -type transportProtocolState struct { - proto TransportProtocol - defaultHandler func(*Route, TransportEndpointID, buffer.VectorisedView) bool -} - -// Stack 是一个网络堆栈,具有所有支持的协议、NIC 和路由表。 -type Stack struct { - transportProtocols map[tcpip.TransportProtocolNumber]*transportProtocolState // 各种传输层协议 - networkProtocols map[tcpip.NetworkProtocolNumber]NetworkProtocol // 各种网络层协议 - linkAddrResolvers map[tcpip.NetworkProtocolNumber]LinkAddressResolver // 各种链接解析器 - - demux *transportDemuxer // 传输层的复用器 - - stats tcpip.Stats // 网络栈的状态监测器 - - linkAddrCache *linkAddrCache // 链路层地址的缓存 - - mu sync.RWMutex - nics map[tcpip.NICID]*NIC // 所有的网卡设备 - forwarding bool // 是否正在转发 - - // route is the route table passed in by the user via SetRouteTable(), - // it is used by FindRoute() to build a route for a specific - // destination. - routeTable []tcpip.Route // 路由表 - - *ports.PortManager // 端口管理器 - - // If not nil, then any new endpoints will have this probe function - // invoked everytime they receive a TCP segment. - tcpProbeFunc TCPProbeFunc - - // clock is used to generate user-visible times. - clock tcpip.Clock -} - -// Options contains optional Stack configuration. -type Options struct { - // Clock is an optional clock source used for timestampping packets. - // - // If no Clock is specified, the clock source will be time.Now. - Clock tcpip.Clock - - // Stats are optional statistic counters. 
- Stats tcpip.Stats -} - -func New(network []string, transport []string, opts Options) *Stack { - clock := opts.Clock - if clock == nil { - clock = &tcpip.StdClock{} - } - - s := &Stack{ - transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), - networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), - linkAddrResolvers: make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver), - nics: make(map[tcpip.NICID]*NIC), - linkAddrCache: newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts), - PortManager: ports.NewPortManager(), - clock: clock, - stats: opts.Stats.FillIn(), - } - - // 添加指定的网络端协议 必须已经在init中注册过 - for _, name := range network { - // 先检查这个网络协议是否注册过工厂方法 - netProtoFactory, ok := networkProtocols[name] - if !ok { - continue // 没有就略过 - } - netProto := netProtoFactory() // 制造一个该型号协议的示实例 - s.networkProtocols[netProto.Number()] = netProto // 注册该型号的网络协议 - } - - // 添加指定的传输层协议 必已经在init中注册过 - for _, name := range transport { - transProtoFactory, ok := transportProtocols[name] - if !ok { - continue - } - transProto := transProtoFactory() // 新建一个传输层协议 - s.transportProtocols[transProto.Number()] = &transportProtocolState{ - proto: transProto, - } - } - // TODO 添加传输层分流器 - return s -} - -func (s *Stack) Stats() tcpip.Stats { - return s.stats -} - -// SetForwarding enables or disables the packet forwarding between NICs. -func (s *Stack) SetForwarding(enable bool) { - // TODO: Expose via /proc/sys/net/ipv4/ip_forward. - s.mu.Lock() - s.forwarding = enable - s.mu.Unlock() -} - -// Forwarding returns if the packet forwarding between NICs is enabled. -func (s *Stack) Forwarding() bool { - // TODO: Expose via /proc/sys/net/ipv4/ip_forward. - s.mu.RLock() - defer s.mu.RUnlock() - return s.forwarding -} - -// SetRouteTable assigns the route table to be used by this stack. It -// specifies which NIC to use for given destination address ranges. 
-func (s *Stack) SetRouteTable(table []tcpip.Route) { - s.mu.Lock() - defer s.mu.Unlock() - - s.routeTable = table -} - -// GetRouteTable returns the route table which is currently in use. -func (s *Stack) GetRouteTable() []tcpip.Route { - s.mu.Lock() - defer s.mu.Unlock() - return append([]tcpip.Route(nil), s.routeTable...) -} - -// NewEndpoint 根据给定的网络层协议号和传输层协议号新建一个传输层实现 -func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, - network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - t, ok := s.transportProtocols[transport] - if !ok { - return nil, tcpip.ErrUnknownProtocol - } - return t.proto.NewEndpoint(s, network, waiterQueue) // 新建一个传输层实现 -} - -// CreateNIC 根据给定的网卡号 和 链路层设备号 创建一个网卡对象 -func (s *Stack) CreateNIC(id tcpip.NICID, linkEP tcpip.LinkEndpointID) *tcpip.Error { - return s.createNIC(id, "", linkEP, true) -} - -// CreateNamedNIC creates a NIC with the provided id and link-layer endpoint, -// and a human-readable name. -func (s *Stack) CreateNamedNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID) *tcpip.Error { - return s.createNIC(id, name, linkEP, true) -} - -// 新建一个网卡对象,并且激活它 激活就是准备好熊网卡中读取和写入数据 -func (s *Stack) createNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, enable bool) *tcpip.Error { - ep := FindLinkEndpoint(linkEP) - if ep == nil { - return tcpip.ErrBadLinkEndpoint - } - - s.mu.Lock() - defer s.mu.Unlock() - - // Make sure id is unique - if _, ok := s.nics[id]; ok { - return tcpip.ErrDuplicateNICID - } - n := newNIC(s, id, name, ep) - - s.nics[id] = n - if enable { - n.attachLinkEndpoint() - } - - return nil -} - -// 给网卡添加ip地址 -func (s *Stack) AddAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error { - return s.AddAddressWithOptions(id, protocol, addr, CanBePrimaryEndpoint) -} - -func (s *Stack) AddAddressWithOptions(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, - addr tcpip.Address, peb PrimaryEndpointBehavior) 
*tcpip.Error { - s.mu.RLock() - defer s.mu.RUnlock() - - nic := s.nics[id] - if nic == nil { - return tcpip.ErrUnknownNICID - } - - return nic.AddAddressWithOptions(protocol, addr, peb) -} - -// AddSubnet adds a subnet range to the specified NIC. -func (s *Stack) AddSubnet(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) *tcpip.Error { - s.mu.RLock() - defer s.mu.RUnlock() - - if nic, ok := s.nics[id]; ok { - nic.AddSubnet(protocol, subnet) - return nil - } - - return tcpip.ErrUnknownNICID -} - -// RemoveSubnet removes the subnet range from the specified NIC. -func (s *Stack) RemoveSubnet(id tcpip.NICID, subnet tcpip.Subnet) *tcpip.Error { - s.mu.RLock() - defer s.mu.RUnlock() - - if nic, ok := s.nics[id]; ok { - nic.RemoveSubnet(subnet) - return nil - } - - return tcpip.ErrUnknownNICID -} - -// ContainsSubnet reports whether the specified NIC contains the specified -// subnet. -func (s *Stack) ContainsSubnet(id tcpip.NICID, subnet tcpip.Subnet) (bool, *tcpip.Error) { - s.mu.RLock() - defer s.mu.RUnlock() - - if nic, ok := s.nics[id]; ok { - return nic.ContainsSubnet(subnet), nil - } - - return false, tcpip.ErrUnknownNICID -} - -// 路由查找实现,比如当tcp建立连接时,会用该函数得到路由信息 -func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, - netProto tcpip.NetworkProtocolNumber) (Route, *tcpip.Error) { - s.mu.RLock() - defer s.mu.RUnlock() - - for i := range s.routeTable { - if (id != 0 && id != s.routeTable[i].NIC) || - (len(remoteAddr) != 0 && !s.routeTable[i].Match(remoteAddr)) { - continue - } - - nic := s.nics[s.routeTable[i].NIC] - if nic == nil { - continue - } - - var ref *referencedNetworkEndpoint - if len(localAddr) != 0 { - ref = nic.findEndpoint(netProto, localAddr, CanBePrimaryEndpoint) - } else { - ref = nic.primaryEndpoint(netProto) - } - if ref == nil { - continue - } - - if len(remoteAddr) == 0 { - // If no remote address was provided, then the route - // provided will refer to the link local address. 
- remoteAddr = ref.ep.ID().LocalAddress // 发回自己? TODO - } - - r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref) - r.NextHop = s.routeTable[i].Gateway - log.Println(r.LocalLinkAddress, r.LocalAddress, r.RemoteLinkAddress, r.RemoteAddress, r.NextHop) - return r, nil - } - - return Route{}, tcpip.ErrNoRoute -} - -// ===============本机链路层缓存实现================== - -// CheckLocalAddress 检查本地是否绑定过该网络层地址 注意 NICID 为0表示寻找本机所有网卡 -func (s *Stack) CheckLocalAddress(nicid tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID { - s.mu.RLock() - defer s.mu.RUnlock() - - if nicid != 0 { - nic := s.nics[nicid] // 先拿到网卡 - if nic == nil { - return 0 - } - - ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint) // 看看这张网卡是否绑定过这个地址 - if ref == nil { - return 0 - } - - ref.decRef() // 这个网络端实现使用结束 释放对它的占用 - - return nic.id - } - // Go through all the NICs. - for _, nic := range s.nics { - ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint) - if ref != nil { - ref.decRef() - return nic.id - } - } - return 0 -} - -func (s *Stack) AddLinkAddress(nicid tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) { - fullAddr := tcpip.FullAddress{NIC: nicid, Addr: addr} - s.linkAddrCache.add(fullAddr, linkAddr) -} - -func (s *Stack) GetLinkAddress(nicid tcpip.NICID, addr, localAddr tcpip.Address, - protocol tcpip.NetworkProtocolNumber, w *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) { - s.mu.RLock() - // 获取网卡对象 - nic := s.nics[nicid] - if nic == nil { - s.mu.RUnlock() - return "", nil, tcpip.ErrUnknownNICID - } - s.mu.RUnlock() - - fullAddr := tcpip.FullAddress{NIC: nicid, Addr: addr} - // 根据网络层协议号找到对应的地址解析协议 - linkRes := s.linkAddrResolvers[protocol] - return s.linkAddrCache.get(fullAddr, linkRes, localAddr, nic.linkEP, w) -} - -func (s *Stack) RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) { - s.mu.RLock() - defer s.mu.RUnlock() - - if nic := s.nics[nicid]; nic 
== nil { - fullAddr := tcpip.FullAddress{NIC: nicid, Addr: addr} - s.linkAddrCache.removeWaker(fullAddr, waker) - } -} - -// RegisterTransportEndpoint 协议栈或者NIC的分流器注册给定传输层端点。 -// 收到的与提供的id匹配的数据包将被传送到给定的端点;指定nic是可选的,但特定于nic的ID优先于全局ID。 -// 最终调用 demuxer.registerEndpoint 函数来实现注册。 -func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, - protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint) *tcpip.Error { - // TODO 需要实现 - return nil -} - -// UnregisterTransportEndpoint removes the endpoint with the given id from the -// stack transport dispatcher. -func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, - protocol tcpip.TransportProtocolNumber, id TransportEndpointID) { - -} - -// NetworkProtocolInstance returns the protocol instance in the stack for the -// specified network protocol. This method is public for protocol implementers -// and tests to use. -func (s *Stack) NetworkProtocolInstance(num tcpip.NetworkProtocolNumber) NetworkProtocol { - if p, ok := s.networkProtocols[num]; ok { - return p - } - return nil -} - -// TransportProtocolInstance returns the protocol instance in the stack for the -// specified transport protocol. This method is public for protocol implementers -// and tests to use. -func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) TransportProtocol { - if pState, ok := s.transportProtocols[num]; ok { - return pState.proto - } - return nil -} +package stack + +import ( + "log" + "netstack/sleep" + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/ports" + "netstack/waiter" + "sync" + "time" +) + +const ( + // ageLimit is set to the same cache stale time used in Linux. + ageLimit = 1 * time.Minute + // resolutionTimeout is set to the same ARP timeout used in Linux. + resolutionTimeout = 1 * time.Second + // resolutionAttempts is set to the same ARP retries used in Linux. 
+ resolutionAttempts = 3 +) + +// TODO 需要解读 +type TCPProbeFunc func(s TcpEndpointState) + +// TODO 需要解读 +type TcpEndpointState struct { + // TODO 需要添加 +} + +// 传输层协议状态机 包含传输层协议以及默认处理方法 +type transportProtocolState struct { + proto TransportProtocol + defaultHandler func(*Route, TransportEndpointID, buffer.VectorisedView) bool +} + +// Stack 是一个网络堆栈,具有所有支持的协议、NIC 和路由表。 +type Stack struct { + transportProtocols map[tcpip.TransportProtocolNumber]*transportProtocolState // 各种传输层协议 + networkProtocols map[tcpip.NetworkProtocolNumber]NetworkProtocol // 各种网络层协议 + linkAddrResolvers map[tcpip.NetworkProtocolNumber]LinkAddressResolver // 各种链接解析器 + + demux *transportDemuxer // 传输层的复用器 + + stats tcpip.Stats // 网络栈的状态监测器 + + linkAddrCache *linkAddrCache // 链路层地址的缓存 + + mu sync.RWMutex + nics map[tcpip.NICID]*NIC // 所有的网卡设备 + forwarding bool // 是否正在转发 + + // route is the route table passed in by the user via SetRouteTable(), + // it is used by FindRoute() to build a route for a specific + // destination. + routeTable []tcpip.Route // 路由表 + + *ports.PortManager // 端口管理器 + + // If not nil, then any new endpoints will have this probe function + // invoked everytime they receive a TCP segment. + tcpProbeFunc TCPProbeFunc + + // clock is used to generate user-visible times. + clock tcpip.Clock +} + +// Options contains optional Stack configuration. +type Options struct { + // Clock is an optional clock source used for timestampping packets. + // + // If no Clock is specified, the clock source will be time.Now. + Clock tcpip.Clock + + // Stats are optional statistic counters. 
+ Stats tcpip.Stats +} + +func New(network []string, transport []string, opts Options) *Stack { + clock := opts.Clock + if clock == nil { + clock = &tcpip.StdClock{} + } + + s := &Stack{ + transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), + networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), + linkAddrResolvers: make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver), + nics: make(map[tcpip.NICID]*NIC), + linkAddrCache: newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts), + PortManager: ports.NewPortManager(), + clock: clock, + stats: opts.Stats.FillIn(), + } + + // 添加指定的网络端协议 必须已经在init中注册过 + for _, name := range network { + // 先检查这个网络协议是否注册过工厂方法 + netProtoFactory, ok := networkProtocols[name] + if !ok { + continue // 没有就略过 + } + netProto := netProtoFactory() // 制造一个该型号协议的示实例 + s.networkProtocols[netProto.Number()] = netProto // 注册该型号的网络协议 + } + + // 添加指定的传输层协议 必已经在init中注册过 + for _, name := range transport { + transProtoFactory, ok := transportProtocols[name] + if !ok { + continue + } + transProto := transProtoFactory() // 新建一个传输层协议 + s.transportProtocols[transProto.Number()] = &transportProtocolState{ + proto: transProto, + } + } + // TODO 添加传输层分流器 + return s +} + +func (s *Stack) Stats() tcpip.Stats { + return s.stats +} + +// SetForwarding enables or disables the packet forwarding between NICs. +func (s *Stack) SetForwarding(enable bool) { + // TODO: Expose via /proc/sys/net/ipv4/ip_forward. + s.mu.Lock() + s.forwarding = enable + s.mu.Unlock() +} + +// Forwarding returns if the packet forwarding between NICs is enabled. +func (s *Stack) Forwarding() bool { + // TODO: Expose via /proc/sys/net/ipv4/ip_forward. + s.mu.RLock() + defer s.mu.RUnlock() + return s.forwarding +} + +// SetRouteTable assigns the route table to be used by this stack. It +// specifies which NIC to use for given destination address ranges. 
+func (s *Stack) SetRouteTable(table []tcpip.Route) { + s.mu.Lock() + defer s.mu.Unlock() + + s.routeTable = table +} + +// GetRouteTable returns the route table which is currently in use. +func (s *Stack) GetRouteTable() []tcpip.Route { + s.mu.Lock() + defer s.mu.Unlock() + return append([]tcpip.Route(nil), s.routeTable...) +} + +// NewEndpoint 根据给定的网络层协议号和传输层协议号新建一个传输层实现 +func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, + network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + t, ok := s.transportProtocols[transport] + if !ok { + return nil, tcpip.ErrUnknownProtocol + } + return t.proto.NewEndpoint(s, network, waiterQueue) // 新建一个传输层实现 +} + +// CreateNIC 根据给定的网卡号 和 链路层设备号 创建一个网卡对象 +func (s *Stack) CreateNIC(id tcpip.NICID, linkEP tcpip.LinkEndpointID) *tcpip.Error { + return s.createNIC(id, "", linkEP, true) +} + +// CreateNamedNIC creates a NIC with the provided id and link-layer endpoint, +// and a human-readable name. +func (s *Stack) CreateNamedNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID) *tcpip.Error { + return s.createNIC(id, name, linkEP, true) +} + +// 新建一个网卡对象,并且激活它 激活就是准备好熊网卡中读取和写入数据 +func (s *Stack) createNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, enable bool) *tcpip.Error { + ep := FindLinkEndpoint(linkEP) + if ep == nil { + return tcpip.ErrBadLinkEndpoint + } + + s.mu.Lock() + defer s.mu.Unlock() + + // Make sure id is unique + if _, ok := s.nics[id]; ok { + return tcpip.ErrDuplicateNICID + } + n := newNIC(s, id, name, ep) + + s.nics[id] = n + if enable { + n.attachLinkEndpoint() + } + + return nil +} + +// 给网卡添加ip地址 +func (s *Stack) AddAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error { + return s.AddAddressWithOptions(id, protocol, addr, CanBePrimaryEndpoint) +} + +func (s *Stack) AddAddressWithOptions(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, + addr tcpip.Address, peb PrimaryEndpointBehavior) 
*tcpip.Error { + s.mu.RLock() + defer s.mu.RUnlock() + + nic := s.nics[id] + if nic == nil { + return tcpip.ErrUnknownNICID + } + + return nic.AddAddressWithOptions(protocol, addr, peb) +} + +// AddSubnet adds a subnet range to the specified NIC. +func (s *Stack) AddSubnet(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) *tcpip.Error { + s.mu.RLock() + defer s.mu.RUnlock() + + if nic, ok := s.nics[id]; ok { + nic.AddSubnet(protocol, subnet) + return nil + } + + return tcpip.ErrUnknownNICID +} + +// RemoveSubnet removes the subnet range from the specified NIC. +func (s *Stack) RemoveSubnet(id tcpip.NICID, subnet tcpip.Subnet) *tcpip.Error { + s.mu.RLock() + defer s.mu.RUnlock() + + if nic, ok := s.nics[id]; ok { + nic.RemoveSubnet(subnet) + return nil + } + + return tcpip.ErrUnknownNICID +} + +// ContainsSubnet reports whether the specified NIC contains the specified +// subnet. +func (s *Stack) ContainsSubnet(id tcpip.NICID, subnet tcpip.Subnet) (bool, *tcpip.Error) { + s.mu.RLock() + defer s.mu.RUnlock() + + if nic, ok := s.nics[id]; ok { + return nic.ContainsSubnet(subnet), nil + } + + return false, tcpip.ErrUnknownNICID +} + +// 路由查找实现,比如当tcp建立连接时,会用该函数得到路由信息 +func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, + netProto tcpip.NetworkProtocolNumber) (Route, *tcpip.Error) { + s.mu.RLock() + defer s.mu.RUnlock() + + for i := range s.routeTable { + if (id != 0 && id != s.routeTable[i].NIC) || + (len(remoteAddr) != 0 && !s.routeTable[i].Match(remoteAddr)) { + continue + } + + nic := s.nics[s.routeTable[i].NIC] + if nic == nil { + continue + } + + var ref *referencedNetworkEndpoint + if len(localAddr) != 0 { + ref = nic.findEndpoint(netProto, localAddr, CanBePrimaryEndpoint) + } else { + ref = nic.primaryEndpoint(netProto) + } + if ref == nil { + continue + } + + if len(remoteAddr) == 0 { + // If no remote address was provided, then the route + // provided will refer to the link local address. 
+ remoteAddr = ref.ep.ID().LocalAddress // 发回自己? TODO + } + + r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref) + r.NextHop = s.routeTable[i].Gateway + log.Println(r.LocalLinkAddress, r.LocalAddress, r.RemoteLinkAddress, r.RemoteAddress, r.NextHop) + return r, nil + } + + return Route{}, tcpip.ErrNoRoute +} + +// ===============本机链路层缓存实现================== + +// CheckLocalAddress 检查本地是否绑定过该网络层地址 注意 NICID 为0表示寻找本机所有网卡 +func (s *Stack) CheckLocalAddress(nicid tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID { + s.mu.RLock() + defer s.mu.RUnlock() + + if nicid != 0 { + nic := s.nics[nicid] // 先拿到网卡 + if nic == nil { + return 0 + } + + ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint) // 看看这张网卡是否绑定过这个地址 + if ref == nil { + return 0 + } + + ref.decRef() // 这个网络端实现使用结束 释放对它的占用 + + return nic.id + } + // Go through all the NICs. + for _, nic := range s.nics { + ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint) + if ref != nil { + ref.decRef() + return nic.id + } + } + return 0 +} + +func (s *Stack) AddLinkAddress(nicid tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) { + fullAddr := tcpip.FullAddress{NIC: nicid, Addr: addr} + s.linkAddrCache.add(fullAddr, linkAddr) +} + +func (s *Stack) GetLinkAddress(nicid tcpip.NICID, addr, localAddr tcpip.Address, + protocol tcpip.NetworkProtocolNumber, w *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) { + s.mu.RLock() + // 获取网卡对象 + nic := s.nics[nicid] + if nic == nil { + s.mu.RUnlock() + return "", nil, tcpip.ErrUnknownNICID + } + s.mu.RUnlock() + + fullAddr := tcpip.FullAddress{NIC: nicid, Addr: addr} + // 根据网络层协议号找到对应的地址解析协议 + linkRes := s.linkAddrResolvers[protocol] + return s.linkAddrCache.get(fullAddr, linkRes, localAddr, nic.linkEP, w) +} + +func (s *Stack) RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) { + s.mu.RLock() + defer s.mu.RUnlock() + + if nic := s.nics[nicid]; nic 
== nil { + fullAddr := tcpip.FullAddress{NIC: nicid, Addr: addr} + s.linkAddrCache.removeWaker(fullAddr, waker) + } +} + +// RegisterTransportEndpoint 协议栈或者NIC的分流器注册给定传输层端点。 +// 收到的与提供的id匹配的数据包将被传送到给定的端点;指定nic是可选的,但特定于nic的ID优先于全局ID。 +// 最终调用 demuxer.registerEndpoint 函数来实现注册。 +func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, + protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint) *tcpip.Error { + // TODO 需要实现 + return nil +} + +// UnregisterTransportEndpoint removes the endpoint with the given id from the +// stack transport dispatcher. +func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, + protocol tcpip.TransportProtocolNumber, id TransportEndpointID) { + +} + +// NetworkProtocolInstance returns the protocol instance in the stack for the +// specified network protocol. This method is public for protocol implementers +// and tests to use. +func (s *Stack) NetworkProtocolInstance(num tcpip.NetworkProtocolNumber) NetworkProtocol { + if p, ok := s.networkProtocols[num]; ok { + return p + } + return nil +} + +// TransportProtocolInstance returns the protocol instance in the stack for the +// specified transport protocol. This method is public for protocol implementers +// and tests to use. 
+func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) TransportProtocol { + if pState, ok := s.transportProtocols[num]; ok { + return pState.proto + } + return nil +} diff --git a/tcpip/stack/stack_test.go b/tcpip/stack/stack_test.go index 5e44c2f..7bf3807 100644 --- a/tcpip/stack/stack_test.go +++ b/tcpip/stack/stack_test.go @@ -1,160 +1,160 @@ -package stack_test - -import ( - "log" - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/link/channel" - "netstack/tcpip/stack" - "testing" -) - -const ( - fakeNetHeaderLen = 12 - defaultMTU = 65536 -) - -type fakeNetworkEndpoint struct { - nicid tcpip.NICID - id stack.NetworkEndpointID - proto *fakeNetworkProtocol - dispatcher stack.TransportDispatcher - linkEP stack.LinkEndpoint -} - -func (f *fakeNetworkEndpoint) DefaultTTL() uint8 { - return 123 -} - -func (f *fakeNetworkEndpoint) MTU() uint32 { - return f.linkEP.MTU() - uint32(f.MaxHeaderLength()) -} - -func (f *fakeNetworkEndpoint) Capabilities() stack.LinkEndpointCapabilities { - return f.linkEP.Capabilities() -} - -func (f *fakeNetworkEndpoint) MaxHeaderLength() uint16 { - return f.linkEP.MaxHeaderLength() + fakeNetHeaderLen -} -func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, - protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { - b := hdr.Prepend(fakeNetHeaderLen) - copy(b[:4], []byte(r.RemoteAddress)) - copy(b[4:8], []byte(f.id.LocalAddress)) - b[8] = byte(protocol) - log.Println("写入网络层数据 下一层去往链路层", b, payload) - - return f.linkEP.WritePacket(r, hdr, payload, 114514) -} - -func (f *fakeNetworkEndpoint) ID() *stack.NetworkEndpointID { - return &f.id -} - -func (f *fakeNetworkEndpoint) NICID() tcpip.NICID { - return f.nicid -} - -func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) { - log.Println("执行这个函数 接下来它会去向传输层分发数据") -} - -func (f *fakeNetworkEndpoint) Close() {} - -// dst|src|payload -type fakeNetworkProtocol 
struct{} - -func (f *fakeNetworkProtocol) Number() tcpip.NetworkProtocolNumber { - return 114514 -} - -func (f *fakeNetworkProtocol) NewEndpoint(nicid tcpip.NICID, addr tcpip.Address, linkAddrCache stack.LinkAddressCache, - dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) { - return &fakeNetworkEndpoint{ - nicid: nicid, - id: stack.NetworkEndpointID{addr}, - proto: f, - dispatcher: dispatcher, - linkEP: linkEP, - }, nil -} - -func (f *fakeNetworkProtocol) MinimumPacketSize() int { - return fakeNetHeaderLen -} - -func (f *fakeNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { - return tcpip.Address(v[4:8]), tcpip.Address(v[0:4]) -} - -func (f *fakeNetworkProtocol) SetOption(option interface{}) *tcpip.Error { - return nil -} - -func (f *fakeNetworkProtocol) Option(option interface{}) *tcpip.Error { - return nil -} - -func init() { - stack.RegisterNetworkProtocolFactory("fakeNet", func() stack.NetworkProtocol { - return &fakeNetworkProtocol{} - }) -} - -func TestStackBase(t *testing.T) { - - myStack := stack.New([]string{"fakeNet"}, nil, stack.Options{}) - id1, ep1 := channel.New(10, defaultMTU, "00:15:5d:26:d7:a1") // 这是一个物理设备 - - if err := myStack.CreateNIC(1, id1); err != nil { // 将上面的物理设备抽象成我们的网卡对象 - panic(err) - } - myStack.AddAddress(1, 114514, "\x0a\xff\x01\x01") // 给网卡对象绑定一个IP地址 可以绑定多个 - - id2, _ := channel.New(10, defaultMTU, "50:5B:C2:D0:96:57") // 这是一个物理设备 - if err := myStack.CreateNIC(2, id2); err != nil { // 将上面的物理设备抽象成我们的网卡对象 - panic(err) - } - myStack.AddAddress(2, 114514, "\x0a\xff\x01\x02") // 给网卡对象绑定一个IP地址 可以绑定多个 - - buf := buffer.NewView(30) - for i := range buf { - buf[i] = 0 - } - // dst 10.255.1.2 - buf[0] = '\x0a' - buf[1] = '\xff' - buf[2] = '\x01' - buf[3] = '\x02' - // src 10.255.1.1 - buf[4] = '\x0a' - buf[5] = '\xff' - buf[6] = '\x01' - buf[7] = '\x01' - - myStack.SetRouteTable([]tcpip.Route{ - {"\x01", "\x01", "\x00", 1}, - {"\x00", "\x01", "\x00", 2}, - }) - - 
sendTo(t, myStack, tcpip.Address("\x0a\xff\x01\x02")) - - //log.Println(ep1.Drain()) - p := <-ep1.C - log.Println(p) -} - -func sendTo(t *testing.T, s *stack.Stack, addr tcpip.Address) { - r, err := s.FindRoute(0, "", addr, 114514) - if err != nil { - t.Fatalf("FindRoute failed: %v", err) - } - defer r.Release() - - hdr := buffer.NewPrependable(int(r.MaxHeaderLength())) - if err := r.WritePacket(hdr, buffer.VectorisedView{}, 10086, 123); err != nil { - t.Errorf("WritePacket failed: %v", err) - return - } -} +package stack_test + +import ( + "log" + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/link/channel" + "netstack/tcpip/stack" + "testing" +) + +const ( + fakeNetHeaderLen = 12 + defaultMTU = 65536 +) + +type fakeNetworkEndpoint struct { + nicid tcpip.NICID + id stack.NetworkEndpointID + proto *fakeNetworkProtocol + dispatcher stack.TransportDispatcher + linkEP stack.LinkEndpoint +} + +func (f *fakeNetworkEndpoint) DefaultTTL() uint8 { + return 123 +} + +func (f *fakeNetworkEndpoint) MTU() uint32 { + return f.linkEP.MTU() - uint32(f.MaxHeaderLength()) +} + +func (f *fakeNetworkEndpoint) Capabilities() stack.LinkEndpointCapabilities { + return f.linkEP.Capabilities() +} + +func (f *fakeNetworkEndpoint) MaxHeaderLength() uint16 { + return f.linkEP.MaxHeaderLength() + fakeNetHeaderLen +} +func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, hdr buffer.Prependable, payload buffer.VectorisedView, + protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error { + b := hdr.Prepend(fakeNetHeaderLen) + copy(b[:4], []byte(r.RemoteAddress)) + copy(b[4:8], []byte(f.id.LocalAddress)) + b[8] = byte(protocol) + log.Println("写入网络层数据 下一层去往链路层", b, payload) + + return f.linkEP.WritePacket(r, hdr, payload, 114514) +} + +func (f *fakeNetworkEndpoint) ID() *stack.NetworkEndpointID { + return &f.id +} + +func (f *fakeNetworkEndpoint) NICID() tcpip.NICID { + return f.nicid +} + +func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, vv buffer.VectorisedView) 
{ + log.Println("执行这个函数 接下来它会去向传输层分发数据") +} + +func (f *fakeNetworkEndpoint) Close() {} + +// dst|src|payload +type fakeNetworkProtocol struct{} + +func (f *fakeNetworkProtocol) Number() tcpip.NetworkProtocolNumber { + return 114514 +} + +func (f *fakeNetworkProtocol) NewEndpoint(nicid tcpip.NICID, addr tcpip.Address, linkAddrCache stack.LinkAddressCache, + dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint) (stack.NetworkEndpoint, *tcpip.Error) { + return &fakeNetworkEndpoint{ + nicid: nicid, + id: stack.NetworkEndpointID{addr}, + proto: f, + dispatcher: dispatcher, + linkEP: linkEP, + }, nil +} + +func (f *fakeNetworkProtocol) MinimumPacketSize() int { + return fakeNetHeaderLen +} + +func (f *fakeNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { + return tcpip.Address(v[4:8]), tcpip.Address(v[0:4]) +} + +func (f *fakeNetworkProtocol) SetOption(option interface{}) *tcpip.Error { + return nil +} + +func (f *fakeNetworkProtocol) Option(option interface{}) *tcpip.Error { + return nil +} + +func init() { + stack.RegisterNetworkProtocolFactory("fakeNet", func() stack.NetworkProtocol { + return &fakeNetworkProtocol{} + }) +} + +func TestStackBase(t *testing.T) { + + myStack := stack.New([]string{"fakeNet"}, nil, stack.Options{}) + id1, ep1 := channel.New(10, defaultMTU, "00:15:5d:26:d7:a1") // 这是一个物理设备 + + if err := myStack.CreateNIC(1, id1); err != nil { // 将上面的物理设备抽象成我们的网卡对象 + panic(err) + } + myStack.AddAddress(1, 114514, "\x0a\xff\x01\x01") // 给网卡对象绑定一个IP地址 可以绑定多个 + + id2, _ := channel.New(10, defaultMTU, "50:5B:C2:D0:96:57") // 这是一个物理设备 + if err := myStack.CreateNIC(2, id2); err != nil { // 将上面的物理设备抽象成我们的网卡对象 + panic(err) + } + myStack.AddAddress(2, 114514, "\x0a\xff\x01\x02") // 给网卡对象绑定一个IP地址 可以绑定多个 + + buf := buffer.NewView(30) + for i := range buf { + buf[i] = 0 + } + // dst 10.255.1.2 + buf[0] = '\x0a' + buf[1] = '\xff' + buf[2] = '\x01' + buf[3] = '\x02' + // src 10.255.1.1 + buf[4] = '\x0a' + buf[5] = '\xff' + buf[6] = 
'\x01' + buf[7] = '\x01' + + myStack.SetRouteTable([]tcpip.Route{ + {"\x01", "\x01", "\x00", 1}, + {"\x00", "\x01", "\x00", 2}, + }) + + sendTo(t, myStack, tcpip.Address("\x0a\xff\x01\x02")) + + //log.Println(ep1.Drain()) + p := <-ep1.C + log.Println(p) +} + +func sendTo(t *testing.T, s *stack.Stack, addr tcpip.Address) { + r, err := s.FindRoute(0, "", addr, 114514) + if err != nil { + t.Fatalf("FindRoute failed: %v", err) + } + defer r.Release() + + hdr := buffer.NewPrependable(int(r.MaxHeaderLength())) + if err := r.WritePacket(hdr, buffer.VectorisedView{}, 10086, 123); err != nil { + t.Errorf("WritePacket failed: %v", err) + return + } +} diff --git a/tcpip/stack/transport_demuxer.go b/tcpip/stack/transport_demuxer.go index 23c3964..8dde1e0 100644 --- a/tcpip/stack/transport_demuxer.go +++ b/tcpip/stack/transport_demuxer.go @@ -1,23 +1,23 @@ -package stack - -import ( - "netstack/tcpip" - "sync" -) - -// 网络层协议号和传输层协议号的组合 当作分流器的key值 -type protocolIDs struct { - network tcpip.NetworkProtocolNumber - transport tcpip.TransportProtocolNumber -} - -type transportEndpoints struct { - mu sync.RWMutex - endpoints map[TransportEndpointID]TransportEndpoint -} - -// transportDemuxer 解复用战队传输端点的数据包 -// 他执行两级解复用:首先基于网络层和传输协议 然后基于端点ID -type transportDemuxer struct { - protocol map[protocolIDs]*transportEndpoints -} +package stack + +import ( + "netstack/tcpip" + "sync" +) + +// 网络层协议号和传输层协议号的组合 当作分流器的key值 +type protocolIDs struct { + network tcpip.NetworkProtocolNumber + transport tcpip.TransportProtocolNumber +} + +type transportEndpoints struct { + mu sync.RWMutex + endpoints map[TransportEndpointID]TransportEndpoint +} + +// transportDemuxer 解复用战队传输端点的数据包 +// 他执行两级解复用:首先基于网络层和传输协议 然后基于端点ID +type transportDemuxer struct { + protocol map[protocolIDs]*transportEndpoints +} diff --git a/tcpip/tcpip.go b/tcpip/tcpip.go index 2d846dd..e3c573b 100644 --- a/tcpip/tcpip.go +++ b/tcpip/tcpip.go @@ -1,563 +1,563 @@ -package tcpip - -import ( - "errors" - "fmt" - "netstack/tcpip/buffer" 
- "netstack/waiter" - "reflect" - "strings" - "sync/atomic" -) - -type Error struct { - msg string - ignoreStats bool -} - -func (e *Error) String() string { - return e.msg -} - -func (e *Error) IgnoreStats() bool { - return e.ignoreStats -} - -var ( - ErrUnknownProtocol = &Error{msg: "unknown protocol"} - ErrUnknownNICID = &Error{msg: "unknown nic id"} - ErrUnknownProtocolOption = &Error{msg: "unknown option for protocol"} - ErrDuplicateNICID = &Error{msg: "duplicate nic id"} - ErrDuplicateAddress = &Error{msg: "duplicate address"} - ErrNoRoute = &Error{msg: "no route"} - ErrBadLinkEndpoint = &Error{msg: "bad link layer endpoint"} - ErrAlreadyBound = &Error{msg: "endpoint already bound", ignoreStats: true} - ErrInvalidEndpointState = &Error{msg: "endpoint is in invalid state"} - ErrAlreadyConnecting = &Error{msg: "endpoint is already connecting", ignoreStats: true} - ErrAlreadyConnected = &Error{msg: "endpoint is already connected", ignoreStats: true} - ErrNoPortAvailable = &Error{msg: "no ports are available"} - ErrPortInUse = &Error{msg: "port is in use"} - ErrBadLocalAddress = &Error{msg: "bad local address"} - ErrClosedForSend = &Error{msg: "endpoint is closed for send"} - ErrClosedForReceive = &Error{msg: "endpoint is closed for receive"} - ErrWouldBlock = &Error{msg: "operation would block", ignoreStats: true} - ErrConnectionRefused = &Error{msg: "connection was refused"} - ErrTimeout = &Error{msg: "operation timed out"} - ErrAborted = &Error{msg: "operation aborted"} - ErrConnectStarted = &Error{msg: "connection attempt started", ignoreStats: true} - ErrDestinationRequired = &Error{msg: "destination address is required"} - ErrNotSupported = &Error{msg: "operation not supported"} - ErrQueueSizeNotSupported = &Error{msg: "queue size querying not supported"} - ErrNotConnected = &Error{msg: "endpoint not connected"} - ErrConnectionReset = &Error{msg: "connection reset by peer"} - ErrConnectionAborted = &Error{msg: "connection aborted"} - ErrNoSuchFile = 
&Error{msg: "no such file"} - ErrInvalidOptionValue = &Error{msg: "invalid option value specified"} - ErrNoLinkAddress = &Error{msg: "no remote link address"} - ErrBadAddress = &Error{msg: "bad address"} - ErrNetworkUnreachable = &Error{msg: "network is unreachable"} - ErrMessageTooLong = &Error{msg: "message too long"} - ErrNoBufferSpace = &Error{msg: "no buffer space available"} -) - -// Errors related to Subnet -var ( - errSubnetLengthMismatch = errors.New("subnet length of address and mask differ") - errSubnetAddressMasked = errors.New("subnet address has bits set outside the mask") -) - -// Clock 提供当前的时间戳 -type Clock interface { - NowNanoseconds() int64 - - NowMonotonic() int64 -} - -// 地址是一个字节切片,转换为表示网络节点地址的字符串。或者,在 unix 端点的情况下,它可能代表一条路径 -type Address string - -type AddressMask string - -func (a AddressMask) String() string { - return Address(a).String() -} - -type Subnet struct { - address Address - mask AddressMask -} - -// NewSubnet creates a new Subnet, checking that the address and mask are the same length. -func NewSubnet(a Address, m AddressMask) (Subnet, error) { - if len(a) != len(m) { - return Subnet{}, errSubnetLengthMismatch - } - for i := 0; i < len(a); i++ { - if a[i]&^m[i] != 0 { - return Subnet{}, errSubnetAddressMasked - } - } - return Subnet{a, m}, nil -} - -// Contains returns true iff the address is of the same length and matches the -// subnet address and mask. -func (s *Subnet) Contains(a Address) bool { - if len(a) != len(s.address) { - return false - } - for i := 0; i < len(a); i++ { - if a[i]&s.mask[i] != s.address[i] { - return false - } - } - return true -} - -// ID returns the subnet ID. -func (s *Subnet) ID() Address { - return s.address -} - -// Bits returns the number of ones (network bits) and zeros (host bits) in the -// subnet mask. 
-func (s *Subnet) Bits() (ones int, zeros int) { - for _, b := range []byte(s.mask) { - for i := uint(0); i < 8; i++ { - if b&(1<= 0; j-- { - if b&(1< s.Size() { - size = s.Size() - } - return s[:size], nil -} - -// Size implements Payload. -func (s SlicePayload) Size() int { - return len(s) -} - -// A ControlMessages contains socket control messages for IP sockets. -// -// +stateify savable -type ControlMessages struct { - // HasTimestamp indicates whether Timestamp is valid/set. - HasTimestamp bool - - // Timestamp is the time (in ns) that the last packed used to create - // the read data was received. - Timestamp int64 -} - -// Endpoint is the interface implemented by transport protocols (e.g., tcp, udp) -// that exposes functionality like read, write, connect, etc. to users of the -// networking stack. -// 传输层接口 -type Endpoint interface { - // Close puts the endpoint in a closed state and frees all resources - // associated with it. - Close() - - // Read reads data from the endpoint and optionally returns the sender. - // - // This method does not block if there is no data pending. It will also - // either return an error or data, never both. - // - // A timestamp (in ns) is optionally returned. A zero value indicates - // that no timestamp was available. - Read(*FullAddress) (buffer.View, ControlMessages, *Error) - - // Write writes data to the endpoint's peer. This method does not block if - // the data cannot be written. - // - // Unlike io.Writer.Write, Endpoint.Write transfers ownership of any bytes - // successfully written to the Endpoint. That is, if a call to - // Write(SlicePayload{data}) returns (n, err), it may retain data[:n], and - // the caller should not use data[:n] after Write returns. - // - // Note that unlike io.Writer.Write, it is not an error for Write to - // perform a partial write. 
- // - // For UDP and Ping sockets if address resolution is required, - // ErrNoLinkAddress and a notification channel is returned for the caller to - // block. Channel is closed once address resolution is complete (success or - // not). The channel is only non-nil in this case. - Write(Payload, WriteOptions) (uintptr, <-chan struct{}, *Error) - - // Peek reads data without consuming it from the endpoint. - // - // This method does not block if there is no data pending. - // - // A timestamp (in ns) is optionally returned. A zero value indicates - // that no timestamp was available. - Peek([][]byte) (uintptr, ControlMessages, *Error) - - // Connect connects the endpoint to its peer. Specifying a NIC is - // optional. - // - // There are three classes of return values: - // nil -- the attempt to connect succeeded. - // ErrConnectStarted/ErrAlreadyConnecting -- the connect attempt started - // but hasn't completed yet. In this case, the caller must call Connect - // or GetSockOpt(ErrorOption) when the endpoint becomes writable to - // get the actual result. The first call to Connect after the socket has - // connected returns nil. Calling connect again results in ErrAlreadyConnected. - // Anything else -- the attempt to connect failed. - Connect(address FullAddress) *Error - - // Shutdown closes the read and/or write end of the endpoint connection - // to its peer. - Shutdown(flags ShutdownFlags) *Error - - // Listen puts the endpoint in "listen" mode, which allows it to accept - // new connections. - Listen(backlog int) *Error - - // Accept returns a new endpoint if a peer has established a connection - // to an endpoint previously set to listen mode. This method does not - // block if no new connections are available. - // - // The returned Queue is the wait queue for the newly created endpoint. - Accept() (Endpoint, *waiter.Queue, *Error) - - // Bind binds the endpoint to a specific local address and port. - // Specifying a NIC is optional. 
- // - // An optional commit function will be executed atomically with respect - // to binding the endpoint. If this returns an error, the bind will not - // occur and the error will be propagated back to the caller. - Bind(address FullAddress, commit func() *Error) *Error - - // GetLocalAddress returns the address to which the endpoint is bound. - GetLocalAddress() (FullAddress, *Error) - - // GetRemoteAddress returns the address to which the endpoint is - // connected. - GetRemoteAddress() (FullAddress, *Error) - - // Readiness returns the current readiness of the endpoint. For example, - // if waiter.EventIn is set, the endpoint is immediately readable. - Readiness(mask waiter.EventMask) waiter.EventMask - - // SetSockOpt sets a socket option. opt should be one of the *Option types. - SetSockOpt(opt interface{}) *Error - - // GetSockOpt gets a socket option. opt should be a pointer to one of the - // *Option types. - GetSockOpt(opt interface{}) *Error -} - -// WriteOptions contains options for Endpoint.Write. -type WriteOptions struct { - // If To is not nil, write to the given address instead of the endpoint's - // peer. - To *FullAddress - - // More has the same semantics as Linux's MSG_MORE. - More bool - - // EndOfRecord has the same semantics as Linux's MSG_EOR. - EndOfRecord bool -} - -type Route struct { - Destination Address // 目标地址 - Mask AddressMask // 掩码 - Gateway Address // 网关 - NIC NICID // 使用的网卡设备 -} - -// Match determines if r is viable for the given destination address. -func (r *Route) Match(addr Address) bool { - if len(addr) != len(r.Destination) { - return false - } - - for i := 0; i < len(r.Destination); i++ { - if (addr[i] & r.Mask[i]) != r.Destination[i] { - return false - } - } - - return true -} - -// Stats 包含了网络栈的统计信息 -type Stats struct { - // TODO 需要解读 - // UnknownProtocolRcvdPackets is the number of packets received by the - // stack that were for an unknown or unsupported protocol. 
- UnknownProtocolRcvdPackets *StatCounter - - // MalformedRcvPackets is the number of packets received by the stack - // that were deemed malformed. - MalformedRcvdPackets *StatCounter - - // DroppedPackets is the number of packets dropped due to full queues. - DroppedPackets *StatCounter - - // IP breaks out IP-specific stats (both v4 and v6). - IP IPStats - - // TCP breaks out TCP-specific stats. - TCP TCPStats - - // UDP breaks out UDP-specific stats. - UDP UDPStats -} - -// A StatCounter keeps track of a statistic. -type StatCounter struct { - count uint64 -} - -// Increment adds one to the counter. -func (s *StatCounter) Increment() { - s.IncrementBy(1) -} - -// Value returns the current value of the counter. -func (s *StatCounter) Value() uint64 { - return atomic.LoadUint64(&s.count) -} - -// IncrementBy increments the counter by v. -func (s *StatCounter) IncrementBy(v uint64) { - atomic.AddUint64(&s.count, v) -} - -type IPStats struct { - // PacketsReceived is the total number of IP packets received from the link - // layer in nic.DeliverNetworkPacket. - PacketsReceived *StatCounter - - // InvalidAddressesReceived is the total number of IP packets received - // with an unknown or invalid destination address. - InvalidAddressesReceived *StatCounter - - // PacketsDelivered is the total number of incoming IP packets that - // are successfully delivered to the transport layer via HandlePacket. - PacketsDelivered *StatCounter - - // PacketsSent is the total number of IP packets sent via WritePacket. - PacketsSent *StatCounter - - // OutgoingPacketErrors is the total number of IP packets which failed - // to write to a link-layer endpoint. - OutgoingPacketErrors *StatCounter -} - -type TCPStats struct { - // ActiveConnectionOpenings is the number of connections opened successfully - // via Connect. - ActiveConnectionOpenings *StatCounter - - // PassiveConnectionOpenings is the number of connections opened - // successfully via Listen. 
- PassiveConnectionOpenings *StatCounter - - // FailedConnectionAttempts is the number of calls to Connect or Listen - // (active and passive openings, respectively) that end in an error. - FailedConnectionAttempts *StatCounter - - // ValidSegmentsReceived is the number of TCP segments received that the - // transport layer successfully parsed. - ValidSegmentsReceived *StatCounter - - // InvalidSegmentsReceived is the number of TCP segments received that - // the transport layer could not parse. - InvalidSegmentsReceived *StatCounter - - // SegmentsSent is the number of TCP segments sent. - SegmentsSent *StatCounter - - // ResetsSent is the number of TCP resets sent. - ResetsSent *StatCounter - - // ResetsReceived is the number of TCP resets received. - ResetsReceived *StatCounter -} - -type UDPStats struct { - // PacketsReceived is the number of UDP datagrams received via - // HandlePacket. - PacketsReceived *StatCounter - - // UnknownPortErrors is the number of incoming UDP datagrams dropped - // because they did not have a known destination port. - UnknownPortErrors *StatCounter - - // ReceiveBufferErrors is the number of incoming UDP datagrams dropped - // due to the receiving buffer being in an invalid state. - ReceiveBufferErrors *StatCounter - - // MalformedPacketsReceived is the number of incoming UDP datagrams - // dropped due to the UDP header being in a malformed state. - MalformedPacketsReceived *StatCounter - - // PacketsSent is the number of UDP datagrams sent via sendUDP. - PacketsSent *StatCounter -} - -func fillIn(v reflect.Value) { - for i := 0; i < v.NumField(); i++ { - v := v.Field(i) - switch v.Kind() { - case reflect.Ptr: - x := v.Addr().Interface() - if s, ok := x.(**StatCounter); ok { - if *s == nil { - *s = &StatCounter{} - } - } - case reflect.Struct: - fillIn(v) - } - } -} - -// FillIn returns a copy of s with nil fields initialized to new StatCounters. 
-func (s Stats) FillIn() Stats { - fillIn(reflect.ValueOf(&s).Elem()) - return s -} - -func (a Address) String() string { - switch len(a) { - case 4: - return fmt.Sprintf("%d.%d.%d.%d", int(a[0]), int(a[1]), int(a[2]), int(a[3])) - case 16: - // Find the longest subsequence of hexadecimal zeros. - start, end := -1, -1 - for i := 0; i < len(a); i += 2 { - j := i - for j < len(a) && a[j] == 0 && a[j+1] == 0 { - j += 2 - } - if j > i+2 && j-i > end-start { - start, end = i, j - } - } - var b strings.Builder - for i := 0; i < len(a); i += 2 { - if i == start { - b.WriteString("::") - i = end - if end >= len(a) { - break - } - } else if i > 0 { - b.WriteByte(':') - } - v := uint16(a[i+0])<<8 | uint16(a[i+1]) - if v == 0 { - b.WriteByte('0') - } else { - const digits = "0123456789abcdef" - for i := uint(3); i < 4; i-- { - if v := v >> (i * 4); v != 0 { - b.WriteByte(digits[v&0xf]) - } - } - } - } - return b.String() - default: - return fmt.Sprintf("%s", string(a)) - } -} +package tcpip + +import ( + "errors" + "fmt" + "netstack/tcpip/buffer" + "netstack/waiter" + "reflect" + "strings" + "sync/atomic" +) + +type Error struct { + msg string + ignoreStats bool +} + +func (e *Error) String() string { + return e.msg +} + +func (e *Error) IgnoreStats() bool { + return e.ignoreStats +} + +var ( + ErrUnknownProtocol = &Error{msg: "unknown protocol"} + ErrUnknownNICID = &Error{msg: "unknown nic id"} + ErrUnknownProtocolOption = &Error{msg: "unknown option for protocol"} + ErrDuplicateNICID = &Error{msg: "duplicate nic id"} + ErrDuplicateAddress = &Error{msg: "duplicate address"} + ErrNoRoute = &Error{msg: "no route"} + ErrBadLinkEndpoint = &Error{msg: "bad link layer endpoint"} + ErrAlreadyBound = &Error{msg: "endpoint already bound", ignoreStats: true} + ErrInvalidEndpointState = &Error{msg: "endpoint is in invalid state"} + ErrAlreadyConnecting = &Error{msg: "endpoint is already connecting", ignoreStats: true} + ErrAlreadyConnected = &Error{msg: "endpoint is already connected", 
ignoreStats: true} + ErrNoPortAvailable = &Error{msg: "no ports are available"} + ErrPortInUse = &Error{msg: "port is in use"} + ErrBadLocalAddress = &Error{msg: "bad local address"} + ErrClosedForSend = &Error{msg: "endpoint is closed for send"} + ErrClosedForReceive = &Error{msg: "endpoint is closed for receive"} + ErrWouldBlock = &Error{msg: "operation would block", ignoreStats: true} + ErrConnectionRefused = &Error{msg: "connection was refused"} + ErrTimeout = &Error{msg: "operation timed out"} + ErrAborted = &Error{msg: "operation aborted"} + ErrConnectStarted = &Error{msg: "connection attempt started", ignoreStats: true} + ErrDestinationRequired = &Error{msg: "destination address is required"} + ErrNotSupported = &Error{msg: "operation not supported"} + ErrQueueSizeNotSupported = &Error{msg: "queue size querying not supported"} + ErrNotConnected = &Error{msg: "endpoint not connected"} + ErrConnectionReset = &Error{msg: "connection reset by peer"} + ErrConnectionAborted = &Error{msg: "connection aborted"} + ErrNoSuchFile = &Error{msg: "no such file"} + ErrInvalidOptionValue = &Error{msg: "invalid option value specified"} + ErrNoLinkAddress = &Error{msg: "no remote link address"} + ErrBadAddress = &Error{msg: "bad address"} + ErrNetworkUnreachable = &Error{msg: "network is unreachable"} + ErrMessageTooLong = &Error{msg: "message too long"} + ErrNoBufferSpace = &Error{msg: "no buffer space available"} +) + +// Errors related to Subnet +var ( + errSubnetLengthMismatch = errors.New("subnet length of address and mask differ") + errSubnetAddressMasked = errors.New("subnet address has bits set outside the mask") +) + +// Clock 提供当前的时间戳 +type Clock interface { + NowNanoseconds() int64 + + NowMonotonic() int64 +} + +// 地址是一个字节切片,转换为表示网络节点地址的字符串。或者,在 unix 端点的情况下,它可能代表一条路径 +type Address string + +type AddressMask string + +func (a AddressMask) String() string { + return Address(a).String() +} + +type Subnet struct { + address Address + mask AddressMask +} + +// NewSubnet 
creates a new Subnet, checking that the address and mask are the same length. +func NewSubnet(a Address, m AddressMask) (Subnet, error) { + if len(a) != len(m) { + return Subnet{}, errSubnetLengthMismatch + } + for i := 0; i < len(a); i++ { + if a[i]&^m[i] != 0 { + return Subnet{}, errSubnetAddressMasked + } + } + return Subnet{a, m}, nil +} + +// Contains returns true iff the address is of the same length and matches the +// subnet address and mask. +func (s *Subnet) Contains(a Address) bool { + if len(a) != len(s.address) { + return false + } + for i := 0; i < len(a); i++ { + if a[i]&s.mask[i] != s.address[i] { + return false + } + } + return true +} + +// ID returns the subnet ID. +func (s *Subnet) ID() Address { + return s.address +} + +// Bits returns the number of ones (network bits) and zeros (host bits) in the +// subnet mask. +func (s *Subnet) Bits() (ones int, zeros int) { + for _, b := range []byte(s.mask) { + for i := uint(0); i < 8; i++ { + if b&(1<= 0; j-- { + if b&(1< s.Size() { + size = s.Size() + } + return s[:size], nil +} + +// Size implements Payload. +func (s SlicePayload) Size() int { + return len(s) +} + +// A ControlMessages contains socket control messages for IP sockets. +// +// +stateify savable +type ControlMessages struct { + // HasTimestamp indicates whether Timestamp is valid/set. + HasTimestamp bool + + // Timestamp is the time (in ns) that the last packed used to create + // the read data was received. + Timestamp int64 +} + +// Endpoint is the interface implemented by transport protocols (e.g., tcp, udp) +// that exposes functionality like read, write, connect, etc. to users of the +// networking stack. +// 传输层接口 +type Endpoint interface { + // Close puts the endpoint in a closed state and frees all resources + // associated with it. + Close() + + // Read reads data from the endpoint and optionally returns the sender. + // + // This method does not block if there is no data pending. 
It will also + // either return an error or data, never both. + // + // A timestamp (in ns) is optionally returned. A zero value indicates + // that no timestamp was available. + Read(*FullAddress) (buffer.View, ControlMessages, *Error) + + // Write writes data to the endpoint's peer. This method does not block if + // the data cannot be written. + // + // Unlike io.Writer.Write, Endpoint.Write transfers ownership of any bytes + // successfully written to the Endpoint. That is, if a call to + // Write(SlicePayload{data}) returns (n, err), it may retain data[:n], and + // the caller should not use data[:n] after Write returns. + // + // Note that unlike io.Writer.Write, it is not an error for Write to + // perform a partial write. + // + // For UDP and Ping sockets if address resolution is required, + // ErrNoLinkAddress and a notification channel is returned for the caller to + // block. Channel is closed once address resolution is complete (success or + // not). The channel is only non-nil in this case. + Write(Payload, WriteOptions) (uintptr, <-chan struct{}, *Error) + + // Peek reads data without consuming it from the endpoint. + // + // This method does not block if there is no data pending. + // + // A timestamp (in ns) is optionally returned. A zero value indicates + // that no timestamp was available. + Peek([][]byte) (uintptr, ControlMessages, *Error) + + // Connect connects the endpoint to its peer. Specifying a NIC is + // optional. + // + // There are three classes of return values: + // nil -- the attempt to connect succeeded. + // ErrConnectStarted/ErrAlreadyConnecting -- the connect attempt started + // but hasn't completed yet. In this case, the caller must call Connect + // or GetSockOpt(ErrorOption) when the endpoint becomes writable to + // get the actual result. The first call to Connect after the socket has + // connected returns nil. Calling connect again results in ErrAlreadyConnected. + // Anything else -- the attempt to connect failed. 
+ Connect(address FullAddress) *Error + + // Shutdown closes the read and/or write end of the endpoint connection + // to its peer. + Shutdown(flags ShutdownFlags) *Error + + // Listen puts the endpoint in "listen" mode, which allows it to accept + // new connections. + Listen(backlog int) *Error + + // Accept returns a new endpoint if a peer has established a connection + // to an endpoint previously set to listen mode. This method does not + // block if no new connections are available. + // + // The returned Queue is the wait queue for the newly created endpoint. + Accept() (Endpoint, *waiter.Queue, *Error) + + // Bind binds the endpoint to a specific local address and port. + // Specifying a NIC is optional. + // + // An optional commit function will be executed atomically with respect + // to binding the endpoint. If this returns an error, the bind will not + // occur and the error will be propagated back to the caller. + Bind(address FullAddress, commit func() *Error) *Error + + // GetLocalAddress returns the address to which the endpoint is bound. + GetLocalAddress() (FullAddress, *Error) + + // GetRemoteAddress returns the address to which the endpoint is + // connected. + GetRemoteAddress() (FullAddress, *Error) + + // Readiness returns the current readiness of the endpoint. For example, + // if waiter.EventIn is set, the endpoint is immediately readable. + Readiness(mask waiter.EventMask) waiter.EventMask + + // SetSockOpt sets a socket option. opt should be one of the *Option types. + SetSockOpt(opt interface{}) *Error + + // GetSockOpt gets a socket option. opt should be a pointer to one of the + // *Option types. + GetSockOpt(opt interface{}) *Error +} + +// WriteOptions contains options for Endpoint.Write. +type WriteOptions struct { + // If To is not nil, write to the given address instead of the endpoint's + // peer. + To *FullAddress + + // More has the same semantics as Linux's MSG_MORE. 
+ More bool + + // EndOfRecord has the same semantics as Linux's MSG_EOR. + EndOfRecord bool +} + +type Route struct { + Destination Address // 目标地址 + Mask AddressMask // 掩码 + Gateway Address // 网关 + NIC NICID // 使用的网卡设备 +} + +// Match determines if r is viable for the given destination address. +func (r *Route) Match(addr Address) bool { + if len(addr) != len(r.Destination) { + return false + } + + for i := 0; i < len(r.Destination); i++ { + if (addr[i] & r.Mask[i]) != r.Destination[i] { + return false + } + } + + return true +} + +// Stats 包含了网络栈的统计信息 +type Stats struct { + // TODO 需要解读 + // UnknownProtocolRcvdPackets is the number of packets received by the + // stack that were for an unknown or unsupported protocol. + UnknownProtocolRcvdPackets *StatCounter + + // MalformedRcvPackets is the number of packets received by the stack + // that were deemed malformed. + MalformedRcvdPackets *StatCounter + + // DroppedPackets is the number of packets dropped due to full queues. + DroppedPackets *StatCounter + + // IP breaks out IP-specific stats (both v4 and v6). + IP IPStats + + // TCP breaks out TCP-specific stats. + TCP TCPStats + + // UDP breaks out UDP-specific stats. + UDP UDPStats +} + +// A StatCounter keeps track of a statistic. +type StatCounter struct { + count uint64 +} + +// Increment adds one to the counter. +func (s *StatCounter) Increment() { + s.IncrementBy(1) +} + +// Value returns the current value of the counter. +func (s *StatCounter) Value() uint64 { + return atomic.LoadUint64(&s.count) +} + +// IncrementBy increments the counter by v. +func (s *StatCounter) IncrementBy(v uint64) { + atomic.AddUint64(&s.count, v) +} + +type IPStats struct { + // PacketsReceived is the total number of IP packets received from the link + // layer in nic.DeliverNetworkPacket. + PacketsReceived *StatCounter + + // InvalidAddressesReceived is the total number of IP packets received + // with an unknown or invalid destination address. 
+ InvalidAddressesReceived *StatCounter + + // PacketsDelivered is the total number of incoming IP packets that + // are successfully delivered to the transport layer via HandlePacket. + PacketsDelivered *StatCounter + + // PacketsSent is the total number of IP packets sent via WritePacket. + PacketsSent *StatCounter + + // OutgoingPacketErrors is the total number of IP packets which failed + // to write to a link-layer endpoint. + OutgoingPacketErrors *StatCounter +} + +type TCPStats struct { + // ActiveConnectionOpenings is the number of connections opened successfully + // via Connect. + ActiveConnectionOpenings *StatCounter + + // PassiveConnectionOpenings is the number of connections opened + // successfully via Listen. + PassiveConnectionOpenings *StatCounter + + // FailedConnectionAttempts is the number of calls to Connect or Listen + // (active and passive openings, respectively) that end in an error. + FailedConnectionAttempts *StatCounter + + // ValidSegmentsReceived is the number of TCP segments received that the + // transport layer successfully parsed. + ValidSegmentsReceived *StatCounter + + // InvalidSegmentsReceived is the number of TCP segments received that + // the transport layer could not parse. + InvalidSegmentsReceived *StatCounter + + // SegmentsSent is the number of TCP segments sent. + SegmentsSent *StatCounter + + // ResetsSent is the number of TCP resets sent. + ResetsSent *StatCounter + + // ResetsReceived is the number of TCP resets received. + ResetsReceived *StatCounter +} + +type UDPStats struct { + // PacketsReceived is the number of UDP datagrams received via + // HandlePacket. + PacketsReceived *StatCounter + + // UnknownPortErrors is the number of incoming UDP datagrams dropped + // because they did not have a known destination port. + UnknownPortErrors *StatCounter + + // ReceiveBufferErrors is the number of incoming UDP datagrams dropped + // due to the receiving buffer being in an invalid state. 
+ ReceiveBufferErrors *StatCounter + + // MalformedPacketsReceived is the number of incoming UDP datagrams + // dropped due to the UDP header being in a malformed state. + MalformedPacketsReceived *StatCounter + + // PacketsSent is the number of UDP datagrams sent via sendUDP. + PacketsSent *StatCounter +} + +func fillIn(v reflect.Value) { + for i := 0; i < v.NumField(); i++ { + v := v.Field(i) + switch v.Kind() { + case reflect.Ptr: + x := v.Addr().Interface() + if s, ok := x.(**StatCounter); ok { + if *s == nil { + *s = &StatCounter{} + } + } + case reflect.Struct: + fillIn(v) + } + } +} + +// FillIn returns a copy of s with nil fields initialized to new StatCounters. +func (s Stats) FillIn() Stats { + fillIn(reflect.ValueOf(&s).Elem()) + return s +} + +func (a Address) String() string { + switch len(a) { + case 4: + return fmt.Sprintf("%d.%d.%d.%d", int(a[0]), int(a[1]), int(a[2]), int(a[3])) + case 16: + // Find the longest subsequence of hexadecimal zeros. + start, end := -1, -1 + for i := 0; i < len(a); i += 2 { + j := i + for j < len(a) && a[j] == 0 && a[j+1] == 0 { + j += 2 + } + if j > i+2 && j-i > end-start { + start, end = i, j + } + } + var b strings.Builder + for i := 0; i < len(a); i += 2 { + if i == start { + b.WriteString("::") + i = end + if end >= len(a) { + break + } + } else if i > 0 { + b.WriteByte(':') + } + v := uint16(a[i+0])<<8 | uint16(a[i+1]) + if v == 0 { + b.WriteByte('0') + } else { + const digits = "0123456789abcdef" + for i := uint(3); i < 4; i-- { + if v := v >> (i * 4); v != 0 { + b.WriteByte(digits[v&0xf]) + } + } + } + } + return b.String() + default: + return fmt.Sprintf("%s", string(a)) + } +} diff --git a/tcpip/time_unsafe.go b/tcpip/time_unsafe.go index 829feae..a6d6a17 100644 --- a/tcpip/time_unsafe.go +++ b/tcpip/time_unsafe.go @@ -1,42 +1,42 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build go1.9 - -package tcpip - -import ( - _ "time" // Used with go:linkname. - _ "unsafe" // Required for go:linkname. -) - -// StdClock implements Clock with the time package. -type StdClock struct{} - -var _ Clock = (*StdClock)(nil) - -//go:linkname now time.now -func now() (sec int64, nsec int32, mono int64) - -// NowNanoseconds implements Clock.NowNanoseconds. -func (*StdClock) NowNanoseconds() int64 { - sec, nsec, _ := now() - return sec*1e9 + int64(nsec) -} - -// NowMonotonic implements Clock.NowMonotonic. -func (*StdClock) NowMonotonic() int64 { - _, _, mono := now() - return mono -} +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build go1.9 + +package tcpip + +import ( + _ "time" // Used with go:linkname. + _ "unsafe" // Required for go:linkname. +) + +// StdClock implements Clock with the time package. 
+type StdClock struct{} + +var _ Clock = (*StdClock)(nil) + +//go:linkname now time.now +func now() (sec int64, nsec int32, mono int64) + +// NowNanoseconds implements Clock.NowNanoseconds. +func (*StdClock) NowNanoseconds() int64 { + sec, nsec, _ := now() + return sec*1e9 + int64(nsec) +} + +// NowMonotonic implements Clock.NowMonotonic. +func (*StdClock) NowMonotonic() int64 { + _, _, mono := now() + return mono +} diff --git a/tcpip/transport/udp/README.md b/tcpip/transport/udp/README.md index 647088b..d05bdf2 100644 --- a/tcpip/transport/udp/README.md +++ b/tcpip/transport/udp/README.md @@ -1,18 +1,18 @@ -# 传输层 - -![img](https://doc.shiyanlou.com/document-uid949121labid10418timestamp1555488741384.png) - -传输层是整个网络体系结构中的关键之一,我们很多编程都是直接和传输层打交道的,我们需要了解以下的概念: -1. 端口的意义 - 上一章已经介绍过了 -2. 无连接 UDP 协议及特点 - 本章介绍 -3. 面向连接 TCP 协议及特点 - 下章会介绍 - -传输层向它上面的应用层提供通信服务,传输题主要提供了以下功能: - -1. 为相互通信的应用进程提供逻辑通信。 网络层是为主机之间提供通信,而传输层是为应用进程之间提供端到端的逻辑通信。 - -2. 复用和分用 复用是指发送方不同的应用进程都可以使用同一个传输协议来传送数据,而分用是指接收方的传输层在剥去报文的首部后, 能够把这些数据正确的交付给目的进程。其实复用和分用就是端口来实现的。 - -3. 报文差错检测 网络层只对 IP 首部进行差错检测,而传输层对整个报文进行差错检测。 - +# 传输层 + +![img](https://doc.shiyanlou.com/document-uid949121labid10418timestamp1555488741384.png) + +传输层是整个网络体系结构中的关键之一,我们很多编程都是直接和传输层打交道的,我们需要了解以下的概念: +1. 端口的意义 - 上一章已经介绍过了 +2. 无连接 UDP 协议及特点 - 本章介绍 +3. 面向连接 TCP 协议及特点 - 下章会介绍 + +传输层向它上面的应用层提供通信服务,传输题主要提供了以下功能: + +1. 为相互通信的应用进程提供逻辑通信。 网络层是为主机之间提供通信,而传输层是为应用进程之间提供端到端的逻辑通信。 + +2. 复用和分用 复用是指发送方不同的应用进程都可以使用同一个传输协议来传送数据,而分用是指接收方的传输层在剥去报文的首部后, 能够把这些数据正确的交付给目的进程。其实复用和分用就是端口来实现的。 + +3. 报文差错检测 网络层只对 IP 首部进行差错检测,而传输层对整个报文进行差错检测。 + 4. 
提供不可靠和可靠通信 网络层只提供了不可靠通信,而在传输层的 TCP 协议提供了可靠通信。 \ No newline at end of file diff --git a/tcpip/transport/udp/endpoint.go b/tcpip/transport/udp/endpoint.go index b4b6968..a8fa760 100644 --- a/tcpip/transport/udp/endpoint.go +++ b/tcpip/transport/udp/endpoint.go @@ -1,281 +1,281 @@ -package udp - -import ( - "log" - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/header" - "netstack/tcpip/stack" - "netstack/waiter" - "sync" -) - -// udp报文结构 当收到udp报文时 会用这个结构来保存udp报文数据 -type udpPacket struct { - udpPacketEntry // 链表实现 - // TODO 需要添加 -} - -type endpointState int - -// 表示UDP端的状态参数 -const ( - stateInitial endpointState = iota - stateBound - stateConnected - stateClosed -) - -type endpoint struct { - stack *stack.Stack // udp所依赖的用户协议栈 - netProto tcpip.NetworkProtocolNumber // udp网络协议号 ipv4/ipv6 - waiterQueue *waiter.Queue // TODO 需要解析 - - // TODO 需要解析 - // The following fields are used to manage the receive queue, and are - // protected by rcvMu. - rcvMu sync.Mutex - rcvReady bool - rcvList udpPacketList - rcvBufSizeMax int - rcvBufSize int - rcvClosed bool - rcvTimestamp bool - - // The following fields are protected by the mu mutex. - mu sync.RWMutex - sndBufSize int // 发送缓冲区大小 - id stack.TransportEndpointID - state endpointState - bindNICID tcpip.NICID // 绑定的网卡 - regNICID tcpip.NICID // - route stack.Route // 路由? TODO - dstPort uint16 // 目标端口 - v6only bool // 仅支持ipv6 - multicastTTL uint8 // 广播TTL - - // shutdownFlags represent the current shutdown state of the endpoint. - shutdownFlags tcpip.ShutdownFlags - - // TODO - - // effectiveNetProtos contains the network protocols actually in use. In - // most cases it will only contain "netProto", but in cases like IPv6 - // endpoints with v6only set to false, this could include multiple - // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., - // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped - // address). 
当前生效的网络层协议 - effectiveNetProtos []tcpip.NetworkProtocolNumber -} - -func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, - waiterQueue *waiter.Queue) *endpoint { - log.Println("新建一个udp端") - return &endpoint{ - stack: stack, - netProto: netProto, - waiterQueue: waiterQueue, - multicastTTL: 1, - rcvBufSizeMax: 32 * 1024, - sndBufSize: 32 * 1024} -} - -// Close UDP端的关闭,释放相应的资源 -func (e *endpoint) Close() { - e.mu.Lock() - - e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite - - switch e.state { - case stateBound, stateConnected: - // 释放在协议栈中注册的UDP端 - e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id) - // 释放端口占用 - e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort) - } - - // TODO - e.mu.Unlock() -} - -func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { - return nil, tcpip.ControlMessages{}, nil -} - -func (e *endpoint) Write(tcpip.Payload, tcpip.WriteOptions) (uintptr, <-chan struct{}, *tcpip.Error) { - return 0, nil, nil -} - -func (e *endpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { - return 0, tcpip.ControlMessages{}, nil -} - -// IPV6于IPV4地址的映射 -func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) { - netProto := e.netProto - if header.IsV4MappedAddress(addr.Addr) { - // Fail if using a v4 mapped address on a v6only endpoint. - if e.v6only { - return 0, tcpip.ErrNoRoute - } - - netProto = header.IPv4ProtocolNumber - addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:] - if addr.Addr == "\x00\x00\x00\x00" { - addr.Addr = "" - } - - // Fail if we are bound to an IPv6 address. - if !allowMismatch && len(e.id.LocalAddress) == 16 { - return 0, tcpip.ErrNetworkUnreachable - } - } - - // Fail if we're bound to an address length different from the one we're - // checking. 
- // 源地址用与目标地址使用的ip协议不能不一致 - if l := len(e.id.LocalAddress); l != 0 && l != len(addr.Addr) { - return 0, tcpip.ErrInvalidEndpointState - } - - return netProto, nil -} - -func (e *endpoint) Connect(address tcpip.FullAddress) *tcpip.Error { - log.Println("连接") - return nil -} - -func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { - return nil -} - -func (e *endpoint) Listen(backlog int) *tcpip.Error { - return nil -} - -func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { - return nil, nil, nil -} - -func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, - id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) { - if e.id.LocalPort == 0 { // 一个没有绑定过端口的udp端 - port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort) // 为这个udp端绑定一个端口 - if err != nil { - return id, err - } - id.LocalPort = port - } - err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e) - if err != nil { - // 释放端口 - e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort) - } - return id, err -} - -func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { - // 不是初始状态的UDP实现不允许绑定 - if e.state != stateInitial { - return tcpip.ErrInvalidEndpointState - } - - netProto, err := e.checkV4Mapped(&addr, true) - if err != nil { - return nil - } - - netProtos := []tcpip.NetworkProtocolNumber{netProto} - if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" { // IPv6 && 支持ipv4 & 任意地址 - netProtos = []tcpip.NetworkProtocolNumber{ - header.IPv6ProtocolNumber, - header.IPv4ProtocolNumber, - } - } - - // 不是任意地址的话 需要检验本地网卡是否绑定个这个ip地址 - if len(addr.Addr) != 0 { - if e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) == 0 { - return tcpip.ErrBadLocalAddress - } - } - - // 开始绑定 绑定的时候 传输端ID : srcIP + srcPort - id := stack.TransportEndpointID{ - LocalAddress: addr.Addr, - LocalPort: 
addr.Port, - } - // 在协议栈中注册该UDP端 - id, err = e.registerWithStack(addr.NIC, netProtos, id) - if err != nil { - return err - } - // 如果指定了 commit 函数 执行并处理错误 - if commit != nil { - if err := commit(); err != nil { - // Unregister, the commit failed. - e.stack.UnregisterTransportEndpoint(addr.NIC, netProtos, ProtocolNumber, id) - e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort) - return err - } - } - - e.id = id - e.regNICID = addr.NIC - e.effectiveNetProtos = netProtos - - // Mark endpoint as bound. - // 标记状态为已绑定 - e.state = stateBound - - return nil -} - -// Bind binds the endpoint to a specific local address and port. -// Specifying a NIC is optional. -// Bind 将该UDP端绑定本地的一个IP+端口 -// 例如:绑定本地0.0.0.0的9000端口,那么其他机器给这台机器9000端口发消息,该UDP端就能收到消息了 -func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { - e.mu.Lock() - defer e.mu.Unlock() - - // 执行绑定IP+端口操作 - err := e.bindLocked(addr, commit) - if err != nil { - return err - } - - // 绑定的网卡ID - e.bindNICID = addr.NIC - return nil -} - -func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { - return tcpip.FullAddress{}, nil -} - -func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { - return tcpip.FullAddress{}, nil -} - -func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { - return waiter.EventErr -} - -func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { - return nil -} - -func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { - return nil -} - -// 从网络层接收到UDP数据报时的处理函数 -func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) { - -} - -// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. 
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) { -} +package udp + +import ( + "log" + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/header" + "netstack/tcpip/stack" + "netstack/waiter" + "sync" +) + +// udp报文结构 当收到udp报文时 会用这个结构来保存udp报文数据 +type udpPacket struct { + udpPacketEntry // 链表实现 + // TODO 需要添加 +} + +type endpointState int + +// 表示UDP端的状态参数 +const ( + stateInitial endpointState = iota + stateBound + stateConnected + stateClosed +) + +type endpoint struct { + stack *stack.Stack // udp所依赖的用户协议栈 + netProto tcpip.NetworkProtocolNumber // udp网络协议号 ipv4/ipv6 + waiterQueue *waiter.Queue // TODO 需要解析 + + // TODO 需要解析 + // The following fields are used to manage the receive queue, and are + // protected by rcvMu. + rcvMu sync.Mutex + rcvReady bool + rcvList udpPacketList + rcvBufSizeMax int + rcvBufSize int + rcvClosed bool + rcvTimestamp bool + + // The following fields are protected by the mu mutex. + mu sync.RWMutex + sndBufSize int // 发送缓冲区大小 + id stack.TransportEndpointID + state endpointState + bindNICID tcpip.NICID // 绑定的网卡 + regNICID tcpip.NICID // + route stack.Route // 路由? TODO + dstPort uint16 // 目标端口 + v6only bool // 仅支持ipv6 + multicastTTL uint8 // 广播TTL + + // shutdownFlags represent the current shutdown state of the endpoint. + shutdownFlags tcpip.ShutdownFlags + + // TODO + + // effectiveNetProtos contains the network protocols actually in use. In + // most cases it will only contain "netProto", but in cases like IPv6 + // endpoints with v6only set to false, this could include multiple + // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., + // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped + // address). 
当前生效的网络层协议 + effectiveNetProtos []tcpip.NetworkProtocolNumber +} + +func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, + waiterQueue *waiter.Queue) *endpoint { + log.Println("新建一个udp端") + return &endpoint{ + stack: stack, + netProto: netProto, + waiterQueue: waiterQueue, + multicastTTL: 1, + rcvBufSizeMax: 32 * 1024, + sndBufSize: 32 * 1024} +} + +// Close UDP端的关闭,释放相应的资源 +func (e *endpoint) Close() { + e.mu.Lock() + + e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite + + switch e.state { + case stateBound, stateConnected: + // 释放在协议栈中注册的UDP端 + e.stack.UnregisterTransportEndpoint(e.regNICID, e.effectiveNetProtos, ProtocolNumber, e.id) + // 释放端口占用 + e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.id.LocalAddress, e.id.LocalPort) + } + + // TODO + e.mu.Unlock() +} + +func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { + return nil, tcpip.ControlMessages{}, nil +} + +func (e *endpoint) Write(tcpip.Payload, tcpip.WriteOptions) (uintptr, <-chan struct{}, *tcpip.Error) { + return 0, nil, nil +} + +func (e *endpoint) Peek([][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Error) { + return 0, tcpip.ControlMessages{}, nil +} + +// IPV6于IPV4地址的映射 +func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) { + netProto := e.netProto + if header.IsV4MappedAddress(addr.Addr) { + // Fail if using a v4 mapped address on a v6only endpoint. + if e.v6only { + return 0, tcpip.ErrNoRoute + } + + netProto = header.IPv4ProtocolNumber + addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:] + if addr.Addr == "\x00\x00\x00\x00" { + addr.Addr = "" + } + + // Fail if we are bound to an IPv6 address. + if !allowMismatch && len(e.id.LocalAddress) == 16 { + return 0, tcpip.ErrNetworkUnreachable + } + } + + // Fail if we're bound to an address length different from the one we're + // checking. 
+ // 源地址用与目标地址使用的ip协议不能不一致 + if l := len(e.id.LocalAddress); l != 0 && l != len(addr.Addr) { + return 0, tcpip.ErrInvalidEndpointState + } + + return netProto, nil +} + +func (e *endpoint) Connect(address tcpip.FullAddress) *tcpip.Error { + log.Println("连接") + return nil +} + +func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { + return nil +} + +func (e *endpoint) Listen(backlog int) *tcpip.Error { + return nil +} + +func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { + return nil, nil, nil +} + +func (e *endpoint) registerWithStack(nicid tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, + id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) { + if e.id.LocalPort == 0 { // 一个没有绑定过端口的udp端 + port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort) // 为这个udp端绑定一个端口 + if err != nil { + return id, err + } + id.LocalPort = port + } + err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e) + if err != nil { + // 释放端口 + e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort) + } + return id, err +} + +func (e *endpoint) bindLocked(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { + // 不是初始状态的UDP实现不允许绑定 + if e.state != stateInitial { + return tcpip.ErrInvalidEndpointState + } + + netProto, err := e.checkV4Mapped(&addr, true) + if err != nil { + return nil + } + + netProtos := []tcpip.NetworkProtocolNumber{netProto} + if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" { // IPv6 && 支持ipv4 & 任意地址 + netProtos = []tcpip.NetworkProtocolNumber{ + header.IPv6ProtocolNumber, + header.IPv4ProtocolNumber, + } + } + + // 不是任意地址的话 需要检验本地网卡是否绑定个这个ip地址 + if len(addr.Addr) != 0 { + if e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) == 0 { + return tcpip.ErrBadLocalAddress + } + } + + // 开始绑定 绑定的时候 传输端ID : srcIP + srcPort + id := stack.TransportEndpointID{ + LocalAddress: addr.Addr, + LocalPort: 
addr.Port, + } + // 在协议栈中注册该UDP端 + id, err = e.registerWithStack(addr.NIC, netProtos, id) + if err != nil { + return err + } + // 如果指定了 commit 函数 执行并处理错误 + if commit != nil { + if err := commit(); err != nil { + // Unregister, the commit failed. + e.stack.UnregisterTransportEndpoint(addr.NIC, netProtos, ProtocolNumber, id) + e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort) + return err + } + } + + e.id = id + e.regNICID = addr.NIC + e.effectiveNetProtos = netProtos + + // Mark endpoint as bound. + // 标记状态为已绑定 + e.state = stateBound + + return nil +} + +// Bind binds the endpoint to a specific local address and port. +// Specifying a NIC is optional. +// Bind 将该UDP端绑定本地的一个IP+端口 +// 例如:绑定本地0.0.0.0的9000端口,那么其他机器给这台机器9000端口发消息,该UDP端就能收到消息了 +func (e *endpoint) Bind(addr tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { + e.mu.Lock() + defer e.mu.Unlock() + + // 执行绑定IP+端口操作 + err := e.bindLocked(addr, commit) + if err != nil { + return err + } + + // 绑定的网卡ID + e.bindNICID = addr.NIC + return nil +} + +func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{}, nil +} + +func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{}, nil +} + +func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { + return waiter.EventErr +} + +func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { + return nil +} + +func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { + return nil +} + +// 从网络层接收到UDP数据报时的处理函数 +func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) { + +} + +// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. 
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) { +} diff --git a/tcpip/transport/udp/protocol.go b/tcpip/transport/udp/protocol.go index 11ed8c4..780eb06 100644 --- a/tcpip/transport/udp/protocol.go +++ b/tcpip/transport/udp/protocol.go @@ -1,66 +1,66 @@ -package udp - -import ( - "netstack/tcpip" - "netstack/tcpip/buffer" - "netstack/tcpip/header" - "netstack/tcpip/stack" - "netstack/waiter" -) - -const ( - // ProtocolName is the string representation of the udp protocol name. - ProtocolName = "udp" - - // ProtocolNumber is the udp protocol number. - ProtocolNumber = header.UDPProtocolNumber -) - -// tcpip.Endpoint 接口的UDP协议实现 -type protocol struct{} - -// Number returns the udp protocol number. -func (*protocol) Number() tcpip.TransportProtocolNumber { - return ProtocolNumber -} - -// NewEndpoint creates a new udp endpoint. -func (*protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, - waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { - return newEndpoint(stack, netProto, waiterQueue), nil -} - -// MinimumPacketSize returns the minimum valid udp packet size. -func (*protocol) MinimumPacketSize() int { - return header.UDPMinimumSize -} - -// ParsePorts returns the source and destination ports stored in the given udp -// packet. -func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { - //h := header.UDP(v) - //return h.SourcePort(), h.DestinationPort(), nil - return 0, 0, nil -} - -// HandleUnknownDestinationPacket handles packets targeted at this protocol but -// that don't match any existing endpoint. -func (p *protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, buffer.VectorisedView) bool { - return true -} - -// SetOption implements TransportProtocol.SetOption. 
-func (p *protocol) SetOption(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -// Option implements TransportProtocol.Option. -func (p *protocol) Option(option interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption -} - -func init() { - stack.RegisterTransportProtocolFactory(ProtocolName, func() stack.TransportProtocol { - return &protocol{} - }) -} +package udp + +import ( + "netstack/tcpip" + "netstack/tcpip/buffer" + "netstack/tcpip/header" + "netstack/tcpip/stack" + "netstack/waiter" +) + +const ( + // ProtocolName is the string representation of the udp protocol name. + ProtocolName = "udp" + + // ProtocolNumber is the udp protocol number. + ProtocolNumber = header.UDPProtocolNumber +) + +// tcpip.Endpoint 接口的UDP协议实现 +type protocol struct{} + +// Number returns the udp protocol number. +func (*protocol) Number() tcpip.TransportProtocolNumber { + return ProtocolNumber +} + +// NewEndpoint creates a new udp endpoint. +func (*protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, + waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { + return newEndpoint(stack, netProto, waiterQueue), nil +} + +// MinimumPacketSize returns the minimum valid udp packet size. +func (*protocol) MinimumPacketSize() int { + return header.UDPMinimumSize +} + +// ParsePorts returns the source and destination ports stored in the given udp +// packet. +func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { + //h := header.UDP(v) + //return h.SourcePort(), h.DestinationPort(), nil + return 0, 0, nil +} + +// HandleUnknownDestinationPacket handles packets targeted at this protocol but +// that don't match any existing endpoint. +func (p *protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, buffer.VectorisedView) bool { + return true +} + +// SetOption implements TransportProtocol.SetOption. 
+func (p *protocol) SetOption(option interface{}) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +// Option implements TransportProtocol.Option. +func (p *protocol) Option(option interface{}) *tcpip.Error { + return tcpip.ErrUnknownProtocolOption +} + +func init() { + stack.RegisterTransportProtocolFactory(ProtocolName, func() stack.TransportProtocol { + return &protocol{} + }) +} diff --git a/tcpip/transport/udp/udp_packet_list.go b/tcpip/transport/udp/udp_packet_list.go index c965ed8..6eeafff 100644 --- a/tcpip/transport/udp/udp_packet_list.go +++ b/tcpip/transport/udp/udp_packet_list.go @@ -1,174 +1,174 @@ -package udp - -// ElementMapper provides an identity mapping by default. -// -// This can be replaced to provide a struct that maps elements to linker -// objects, if they are not the same. An ElementMapper is not typically -// required if: Linker is left as is, Element is left as is, or Linker and -// Element are the same type. -type udpPacketElementMapper struct{} - -// linkerFor maps an Element to a Linker. -// -// This default implementation should be inlined. -// -//go:nosplit -func (udpPacketElementMapper) linkerFor(elem *udpPacket) *udpPacket { return elem } - -// List is an intrusive list. Entries can be added to or removed from the list -// in O(1) time and with no additional memory allocations. -// -// The zero value for List is an empty list ready to use. -// -// To iterate over a list (where l is a List): -// for e := l.Front(); e != nil; e = e.Next() { -// // do something with e. -// } -// -// +stateify savable -// udp数据报的双向链表结构 -type udpPacketList struct { - head *udpPacket - tail *udpPacket -} - -// Reset resets list l to the empty state. -func (l *udpPacketList) Reset() { - l.head = nil - l.tail = nil -} - -// Empty returns true iff the list is empty. -func (l *udpPacketList) Empty() bool { - return l.head == nil -} - -// Front returns the first element of list l or nil. 
-func (l *udpPacketList) Front() *udpPacket { - return l.head -} - -// Back returns the last element of list l or nil. -func (l *udpPacketList) Back() *udpPacket { - return l.tail -} - -// PushFront inserts the element e at the front of list l. -func (l *udpPacketList) PushFront(e *udpPacket) { - udpPacketElementMapper{}.linkerFor(e).SetNext(l.head) - udpPacketElementMapper{}.linkerFor(e).SetPrev(nil) - - if l.head != nil { - udpPacketElementMapper{}.linkerFor(l.head).SetPrev(e) - } else { - l.tail = e - } - - l.head = e -} - -// PushBack inserts the element e at the back of list l. -func (l *udpPacketList) PushBack(e *udpPacket) { - udpPacketElementMapper{}.linkerFor(e).SetNext(nil) - udpPacketElementMapper{}.linkerFor(e).SetPrev(l.tail) - - if l.tail != nil { - udpPacketElementMapper{}.linkerFor(l.tail).SetNext(e) - } else { - l.head = e - } - - l.tail = e -} - -// PushBackList inserts list m at the end of list l, emptying m. -func (l *udpPacketList) PushBackList(m *udpPacketList) { - if l.head == nil { - l.head = m.head - l.tail = m.tail - } else if m.head != nil { - udpPacketElementMapper{}.linkerFor(l.tail).SetNext(m.head) - udpPacketElementMapper{}.linkerFor(m.head).SetPrev(l.tail) - - l.tail = m.tail - } - - m.head = nil - m.tail = nil -} - -// InsertAfter inserts e after b. -func (l *udpPacketList) InsertAfter(b, e *udpPacket) { - a := udpPacketElementMapper{}.linkerFor(b).Next() - udpPacketElementMapper{}.linkerFor(e).SetNext(a) - udpPacketElementMapper{}.linkerFor(e).SetPrev(b) - udpPacketElementMapper{}.linkerFor(b).SetNext(e) - - if a != nil { - udpPacketElementMapper{}.linkerFor(a).SetPrev(e) - } else { - l.tail = e - } -} - -// InsertBefore inserts e before a. 
-func (l *udpPacketList) InsertBefore(a, e *udpPacket) { - b := udpPacketElementMapper{}.linkerFor(a).Prev() - udpPacketElementMapper{}.linkerFor(e).SetNext(a) - udpPacketElementMapper{}.linkerFor(e).SetPrev(b) - udpPacketElementMapper{}.linkerFor(a).SetPrev(e) - - if b != nil { - udpPacketElementMapper{}.linkerFor(b).SetNext(e) - } else { - l.head = e - } -} - -// Remove removes e from l. -func (l *udpPacketList) Remove(e *udpPacket) { - prev := udpPacketElementMapper{}.linkerFor(e).Prev() - next := udpPacketElementMapper{}.linkerFor(e).Next() - - if prev != nil { - udpPacketElementMapper{}.linkerFor(prev).SetNext(next) - } else { - l.head = next - } - - if next != nil { - udpPacketElementMapper{}.linkerFor(next).SetPrev(prev) - } else { - l.tail = prev - } -} - -// Entry is a default implementation of Linker. Users can add anonymous fields -// of this type to their structs to make them automatically implement the -// methods needed by List. -// -// +stateify savable -type udpPacketEntry struct { - next *udpPacket - prev *udpPacket -} - -// Next returns the entry that follows e in the list. -func (e *udpPacketEntry) Next() *udpPacket { - return e.next -} - -// Prev returns the entry that precedes e in the list. -func (e *udpPacketEntry) Prev() *udpPacket { - return e.prev -} - -// SetNext assigns 'entry' as the entry that follows e in the list. -func (e *udpPacketEntry) SetNext(elem *udpPacket) { - e.next = elem -} - -// SetPrev assigns 'entry' as the entry that precedes e in the list. -func (e *udpPacketEntry) SetPrev(elem *udpPacket) { - e.prev = elem -} +package udp + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type udpPacketElementMapper struct{} + +// linkerFor maps an Element to a Linker. 
+// +// This default implementation should be inlined. +// +//go:nosplit +func (udpPacketElementMapper) linkerFor(elem *udpPacket) *udpPacket { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +// udp数据报的双向链表结构 +type udpPacketList struct { + head *udpPacket + tail *udpPacket +} + +// Reset resets list l to the empty state. +func (l *udpPacketList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *udpPacketList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *udpPacketList) Front() *udpPacket { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *udpPacketList) Back() *udpPacket { + return l.tail +} + +// PushFront inserts the element e at the front of list l. +func (l *udpPacketList) PushFront(e *udpPacket) { + udpPacketElementMapper{}.linkerFor(e).SetNext(l.head) + udpPacketElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + udpPacketElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *udpPacketList) PushBack(e *udpPacket) { + udpPacketElementMapper{}.linkerFor(e).SetNext(nil) + udpPacketElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + udpPacketElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. 
+func (l *udpPacketList) PushBackList(m *udpPacketList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + udpPacketElementMapper{}.linkerFor(l.tail).SetNext(m.head) + udpPacketElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *udpPacketList) InsertAfter(b, e *udpPacket) { + a := udpPacketElementMapper{}.linkerFor(b).Next() + udpPacketElementMapper{}.linkerFor(e).SetNext(a) + udpPacketElementMapper{}.linkerFor(e).SetPrev(b) + udpPacketElementMapper{}.linkerFor(b).SetNext(e) + + if a != nil { + udpPacketElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *udpPacketList) InsertBefore(a, e *udpPacket) { + b := udpPacketElementMapper{}.linkerFor(a).Prev() + udpPacketElementMapper{}.linkerFor(e).SetNext(a) + udpPacketElementMapper{}.linkerFor(e).SetPrev(b) + udpPacketElementMapper{}.linkerFor(a).SetPrev(e) + + if b != nil { + udpPacketElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *udpPacketList) Remove(e *udpPacket) { + prev := udpPacketElementMapper{}.linkerFor(e).Prev() + next := udpPacketElementMapper{}.linkerFor(e).Next() + + if prev != nil { + udpPacketElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + udpPacketElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type udpPacketEntry struct { + next *udpPacket + prev *udpPacket +} + +// Next returns the entry that follows e in the list. +func (e *udpPacketEntry) Next() *udpPacket { + return e.next +} + +// Prev returns the entry that precedes e in the list. 
+func (e *udpPacketEntry) Prev() *udpPacket { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *udpPacketEntry) SetNext(elem *udpPacket) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *udpPacketEntry) SetPrev(elem *udpPacket) { + e.prev = elem +} diff --git a/tmutex/tmutex.go b/tmutex/tmutex.go index 78d95e0..34e3bb0 100644 --- a/tmutex/tmutex.go +++ b/tmutex/tmutex.go @@ -1,51 +1,51 @@ -package tmutex - -import ( - "sync/atomic" -) - -type Mutex struct { - v int32 - ch chan struct{} -} - -func (m *Mutex) Init() { - m.v = 1 - m.ch = make(chan struct{}, 1) -} - -func (m *Mutex) Lock() { - // ==0时 只有一个锁持有者 - if atomic.AddInt32(&m.v, -1) == 0 { - return - } - // !=0时 有多个想持有锁者 - for { - if v := atomic.LoadInt32(&m.v);v >= 0 && atomic.SwapInt32(&m.v, -1) == 1 { - return - } - <-m.ch // 排队阻塞 等待锁释放 - } -} - -func (m *Mutex) TryLock() bool { - v := atomic.LoadInt32(&m.v) - if v <= 0 { - return false - } - // CAS操作需要输入两个数值,一个旧值(期望操作前的值)和一个新值, - // 在操作期间先比较下旧值有没有发生变化, - // 如果没有发生变化,才交换成新值,发生了变化则不交换。 - return atomic.CompareAndSwapInt32(&m.v, 1, 0) -} - -func (m *Mutex) Unlock() { - if atomic.SwapInt32(&m.v, 1) == 0 { // 没有任何持有者 - return - } - - select { - case m.ch <- struct{}{}: - default: - } -} +package tmutex + +import ( + "sync/atomic" +) + +type Mutex struct { + v int32 + ch chan struct{} +} + +func (m *Mutex) Init() { + m.v = 1 + m.ch = make(chan struct{}, 1) +} + +func (m *Mutex) Lock() { + // ==0时 只有一个锁持有者 + if atomic.AddInt32(&m.v, -1) == 0 { + return + } + // !=0时 有多个想持有锁者 + for { + if v := atomic.LoadInt32(&m.v);v >= 0 && atomic.SwapInt32(&m.v, -1) == 1 { + return + } + <-m.ch // 排队阻塞 等待锁释放 + } +} + +func (m *Mutex) TryLock() bool { + v := atomic.LoadInt32(&m.v) + if v <= 0 { + return false + } + // CAS操作需要输入两个数值,一个旧值(期望操作前的值)和一个新值, + // 在操作期间先比较下旧值有没有发生变化, + // 如果没有发生变化,才交换成新值,发生了变化则不交换。 + return atomic.CompareAndSwapInt32(&m.v, 1, 0) +} + +func (m *Mutex) 
Unlock() { + if atomic.SwapInt32(&m.v, 1) == 0 { // 没有任何持有者 + return + } + + select { + case m.ch <- struct{}{}: + default: + } +} diff --git a/tmutex/tmutex_test.go b/tmutex/tmutex_test.go index fb058e7..9043bcf 100644 --- a/tmutex/tmutex_test.go +++ b/tmutex/tmutex_test.go @@ -1,47 +1,47 @@ -package tmutex - -import ( - "fmt" - "runtime" - "testing" - "time" -) - -func TestBasicLock(t *testing.T) { - var race = 0 - var m Mutex - m.Init() - - m.Lock() - - go func(){ - m.Lock() - race++ - m.Unlock() - }() - - go func(){ - m.Lock() - race++ - m.Unlock() - }() - - runtime.Gosched() // 让渡cpu - race++ - - m.Unlock() - - time.Sleep(time.Second) -} - -func TestShutOut(t *testing.T) { - - a := 1 - if a < 3 || func() bool { - fmt.Println("ShutOut") - return false - }() { - t.Logf("Ok\n") - } - -} +package tmutex + +import ( + "fmt" + "runtime" + "testing" + "time" +) + +func TestBasicLock(t *testing.T) { + var race = 0 + var m Mutex + m.Init() + + m.Lock() + + go func(){ + m.Lock() + race++ + m.Unlock() + }() + + go func(){ + m.Lock() + race++ + m.Unlock() + }() + + runtime.Gosched() // 让渡cpu + race++ + + m.Unlock() + + time.Sleep(time.Second) +} + +func TestShutOut(t *testing.T) { + + a := 1 + if a < 3 || func() bool { + fmt.Println("ShutOut") + return false + }() { + t.Logf("Ok\n") + } + +} diff --git a/waiter/waiter.go b/waiter/waiter.go index 7998d4e..25b2f97 100644 --- a/waiter/waiter.go +++ b/waiter/waiter.go @@ -1,240 +1,240 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -// Package waiter provides the implementation of a wait queue, where waiters can -// be enqueued to be notified when an event of interest happens. -// -// Becoming readable and/or writable are examples of events. Waiters are -// expected to use a pattern similar to this to make a blocking function out of -// a non-blocking one: -// -// func (o *object) blockingRead(...) error { -// err := o.nonBlockingRead(...) -// if err != ErrAgain { -// // Completed with no need to wait! -// return err -// } -// -// e := createOrGetWaiterEntry(...) -// o.EventRegister(&e, waiter.EventIn) -// defer o.EventUnregister(&e) -// -// // We need to try to read again after registration because the -// // object may have become readable between the last attempt to -// // read and read registration. -// err = o.nonBlockingRead(...) -// for err == ErrAgain { -// wait() -// err = o.nonBlockingRead(...) -// } -// -// return err -// } -// -// Another goroutine needs to notify waiters when events happen. For example: -// -// func (o *object) Write(...) ... { -// // Do write work. -// [...] -// -// if oldDataAvailableSize == 0 && dataAvailableSize > 0 { -// // If no data was available and now some data is -// // available, the object became readable, so notify -// // potential waiters about this. -// o.Notify(waiter.EventIn) -// } -// } -package waiter - -import ( - "sync" - - "netstack/ilist" -) - -// EventMask represents io events as used in the poll() syscall. -type EventMask uint16 - -// Events that waiters can wait on. The meaning is the same as those in the -// poll() syscall. -const ( - EventIn EventMask = 0x01 // syscall.EPOLLIN - EventPri EventMask = 0x02 // syscall.EPOLLPRI - EventOut EventMask = 0x04 // syscall.EPOLLOUT - EventErr EventMask = 0x08 // syscall.EPOLLERR - EventHUp EventMask = 0x10 // syscall.EPOLLHUP - EventNVal EventMask = 0x20 // Not defined in syscall. 
-) - -// Waitable contains the methods that need to be implemented by waitable -// objects. -type Waitable interface { - // Readiness returns what the object is currently ready for. If it's - // not ready for a desired purpose, the caller may use EventRegister and - // EventUnregister to get notifications once the object becomes ready. - // - // Implementations should allow for events like EventHUp and EventErr - // to be returned regardless of whether they are in the input EventMask. - Readiness(mask EventMask) EventMask - - // EventRegister registers the given waiter entry to receive - // notifications when an event occurs that makes the object ready for - // at least one of the events in mask. - EventRegister(e *Entry, mask EventMask) - - // EventUnregister unregisters a waiter entry previously registered with - // EventRegister(). - EventUnregister(e *Entry) -} - -// EntryCallback provides a notify callback. -type EntryCallback interface { - // Callback is the function to be called when the waiter entry is - // notified. It is responsible for doing whatever is needed to wake up - // the waiter. - // - // The callback is supposed to perform minimal work, and cannot call - // any method on the queue itself because it will be locked while the - // callback is running. - Callback(e *Entry) -} - -// Entry represents a waiter that can be add to the a wait queue. It can -// only be in one queue at a time, and is added "intrusively" to the queue with -// no extra memory allocations. -// -// +stateify savable -type Entry struct { - // Context stores any state the waiter may wish to store in the entry - // itself, which may be used at wake up time. - // - // Note that use of this field is optional and state may alternatively be - // stored in the callback itself. - Context interface{} - - Callback EntryCallback - - // The following fields are protected by the queue lock. 
- mask EventMask - ilist.Entry -} - -type channelCallback struct{} - -// Callback implements EntryCallback.Callback. -func (*channelCallback) Callback(e *Entry) { - ch := e.Context.(chan struct{}) - select { - case ch <- struct{}{}: - default: - } -} - -// NewChannelEntry initializes a new Entry that does a non-blocking write to a -// struct{} channel when the callback is called. It returns the new Entry -// instance and the channel being used. -// -// If a channel isn't specified (i.e., if "c" is nil), then NewChannelEntry -// allocates a new channel. -func NewChannelEntry(c chan struct{}) (Entry, chan struct{}) { - if c == nil { - c = make(chan struct{}, 1) - } - - return Entry{Context: c, Callback: &channelCallback{}}, c -} - -// Queue represents the wait queue where waiters can be added and -// notifiers can notify them when events happen. -// -// The zero value for waiter.Queue is an empty queue ready for use. -// -// +stateify savable -type Queue struct { - list ilist.List - mu sync.RWMutex -} - -// EventRegister adds a waiter to the wait queue; the waiter will be notified -// when at least one of the events specified in mask happens. -func (q *Queue) EventRegister(e *Entry, mask EventMask) { - q.mu.Lock() - e.mask = mask - q.list.PushBack(e) - q.mu.Unlock() -} - -// EventUnregister removes the given waiter entry from the wait queue. -func (q *Queue) EventUnregister(e *Entry) { - q.mu.Lock() - q.list.Remove(e) - q.mu.Unlock() -} - -// Notify notifies all waiters in the queue whose masks have at least one bit -// in common with the notification mask. -func (q *Queue) Notify(mask EventMask) { - q.mu.RLock() - for it := q.list.Front(); it != nil; it = it.Next() { - e := it.(*Entry) - if mask&e.mask != 0 { - e.Callback.Callback(e) - } - } - q.mu.RUnlock() -} - -// Events returns the set of events being waited on. It is the union of the -// masks of all registered entries. 
-func (q *Queue) Events() EventMask { - ret := EventMask(0) - - q.mu.RLock() - for it := q.list.Front(); it != nil; it = it.Next() { - e := it.(*Entry) - ret |= e.mask - } - q.mu.RUnlock() - - return ret -} - -// IsEmpty returns if the wait queue is empty or not. -func (q *Queue) IsEmpty() bool { - q.mu.Lock() - defer q.mu.Unlock() - - return q.list.Front() == nil -} - -// AlwaysReady implements the Waitable interface but is always ready. Embedding -// this struct into another struct makes it implement the boilerplate empty -// functions automatically. -type AlwaysReady struct { -} - -// Readiness always returns the input mask because this object is always ready. -func (*AlwaysReady) Readiness(mask EventMask) EventMask { - return mask -} - -// EventRegister doesn't do anything because this object doesn't need to issue -// notifications because its readiness never changes. -func (*AlwaysReady) EventRegister(*Entry, EventMask) { -} - -// EventUnregister doesn't do anything because this object doesn't need to issue -// notifications because its readiness never changes. -func (*AlwaysReady) EventUnregister(e *Entry) { -} +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package waiter provides the implementation of a wait queue, where waiters can +// be enqueued to be notified when an event of interest happens. +// +// Becoming readable and/or writable are examples of events. 
Waiters are +// expected to use a pattern similar to this to make a blocking function out of +// a non-blocking one: +// +// func (o *object) blockingRead(...) error { +// err := o.nonBlockingRead(...) +// if err != ErrAgain { +// // Completed with no need to wait! +// return err +// } +// +// e := createOrGetWaiterEntry(...) +// o.EventRegister(&e, waiter.EventIn) +// defer o.EventUnregister(&e) +// +// // We need to try to read again after registration because the +// // object may have become readable between the last attempt to +// // read and read registration. +// err = o.nonBlockingRead(...) +// for err == ErrAgain { +// wait() +// err = o.nonBlockingRead(...) +// } +// +// return err +// } +// +// Another goroutine needs to notify waiters when events happen. For example: +// +// func (o *object) Write(...) ... { +// // Do write work. +// [...] +// +// if oldDataAvailableSize == 0 && dataAvailableSize > 0 { +// // If no data was available and now some data is +// // available, the object became readable, so notify +// // potential waiters about this. +// o.Notify(waiter.EventIn) +// } +// } +package waiter + +import ( + "sync" + + "netstack/ilist" +) + +// EventMask represents io events as used in the poll() syscall. +type EventMask uint16 + +// Events that waiters can wait on. The meaning is the same as those in the +// poll() syscall. +const ( + EventIn EventMask = 0x01 // syscall.EPOLLIN + EventPri EventMask = 0x02 // syscall.EPOLLPRI + EventOut EventMask = 0x04 // syscall.EPOLLOUT + EventErr EventMask = 0x08 // syscall.EPOLLERR + EventHUp EventMask = 0x10 // syscall.EPOLLHUP + EventNVal EventMask = 0x20 // Not defined in syscall. +) + +// Waitable contains the methods that need to be implemented by waitable +// objects. +type Waitable interface { + // Readiness returns what the object is currently ready for. 
If it's + // not ready for a desired purpose, the caller may use EventRegister and + // EventUnregister to get notifications once the object becomes ready. + // + // Implementations should allow for events like EventHUp and EventErr + // to be returned regardless of whether they are in the input EventMask. + Readiness(mask EventMask) EventMask + + // EventRegister registers the given waiter entry to receive + // notifications when an event occurs that makes the object ready for + // at least one of the events in mask. + EventRegister(e *Entry, mask EventMask) + + // EventUnregister unregisters a waiter entry previously registered with + // EventRegister(). + EventUnregister(e *Entry) +} + +// EntryCallback provides a notify callback. +type EntryCallback interface { + // Callback is the function to be called when the waiter entry is + // notified. It is responsible for doing whatever is needed to wake up + // the waiter. + // + // The callback is supposed to perform minimal work, and cannot call + // any method on the queue itself because it will be locked while the + // callback is running. + Callback(e *Entry) +} + +// Entry represents a waiter that can be add to the a wait queue. It can +// only be in one queue at a time, and is added "intrusively" to the queue with +// no extra memory allocations. +// +// +stateify savable +type Entry struct { + // Context stores any state the waiter may wish to store in the entry + // itself, which may be used at wake up time. + // + // Note that use of this field is optional and state may alternatively be + // stored in the callback itself. + Context interface{} + + Callback EntryCallback + + // The following fields are protected by the queue lock. + mask EventMask + ilist.Entry +} + +type channelCallback struct{} + +// Callback implements EntryCallback.Callback. 
+func (*channelCallback) Callback(e *Entry) { + ch := e.Context.(chan struct{}) + select { + case ch <- struct{}{}: + default: + } +} + +// NewChannelEntry initializes a new Entry that does a non-blocking write to a +// struct{} channel when the callback is called. It returns the new Entry +// instance and the channel being used. +// +// If a channel isn't specified (i.e., if "c" is nil), then NewChannelEntry +// allocates a new channel. +func NewChannelEntry(c chan struct{}) (Entry, chan struct{}) { + if c == nil { + c = make(chan struct{}, 1) + } + + return Entry{Context: c, Callback: &channelCallback{}}, c +} + +// Queue represents the wait queue where waiters can be added and +// notifiers can notify them when events happen. +// +// The zero value for waiter.Queue is an empty queue ready for use. +// +// +stateify savable +type Queue struct { + list ilist.List + mu sync.RWMutex +} + +// EventRegister adds a waiter to the wait queue; the waiter will be notified +// when at least one of the events specified in mask happens. +func (q *Queue) EventRegister(e *Entry, mask EventMask) { + q.mu.Lock() + e.mask = mask + q.list.PushBack(e) + q.mu.Unlock() +} + +// EventUnregister removes the given waiter entry from the wait queue. +func (q *Queue) EventUnregister(e *Entry) { + q.mu.Lock() + q.list.Remove(e) + q.mu.Unlock() +} + +// Notify notifies all waiters in the queue whose masks have at least one bit +// in common with the notification mask. +func (q *Queue) Notify(mask EventMask) { + q.mu.RLock() + for it := q.list.Front(); it != nil; it = it.Next() { + e := it.(*Entry) + if mask&e.mask != 0 { + e.Callback.Callback(e) + } + } + q.mu.RUnlock() +} + +// Events returns the set of events being waited on. It is the union of the +// masks of all registered entries. 
+func (q *Queue) Events() EventMask { + ret := EventMask(0) + + q.mu.RLock() + for it := q.list.Front(); it != nil; it = it.Next() { + e := it.(*Entry) + ret |= e.mask + } + q.mu.RUnlock() + + return ret +} + +// IsEmpty returns if the wait queue is empty or not. +func (q *Queue) IsEmpty() bool { + q.mu.Lock() + defer q.mu.Unlock() + + return q.list.Front() == nil +} + +// AlwaysReady implements the Waitable interface but is always ready. Embedding +// this struct into another struct makes it implement the boilerplate empty +// functions automatically. +type AlwaysReady struct { +} + +// Readiness always returns the input mask because this object is always ready. +func (*AlwaysReady) Readiness(mask EventMask) EventMask { + return mask +} + +// EventRegister doesn't do anything because this object doesn't need to issue +// notifications because its readiness never changes. +func (*AlwaysReady) EventRegister(*Entry, EventMask) { +} + +// EventUnregister doesn't do anything because this object doesn't need to issue +// notifications because its readiness never changes. +func (*AlwaysReady) EventUnregister(e *Entry) { +} diff --git a/waiter/waiter_test.go b/waiter/waiter_test.go index 60853f9..d0590e9 100644 --- a/waiter/waiter_test.go +++ b/waiter/waiter_test.go @@ -1,192 +1,192 @@ -// Copyright 2018 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package waiter - -import ( - "sync/atomic" - "testing" -) - -type callbackStub struct { - f func(e *Entry) -} - -// Callback implements EntryCallback.Callback. -func (c *callbackStub) Callback(e *Entry) { - c.f(e) -} - -func TestEmptyQueue(t *testing.T) { - var q Queue - - // Notify the zero-value of a queue. - q.Notify(EventIn) - - // Register then unregister a waiter, then notify the queue. - cnt := 0 - e := Entry{Callback: &callbackStub{func(*Entry) { cnt++ }}} - q.EventRegister(&e, EventIn) - q.EventUnregister(&e) - q.Notify(EventIn) - if cnt != 0 { - t.Errorf("Callback was called when it shouldn't have been") - } -} - -func TestMask(t *testing.T) { - // Register a waiter. - var q Queue - var cnt int - e := Entry{Callback: &callbackStub{func(*Entry) { cnt++ }}} - q.EventRegister(&e, EventIn|EventErr) - - // Notify with an overlapping mask. - cnt = 0 - q.Notify(EventIn | EventOut) - if cnt != 1 { - t.Errorf("Callback wasn't called when it should have been") - } - - // Notify with a subset mask. - cnt = 0 - q.Notify(EventIn) - if cnt != 1 { - t.Errorf("Callback wasn't called when it should have been") - } - - // Notify with a superset mask. - cnt = 0 - q.Notify(EventIn | EventErr | EventOut) - if cnt != 1 { - t.Errorf("Callback wasn't called when it should have been") - } - - // Notify with the exact same mask. - cnt = 0 - q.Notify(EventIn | EventErr) - if cnt != 1 { - t.Errorf("Callback wasn't called when it should have been") - } - - // Notify with a disjoint mask. - cnt = 0 - q.Notify(EventOut | EventHUp) - if cnt != 0 { - t.Errorf("Callback was called when it shouldn't have been") - } -} - -func TestConcurrentRegistration(t *testing.T) { - var q Queue - var cnt int - const concurrency = 1000 - - ch1 := make(chan struct{}) - ch2 := make(chan struct{}) - ch3 := make(chan struct{}) - - // Create goroutines that will all register/unregister concurrently. 
- for i := 0; i < concurrency; i++ { - go func() { - var e Entry - e.Callback = &callbackStub{func(entry *Entry) { - cnt++ - if entry != &e { - t.Errorf("entry = %p, want %p", entry, &e) - } - }} - - // Wait for notification, then register. - <-ch1 - q.EventRegister(&e, EventIn|EventErr) - - // Tell main goroutine that we're done registering. - ch2 <- struct{}{} - - // Wait for notification, then unregister. - <-ch3 - q.EventUnregister(&e) - - // Tell main goroutine that we're done unregistering. - ch2 <- struct{}{} - }() - } - - // Let the goroutines register. - close(ch1) - for i := 0; i < concurrency; i++ { - <-ch2 - } - - // Issue a notification. - q.Notify(EventIn) - if cnt != concurrency { - t.Errorf("cnt = %d, want %d", cnt, concurrency) - } - - // Let the goroutine unregister. - close(ch3) - for i := 0; i < concurrency; i++ { - <-ch2 - } - - // Issue a notification. - q.Notify(EventIn) - if cnt != concurrency { - t.Errorf("cnt = %d, want %d", cnt, concurrency) - } -} - -func TestConcurrentNotification(t *testing.T) { - var q Queue - var cnt int32 - const concurrency = 1000 - const waiterCount = 1000 - - // Register waiters. - for i := 0; i < waiterCount; i++ { - var e Entry - e.Callback = &callbackStub{func(entry *Entry) { - atomic.AddInt32(&cnt, 1) - if entry != &e { - t.Errorf("entry = %p, want %p", entry, &e) - } - }} - - q.EventRegister(&e, EventIn|EventErr) - } - - // Launch notifiers. - ch1 := make(chan struct{}) - ch2 := make(chan struct{}) - for i := 0; i < concurrency; i++ { - go func() { - <-ch1 - q.Notify(EventIn) - ch2 <- struct{}{} - }() - } - - // Let notifiers go. - close(ch1) - for i := 0; i < concurrency; i++ { - <-ch2 - } - - // Check the count. - if cnt != concurrency*waiterCount { - t.Errorf("cnt = %d, want %d", cnt, concurrency*waiterCount) - } -} +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package waiter + +import ( + "sync/atomic" + "testing" +) + +type callbackStub struct { + f func(e *Entry) +} + +// Callback implements EntryCallback.Callback. +func (c *callbackStub) Callback(e *Entry) { + c.f(e) +} + +func TestEmptyQueue(t *testing.T) { + var q Queue + + // Notify the zero-value of a queue. + q.Notify(EventIn) + + // Register then unregister a waiter, then notify the queue. + cnt := 0 + e := Entry{Callback: &callbackStub{func(*Entry) { cnt++ }}} + q.EventRegister(&e, EventIn) + q.EventUnregister(&e) + q.Notify(EventIn) + if cnt != 0 { + t.Errorf("Callback was called when it shouldn't have been") + } +} + +func TestMask(t *testing.T) { + // Register a waiter. + var q Queue + var cnt int + e := Entry{Callback: &callbackStub{func(*Entry) { cnt++ }}} + q.EventRegister(&e, EventIn|EventErr) + + // Notify with an overlapping mask. + cnt = 0 + q.Notify(EventIn | EventOut) + if cnt != 1 { + t.Errorf("Callback wasn't called when it should have been") + } + + // Notify with a subset mask. + cnt = 0 + q.Notify(EventIn) + if cnt != 1 { + t.Errorf("Callback wasn't called when it should have been") + } + + // Notify with a superset mask. + cnt = 0 + q.Notify(EventIn | EventErr | EventOut) + if cnt != 1 { + t.Errorf("Callback wasn't called when it should have been") + } + + // Notify with the exact same mask. + cnt = 0 + q.Notify(EventIn | EventErr) + if cnt != 1 { + t.Errorf("Callback wasn't called when it should have been") + } + + // Notify with a disjoint mask. 
+ cnt = 0 + q.Notify(EventOut | EventHUp) + if cnt != 0 { + t.Errorf("Callback was called when it shouldn't have been") + } +} + +func TestConcurrentRegistration(t *testing.T) { + var q Queue + var cnt int + const concurrency = 1000 + + ch1 := make(chan struct{}) + ch2 := make(chan struct{}) + ch3 := make(chan struct{}) + + // Create goroutines that will all register/unregister concurrently. + for i := 0; i < concurrency; i++ { + go func() { + var e Entry + e.Callback = &callbackStub{func(entry *Entry) { + cnt++ + if entry != &e { + t.Errorf("entry = %p, want %p", entry, &e) + } + }} + + // Wait for notification, then register. + <-ch1 + q.EventRegister(&e, EventIn|EventErr) + + // Tell main goroutine that we're done registering. + ch2 <- struct{}{} + + // Wait for notification, then unregister. + <-ch3 + q.EventUnregister(&e) + + // Tell main goroutine that we're done unregistering. + ch2 <- struct{}{} + }() + } + + // Let the goroutines register. + close(ch1) + for i := 0; i < concurrency; i++ { + <-ch2 + } + + // Issue a notification. + q.Notify(EventIn) + if cnt != concurrency { + t.Errorf("cnt = %d, want %d", cnt, concurrency) + } + + // Let the goroutine unregister. + close(ch3) + for i := 0; i < concurrency; i++ { + <-ch2 + } + + // Issue a notification. + q.Notify(EventIn) + if cnt != concurrency { + t.Errorf("cnt = %d, want %d", cnt, concurrency) + } +} + +func TestConcurrentNotification(t *testing.T) { + var q Queue + var cnt int32 + const concurrency = 1000 + const waiterCount = 1000 + + // Register waiters. + for i := 0; i < waiterCount; i++ { + var e Entry + e.Callback = &callbackStub{func(entry *Entry) { + atomic.AddInt32(&cnt, 1) + if entry != &e { + t.Errorf("entry = %p, want %p", entry, &e) + } + }} + + q.EventRegister(&e, EventIn|EventErr) + } + + // Launch notifiers. 
+ ch1 := make(chan struct{}) + ch2 := make(chan struct{}) + for i := 0; i < concurrency; i++ { + go func() { + <-ch1 + q.Notify(EventIn) + ch2 <- struct{}{} + }() + } + + // Let notifiers go. + close(ch1) + for i := 0; i < concurrency; i++ { + <-ch2 + } + + // Check the count. + if cnt != concurrency*waiterCount { + t.Errorf("cnt = %d, want %d", cnt, concurrency*waiterCount) + } +}