【Linux网络协议分析】网络协议剖析之IP转发介绍
说明
Linux内核网络协议栈的实现遵循TCP/IP协议分层机制。IP转发处于TCP/IP网络分层中的网络层转发部分。这一层的主要作用就是通过查询路由表进行路由转发。通过网络层的路由转发可以实现不同网段的主机间的通信。如下是TCP/IP网络分层模型:
本节主要对网络层转发进行剖析。以IPv4转发为例。
Linux内核中IP转发流程
IP整体转发框架如下图所示:
IP层收包流程概述:
(1) 在inet_init中注册了类型为ETH_P_IP协议的数据包回调函数ip_rcv
(2)当二层数据包接收完毕,会调用netif_receive_skb根据协议进行上层分发
(3)类型为ETH_P_IP的数据包,会被传送到网络层,调用ip_rcv函数进行处理
(4)ip_rcv完成基本的校验和处理工作后,经过PRE_ROUTING钩子点
(5)经过PRE_ROUTING钩子点之后,调用ip_rcv_finish完成数据包接收,包括选项处理,路由查询,并根据路由决定是丢弃数据包、发往本机还是发往网络中。
下面对IP转发流程主要函数进行剖析:
1、ip_rcv --> 网络协议栈IP报文入口函数。先调用ip_rcv_core函数对IP报文合法性等进行检查,经过PRE_ROUTING钩子点后,最后调用ip_rcv_finish函数进行后续的路由和转发处理。
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net(dev);
skb = ip_rcv_core(skb, net); ---》IP数据包主处理函数,数据包合法性检测等… if (skb == NULL) return NET_RX_DROP;
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTIN) --》数据包转发之前,先进行neifilter检测 net, NULL, skb, dev, NULL, ip_rcv_finish); } |
2、ip_rcv_core(skb,net)
//IP报文主处理函数,对IP报文的合法性等进行检查,
/* * Main IP Receive routine. */ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; u32 len;
/* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ //当设备处于混杂模式下,对于非本机,非广播,非多播的数据帧,设置其pkt_type=PACKET_OTHERHOS,此类数据包丢弃处理 if (skb->pkt_type == PACKET_OTHERHOST) goto drop;
__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); //snmp协议
skb = skb_share_check(skb, GFP_ATOMIC); //如果skb共享,复制一份 if (!skb) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto out; }
//确保skb->data指向的内存包含的数据至少为IP头部大小,由于每个IP数据包包括IP分片必须包含一个完整的IP头部。如果小于IP头部 //大小,则缺失的部分将从数据分片中拷贝。这些分片保存在skb_shinfo(skb)->frags[]中 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
iph = ip_hdr(skb);
/* * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum. * * Is the datagram acceptable? * * 1. Length at least the size of an ip header * 2. Version of 4 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] * 4. Doesn't have a bogus length */
if (iph->ihl < 5 || iph->version != 4) //IP头最小20字节,偏移*4 goto inhdr_error;
BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); __IP_ADD_STATS(net, IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error;
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error;
len = ntohs(iph->tot_len); if (skb->len < len) { __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error;
/* Our transport medium may have padded the buffer out. Now we know it * is IP we can trim to the true length of the frame. * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; }
iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4;
/* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->skb_iif;
/* Must drop socket now because of tproxy. */ skb_orphan(skb);
return skb;
csum_error: __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); out: return NULL; } |
3、ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
// 在ip_rcv函数中,首先对报文合法性进行检查,最后会调用NF_HOOK函数,将报文发往netfilter的 NF_INET_PRE_ROUTING节点,如下:
NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip_rcv_finish)。当PRE_ROUTING上的所有钩子函数都执行完毕并返回NF_ACCEPT
后,数据包会交由ip_rcv_finish函数。这个函数的主要功能是对数据包做路由选择处理,决定数据包是上送本机还是转发到远端网络中。
(1)static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret;
/* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) return NET_RX_SUCCESS;
ret = ip_rcv_finish_core(net, sk, skb, dev); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; } (2) dst_input --> ip_local_deliver() //如果报文是发往本机的则调用此函数进行本机处理 --> ip_forward() // 如果报文是发往网络中其他主机,则调用此函数进行处理,并发到网络中 /* * Deliver IP Packets to the higher protocol layers. */ int ip_local_deliver(struct sk_buff *skb) { /* * Reassemble IP fragments. */ struct net *net = dev_net(skb->dev);
//检查报文是否是分片报文,如果是分片报文,进行报文重组; if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; }
//挂载NF_INET_LOCAL_IN钩子函数。数据包传入NETFILTER(LOCAL_IN)过滤。最后经由ip_local_deliver_finish()上送的L4层进行处理; return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } //发往非本机报文处理流程 //远端路由转发,查找路由表,从指定的接口向网络转发 (1) int ip_forward(struct sk_buff *skb) { u32 mtu; struct iphdr *iph; /* Our header */ struct rtable *rt; /* Route we use */ struct ip_options *opt = &(IPCB(skb)->opt); struct net *net;
/* that should never happen */ if (skb->pkt_type != PACKET_HOST) goto drop;
if (unlikely(skb->sk)) goto drop;
if (skb_warn_if_lro(skb)) goto drop;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) goto drop;
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) return NET_RX_SUCCESS;
skb_forward_csum(skb); net = dev_net(skb->dev);
/* * According to the RFC, we must first decrease the TTL field. If * that reaches zero, we must reply an ICMP control message telling * that the packet's lifetime expired. */ if (ip_hdr(skb)->ttl <= 1) goto too_many_hops;
if (!xfrm4_route_forward(skb)) goto drop;
rt = skb_rtable(skb);
if (opt->is_strictroute && rt->rt_gw_family) goto sr_failed;
IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); if (ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); goto drop; }
/* We are about to mangle packet. Copy it! */ //检查skb是否共享,或是否头预留有足够的空间粗放L2的头部,因为在转发此数据包的时候需要拷贝L2的头部进去; if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len)) goto drop; iph = ip_hdr(skb);
/* Decrease ttl after skb cow done */ ip_decrease_ttl(iph); //经历了一跳路由,ip头中TTL值减一
/* * We now generate an ICMP HOST REDIRECT giving the route * we calculated. */ if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr && !skb_sec_path(skb)) ip_rt_send_redirect(skb);
if (net->ipv4.sysctl_ip_fwd_update_priority) skb->priority = rt_tos2priority(iph->tos);
//报文送入NETFILTER处理后,调用ip_forward_finish()转发 return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, net, NULL, skb, skb->dev, rt->dst.dev, ip_forward_finish);
sr_failed: /* * Strict routing permits no gatewaying */ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); goto drop;
too_many_hops: /* Tell the sender its packet died... */ __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); return NET_RX_DROP; }
(2) ip_forward_finish// static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt);
__IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
#ifdef CONFIG_NET_SWITCHDEV if (skb->offload_l3_fwd_mark) { consume_skb(skb); return 0; } #endif
if (unlikely(opt->optlen)) ip_forward_options(skb); //IP报文头处理
skb->tstamp = 0; return dst_output(net, sk, skb); //转发IP报文 } (3) //ip转发函数,经过NETFILTER(POST_ROUTING)后,调用ip_finish_output函数转发 所有要发送的数据包,要发送到网络中的其他主机都要经过dst_output发送到目标网络。这时IP协议头已经处理完毕,协议头中嵌入了要发送的信息和本机系统要加入的信息。dis_output调用函数指针skb_output,skb_output根据目标地址类型进行初始化,目标地址为某一个主机地址是,skb_output初始化为 ip_output,目标地址是组播发送地址时就出事化为ip_mac_output。ip_output函数会初始化数据包的输出网络设备和传输协议,最后进入neitfilter的 POST_ROUTING链处理钩子函数,POST_ROUTING链上的钩子函数处理结束后钓友ip_finish_output函数 : int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev;
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
skb->dev = dev; skb->protocol = htons(ETH_P_IP);
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); }
//发送IP数据包 static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; int ret;
ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); if (ret) { kfree_skb(skb); return ret; }
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(net, sk, skb); } #endif mtu = ip_skb_dst_mtu(sk, skb); //获取对端MTU if (skb_is_gso(skb)) //检查网卡是否开启gso,如果网卡没有开启gso就在CPU进行分片发送。否则直接发送给网卡,由网卡进行分片发送; return ip_finish_output_gso(net, sk, skb, mtu);
if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) //网卡没有开启gso,对于超MTU的包,由CPU进行分片转发。 return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
return ip_finish_output2(net, sk, skb); } |
- 点赞
- 收藏
- 关注作者
评论(0)