【内核源码】linux UDP实现
对于不设置MSG_MORE和UDP_CORK的通常情况, udp_sendmsg查找路由和获取ip选项后,直接调用ip_make_skb()来创建skb,并调用udp_send_skb发送,由ip层来执行分片
对于需要cork的情况,调用ip_append_data()来添加数据到队列
ip_make_skb
struct sk_buff *ip_make_skb(struct sock *sk,
struct flowi4 *fl4,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, struct rtable **rtp,
unsigned int flags)
{
struct inet_cork cork;
struct sk_buff_head queue;
int err;
if (flags & MSG_PROBE)
return NULL;
__skb_queue_head_init(&queue);
cork.flags = 0;
cork.addr = 0;
cork.opt = NULL;
err = ip_setup_cork(sk, &cork, ipc, rtp);
if (err)
return ERR_PTR(err);
err = __ip_append_data(sk, fl4, &queue, &cork, //创建添加copy数据到skb, 并放入queue中
¤t->task_frag, getfrag,
from, length, transhdrlen, flags);
if (err) {
__ip_flush_pending_frames(sk, &queue, &cork);
return ERR_PTR(err);
}
return __ip_make_skb(sk, fl4, &queue, &cork); //把queue中所有skb都合并到一个skb及其frag_list中, 并设置好ip头
}
ip_append_data
/*
* ip_append_data() and ip_append_page() can make one large IP datagram
* from many pieces of data. Each pieces will be holded on the socket
* until ip_push_pending_frames() is called. Each piece can be a page
* or non-page data.
*
* Not only UDP, other transport protocols - e.g. raw sockets - can use
* this interface potentially.
*
* LATER: length must be adjusted by pad at tail, when it is required.
*/
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm_cookie *ipc, struct rtable **rtp,
unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
int err;
if (flags&MSG_PROBE)
return 0;
if (skb_queue_empty(&sk->sk_write_queue)) {
err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); //sk_write_queue为空,初始化cork
if (err)
return err;
} else {
transhdrlen = 0;
}
return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, //添加skb到sk_write_queue
sk_page_frag(sk), getfrag,
from, length, transhdrlen, flags);
}
__ip_append_data
- 如果支持UFO则调用ip_ufo_append_data
- 尝试添加到队列sk_write_queue尾部skb,如果head空间够则copy到head,否则根据是否支持sg,创建新的skb或者添加到frag中
static int __ip_append_data(struct sock *sk,
struct flowi4 *fl4,
struct sk_buff_head *queue,
struct inet_cork *cork,
struct page_frag *pfrag,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
struct sk_buff *skb;
struct ip_options *opt = cork->opt;
int hh_len;
int exthdrlen;
int mtu;
int copy;
int err;
int offset = 0;
unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst;
u32 tskey = 0;
skb = skb_peek_tail(queue);
exthdrlen = !skb ? rt->dst.header_len : 0;
mtu = cork->fragsize;
if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
tskey = sk->sk_tskey++;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; //ip包长,8对齐
maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
if (cork->length + length > maxnonfragsize - fragheaderlen) { //超过最大包长
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
mtu - (opt ? opt->optlen : 0));
return -EMSGSIZE;
}
/*
* transhdrlen > 0 means that this is the first fragment and we wish
* it won't be fragmented in the future.
*/
if (transhdrlen && // sizeof(struct udphdr)
length + fragheaderlen <= mtu &&
rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
!(flags & MSG_MORE) &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL; //硬件计算校验和
cork->length += length;
if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
(sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { //支持ufo
err = ip_ufo_append_data(sk, queue, getfrag, from, length, //创建ufo skb,添加数据到frag中
hh_len, fragheaderlen, transhdrlen,
maxfraglen, flags);
if (err)
goto error;
return 0;
}
/* So, what's going on in the loop below?
*
* We use calculated fragment length to generate chained skb,
* each of segments is IP fragment ready for sending to network after
* adding appropriate IP header.
*/
//下面是不支持gso/ufo的处理
if (!skb) //队列中没有skb,则需要创建新的
goto alloc_new_skb;
while (length > 0) {
/* Check if the remaining data fits into current packet. */ //检查当前skb是否还由空间能copy
copy = mtu - skb->len;
if (copy < length)
copy = maxfraglen - skb->len;
if (copy <= 0) { //剩余空间不够,创建新的skb插入
char *data;
unsigned int datalen;
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
if (skb_prev)
fraggap = skb_prev->len - maxfraglen;
else
fraggap = 0;
/*
* If remaining data exceeds the mtu,
* we know we need more fragment(s).
*/
datalen = length + fraggap;
if (datalen > mtu - fragheaderlen)
datalen = maxfraglen - fragheaderlen;
fraglen = datalen + fragheaderlen;
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
else
alloclen = fraglen;
alloclen += exthdrlen;
/* The last fragment gets additional space at tail.
* Note, with MSG_MORE we overallocate on fragments,
* because we have no idea what fragment will be
* the last.
*/
if (datalen == length + fraggap)
alloclen += rt->dst.trailer_len;
if (transhdrlen) {
skb = sock_alloc_send_skb(sk,
alloclen + hh_len + 15,
(flags & MSG_DONTWAIT), &err);
} else {
skb = NULL;
if (atomic_read(&sk->sk_wmem_alloc) <=
2 * sk->sk_sndbuf)
skb = sock_wmalloc(sk,
alloclen + hh_len + 15, 1,
sk->sk_allocation);
if (unlikely(!skb))
err = -ENOBUFS;
}
if (!skb)
goto error;
/*
* Fill in the control structures
*/
skb->ip_summed = csummode;
skb->csum = 0;
skb_reserve(skb, hh_len);
/* only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
/*
* Find where to start putting bytes.
*/
data = skb_put(skb, fraglen + exthdrlen);
skb_set_network_header(skb, exthdrlen);
skb->transport_header = (skb->network_header +
fragheaderlen);
data += fragheaderlen + exthdrlen;
if (fraggap) {
skb->csum = skb_copy_and_csum_bits(
skb_prev, maxfraglen,
data + transhdrlen, fraggap, 0);
skb_prev->csum = csum_sub(skb_prev->csum,
skb->csum);
data += fraggap;
pskb_trim_unique(skb_prev, maxfraglen);
}
copy = datalen - transhdrlen - fraggap;
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
}
offset += copy;
length -= datalen - fraggap;
transhdrlen = 0;
exthdrlen = 0;
csummode = CHECKSUM_NONE;
/*
* Put the packet on the pending queue.
*/
__skb_queue_tail(queue, skb); //把新的skb插入队列
continue;
}
if (copy > length)
copy = length;
if (!(rt->dst.dev->features&NETIF_F_SG)) { //不支持sg,copy数据到head区
unsigned int off;
off = skb->len;
if (getfrag(from, skb_put(skb, copy), //ip_generic_getfrag
offset, copy, off, skb) < 0) {
__skb_trim(skb, off);
err = -EFAULT;
goto error;
}
} else { //支持sg,copy数据到frag
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
if (!sk_page_frag_refill(sk, pfrag))
goto error;
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
err = -EMSGSIZE;
if (i == MAX_SKB_FRAGS)
goto error;
__skb_fill_page_desc(skb, i, pfrag->page,
pfrag->offset, 0);
skb_shinfo(skb)->nr_frags = ++i;
get_page(pfrag->page);
}
copy = min_t(int, copy, pfrag->size - pfrag->offset);
if (getfrag(from,
page_address(pfrag->page) + pfrag->offset,
offset, copy, skb->len, skb) < 0)
goto error_efault;
pfrag->offset += copy;
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
atomic_add(copy, &sk->sk_wmem_alloc);
}
offset += copy;
length -= copy;
}
return 0;
error_efault:
err = -EFAULT;
error:
cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
return err;
}
ip_ufo_append_data
static inline int ip_ufo_append_data(struct sock *sk,
struct sk_buff_head *queue,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int hh_len, int fragheaderlen,
int transhdrlen, int maxfraglen, unsigned int flags)
{
struct sk_buff *skb;
int err;
/* There is support for UDP fragmentation offload by network
* device, so create one single skb packet containing complete
* udp datagram
*/
skb = skb_peek_tail(queue);
if (!skb) {
skb = sock_alloc_send_skb(sk, //分配skb空间
hh_len + fragheaderlen + transhdrlen + 20, //这部分在skb head区
(flags & MSG_DONTWAIT), &err);
if (!skb)
return err;
/* reserve space for Hardware header */
skb_reserve(skb, hh_len);
/* create space for UDP/IP header */
skb_put(skb, fragheaderlen + transhdrlen);
/* initialize network header pointer */
skb_reset_network_header(skb);
/* initialize protocol header pointer */
skb->transport_header = skb->network_header + fragheaderlen;
skb->csum = 0;
__skb_queue_tail(queue, skb); //把这个不带数据部分的skb,插入队列中
} else if (skb_is_gso(skb)) {
goto append;
}
skb->ip_summed = CHECKSUM_PARTIAL;
/* specify the length of each IP datagram fragment */
skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
append:
return skb_append_datato_frags(sk, skb, getfrag, from, //copy数据部分到frag
(length - transhdrlen));
}
int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
int (*getfrag)(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, int length)
{
int frg_cnt = skb_shinfo(skb)->nr_frags;
int copy;
int offset = 0;
int ret;
struct page_frag *pfrag = ¤t->task_frag;
do {
/* Return error if we don't have space for new frag */
if (frg_cnt >= MAX_SKB_FRAGS)
return -EMSGSIZE;
if (!sk_page_frag_refill(sk, pfrag))
return -ENOMEM;
/* copy the user data to page */
copy = min_t(int, length, pfrag->size - pfrag->offset);
ret = getfrag(from, page_address(pfrag->page) + pfrag->offset, //ip_generic_getfrag
offset, copy, 0, skb);
if (ret < 0)
return -EFAULT;
/* copy was successful so update the size parameters */
skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
copy);
frg_cnt++;
pfrag->offset += copy;
get_page(pfrag->page);
skb->truesize += copy;
atomic_add(copy, &sk->sk_wmem_alloc);
skb->len += copy;
skb->data_len += copy;
offset += copy;
length -= copy;
} while (length > 0);
return 0;
}
int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
struct msghdr *msg = from;
if (skb->ip_summed == CHECKSUM_PARTIAL) {
if (copy_from_iter(to, len, &msg->msg_iter) != len)
return -EFAULT;
} else {
__wsum csum = 0;
if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len)
return -EFAULT;
skb->csum = csum_block_add(skb->csum, csum, odd);
}
return 0;
}
udp_send_skb
udp_send_skb()->ip_send_skb()->ip_local_out()
static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
{
/*
* Create a UDP header
*/
uh = udp_hdr(skb);
uh->source = inet->inet_sport;
uh->dest = fl4->fl4_dport;
uh->len = htons(len);
uh->check = 0;
if (is_udplite) /* UDP-Lite */
csum = udplite_csum(skb);
else if (sk->sk_no_check_tx) { /* UDP csum disabled */
skb->ip_summed = CHECKSUM_NONE;
goto send;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
goto send;
} else
csum = udp_csum(skb);
/* add protocol-dependent pseudo-header */
uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
sk->sk_protocol, csum);
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
send:
err = ip_send_skb(sock_net(sk), skb);
if (err) {
if (err == -ENOBUFS && !inet->recverr) {
UDP_INC_STATS(sock_net(sk),
UDP_MIB_SNDBUFERRORS, is_udplite);
err = 0;
}
} else
UDP_INC_STATS(sock_net(sk),
UDP_MIB_OUTDATAGRAMS, is_udplite);
return err;
}
udp_recvmsg
udp_recvmsg()只要是调用__skb_recv_datagram()从sk_receive_queue取出一个skb,然后调用skb_copy_datagram_msg拷贝数据到用户态
int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct sk_buff *skb;
unsigned int ulen, copied;
int peeked, peeking, off;
int err;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
bool slow;
if (flags & MSG_ERRQUEUE)
return ip_recv_error(sk, msg, len, addr_len); //返回sk_error_queue队列中的数据
try_again:
peeking = off = sk_peek_offset(sk, flags);
skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
&peeked, &off, &err);
if (!skb)
return err;
ulen = skb->len;
copied = len;
if (copied > ulen - off)
copied = ulen - off;
else if (copied < ulen)
msg->msg_flags |= MSG_TRUNC;
/*
* If checksum is needed at all, try to do it while copying the
* data. If the data is truncated, or if we only want a partial
* coverage checksum (UDP-Lite), do it before the copy.
*/
if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
checksum_valid = !udp_lib_checksum_complete(skb);
if (!checksum_valid)
goto csum_copy_err;
}
if (checksum_valid || skb_csum_unnecessary(skb))
err = skb_copy_datagram_msg(skb, off, msg, copied);
else {
err = skb_copy_and_csum_datagram_msg(skb, off, msg);
if (err == -EINVAL)
goto csum_copy_err;
}
if (unlikely(err)) {
trace_kfree_skb(skb, udp_recvmsg);
if (!peeked) {
atomic_inc(&sk->sk_drops);
UDP_INC_STATS(sock_net(sk),
UDP_MIB_INERRORS, is_udplite);
}
skb_free_datagram_locked(sk, skb);
return err;
}
if (!peeked)
UDP_INC_STATS(sock_net(sk),
UDP_MIB_INDATAGRAMS, is_udplite);
sock_recv_ts_and_drops(msg, sk, skb);
/* Copy the address. */
if (sin) {
sin->sin_family = AF_INET;
sin->sin_port = udp_hdr(skb)->source;
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
*addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr), off);
err = copied;
if (flags & MSG_TRUNC)
err = ulen;
__skb_free_datagram_locked(sk, skb, peeking ? -err : err);
return err;
csum_copy_err:
slow = lock_sock_fast(sk);
if (!skb_kill_datagram(sk, skb, flags)) {
UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
}
unlock_sock_fast(sk, slow);
/* starting over for a new packet, but check if we need to yield */
cond_resched();
msg->msg_flags &= ~MSG_TRUNC;
goto try_again;
}
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
int *peeked, int *off, int *err)
{
struct sk_buff *skb, *last;
long timeo;
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
skb = __skb_try_recv_datagram(sk, flags, peeked, off, err, //从sk_receive_queue取出一个skb
&last);
if (skb)
return skb;
if (*err != -EAGAIN)
break;
} while (timeo &&
!__skb_wait_for_more_packets(sk, err, &timeo, last)); //阻塞等待
return NULL;
}
udp_rcv
在中断下半部会调用udp_rcv来进行udp协议的处理
主要是调用__udp4_lib_lookup_skb在udp hash中查找要接收该skb的socket,
然后调用udp_queue_rcv_skb把skb放到socket的sk_receive_queue中
int udp_rcv(struct sk_buff *skb)
{
return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
}
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
struct sock *sk;
struct udphdr *uh;
unsigned short ulen;
struct rtable *rt = skb_rtable(skb);
__be32 saddr, daddr;
struct net *net = dev_net(skb->dev);
/*
* Validate the packet.
*/
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto drop; /* No space for header. */
uh = udp_hdr(skb);
ulen = ntohs(uh->len);
saddr = ip_hdr(skb)->saddr;
daddr = ip_hdr(skb)->daddr;
if (ulen > skb->len)
goto short_packet;
if (proto == IPPROTO_UDP) {
/* UDP validates ulen. */
if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
goto short_packet;
uh = udp_hdr(skb);
}
/* UDP只提供有限的校验能力,如果不计算,则发送时为全0
* 校验和初始化,若udp头部的校验和字段为0,则设置不必校验标志位,若校验由硬
* 完成,且伪首部通过校验,则同样设置不必校验标志,若以上都不满足,则将伪首
* 部的校验和给skb->csum,之后还需要对数据部分进行校验*/
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
sk = skb_steal_sock(skb);
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
if (unlikely(sk->sk_rx_dst != dst))
udp_sk_rx_dst_set(sk, dst);
ret = udp_queue_rcv_skb(sk, skb);
sock_put(sk);
/* a return value > 0 means to resubmit the input, but
* it wants the return to be -protocol, or 0
*/
if (ret > 0)
return -ret;
return 0;
}
if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
return __udp4_lib_mcast_deliver(net, skb, uh, // 组播和广播处理,所有满足条件的sk都能接收
saddr, daddr, udptable, proto);
sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); //在hash表中查找满足条件的sk
if (sk) {
int ret;
if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
inet_compute_pseudo);
ret = udp_queue_rcv_skb(sk, skb);
/* a return value > 0 means to resubmit the input, but
* it wants the return to be -protocol, or 0
*/
if (ret > 0)
return -ret;
return 0;
}
//没有满足条件的sk
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
nf_reset(skb);
//对数据包进行校验和,如果校验失败则安静地丢弃
/* No socket. Drop packet silently, if checksum is wrong */
if (udp_lib_checksum_complete(skb))
goto csum_error;
//校验成功,发送端口不可达ICMP
__UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
/*
* Hmm. We got an UDP packet to a port to which we
* don't wanna listen. Ignore it.
*/
kfree_skb(skb);
return 0;
short_packet:
net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
proto == IPPROTO_UDPLITE ? "Lite" : "",
&saddr, ntohs(uh->source),
ulen, skb->len,
&daddr, ntohs(uh->dest));
goto drop;
csum_error:
/*
* RFC1122: OK. Discards the bad packet silently (as far as
* the network is concerned, anyway) as per 4.1.3.4 (MUST).
*/
net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
proto == IPPROTO_UDPLITE ? "Lite" : "",
&saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
ulen);
__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
drop:
__UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
kfree_skb(skb);
return 0;
}
static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct udphdr *uh,
__be32 saddr, __be32 daddr,
struct udp_table *udptable,
int proto)
{
struct sock *sk, *first = NULL;
unsigned short hnum = ntohs(uh->dest);
struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
unsigned int offset = offsetof(typeof(*sk), sk_node);
int dif = skb->dev->ifindex;
struct hlist_node *node;
struct sk_buff *nskb;
if (use_hash2) {
hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
udptable->mask;
hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
hslot = &udptable->hash2[hash2];
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr, //检查sk是否满足接收条件,并判断组播是否允许通过
uh->source, saddr, dif, hnum))
continue;
if (!first) {
first = sk;
continue;
}
nskb = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!nskb)) {
atomic_inc(&sk->sk_drops);
__UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
IS_UDPLITE(sk));
__UDP_INC_STATS(net, UDP_MIB_INERRORS,
IS_UDPLITE(sk));
continue;
}
if (udp_queue_rcv_skb(sk, nskb) > 0) //所有满足条件的sk都能接收
consume_skb(nskb);
}
/* Also lookup *:port if we are using hash2 and haven't done so yet. */
if (use_hash2 && hash2 != hash2_any) { //再遍历INADDR_ANY的情况
hash2 = hash2_any;
goto start_lookup;
}
if (first) {
if (udp_queue_rcv_skb(first, skb) > 0) //第一个sk使用原始skb,不用clone了
consume_skb(skb);
} else {// 说明没有满足条件的sk,直接丢弃
kfree_skb(skb);
__UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
proto == IPPROTO_UDPLITE);
}
return 0;
}
__udp4_lib_lookup_skb
__udp4_lib_lookup_skb在两个udp hash中,较短的那个链表中查找。
查找的时候跟tcp一样通过compute_score来打分,最高分数的socket来进行接收。 如果是reuseport,则使用reuseport对应的算法来选择sock
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
__be16 sport, __be32 daddr, __be16 dport,
int dif, struct udp_table *udptable, struct sk_buff *skb)
{
struct sock *sk, *result;
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); //根据目的端口和net来hash1
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
int score, badness, matches = 0, reuseport = 0;
u32 hash = 0;
if (hslot->count > 10) { //超过10个, 在hash2中查找
hash2 = udp4_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count) //哪个链表短查哪个
goto begin;
result = udp4_lib_lookup2(net, saddr, sport, //跟begin相同的逻辑
daddr, hnum, dif,
hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); //再差INADDR_ANY的情况
slot2 = hash2 & udptable->mask;
/* avoid searching the same slot again. */
if (unlikely(slot2 == old_slot2))
return result;
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
hslot2, skb);
}
return result;
}
begin:
result = NULL;
badness = 0;
sk_for_each_rcu(sk, &hslot->head) {
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr)); //根据reuseport算法选择一个sock直接返回
if (result)
return result;
matches = 1;
}
result = sk; //使用最大分数的sk作为结果返回
badness = score;
} else if (score == badness && reuseport) { //和最大分数相等,并且最大分数的sk是reuseport
matches++;
if (reciprocal_scale(hash, matches) == 0) // 判断当前sk能否作为结果
result = sk;
hash = next_pseudo_random32(hash);
}
}
return result;
}
static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
__be16 sport, __be16 dport,
struct udp_table *udptable)
{
const struct iphdr *iph = ip_hdr(skb);
return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
udptable, skb);
}
udp_queue_rcv_skb
/* returns:
* -1: error
* 0: success
* >0: "udp encap" protocol resubmission
*
* Note that in the success and error cases, the skb is assumed to
* have either been requeued or freed.
*/
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int rc;
int is_udplite = IS_UDPLITE(sk);
/*
* Charge it to the socket, dropping if the queue is full.
*/
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto drop;
nf_reset(skb);
if (static_key_false(&udp_encap_needed) && up->encap_type) { //udp tunnel处理
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
/*
* This is an encapsulation socket so pass the skb to
* the socket's udp_encap_rcv() hook. Otherwise, just
* fall through and pass this up the UDP socket.
* up->encap_rcv() returns the following value:
* =0 if skb was successfully passed to the encap
* handler or was discarded by it.
* >0 if skb should be passed on to UDP.
* <0 if skb should be resubmitted as proto -N
*/
/* if we're overly short, let UDP handle it */
encap_rcv = ACCESS_ONCE(up->encap_rcv);
if (encap_rcv) {
int ret;
/* Verify checksum before giving to encap */
if (udp_lib_checksum_complete(skb)) //校验
goto csum_error;
ret = encap_rcv(sk, skb);
if (ret <= 0) {
__UDP_INC_STATS(sock_net(sk),
UDP_MIB_INDATAGRAMS,
is_udplite);
return -ret;
}
}
/* FALLTHROUGH -- it's a UDP Packet */
}
/*
* UDP-Lite specific tests, ignored on UDP sockets
*/
if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { //udplite处理
...
}
if (rcu_access_pointer(sk->sk_filter) &&
udp_lib_checksum_complete(skb))
goto csum_error;
if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
goto drop;
udp_csum_pull_header(skb);
if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { //接收缓存满了
__UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
goto drop;
}
rc = 0;
ipv4_pktinfo_prepare(sk, skb);
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
rc = __udp_queue_rcv_skb(sk, skb); //应用程序没在使用,添加到sk_receive_queue队列
else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { //应用程序在使用,添加到backlog中,等release_lock的是否处理
bh_unlock_sock(sk);
goto drop;
}
bh_unlock_sock(sk);
return rc;
csum_error:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
kfree_skb(skb);
return -1;
}
由上可知,udp非常简单,只有两个hash表,一个backlog队列,以及sk_receive_queue和sk_write_queue。
本文作者: bhpike65