【内核源码】linux UDP实现

个人博客

对于不设置 MSG_MORE 和 UDP_CORK 的通常情况,udp_sendmsg() 在查找路由并获取 IP 选项后,直接调用 ip_make_skb() 创建 skb,再调用 udp_send_skb() 发送,分片由 IP 层执行。
对于需要 cork 的情况,则调用 ip_append_data() 把数据添加到队列中。

ip_make_skb

struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;
	if (flags & MSG_PROBE)
		return NULL;
	__skb_queue_head_init(&queue);
	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);
	err = __ip_append_data(sk, fl4, &queue, &cork,	//创建添加copy数据到skb, 并放入queue中
			       ¤t->task_frag, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}
	return __ip_make_skb(sk, fl4, &queue, &cork);	//把queue中所有skb都合并到一个skb及其frag_list中, 并设置好ip头
}
ip_append_data
/*
 *	ip_append_data() queues one more piece of data on the socket's
 *	write queue (sk->sk_write_queue) using the socket's own cork state.
 *	Pieces added this way are meant to end up in one large IP datagram
 *	and stay on the socket until ip_push_pending_frames() is called.
 *
 *	When the write queue is empty the cork is initialised first via
 *	ip_setup_cork(); otherwise @transhdrlen is forced to 0 before the
 *	data is handed to __ip_append_data().
 *
 *	The interface is not specific to UDP; other transport protocols,
 *	e.g. raw sockets, can potentially use it as well.
 *
 *	Returns 0 when MSG_PROBE is set in @flags, the error from
 *	ip_setup_cork() if that fails, otherwise the result of
 *	__ip_append_data().
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_cork *cork = &inet->cork.base;
	struct sk_buff_head *pending = &sk->sk_write_queue;

	if (flags & MSG_PROBE)
		return 0;

	if (!skb_queue_empty(pending)) {
		/* Data is already pending on the write queue. */
		transhdrlen = 0;
	} else {
		/* Write queue is empty: initialise the cork. */
		int rc = ip_setup_cork(sk, cork, ipc, rtp);

		if (rc)
			return rc;
	}

	/* Append the new data to the socket write queue. */
	return __ip_append_data(sk, fl4, pending, cork,
				sk_page_frag(sk), getfrag,
				from, length, transhdrlen, flags);
}
__ip_append_data
/*
 *	__ip_append_data() takes @length bytes, fetched through @getfrag
 *	from @from, and places them into the skbs held on @queue. Room left
 *	in the tail skb is used first; when it runs out a new skb is
 *	allocated, filled and queued. If the UFO conditions checked below
 *	hold, the work is delegated to ip_ufo_append_data() instead.
 *
 *	@cork supplies the route (cork->dst), the fragment size
 *	(cork->fragsize), the IP options and the running total
 *	cork->length, which is increased by @length.
 *	@pfrag is the page fragment used on the scatter-gather path.
 *	@transhdrlen is reset to 0 once the first new skb has been built.
 *
 *	Returns 0 on success. Returns -EMSGSIZE straight away (after
 *	ip_local_error()) when the data would exceed the maximum
 *	non-fragmentable size. Any other failure goes through the "error"
 *	label, which subtracts the not-yet-consumed length from
 *	cork->length, bumps IPSTATS_MIB_OUTDISCARDS and returns the negative
 *	error code.
 */
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    struct page_frag *pfrag,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;
	u32 tskey = 0;
	skb = skb_peek_tail(queue);
	/* dst.header_len is only accounted for when the queue is still empty */
	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;
	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;
	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;	/* IP packet length; the payload part is rounded down to a multiple of 8 */
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
	if (cork->length + length > maxnonfragsize - fragheaderlen) {	/* exceeds the maximum packet length */
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - (opt ? opt->optlen : 0));
		return -EMSGSIZE;
	}
	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&	/* for UDP callers presumably sizeof(struct udphdr) */
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
	    !(flags & MSG_MORE) &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;	/* let the hardware compute the checksum */
	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
	    (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {	/* UFO is supported */
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,	/* build the UFO skb and append the data to its frags */
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}
	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */
	/* Below: handling for the case where the GSO/UFO path was not taken. */
	if (!skb)	/* no skb in the queue, so a new one must be created */
		goto alloc_new_skb;
	while (length > 0) {
		/* Check if the remaining data fits into current packet, i.e. whether the current skb still has room to copy into. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {	/* not enough room left: create a new skb and queue it */
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			/* fraggap: bytes of the previous skb that lie beyond maxfraglen and are moved into the new skb below */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;
			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;
			alloclen += exthdrlen;
			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				/* only allocate while sk_wmem_alloc is at most twice sk_sndbuf */
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			/* only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;
			if (fraggap) {
				/* move the overhanging bytes from skb_prev into the new skb and fix up both checksums */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}
			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;
			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);	/* insert the new skb into the queue */
			continue;
		}
		if (copy > length)
			copy = length;
		if (!(rt->dst.dev->features&NETIF_F_SG)) {	/* no scatter-gather: copy the data into the linear (head) area */
			unsigned int off;
			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),	/* e.g. ip_generic_getfrag */
					offset, copy, off, skb) < 0) {
				/* undo the skb_put() above */
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {	/* scatter-gather supported: copy the data into a page frag */
			int i = skb_shinfo(skb)->nr_frags;
			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				/* cannot extend the last frag: start a new, initially empty, frag slot */
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;
				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;
			/* account for the copied bytes in the page frag, the skb and the socket */
			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error_efault:
	err = -EFAULT;
error:
	/* give back the part of @length that was never consumed */
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
ip_ufo_append_data
/*
 *	ip_ufo_append_data() is used when the device supports UDP
 *	fragmentation offload: the complete UDP datagram is kept in one
 *	single skb. If @queue is empty, a header-only skb is allocated and
 *	queued first. The skb is marked CHECKSUM_PARTIAL / SKB_GSO_UDP
 *	unless it is an already-queued skb that is GSO-marked, and the
 *	payload is then appended to its page frags.
 *
 *	Returns the error reported by sock_alloc_send_skb() if allocation
 *	fails, otherwise the result of skb_append_datato_frags().
 */
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *tail = skb_peek_tail(queue);
	int setup_gso = 1;
	int rc;

	if (tail) {
		/* An skb that is already GSO-marked keeps its settings. */
		setup_gso = !skb_is_gso(tail);
	} else {
		/* Only the headers live in the linear (head) area. */
		tail = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &rc);
		if (!tail)
			return rc;

		/* Leave headroom for the hardware header. */
		skb_reserve(tail, hh_len);

		/* Make room for the UDP/IP headers. */
		skb_put(tail, fragheaderlen + transhdrlen);

		/* Network header starts at the current data pointer. */
		skb_reset_network_header(tail);

		/* Transport header follows the IP header. */
		tail->transport_header = tail->network_header + fragheaderlen;

		tail->csum = 0;

		/* Queue this payload-less skb. */
		__skb_queue_tail(queue, tail);
	}

	if (setup_gso) {
		tail->ip_summed = CHECKSUM_PARTIAL;
		/* Length of each IP datagram fragment. */
		skb_shinfo(tail)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(tail)->gso_type = SKB_GSO_UDP;
	}

	/* Copy the payload into the page frags. */
	return skb_append_datato_frags(sk, tail, getfrag, from,
				       (length - transhdrlen));
}
int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
			int (*getfrag)(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			void *from, int length)
{
	int frg_cnt = skb_shinfo(skb)->nr_frags;
	int copy;
	int offset = 0;
	int ret;
	struct page_frag *pfrag = &curren;t->task_frag;
	do {
		/* Return error if we don't have space for new frag */
		if (frg_cnt >= MAX_SKB_FRAGS)
			return -EMSGSIZE;
		if (!sk_page_frag_refill(sk, pfrag))
			return -ENOMEM;
		/* copy the user data to page */
		copy = min_t(int, length, pfrag->size - pfrag->offset);
		ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,	//ip_generic_getfrag
			      offset, copy, 0, skb);
		if (ret < 0)
			return -EFAULT;
		/* copy was successful so update the size parameters */
		skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
				   copy);
		frg_cnt++;
		pfrag->offset += copy;
		get_page(pfrag->page);
		skb->truesize += copy;
		atomic_add(copy, &sk->sk_wmem_alloc);
		skb->len += copy;
		skb->data_len += copy;
		offset += copy;
		length -= copy;
	} while (length > 0);
	return 0;
}
/*
 *	ip_generic_getfrag() is a getfrag callback that treats @from as a
 *	struct msghdr and copies @len bytes from its msg_iter to @to.
 *
 *	When skb->ip_summed is CHECKSUM_PARTIAL the data is copied as is.
 *	Otherwise the copy also computes a checksum, which is merged into
 *	skb->csum with csum_block_add() using @odd. @offset is not used.
 *
 *	Returns 0 on success, -EFAULT if fewer than @len bytes were copied.
 */
int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd,
		       struct sk_buff *skb)
{
	struct msghdr *msg = from;
	__wsum partial = 0;

	/* No software checksum is accumulated in this case: plain copy. */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		return copy_from_iter(to, len, &msg->msg_iter) == len ? 0 : -EFAULT;

	if (csum_and_copy_from_iter(to, len, &partial, &msg->msg_iter) != len)
		return -EFAULT;

	skb->csum = csum_block_add(skb->csum, partial, odd);
	return 0;
}
(2/4)上一页 下一页| 剩余全文

分享到:
  网友评论(0)
 
回到顶部