【内核源码】linux UDP实现

来源:个人博客     2021-09-06 11:27:50    人气:     我有话说( 0 人参与)

创建udp socket:在socket()创建的时候,会设置对应协议的操作集。inet_dgram_ops是系统调用层直接调用的操作,udp_prot是底层协议的处理。可以看到相比TCP,UDP不用accept()、listen()……

创建udp socket

在socket()创建的时候,会设置对应协议的操作集。 inet_dgram_ops是系统调用层直接调用的操作。udp_prot是底层协议的处理。
可以看到相比TCP,UDP不用accept(),listen(). 但是可以bind(), connect().
直接通过sendto()等发送数据即可

/*
 * Excerpt of __sock_create(): fetch the address-family descriptor
 * registered for `family` and invoke its create() hook.  Lines shown
 * as `...` are elided from this excerpt.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
             struct socket **res, int kern)
{
	...
	pf = rcu_dereference(net_families[family]);	// inet_family_ops in the case discussed here
    err = pf->create(net, sock, protocol, kern);  // i.e. inet_create
    ...
}
/*
 * Excerpt of inet_create(): walk the inetsw[] list for the socket type,
 * pick the matching protocol switch entry, then install its socket-level
 * ops on the socket and remember its protocol handler.  Lines shown as
 * `...` / `..` are elided from this excerpt.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	...
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {	// entries come from inetsw_array via inet_init -> inet_register_protosw
		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
			} else {
			..
		}
	}
	...
	sock->ops = answer->ops;	// inet_dgram_ops for a UDP socket
	answer_prot = answer->prot;	// udp_prot for a UDP socket
	...
}
/*
 * Excerpt of the protocol switch table.  Only the SOCK_DGRAM /
 * IPPROTO_UDP entry is shown: it pairs the protocol handler udp_prot
 * with the socket-level ops inet_dgram_ops.  Other entries are elided.
 */
static struct inet_protosw inetsw_array[] =
{	
	...
	{
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_UDP,
		.prot =       &udp_prot,
		.ops =        &inet_dgram_ops,
		.flags =      INET_PROTOSW_PERMANENT,
       },
   	...
};
/*
 * Socket-level operation table for PF_INET datagram sockets.
 * accept, listen, socketpair and mmap are wired to the sock_no_*()
 * stubs; bind and connect have real handlers (inet_bind,
 * inet_dgram_connect).
 */
const struct proto_ops inet_dgram_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = inet_bind,
	.connect	   = inet_dgram_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = sock_no_accept,
	.getname	   = inet_getname,
	.poll		   = udp_poll,
	.ioctl		   = inet_ioctl,
	.listen		   = sock_no_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
	.set_peek_off	   = sk_set_peek_off,
#ifdef CONFIG_COMPAT
	/* 32-bit compatibility entry points, only built with CONFIG_COMPAT. */
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
	.compat_ioctl	   = inet_compat_ioctl,
#endif
};
/*
 * Protocol-level handler table for UDP over IPv4.  The socket-level
 * ops reach these through sk->sk_prot (e.g. inet_sendmsg calls
 * sk->sk_prot->sendmsg, inet_bind calls sk->sk_prot->get_port).
 */
struct proto udp_prot = {
	.name		   = "UDP",
	.owner		   = THIS_MODULE,
	.close		   = udp_lib_close,
	.connect	   = ip4_datagram_connect,
	.disconnect	   = udp_disconnect,
	.ioctl		   = udp_ioctl,
	.destroy	   = udp_destroy_sock,
	.setsockopt	   = udp_setsockopt,
	.getsockopt	   = udp_getsockopt,
	.sendmsg	   = udp_sendmsg,
	.recvmsg	   = udp_recvmsg,
	.sendpage	   = udp_sendpage,
	.backlog_rcv	   = __udp_queue_rcv_skb,
	.release_cb	   = ip4_datagram_release_cb,
	.hash		   = udp_lib_hash,
	.unhash		   = udp_lib_unhash,
	.rehash		   = udp_v4_rehash,
	.get_port	   = udp_v4_get_port,
	.memory_allocated  = &udp_memory_allocated,
	.sysctl_mem	   = sysctl_udp_mem,
	.sysctl_wmem	   = &sysctl_udp_wmem_min,
	.sysctl_rmem	   = &sysctl_udp_rmem_min,
	.obj_size	   = sizeof(struct udp_sock),
	.h.udp_table	   = &udp_table,	/* hash tables used by udp_lib_get_port() */
#ifdef CONFIG_COMPAT
	/* 32-bit compatibility entry points, only built with CONFIG_COMPAT. */
	.compat_setsockopt = compat_udp_setsockopt,
	.compat_getsockopt = compat_udp_getsockopt,
#endif
	.diag_destroy	   = udp_abort,
};

bind()

bind()->inet_bind()->udp_v4_get_port()

/*
 * Excerpt of inet_bind(): record the requested local address on the
 * socket, then ask the protocol's get_port() hook to claim the port.
 * On failure the address is cleared again and -EADDRINUSE is returned.
 * Lines shown as `...` are elided from this excerpt.
 */
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	...
	lock_sock(sk);
	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;	// record the bound address
	
	/* Make sure we are allowed to bind here. */
	if ((snum || !inet->bind_address_no_port) &&
	    sk->sk_prot->get_port(sk, snum)) {	// udp_v4_get_port for UDP: bind the port
		inet->inet_saddr = inet->inet_rcv_saddr = 0;
		err = -EADDRINUSE;
		goto out_release_sock;
	}
	/* Remember which parts of the local endpoint the user fixed. */
	if (inet->inet_rcv_saddr)
		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
	if (snum)
		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
	inet->inet_sport = htons(inet->inet_num);
	/* A freshly bound socket has no peer; drop any cached route. */
	inet->inet_daddr = 0;
	inet->inet_dport = 0;
	sk_dst_reset(sk);
	err = 0;
out_release_sock:
	release_sock(sk);
out:
	return err;
}
/*
 * IPv4 get_port() hook for UDP.
 *
 * Computes the wildcard-address (INADDR_ANY + snum) secondary hash and a
 * partial secondary hash over the socket's receive address with port 0,
 * stores the partial hash on the socket, and delegates the actual port
 * reservation to udp_lib_get_port().
 *
 * Returns whatever udp_lib_get_port() returns.
 */
int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
	unsigned int wildcard_hash;
	unsigned int addr_only_hash;

	wildcard_hash = udp4_portaddr_hash(sock_net(sk),
					   htonl(INADDR_ANY), snum);
	addr_only_hash = udp4_portaddr_hash(sock_net(sk),
					    inet_sk(sk)->inet_rcv_saddr, 0);

	/* Stash the port-less hash; the port is folded in once it is known. */
	udp_sk(sk)->udp_portaddr_hash = addr_only_hash;

	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, wildcard_hash);
}
/*
 * Excerpt of udp_lib_get_port(): check that local port `snum` is free
 * for `sk` and, if so, record it on the socket and link the socket into
 * both UDP hash tables (primary slot `hslot`, secondary slot `hslot2`).
 *
 * @saddr_comp:     address comparison callback used for conflict checks
 * @hash2_nulladdr: secondary hash of (wildcard address, snum)
 *
 * Returns `error` (0 on the success path shown here).  The automatic
 * port selection branch (snum == 0) and some declarations are elided
 * (`...`) in this excerpt.
 */
int udp_lib_get_port(struct sock *sk, unsigned short snum,
		     int (*saddr_comp)(const struct sock *sk1,
				       const struct sock *sk2,
				       bool match_wildcard),
		     unsigned int hash2_nulladdr)
{
	struct udp_hslot *hslot, *hslot2;
	struct udp_table *udptable = sk->sk_prot->h.udp_table;
	if (!snum) { // snum == 0: auto-bind (e.g. on sendto()); search for a free port
		...
	} else {
		hslot = udp_hashslot(udptable, net, snum);
		spin_lock_bh(&hslot->lock);
		if (hslot->count > 10) { // more than 10 entries: consider the secondary hash
			int exist;
			unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;	// udp_portaddr_hash so far only covers inet_rcv_saddr
			slot2          &= udptable->mask;
			hash2_nulladdr &= udptable->mask;
			hslot2 = udp_hashslot2(udptable, slot2);
			if (hslot->count < hslot2->count)	// scan whichever chain is shorter
				goto scan_primary_hash;
			exist = udp_lib_lport_inuse2(net, snum, hslot2,	// is the port already in use in hash2?
						     sk, saddr_comp);
			if (!exist && (hash2_nulladdr != slot2)) {
				hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
				exist = udp_lib_lport_inuse2(net, snum, hslot2,	// then check the wildcard-address slot
							     sk, saddr_comp);
			}
			if (exist)
				goto fail_unlock;
			else
				goto found;
		}
scan_primary_hash:
		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
					saddr_comp, 0))
			goto fail_unlock;
	}
found:
	inet_sk(sk)->inet_num = snum;
	udp_sk(sk)->udp_port_hash = snum;
	udp_sk(sk)->udp_portaddr_hash ^= snum;	// now an address+port hash
	if (sk_unhashed(sk)) {	// socket is not in the hash tables yet
		if (sk->sk_reuseport &&
		    udp_reuseport_add_sock(sk, hslot, saddr_comp)) { // attach to sk_reuseport_cb
			/* Undo the three assignments made at `found:`. */
			inet_sk(sk)->inet_num = 0;
			udp_sk(sk)->udp_port_hash = 0;
			udp_sk(sk)->udp_portaddr_hash ^= snum;
			goto fail_unlock;
		}
		sk_add_node_rcu(sk, &hslot->head);	// insert into hash1 (keyed by net + port)
		hslot->count++;
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); // insert into hash2 (keyed by address + port)
		spin_lock(&hslot2->lock);
		if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
		    sk->sk_family == AF_INET6)
			hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
					   &hslot2->head);
		else
			hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
					   &hslot2->head);
		hslot2->count++;
		spin_unlock(&hslot2->lock);
	}
	sock_set_flag(sk, SOCK_RCU_FREE);
	error = 0;
fail_unlock:
	spin_unlock_bh(&hslot->lock);
fail:
	return error;
}

bind()会设置inet_rcv_saddr接收地址,以及通过udp_lib_get_port绑定到特定端口, 如果可以绑定,会在hash1和hash2两个hash表的对应链表中都添加

  • 为什么使用两个hash? 两个hash的key为什么不同?
    最初的时候只有一个hash表,hash key为本地端口。当使用了大量ip地址时,有可能导致单个链表上有很多socket,查询变慢。
    添加第二个hash表,使用地址和端口作为key, 来应对上述情况。 查询的时候只用查两个hash链表中较短的那个。

udp secondary hash

connect()

sys_connect()->inet_dgram_connect()->ip4_datagram_connect()
调用connect之后,就不需要sendto()指定目的地址了,直接调用send()即可

/*
 * Locked wrapper around __ip4_datagram_connect(): takes the socket lock,
 * performs the datagram connect, drops the lock and hands the result
 * back to the caller unchanged.
 */
int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	int ret;

	lock_sock(sk);
	ret = __ip4_datagram_connect(sk, uaddr, addr_len);
	release_sock(sk);

	return ret;
}
/*
 * Excerpt of __ip4_datagram_connect(): resolve a route to the peer,
 * fill in any still-unset local addresses from the route, record the
 * peer address/port, mark the socket TCP_ESTABLISHED and cache the
 * route on the socket.  Lines shown as `...` are elided.
 */
int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	...
	// look up the route to the destination
	rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
			      RT_CONN_FLAGS(sk), oif,
			      sk->sk_protocol,
			      inet->inet_sport, usin->sin_port, sk);
	...
	// fill in the source address; if the receive address changes, rehash
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;	/* Update source address */
	if (!inet->inet_rcv_saddr) {
		inet->inet_rcv_saddr = fl4->saddr;
		if (sk->sk_prot->rehash)
			sk->sk_prot->rehash(sk);
	}
	inet->inet_daddr = fl4->daddr;
	inet->inet_dport = usin->sin_port;
	sk->sk_state = TCP_ESTABLISHED;	// udp_sendmsg() tests for this state when no address is passed
	sk_set_txhash(sk);
	inet->inet_id = jiffies;
	sk_dst_set(sk, &rt->dst);
}

sendto

sys_sendto()->sock_sendmsg()->sock_sendmsg_nosec()->inet_sendmsg()

/*
 * Bind the socket to an automatically chosen local port if it does not
 * have one yet.
 *
 * Runs under the socket lock.  When inet_num is still zero, the
 * protocol's get_port() hook is called with port 0; on success the
 * source port is derived from the port that was assigned.
 *
 * Returns 0 on success (or if already bound), -EAGAIN if get_port()
 * fails.
 */
static int inet_autobind(struct sock *sk)
{
	struct inet_sock *isk = inet_sk(sk);
	int rc = 0;

	lock_sock(sk);
	if (!isk->inet_num) {
		if (sk->sk_prot->get_port(sk, 0))
			rc = -EAGAIN;
		else
			isk->inet_sport = htons(isk->inet_num);
	}
	release_sock(sk);

	return rc;
}
/*
 * Socket-level sendmsg for inet sockets.
 *
 * Records the flow for RPS, auto-binds the socket when it has no local
 * port and the protocol does not opt out via no_autobind, then hands
 * the message to the protocol's sendmsg hook (udp_sendmsg for UDP).
 *
 * Returns -EAGAIN if the automatic bind fails, otherwise the protocol
 * handler's return value.
 */
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
	struct sock *sk = sock->sk;

	sock_rps_record_flow(sk);

	/* No bind() was done: pick a free local port now. */
	if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind) {
		if (inet_autobind(sk))
			return -EAGAIN;
	}

	return sk->sk_prot->sendmsg(sk, msg, size);
}
/*
 * udp_sendmsg - send one UDP datagram, or append to a corked one.
 * @sk:  sending socket
 * @msg: user message (payload iterator, optional destination address,
 *       optional control data, flags)
 * @len: payload length in bytes
 *
 * Determines the destination (from msg->msg_name, or from the connected
 * peer when the socket is TCP_ESTABLISHED), gathers IP options, obtains
 * a route, and then either
 *   - builds one skb with ip_make_skb() and sends it with udp_send_skb()
 *     (no corking, lockless path), or
 *   - appends to the socket write queue with ip_append_data() under the
 *     socket lock (UDP_CORK / MSG_MORE, or data already pending).
 *
 * Returns @len on success or a negative errno.
 *
 * Every exit taken after ip_cmsg_send() may have allocated ipc.opt goes
 * through the out/out_free labels so that the options are freed.
 */
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct udp_sock *up = udp_sk(sk);
	struct flowi4 fl4_stack;
	struct flowi4 *fl4;
	int ulen = len;
	struct ipcm_cookie ipc;
	struct rtable *rt = NULL;
	int free = 0;
	int connected = 0;
	__be32 daddr, faddr, saddr;
	__be16 dport;
	u8  tos;
	int err, is_udplite = IS_UDPLITE(sk);
	int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;	/* UDP_CORK or MSG_MORE */
	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
	struct sk_buff *skb;
	struct ip_options_data opt_copy;

	/* The UDP header length field is 16 bits wide. */
	if (len > 0xFFFF)
		return -EMSGSIZE;

	/*
	 *	Check the flags.
	 */
	if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
		return -EOPNOTSUPP;

	ipc.opt = NULL;
	ipc.tx_flags = 0;
	ipc.ttl = 0;
	ipc.tos = -1;

	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;

	fl4 = &inet->cork.fl.u.ip4;
	if (up->pending) {
		/*
		 * There are pending frames.
		 * The socket lock must be held while it's corked.
		 */
		lock_sock(sk);
		if (likely(up->pending)) {
			if (unlikely(up->pending != AF_INET)) {
				release_sock(sk);
				return -EINVAL;
			}
			/* Data is already pending: just append to it. */
			goto do_append_data;
		}
		release_sock(sk);
	}
	/* Otherwise this starts a new datagram: account for the header. */
	ulen += sizeof(struct udphdr);

	/*
	 *	Get and verify the address.
	 */
	if (msg->msg_name) {
		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
		if (msg->msg_namelen < sizeof(*usin))
			return -EINVAL;
		if (usin->sin_family != AF_INET) {
			if (usin->sin_family != AF_UNSPEC)
				return -EAFNOSUPPORT;
		}

		daddr = usin->sin_addr.s_addr;
		dport = usin->sin_port;
		if (dport == 0)
			return -EINVAL;
	} else {
		/* No address given: only valid after connect(). */
		if (sk->sk_state != TCP_ESTABLISHED)
			return -EDESTADDRREQ;
		daddr = inet->inet_daddr;
		dport = inet->inet_dport;
		/* Open fast path for connected socket.
		   Route will not be used, if at least one option is set.
		 */
		connected = 1;
	}

	ipc.sockc.tsflags = sk->sk_tsflags;
	ipc.addr = inet->inet_saddr;
	ipc.oif = sk->sk_bound_dev_if;

	if (msg->msg_controllen) {
		/* Parse control messages (e.g. IP options). */
		err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
		if (unlikely(err)) {
			kfree(ipc.opt);
			return err;
		}
		if (ipc.opt)
			free = 1;	/* ipc.opt must be freed on exit */
		connected = 0;
	}
	if (!ipc.opt) {
		/* No options in the control data: fall back to the socket's. */
		struct ip_options_rcu *inet_opt;

		rcu_read_lock();
		inet_opt = rcu_dereference(inet->inet_opt);
		if (inet_opt) {	/* options were set on the socket */
			memcpy(&opt_copy, inet_opt,
			       sizeof(*inet_opt) + inet_opt->opt.optlen);
			ipc.opt = &opt_copy.opt;
		}
		rcu_read_unlock();
	}

	saddr = ipc.addr;
	ipc.addr = faddr = daddr;	/* from here on ipc.addr holds the destination */

	sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);

	if (ipc.opt && ipc.opt->opt.srr) {	/* source routing requested */
		if (!daddr) {
			/*
			 * Must not return directly: ipc.opt may have been
			 * allocated by ip_cmsg_send() above (free == 1).
			 */
			err = -EINVAL;
			goto out_free;
		}
		faddr = ipc.opt->opt.faddr;	/* route towards the first hop from the option */
		connected = 0;			/* the cached route no longer applies */
	}
	tos = get_rttos(&ipc, inet);
	if (sock_flag(sk, SOCK_LOCALROUTE) ||		/* SO_DONTROUTE */
	    (msg->msg_flags & MSG_DONTROUTE) ||
	    (ipc.opt && ipc.opt->opt.is_strictroute)) {	/* strict source route */
		/* Destination/next hop is on-link: do not go through a gateway. */
		tos |= RTO_ONLINK;
		connected = 0;
	}

	if (ipv4_is_multicast(daddr)) {
		/* Multicast: fill unset fields from the socket's multicast settings. */
		if (!ipc.oif)
			ipc.oif = inet->mc_index;
		if (!saddr)
			saddr = inet->mc_addr;
		connected = 0;
	} else if (!ipc.oif)
		ipc.oif = inet->uc_index;

	if (connected)	/* a route was cached at connect() time */
		rt = (struct rtable *)sk_dst_check(sk, 0);

	if (!rt) {	/* no usable cached route: look one up */
		struct net *net = sock_net(sk);
		__u8 flow_flags = inet_sk_flowi_flags(sk);

		fl4 = &fl4_stack;

		flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
				   RT_SCOPE_UNIVERSE, sk->sk_protocol,
				   flow_flags,
				   faddr, saddr, dport, inet->inet_sport);

		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
		rt = ip_route_output_flow(net, fl4, sk);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			if (err == -ENETUNREACH)
				IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
			goto out;
		}

		err = -EACCES;
		if ((rt->rt_flags & RTCF_BROADCAST) &&
		    !sock_flag(sk, SOCK_BROADCAST))
			goto out;
		if (connected)
			sk_dst_set(sk, dst_clone(&rt->dst));
	}

	/* MSG_CONFIRM: the caller vouches for the neighbour entry; see do_confirm. */
	if (msg->msg_flags&MSG_CONFIRM)
		goto do_confirm;
back_from_confirm:

	saddr = fl4->saddr;	/* source address chosen by routing */
	if (!ipc.addr)		/* no destination given: take it from the flow */
		daddr = ipc.addr = fl4->daddr;

	/* Lockless fast path for the non-corking case. */
	if (!corkreq) {
		/* Build one skb (IP header included); fragmentation is left to IP. */
		skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
				  sizeof(struct udphdr), &ipc, &rt,
				  msg->msg_flags);
		err = PTR_ERR(skb);
		if (!IS_ERR_OR_NULL(skb))
			err = udp_send_skb(skb, fl4);
		goto out;
	}

	/* Corking path: the socket lock is required. */
	lock_sock(sk);
	if (unlikely(up->pending)) {
		/* The socket is already corked while preparing it. */
		/* ... which is an evident application bug. --ANK */
		release_sock(sk);

		net_dbg_ratelimited("cork app bug 2\n");
		err = -EINVAL;
		goto out;
	}
	/*
	 *	Now cork the socket to pend data.
	 */
	fl4 = &inet->cork.fl.u.ip4;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->fl4_dport = dport;
	fl4->fl4_sport = inet->inet_sport;
	up->pending = AF_INET;	/* mark a datagram as being assembled */

do_append_data:
	up->len += ulen;
	err = ip_append_data(sk, fl4, getfrag, msg, ulen,
			     sizeof(struct udphdr), &ipc, &rt,
			     corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
	if (err)
		udp_flush_pending_frames(sk);	/* on error drop everything queued so far */
	else if (!corkreq)
		err = udp_push_pending_frames(sk);	/* not corked: send what is queued */
	else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
		up->pending = 0;	/* nothing is actually queued */
	release_sock(sk);

out:
	ip_rt_put(rt);	/* drop our reference on the route */
out_free:
	if (free)
		kfree(ipc.opt);
	if (!err)
		return len;	/* success: report the number of payload bytes */
	/*
	 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
	 * ENOBUFS might not be good (it's not tunable per se), but otherwise
	 * we don't have a good statistic (IpOutDiscards but it can be too many
	 * things).  We could add another new stat but at least for now that
	 * seems like overkill.
	 */
	if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
		UDP_INC_STATS(sock_net(sk),
			      UDP_MIB_SNDBUFERRORS, is_udplite);
	}
	return err;	/* failure: negative errno */

do_confirm:
	dst_confirm(&rt->dst);
	/* With MSG_PROBE and a zero length nothing is sent at all. */
	if (!(msg->msg_flags&MSG_PROBE) || len)
		goto back_from_confirm;
	err = 0;
	goto out;
}

对于不设置MSG_MORE和UDP_CORK的通常情况, udp_sendmsg查找路由和获取ip选项后,直接调用ip_make_skb()来创建skb,并调用udp_send_skb发送,由ip层来执行分片
对于需要cork的情况,调用ip_append_data()来添加数据到队列

ip_make_skb

struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;
	if (flags & MSG_PROBE)
		return NULL;
	__skb_queue_head_init(&queue);
	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);
	err = __ip_append_data(sk, fl4, &queue, &cork,	//创建添加copy数据到skb, 并放入queue中
			       ¤t->task_frag, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}
	return __ip_make_skb(sk, fl4, &queue, &cork);	//把queue中所有skb都合并到一个skb及其frag_list中, 并设置好ip头
}
ip_append_data
/*
 *	ip_append_data() and ip_append_page() can build one large IP
 *	datagram out of many pieces of data.  Each piece is held on the
 *	socket until ip_push_pending_frames() is called.  A piece can be
 *	page or non-page data.
 *
 *	This interface is not UDP specific; other transports (e.g. raw
 *	sockets) can potentially use it as well.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 *
 *	Returns 0 immediately when MSG_PROBE is set, the error from
 *	ip_setup_cork() if cork setup fails, otherwise the result of
 *	__ip_append_data().
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff_head *wqueue = &sk->sk_write_queue;
	struct inet_cork *base = &inet->cork.base;

	if (flags & MSG_PROBE)
		return 0;

	if (!skb_queue_empty(wqueue)) {
		/* Data already queued: no transport header for this piece. */
		transhdrlen = 0;
	} else {
		/* First piece of a datagram: initialise the cork. */
		int err = ip_setup_cork(sk, base, ipc, rtp);

		if (err)
			return err;
	}

	/* Append the data as skb(s) on the socket write queue. */
	return __ip_append_data(sk, fl4, wqueue, base,
				sk_page_frag(sk), getfrag,
				from, length, transhdrlen, flags);
}
__ip_append_data
  • 如果支持UFO则调用ip_ufo_append_data
  • 尝试添加到队列sk_write_queue尾部skb,如果head空间够则copy到head,否则根据是否支持sg,创建新的skb或者添加到frag中

/*
 * __ip_append_data - append `length` bytes obtained through getfrag()
 * to the skb queue `queue`, cutting the data into skbs sized as IP
 * fragments.
 *
 * When the device advertises UFO and the conditions below hold, the
 * work is delegated to ip_ufo_append_data().  Otherwise data is placed
 * in the tail skb while it has room, and new skbs are allocated and
 * queued as needed.
 *
 * Returns 0 on success or a negative errno; on the `error` path the
 * not-yet-consumed length is subtracted from cork->length again and
 * IPSTATS_MIB_OUTDISCARDS is incremented.
 */
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    struct page_frag *pfrag,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;
	u32 tskey = 0;
	skb = skb_peek_tail(queue);
	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;
	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;
	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;	// fragment size: payload part rounded down to a multiple of 8
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
	if (cork->length + length > maxnonfragsize - fragheaderlen) {	// datagram would exceed the maximum size
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - (opt ? opt->optlen : 0));
		return -EMSGSIZE;
	}
	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&	// sizeof(struct udphdr)
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
	    !(flags & MSG_MORE) &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;	// leave the checksum to the hardware
	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
	    (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {	// UFO can be used
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,	// build a UFO skb; payload goes into page frags
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}
	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */
	// everything below is the non-GSO/UFO path
	if (!skb)	// queue is empty: a new skb is needed
		goto alloc_new_skb;
	while (length > 0) {
		/* Check if the remaining data fits into current packet. */	// i.e. whether the tail skb still has room to copy into
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {	// no room left: allocate and queue a new skb
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			/* Bytes of the previous skb beyond maxfraglen move into the new one. */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;
			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;
			alloclen += exthdrlen;
			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			/* only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;
			if (fraggap) {
				/* Move the overhang from skb_prev and fix up both checksums. */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}
			offset += copy;
			length -= datalen - fraggap;
			/* Only the first skb carries the transport/extension headers. */
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;
			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);	// queue the new skb
			continue;
		}
		if (copy > length)
			copy = length;
		if (!(rt->dst.dev->features&NETIF_F_SG)) {	// no scatter-gather: copy into the linear (head) area
			unsigned int off;
			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),	// ip_generic_getfrag for plain UDP
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {	// scatter-gather available: copy into a page fragment
			int i = skb_shinfo(skb)->nr_frags;
			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;
				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;
			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
ip_ufo_append_data
/*
 * ip_ufo_append_data - UFO variant of the append path.
 *
 * If the queue is empty, allocates one skb whose linear area holds only
 * the link-layer headroom and the IP/transport headers, queues it and
 * marks it as a UDP GSO packet.  The payload is then appended to the
 * skb's page frags by skb_append_datato_frags().
 *
 * Returns 0 on success, or the error from sock_alloc_send_skb() /
 * skb_append_datato_frags().
 */
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;
	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	skb = skb_peek_tail(queue);
	if (!skb) {
		skb = sock_alloc_send_skb(sk,	// allocate the skb
			hh_len + fragheaderlen + transhdrlen + 20,	// this part lives in the skb head area
			(flags & MSG_DONTWAIT), &err);
		if (!skb)
			return err;
		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);
		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);
		/* initialize network header pointer */
		skb_reset_network_header(skb);
		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;
		skb->csum = 0;
		__skb_queue_tail(queue, skb);	// queue this header-only skb
	} else if (skb_is_gso(skb)) {
		/* Tail skb is already set up for GSO: only append data. */
		goto append;
	}
	skb->ip_summed = CHECKSUM_PARTIAL;
	/* specify the length of each IP datagram fragment */
	skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
append:
	return skb_append_datato_frags(sk, skb, getfrag, from,	// copy the payload into page frags
				       (length - transhdrlen));
}
int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
			int (*getfrag)(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			void *from, int length)
{
	int frg_cnt = skb_shinfo(skb)->nr_frags;
	int copy;
	int offset = 0;
	int ret;
	struct page_frag *pfrag = ¤t->task_frag;
	do {
		/* Return error if we don't have space for new frag */
		if (frg_cnt >= MAX_SKB_FRAGS)
			return -EMSGSIZE;
		if (!sk_page_frag_refill(sk, pfrag))
			return -ENOMEM;
		/* copy the user data to page */
		copy = min_t(int, length, pfrag->size - pfrag->offset);
		ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,	//ip_generic_getfrag
			      offset, copy, 0, skb);
		if (ret < 0)
			return -EFAULT;
		/* copy was successful so update the size parameters */
		skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
				   copy);
		frg_cnt++;
		pfrag->offset += copy;
		get_page(pfrag->page);
		skb->truesize += copy;
		atomic_add(copy, &sk->sk_wmem_alloc);
		skb->len += copy;
		skb->data_len += copy;
		offset += copy;
		length -= copy;
	} while (length > 0);
	return 0;
}
/*
 * Default getfrag callback: copy `len` bytes from the msghdr iterator
 * passed as `from` into `to`.
 *
 * When the skb is marked CHECKSUM_PARTIAL the data is copied as is.
 * Otherwise the copy also computes a checksum, which is folded into
 * skb->csum at block offset `odd`.
 *
 * Returns 0 on success, -EFAULT if fewer than `len` bytes were copied.
 */
int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct msghdr *msg = from;
	__wsum partial = 0;

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		return copy_from_iter(to, len, &msg->msg_iter) != len ? -EFAULT : 0;

	if (csum_and_copy_from_iter(to, len, &partial, &msg->msg_iter) != len)
		return -EFAULT;

	skb->csum = csum_block_add(skb->csum, partial, odd);
	return 0;
}
udp_send_skb

udp_send_skb()->ip_send_skb()->ip_local_out()

/*
 * Excerpt of udp_send_skb(): fill in the UDP header of `skb`, settle
 * the checksum according to the socket/skb state, and pass the packet
 * to ip_send_skb().  The local variable declarations (uh, inet, sk,
 * len, csum, err, is_udplite) are elided from this excerpt.
 *
 * Returns the ip_send_skb() result, except that -ENOBUFS is reported
 * as 0 when inet->recverr is not set.
 */
static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
{
	/*
	 * Create a UDP header
	 */
	uh = udp_hdr(skb);
	uh->source = inet->inet_sport;
	uh->dest = fl4->fl4_dport;
	uh->len = htons(len);
	uh->check = 0;
	if (is_udplite)  				 /*     UDP-Lite      */
		csum = udplite_csum(skb);
	else if (sk->sk_no_check_tx) {   /* UDP csum disabled */
		skb->ip_summed = CHECKSUM_NONE;
		goto send;
	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
		udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
		goto send;
	} else
		csum = udp_csum(skb);
	/* add protocol-dependent pseudo-header */
	uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
				      sk->sk_protocol, csum);
	/* A computed checksum of 0 is replaced by CSUM_MANGLED_0. */
	if (uh->check == 0)
		uh->check = CSUM_MANGLED_0;
send:
	err = ip_send_skb(sock_net(sk), skb);
	if (err) {
		/* Count the buffer error but hide it from the caller unless recverr is set. */
		if (err == -ENOBUFS && !inet->recverr) {
			UDP_INC_STATS(sock_net(sk),
				      UDP_MIB_SNDBUFERRORS, is_udplite);
			err = 0;
		}
	} else
		UDP_INC_STATS(sock_net(sk),
			      UDP_MIB_OUTDATAGRAMS, is_udplite);
	return err;
}

udp_recvmsg

udp_recvmsg()主要是调用__skb_recv_datagram()从sk_receive_queue取出一个skb,然后调用skb_copy_datagram_msg拷贝数据到用户态

/*
 * udp_recvmsg - receive one UDP datagram into a user message.
 *
 * Takes one skb via __skb_recv_datagram(), verifies the checksum
 * (before the copy, or combined with it), copies up to `len` bytes into
 * `msg`, and fills in the sender address when msg->msg_name is set.
 * A datagram that fails the checksum is dropped and the receive is
 * retried with the next one.
 *
 * Returns the number of bytes copied (or the full datagram length when
 * MSG_TRUNC is passed in `flags`), or a negative errno.
 */
int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
		int flags, int *addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
	struct sk_buff *skb;
	unsigned int ulen, copied;
	int peeked, peeking, off;
	int err;
	int is_udplite = IS_UDPLITE(sk);
	bool checksum_valid = false;
	bool slow;
	if (flags & MSG_ERRQUEUE)
		return ip_recv_error(sk, msg, len, addr_len);	// serve data from the socket error queue instead
try_again:
	peeking = off = sk_peek_offset(sk, flags);
	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
				  &peeked, &off, &err);
	if (!skb)
		return err;
	ulen = skb->len;
	copied = len;
	/* Clamp to what is available; flag truncation if the buffer is too small. */
	if (copied > ulen - off)
		copied = ulen - off;
	else if (copied < ulen)
		msg->msg_flags |= MSG_TRUNC;
	/*
	 * If checksum is needed at all, try to do it while copying the
	 * data.  If the data is truncated, or if we only want a partial
	 * coverage checksum (UDP-Lite), do it before the copy.
	 */
	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
		checksum_valid = !udp_lib_checksum_complete(skb);
		if (!checksum_valid)
			goto csum_copy_err;
	}
	if (checksum_valid || skb_csum_unnecessary(skb))
		err = skb_copy_datagram_msg(skb, off, msg, copied);
	else {
		err = skb_copy_and_csum_datagram_msg(skb, off, msg);
		if (err == -EINVAL)
			goto csum_copy_err;
	}
	if (unlikely(err)) {
		trace_kfree_skb(skb, udp_recvmsg);
		if (!peeked) {
			atomic_inc(&sk->sk_drops);
			UDP_INC_STATS(sock_net(sk),
				      UDP_MIB_INERRORS, is_udplite);
		}
		skb_free_datagram_locked(sk, skb);
		return err;
	}
	if (!peeked)
		UDP_INC_STATS(sock_net(sk),
			      UDP_MIB_INDATAGRAMS, is_udplite);
	sock_recv_ts_and_drops(msg, sk, skb);
	/* Copy the address. */
	if (sin) {
		sin->sin_family = AF_INET;
		sin->sin_port = udp_hdr(skb)->source;
		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
		*addr_len = sizeof(*sin);
	}
	if (inet->cmsg_flags)
		ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr), off);
	err = copied;
	/* With MSG_TRUNC the caller gets the real datagram length. */
	if (flags & MSG_TRUNC)
		err = ulen;
	__skb_free_datagram_locked(sk, skb, peeking ? -err : err);
	return err;
csum_copy_err:
	/* Bad checksum: drop the datagram, count it, and try the next one. */
	slow = lock_sock_fast(sk);
	if (!skb_kill_datagram(sk, skb, flags)) {
		UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
		UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
	}
	unlock_sock_fast(sk, slow);
	/* starting over for a new packet, but check if we need to yield */
	cond_resched();
	msg->msg_flags &= ~MSG_TRUNC;
	goto try_again;
}
/*
 * Fetch one datagram skb for the socket, waiting if necessary.
 *
 * Repeatedly tries __skb_try_recv_datagram(); while that reports
 * -EAGAIN and the receive timeout has not run out, waits for more
 * packets and tries again.
 *
 * Returns the skb, or NULL with *err set by the helpers.
 */
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
				    int *peeked, int *off, int *err)
{
	struct sk_buff *skb, *last;
	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	for (;;) {
		/* Try to take an skb from the receive queue. */
		skb = __skb_try_recv_datagram(sk, flags, peeked, off, err,
					      &last);
		if (skb)
			return skb;

		if (*err != -EAGAIN)
			return NULL;

		/* Nothing queued: stop if we may not wait (any longer). */
		if (!timeo)
			return NULL;

		/* Block until more packets arrive or waiting fails. */
		if (__skb_wait_for_more_packets(sk, err, &timeo, last))
			return NULL;
	}
}

udp_rcv

在中断下半部会调用udp_rcv来进行udp协议的处理
主要是调用__udp4_lib_lookup_skb在udp hash中查找要接收该skb的socket,
然后调用udp_queue_rcv_skb把skb放到socket的sk_receive_queue中

/*
 * Receive entry point for UDP over IPv4: hands the skb to the shared
 * __udp4_lib_rcv() with the global udp_table and IPPROTO_UDP, and
 * returns its result.
 */
int udp_rcv(struct sk_buff *skb)
{
	return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
}
/*
 * Shared IPv4 receive handler for the UDP family.
 *
 * @skb:      incoming packet
 * @udptable: socket hash table to search
 * @proto:    IPPROTO_UDP or IPPROTO_UDPLITE; only IPPROTO_UDP gets the
 *            strict length validation and trimming below
 *
 * Flow:
 *  1. Make sure a full UDP header is present and the advertised length
 *     does not exceed the skb length.
 *  2. Initialise checksum state through udp4_csum_init().
 *  3. If a socket is already attached to the skb (skb_steal_sock()),
 *     queue to it directly.
 *  4. If the route is flagged broadcast/multicast, hand over to
 *     __udp4_lib_mcast_deliver().
 *  5. Otherwise look the socket up and queue the skb to it.
 *  6. With no matching socket: verify the checksum, count the event as
 *     UDP_MIB_NOPORTS, send ICMP port unreachable and free the skb.
 *
 * Returns 0, the negated positive result of udp_queue_rcv_skb(), or the
 * result of __udp4_lib_mcast_deliver().
 */
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
		   int proto)
{
	struct sock *sk;
	struct udphdr *uh;
	unsigned short ulen;
	struct rtable *rt = skb_rtable(skb);
	__be32 saddr, daddr;
	struct net *net = dev_net(skb->dev);
	/*
	 *  Validate the packet.
	 */
	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
		goto drop;		/* No space for header. */
	uh   = udp_hdr(skb);
	ulen = ntohs(uh->len);
	saddr = ip_hdr(skb)->saddr;
	daddr = ip_hdr(skb)->daddr;
	if (ulen > skb->len)
		goto short_packet;
	if (proto == IPPROTO_UDP) {
		/* UDP validates ulen. */
		if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
			goto short_packet;
		/* Re-read the header pointer after the trim call above. */
		uh = udp_hdr(skb);
	}
	/* UDP offers only limited checksumming; a sender that does not
	 * compute it transmits an all-zero checksum field.
	 * Checksum initialisation (as described by the article; the body of
	 * udp4_csum_init() is not shown here): if the checksum field of the
	 * UDP header is 0, the skb is flagged as not needing verification;
	 * if hardware already did the check and the pseudo header verifies,
	 * the same flag is set; otherwise the pseudo-header checksum is
	 * stored in skb->csum and the payload still has to be verified later.
	 */
	if (udp4_csum_init(skb, uh, proto))
		goto csum_error;
	sk = skb_steal_sock(skb);
	if (sk) {
		struct dst_entry *dst = skb_dst(skb);
		int ret;
		/* Refresh the socket's cached rx dst when it differs. */
		if (unlikely(sk->sk_rx_dst != dst))
			udp_sk_rx_dst_set(sk, dst);
		ret = udp_queue_rcv_skb(sk, skb);
		/* Drop the socket reference taken over from the skb. */
		sock_put(sk);
		/* a return value > 0 means to resubmit the input, but
		 * it wants the return to be -protocol, or 0
		 */
		if (ret > 0)
			return -ret;
		return 0;
	}
	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
		return __udp4_lib_mcast_deliver(net, skb, uh,	// multicast/broadcast: every matching sk receives the packet
						saddr, daddr, udptable, proto);
	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);	// look up a matching sk in the hash table
	if (sk) {
		int ret;
		if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
			skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
						 inet_compute_pseudo);
		ret = udp_queue_rcv_skb(sk, skb);
		/* a return value > 0 means to resubmit the input, but
		 * it wants the return to be -protocol, or 0
		 */
		if (ret > 0)
			return -ret;
		return 0;
	}
	// No socket matched.
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto drop;
	nf_reset(skb);
	// Verify the packet checksum; a packet that fails is dropped quietly.
	/* No socket. Drop packet silently, if checksum is wrong */
	if (udp_lib_checksum_complete(skb))
		goto csum_error;
	// Checksum is good: answer with ICMP port unreachable.
	__UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
	/*
	 * Hmm.  We got an UDP packet to a port to which we
	 * don't wanna listen.  Ignore it.
	 */
	kfree_skb(skb);
	return 0;
short_packet:
	/* Length check failed: log (rate limited) and count as an input error. */
	net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
			    proto == IPPROTO_UDPLITE ? "Lite" : "",
			    &saddr, ntohs(uh->source),
			    ulen, skb->len,
			    &daddr, ntohs(uh->dest));
	goto drop;
csum_error:
	/*
	 * RFC1122: OK.  Discards the bad packet silently (as far as
	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
	 */
	net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
			    proto == IPPROTO_UDPLITE ? "Lite" : "",
			    &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
			    ulen);
	__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
	/* Falls through: a checksum error is also counted as an input error. */
drop:
	__UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
	kfree_skb(skb);
	return 0;
}
/*
 * Deliver a multicast/broadcast datagram to every socket accepted by
 * __udp_is_mcast_sock().
 *
 * The first matching socket is only remembered; it receives the original
 * skb after the scan. Every further matching socket receives a clone.
 * When the port's hash1 slot holds more than 10 sockets, the scan walks
 * the hash2 table instead: first the slot for (daddr, port), then the
 * slot for (INADDR_ANY, port) if that is a different slot.
 *
 * If no socket matched, the skb is freed and UDP_MIB_IGNOREDMULTI is
 * incremented. Always returns 0.
 */
static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
				    struct udphdr  *uh,
				    __be32 saddr, __be32 daddr,
				    struct udp_table *udptable,
				    int proto)
{
	struct sock *sk, *first = NULL;
	unsigned short hnum = ntohs(uh->dest);
	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
	/* Offset of the list node inside struct sock used by the iterator below. */
	unsigned int offset = offsetof(typeof(*sk), sk_node);
	int dif = skb->dev->ifindex;
	struct hlist_node *node;
	struct sk_buff *nskb;
	if (use_hash2) {
		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
			    udptable->mask;
		hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask;
		/* Re-entered via goto for the second (INADDR_ANY) pass. */
start_lookup:
		hslot = &udptable->hash2[hash2];
		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
	}
	sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
		if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,	// does this sk qualify as a receiver, and does multicast filtering let the packet through?
					 uh->source, saddr, dif, hnum))
			continue;
		if (!first) {
			/* Defer delivery to the first match so it can take the original skb. */
			first = sk;
			continue;
		}
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (unlikely(!nskb)) {
			/* Clone failed: account the drop against this socket and move on. */
			atomic_inc(&sk->sk_drops);
			__UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
					IS_UDPLITE(sk));
			__UDP_INC_STATS(net, UDP_MIB_INERRORS,
					IS_UDPLITE(sk));
			continue;
		}
		if (udp_queue_rcv_skb(sk, nskb) > 0)	// every matching sk gets its own copy
			consume_skb(nskb);
	}
	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
	if (use_hash2 && hash2 != hash2_any) {	// second pass over the INADDR_ANY slot
		hash2 = hash2_any;
		goto start_lookup;
	}
	if (first) {
		if (udp_queue_rcv_skb(first, skb) > 0)	// the first sk takes the original skb, no clone needed
			consume_skb(skb);
	} else {// no socket matched: drop the packet
		kfree_skb(skb);
		__UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
				proto == IPPROTO_UDPLITE);
	}
	return 0;
}

__udp4_lib_lookup_skb

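__udp4_lib_lookup_skb只是对__udp4_lib_lookup的封装。__udp4_lib_lookup默认在hash1(按端口hash)的链表中查找;当该链表上的socket超过10个时,才会与hash2(按地址+端口hash)对应的链表比较长度,并在较短的那个链表中查找。
查找的时候跟tcp一样通过compute_score来打分,由得分最高的socket来接收。如果socket设置了reuseport,则使用reuseport对应的算法来选择sock。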

/*
 * Find the socket that should receive a unicast datagram.
 *
 * @net:      network namespace
 * @saddr:    source address of the packet
 * @sport:    source port (network byte order)
 * @daddr:    destination address of the packet
 * @dport:    destination port (network byte order)
 * @dif:      interface index passed on to the scoring functions
 * @udptable: hash table to search
 * @skb:      the packet, forwarded to the reuseport selection
 *
 * The hash1 slot (keyed by port) is scanned directly unless it holds more
 * than 10 sockets. In that case the hash2 slot for (daddr, port), and then
 * the one for (INADDR_ANY, port), is used whenever it is not longer than
 * the hash1 slot.
 *
 * Returns the selected socket, or NULL if none scored above zero.
 */
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
		__be16 sport, __be32 daddr, __be16 dport,
		int dif, struct udp_table *udptable, struct sk_buff *skb)
{
	struct sock *sk, *result;
	unsigned short hnum = ntohs(dport);
	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);	// hash1 slot, derived from the destination port and net
	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
	int score, badness, matches = 0, reuseport = 0;
	u32 hash = 0;
	if (hslot->count > 10) {	// more than 10 sockets: consider searching hash2
		hash2 = udp4_portaddr_hash(net, daddr, hnum);
		slot2 = hash2 & udptable->mask;
		hslot2 = &udptable->hash2[slot2];
		if (hslot->count < hslot2->count) // search whichever chain is shorter
			goto begin;
		result = udp4_lib_lookup2(net, saddr, sport,	// per the article, same logic as at "begin" (body not shown here)
					  daddr, hnum, dif,
					  hslot2, skb);
		if (!result) {
			unsigned int old_slot2 = slot2;
			hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);	// retry with the INADDR_ANY slot
			slot2 = hash2 & udptable->mask;
			/* avoid searching the same slot again. */
			if (unlikely(slot2 == old_slot2))
				return result;
			hslot2 = &udptable->hash2[slot2];
			if (hslot->count < hslot2->count)
				goto begin;
			result = udp4_lib_lookup2(net, saddr, sport,
						  daddr, hnum, dif,
						  hslot2, skb);
		}
		return result;
	}
begin:
	/* Scan the hash1 chain, keeping the best-scoring socket. */
	result = NULL;
	badness = 0;
	sk_for_each_rcu(sk, &hslot->head) {
		score = compute_score(sk, net, saddr, sport,
				      daddr, hnum, dif);
		if (score > badness) {
			reuseport = sk->sk_reuseport;
			if (reuseport) {
				hash = udp_ehashfn(net, daddr, hnum,
						   saddr, sport);
				result = reuseport_select_sock(sk, hash, skb,
							sizeof(struct udphdr));	// reuseport selection picked a sock: return it right away
				if (result)
					return result;
				matches = 1;
			}
			result = sk;	// best score so far becomes the result
			badness = score;
		} else if (score == badness && reuseport) {	// ties the best score, and the best-scoring sk uses reuseport
			matches++;
			if (reciprocal_scale(hash, matches) == 0) // decide whether the current sk replaces the result
				result = sk;
			hash = next_pseudo_random32(hash);
		}
	}
	return result;
}
/*
 * Socket lookup keyed off a received skb: the addresses come from the
 * skb's IP header, the namespace from its device and the interface index
 * from inet_iif(); the ports are supplied by the caller. Everything is
 * forwarded to __udp4_lib_lookup(), whose result is returned unchanged.
 */
static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
						 __be16 sport, __be16 dport,
						 struct udp_table *udptable)
{
	const struct iphdr *ip4h = ip_hdr(skb);
	int in_ifindex = inet_iif(skb);

	return __udp4_lib_lookup(dev_net(skb->dev),
				 ip4h->saddr, sport,
				 ip4h->daddr, dport,
				 in_ifindex, udptable, skb);
}

udp_queue_rcv_skb

/*
 * Hand a received datagram to socket @sk.
 *
 * Steps: xfrm policy check, optional encapsulation hook, UDP-Lite checks
 * (elided in this excerpt), socket filter, receive-buffer limit check,
 * then either direct queueing or the socket backlog depending on whether
 * the socket is currently owned by a user context.
 *
 * returns:
 *  -1: error
 *   0: success
 *  >0: "udp encap" protocol resubmission
 *
 * Note that in the success and error cases, the skb is assumed to
 * have either been requeued or freed.
 */
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	struct udp_sock *up = udp_sk(sk);
	int rc;
	int is_udplite = IS_UDPLITE(sk);
	/*
	 *	Charge it to the socket, dropping if the queue is full.
	 */
	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto drop;
	nf_reset(skb);
	if (static_key_false(&udp_encap_needed) && up->encap_type) {	// UDP tunnel (encapsulation) handling
		int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
		/*
		 * This is an encapsulation socket so pass the skb to
		 * the socket's udp_encap_rcv() hook. Otherwise, just
		 * fall through and pass this up the UDP socket.
		 * up->encap_rcv() returns the following value:
		 * =0 if skb was successfully passed to the encap
		 *    handler or was discarded by it.
		 * >0 if skb should be passed on to UDP.
		 * <0 if skb should be resubmitted as proto -N
		 */
		/* if we're overly short, let UDP handle it */
		/* Read the hook pointer once so the NULL test and the call use the same value. */
		encap_rcv = ACCESS_ONCE(up->encap_rcv);
		if (encap_rcv) {
			int ret;
			/* Verify checksum before giving to encap */
			if (udp_lib_checksum_complete(skb))	// checksum verification
				goto csum_error;
			ret = encap_rcv(sk, skb);
			if (ret <= 0) {
				__UDP_INC_STATS(sock_net(sk),
						UDP_MIB_INDATAGRAMS,
						is_udplite);
				/* ret <= 0, so this yields 0 or the positive protocol number. */
				return -ret;
			}
		}
		/* FALLTHROUGH -- it's a UDP Packet */
	}
	/*
	 * 	UDP-Lite specific tests, ignored on UDP sockets
	 */
	if ((is_udplite & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {	// UDP-Lite handling (body elided in this excerpt)
		...
	}
	/* With a socket filter attached, the checksum is verified before filtering. */
	if (rcu_access_pointer(sk->sk_filter) &&
	    udp_lib_checksum_complete(skb))
			goto csum_error;
	if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
		goto drop;
	udp_csum_pull_header(skb);
	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {	// receive buffer is full
		__UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
				is_udplite);
		goto drop;
	}
	rc = 0;
	ipv4_pktinfo_prepare(sk, skb);
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk))
		rc = __udp_queue_rcv_skb(sk, skb);	// socket not in use by the application: queue onto sk_receive_queue (per the article)
	else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {	// socket in use by the application: add to the backlog, to be processed when the lock is released; non-zero means the add failed
		bh_unlock_sock(sk);
		goto drop;
	}
	bh_unlock_sock(sk);
	return rc;
csum_error:
	__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
	/* Falls through: a checksum error is also counted as an input error. */
drop:
	__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
	atomic_inc(&sk->sk_drops);
	kfree_skb(skb);
	return -1;
}

由上可知,udp非常简单,只有两个hash表,一个backlog队列,以及sk_receive_queue和sk_write_queue。

本文作者: bhpike65

本文链接: http://www.cnhalo.net/2016/06/13/linux-udp/

linux udp

本文源自互联网,采用知识共享署名-非商业性使用 4.0 国际许可协议进行许可,
版权归原作者,如有问题请联系service@tsingfun.com (编辑:admin)
分享到: