【内核源码】linux UDP实现
创建udp socket
在socket()创建的时候,会设置对应协议的操作集。 inet_dgram_ops是系统调用层直接调用的操作。udp_prot是底层协议的处理。
可以看到,相比TCP,UDP不支持accept()和listen()(inet_dgram_ops中对应的是sock_no_accept和sock_no_listen),但是可以调用bind()和connect()。
直接通过sendto()等发送数据即可
/*
 * Abridged excerpt of __sock_create() ("..." marks omitted code).
 *
 * Looks up the handler registered for the requested address family in
 * net_families[] and delegates the protocol-specific part of socket creation
 * to that handler's create() hook.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
...
pf = rcu_dereference(net_families[family]); // for IPv4 sockets: inet_family_ops
err = pf->create(net, sock, protocol, kern); // for IPv4 sockets: inet_create()
...
}
/*
 * Abridged excerpt of inet_create() ("..." / ".." mark omitted code).
 *
 * Walks the inetsw[] list for the socket type, looking for the entry whose
 * protocol matches the requested one, then installs that entry's socket-level
 * operations on the socket and picks up its protocol handler.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
...
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { // list filled by inet_init() -> inet_register_protosw() from inetsw_array
err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
..
}
}
...
sock->ops = answer->ops; // for UDP: inet_dgram_ops (socket/syscall-level operations)
answer_prot = answer->prot; // for UDP: udp_prot (protocol-level operations)
...
}
/*
 * Abridged excerpt of the protocol switch table ("..." marks omitted entries).
 * Only the UDP entry is shown: it ties SOCK_DGRAM/IPPROTO_UDP to the
 * socket-level operations (inet_dgram_ops) and the protocol handler (udp_prot).
 */
static struct inet_protosw inetsw_array[] =
{
...
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot, /* protocol-level operations */
.ops = &inet_dgram_ops, /* socket-level operations */
.flags = INET_PROTOSW_PERMANENT,
},
...
};
/*
 * Socket-level operation table for PF_INET datagram sockets; inet_create()
 * installs it as sock->ops for UDP sockets.
 *
 * accept, listen, socketpair and mmap are wired to sock_no_* handlers
 * (presumably stubs that reject the call -- their bodies are not shown here),
 * while bind and connect have real handlers.
 */
const struct proto_ops inet_dgram_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_dgram_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = inet_getname,
.poll = udp_poll,
.ioctl = inet_ioctl,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
.set_peek_off = sk_set_peek_off,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
};
/*
 * Protocol-level operation table for UDP over IPv4.  The socket-level
 * handlers reach these hooks through sk->sk_prot, e.g. inet_bind() calls
 * get_port and inet_sendmsg() calls sendmsg.
 */
struct proto udp_prot = {
.name = "UDP",
.owner = THIS_MODULE,
.close = udp_lib_close,
.connect = ip4_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
.destroy = udp_destroy_sock,
.setsockopt = udp_setsockopt,
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
.backlog_rcv = __udp_queue_rcv_skb,
.release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.rehash = udp_v4_rehash,
.get_port = udp_v4_get_port, /* used by bind() and by autobind on send */
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem = &sysctl_udp_wmem_min,
.sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp_sock),
.h.udp_table = &udp_table, /* hash tables consulted by udp_lib_get_port() */
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
.compat_getsockopt = compat_udp_getsockopt,
#endif
.diag_destroy = udp_abort,
};
bind()
bind()->inet_bind()->udp_v4_get_port()
/*
 * Abridged excerpt of inet_bind() ("..." marks omitted code, including the
 * declarations and validation that produce sk, inet, addr and snum).
 *
 * Records the requested local address on the socket and asks the protocol's
 * get_port() hook to reserve the port.  If the port cannot be obtained, the
 * address is cleared again and -EADDRINUSE is returned; otherwise the local
 * port is stored, any previous destination is reset and 0 is returned.
 */
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
...
lock_sock(sk);
inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; // bind the local address
/* Make sure we are allowed to bind here. */
if ((snum || !inet->bind_address_no_port) &&
sk->sk_prot->get_port(sk, snum)) { // udp_v4_get_port() for UDP: bind the port
inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
}
/* Remember which parts of the binding were set explicitly. */
if (inet->inet_rcv_saddr)
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
/* inet_num is in host byte order; inet_sport holds the network-order copy. */
inet->inet_sport = htons(inet->inet_num);
inet->inet_daddr = 0;
inet->inet_dport = 0;
sk_dst_reset(sk);
err = 0;
out_release_sock:
release_sock(sk);
out:
return err;
}
/*
 * IPv4 get_port hook for UDP.
 *
 * Computes the secondary-hash value for (wildcard address, snum), stores the
 * address-only part of the socket's own secondary hash in udp_portaddr_hash
 * (the port is mixed in later by udp_lib_get_port()), and then lets
 * udp_lib_get_port() do the actual port reservation.
 *
 * Returns whatever udp_lib_get_port() returns.
 */
int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
	unsigned int wildcard_hash, addr_only_hash;

	wildcard_hash = udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
	addr_only_hash = udp4_portaddr_hash(sock_net(sk),
					    inet_sk(sk)->inet_rcv_saddr, 0);

	/* Port is hashed in as 0 for now; only the address contributes. */
	udp_sk(sk)->udp_portaddr_hash = addr_only_hash;

	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, wildcard_hash);
}
/*
 * Abridged excerpt of udp_lib_get_port() ("..." marks omitted code, including
 * the declarations of net and error and the free-port search for snum == 0).
 *
 * Checks whether local port snum may be used by sk and, if so, records the
 * port on the socket and links the socket into both UDP hash tables:
 * the primary one keyed by port and the secondary one keyed by address+port.
 *
 * @sk:             socket requesting the port
 * @snum:           requested port in host byte order, 0 to pick one
 * @saddr_comp:     callback comparing the receive addresses of two sockets
 * @hash2_nulladdr: secondary-hash value for (wildcard address, snum)
 *
 * Returns the value of error: 0 on success; the failure value is assigned in
 * code not shown in this excerpt.
 */
int udp_lib_get_port(struct sock *sk, unsigned short snum,
int (*saddr_comp)(const struct sock *sk1,
const struct sock *sk2,
bool match_wildcard),
unsigned int hash2_nulladdr)
{
struct udp_hslot *hslot, *hslot2;
struct udp_table *udptable = sk->sk_prot->h.udp_table;
if (!snum) { // no port requested (e.g. autobind from sendto()): search for a free port
...
} else {
hslot = udp_hashslot(udptable, net, snum);
spin_lock_bh(&hslot->lock);
if (hslot->count > 10) { // more than 10 sockets in the primary slot: consider the secondary hash
int exist;
unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum; // udp_portaddr_hash so far covers only inet_rcv_saddr; mix in the port
slot2 &= udptable->mask;
hash2_nulladdr &= udptable->mask;
hslot2 = udp_hashslot2(udptable, slot2);
if (hslot->count < hslot2->count) // scan whichever chain is shorter
goto scan_primary_hash;
exist = udp_lib_lport_inuse2(net, snum, hslot2, // is the port already in use in this secondary slot?
sk, saddr_comp);
if (!exist && (hash2_nulladdr != slot2)) {
hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
exist = udp_lib_lport_inuse2(net, snum, hslot2, // also check the wildcard-address slot
sk, saddr_comp);
}
if (exist)
goto fail_unlock;
else
goto found;
}
scan_primary_hash:
if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
saddr_comp, 0))
goto fail_unlock;
}
found:
inet_sk(sk)->inet_num = snum;
udp_sk(sk)->udp_port_hash = snum;
udp_sk(sk)->udp_portaddr_hash ^= snum; // now an address+port hash
if (sk_unhashed(sk)) { // socket is not in the hash tables yet
if (sk->sk_reuseport &&
udp_reuseport_add_sock(sk, hslot, saddr_comp)) { // join the reuseport group (sk_reuseport_cb); non-zero means failure
/* Undo the three assignments made at "found". */
inet_sk(sk)->inet_num = 0;
udp_sk(sk)->udp_port_hash = 0;
udp_sk(sk)->udp_portaddr_hash ^= snum;
goto fail_unlock;
}
sk_add_node_rcu(sk, &hslot->head); // primary table: slot chosen from net + port
hslot->count++;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); // secondary table: slot chosen from address + port
spin_lock(&hslot2->lock);
/* IPv6 reuseport sockets are appended at the tail, everything else at the head. */
if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
sk->sk_family == AF_INET6)
hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
&hslot2->head);
else
hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
&hslot2->head);
hslot2->count++;
spin_unlock(&hslot2->lock);
}
sock_set_flag(sk, SOCK_RCU_FREE);
error = 0;
fail_unlock:
spin_unlock_bh(&hslot->lock);
fail:
return error;
}
bind()会设置inet_rcv_saddr接收地址,以及通过udp_lib_get_port绑定到特定端口, 如果可以绑定,会在hash1和hash2两个hash表的对应链表中都添加
- 为什么使用两个hash? 两个hash的key为什么不同?
最初只有一个hash表,hash key为本地端口。当大量socket绑定到同一端口、不同IP地址时,它们都会落在同一条链表上,链表过长导致查询变慢。
添加第二个hash表,使用地址和端口作为key, 来应对上述情况。 查询的时候只用查两个hash链表中较短的那个。
connect()
sys_connect()->inet_dgram_connect()->ip4_datagram_connect()
调用connect之后,就不需要sendto()指定目的地址了,直接调用send()即可
/*
 * Locked wrapper around __ip4_datagram_connect(): takes the socket lock,
 * performs the connect, drops the lock and hands back the inner result.
 */
int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	int err;

	lock_sock(sk);
	err = __ip4_datagram_connect(sk, uaddr, addr_len);
	release_sock(sk);

	return err;
}
/*
 * Abridged excerpt of __ip4_datagram_connect() ("..." marks omitted code,
 * including declarations, argument validation and error handling).
 *
 * Resolves a route to the peer given in uaddr, fills in any local address
 * that is still unset from the route lookup result, records the peer
 * address/port on the socket, marks the socket TCP_ESTABLISHED and caches
 * the route on the socket.
 */
int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
...
// look up the route to the destination
rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
RT_CONN_FLAGS(sk), oif,
sk->sk_protocol,
inet->inet_sport, usin->sin_port, sk);
...
// fill in the source address if unset; rehash when the receive address changes
if (!inet->inet_saddr)
inet->inet_saddr = fl4->saddr; /* Update source address */
if (!inet->inet_rcv_saddr) {
inet->inet_rcv_saddr = fl4->saddr;
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
}
inet->inet_daddr = fl4->daddr;
inet->inet_dport = usin->sin_port;
sk->sk_state = TCP_ESTABLISHED; // the socket now counts as "connected"; udp_sendmsg() checks this state
sk_set_txhash(sk);
inet->inet_id = jiffies;
sk_dst_set(sk, &rt->dst); // cache the route on the socket
}
sendto
sys_sendto()->sock_sendmsg()->sock_sendmsg_nosec()->inet_sendmsg()
/*
 * Give an unbound socket a local port.
 *
 * Under the socket lock: if no local port is set yet, ask the protocol's
 * get_port() hook to choose one (port argument 0) and mirror the result into
 * inet_sport in network byte order.
 *
 * Returns 0 on success or when a port was already set, -EAGAIN when
 * get_port() fails.
 */
static int inet_autobind(struct sock *sk)
{
	struct inet_sock *isk;
	int ret = 0;

	lock_sock(sk);
	isk = inet_sk(sk);

	/* Already bound: nothing to do. */
	if (isk->inet_num)
		goto unlock;

	if (sk->sk_prot->get_port(sk, 0)) {
		ret = -EAGAIN;
		goto unlock;
	}
	isk->inet_sport = htons(isk->inet_num);

unlock:
	release_sock(sk);
	return ret;
}
/*
 * Socket-level sendmsg for inet sockets.
 *
 * Records the flow for RPS, autobinds the socket to a local port when it has
 * none and the protocol does not opt out via no_autobind, then forwards the
 * message to the protocol's sendmsg hook (udp_sendmsg for UDP).
 *
 * Returns -EAGAIN if autobinding fails, otherwise the protocol hook's result.
 */
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
	struct sock *sk = sock->sk;

	sock_rps_record_flow(sk);

	/* bind() was never called: pick a free local port automatically. */
	if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind) {
		if (inet_autobind(sk))
			return -EAGAIN;
	}

	return sk->sk_prot->sendmsg(sk, msg, size);
}
/*
 * udp_sendmsg - send a datagram, or append data to a corked one, on a UDP
 * (or UDP-Lite) IPv4 socket.
 *
 * @sk:  sending socket
 * @msg: user message; msg_name optionally carries the destination address,
 *       msg_control optionally carries ancillary data such as IP options
 * @len: payload length in bytes (at most 0xFFFF)
 *
 * Outline:
 *  1. If frames are already pending on a corked socket, append to them.
 *  2. Otherwise determine the destination (msg_name or the connected peer),
 *     process ancillary data / socket-level IP options, and find a route
 *     (re-using the socket's cached route when the socket is connected and
 *     nothing overrides it).
 *  3. Without corking, build one skb and send it locklessly; with corking,
 *     take the socket lock and queue the data via ip_append_data().
 *
 * Every exit taken after ip_cmsg_send() may have allocated ipc.opt goes
 * through the out/out_free labels so the options are always freed.
 *
 * Returns len on success or a negative errno on failure.
 */
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct udp_sock *up = udp_sk(sk);
	struct flowi4 fl4_stack;
	struct flowi4 *fl4;
	int ulen = len;
	struct ipcm_cookie ipc;
	struct rtable *rt = NULL;
	int free = 0;
	int connected = 0;
	__be32 daddr, faddr, saddr;
	__be16 dport;
	u8 tos;
	int err, is_udplite = IS_UDPLITE(sk);
	/* Corking requested via the socket's cork flag (UDP_CORK) or MSG_MORE. */
	int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
	struct sk_buff *skb;
	struct ip_options_data opt_copy;

	/* The UDP header's length field is only 16 bits wide. */
	if (len > 0xFFFF)
		return -EMSGSIZE;

	/*
	 *	Check the flags.
	 */

	if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
		return -EOPNOTSUPP;

	ipc.opt = NULL;
	ipc.tx_flags = 0;
	ipc.ttl = 0;
	ipc.tos = -1;

	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;

	fl4 = &inet->cork.fl.u.ip4;
	if (up->pending) {
		/*
		 * There are pending frames.
		 * The socket lock must be held while it's corked.
		 */
		lock_sock(sk);
		if (likely(up->pending)) {
			if (unlikely(up->pending != AF_INET)) {
				release_sock(sk);
				return -EINVAL;
			}
			/* Data is already pending: just append to it. */
			goto do_append_data;
		}
		release_sock(sk);
	}
	/* Otherwise start a new UDP datagram. */
	ulen += sizeof(struct udphdr);

	/*
	 *	Get and verify the address.
	 */
	if (msg->msg_name) {
		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
		if (msg->msg_namelen < sizeof(*usin))
			return -EINVAL;
		if (usin->sin_family != AF_INET) {
			if (usin->sin_family != AF_UNSPEC)
				return -EAFNOSUPPORT;
		}

		daddr = usin->sin_addr.s_addr;
		dport = usin->sin_port;
		if (dport == 0)
			return -EINVAL;
	} else {
		/* No address given: the socket must have been connect()ed. */
		if (sk->sk_state != TCP_ESTABLISHED)
			return -EDESTADDRREQ;
		daddr = inet->inet_daddr;
		dport = inet->inet_dport;
		/* Open fast path for connected socket.
		   Route will not be used, if at least one option is set.
		 */
		connected = 1;
	}

	ipc.sockc.tsflags = sk->sk_tsflags;
	ipc.addr = inet->inet_saddr;
	ipc.oif = sk->sk_bound_dev_if;

	if (msg->msg_controllen) {
		/* Process ancillary data, e.g. IP options. */
		err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
		if (unlikely(err)) {
			kfree(ipc.opt);
			return err;
		}
		/* Options supplied via cmsg must be freed before returning. */
		if (ipc.opt)
			free = 1;
		connected = 0;
	}
	if (!ipc.opt) {
		/*
		 * No IP options in the ancillary data: fall back to options
		 * stored on the socket, if any.
		 */
		struct ip_options_rcu *inet_opt;

		rcu_read_lock();
		inet_opt = rcu_dereference(inet->inet_opt);
		if (inet_opt) {
			memcpy(&opt_copy, inet_opt,
			       sizeof(*inet_opt) + inet_opt->opt.optlen);
			ipc.opt = &opt_copy.opt;
		}
		rcu_read_unlock();
	}

	saddr = ipc.addr;
	/* From here on ipc.addr carries the destination address. */
	ipc.addr = faddr = daddr;

	sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);

	if (ipc.opt && ipc.opt->opt.srr) {
		/* Source routing is requested. */
		if (!daddr) {
			/*
			 * ipc.opt may have been allocated by ip_cmsg_send();
			 * leave through the cleanup path instead of returning
			 * directly so it is not leaked.
			 */
			err = -EINVAL;
			goto out_free;
		}
		/* Route towards the first hop named in the IP option. */
		faddr = ipc.opt->opt.faddr;
		/* The cached route no longer applies. */
		connected = 0;
	}
	tos = get_rttos(&ipc, inet);
	if (sock_flag(sk, SOCK_LOCALROUTE) ||
	    (msg->msg_flags & MSG_DONTROUTE) ||
	    (ipc.opt && ipc.opt->opt.is_strictroute)) {
		/*
		 * Local-route socket flag, MSG_DONTROUTE or strict source
		 * routing: request an on-link route lookup.
		 */
		tos |= RTO_ONLINK;
		connected = 0;
	}

	if (ipv4_is_multicast(daddr)) {
		/*
		 * Multicast: take the outgoing interface and source address
		 * from the socket's multicast settings when not given.
		 */
		if (!ipc.oif)
			ipc.oif = inet->mc_index;
		if (!saddr)
			saddr = inet->mc_addr;
		connected = 0;
	} else if (!ipc.oif)
		ipc.oif = inet->uc_index;

	/* A connected socket may still have a valid cached route. */
	if (connected)
		rt = (struct rtable *)sk_dst_check(sk, 0);

	if (!rt) {
		/* No usable cached route: perform a route lookup. */
		struct net *net = sock_net(sk);
		__u8 flow_flags = inet_sk_flowi_flags(sk);

		fl4 = &fl4_stack;

		flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
				   RT_SCOPE_UNIVERSE, sk->sk_protocol,
				   flow_flags,
				   faddr, saddr, dport, inet->inet_sport);

		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
		rt = ip_route_output_flow(net, fl4, sk);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			if (err == -ENETUNREACH)
				IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
			goto out;
		}

		/* Broadcast routes require the socket's broadcast flag. */
		err = -EACCES;
		if ((rt->rt_flags & RTCF_BROADCAST) &&
		    !sock_flag(sk, SOCK_BROADCAST))
			goto out;
		if (connected)
			sk_dst_set(sk, dst_clone(&rt->dst));
	}

	/* MSG_CONFIRM: confirm the route's destination entry before sending. */
	if (msg->msg_flags&MSG_CONFIRM)
		goto do_confirm;
back_from_confirm:

	/* Use the source address selected by the route lookup. */
	saddr = fl4->saddr;
	/* No destination address set so far: take it from the flow. */
	if (!ipc.addr)
		daddr = ipc.addr = fl4->daddr;

	/* Lockless fast path for the non-corking case. */
	if (!corkreq) {
		/* Build the complete skb in one go and send it immediately. */
		skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
				  sizeof(struct udphdr), &ipc, &rt,
				  msg->msg_flags);
		err = PTR_ERR(skb);
		if (!IS_ERR_OR_NULL(skb))
			err = udp_send_skb(skb, fl4);
		goto out;
	}

	/* Corking: the socket lock is required from here on. */
	lock_sock(sk);
	if (unlikely(up->pending)) {
		/* The socket is already corked while preparing it. */
		/* ... which is an evident application bug. --ANK */
		release_sock(sk);

		net_dbg_ratelimited("cork app bug 2\n");
		err = -EINVAL;
		goto out;
	}
	/*
	 *	Now cork the socket to pend data.
	 */
	fl4 = &inet->cork.fl.u.ip4;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->fl4_dport = dport;
	fl4->fl4_sport = inet->inet_sport;
	/* Mark that an IPv4 datagram is being assembled. */
	up->pending = AF_INET;

do_append_data:
	up->len += ulen;
	err = ip_append_data(sk, fl4, getfrag, msg, ulen,
			     sizeof(struct udphdr), &ipc, &rt,
			     corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
	if (err)
		/* On error discard everything that was pending. */
		udp_flush_pending_frames(sk);
	else if (!corkreq)
		/* Not corked any more: push out all queued data. */
		err = udp_push_pending_frames(sk);
	else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
		/* Nothing is queued: clear the pending marker. */
		up->pending = 0;
	release_sock(sk);

out:
	/* Drop the route reference (rt may be NULL here). */
	ip_rt_put(rt);
out_free:
	if (free)
		kfree(ipc.opt);
	if (!err)
		return len;
	/*
	 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
	 * ENOBUFS might not be good (it's not tunable per se), but otherwise
	 * we don't have a good statistic (IpOutDiscards but it can be too many
	 * things).  We could add another new stat but at least for now that
	 * seems like overkill.
	 */
	if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
		UDP_INC_STATS(sock_net(sk),
			      UDP_MIB_SNDBUFERRORS, is_udplite);
	}
	return err;

do_confirm:
	dst_confirm(&rt->dst);
	/* With MSG_PROBE and a zero-length payload nothing is sent. */
	if (!(msg->msg_flags&MSG_PROBE) || len)
		goto back_from_confirm;
	err = 0;
	goto out;
}