// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The User Datagram Protocol (UDP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Hirokazu Takahashi, <taka@valinux.co.jp> * * Fixes: * Alan Cox : verify_area() calls * Alan Cox : stopped close while in use off icmp * messages. Not a fix but a botch that * for udp at least is 'valid'. * Alan Cox : Fixed icmp handling properly * Alan Cox : Correct error for oversized datagrams * Alan Cox : Tidied select() semantics. * Alan Cox : udp_err() fixed properly, also now * select and read wake correctly on errors * Alan Cox : udp_send verify_area moved to avoid mem leak * Alan Cox : UDP can count its memory * Alan Cox : send to an unknown connection causes * an ECONNREFUSED off the icmp, but * does NOT close. * Alan Cox : Switched to new sk_buff handlers. No more backlog! * Alan Cox : Using generic datagram code. Even smaller and the PEEK * bug no longer crashes it. * Fred Van Kempen : Net2e support for sk->broadcast. * Alan Cox : Uses skb_free_datagram * Alan Cox : Added get/set sockopt support. * Alan Cox : Broadcasting without option set returns EACCES. * Alan Cox : No wakeup calls. Instead we now use the callbacks. * Alan Cox : Use ip_tos and ip_ttl * Alan Cox : SNMP Mibs * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support. * Matt Dillon : UDP length checks. * Alan Cox : Smarter af_inet used properly. * Alan Cox : Use new kernel side addressing. * Alan Cox : Incorrect return on truncated datagram receive. * Arnt Gulbrandsen : New udp_send and stuff * Alan Cox : Cache last socket * Alan Cox : Route cache * Jon Peatfield : Minor efficiency fix to sendto(). * Mike Shaver : RFC1122 checks. * Alan Cox : Nonblocking error fix. 
* Willy Konynenberg : Transparent proxying support. * Mike McLagan : Routing by source * David S. Miller : New socket lookup architecture. * Last socket cache retained as it * does have a high hit rate. * Olaf Kirch : Don't linearise iovec on sendmsg. * Andi Kleen : Some cleanups, cache destination entry * for connect. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Melvin Smith : Check msg_name not msg_namelen in sendto(), * return ENOTCONN for unconnected sockets (POSIX) * Janos Farkas : don't deliver multi/broadcasts to a different * bound-to-device socket * Hirokazu Takahashi : HW checksumming for outgoing UDP * datagrams. * Hirokazu Takahashi : sendfile() on UDP works now. * Arnaldo C. Melo : convert /proc/net/udp to seq_file * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind * a single port at the same time. * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support * James Chapman : Add L2TP encapsulation type.
*/
/* * Note: we still hold spinlock of primary hash chain, so no other writer * can insert/delete a socket with local_port == num
*/ staticint udp_lib_lport_inuse2(struct net *net, __u16 num, struct udp_hslot *hslot2, struct sock *sk)
{
kuid_t uid = sk_uid(sk); struct sock *sk2; int res = 0;
/** * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * * @sk: socket struct in question * @snum: port number to look up * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, * with NULL address
*/ int udp_lib_get_port(struct sock *sk, unsignedshort snum, unsignedint hash2_nulladdr)
{ struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); int error = -EADDRINUSE;
if (!snum) {
DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); unsignedshort first, last; int low, high, remaining; unsignedint rand;
rand = get_random_u32();
first = reciprocal_scale(rand, remaining) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE
*/
rand = (rand | 1) * (udptable->mask + 1);
last = first + udptable->mask + 1; do {
hslot = udp_hashslot(udptable, net, first);
bitmap_zero(bitmap, PORTS_PER_CHAIN);
spin_lock_bh(&hslot->lock);
udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
udptable->log);
snum = first; /* * Iterate on all possible values of snum for this hash. * Using steps of an odd multiple of UDP_HTABLE_SIZE * give us randomization and full range coverage.
*/ do { if (low <= snum && snum <= high &&
!test_bit(snum >> udptable->log, bitmap) &&
!inet_is_local_reserved_port(net, snum)) goto found;
snum += rand;
} while (snum != first);
spin_unlock_bh(&hslot->lock);
cond_resched();
} while (++first != last); goto fail;
} else {
hslot = udp_hashslot(udptable, net, snum);
spin_lock_bh(&hslot->lock); if (hslot->count > 10) { int exist; unsignedint slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
/** * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port) * @net: Network namespace * @saddr: Source address, network order * @sport: Source port, network order * @daddr: Destination address, network order * @hnum: Destination port, host order * @dif: Destination interface index * @sdif: Destination bridge port index, if relevant * @udptable: Set of UDP hash tables * * Simplified lookup to be used as fallback if no sockets are found due to a * potential race between (receive) address change, and lookup happening before * the rehash operation. This function ignores SO_REUSEPORT groups while scoring * result sockets, because if we have one, we don't need the fallback at all. * * Called under rcu_read_lock(). * * Return: socket with highest matching score if any, NULL if none
*/ staticstruct sock *udp4_lib_lookup1(conststruct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsignedint hnum, int dif, int sdif, conststruct udp_table *udptable)
{ unsignedint slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot = &udptable->hash[slot]; struct sock *sk, *result = NULL; int score, badness = 0;
if (sk->sk_state == TCP_ESTABLISHED) {
result = sk; continue;
}
result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
saddr, sport, daddr, hnum, udp_ehashfn); if (!result) {
result = sk; continue;
}
/* Fall back to scoring if group has connections */ if (!reuseport_has_conns(sk)) return result;
/* Reuseport logic returned an error, keep original score. */ if (IS_ERR(result)) continue;
/* compute_score is too long of a function to be
 * inlined, and calling it again here yields
 * measurable overhead for some
 * workloads. Work around it by jumping
 * backwards to rescore 'result'.
*/
need_rescore = true; goto rescore;
}
} return result;
}
#if IS_ENABLED(CONFIG_BASE_SMALL)
/* CONFIG_BASE_SMALL stub of the four-tuple lookup: it never finds a socket
 * and always returns NULL. All arguments are ignored.
 */
static struct sock *udp4_lib_lookup4(const struct net *net,
				     __be32 saddr, __be16 sport,
				     __be32 daddr, unsigned int hnum,
				     int dif, int sdif,
				     struct udp_table *udptable)
{
	return NULL;
}
begin: /* SLAB_TYPESAFE_BY_RCU not used, so we don't need to touch sk_refcnt */
udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) {
sk = (struct sock *)up; if (inet_match(net, sk, acookie, ports, dif, sdif)) return sk;
}
/* if the nulls value we got at the end of this lookup is not the * expected one, we must restart lookup. We probably met an item that * was moved to another chain due to rehash.
*/ if (get_nulls_value(node) != slot) goto begin;
return NULL;
}
/* udp_rehash4() only checks hslot4, and hash4_cnt is not processed. */ staticvoid udp_rehash4(struct udp_table *udptable, struct sock *sk,
u16 newhash4)
{ struct udp_hslot *hslot4, *nhslot4;
/* Connected udp socket can re-connect to another remote address, which * will be handled by rehash. Thus no need to redo hash4 here.
*/ if (udp_hashed4(sk)) return;
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
 * harder than this. -DaveM
 *
 * Look up the IPv4 UDP socket for a (saddr, sport, daddr, dport) tuple.
 * The visible steps run in this order:
 *  1. four-tuple lookup via udp4_lib_lookup4(), when udp_has_hash4(hslot2);
 *  2. udp4_lib_lookup2() against daddr; an established match wins at once;
 *  3. BPF sk_lookup redirect, only when udptable is the netns's own table;
 *  4. any other non-NULL result of step 2 (a socket or an error pointer);
 *  5. udp4_lib_lookup2() against INADDR_ANY;
 *  6. udp4_lib_lookup1() on the primary hash as a last resort.
 *
 * Return: the socket found, or NULL. Error pointers reaching the "done"
 * label are converted to NULL. The visible code does not touch the socket
 * refcount itself.
 *
 * NOTE(review): nothing in this view assigns hash2 or hslot2 before hslot2
 * is read, and step 5 passes the same hslot2 as step 2 -- confirm against
 * the full file that the slot computations are present.
 */
struct sock *__udp4_lib_lookup(conststruct net *net, __be32 saddr,
__be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb)
{ unsignedshort hnum = ntohs(dport); struct udp_hslot *hslot2; struct sock *result, *sk; unsignedint hash2;
/* Step 1: a four-tuple hit is returned directly, bypassing "done". */
if (udp_has_hash4(hslot2)) {
result = udp4_lib_lookup4(net, saddr, sport, daddr, hnum,
dif, sdif, udptable); if (result) /* udp4_lib_lookup4 return sk or NULL */ return result;
}
/* Lookup connected or non-wildcard socket */
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif, sdif,
hslot2, skb); if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED) goto done;
/* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
udptable == net->ipv4.udp_table) {
sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr),
saddr, sport, daddr, hnum, dif,
udp_ehashfn); if (sk) {
/* A socket selected by the BPF program overrides the step 2 result. */
result = sk; goto done;
}
}
/* Got non-wildcard socket or error on first lookup */ if (result) goto done;
/* Step 5: retry the secondary-hash lookup with the wildcard address. */
result = udp4_lib_lookup2(net, saddr, sport,
htonl(INADDR_ANY), hnum, dif, sdif,
hslot2, skb); if (!IS_ERR_OR_NULL(result)) goto done;
/* Primary hash (destination port) lookup as fallback for this race:
 *   1. __ip4_datagram_connect() sets sk_rcv_saddr
 *   2. lookup (this function): new sk_rcv_saddr, hashes not updated yet
 *   3. rehash operation updating _secondary and four-tuple_ hashes
 * The primary hash doesn't need an update after 1., so, thanks to this
 * further step, 1. and 3. don't need to be atomic against the lookup.
 */
result = udp4_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif,
udptable);
/* Callers only ever see a socket or NULL, never an error pointer. */
done: if (IS_ERR(result)) return NULL; return result;
}
EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
/* Socket lookup that hands back a referenced socket.
 *
 * Must be called under rcu_read_lock(). On success the returned socket's
 * sk_refcnt has been incremented; NULL is returned when nothing matches or
 * when the refcount could not be raised from a non-zero value.
 */
#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport,
			     __be32 daddr, __be16 dport, int dif)
{
	struct sock *found;

	/* sdif is passed as 0 and no skb is supplied for this lookup. */
	found = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
				  dif, 0, net->ipv4.udp_table, NULL);
	if (!found)
		return NULL;

	if (!refcount_inc_not_zero(&found->sk_refcnt))
		return NULL;

	return found;
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
#endif
staticinlinebool __udp_is_mcast_sock(struct net *net, conststruct sock *sk,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr, int dif, int sdif, unsignedshort hnum)
{ conststruct inet_sock *inet = inet_sk(sk);
/* Handler for tunnels with arbitrary destination ports: no socket lookup is
 * done. Instead, the error is offered to the err_handler of every registered
 * encapsulation in turn until one of them accepts it.
 *
 * Return: 0 as soon as a handler returns 0, -ENOENT if none does.
 */
static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
{
	int idx;

	for (idx = 0; idx < MAX_IPTUN_ENCAP_OPS; idx++) {
		const struct ip_tunnel_encap_ops *ops;
		int (*err_cb)(struct sk_buff *skb, u32 info);

		ops = rcu_dereference(iptun_encaps[idx]);
		if (!ops)
			continue;	/* empty slot */

		err_cb = ops->err_handler;
		if (!err_cb)
			continue;	/* encapsulation without error handler */

		if (err_cb(skb, info) == 0)
			return 0;
	}

	return -ENOENT;
}
/* Try to match ICMP errors to UDP tunnels by looking up a socket without * reversing source and destination port: this will match tunnels that force the * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that * lwtunnels might actually break this assumption by being configured with * different destination ports on endpoints, in this case we won't be able to * trace ICMP messages back to them. * * If this doesn't match any socket, probe tunnels with arbitrary destination * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port * we've sent packets to won't necessarily match the local destination port. * * Then ask the tunnel implementation to match the error against a valid * association. * * Return an error if we can't find a match, the socket if we need further * processing, zero otherwise.
*/ staticstruct sock *__udp4_lib_err_encap(struct net *net, conststruct iphdr *iph, struct udphdr *uh, struct udp_table *udptable, struct sock *sk, struct sk_buff *skb, u32 info)
{ int (*lookup)(struct sock *sk, struct sk_buff *skb); int network_offset, transport_offset; struct udp_sock *up;
/* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. * Header points to the ip header of the error packet. We move * on past this. Then (as it used to claim before adjustment) * header points to the first 8 bytes of the udp header. We need * to find the appropriate port.
*/
if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { /* No socket for error: try tunnels before discarding */ if (static_branch_unlikely(&udp_encap_needed_key)) {
sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb,
info); if (!sk) return 0;
} else
sk = ERR_PTR(-ENOENT);
if (IS_ERR(sk)) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return PTR_ERR(sk);
}
tunnel = true;
}
err = 0;
harderr = 0;
inet = inet_sk(sk);
switch (type) { default: case ICMP_TIME_EXCEEDED:
err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: goto out; case ICMP_PARAMETERPROB:
err = EPROTO;
harderr = 1; break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
ipv4_sk_update_pmtu(skb, sk, info); if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1; break;
} goto out;
}
err = EHOSTUNREACH; if (code <= NR_ICMP_UNREACH) {
harderr = icmp_err_convert[code].fatal;
err = icmp_err_convert[code].errno;
} break; case ICMP_REDIRECT:
ipv4_sk_redirect(skb, sk); goto out;
}
/* * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3.
*/ if (tunnel) { /* ...not for tunnels though: we don't have a sending socket */ if (udp_sk(sk)->encap_err_rcv)
udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info,
(u8 *)(uh+1)); goto out;
} if (!inet_test_bit(RECVERR, sk)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out;
} else
ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
/* * Throw away all pending data and cancel the corking. Socket is locked.
*/ void udp_flush_pending_frames(struct sock *sk)
{ struct udp_sock *up = udp_sk(sk);
/** * udp4_hwcsum - handle outgoing HW checksumming * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) * @src: source IP address * @dst: destination IP address
*/ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
{ struct udphdr *uh = udp_hdr(skb); int offset = skb_transport_offset(skb); int len = skb->len - offset; int hlen = len;
__wsum csum = 0;
if (!skb_has_frag_list(skb)) { /* * Only one fragment on the socket.
*/
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct udphdr, check);
uh->check = ~csum_tcpudp_magic(src, dst, len,
IPPROTO_UDP, 0);
} else { struct sk_buff *frags;
/* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together
*/
skb_walk_frags(skb, frags) {
csum = csum_add(csum, frags->csum);
hlen -= frags->len;
}
/* Function to set UDP checksum for an IPv4 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel.
*/ void udp_set_csum(bool nocheck, struct sk_buff *skb,
__be32 saddr, __be32 daddr, int len)
{ struct udphdr *uh = udp_hdr(skb);
fl4 = &inet->cork.fl.u.ip4; if (READ_ONCE(up->pending)) { /* * There are pending frames. * The socket lock must be held while it's corked.
*/
lock_sock(sk); if (likely(up->pending)) { if (unlikely(up->pending != AF_INET)) {
release_sock(sk); return -EINVAL;
} goto do_append_data;
}
release_sock(sk);
}
ulen += sizeof(struct udphdr);
/* * Get and verify the address.
*/ if (usin) { if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { if (usin->sin_family != AF_UNSPEC) return -EAFNOSUPPORT;
}
daddr = usin->sin_addr.s_addr;
dport = usin->sin_port; if (dport == 0) return -EINVAL;
} else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ;
daddr = inet->inet_daddr;
dport = inet->inet_dport; /* Open fast path for connected socket. Route will not be used, if at least one option is set.
*/
connected = 1;
}
uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = READ_ONCE(inet->mc_index); if (!saddr)
saddr = READ_ONCE(inet->mc_addr);
connected = 0;
} elseif (!ipc.oif) {
ipc.oif = uc_index;
} elseif (ipv4_is_lbcast(daddr) && uc_index) { /* oif is set, packet is to local broadcast and * uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index.
*/ if (ipc.oif != uc_index &&
ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
uc_index)) {
ipc.oif = uc_index;
}
}
if (connected)
rt = dst_rtable(sk_dst_check(sk, 0));
if (!rt) { struct net *net = sock_net(sk);
__u8 flow_flags = inet_sk_flowi_flags(sk);
lock_sock(sk); if (unlikely(up->pending)) { /* The socket is already corked while preparing it. */ /* ... which is an evident application bug. --ANK */
release_sock(sk);
out:
ip_rt_put(rt);
out_free: if (free)
kfree(ipc.opt); if (!err) return len; /* * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting * ENOBUFS might not be good (it's not tunable per se), but otherwise * we don't have a good statistic (IpOutDiscards but it can be too many * things). We could add another new stat but at least for now that * seems like overkill.
*/ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
UDP_INC_STATS(sock_net(sk),
UDP_MIB_SNDBUFERRORS, is_udplite);
} return err;
do_confirm: if (msg->msg_flags & MSG_PROBE)
dst_confirm_neigh(&rt->dst, &fl4->daddr); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm;
err = 0; goto out;
}
EXPORT_SYMBOL(udp_sendmsg);
if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk)) return;
lock_sock(sk); if (up->pending && !udp_test_bit(CORK, sk))
udp_push_pending_frames(sk);
release_sock(sk);
}
EXPORT_IPV6_MOD_GPL(udp_splice_eof);
/* Flag value with only bit 31 set.
 * NOTE(review): presumably marks an skb whose head state has already been
 * released ("stateless"); the code that sets and tests it is outside this
 * view -- confirm.
 */
#define UDP_SKB_IS_STATELESS 0x80000000
/* all head states (dst, sk, nf conntrack) except skb extensions are * cleared by udp_rcv(). * * We need to preserve secpath, if present, to eventually process * IP_CMSG_PASSSEC at recvmsg() time. * * Other extensions can be cleared.
*/ staticbool udp_try_make_stateless(struct sk_buff *skb)
{ if (!skb_has_extensions(skb)) returntrue;
if (!secpath_exists(skb)) {
skb_ext_reset(skb); returntrue;
}
/* Record in the skb scratch area that no further checksum work is needed.
 *
 * Reached after udp_lib_checksum_complete() returned 0, which means
 * __skb_checksum_complete() might have set skb->csum_valid to 1.
 * Only 64bit builds store the hint, and only for an skb that is not shared;
 * on other builds this function is a no-op.
 */
static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
{
#if BITS_PER_LONG == 64
	/* A shared skb must be left untouched. */
	if (skb_shared(skb))
		return;

	udp_skb_scratch(skb)->csum_unnecessary = true;
#endif
}
/* acquire the sk_receive_queue lock for fwd allocated memory scheduling,
 * if the caller doesn't hold it already
*/
sk_queue = &sk->sk_receive_queue; if (!rx_queue_lock_held)
spin_lock(&sk_queue->lock);
if (amt)
__sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);
atomic_sub(size, &sk->sk_rmem_alloc);
/* this can save us from acquiring the rx queue lock on next receive */
skb_queue_splice_tail_init(sk_queue, &up->reader_queue);
if (!rx_queue_lock_held)
spin_unlock(&sk_queue->lock);
}
/* Release the receive-memory accounting of @skb on socket @sk.
 *
 * Note: called with reader_queue.lock held.
 * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch
 * This avoids a cache line miss while receive_queue lock is held.
 * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
 *
 * @sk:  socket the skb was queued on
 * @skb: buffer being removed from the queue
 */
void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
{
	/* Issue a prefetch for the address of the skb's data field. */
	prefetch(&skb->data);
	/* Size comes from udp_skb_truesize(), not skb->truesize (see above).
	 * NOTE(review): the trailing "false" presumably tells
	 * udp_rmem_release() that the sk_receive_queue lock is not held by
	 * the caller; the meaning of the "1" is not visible here -- confirm
	 * against udp_rmem_release()'s signature.
	 */
	udp_rmem_release(sk, udp_skb_truesize(skb), 1, false);
}
EXPORT_IPV6_MOD(udp_skb_destructor);
/* Same job as udp_skb_destructor(), for callers that hold the rx queue
 * lock too: the release helper is told so through its last argument.
 */
static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
{
	prefetch(&skb->data);
	udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
}
/* Idea of busylocks is to let producers grab an extra spinlock
 * to relieve pressure on the receive_queue spinlock shared by consumer.
 * Under flood, this means that only one producer can be in line
 * trying to acquire the receive_queue spinlock.
 * These busylocks can be allocated on a per-CPU basis, instead of a
 * per-socket one (that would consume a cache line per socket)
*/ staticint udp_busylocks_log __read_mostly; static spinlock_t *udp_busylocks __read_mostly;
/* Immediately drop when the receive queue is full. * Cast to unsigned int performs the boundary check for INT_MAX.
*/ if (rmem + size > rcvbuf) { if (rcvbuf > INT_MAX >> 1) goto drop;
/* Always allow at least one packet for small buffer. */ if (rmem > rcvbuf) goto drop;
}
/* Under mem pressure, it might be helpful to help udp_recvmsg() * having linear skbs : * - Reduce memory overhead and thus increase receive queue capacity * - Less cache line misses at copyout() time * - Less work at consume_skb() (less alien page frag freeing)
*/ if (rmem > (rcvbuf >> 1)) {
skb_condense(skb);
size = skb->truesize;
busy = busylock_acquire(sk);
}
/* Consume an skb that was dequeued from a UDP socket.
 *
 * @sk:  socket the skb came from
 * @skb: buffer to consume
 * @len: amount handed to sk_peek_offset_bwd() when the socket is peeking
 *       with an offset
 *
 * The skb is only torn down when skb_unref() reports that this was the last
 * reference.
 */
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
{
	if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset)))
		sk_peek_offset_bwd(sk, len);

	if (skb_unref(skb)) {
		/* In the more common cases we cleared the head states
		 * previously, see __udp_queue_rcv_skb().
		 */
		if (unlikely(udp_skb_has_head_state(skb)))
			skb_release_head_state(skb);

		__consume_stateless_skb(skb);
	}
}
EXPORT_IPV6_MOD_GPL(skb_consume_udp);
/** * first_packet_length - return length of first packet in receive queue * @sk: socket * * Drops all bad checksum frames, until a valid one is found. * Returns the length of found skb, or -1 if none is found.
*/ staticint first_packet_length(struct sock *sk)
{ struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue; struct sk_buff_head *sk_queue = &sk->sk_receive_queue; unsignedint total = 0; struct sk_buff *skb; int res;
error = -EAGAIN; do {
spin_lock_bh(&queue->lock);
skb = __skb_try_recv_from_queue(queue, flags, off, err,
&last); if (skb) { if (!(flags & MSG_PEEK))
udp_skb_destructor(sk, skb);
spin_unlock_bh(&queue->lock); return skb;
}
if (skb_queue_empty_lockless(sk_queue)) {
spin_unlock_bh(&queue->lock); goto busy_check;
}
/* refill the reader queue and walk it again * keep both queues locked to avoid re-acquiring * the sk_receive_queue lock if fwd memory scheduling * is needed.
*/
spin_lock(&sk_queue->lock);
skb_queue_splice_tail_init(sk_queue, queue);
/* * If checksum is needed at all, try to do it while copying the * data. If the data is truncated, or if we only want a partial * coverage checksum (UDP-Lite), do it before the copy.
*/
/* starting over for a new packet, but check if we need to yield */
cond_resched();
msg->msg_flags &= ~MSG_TRUNC; goto try_again;
}
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{ /* This check is replicated from __ip4_datagram_connect() and * intended to prevent BPF program called below from accessing bytes * that are out of the bound specified by user in addr_len.
*/ if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL;
if (hslot2 != nhslot2 ||
rcu_access_pointer(sk->sk_reuseport_cb)) { /* we must lock primary chain too */
spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
if (hslot2 != nhslot2) {
spin_lock(&hslot2->lock);
hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
hslot2->count--;
spin_unlock(&hslot2->lock);
/* Now process hash4 if necessary: * (1) update hslot4; * (2) update hslot2->hash4_cnt. * Note that hslot2/hslot4 should be checked separately, as * either of them may change with the other unchanged.
*/ if (udp_hashed4(sk)) {
spin_lock_bh(&hslot->lock);
/* returns: * -1: error * 0: success * >0: "udp encap" protocol resubmission * * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed.
*/ staticint udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{ enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk);
/* * Charge it to the socket, dropping if the queue is full.
*/ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto drop;
}
nf_reset_ct(skb);
if (static_branch_unlikely(&udp_encap_needed_key) &&
READ_ONCE(up->encap_type)) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
/* * This is an encapsulation socket so pass the skb to * the socket's udp_encap_rcv() hook. Otherwise, just * fall through and pass this up the UDP socket. * up->encap_rcv() returns the following value: * =0 if skb was successfully passed to the encap * handler or was discarded by it. * >0 if skb should be passed on to UDP. * <0 if skb should be resubmitted as proto -N
*/
/* if we're overly short, let UDP handle it */
encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret;
/* Verify checksum before giving to encap */ if (udp_lib_checksum_complete(skb)) goto csum_error;
ret = encap_rcv(sk, skb); if (ret <= 0) {
__UDP_INC_STATS(sock_net(sk),
UDP_MIB_INDATAGRAMS,
is_udplite); return -ret;
}
}
/* FALLTHROUGH -- it's a UDP Packet */
}
/* * UDP-Lite specific tests, ignored on UDP sockets
*/ if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
u16 pcrlen = READ_ONCE(up->pcrlen);
/* * MIB statistics other than incrementing the error count are * disabled for the following two types of errors: these depend * on the application settings, not on the functioning of the * protocol stack as such. * * RFC 3828 here recommends (sec 3.3): "There should also be a * way ... to ... at least let the receiving application block * delivery of packets with coverage values less than a value * provided by the application."
*/ if (pcrlen == 0) { /* full coverage was set */
net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
UDP_SKB_CB(skb)->cscov, skb->len); goto drop;
} /* The next case involves violating the min. coverage requested * by the receiver. This is subtle: if receiver wants x and x is * greater than the buffersize/MTU then receiver will complain * that it wants x while sender emits packets of smaller size y. * Therefore the above ...()->partial_cov statement is essential.
*/ if (UDP_SKB_CB(skb)->cscov < pcrlen) {
net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
UDP_SKB_CB(skb)->cscov, pcrlen); goto drop;
}
}
prefetch(&sk->sk_rmem_alloc); if (rcu_access_pointer(sk->sk_filter) &&
udp_lib_checksum_complete(skb)) goto csum_error;
if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason)) goto drop;
udp_post_segment_fix_csum(skb);
ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0)
ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
} return 0;
}
/* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes.
*/ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
{ struct dst_entry *old;
if (dst_hold_safe(dst)) {
old = unrcu_pointer(xchg(&sk->sk_rx_dst, RCU_INITIALIZER(dst)));
dst_release(old); return old != dst;
} returnfalse;
}
EXPORT_IPV6_MOD(udp_sk_rx_dst_set);
/* * Multicasts and broadcasts go to each listener. * * Note: called only from the BH handler context.
*/ staticint __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh,
__be32 saddr, __be32 daddr, struct udp_table *udptable, int proto)
{ struct sock *sk, *first = NULL; unsignedshort hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); unsignedint hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); unsignedint offset = offsetof(typeof(*sk), sk_node); int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); struct hlist_node *node; struct sk_buff *nskb;
if (!first) {
first = sk; continue;
}
nskb = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!nskb)) {
atomic_inc(&sk->sk_drops);
__UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
IS_UDPLITE(sk));
__UDP_INC_STATS(net, UDP_MIB_INERRORS,
IS_UDPLITE(sk)); continue;
} if (udp_queue_rcv_skb(sk, nskb) > 0)
consume_skb(nskb);
}
/* Also lookup *:port if we are using hash2 and haven't done so yet. */ if (use_hash2 && hash2 != hash2_any) {
hash2 = hash2_any; goto start_lookup;
}
if (first) { if (udp_queue_rcv_skb(first, skb) > 0)
consume_skb(skb);
} else {
kfree_skb(skb);
__UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
proto == IPPROTO_UDPLITE);
} return 0;
}
/* Initialize UDP checksum state for a received skb.
 *
 * On a zero return (success), CHECKSUM_UNNECESSARY means that no more
 * checks are required. Otherwise, csum completion requires checksumming
 * the packet body, including the udp header, and folding it to skb->csum.
 *
 * Return: 0 on success, non-zero when the checksum is already known bad.
 */
static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
				 int proto)
{
	int rc;

	/* Note, we are only interested in != 0 or == 0, thus the
	 * force to int.
	 */
	rc = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
						       inet_compute_pseudo);
	if (rc)
		return rc;

	/* Nothing left to do unless a complete checksum failed to validate. */
	if (skb->ip_summed != CHECKSUM_COMPLETE || skb->csum_valid)
		return 0;

	/* If SW calculated the value, we know it's bad */
	if (skb->csum_complete_sw)
		return 1;

	/* HW says the value is bad. Let's validate that.
	 * skb->csum is no longer the full packet checksum,
	 * so don't treat it as such.
	 */
	skb_checksum_complete_unset(skb);

	return 0;
}
/* Wrapper for udp_queue_rcv_skb() taking care of csum conversion and of
 * return code conversion for ip layer consumption.
 *
 * Return: 0, or the negated positive value returned by udp_queue_rcv_skb().
 */
static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
			       struct udphdr *uh)
{
	int rc;

	if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
		skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);

	rc = udp_queue_rcv_skb(sk, skb);

	/* A return value > 0 means to resubmit the input, but the ip layer
	 * wants the return to be -protocol, or 0.
	 */
	return rc > 0 ? -rc : 0;
}
/*
 *	All we need to do is get the socket, and then do a checksum.
 *
 * Receive entry point shared by the callers that pass their own udptable
 * and protocol number (udp_rcv() passes the netns table and IPPROTO_UDP).
 * Every exit visible here frees the skb through sk_skb_reason_drop() and
 * returns 0.
 *
 * NOTE(review): in this view nothing jumps to short_packet or csum_error,
 * and uh, ulen, saddr and daddr are never assigned before those labels use
 * them; the header parsing, socket lookup and delivery steps appear to lie
 * outside what is shown -- confirm against the full file.
 */
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto)
{ struct sock *sk = NULL; struct udphdr *uh; unsignedshort ulen; struct rtable *rt = skb_rtable(skb);
__be32 saddr, daddr; struct net *net = dev_net(skb->dev); bool refcounted; int drop_reason;
drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
/*
 *  Validate the packet.
 */
if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto drop; /* No space for header. */
/*
 * Hmm.  We got an UDP packet to a port to which we
 * don't wanna listen.  Ignore it.
 */
sk_skb_reason_drop(sk, skb, drop_reason); return 0;
/* Too-short datagram: log it (rate limited) and count it as an error. */
short_packet:
drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
proto == IPPROTO_UDPLITE ? "Lite" : "",
&saddr, ntohs(uh->source),
ulen, skb->len,
&daddr, ntohs(uh->dest)); goto drop;
csum_error:
/*
 * RFC1122: OK.  Discards the bad packet silently (as far as
 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
 */
drop_reason = SKB_DROP_REASON_UDP_CSUM;
net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
proto == IPPROTO_UDPLITE ? "Lite" : "",
&saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
ulen);
/* Checksum failures bump CSUMERRORS and then also INERRORS below. */
__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
drop:
__UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
sk_skb_reason_drop(sk, skb, drop_reason); return 0;
}
/* We can only early demux multicast if there is a single matching socket. * If more than one socket found returns NULL
*/ staticstruct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr, int dif, int sdif)
{ struct udp_table *udptable = net->ipv4.udp_table; unsignedshort hnum = ntohs(loc_port); struct sock *sk, *result; struct udp_hslot *hslot; unsignedint slot;
/* Do not bother scanning a too big list */ if (hslot->count > 10) return NULL;
result = NULL;
sk_for_each_rcu(sk, &hslot->head) { if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
rmt_port, rmt_addr, dif, sdif, hnum)) { if (result) return NULL;
result = sk;
}
}
return result;
}
/* For unicast we should only early demux connected sockets or we can * break forwarding setups. The chains here can be long so only check * if the first socket is an exact match and if not move on.
*/ staticstruct sock *__udp4_lib_demux_lookup(struct net *net,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr, int dif, int sdif)
{ struct udp_table *udptable = net->ipv4.udp_table;
INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); unsignedshort hnum = ntohs(loc_port); struct udp_hslot *hslot2; unsignedint hash2;
__portpair ports; struct sock *sk;
if (dst)
dst = dst_check(dst, 0); if (dst) {
u32 itag = 0;
/* set noref for now. * any place which wants to hold dst has to call * dst_hold_safe()
*/
skb_dst_set_noref(skb, dst);
/* for unconnected multicast sockets we need to validate * the source on each packet
*/ if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr,
iph->saddr,
ip4h_dscp(iph),
skb->dev, in_dev, &itag);
} return 0;
}
/* Receive @skb as plain UDP: pass it to __udp4_lib_rcv() together with the
 * UDP table of the network namespace that owns skb->dev and IPPROTO_UDP,
 * and return whatever that call returns.
 */
int udp_rcv(struct sk_buff *skb)
{
	struct udp_table *udptable = dev_net(skb->dev)->ipv4.udp_table;

	return __udp4_lib_rcv(skb, udptable, IPPROTO_UDP);
}
if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) { if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
new_gro_receive = ipv6_stub->xfrm6_gro_udp_encap_rcv; else
new_gro_receive = xfrm4_gro_udp_encap_rcv;
if (udp_sk(sk)->gro_receive != new_gro_receive) { /* * With IPV6_ADDRFORM the gro callback could change * after being set, unregister the old one, if valid.
*/ if (udp_sk(sk)->gro_receive)
udp_tunnel_update_gro_rcv(sk, false);
case UDP_ENCAP:
sockopt_lock_sock(sk); switch (val) { case 0: #ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP:
set_xfrm_gro_udp_encap_rcv(val, sk->sk_family, sk); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6)
WRITE_ONCE(up->encap_rcv,
ipv6_stub->xfrm6_udp_encap_rcv); else #endif
WRITE_ONCE(up->encap_rcv,
xfrm4_udp_encap_rcv); #endif
fallthrough; case UDP_ENCAP_L2TPINUDP:
WRITE_ONCE(up->encap_type, val);
udp_tunnel_encap_enable(sk); break; default:
err = -ENOPROTOOPT; break;
}
sockopt_release_sock(sk); break;
case UDP_NO_CHECK6_TX:
udp_set_no_check6_tx(sk, valbool); break;
case UDP_NO_CHECK6_RX:
udp_set_no_check6_rx(sk, valbool); break;
case UDP_SEGMENT: if (val < 0 || val > USHRT_MAX) return -EINVAL;
WRITE_ONCE(up->gso_size, val); break;
case UDP_GRO:
sockopt_lock_sock(sk); /* when enabling GRO, accept the related GSO packet type */ if (valbool)
udp_tunnel_encap_enable(sk);
udp_assign_bit(GRO_ENABLED, sk, valbool);
udp_assign_bit(ACCEPT_L4, sk, valbool);
set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk);
sockopt_release_sock(sk); break;
/* * UDP-Lite's partial checksum coverage (RFC 3828).
*/ /* The sender sets actual checksum coverage length via this option.
* The case coverage > packet length is handled by send module. */ case UDPLITE_SEND_CSCOV: if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
val = 8; elseif (val > USHRT_MAX)
val = USHRT_MAX;
WRITE_ONCE(up->pcslen, val);
udp_set_bit(UDPLITE_SEND_CC, sk); break;
/* The receiver specifies a minimum checksum coverage value. To make * sense, this should be set to at least 8 (as done below). If zero is
* used, this again means full checksum coverage. */ case UDPLITE_RECV_CSCOV: if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Avoid silly minimal values. */
val = 8; elseif (val > USHRT_MAX)
val = USHRT_MAX;
WRITE_ONCE(up->pcrlen, val);
udp_set_bit(UDPLITE_RECV_CC, sk); break;
/* Set a socket option on a UDP socket.
 *
 * Options at level SOL_UDP, SOL_UDPLITE or SOL_SOCKET are passed to
 * udp_lib_setsockopt() with udp_push_pending_frames as its callback;
 * every other level is forwarded to ip_setsockopt().
 *
 * Returns the result of whichever handler was called.
 */
int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
		   unsigned int optlen)
{
	if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET)
		return udp_lib_setsockopt(sk, level, optname,
					  optval, optlen,
					  udp_push_pending_frames);
	return ip_setsockopt(sk, level, optname, optval, optlen);
}
/* Read an integer-valued UDP / UDP-Lite socket option.
 *
 * The caller's buffer length is fetched from @optlen. A negative length is
 * rejected and anything larger than sizeof(int) is clamped to sizeof(int).
 * On success the (possibly clamped) length is written back to @optlen and
 * that many bytes of the option value are copied to @optval.
 *
 * @level is not examined here.
 *
 * Returns 0 on success, -EFAULT when user memory cannot be read or written,
 * -EINVAL for a negative length, -ENOPROTOOPT for an unknown @optname.
 */
int udp_lib_getsockopt(struct sock *sk, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	struct udp_sock *up = udp_sk(sk);
	int val, len;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	/* Never copy out more than the int we hold in val. */
	len = min_t(unsigned int, len, sizeof(int));

	switch (optname) {
	case UDP_CORK:
		val = udp_test_bit(CORK, sk);
		break;

	case UDP_ENCAP:
		val = READ_ONCE(up->encap_type);
		break;

	case UDP_NO_CHECK6_TX:
		val = udp_get_no_check6_tx(sk);
		break;

	case UDP_NO_CHECK6_RX:
		val = udp_get_no_check6_rx(sk);
		break;

	case UDP_SEGMENT:
		val = READ_ONCE(up->gso_size);
		break;

	case UDP_GRO:
		val = udp_test_bit(GRO_ENABLED, sk);
		break;

	/* The following two cannot be changed on UDP sockets, the return is
	 * always 0 (which corresponds to the full checksum coverage of UDP).
	 */
	case UDPLITE_SEND_CSCOV:
		val = READ_ONCE(up->pcslen);
		break;

	case UDPLITE_RECV_CSCOV:
		val = READ_ONCE(up->pcrlen);
		break;

	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, len))
		return -EFAULT;
	return 0;
}
EXPORT_IPV6_MOD(udp_lib_getsockopt);
/* Read a socket option from a UDP socket.
 *
 * Levels other than SOL_UDP and SOL_UDPLITE are forwarded to
 * ip_getsockopt(); those two levels are answered by udp_lib_getsockopt().
 * The handler's return value is passed through unchanged.
 */
int udp_getsockopt(struct sock *sk, int level, int optname,
		   char __user *optval, int __user *optlen)
{
	if (level != SOL_UDP && level != SOL_UDPLITE)
		return ip_getsockopt(sk, level, optname, optval, optlen);

	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
}
/**
 *	udp_poll - wait for a UDP event.
 *	@file: - file struct
 *	@sock: - socket
 *	@wait: - poll table
 *
 *	Starts from the datagram_poll() result and adjusts readability.
 *	A blocking descriptor could be told by select that data is
 *	available while the queued packet has a checksum error, and then
 *	block in the following read. To work around such arguably broken
 *	applications, readability is withdrawn when first_packet_length()
 *	reports -1 on a blocking descriptor whose receive side is not shut
 *	down.
 */
__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t events = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;

	/* Packets waiting on the reader queue count as readable too. */
	if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
		events |= EPOLLIN | EPOLLRDNORM;

	/* Check for false positives due to checksum errors. The tests are
	 * nested in this order so first_packet_length() is only reached for
	 * a readable, blocking descriptor with the receive side still open.
	 */
	if (events & EPOLLRDNORM) {
		if (!(file->f_flags & O_NONBLOCK)) {
			if (!(sk->sk_shutdown & RCV_SHUTDOWN)) {
				if (first_packet_length(sk) == -1)
					events &= ~(EPOLLIN | EPOLLRDNORM);
			}
		}
	}

	/* psock ingress_msg queue should not contain any bad checksum frames */
	if (sk_is_readable(sk))
		events |= EPOLLIN | EPOLLRDNORM;

	return events;
}
EXPORT_IPV6_MOD(udp_poll);
int udp_abort(struct sock *sk, int err)
{ if (!has_current_bpf_ctx())
lock_sock(sk);
/* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing * with close()
*/ if (sock_flag(sk, SOCK_DEAD)) goto out;
staticint bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, unsignedint new_batch_sz, gfp_t flags); staticstruct sock *bpf_iter_udp_resume(struct sock *first_sk, union bpf_udp_iter_batch_item *cookies, int n_cookies)
{ struct sock *sk = NULL; int i;
for (i = 0; i < n_cookies; i++) {
sk = first_sk;
udp_portaddr_for_each_entry_from(sk) if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) goto done;
}
done: return sk;
}
/* The current batch is done, so advance the bucket. */ if (iter->cur_sk == iter->end_sk)
state->bucket++;
udptable = udp_get_table_seq(seq, net);
again: /* New batch for the next bucket. * Iterate over the hash table to find a bucket with sockets matching * the iterator attributes, and return the first matching socket from * the bucket. The remaining matched sockets from the bucket are batched * before releasing the bucket lock. This allows BPF programs that are * called in seq_show to acquire the bucket lock if needed.
*/
find_cookie = iter->cur_sk;
end_cookie = iter->end_sk;
iter->cur_sk = 0;
iter->end_sk = 0;
batch_sks = 0;
spin_lock_bh(&hslot2->lock);
sk = hlist_entry_safe(hslot2->head.first, struct sock,
__sk_common.skc_portaddr_node); /* Resume from the first (in iteration order) unseen socket from * the last batch that still exists in resume_bucket. Most of * the time this will just be where the last iteration left off * in resume_bucket unless that socket disappeared between * reads.
*/ if (state->bucket == resume_bucket)
sk = bpf_iter_udp_resume(sk, &iter->batch[find_cookie],
end_cookie - find_cookie);
fill_batch:
udp_portaddr_for_each_entry_from(sk) { if (seq_sk_match(seq, sk)) { if (iter->end_sk < iter->max_sk) {
sock_hold(sk);
iter->batch[iter->end_sk++].sk = sk;
}
batch_sks++;
}
}
/* Allocate a larger batch and try again. */ if (unlikely(resizes <= 1 && iter->end_sk &&
iter->end_sk != batch_sks)) {
resizes++;
/* First, try with GFP_USER to maximize the chances of * grabbing more memory.
*/ if (resizes == 1) {
spin_unlock_bh(&hslot2->lock);
err = bpf_iter_udp_realloc_batch(iter,
batch_sks * 3 / 2,
GFP_USER); if (err) return ERR_PTR(err); /* Start over. */ goto again;
}
/* Next, hold onto the lock, so the bucket doesn't * change while we get the rest of the sockets.
*/
err = bpf_iter_udp_realloc_batch(iter, batch_sks,
GFP_NOWAIT); if (err) {
spin_unlock_bh(&hslot2->lock); return ERR_PTR(err);
}
/* Pick up where we left off. */
sk = iter->batch[iter->end_sk - 1].sk;
sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next, struct sock,
__sk_common.skc_portaddr_node);
batch_sks = iter->end_sk; goto fill_batch;
}
spin_unlock_bh(&hslot2->lock);
if (iter->end_sk) break;
next_bucket:
resizes = 0;
}
/* Whenever seq_next() is called, the iter->cur_sk is * done with seq_show(), so unref the iter->cur_sk.
*/ if (iter->cur_sk < iter->end_sk)
sock_put(iter->batch[iter->cur_sk++].sk);
/* After updating iter->cur_sk, check if there are more sockets * available in the current bucket batch.
*/ if (iter->cur_sk < iter->end_sk)
sk = iter->batch[iter->cur_sk].sk; else /* Prepare a new batch. */
sk = bpf_iter_udp_batch(seq);
++*pos; return sk;
}
staticvoid *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
{ /* bpf iter does not support lseek, so it always * continue from where it was stop()-ped.
*/ if (*pos) return bpf_iter_udp_batch(seq);
/* Remember the cookies of the sockets we haven't seen yet, so we can * pick up where we left off next time around.
*/ while (cur_sk < iter->end_sk) {
item = &iter->batch[cur_sk++];
cookie = sock_gen_cookie(item->sk);
sock_put(item->sk);
item->cookie = cookie;
}
}
/* Select the UDP hash table used by network namespace @net.
 *
 * init_net always uses the global udp_table. For any other namespace the
 * requested size is read from sysctl_udp_child_hash_entries of the current
 * task's namespace (current->nsproxy->net_ns). Zero selects the global
 * table. Otherwise the size is raised to UDP_HTABLE_SIZE_MIN_PERNET or
 * rounded up to a power of two, and a per-netns table is allocated. If the
 * allocation fails, a warning is logged and the global table is used.
 */
static void __net_init udp_set_table(struct net *net)
{
	struct udp_table *udptable;
	unsigned int hash_entries;
	struct net *old_net;

	if (net_eq(net, &init_net))
		goto fallback;

	old_net = current->nsproxy->net_ns;
	hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries);
	if (!hash_entries)
		goto fallback;

	/* Set min to keep the bitmap on stack in udp_lib_get_port() */
	if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET)
		hash_entries = UDP_HTABLE_SIZE_MIN_PERNET;
	else
		hash_entries = roundup_pow_of_two(hash_entries);

	udptable = udp_pernet_table_alloc(hash_entries);
	if (udptable) {
		net->ipv4.udp_table = udptable;
		return;
	}

	pr_warn("Failed to allocate UDP hash table (entries: %u) "
		"for a netns, fallback to the global one\n",
		hash_entries);
fallback:
	net->ipv4.udp_table = &udp_table;
}
staticint __net_init udp_pernet_init(struct net *net)
{ #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) int i;
/* No tunnel is configured */ for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) {
INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list);
RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL);
} #endif
udp_sysctl_init(net);
udp_set_table(net);
return 0;
}
/* Per-network-namespace UDP teardown: hands @net to
 * udp_pernet_table_free().
 */
static void __net_exit udp_pernet_exit(struct net *net)
{
	udp_pernet_table_free(net);
}
/*
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
 * noch Qualität der bereitgestellten Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */