// SPDX-License-Identifier: GPL-2.0-or-later /* * UDP over IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/ipv4/udp.c * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind * a single port at the same time. * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data * YOSHIFUJI Hideaki @USAGI: convert /proc/net/udp6 to seq_file.
*/
if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return -1;
score = 0;
inet = inet_sk(sk);
if (inet->inet_dport) { if (inet->inet_dport != sport) return -1;
score++;
}
if (!ipv6_addr_any(&sk->sk_v6_daddr)) { if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) return -1;
score++;
}
bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
dev_match = udp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif); if (!dev_match) return -1; if (bound_dev_if)
score++;
if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
return score;
}
/** * udp6_lib_lookup1() - Simplified lookup using primary hash (destination port) * @net: Network namespace * @saddr: Source address, network order * @sport: Source port, network order * @daddr: Destination address, network order * @hnum: Destination port, host order * @dif: Destination interface index * @sdif: Destination bridge port index, if relevant * @udptable: Set of UDP hash tables * * Simplified lookup to be used as fallback if no sockets are found due to a * potential race between (receive) address change, and lookup happening before * the rehash operation. This function ignores SO_REUSEPORT groups while scoring * result sockets, because if we have one, we don't need the fallback at all. * * Called under rcu_read_lock(). * * Return: socket with highest matching score if any, NULL if none
*/ staticstruct sock *udp6_lib_lookup1(conststruct net *net, conststruct in6_addr *saddr, __be16 sport, conststruct in6_addr *daddr, unsignedint hnum, int dif, int sdif, conststruct udp_table *udptable)
{ unsignedint slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot = &udptable->hash[slot]; struct sock *sk, *result = NULL;
int score, badness = 0;
/* NOTE(review): this block appears to be a corrupted extraction and will not
 * compile as-is: keywords are fused ("staticstruct", "conststruct",
 * "unsignedint"), the hash-slot iteration loop header that should bind 'sk'
 * is missing, and the code below mixes in fragments of the SO_REUSEPORT
 * handling from udp6_lib_lookup2() ('skb', 'need_rescore' and the 'rescore:'
 * label it jumps to are not declared anywhere in this function). Restore
 * this function and udp6_lib_lookup2() from the upstream source rather than
 * editing in place. Comments below annotate the visible fragments only.
 */
/* Connected sockets win outright in the lookup2 reuseport path. */
if (sk->sk_state == TCP_ESTABLISHED) {
result = sk; continue;
}
/* Try the SO_REUSEPORT group; a NULL result means no group selection
 * happened, so keep the plain candidate socket.
 */
result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
saddr, sport, daddr, hnum, udp6_ehashfn); if (!result) {
result = sk; continue;
}
/* Fall back to scoring if group has connections */ if (!reuseport_has_conns(sk)) return result;
/* Reuseport logic returned an error, keep original score. */ if (IS_ERR(result)) continue;
/* compute_score is too long of a function to be * inlined, and calling it again here yields * measureable overhead for some * workloads. Work around it by jumping * backwards to rescore 'result'.
*/
need_rescore = true; goto rescore;
}
} return result;
}
/* CONFIG_BASE_SMALL builds omit the connected (4-tuple) hash table, so the
 * 4-tuple lookup is a stub: always report "not found" and let callers fall
 * back to the 2-tuple lookup paths.
 *
 * Fix: the extraction fused keywords ("staticstruct", "conststruct",
 * "unsignedint"), which does not compile; restored proper spacing.
 */
#if IS_ENABLED(CONFIG_BASE_SMALL)
static struct sock *udp6_lib_lookup4(const struct net *net,
				     const struct in6_addr *saddr, __be16 sport,
				     const struct in6_addr *daddr,
				     unsigned int hnum, int dif, int sdif,
				     struct udp_table *udptable)
{
	return NULL;
}
/* if the nulls value we got at the end of this lookup is not the * expected one, we must restart lookup. We probably met an item that * was moved to another chain due to rehash.
*/ if (get_nulls_value(node) != slot) goto begin;
if (udp_has_hash4(hslot2)) {
result = udp6_lib_lookup4(net, saddr, sport, daddr, hnum,
dif, sdif, udptable); if (result) /* udp6_lib_lookup4 return sk or NULL */ return result;
}
/* Lookup connected or non-wildcard sockets */
result = udp6_lib_lookup2(net, saddr, sport,
daddr, hnum, dif, sdif,
hslot2, skb); if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED) goto done;
/* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
udptable == net->ipv4.udp_table) {
sk = inet6_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr),
saddr, sport, daddr, hnum, dif,
udp6_ehashfn); if (sk) {
result = sk; goto done;
}
}
/* Got non-wildcard socket or error on first lookup */ if (result) goto done;
/* Must be called under rcu_read_lock().
 * Does increment socket refcount.
 */
#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6) || IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
struct sock *udp6_lib_lookup(const struct net *net,
			     const struct in6_addr *saddr, __be16 sport,
			     const struct in6_addr *daddr, __be16 dport, int dif)
{
	struct sock *sk;

	/* Fix: @dport is __be16 (network order) but the internal lookup takes
	 * the destination port in host order (see the @hnum kernel-doc on the
	 * lookup helpers above), so it must be converted with ntohs(). The
	 * original text passed dport unconverted. Also restored the fused
	 * "conststruct"/"staticstruct" keyword spacing.
	 */
	sk = __udp6_lib_lookup(net, saddr, sport, daddr, ntohs(dport),
			       dif, 0, net->ipv4.udp_table, NULL);
	/* The lookup itself takes no reference; only return the socket if we
	 * can still grab one (it may be concurrently going away).
	 */
	if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
		sk = NULL;
	return sk;
}
EXPORT_SYMBOL_GPL(udp6_lib_lookup);
#endif
/* Do not use the scratch area len for jumbograms: their length exceeds the
 * scratch area space; note that the IP6CB flags are still in the first
 * cacheline, so checking for jumbograms is cheap.
 */
static int udp6_skb_len(struct sk_buff *skb)
{
	/* Jumbograms must use the real skb length; everything else can use
	 * the cached scratch-area length.
	 */
	return unlikely(inet6_is_jumbogram(skb)) ? skb->len : udp_skb_len(skb);
}
/* * This should be easy, if there is something there we * return it, otherwise we block.
*/
int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len)
{ struct ipv6_pinfo *np = inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; unsignedint ulen, copied; int off, err, peeking = flags & MSG_PEEK; int is_udplite = IS_UDPLITE(sk); struct udp_mib __percpu *mib; bool checksum_valid = false; int is_udp4;
if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len);
if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
try_again:
off = sk_peek_offset(sk, flags);
skb = __skb_recv_udp(sk, flags, &off, &err); if (!skb) return err;
/* * If checksum is needed at all, try to do it while copying the * data. If the data is truncated, or if we only want a partial * coverage checksum (UDP-Lite), do it before the copy.
*/
/* Handler for tunnels with arbitrary destination ports: no socket lookup, go * through error handlers in encapsulations looking for a match.
*/ staticint __udp6_lib_err_encap_no_sk(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{ int i;
for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info); conststruct ip6_tnl_encap_ops *encap;
/* Try to match ICMP errors to UDP tunnels by looking up a socket without * reversing source and destination port: this will match tunnels that force the * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that * lwtunnels might actually break this assumption by being configured with * different destination ports on endpoints, in this case we won't be able to * trace ICMP messages back to them. * * If this doesn't match any socket, probe tunnels with arbitrary destination * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port * we've sent packets to won't necessarily match the local destination port. * * Then ask the tunnel implementation to match the error against a valid * association. * * Return an error if we can't find a match, the socket if we need further * processing, zero otherwise.
*/ staticstruct sock *__udp6_lib_err_encap(struct net *net, conststruct ipv6hdr *hdr, int offset, struct udphdr *uh, struct udp_table *udptable, struct sock *sk, struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, __be32 info)
{ int (*lookup)(struct sock *sk, struct sk_buff *skb); int network_offset, transport_offset; struct udp_sock *up;
if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { /* No socket for error: try tunnels before discarding */ if (static_branch_unlikely(&udpv6_encap_needed_key)) {
sk = __udp6_lib_err_encap(net, hdr, offset, uh,
udptable, sk, skb,
opt, type, code, info); if (!sk) return 0;
} else
sk = ERR_PTR(-ENOENT);
if (IS_ERR(sk)) {
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
ICMP6_MIB_INERRORS); return PTR_ERR(sk);
}
if (static_branch_unlikely(&udpv6_encap_needed_key) &&
READ_ONCE(up->encap_type)) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
/* * This is an encapsulation socket so pass the skb to * the socket's udp_encap_rcv() hook. Otherwise, just * fall through and pass this up the UDP socket. * up->encap_rcv() returns the following value: * =0 if skb was successfully passed to the encap * handler or was discarded by it. * >0 if skb should be passed on to UDP. * <0 if skb should be resubmitted as proto -N
*/
/* if we're overly short, let UDP handle it */
encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret;
/* Verify checksum before giving to encap */ if (udp_lib_checksum_complete(skb)) goto csum_error;
ret = encap_rcv(sk, skb); if (ret <= 0) {
__UDP6_INC_STATS(sock_net(sk),
UDP_MIB_INDATAGRAMS,
is_udplite); return -ret;
}
}
/* FALLTHROUGH -- it's a UDP Packet */
}
/* * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c).
*/ if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
u16 pcrlen = READ_ONCE(up->pcrlen);
if (pcrlen == 0) { /* full coverage was set */
net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n",
UDP_SKB_CB(skb)->cscov, skb->len); goto drop;
} if (UDP_SKB_CB(skb)->cscov < pcrlen) {
net_dbg_ratelimited("UDPLITE6: coverage %d too small, need min %d\n",
UDP_SKB_CB(skb)->cscov, pcrlen); goto drop;
}
}
prefetch(&sk->sk_rmem_alloc); if (rcu_access_pointer(sk->sk_filter) &&
udp_lib_checksum_complete(skb)) goto csum_error;
if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason)) goto drop;
/* Log (rate-limited) receipt of a UDP/IPv6 datagram with a zero checksum,
 * which is invalid for plain UDP over IPv6.
 *
 * Fix: restored the fused "staticvoid" keyword spacing, which does not
 * compile.
 */
static void udp6_csum_zero_error(struct sk_buff *skb)
{
	/* RFC 2460 section 8.1 says that we SHOULD log
	 * this error. Well, it is reasonable.
	 */
	net_dbg_ratelimited("IPv6: udp checksum is 0 for [%pI6c]:%u->[%pI6c]:%u\n",
			    &ipv6_hdr(skb)->saddr, ntohs(udp_hdr(skb)->source),
			    &ipv6_hdr(skb)->daddr, ntohs(udp_hdr(skb)->dest));
}
/* * Note: called only from the BH handler context, * so we don't need to lock the hashes.
*/ staticint __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, conststruct in6_addr *saddr, conststruct in6_addr *daddr, struct udp_table *udptable, int proto)
{ struct sock *sk, *first = NULL; conststruct udphdr *uh = udp_hdr(skb); unsignedshort hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); unsignedint offset = offsetof(typeof(*sk), sk_node); unsignedint hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); int dif = inet6_iif(skb); int sdif = inet6_sdif(skb); struct hlist_node *node; struct sk_buff *nskb;
/* NOTE(review): corrupted extraction — keywords are fused ("staticint",
 * "conststruct", "unsignedint", "unsignedshort"), and the block that should
 * sit here when use_hash2 is set (computing hash2_any/hash2 and containing
 * the 'start_lookup:' label that the goto below targets) is missing, so this
 * will not compile as-is. Restore from the upstream source.
 */
/* Deliver to every matching multicast socket in the slot: the first match
 * is remembered and gets the original skb last; every further match gets a
 * clone.
 */
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
uh->source, saddr, dif, sdif,
hnum)) continue; /* If zero checksum and no_check is not on for * the socket then skip it.
*/ if (!uh->check && !udp_get_no_check6_rx(sk)) continue; if (!first) {
first = sk; continue;
}
/* Clone failed: count the drop against this socket and move on. */
nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) {
atomic_inc(&sk->sk_drops);
__UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
IS_UDPLITE(sk));
__UDP6_INC_STATS(net, UDP_MIB_INERRORS,
IS_UDPLITE(sk)); continue;
}
if (udpv6_queue_rcv_skb(sk, nskb) > 0)
consume_skb(nskb);
}
/* Also lookup *:port if we are using hash2 and haven't done so yet. */ if (use_hash2 && hash2 != hash2_any) {
hash2 = hash2_any; goto start_lookup;
}
/* Original skb goes to the first matching socket; if nobody matched,
 * drop it and count the ignored multicast datagram.
 */
if (first) { if (udpv6_queue_rcv_skb(first, skb) > 0)
consume_skb(skb);
} else {
kfree_skb(skb);
__UDP6_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
proto == IPPROTO_UDPLITE);
} return 0;
}
/* Wrapper for udp_queue_rcv_skb taking care of csum conversion and
 * return code conversion for IP layer consumption.
 *
 * Fixes: restored the fused "staticint" keyword spacing (compile error) and
 * the "tacking" -> "taking" comment typo.
 */
static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
				struct udphdr *uh)
{
	int ret;

	/* Upgrade a HW partial checksum to an unnecessary one while the
	 * pseudo-header is cheap to compute; skipped for UDP-Lite and for
	 * datagrams that carry no checksum at all.
	 */
	if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
		skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo);

	ret = udpv6_queue_rcv_skb(sk, skb);

	/* a return value > 0 means to resubmit the input */
	if (ret > 0)
		return ret;
	return 0;
}
if (udp6_csum_init(skb, uh, proto)) goto csum_error;
/* Check if the socket is already available, e.g. due to early demux */
sk = inet6_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest,
&refcounted, udp6_ehashfn); if (IS_ERR(sk)) goto no_sk;
if (sk) { struct dst_entry *dst = skb_dst(skb); int ret;
if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
udp6_sk_rx_dst_set(sk, dst);
if (!uh->check && !udp_get_no_check6_rx(sk)) { if (refcounted)
sock_put(sk); goto report_csum_error;
}
ret = udp6_unicast_rcv_skb(sk, skb, uh); if (refcounted)
sock_put(sk); return ret;
}
if (dst)
dst = dst_check(dst, sk->sk_rx_dst_cookie); if (dst) { /* set noref for now. * any place which wants to hold dst has to call * dst_hold_safe()
*/
skb_dst_set_noref(skb, dst);
}
}
/* * Throw away all pending data and cancel the corking. Socket is locked.
*/ staticvoid udp_v6_flush_pending_frames(struct sock *sk)
{ struct udp_sock *up = udp_sk(sk);
staticint udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{ if (addr_len < offsetofend(struct sockaddr, sa_family)) return -EINVAL; /* The following checks are replicated from __ip6_datagram_connect() * and intended to prevent BPF program called below from accessing * bytes that are out of the bound specified by user in addr_len.
*/ if (uaddr->sa_family == AF_INET) { if (ipv6_only_sock(sk)) return -EAFNOSUPPORT; return udp_pre_connect(sk, uaddr, addr_len);
}
/* connect() handler for UDPv6 sockets.
 *
 * Fix: restored the fused "staticint" keyword spacing (compile error).
 */
static int udpv6_connect(struct sock *sk, struct sockaddr *uaddr,
			 int addr_len)
{
	int res;

	lock_sock(sk);
	res = __ip6_datagram_connect(sk, uaddr, addr_len);
	/* On success, rehash into the 4-tuple hash so connected lookups find
	 * this socket on the fast path.
	 */
	if (!res)
		udp6_hash4(sk);
	release_sock(sk);
	return res;
}
/** * udp6_hwcsum_outgoing - handle outgoing HW checksumming * @sk: socket we are sending on * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) * @saddr: source address * @daddr: destination address * @len: length of packet
*/ staticvoid udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, conststruct in6_addr *saddr, conststruct in6_addr *daddr, int len)
{ unsignedint offset; struct udphdr *uh = udp_hdr(skb); struct sk_buff *frags = skb_shinfo(skb)->frag_list;
__wsum csum = 0;
if (!frags) { /* Only one fragment on the socket. */
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct udphdr, check);
uh->check = ~csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, 0);
} else { /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together
*/
offset = skb_transport_offset(skb);
skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
csum = skb->csum;
skb->ip_summed = CHECKSUM_NONE;
do {
csum = csum_add(csum, frags->csum);
} while ((frags = frags->next));
if (inet6_test_bit(SNDFLOW, sk)) {
fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) {
flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); if (IS_ERR(flowlabel)) return -EINVAL;
}
}
/* * Otherwise it will be difficult to maintain * sk->sk_dst_cache.
*/ if (sk->sk_state == TCP_ESTABLISHED &&
ipv6_addr_equal(daddr, &sk->sk_v6_daddr))
daddr = &sk->sk_v6_daddr;
lock_sock(sk); if (unlikely(up->pending)) { /* The socket is already corked while preparing it. */ /* ... which is an evident application bug. --ANK */
release_sock(sk);
out:
dst_release(dst);
out_no_dst:
fl6_sock_release(flowlabel);
txopt_put(opt_to_free); if (!err) return len; /* * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting * ENOBUFS might not be good (it's not tunable per se), but otherwise * we don't have a good statistic (IpOutDiscards but it can be too many * things). We could add another new stat but at least for now that * seems like overkill.
*/ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
UDP6_INC_STATS(sock_net(sk),
UDP_MIB_SNDBUFERRORS, is_udplite);
} return err;
do_confirm: if (msg->msg_flags & MSG_PROBE)
dst_confirm_neigh(dst, &fl6->daddr); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm;
err = 0; goto out;
}
EXPORT_SYMBOL(udpv6_sendmsg);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.