// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic INET transport hashtables * * Authors: Lotsa people, from code originally in tcp
*/
/* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here.
*/ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, constunsignedshort snum, int l3mdev)
{ struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
/* * Caller must hold hashbucket lock for this tb with local BH disabled
*/ void inet_bind_bucket_destroy(struct inet_bind_bucket *tb)
{ conststruct inet_bind2_bucket *tb2;
if (hlist_empty(&tb->bhash2)) {
hlist_del_rcu(&tb->node);
kfree_rcu(tb, rcu); return;
}
if (tb2)
inet_bind2_bucket_init(tb2, net, head, tb, sk);
return tb2;
}
/* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{ conststruct sock *sk;
if (hlist_empty(&tb->owners)) {
__hlist_del(&tb->node);
__hlist_del(&tb->bhash_node);
kmem_cache_free(cachep, tb); return;
}
/* * Get rid of any references to a local port held by the given sock.
*/ staticvoid __inet_put_port(struct sock *sk)
{ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind_bucket *tb; int bhash;
/* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port breaks the assumption * that the listener socket's icsk_bind_hash is the same * as that of the child socket. We have to look up or
* create a new bind bucket for the child here. */
inet_bind_bucket_for_each(tb, &head->chain) { if (inet_bind_bucket_match(tb, net, port, l3mdev)) break;
} if (!tb) {
tb = inet_bind_bucket_create(table->bind_bucket_cachep,
net, head, port, l3mdev); if (!tb) {
spin_unlock(&head2->lock);
spin_unlock(&head->lock); return -ENOMEM;
}
created_inet_bind_bucket = true;
}
update_fastreuse = true;
if (sk->sk_family == PF_INET)
score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
} return score;
}
/** * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. * @net: network namespace. * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. * @skb: context for a potential SK_REUSEPORT program. * @doff: header offset. * @saddr: source address. * @sport: source port. * @daddr: destination address. * @hnum: destination port in host byte order. * @ehashfn: hash function used to generate the fallback hash. * * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error.
*/ struct sock *inet_lookup_reuseport(conststruct net *net, struct sock *sk, struct sk_buff *skb, int doff,
__be32 saddr, __be16 sport,
__be32 daddr, unsignedshort hnum,
inet_ehashfn_t *ehashfn)
{ struct sock *reuse_sk = NULL;
u32 phash;
/*
 * There are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */
/* called with rcu_read_lock() : No refcount taken on the socket */ staticstruct sock *inet_lhash2_lookup(conststruct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, constunsignedshort hnum, constint dif, constint sdif)
{ struct sock *sk, *result = NULL; struct hlist_nulls_node *node; int score, hiscore = 0;
/* All sockets share common refcount, but have different destructors */ void sock_gen_put(struct sock *sk)
{ if (!refcount_dec_and_test(&sk->sk_refcnt)) return;
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!inet_match(net, sk, acookie,
ports, dif, sdif))) {
sock_gen_put(sk); goto begin;
} goto found;
}
} /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain.
*/ if (get_nulls_value(node) != slot) goto begin;
out:
sk = NULL;
found: return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash) continue;
if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2); if (sk->sk_protocol == IPPROTO_TCP &&
tcp_twsk_unique(sk, sk2, twp)) break;
} goto not_unique;
}
}
/* Must record num and sport now. Otherwise we will see * in hash table socket with a funny identity.
*/
inet->inet_num = lport;
inet->inet_sport = htons(lport);
sk->sk_hash = hash;
WARN_ON(!sk_unhashed(sk));
__sk_nulls_add_node_rcu(sk, &head->chain); if (tw) {
sk_nulls_del_node_init_rcu((struct sock *)tw);
__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
}
spin_unlock(lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
if (twp) {
*twp = tw;
} elseif (tw) { /* Silly. Should hash-dance instead... */
inet_twsk_deschedule_put(tw);
} return 0;
/* Insert a socket into ehash, and eventually remove another one
 * (the other one can be a SYN_RECV or TIMEWAIT).
 * If a matching socket already exists, socket sk is not inserted,
 * and the found_dup_sk parameter is set to true.
*/ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_ehash_bucket *head; struct hlist_nulls_head *list;
spinlock_t *lock; bool ret = true;
WARN_ON_ONCE(!sk_unhashed(sk));
sk->sk_hash = sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
list = &head->chain;
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock(lock); if (osk) {
WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
ret = sk_nulls_del_node_init_rcu(osk);
} elseif (found_dup_sk) {
*found_dup_sk = inet_ehash_lookup_by_sk(sk, list); if (*found_dup_sk)
ret = false;
}
/* The socket's bhash2 hashbucket spinlock must be held when this is called */ struct inet_bind2_bucket *
inet_bind2_bucket_find(conststruct inet_bind_hashbucket *head, conststruct net *net, unsignedshort port, int l3mdev, conststruct sock *sk)
{ struct inet_bind2_bucket *bhash2 = NULL;
inet_bind_bucket_for_each(bhash2, &head->chain) if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) break;
return bhash2;
}
struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(conststruct sock *sk, conststruct net *net, int port)
{ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
u32 hash;
staticint __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
{ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); int bhash;
if (!inet_csk(sk)->icsk_bind2_hash) { /* Not bind()ed before. */ if (reset)
inet_reset_saddr(sk); else
inet_update_saddr(sk, saddr, family);
return 0;
}
/* Allocate a bind2 bucket ahead of time to avoid permanently putting * the bhash2 table in an inconsistent state if a new tb2 bucket * allocation fails.
*/
new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); if (!new_tb2) { if (reset) { /* The (INADDR_ANY, port) bucket might have already * been freed, then we cannot fixup icsk_bind2_hash, * so we give up and unlink sk from bhash/bhash2 not * to leave inconsistency in bhash2.
*/
inet_put_port(sk);
inet_reset_saddr(sk);
}
/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this * property might be used by clever attacker. * * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though * attacks were since demonstrated, thus we use 65536 by default instead * to really give more isolation and privacy, at the expense of 256kB * of kernel memory.
*/ #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) static u32 *table_perturb;
int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset,
u32 hash_port0, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **, bool rcu_lookup, u32 hash))
{ struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_bind_hashbucket *head, *head2; struct inet_timewait_sock *tw = NULL; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; bool tb_created = false;
u32 remaining, offset; int ret, i, low, high; bool local_ports; int step, l3mdev;
u32 index;
if (port) {
local_bh_disable();
ret = check_established(death_row, sk, port, NULL, false,
hash_port0 + port);
local_bh_enable(); return ret;
}
/* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice.
*/ if (!local_ports)
offset &= ~1U;
other_parity_scan:
port = low + offset; for (i = 0; i < remaining; i += step, port += step) { if (unlikely(port >= high))
port -= remaining; if (inet_is_local_reserved_port(net, port)) continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
rcu_read_lock();
hlist_for_each_entry_rcu(tb, &head->chain, node) { if (!inet_bind_bucket_match(tb, net, port, l3mdev)) continue; if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
rcu_read_unlock(); goto next_port;
} if (!check_established(death_row, sk, port, &tw, true,
hash_port0 + port)) break;
rcu_read_unlock(); goto next_port;
}
rcu_read_unlock();
spin_lock_bh(&head->lock);
/* Does not bother with rcv_saddr checks, because * the established check is already unique enough.
*/
inet_bind_bucket_for_each(tb, &head->chain) { if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0) goto next_port_unlock;
WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk,
port, &tw, false,
hash_port0 + port)) goto ok; goto next_port_unlock;
}
}
if (!local_ports) {
offset++; if ((offset & 1) && remaining > 1) goto other_parity_scan;
} return -EADDRNOTAVAIL;
ok: /* Find the corresponding tb2 bucket since we need to * add the socket to the bhash2 table as well
*/
head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
spin_lock(&head2->lock);
/* Here we want to add a little bit of randomness to the next source
 * port that will be chosen. We use a max() with a random value here so
 * that on low contention the randomness is maximal and on high contention
 * it may be nonexistent.
 */
i = max_t(int, i, get_random_u32_below(8) * step);
WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, tb2, port);
sk->sk_userlocks |= SOCK_CONNECT_BIND;
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
} if (tw)
inet_twsk_bind_unhash(tw, hinfo);
spin_unlock(&head2->lock); if (tb_created)
inet_bind_bucket_destroy(tb);
spin_unlock(&head->lock);
if (tw)
inet_twsk_deschedule_put(tw);
local_bh_enable();
return -ENOMEM;
}
/* * Bind a port for a connect operation and hash it.
*/ int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk)
{ conststruct inet_sock *inet = inet_sk(sk); conststruct net *net = sock_net(sk);
u64 port_offset = 0;
u32 hash_port0;
if (!inet_sk(sk)->inet_num)
port_offset = inet_sk_port_offset(sk);
/*
 * Initialise every bucket of @h's lhash2 table.
 *
 * Reads h->lhash2 and h->lhash2_mask, so both must be set before the call.
 * Buckets 0..lhash2_mask (inclusive) each get their spinlock initialised
 * and their nulls list head set up with the value
 * (bucket index + LISTENING_NULLS_BASE).
 */
static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i <= h->lhash2_mask; i++) {
		spin_lock_init(&h->lhash2[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
				      i + LISTENING_NULLS_BASE);
	}
}
/* this one is used for source ports of outgoing connections */
table_perturb = alloc_large_system_hash("Table-perturb", sizeof(*table_perturb),
INET_TABLE_PERTURB_SIZE,
0, 0, NULL, NULL,
INET_TABLE_PERTURB_SIZE,
INET_TABLE_PERTURB_SIZE);
}
int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
{
h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); if (!h->lhash2) return -ENOMEM;
h->lhash2_mask = INET_LHTABLE_SIZE - 1; /* INET_LHTABLE_SIZE must be a power of 2 */
BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);
init_hashinfo_lhash2(h); return 0;
}
/**
 * inet_ehash_locks_alloc() - allocate the spinlock array guarding ehash.
 * @hashinfo: hash info; ehash_mask is read, ehash_locks and
 *            ehash_locks_mask are written.
 *
 * The number of locks is computed as follows:
 *  - two cache lines' worth of spinlocks (at least one lock), multiplied
 *    by the number of possible CPUs;
 *  - raised to at least one page's worth of spinlocks per online NUMA node;
 *  - rounded up to a power of two;
 *  - capped at the number of hash buckets (ehash_mask + 1).
 *
 * If sizeof(spinlock_t) is 0, nothing is allocated: ehash_locks is left
 * untouched and ehash_locks_mask becomes 0.
 *
 * Return: 0 on success, -ENOMEM if the lock array cannot be allocated.
 */
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
	unsigned int locksz = sizeof(spinlock_t);
	unsigned int i, nblocks = 1;
	spinlock_t *ptr = NULL;

	if (locksz == 0)
		goto set_mask;

	/* Allocate 2 cache lines or at least one spinlock per cpu. */
	nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus();

	/* At least one page per NUMA node. */
	nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz);

	/* The mask stored below is nblocks - 1, so keep nblocks a power of 2. */
	nblocks = roundup_pow_of_two(nblocks);

	/* No more locks than number of hash buckets. */
	nblocks = min(nblocks, hashinfo->ehash_mask + 1);

	if (num_online_nodes() > 1) {
		/* Use vmalloc() to allow NUMA policy to spread pages
		 * on all available nodes if desired.
		 */
		ptr = vmalloc_array(nblocks, locksz);
	}
	if (!ptr) {
		/* Single node, or the vmalloc attempt above failed. */
		ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
		if (!ptr)
			return -ENOMEM;
	}

	for (i = 0; i < nblocks; i++)
		spin_lock_init(&ptr[i]);
	hashinfo->ehash_locks = ptr;

set_mask:
	hashinfo->ehash_locks_mask = nblocks - 1;
	return 0;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.