// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table. * Ville Nuorvala: Fixed routing subtrees.
*/
/* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc.
*/
staticvoid fib6_link_table(struct net *net, struct fib6_table *tb)
{ unsignedint h;
/* * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list.
*/
spin_lock_init(&tb->tb6_lock);
h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
/* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist.
*/
hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
}
head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)];
/* See comment in fib6_link_table(). RCU is not required, * but rcu_dereference_raw() is used to avoid data-race.
*/
hlist_for_each_entry_rcu(tb, head, tb6_hlist, true) if (tb->tb6_id == id) return tb;
/* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags)
{ return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
res, flags);
}
staticvoid __net_init fib6_tables_init(struct net *net)
{
fib6_link_table(net, net->ipv6.fib6_main_tbl);
}
#endif
unsignedint fib6_tables_seq_read(conststruct net *net)
{ unsignedint h, fib_seq = 0;
rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { conststruct hlist_head *head = &net->ipv6.fib_table_hash[h]; conststruct fib6_table *tb;
/* The tree traversal function should never return a positive value. */ return err > 0 ? -EINVAL : err;
}
staticint fib6_dump_node(struct fib6_walker *w)
{ int res; struct fib6_info *rt;
for_each_fib6_walker_rt(w) {
res = rt6_dump_route(rt, w->args, w->skip_in_node); if (res >= 0) { /* Frame is full, suspend walking */
w->leaf = rt;
/* We'll restart from this node, so if some routes were * already dumped, skip them next time.
*/
w->skip_in_node += res;
return 1;
}
w->skip_in_node = 0;
/* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the * last sibling of this route (no need to dump the * sibling routes again)
*/ if (rt->fib6_nsiblings)
rt = list_last_entry(&rt->fib6_siblings, struct fib6_info,
fib6_siblings);
}
w->leaf = NULL; return 0;
}
/* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node.
*/
staticstruct fib6_node *fib6_add_1(struct net *net, struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, struct netlink_ext_ack *extack)
{ struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit;
__be32 dir = 0;
/* * Prefix match
*/ if (plen < fn->fn_bit ||
!ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) {
NL_SET_ERR_MSG(extack, "Can not replace route - no match found");
pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT);
}
pr_warn("NLM_F_CREATE should be set when creating new route\n");
} goto insert_above;
}
/* * Exact match ?
*/
if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) {
RCU_INIT_POINTER(fn->leaf, NULL);
fib6_info_release(leaf); /* remove null_entry in the root node */
} elseif (fn->fn_flags & RTN_TL_ROOT &&
rcu_access_pointer(fn->leaf) ==
net->ipv6.fib6_null_entry) {
RCU_INIT_POINTER(fn->leaf, NULL);
}
return fn;
}
/* * We have more bits to go
*/
/* Try to walk down on tree. */
dir = addr_bit_set(addr, fn->fn_bit);
pn = fn;
fn = dir ?
rcu_dereference_protected(fn->right,
lockdep_is_held(&table->tb6_lock)) :
rcu_dereference_protected(fn->left,
lockdep_is_held(&table->tb6_lock));
} while (fn);
if (!allow_create) { /* We should not create new node because * NLM_F_REPLACE was specified without NLM_F_CREATE * I assume it is safe to require NLM_F_CREATE when * REPLACE flag is used! Later we may want to remove the * check for replace_required, because according * to netlink specification, NLM_F_CREATE * MUST be specified if new route is created. * That would keep IPv6 consistent with IPv4
*/ if (replace_required) {
NL_SET_ERR_MSG(extack, "Can not replace route - no match found");
pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT);
}
pr_warn("NLM_F_CREATE should be set when creating new route\n");
} /* * We walked to the bottom of tree. * Create new leaf node without children.
*/
ln = node_alloc(net);
if (!ln) return ERR_PTR(-ENOMEM);
ln->fn_bit = plen;
RCU_INIT_POINTER(ln->parent, pn);
if (dir)
rcu_assign_pointer(pn->right, ln); else
rcu_assign_pointer(pn->left, ln);
return ln;
insert_above: /* * split since we don't have a common prefix anymore or * we have a less significant route. * we've to insert an intermediate node on the list * this new node will point to the one we need to create * and the current
*/
/* find 1st bit in difference between the 2 addrs.
See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case.
*/
bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));
/* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn]
*/ if (plen > bit) {
in = node_alloc(net);
ln = node_alloc(net);
if (!in || !ln) { if (in)
node_free_immediate(net, in); if (ln)
node_free_immediate(net, ln); return ERR_PTR(-ENOMEM);
}
/* * new intermediate node. * RTN_RTINFO will * be off since that an address that chooses one of * the branches would not match less specific routes * in the other branch
*/
staticvoid __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, conststruct fib6_info *match)
{ int cpu;
if (!fib6_nh->rt6i_pcpu) return;
rcu_read_lock(); /* release the reference to this fib entry from * all of its cached pcpu routes
*/
for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt;
ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
/* Paired with xchg() in rt6_get_pcpu_route() */
pcpu_rt = READ_ONCE(*ppcpu_rt);
/* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes * from a fib6_info to NULL (ip6_dst_destroy); it can never * change from one fib6_info reference to another
*/ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from;
from = unrcu_pointer(xchg(&pcpu_rt->from, NULL));
fib6_info_release(from);
}
}
rcu_read_unlock();
}
staticvoid fib6_drop_pcpu_from(struct fib6_info *f6i)
{ /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here.
*/
f6i->fib6_destroying = 1;
mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
/* Flush all cached dst in exception table */
rt6_flush_exceptions(rt);
fib6_drop_pcpu_from(rt);
if (rt->nh) {
spin_lock(&rt->nh->lock);
if (!list_empty(&rt->nh_list))
list_del_init(&rt->nh_list);
spin_unlock(&rt->nh->lock);
}
if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still alive ones.
*/ while (fn) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
new_leaf = fib6_find_prefix(net, table, fn);
fib6_info_hold(new_leaf);
if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings)
WRITE_ONCE(rt->fib6_nsiblings, 0); if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) {
fib6_clean_expires(iter);
fib6_remove_gc_list(iter);
} else {
fib6_set_expires(iter, rt->expires);
fib6_add_gc_list(iter);
}
if (rt->fib6_pmtu)
fib6_metric_set(iter, RTAX_MTU,
rt->fib6_pmtu); return -EEXIST;
} /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is sibling to this route, increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid long list, we only had siblings if the * route have a gateway.
*/ if (rt_can_ecmp &&
rt6_qualify_for_ecmp(iter))
WRITE_ONCE(rt->fib6_nsiblings,
rt->fib6_nsiblings + 1);
}
if (iter->fib6_metric > rt->fib6_metric) break;
next_iter:
ins = &iter->fib6_next;
}
if (fallback_ins && !found) { /* No matching route with same ecmp-able-ness found, replace * first matching route
*/
ins = fallback_ins;
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->fib6_table->tb6_lock));
found++;
}
/* Reset round-robin state, if necessary */ if (ins == &fn->leaf)
fn->rr_ptr = NULL;
/* Link this route to others same route. */ if (rt->fib6_nsiblings) { unsignedint fib6_nsiblings; struct fib6_info *sibling, *temp_sibling;
/* Find the first route that have the same metric */
sibling = leaf;
notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(sibling)) {
list_add_tail_rcu(&rt->fib6_siblings,
&sibling->fib6_siblings); break;
}
sibling = rcu_dereference_protected(sibling->fib6_next,
lockdep_is_held(&rt->fib6_table->tb6_lock));
notify_sibling_rt = false;
} /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken!
*/
fib6_nsiblings = 0;
list_for_each_entry_safe(sibling, temp_sibling,
&rt->fib6_siblings, fib6_siblings) {
WRITE_ONCE(sibling->fib6_nsiblings,
sibling->fib6_nsiblings + 1);
BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
fib6_nsiblings++;
}
BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
rcu_read_lock();
rt6_multipath_rebalance(temp_sibling);
rcu_read_unlock();
}
/* * insert node
*/ if (!replace) { if (!add)
pr_warn("NLM_F_CREATE should be set when creating new route\n");
add:
nlflags |= NLM_F_CREATE;
/* The route should only be notified if it is the first * route in the node or if it is added as a sibling * route to the first route in the node.
*/ if (!info->skip_notify_kernel &&
(notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event;
/* allow ipv4 to update sernum via ipv6_stub */ void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
{
spin_lock_bh(&f6i->fib6_table->tb6_lock);
fib6_update_sernum_upto_root(net, f6i);
spin_unlock_bh(&f6i->fib6_table->tb6_lock);
}
/* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees * Need to own table->tb6_lock
*/
if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
replace_required = 1;
} if (!allow_create && !replace_required)
pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree.
*/
node_free_immediate(info->nl_net, sfn);
err = PTR_ERR(sn); goto failure;
}
/* Now link new subtree to main tree */
rcu_assign_pointer(sfn->parent, fn);
rcu_assign_pointer(fn->subtree, sfn);
} else {
sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
&rt->fib6_src.addr, rt->fib6_src.plen,
offsetof(struct fib6_info, fib6_src),
allow_create, replace_required, extack);
if (IS_ERR(sn)) {
err = PTR_ERR(sn); goto failure;
}
}
if (!rcu_access_pointer(fn->leaf)) { if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */
rcu_assign_pointer(fn->leaf,
info->nl_net->ipv6.fib6_null_entry);
} else {
fib6_info_hold(rt);
rcu_assign_pointer(fn->leaf, rt);
}
}
fn = sn;
} #endif
if (rt->fib6_flags & RTF_EXPIRES)
fib6_add_gc_list(rt);
fib6_start_gc(info->nl_net, rt);
}
out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it.
*/ if (pn != fn) { struct fib6_info *pn_leaf =
rcu_dereference_protected(pn->leaf,
lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) {
pn_leaf = NULL;
RCU_INIT_POINTER(pn->leaf, NULL);
fib6_info_release(rt);
} if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
pn_leaf = fib6_find_prefix(info->nl_net, table,
pn); if (!pn_leaf)
pn_leaf =
info->nl_net->ipv6.fib6_null_entry;
fib6_info_hold(pn_leaf);
rcu_assign_pointer(pn->leaf, pn_leaf);
}
} #endif goto failure;
} elseif (fib6_requires_src(rt)) {
fib6_routes_require_src_inc(info->nl_net);
} return err;
failure: /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: * 1. fn is an intermediate node and we failed to add the new * route to it in both subtree creation failure and fib6_add_rt2node() * failure case. * 2. fn is the root node in the table and we fail to add the first * default route to it.
*/ if (fn &&
(!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
(fn->fn_flags & RTN_TL_ROOT &&
!rcu_access_pointer(fn->leaf))))
fib6_repair_tree(info->nl_net, table, fn); return err;
}
/* * Routing tree lookup *
*/
struct lookup_args { int offset; /* key offset on fib6_info */ conststruct in6_addr *addr; /* search key */
};
/* * Get node with specified destination prefix (and source prefix, * if subtrees are used) * exact_match == true means we try to find fn with exact match of * the passed in prefix addr * exact_match == false means we try to find fn with longest prefix * match of the passed in prefix addr. This is useful for finding fn * for cached route as it will be stored in the exception table under * the node with longest prefix length.
*/
/* This node is being deleted */ if (!leaf) { if (plen <= fn->fn_bit) goto out; else goto next;
}
key = (struct rt6key *)((u8 *)leaf + offset);
/* * Prefix match
*/ if (plen < fn->fn_bit ||
!ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) goto out;
if (plen == fn->fn_bit) return fn;
if (fn->fn_flags & RTN_RTINFO)
prev = fn;
next: /* * We have more bits to go
*/ if (addr_bit_set(addr, fn->fn_bit))
fn = rcu_dereference(fn->right); else
fn = rcu_dereference(fn->left);
}
out: if (exact_match) return NULL; else return prev;
}
if (fn->fn_flags & RTN_ROOT) return net->ipv6.fib6_null_entry;
while (fn) {
child_left = rcu_dereference_protected(fn->left,
lockdep_is_held(&table->tb6_lock));
child_right = rcu_dereference_protected(fn->right,
lockdep_is_held(&table->tb6_lock)); if (child_left) return rcu_dereference_protected(child_left->leaf,
lockdep_is_held(&table->tb6_lock)); if (child_right) return rcu_dereference_protected(child_right->leaf,
lockdep_is_held(&table->tb6_lock));
fn = FIB6_SUBTREE(fn);
} return NULL;
}
/* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. * Need to own table->tb6_lock
*/
staticstruct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn)
{ int children; int nstate; struct fib6_node *child; struct fib6_walker *w; int iter = 0;
/* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) {
rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn;
}
/* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists.
*/
leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&table->tb6_lock)); if (leaf == rt && !rt->fib6_nsiblings) { if (rcu_access_pointer(rt->fib6_next))
replace_rt = rcu_dereference_protected(rt->fib6_next,
lockdep_is_held(&table->tb6_lock)); else
notify_del = true;
}
/* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt)
fn->rr_ptr = NULL;
/* Remove this entry from other siblings */ if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling;
/* The route is deleted from a multipath route. If this * multipath route is the first route in the node, then we need * to emit a delete notification. Otherwise, we need to skip * the notification.
*/ if (rt->fib6_metric == leaf->fib6_metric &&
rt6_qualify_for_ecmp(leaf))
notify_del = true;
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings, fib6_siblings)
WRITE_ONCE(sibling->fib6_nsiblings,
sibling->fib6_nsiblings - 1);
WRITE_ONCE(rt->fib6_nsiblings, 0);
list_del_rcu(&rt->fib6_siblings);
rt6_multipath_rebalance(next_sibling);
}
/* If it was last route, call fib6_repair_tree() to: * 1. For root node, put back null_entry as how the table was created. * 2. For other nodes, expunge its radix tree node.
*/ if (!rcu_access_pointer(fn->leaf)) { if (!(fn->fn_flags & RTN_TL_ROOT)) {
fn->fn_flags &= ~RTN_RTINFO;
net->ipv6.rt6_stats->fib_route_nodes--;
}
fn = fib6_repair_tree(net, table, fn);
}
fib6_purge_rt(rt, fn, net);
if (!info->skip_notify_kernel) { if (notify_del)
call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
rt, NULL); elseif (replace_rt)
call_fib6_entry_notifiers_replace(net, replace_rt);
} if (!info->skip_notify)
inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
fib6_info_release(rt);
}
/* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info)
{ struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; struct fib6_table *table; struct fib6_node *fn;
if (rt == net->ipv6.fib6_null_entry) return -ENOENT;
for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { struct fib6_info *cur = rcu_dereference_protected(*rtp,
lockdep_is_held(&table->tb6_lock)); if (rt == cur) { if (fib6_requires_src(cur))
fib6_routes_require_src_dec(info->nl_net);
fib6_del_route(table, fn, rtp, info); return 0;
}
rtp_next = &cur->fib6_next;
} return -ENOENT;
}
/* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally reenterable wrt itself and fib6_add/fib6_del. * It means, that we can modify tree during walking * and use this function for garbage collection, clone pruning, * cleaning tree when a device goes down etc. etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. * positive value -> walking is suspended (used by tree dumps, * and probably by gc, if it will be split to several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. * * This function is called with tb6_lock held.
*/
/* * Convenient frontend to tree walker. * * func is called on each route. * It may return -2 -> skip multipath route. * -1 -> delete this route. * 0 -> continue walking
*/
staticvoid fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), int sernum, void *arg, bool skip_notify)
{ struct fib6_cleaner c;
/* * check addrconf expiration here. * Routes are expired even if they are in use.
*/
if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) {
pr_debug("expiring %p\n", rt); return -1;
}
gc_args->more++;
}
/* Also age clones in the exception table. * Note, that clones are aged out * only if they are not in use now.
*/
rt6_age_exceptions(rt, gc_args, now);
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.30Bemerkung:
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.