fn = rcu_dereference(rt->fib6_node); if (!fn) goto out;
iter = rcu_dereference(fn->leaf); if (!iter) goto out;
while (iter) { if (iter->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(iter)) return iter;
iter = rcu_dereference(iter->fib6_next);
}
out: return NULL;
}
/* fib6_select_path() - pick a nexthop/sibling for an ECMP (multipath) route
 * based on the flow hash in fl6->mp_hash.
 *
 * NOTE(review): this block appears truncated by extraction -- the sibling
 * list walk and the trailing "out:" label that publishes the chosen match
 * into res are not visible here, and several statements are fused onto
 * single lines (e.g. "conststruct"). Kept byte-identical.
 */
void fib6_select_path(conststruct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, conststruct sk_buff *skb, int strict)
{ struct fib6_info *first, *match = res->f6i; struct fib6_info *sibling; int hash;
/* Fast path: not a multipath route (or oif already matched) -- keep match. */
if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out;
if (match->nh && have_oif_match && res->nh) return;
/* Mark the skb so later layers know a multipath decision was made. */
if (skb)
IP6CB(skb)->flags |= IP6SKB_MULTIPATH;
/* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it.
*/ if (!fl6->mp_hash &&
(!match->nh || nexthop_is_multipath(match->nh)))
fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
/* Nexthop-object routes delegate the selection to the nexthop code. */
if (unlikely(match->nh)) {
nexthop_path_fib6_result(res, fl6->mp_hash); return;
}
first = rt6_multipath_first_sibling_rcu(match); if (!first) goto out;
/* Hash falls within the first sibling's upper bound: use it if usable. */
hash = fl6->mp_hash; if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) { if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif,
strict) >= 0)
match = first; goto out;
}
/* NOTE(review): fragment of two distinct functions fused by extraction --
 * the probe-scheduling body of rt6_probe() (no signature visible) followed
 * by the tail of rt6_check_neigh(); locals such as fib6_nh, nh_gw, dev,
 * work, neigh, nud_state and ret are declared in the missing parts.
 * Kept byte-identical.
 */
/* * Okay, this does not seem to be appropriate * for now, however, we need to check if it * is really so; aka Router Reachability Probing. * * Router Reachability Probe MUST be rate-limited * to no more than one per minute.
*/ if (!fib6_nh->fib_nh_gw_family) return;
nh_gw = &fib6_nh->fib_nh_gw6;
dev = fib6_nh->fib_nh_dev;
rcu_read_lock();
last_probe = READ_ONCE(fib6_nh->last_probe);
idev = __in6_dev_get(dev); if (!idev) goto out;
/* Only probe when the neighbour entry exists but is not currently valid
 * and the per-device rate limit (rtr_probe_interval) has elapsed.
 */
neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { if (READ_ONCE(neigh->nud_state) & NUD_VALID) goto out;
write_lock_bh(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) &&
time_after(jiffies,
neigh->updated +
READ_ONCE(idev->cnf.rtr_probe_interval))) {
work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work)
__neigh_set_probe_once(neigh);
}
write_unlock_bh(&neigh->lock);
} elseif (time_after(jiffies, last_probe +
READ_ONCE(idev->cnf.rtr_probe_interval))) {
work = kmalloc(sizeof(*work), GFP_ATOMIC);
}
/* Tail of rt6_check_neigh(): translate neighbour state into an RT6_NUD_*
 * verdict for route scoring.
 */
if (nud_state & NUD_VALID)
ret = RT6_NUD_SUCCEED; #ifdef CONFIG_IPV6_ROUTER_PREF elseif (!(nud_state & NUD_FAILED))
ret = RT6_NUD_SUCCEED; else
ret = RT6_NUD_FAIL_PROBE; #endif
} else {
ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
}
rcu_read_unlock();
return ret;
}
/* Score a nexthop for route selection.
 *
 * @nh:         nexthop to evaluate
 * @fib6_flags: route flags (RTF_*) of the owning fib6_info
 * @oif:        requested output interface (0 = any)
 * @strict:     RT6_LOOKUP_F_* lookup flags
 *
 * Returns a non-negative score (higher is better) or a negative
 * RT6_NUD_FAIL_* value when the nexthop must be skipped.
 */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	/* Prefer a nexthop on the requested interface. */
	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	/* Strict interface match requested but not satisfied. */
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* Fold the RFC 4191 router preference bits into the score. */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);

		if (n < 0)
			return n;
	}
	return m;
}
/* NOTE(review): find_match() is cut off after its local declarations; the
 * body that scores the nexthop and updates *mpri / *do_rr is missing from
 * this extract. Kept byte-identical.
 */
staticbool find_match(struct fib6_nh *nh, u32 fib6_flags, int oif, int strict, int *mpri, bool *do_rr)
{ bool match_do_rr = false; bool rc = false; int m;
/* rt6_select() - choose the best route under fib6_node fn, with round-robin
 * rotation among equal-metric routes via fn->rr_ptr.
 *
 * NOTE(review): the middle of this function (the scan that sets "next") is
 * missing from this extract; "next" is used below without a visible
 * declaration. Kept byte-identical.
 */
staticvoid rt6_select(struct net *net, struct fib6_node *fn, int oif, struct fib6_result *res, int strict)
{ struct fib6_info *leaf = rcu_dereference(fn->leaf); struct fib6_info *rt0; bool do_rr = false; int key_plen;
/* make sure this function or its helpers sets f6i */
res->f6i = NULL;
if (!leaf || leaf == net->ipv6.fib6_null_entry) goto out;
/* Start the round-robin scan at the remembered rr_ptr, else the leaf. */
rt0 = rcu_dereference(fn->rr_ptr); if (!rt0)
rt0 = leaf;
/* Double check to make sure fn is not an intermediate node * and fn->leaf does not points to its child's leaf * (This might happen if all routes under fn are deleted from * the tree and fib6_repair_tree() is called on the node.)
*/
key_plen = rt0->fib6_dst.plen; #ifdef CONFIG_IPV6_SUBTREES if (rt0->fib6_src.plen)
key_plen = rt0->fib6_src.plen; #endif if (fn->fn_bit != key_plen) goto out;
/* no entries matched; do round-robin */ if (!next || next->fib6_metric != rt0->fib6_metric)
next = leaf;
/* Remember the rotation point for the next lookup. */
if (next != rt0) {
spin_lock_bh(&leaf->fib6_table->tb6_lock); /* make sure next is not being deleted from the tree */ if (next->fib6_node)
rcu_assign_pointer(fn->rr_ptr, next);
spin_unlock_bh(&leaf->fib6_table->tb6_lock);
}
}
/* NOTE(review): ip6_rt_get_dev_rcu() is truncated -- the final
 * "return dev;" and the closing brace are not visible in this extract.
 * Kept byte-identical.
 */
/* called with rcu_lock held */ staticstruct net_device *ip6_rt_get_dev_rcu(conststruct fib6_result *res)
{ struct net_device *dev = res->nh->fib_nh_dev;
if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the * device if it is a master device, the master device if * device is enslaved, and the loopback as the default
*/ if (netif_is_l3_slave(dev) &&
!rt6_need_strict(&res->f6i->fib6_dst.addr))
dev = l3mdev_master_dev_rcu(dev); elseif (!netif_is_l3_master(dev))
dev = dev_net(dev)->loopback_dev; /* last case is netif_is_l3_master(dev) is true in which * case we want dev returned to be dev
*/
}
/* ip6_ins_rt is called with FREE table->tb6_lock. * It takes new route entry, the addition fails by any reason the * route is released. * Caller must hold dst before calling it.
*/
/* It should be called with rcu_read_lock() acquired */ staticstruct rt6_info *rt6_get_pcpu_route(conststruct fib6_result *res)
{ struct rt6_info *pcpu_rt;
/* NOTE(review): body truncated -- the declarations of p/prev and the
 * final return of the per-cpu cached route are missing from this extract.
 */
p = this_cpu_ptr(res->nh->rt6i_pcpu); /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */
prev = xchg(p, NULL); if (prev) {
dst_dev_put(&prev->dst);
dst_release(&prev->dst);
}
/* Remove rt6_ex from hash table and free the memory * Caller must hold rt6_exception_lock
*/ staticvoid rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex)
{ struct net *net;
if (!bucket || !rt6_ex) return;
net = dev_net(rt6_ex->rt6i->dst.dev);
/* One fewer cached route in this netns' accounting. */
net->ipv6.rt6_stats->fib_rt_cache--;
/* purge completely the exception to allow releasing the held resources: * some [sk] cache may keep the dst around for unlimited time
*/
dst_dev_put(&rt6_ex->rt6i->dst);
/* NOTE(review): truncated -- the hlist removal, refcount drop and kfree
 * that normally end this function are missing from this extract.
 */
/* Helper function to find the cached rt in the hash table * and update bucket pointer to point to the bucket for this * (daddr, saddr) pair * Caller must hold rt6_exception_lock
*/ staticstruct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, conststruct in6_addr *daddr, conststruct in6_addr *saddr)
{ struct rt6_exception *rt6_ex;
u32 hval;
/* NOTE(review): the hash computation and bucket walk are missing from this
 * extract; only the tail of the per-entry match check remains below
 * ("matched" and "rt6" come from the missing loop). Kept byte-identical.
 */
#ifdef CONFIG_IPV6_SUBTREES if (matched && saddr)
matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); #endif if (matched) return rt6_ex;
} return NULL;
}
/* Helper function to find the cached rt in the hash table * and update bucket pointer to point to the bucket for this * (daddr, saddr) pair * Caller must hold rcu_read_lock()
*/ staticstruct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, conststruct in6_addr *daddr, conststruct in6_addr *saddr)
{ struct rt6_exception *rt6_ex;
u32 hval;
/* NOTE(review): the body of __rt6_find_exception_rcu() is missing; the code
 * below (pmtu lookup, device-mtu fallback, clamping) is the tail of a
 * different helper -- presumably fib6_mtu() -- fused here by extraction.
 */
if (res->f6i->fib6_pmtu) {
mtu = res->f6i->fib6_pmtu;
} else { struct net_device *dev = nh->fib_nh_dev; struct inet6_dev *idev;
rcu_read_lock();
idev = __in6_dev_get(dev);
mtu = READ_ONCE(idev->cnf.mtu6);
rcu_read_unlock();
}
/* Clamp to the IPv6 maximum and reserve tunnel encapsulation headroom. */
mtu = min_t(unsignedint, mtu, IP6_MAX_MTU);
return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
/* Bit stored in the low pointer bit of nh->rt6i_exception_bucket to mark
 * the bucket as flushed.
 */
#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL

/* used when the flushed bit is not relevant, only access to the bucket
 * (ie., all bucket users except rt6_insert_exception);
 *
 * called under rcu lock; sometimes called with rt6_exception_lock held
 *
 * @lock: when non-NULL, the caller holds it and we may use the
 *        lockdep-checked dereference; otherwise plain RCU dereference.
 *
 * Returns the bucket pointer with the flushed bit cleared, or NULL.
 */
static struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
							      spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;

	if (lock)
		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
						   lockdep_is_held(lock));
	else
		bucket = rcu_dereference(nh->rt6i_exception_bucket);

	/* remove bucket flushed bit if set */
	if (bucket) {
		unsigned long p = (unsigned long)bucket;

		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
		bucket = (struct rt6_exception_bucket *)p;
	}

	return bucket;
}
staticbool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
{ unsignedlong p = (unsignedlong)bucket;
return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
}
/* NOTE(review): the body of fib6_nh_excptn_bucket_set_flushed() is missing;
 * the code that follows (src_key selection, mtu check, duplicate removal,
 * depth limiting and sernum update) is the tail of rt6_insert_exception(),
 * fused here by extraction. Locals f6i, nrt, res, bucket, rt6_ex, max_depth
 * and err come from the missing context. Kept byte-identical.
 */
/* called with rt6_exception_lock held */ staticvoid fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
spinlock_t *lock)
{ struct rt6_exception_bucket *bucket; unsignedlong p;
#ifdef CONFIG_IPV6_SUBTREES /* fib6_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of * both fib6_dst and fib6_src. * Otherwise, the exception table is indexed by * a hash of only fib6_dst.
*/ if (f6i->fib6_src.plen)
src_key = &nrt->rt6i_src.addr; #endif /* rt6_mtu_change() might lower mtu on f6i. * Only insert this exception route if its mtu * is less than f6i's mtu value.
*/ if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
err = -EINVAL; goto out;
}
/* An exception for this (daddr, saddr) already exists: replace it. */
rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
src_key); if (rt6_ex)
rt6_remove_exception(bucket, rt6_ex);
/* Randomize max depth to avoid some side channels attacks. */
max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH); while (bucket->depth > max_depth)
rt6_exception_remove_oldest(bucket);
out:
spin_unlock_bh(&rt6_exception_lock);
/* Update fn->fn_sernum to invalidate all cached dst */ if (!err) {
spin_lock_bh(&f6i->fib6_table->tb6_lock);
fib6_update_sernum(net, f6i);
fib6_add_gc_list(f6i);
spin_unlock_bh(&f6i->fib6_table->tb6_lock);
fib6_force_start_gc(net);
}
/* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock()
*/ staticstruct rt6_info *rt6_find_cached_rt(conststruct fib6_result *res, conststruct in6_addr *daddr, conststruct in6_addr *saddr)
{ conststruct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct rt6_info *ret = NULL;
#ifdef CONFIG_IPV6_SUBTREES /* fib6i_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of * both fib6_dst and fib6_src. * However, the src addr used to create the hash * might not be exactly the passed in saddr which * is a /128 addr from the flow. * So we need to use f6i->fib6_src to redo lookup * if the passed in saddr does not find anything. * (See the logic in ip6_rt_cache_alloc() on how * rt->rt6i_src is updated.)
*/ if (res->f6i->fib6_src.plen)
src_key = saddr;
find_ex: #endif
bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
ret = rt6_ex->rt6i;
#ifdef CONFIG_IPV6_SUBTREES /* Use fib6_src as src_key and redo lookup */ if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
src_key = &res->f6i->fib6_src.addr; goto find_ex;
} #endif
return ret;
}
/* Remove the passed in cached rt from the hash table that contains it */ staticint fib6_nh_remove_exception(conststruct fib6_nh *nh, int plen, conststruct rt6_info *rt)
{ conststruct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int err;
/* Nothing to remove when no exception bucket was ever allocated. */
if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return -ENOENT;
#ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of * both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst.
*/ if (plen)
src_key = &rt->rt6i_src.addr; #endif
/* NOTE(review): truncated -- the lock acquisition and bucket fetch that
 * precede this lookup, and the unlock + "return err" that follow the
 * if/else, are missing from this extract.
 */
rt6_ex = __rt6_find_exception_spinlock(&bucket,
&rt->rt6i_dst.addr,
src_key); if (rt6_ex) {
rt6_remove_exception(bucket, rt6_ex);
err = 0;
} else {
err = -ENOENT;
}
/* Find rt6_ex which contains the passed in rt cache and * refresh its stamp
*/ staticvoid fib6_nh_update_exception(conststruct fib6_nh *nh, int plen, conststruct rt6_info *rt)
{ conststruct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex;
bucket = fib6_nh_get_excptn_bucket(nh, NULL); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of * both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst.
*/ if (plen)
src_key = &rt->rt6i_src.addr; #endif
rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex)
rt6_ex->stamp = jiffies;
}
staticbool rt6_mtu_change_route_allowed(struct inet6_dev *idev, struct rt6_info *rt, int mtu)
{ /* If the new MTU is lower than the route PMTU, this new MTU will be the * lowest MTU in the path: always allow updating the route PMTU to * reflect PMTU decreases. * * If the new MTU is higher, and the route PMTU is equal to the local * MTU, this means the old MTU is the lowest in the path, so allow * updating it: if other nodes now have lower MTUs, PMTU discovery will * handle this.
*/
if (dst_mtu(&rt->dst) >= mtu) returntrue;
if (dst_mtu(&rt->dst) == idev->cnf.mtu6) returntrue;
returnfalse;
}
staticvoid rt6_exceptions_update_pmtu(struct inet6_dev *idev, conststruct fib6_nh *nh, int mtu)
{ struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i;
bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) return;
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { struct rt6_info *entry = rt6_ex->rt6i;
/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected * route), the metrics of its rt->from have already * been updated.
*/ if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
rt6_mtu_change_route_allowed(idev, entry, mtu))
dst_metric_set(&entry->dst, RTAX_MTU, mtu);
}
bucket++;
}
}
/* NOTE(review): fragment of rt6_age_examine_exception() -- no signature or
 * enclosing braces are visible; locals rt, now, bucket, rt6_ex and gc_args
 * come from the missing context. Kept byte-identical.
 */
/* we are pruning and obsoleting aged-out and non gateway exceptions * even if others have still references to them, so that on next * dst_check() such references can be dropped. * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when * expired, independently from their aging, as per RFC 8201 section 4
*/ if (!(rt->rt6i_flags & RTF_EXPIRES)) { if (time_after_eq(now, READ_ONCE(rt->dst.lastuse) +
gc_args->timeout)) {
pr_debug("aging clone %p\n", rt);
rt6_remove_exception(bucket, rt6_ex); return;
}
} elseif (time_after(jiffies, READ_ONCE(rt->dst.expires))) {
pr_debug("purging expired route %p\n", rt);
rt6_remove_exception(bucket, rt6_ex); return;
}
/* Gateway exceptions get an additional neighbour-state check below; the
 * rest of that check is missing from this extract.
 */
if (rt->rt6i_flags & RTF_GATEWAY) { struct neighbour *neigh;
/* NOTE(review): fib6_table_lookup()'s body is missing; the code that follows
 * (exception-cache lookup, RTF_CACHE clone allocation, per-cpu copy) is the
 * tail of a different lookup routine -- presumably ip6_pol_route() -- fused
 * here by extraction. Locals rt, res (by value) and flags are declared in
 * the missing part. Kept byte-identical.
 */
/* must be called with rcu lock held */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict)
{ struct fib6_node *fn, *saved_fn;
/*Search through exception table */
rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { goto out;
} elseif (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
!res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. It is for the special case where * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here.
*/
rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
if (rt) { /* 1 refcnt is taken during ip6_rt_cache_alloc(). * As rt6_uncached_list_add() does not consume refcnt, * this refcnt is always returned to the caller even * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
*/
rt6_uncached_list_add(rt);
rcu_read_unlock();
return rt;
}
} else { /* Get a percpu copy */
local_bh_disable();
rt = rt6_get_pcpu_route(&res);
if (!rt)
rt = rt6_make_pcpu_route(net, &res);
local_bh_enable();
}
/* Fall back to the null entry and take a reference unless the caller
 * asked for a no-ref lookup.
 */
out: if (!rt)
rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF))
ip6_hold_safe(net, &rt);
rcu_read_unlock();
/* NOTE(review): the two statements below are the tail of an inner-flow
 * helper (locals has_inner and hash_fields come from the missing context),
 * followed by the truncated start of rt6_multipath_hash(). Kept
 * byte-identical.
 */
/* We assume the packet carries an encapsulation, but if none was * encountered during dissection of the outer flow, then there is no * point in calling the flow dissector again.
*/ if (!has_inner) return 0;
if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) return 0;
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(conststruct net *net, conststruct flowi6 *fl6, conststruct sk_buff *skb, struct flow_keys *flkeys)
{ struct flow_keys hash_keys;
u32 mhash = 0;
/* NOTE(review): rt6_multipath_hash() is cut off after its local
 * declarations; the hash-policy dispatch and final return are missing.
 */
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.