/* When 'force' is false, nexthop bucket replacement is performed * because the bucket was deemed to be idle. In this case, capable * listeners can choose to perform an atomic replacement: The bucket is * only replaced if it is inactive. However, if the idle timer interval * is smaller than the interval in which a listener is querying * buckets' activity from the device, then atomic replacement should * not be tried. Pass the idle timer value to listeners, so that they * could determine which type of replacement to perform.
*/ if (force) {
*p_idle_timer_ms = 0; return 0;
}
/* There are three users of RES_TABLE, and NHs etc. referenced from there: * * 1) a collection of callbacks for NH maintenance. This operates under * RTNL, * 2) the delayed work that gradually balances the resilient table, * 3) and nexthop_select_path(), operating under RCU. * * Both the delayed work and the RTNL block are writers, and need to * maintain mutual exclusion. Since there are only two and well-known * writers for each table, the RTNL code can make sure it has exclusive * access thus: * * - Have the DW operate without locking; * - synchronously cancel the DW; * - do the writing; * - if the write was not actually a delete, call upkeep, which schedules * DW again if necessary. * * The functions that are always called from the RTNL context use * rtnl_dereference(). The functions that can also be called from the DW do * a raw dereference and rely on the above mutual exclusion scheme.
*/ #define nh_res_dereference(p) (rcu_dereference_raw(p))
/* At this point, the nexthop buckets are still not populated. Only * emit a notification with the logical nexthops, so that a listener * could potentially veto it in case of unsupported configuration.
*/
nhg = rtnl_dereference(nh->nh_grp);
err = nh_notifier_mpath_info_init(&info, nhg); if (err) {
NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err;
}
/* Bump the per-netns nexthop dump sequence counter.
 * Skips the value 0 so that a zero sequence number is never handed out
 * (the loop keeps incrementing while the post-increment result is 0).
 */
static void nh_base_seq_inc(struct net *net)
{
	while (++net->nexthop.seq == 0)
		;
}
/* Look up a nexthop object by its user-visible ID in the per-netns
 * red-black tree. No reference is taken; the caller must hold either
 * the RCU read lock or RTNL (hence the raw dereference below).
 *
 * Returns the nexthop, or NULL if no nexthop with that ID exists.
 */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
{
	struct rb_node **pp, *parent = NULL, *next;

	pp = &net->nexthop.rb_root.rb_node;
	while (1) {
		struct nexthop *nh;

		next = rcu_dereference_raw(*pp);
		if (!next)
			break;
		parent = next;

		nh = rb_entry(parent, struct nexthop, rb_node);
		if (id < nh->id)
			pp = &next->rb_left;
		else if (id > nh->id)
			pp = &next->rb_right;
		else
			return nh;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(nexthop_find_by_id);
/* used for auto id allocation; called with rtnl held */
static u32 nh_find_unused_id(struct net *net)
{
	u32 id_start = net->nexthop.last_id_allocated;

	for (;;) {
		net->nexthop.last_id_allocated++;
		/* Wrapped around to where we started: the whole u32 ID
		 * space is in use, give up.
		 */
		if (net->nexthop.last_id_allocated == id_start)
			break;

		if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
			return net->nexthop.last_id_allocated;
	}
	/* 0 is never a valid nexthop ID, so it doubles as "none free". */
	return 0;
}
/* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost.
*/
nh_grp_hw_stats_apply_update(nh, &info);
*hw_stats_used = info.nh_grp_hw_stats->hw_stats_used;
/* Nesting groups within groups is not supported. */ if (nhg->hash_threshold) {
NL_SET_ERR_MSG(extack, "Hash-threshold group can not be a nexthop within a group"); returnfalse;
} if (nhg->resilient) {
NL_SET_ERR_MSG(extack, "Resilient group can not be a nexthop within a group"); returnfalse;
}
*is_fdb = nhg->fdb_nh;
} else { struct nh_info *nhi = rtnl_dereference(nh->nh_info);
if (nhi->reject_nh && npaths > 1) {
NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be used in a group with more than 1 path"); returnfalse;
}
*is_fdb = nhi->fdb_nh;
}
if (!nhi->fdb_nh) {
NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); return -EINVAL;
}
if (*nh_family == AF_UNSPEC) {
*nh_family = nhi->family;
} elseif (*nh_family != nhi->family) {
NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); return -EINVAL;
}
return 0;
}
staticint nh_check_attr_group(struct net *net, struct nlattr *tb[], size_t tb_size,
u16 nh_grp_type, struct netlink_ext_ack *extack)
{ unsignedint len = nla_len(tb[NHA_GROUP]); struct nexthop_grp *nhg; unsignedint i, j;
if (!len || len & (sizeof(struct nexthop_grp) - 1)) {
NL_SET_ERR_MSG(extack, "Invalid length for nexthop group attribute"); return -EINVAL;
}
/* convert len to number of nexthop ids */
len /= sizeof(*nhg);
nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { if (nhg[i].resvd2) {
NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0"); return -EINVAL;
} if (nexthop_grp_weight(&nhg[i]) == 0) { /* 0xffff got passed in, representing weight of 0x10000, * which is too heavy.
*/
NL_SET_ERR_MSG(extack, "Invalid value for weight"); return -EINVAL;
} for (j = i + 1; j < len; ++j) { if (nhg[i].id == nhg[j].id) {
NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); return -EINVAL;
}
}
}
nhg = nla_data(tb[NHA_GROUP]); for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; switch (i) { case NHA_HW_STATS_ENABLE: case NHA_FDB: continue; case NHA_RES_GROUP: if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) continue; break;
}
NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL;
}
/* nexthop_select_path() is expected to return a non-NULL value, so * skip protocol validation and just hand out whatever there is.
*/
bucket = &res_table->nh_buckets[bucket_index];
nh_res_bucket_set_busy(bucket);
nhge = rcu_dereference(bucket->nh_entry);
nh_grp_entry_stats_inc(nhge); return nhge->nh;
}
/* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source * routing it can not use nexthop objects. mlxsw also does not allow * fib6_src on routes.
*/ if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) return -EINVAL;
if (is_fdb_nh) {
NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); return -EINVAL;
}
return 0;
no_v4_nh:
NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); return -EINVAL;
}
EXPORT_SYMBOL_GPL(fib6_check_nexthop);
/* if existing nexthop has ipv6 routes linked to it, need * to verify this new spec works with ipv6
*/ staticint fib6_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack)
{ struct fib6_info *f6i;
/* Validate that a single nexthop's configuration is compatible with the
 * scope of the route that wants to use it. Returns 0 on success or
 * -EINVAL (with an extack message) on a mismatch.
 */
static int nexthop_check_scope(struct nh_info *nhi, u8 scope,
			       struct netlink_ext_ack *extack)
{
	/* A host-scope route resolves on the local host; a gateway makes
	 * no sense there.
	 */
	if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
		NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway");
		return -EINVAL;
	}

	/* An onlink nexthop requires the route to be at most link scope. */
	if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
		NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
		return -EINVAL;
	}

	return 0;
}
/* Invoked by fib add code to verify nexthop by id is ok with * config for prefix; parts of fib_check_nh not done when nexthop * object is used.
*/ int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack)
{ struct nh_info *nhi; int err = 0;
if (nh->is_group) { struct nh_group *nhg;
nhg = rtnl_dereference(nh->nh_grp); if (nhg->fdb_nh) {
NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
err = -EINVAL; goto out;
}
if (scope == RT_SCOPE_HOST) {
NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
err = -EINVAL; goto out;
}
/* all nexthops in a group have the same scope */
nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info);
err = nexthop_check_scope(nhi, scope, extack);
} else {
nhi = rtnl_dereference(nh->nh_info); if (nhi->fdb_nh) {
NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
err = -EINVAL; goto out;
}
err = nexthop_check_scope(nhi, scope, extack);
}
if (!bucket->occupied) { /* The bucket is not occupied, its NHGE pointer is either * NULL or obsolete. We _have to_ migrate: set force.
*/
*force = true; returntrue;
}
nhge = nh_res_dereference(bucket->nh_entry);
/* If the bucket is populated by an underweight or balanced * nexthop, do not migrate.
*/ if (!nh_res_nhge_is_ow(nhge)) returnfalse;
/* At this point we know that the bucket is populated with an * overweight nexthop. It needs to be migrated to a new nexthop if * the idle timer of unbalanced timer expired.
*/
idle_point = nh_res_bucket_idle_point(res_table, bucket, now); if (time_after_eq(now, idle_point)) { /* The bucket is idle. We _can_ migrate: unset force. */
*force = false; returntrue;
}
/* Unbalanced timer of 0 means "never force". */ if (res_table->unbalanced_timer) { unsignedlong unb_point;
unb_point = nh_res_table_unb_point(res_table); if (time_after(now, unb_point)) { /* The bucket is not idle, but the unbalanced timer * expired. We _can_ migrate, but set force anyway, * so that drivers know to ignore activity reports * from the HW.
*/
*force = true; returntrue;
}
new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries, struct nh_grp_entry,
res.uw_nh_entry); if (WARN_ON_ONCE(!new_nhge)) /* If this function is called, "bucket" is either not * occupied, or it belongs to a next hop that is * overweight. In either case, there ought to be a * corresponding underweight next hop.
*/ returnfalse;
if (notify) { struct nh_grp_entry *old_nhge;
old_nhge = nh_res_dereference(bucket->nh_entry);
err = call_nexthop_res_bucket_notifiers(res_table->net,
res_table->nhg_id,
bucket_index, force,
old_nhge->nh,
new_nhge->nh, &extack); if (err) {
pr_err_ratelimited("%s\n", extack._msg); if (!force) returnfalse; /* It is not possible to veto a forced replacement, so * just clear the hardware flags from the nexthop * bucket to indicate to user space that this bucket is * not correctly populated in hardware.
*/
bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
}
}
/* Deadline is the next time that upkeep should be run. It is the * earliest time at which one of the buckets might be migrated. * Start at the most pessimistic estimate: either unbalanced_timer * from now, or if there is none, idle_timer from now. For each * encountered time point, call nh_res_time_set_deadline() to * refine the estimate.
*/ if (res_table->unbalanced_timer)
deadline = now + res_table->unbalanced_timer; else
deadline = now + res_table->idle_timer;
for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; bool force;
if (nh_res_bucket_should_migrate(res_table, bucket,
&deadline, &force)) { if (!nh_res_bucket_migrate(res_table, i, notify,
notify_nl, force)) { unsignedlong idle_point;
/* A driver can override the migration * decision if the HW reports that the * bucket is actually not idle. Therefore * remark the bucket as busy again and * update the deadline.
*/
nh_res_bucket_set_busy(bucket);
idle_point = nh_res_bucket_idle_point(res_table,
bucket,
now);
nh_res_time_set_deadline(idle_point, &deadline);
}
}
}
/* If the group is still unbalanced, schedule the next upkeep to * either the deadline computed above, or the minimum deadline, * whichever comes later.
*/ if (!nh_res_table_is_balanced(res_table)) { unsignedlong now = jiffies; unsignedlong min_deadline;
min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL; if (time_before(deadline, min_deadline))
deadline = min_deadline;
if (nh_res_nhge_is_uw(nhge)) { if (list_empty(&res_table->uw_nh_entries))
res_table->unbalanced_since = jiffies;
list_add(&nhge->res.uw_nh_entry,
&res_table->uw_nh_entries);
}
}
}
/* Migrate buckets in res_table so that they reference NHGE's from NHG with * the right NH ID. Set those buckets that do not have a corresponding NHGE * entry in NHG as not occupied.
*/ staticvoid nh_res_table_migrate_buckets(struct nh_res_table *res_table, struct nh_group *nhg)
{
u16 i;
for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; bool found = false; int j;
if (nhge->nh->id == id) {
nh_res_bucket_set_nh(bucket, nhge);
found = true; break;
}
}
if (!found)
nh_res_bucket_unset_nh(bucket);
}
}
staticvoid replace_nexthop_grp_res(struct nh_group *oldg, struct nh_group *newg)
{ /* For NH group replacement, the new NHG might only have a stub * hash table with 0 buckets, because the number of buckets was not * specified. For NH removal, oldg and newg both reference the same * res_table. So in any case, in the following, we want to work * with oldg->res_table.
*/ struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table); unsignedlong prev_unbalanced_since = old_res_table->unbalanced_since; bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries);
/* copy old entries to new except the one getting removed */
nhges = nhg->nh_entries;
new_nhges = newg->nh_entries; for (i = 0, j = 0; i < nhg->num_nh; ++i) { struct nh_info *nhi;
/* current nexthop getting removed */ if (nhg->nh_entries[i].nh == nh) {
newg->num_nh--; continue;
}
nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET)
newg->has_v4 = true;
/* Removal of a NH from a resilient group is notified through * bucket notifications.
*/ if (newg->hash_threshold) {
err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
&extack); if (err)
pr_err("%s\n", extack._msg);
}
if (nlinfo)
nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
}
/* if any FIB entries reference this nexthop, any dst entries * need to be regenerated
*/ staticvoid nh_rt_cache_flush(struct net *net, struct nexthop *nh, struct nexthop *replaced_nh)
{ struct fib6_info *f6i; struct nh_group *nhg; int i;
if (!list_empty(&nh->fi_list))
rt_cache_flush(net);
if (newg->hash_threshold != oldg->hash_threshold) {
NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); return -EINVAL;
}
/* Accept if num_nh_buckets was not given, but if it was * given, demand that the value be correct.
*/ if (cfg->nh_grp_res_has_num_buckets &&
cfg->nh_grp_res_num_buckets !=
old_res_table->num_nh_buckets) {
NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group."); return -EINVAL;
}
/* Emit a pre-replace notification so that listeners could veto * a potentially unsupported configuration. Otherwise, * individual bucket replacement notifications would need to be * vetoed, which is something that should only happen if the * bucket is currently active.
*/
err = call_nexthop_res_table_notifiers(net, new, extack); if (err) return err;
if (cfg->nh_grp_res_has_idle_timer)
old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; if (cfg->nh_grp_res_has_unbalanced_timer)
old_res_table->unbalanced_timer =
cfg->nh_grp_res_unbalanced_timer;
if (new->is_group) {
NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); return -EINVAL;
}
if (!list_empty(&old->grp_list) &&
rtnl_dereference(new->nh_info)->fdb_nh !=
rtnl_dereference(old->nh_info)->fdb_nh) {
NL_SET_ERR_MSG(extack, "Cannot change nexthop FDB status while in a group"); return -EINVAL;
}
err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err;
/* Hardware flags were set on 'old' as 'new' is not in the red-black * tree. Therefore, inherit the flags from 'old' to 'new'.
*/
new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP);
oldi = rtnl_dereference(old->nh_info);
newi = rtnl_dereference(new->nh_info);
/* Send a replace notification for all the groups using the nexthop. */
list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent;
/* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially * update IPv4 indication in all the groups using the nexthop.
*/ if (oldi->family == AF_INET && newi->family == AF_INET6) {
list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; struct nh_group *nhg;
if (!list_empty(&nh->fi_list)) { struct fib_info *fi;
/* expectation is a few fib_info per nexthop and then * a lot of routes per fib_info. So mark the fib_info * and then walk the fib tables once
*/
list_for_each_entry(fi, &nh->fi_list, nh_list)
fi->nh_updated = true;
/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries * linked to this nexthop and for all groups that the nexthop * is a member of
*/ staticvoid nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info)
{ struct nh_grp_entry *nhge;
/* check that existing FIB entries are ok with the * new nexthop definition
*/
err = fib_check_nh_list(old, new, extack); if (err) return err;
err = fib6_check_nh_list(old, new, extack); if (err) return err;
if (!new->is_group) { struct nh_info *nhi = rtnl_dereference(new->nh_info);
new_is_reject = nhi->reject_nh;
}
list_for_each_entry(nhge, &old->grp_list, nh_list) { /* if new nexthop is a blackhole, any groups using this * nexthop cannot have more than 1 path
*/ if (new_is_reject &&
nexthop_num_path(nhge->nh_parent) > 1) {
NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); return -EINVAL;
}
err = fib_check_nh_list(nhge->nh_parent, new, extack); if (err) return err;
err = fib6_check_nh_list(nhge->nh_parent, new, extack); if (err) return err;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.