if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
} if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
}
if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
!(sd->child->flags & flag))
printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
sd_flag_debug[idx].name);
if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
!(sd->parent->flags & flag))
printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
sd_flag_debug[idx].name);
}
printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) {
printk("\n");
printk(KERN_ERR "ERROR: group is NULL\n"); break;
}
if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
printk(KERN_CONT " cap=%lu", group->sgc->capacity);
if (group == sd->groups && sd->child &&
!cpumask_equal(sched_domain_span(sd->child),
sched_group_span(group))) {
printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
}
printk(KERN_CONT " }");
group = group->next;
if (group != sd->groups)
printk(KERN_CONT ",");
} while (group != sd->groups);
printk(KERN_CONT "\n");
if (!cpumask_equal(sched_domain_span(sd), groupmask))
printk(KERN_ERR "ERROR: groups don't span domain->span\n");
if (sd->parent &&
!cpumask_subset(groupmask, sched_domain_span(sd->parent)))
printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); return 0;
}
staticvoid sched_domain_debug(struct sched_domain *sd, int cpu)
{ int level = 0;
/* EAS is enabled for asymmetric CPU capacity topologies. */
for_each_cpu(i, cpu_mask) { if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) {
any_asym_capacity = true; break;
}
} if (!any_asym_capacity) { if (sched_debug()) {
pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n",
cpumask_pr_args(cpu_mask));
} returnfalse;
}
/* EAS definitely does *not* handle SMT */ if (sched_smt_active()) { if (sched_debug()) {
pr_info("rd %*pbl: Checking EAS, SMT is not supported\n",
cpumask_pr_args(cpu_mask));
} returnfalse;
}
if (!arch_scale_freq_invariant()) { if (sched_debug()) {
pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported",
cpumask_pr_args(cpu_mask));
} returnfalse;
}
if (!cpufreq_ready_for_eas(cpu_mask)) { if (sched_debug()) {
pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n",
cpumask_pr_args(cpu_mask));
} returnfalse;
}
/* * EAS can be used on a root domain if it meets all the following conditions: * 1. an Energy Model (EM) is available; * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. * 3. no SMT is detected. * 4. schedutil is driving the frequency of all CPUs of the rd; * 5. frequency invariance support is present;
*/ staticbool build_perf_domains(conststruct cpumask *cpu_map)
{ int i; struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd;
/* Create the new pd and add it to the local list. */
tmp = pd_init(i); if (!tmp) goto free;
tmp->next = pd;
pd = tmp;
}
perf_domain_debug(cpu_map, pd);
/* Attach the new list of performance domains to the root domain. */
tmp = rd->pd;
rcu_assign_pointer(rd->pd, pd); if (tmp)
call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
if (cpumask_test_cpu(rq->cpu, old_rd->online))
set_rq_offline(rq);
cpumask_clear_cpu(rq->cpu, old_rd->span);
/* * If we don't want to free the old_rd yet then * set old_rd to NULL to skip the freeing later * in this function:
*/ if (!atomic_dec_and_test(&old_rd->refcount))
old_rd = NULL;
}
atomic_inc(&rd->refcount);
rq->rd = rd;
cpumask_set_cpu(rq->cpu, rd->span); if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
/* * Because the rq is not a task, dl_add_task_root_domain() did not * move the fair server bw to the rd if it already started. * Add it now.
*/ if (rq->fair_server.dl_server)
__dl_server_attach_root(&rq->fair_server, rq);
rq_unlock_irqrestore(rq, &rf);
if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);
}
/* * By default the system creates a single root-domain with all CPUs as * members (mimicking the global state we have today).
*/ struct root_domain def_root_domain;
if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
kfree(sg->sgc);
if (atomic_dec_and_test(&sg->ref))
kfree(sg);
sg = tmp;
} while (sg != first);
}
/*
 * Tear down a single sched_domain: drop its group/capacity references,
 * release the shared state when the last reference goes away, then free
 * the domain itself.
 */
static void destroy_sched_domain(struct sched_domain *sd)
{
	/*
	 * A normal sched domain may have multiple group references, an
	 * overlapping domain, having private groups, only one. Iterate,
	 * dropping group/capacity references, freeing where none remain.
	 */
	free_sched_groups(sd->groups, 1);

	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
		kfree(sd->shared);
	kfree(sd);
}
/*
 * Defer destruction of a sched_domain hierarchy to
 * destroy_sched_domains_rcu() after an RCU grace period, so that
 * concurrent RCU readers traversing the domains are not broken.
 */
static void destroy_sched_domains(struct sched_domain *sd)
{
	if (sd)
		call_rcu(&sd->rcu, destroy_sched_domains_rcu);
}
/* * Keep a special pointer to the highest sched_domain that has SD_SHARE_LLC set * (Last Level Cache Domain) for this allows us to avoid some pointer chasing * select_idle_sibling(). * * Also keep a unique ID per domain (we use the first CPU number in the cpumask * of the domain), this allows us to quickly tell if two CPUs are in the same * cache domain, see cpus_share_cache().
*/
/* Highest sched_domain with SD_SHARE_LLC set, cached per CPU (see comment above). */
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
/* NOTE(review): presumably the CPU count of that LLC domain's span — confirm at the update site (not visible here). */
DEFINE_PER_CPU(int, sd_llc_size);
/* Unique id of the LLC domain: the first CPU number in its cpumask (see comment above). */
DEFINE_PER_CPU(int, sd_llc_id);
/* Cluster id on cluster machines, LLC id on non-cluster machines (see the comment at its assignment below). */
DEFINE_PER_CPU(int, sd_share_id);
/* Shared state of the cached LLC domain. NOTE(review): inferred from the name — confirm. */
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
/*
 * Further per-CPU cached domain pointers. NOTE(review): names suggest the
 * domains carrying SD_NUMA, SD_ASYM_PACKING and SD_ASYM_CPUCAPACITY
 * respectively — confirm at the (not visible) update site.
 */
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
sd = lowest_flag_domain(cpu, SD_CLUSTER); if (sd)
id = cpumask_first(sched_domain_span(sd));
/* * This assignment should be placed after the sd_llc_id as * we want this id equals to cluster id on cluster machines * but equals to LLC id on non-Cluster machines.
*/
per_cpu(sd_share_id, cpu) = id;
/* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock.
*/ staticvoid
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{ struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp;
/* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; if (!parent) break;
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
if (parent->parent) {
parent->parent->child = tmp;
parent->parent->groups->flags = tmp->flags;
}
/* * Transfer SD_PREFER_SIBLING down in case of a * degenerate parent; the spans match for this * so the property transfers.
*/ if (parent->flags & SD_PREFER_SIBLING)
tmp->flags |= SD_PREFER_SIBLING;
destroy_sched_domain(parent);
} else
tmp = tmp->parent;
}
if (sd && sd_degenerate(sd)) {
tmp = sd;
sd = sd->parent;
destroy_sched_domain(tmp); if (sd) { struct sched_group *sg = sd->groups;
/* * sched groups hold the flags of the child sched * domain for convenience. Clear such flags since * the child is being destroyed.
*/ do {
sg->flags = 0;
} while (sg != sd->groups);
/*
 * Return the canonical balance CPU for this group, this is the first CPU
 * of this group that's also in the balance mask.
 *
 * The balance mask are all those CPUs that could actually end up at this
 * group. See build_balance_mask().
 *
 * Also see should_we_balance().
 */
int group_balance_cpu(struct sched_group *sg)
{
	struct cpumask *balance_mask = group_balance_mask(sg);

	return cpumask_first(balance_mask);
}
/* * NUMA topology (first read the regular topology blurb below) * * Given a node-distance table, for example: * * node 0 1 2 3 * 0: 10 20 30 20 * 1: 20 10 20 30 * 2: 30 20 10 20 * 3: 20 30 20 10 * * which represents a 4 node ring topology like: * * 0 ----- 1 * | | * | | * | | * 3 ----- 2 * * We want to construct domains and groups to represent this. The way we go * about doing this is to build the domains on 'hops'. For each NUMA level we * construct the mask of all nodes reachable in @level hops. * * For the above NUMA topology that gives 3 levels: * * NUMA-2 0-3 0-3 0-3 0-3 * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2} * * NUMA-1 0-1,3 0-2 1-3 0,2-3 * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3} * * NUMA-0 0 1 2 3 * * * As can be seen; things don't nicely line up as with the regular topology. * When we iterate a domain in child domain chunks some nodes can be * represented multiple times -- hence the "overlap" naming for this part of * the topology. * * In order to minimize this overlap, we only build enough groups to cover the * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3. * * Because: * * - the first group of each domain is its child domain; this * gets us the first 0-1,3 * - the only uncovered node is 2, who's child domain is 1-3. * * However, because of the overlap, computing a unique CPU for each group is * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both * groups include the CPUs of Node-0, while those CPUs would not in fact ever * end up at those groups (they would end up in group: 0-1,3). * * To correct this we have to introduce the group balance mask. This mask * will contain those CPUs in the group that can reach this group given the * (child) domain tree. * * With this we can once again compute balance_cpu and sched_group_capacity * relations. * * XXX include words on how balance_cpu is unique and therefore can be * used for sched_group_capacity links. 
* * * Another 'interesting' topology is: * * node 0 1 2 3 * 0: 10 20 20 30 * 1: 20 10 20 20 * 2: 20 20 10 20 * 3: 30 20 20 10 * * Which looks a little like: * * 0 ----- 1 * | / | * | / | * | / | * 2 ----- 3 * * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3 * are not. * * This leads to a few particularly weird cases where the sched_domain's are * not of the same number for each CPU. Consider: * * NUMA-2 0-3 0-3 * groups: {0-2},{1-3} {1-3},{0-2} * * NUMA-1 0-2 0-3 0-3 1-3 * * NUMA-0 0 1 2 3 *
*/
/* * Build the balance mask; it contains only those CPUs that can arrive at this * group and should be considered to continue balancing. * * We do this during the group creation pass, therefore the group information * isn't complete yet, however since each group represents a (child) domain we * can fully construct this using the sched_domain bits (which are already * complete).
*/ staticvoid
build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
{ conststruct cpumask *sg_span = sched_group_span(sg); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i;
/* * Can happen in the asymmetric case, where these siblings are * unused. The mask will not be empty because those CPUs that * do have the top domain _should_ span the domain.
*/ if (!sibling->child) continue;
/* If we would not end up here, we can't continue from here */ if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) continue;
cpumask_set_cpu(i, mask);
}
/* We must not have empty masks here */
WARN_ON_ONCE(cpumask_empty(mask));
}
/* * XXX: This creates per-node group entries; since the load-balancer will * immediately access remote memory to construct this group's load-balance * statistics having the groups node local is of dubious benefit.
*/ staticstruct sched_group *
build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
{ struct sched_group *sg; struct cpumask *sg_span;
/* * Initialize sgc->capacity such that even if we mess up the * domains and no possible iteration will get us here, we won't * die on a /0 trap.
*/
sg_span = sched_group_span(sg);
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
}
staticstruct sched_domain *
find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
{ /* * The proper descendant would be the one whose child won't span out * of sd
*/ while (sibling->child &&
!cpumask_subset(sched_domain_span(sibling->child),
sched_domain_span(sd)))
sibling = sibling->child;
/* * As we are referencing sgc across different topology level, we need * to go down to skip those sched_domains which don't contribute to * scheduling because they will be degenerated in cpu_attach_domain
*/ while (sibling->child &&
cpumask_equal(sched_domain_span(sibling->child),
sched_domain_span(sibling)))
sibling = sibling->child;
/* * Asymmetric node setups can result in situations where the * domain tree is of unequal depth, make sure to skip domains * that already cover the entire range. * * In that case build_sched_domains() will have terminated the * iteration early and our sibling sd spans will be empty. * Domains should always include the CPU they're built on, so * check that.
*/ if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue;
/* * Usually we build sched_group by sibling's child sched_domain * But for machines whose NUMA diameter are 3 or above, we move * to build sched_group by sibling's proper descendant's child * domain because sibling's child sched_domain will span out of * the sched_domain being built as below. * * Smallest diameter=3 topology is: * * node 0 1 2 3 * 0: 10 20 30 40 * 1: 20 10 20 30 * 2: 30 20 10 20 * 3: 40 30 20 10 * * 0 --- 1 --- 2 --- 3 * * NUMA-3 0-3 N/A N/A 0-3 * groups: {0-2},{1-3} {1-3},{0-2} * * NUMA-2 0-2 0-3 0-3 1-3 * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} * * NUMA-1 0-1 0-2 1-3 2-3 * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} * * NUMA-0 0 1 2 3 * * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the * group span isn't a subset of the domain span.
*/ if (sibling->child &&
!cpumask_subset(sched_domain_span(sibling->child), span))
sibling = find_descended_sibling(sd, sibling);
sg = build_group_from_child_sched_domain(sibling, cpu); if (!sg) goto fail;
if (!first)
first = sg; if (last)
last->next = sg;
last = sg;
last->next = first;
}
sd->groups = first;
return 0;
fail:
free_sched_groups(first, 0);
return -ENOMEM;
}
/* * Package topology (also see the load-balance blurb in fair.c) * * The scheduler builds a tree structure to represent a number of important * topology features. By default (default_topology[]) these include: * * - Simultaneous multithreading (SMT) * - Multi-Core Cache (MC) * - Package (PKG) * * Where the last one more or less denotes everything up to a NUMA node. * * The tree consists of 3 primary data structures: * * sched_domain -> sched_group -> sched_group_capacity * ^ ^ ^ ^ * `-' `-' * * The sched_domains are per-CPU and have a two way link (parent & child) and * denote the ever growing mask of CPUs belonging to that level of topology. * * Each sched_domain has a circular (double) linked list of sched_group's, each * denoting the domains of the level below (or individual CPUs in case of the * first domain level). The sched_group linked by a sched_domain includes the * CPU of that sched_domain [*]. * * Take for instance a 2 threaded, 2 core, 2 cache cluster part: * * CPU 0 1 2 3 4 5 6 7 * * PKG [ ] * MC [ ] [ ] * SMT [ ] [ ] [ ] [ ] * * - or - * * PKG 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 * * CPU 0 1 2 3 4 5 6 7 * * One way to think about it is: sched_domain moves you up and down among these * topology levels, while sched_group moves you sideways through it, at child * domain granularity. * * sched_group_capacity ensures each unique sched_group has shared storage. * * There are two related construction problems, both require a CPU that * uniquely identify each group (for a given domain): * * - The first is the balance_cpu (see should_we_balance() and the * load-balance blurb in fair.c); for each group we only want 1 CPU to * continue balancing at a higher domain. * * - The second is the sched_group_capacity; we want all identical groups * to share a single sched_group_capacity. * * Since these topologies are exclusive by construction. 
That is, its * impossible for an SMT thread to belong to multiple cores, and cores to * be part of multiple caches. There is a very clear and unique location * for each CPU in the hierarchy. * * Therefore computing a unique CPU for each group is trivial (the iteration * mask is redundant and set all 1s; all CPUs in a group will end up at _that_ * group), we can simply pick the first CPU in each group. * * * [*] in other words, the first group of each domain is its child domain.
*/
/* * build_sched_groups will build a circular linked list of the groups * covered by the given span, will set each group's ->cpumask correctly, * and will initialize their ->sgc. * * Assumes the sched_domain tree is fully constructed
*/ staticint
build_sched_groups(struct sched_domain *sd, int cpu)
{ struct sched_group *first = NULL, *last = NULL; struct sd_data *sdd = sd->private; conststruct cpumask *span = sched_domain_span(sd); struct cpumask *covered; int i;
if (!first)
first = sg; if (last)
last->next = sg;
last = sg;
}
last->next = first;
sd->groups = first;
return 0;
}
/* * Initialize sched groups cpu_capacity. * * cpu_capacity indicates the capacity of sched group, which is used while * distributing the load between different sched groups in a sched domain. * Typically cpu_capacity for all the groups in a sched domain will be same * unless there are asymmetries in the topology. If there are asymmetries, * group having more cpu_capacity will pickup more load compared to the * group having less cpu_capacity.
*/ staticvoid init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{ struct sched_group *sg = sd->groups; struct cpumask *mask = sched_domains_tmpmask2;
/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
{ int asym_prefer_cpu = cpu; struct sched_domain *sd;
guard(rcu)();
for_each_domain(cpu, sd) { struct sched_group *sg; int group_cpu;
if (!(sd->flags & SD_ASYM_PACKING)) continue;
/* * Groups of overlapping domain are replicated per NUMA * node and will require updating "asym_prefer_cpu" on * each local copy. * * If you are hitting this warning, consider moving * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" * which is shared by all the overlapping groups.
*/
WARN_ON_ONCE(sd->flags & SD_NUMA);
sg = sd->groups; if (cpu != sg->asym_prefer_cpu) { /* * Since the parent is a superset of the current group, * if the cpu is not the "asym_prefer_cpu" at the * current level, it cannot be the preferred CPU at a * higher levels either.
*/ if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu)) return;
WRITE_ONCE(sg->asym_prefer_cpu, cpu); continue;
}
/* Ranking has improved; CPU is still the preferred one. */ if (new_prio >= old_prio) continue;
for_each_cpu(group_cpu, sched_group_span(sg)) { if (sched_asym_prefer(group_cpu, asym_prefer_cpu))
asym_prefer_cpu = group_cpu;
}
/* * Set of available CPUs grouped by their corresponding capacities * Each list entry contains a CPU mask reflecting CPUs that share the same * capacity. * The lifespan of data is unlimited.
*/
LIST_HEAD(asym_cap_list);
/* * Verify whether there is any CPU capacity asymmetry in a given sched domain. * Provides sd_flags reflecting the asymmetry scope.
*/ staticinlineint
asym_cpu_capacity_classify(conststruct cpumask *sd_span, conststruct cpumask *cpu_map)
{ struct asym_cap_data *entry; int count = 0, miss = 0;
/* * Count how many unique CPU capacities this domain spans across * (compare sched_domain CPUs mask with ones representing available * CPUs capacities). Take into account CPUs that might be offline: * skip those.
*/
list_for_each_entry(entry, &asym_cap_list, link) { if (cpumask_intersects(sd_span, cpu_capacity_span(entry)))
++count; elseif (cpumask_intersects(cpu_map, cpu_capacity_span(entry)))
++miss;
}
/* No asymmetry detected */ if (count < 2) return 0; /* Some of the available CPU capacity values have not been detected */ if (miss) return SD_ASYM_CPUCAPACITY;
/* Full asymmetry */ return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL;
/* * Search if capacity already exists. If not, track the entry * where we should insert to keep the list ordered descending.
*/
list_for_each_entry(entry, &asym_cap_list, link) { if (capacity == entry->capacity) goto done; elseif (!insert_entry && capacity > entry->capacity)
insert_entry = list_prev_entry(entry, link);
}
entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL); if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n")) return;
entry->capacity = capacity;
/* If NULL then the new capacity is the smallest, add last. */ if (!insert_entry)
list_add_tail_rcu(&entry->link, &asym_cap_list); else
list_add_rcu(&entry->link, &insert_entry->link);
done:
__cpumask_set_cpu(cpu, cpu_capacity_span(entry));
}
/* * Build-up/update list of CPUs grouped by their capacities * An update requires explicit request to rebuild sched domains * with state indicating CPU topology changes.
 *
 * NOTE(review): as seen here, the body only prunes asym_cap_list when it
 * holds a single (i.e. symmetric) entry; the per-CPU capacity scan implied
 * by the function name and by the unused 'next'/'cpu' locals is not visible
 * in this view — the function appears truncated; confirm against the full
 * file before relying on this.
*/ staticvoid asym_cpu_capacity_scan(void)
{ struct asym_cap_data *entry, *next; int cpu;
/* * Only one capacity value has been detected i.e. this system is symmetric. * No need to keep this data around.
*/ if (list_is_singular(&asym_cap_list)) {
entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
/* Unpublish the entry, then free it after an RCU grace period. */
list_del_rcu(&entry->link);
call_rcu(&entry->rcu, free_asym_cap_entry);
}
}
/* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
staticint default_relax_domain_level = -1; int sched_domain_level_max;
staticint __init setup_relax_domain_level(char *str)
{ if (kstrtoint(str, 0, &default_relax_domain_level))
pr_warn("Unable to set relax_domain_level\n");
if (__sdt_alloc(cpu_map)) return sa_sd_storage;
d->sd = alloc_percpu(struct sched_domain *); if (!d->sd) return sa_sd_storage;
d->rd = alloc_rootdomain(); if (!d->rd) return sa_sd;
return sa_rootdomain;
}
/* * NULL the sd_data elements we've used to build the sched_domain and * sched_group structure so that the subsequent __free_domain_allocs() * will not free the data we're using.
*/ staticvoid claim_allocations(int cpu, struct sched_domain *sd)
{ struct sd_data *sdd = sd->private;
int sched_max_numa_distance; staticint *sched_domains_numa_distance; staticstruct cpumask ***sched_domains_numa_masks; #endif/* CONFIG_NUMA */
/* * SD_flags allowed in topology descriptions. * * These flags are purely descriptive of the topology and do not prescribe * behaviour. Behaviour is artificial and mapped in the below sd_init() * function. For details, see include/linux/sched/sd_flags.h. * * SD_SHARE_CPUCAPACITY * SD_SHARE_LLC * SD_CLUSTER * SD_NUMA * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: * * SD_ASYM_PACKING - describes SMT quirks
*/ #define TOPOLOGY_SD_FLAGS \
(SD_SHARE_CPUCAPACITY | \
SD_CLUSTER | \
SD_SHARE_LLC | \
SD_NUMA | \
SD_ASYM_PACKING)
WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
(SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY), "CPU capacity asymmetry not supported on SMT\n");
/* * Convert topological properties into behaviour.
*/ /* Don't attempt to spread across CPUs of different capacities. */ if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
sd->child->flags &= ~SD_PREFER_SIBLING;
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->imbalance_pct = 110;
staticvoid sched_numa_warn(constchar *str)
{ staticint done = false; int i,j;
if (done) return;
done = true;
printk(KERN_WARNING "ERROR: %s\n\n", str);
for (i = 0; i < nr_node_ids; i++) {
printk(KERN_WARNING " "); for (j = 0; j < nr_node_ids; j++) { if (!node_state(i, N_CPU) || !node_state(j, N_CPU))
printk(KERN_CONT "(%02d) ", node_distance(i,j)); else
printk(KERN_CONT " %02d ", node_distance(i,j));
}
printk(KERN_CONT "\n");
}
printk(KERN_WARNING "\n");
}
bool find_numa_distance(int distance)
{ bool found = false; int i, *distances;
if (distance == node_distance(0, 0)) returntrue;
rcu_read_lock();
distances = rcu_dereference(sched_domains_numa_distance); if (!distances) goto unlock; for (i = 0; i < sched_domains_numa_levels; i++) { if (distances[i] == distance) {
found = true; break;
}
}
unlock:
rcu_read_unlock();
/* * A system can have three types of NUMA topology: * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes * NUMA_BACKPLANE: nodes can reach other nodes through a backplane * * The difference between a glueless mesh topology and a backplane * topology lies in whether communication between not directly * connected nodes goes through intermediary nodes (where programs * could run), or through backplane controllers. This affects * placement of programs. * * The type of topology can be discerned with the following tests: * - If the maximum distance between any nodes is 1 hop, the system * is directly connected. * - If for two nodes A and B, located N > 1 hops away from each other, * there is an intermediary node C, which is < N hops away from both * nodes A and B, the system is a glueless mesh.
*/ staticvoid init_numa_topology_type(int offline_node)
{ int a, b, c, n;
n = sched_max_numa_distance;
if (sched_domains_numa_levels <= 2) {
sched_numa_topology_type = NUMA_DIRECT; return;
}
for_each_cpu_node_but(a, offline_node) {
for_each_cpu_node_but(b, offline_node) { /* Find two nodes furthest removed from each other. */ if (node_distance(a, b) < n) continue;
/* Is there an intermediary node between a and b? */
for_each_cpu_node_but(c, offline_node) { if (node_distance(a, c) < n &&
node_distance(b, c) < n) {
sched_numa_topology_type =
NUMA_GLUELESS_MESH; return;
}
}
pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n");
sched_numa_topology_type = NUMA_DIRECT;
}
#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
void sched_init_numa(int offline_node)
{ struct sched_domain_topology_level *tl; unsignedlong *distance_map; int nr_levels = 0; int i, j; int *distances; struct cpumask ***masks;
/* * O(nr_nodes^2) de-duplicating selection sort -- in order to find the * unique distances in the node_distance() table.
*/
distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); if (!distance_map) return;
if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
sched_numa_warn("Invalid distance value range");
bitmap_free(distance_map); return;
}
bitmap_set(distance_map, distance, 1);
}
} /* * We can now figure out how many unique distance values there are and * allocate memory accordingly.
*/
nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
for (i = 0, j = 0; i < nr_levels; i++, j++) {
j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
distances[i] = j;
}
rcu_assign_pointer(sched_domains_numa_distance, distances);
bitmap_free(distance_map);
/* * 'nr_levels' contains the number of unique distances * * The sched_domains_numa_distance[] array includes the actual distance * numbers.
*/
/* * Here, we should temporarily reset sched_domains_numa_levels to 0. * If it fails to allocate memory for array sched_domains_numa_masks[][], * the array will contain less then 'nr_levels' members. This could be * dangerous when we use it to iterate array sched_domains_numa_masks[][] * in other functions. * * We reset it to 'nr_levels' at the end of this function.
*/
sched_domains_numa_levels = 0;
masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); if (!masks) return;
/* * Now for each level, construct a mask per node which contains all * CPUs of nodes that are that many hops away from us.
*/ for (i = 0; i < nr_levels; i++) {
masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); if (!masks[i]) return;
/*
 * Undo sched_init_numa(): unpublish the NUMA distance table and the
 * per-level node masks (freeing them after an RCU grace period), reset the
 * topology type/levels, and restore the saved topology table if any.
 */
static void sched_reset_numa(void)
{
	int nr_levels, *distances;
	struct cpumask ***masks;

	nr_levels = sched_domains_numa_levels;
	sched_domains_numa_levels = 0;
	sched_max_numa_distance = 0;
	sched_numa_topology_type = NUMA_DIRECT;

	/* Unpublish before freeing so RCU readers can't see stale data. */
	distances = sched_domains_numa_distance;
	rcu_assign_pointer(sched_domains_numa_distance, NULL);
	masks = sched_domains_numa_masks;
	rcu_assign_pointer(sched_domains_numa_masks, NULL);

	if (distances || masks) {
		int i, j;

		synchronize_rcu();
		kfree(distances);
		for (i = 0; i < nr_levels && masks; i++) {
			if (!masks[i])
				continue;
			for_each_node(j)
				kfree(masks[i][j]);
			kfree(masks[i]);
		}
		kfree(masks);
	}

	if (sched_domain_topology_saved) {
		kfree(sched_domain_topology);
		sched_domain_topology = sched_domain_topology_saved;
		sched_domain_topology_saved = NULL;
	}
}
/* * Call with hotplug lock held
*/ void sched_update_numa(int cpu, bool online)
{ int node;
node = cpu_to_node(cpu); /* * Scheduler NUMA topology is updated when the first CPU of a * node is onlined or the last CPU of a node is offlined.
*/ if (cpumask_weight(cpumask_of_node(node)) != 1) return;
void sched_domains_numa_masks_set(unsignedint cpu)
{ int node = cpu_to_node(cpu); int i, j;
for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { if (!node_state(j, N_CPU)) continue;
/* Set ourselves in the remote node's masks */ if (node_distance(j, node) <= sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}
}
}
void sched_domains_numa_masks_clear(unsignedint cpu)
{ int i, j;
for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { if (sched_domains_numa_masks[i][j])
cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
}
}
}
/* * sched_numa_find_closest() - given the NUMA topology, find the cpu * closest to @cpu from @cpumask. * cpumask: cpumask to find a cpu from * cpu: cpu to be close to * * returns: cpu, or nr_cpu_ids when nothing found.
*/ int sched_numa_find_closest(conststruct cpumask *cpus, int cpu)
{ int i, j = cpu_to_node(cpu), found = nr_cpu_ids; struct cpumask ***masks;
rcu_read_lock();
masks = rcu_dereference(sched_domains_numa_masks); if (!masks) goto unlock; for (i = 0; i < sched_domains_numa_levels; i++) { if (!masks[i][j]) break;
cpu = cpumask_any_and_distribute(cpus, masks[i][j]); if (cpu < nr_cpu_ids) {
found = cpu; break;
}
}
unlock:
rcu_read_unlock();
return found;
}
/*
 * bsearch() key/state for hop_cmp(), used by sched_numa_find_nth_cpu():
 * @cpus:  candidate CPUs being searched
 * @masks: snapshot of sched_domains_numa_masks
 * @node:  node CPUs are ordered by distance from
 * @cpu:   index of the CPU being looked for (Nth)
 * @w:     weight accumulated by hop_cmp() - presumably CPUs consumed by
 *         earlier hops; TODO confirm against hop_cmp() (not in this chunk)
 */
struct __cmp_key {
	const struct cpumask *cpus;
	struct cpumask ***masks;
	int node;
	int cpu;
	int w;
};
/** * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU * from @cpus to @cpu, taking into account distance * from a given @node. * @cpus: cpumask to find a cpu from * @cpu: CPU to start searching * @node: NUMA node to order CPUs by distance * * Return: cpu, or nr_cpu_ids when nothing found.
*/ int sched_numa_find_nth_cpu(conststruct cpumask *cpus, int cpu, int node)
{ struct __cmp_key k = { .cpus = cpus, .cpu = cpu }; struct cpumask ***hop_masks; int hop, ret = nr_cpu_ids;
if (node == NUMA_NO_NODE) return cpumask_nth_and(cpu, cpus, cpu_online_mask);
rcu_read_lock();
/* CPU-less node entries are uninitialized in sched_domains_numa_masks */
node = numa_nearest_node(node, N_CPU);
k.node = node;
k.masks = rcu_dereference(sched_domains_numa_masks); if (!k.masks) goto unlock;
hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp); if (!hop_masks) goto unlock;
hop = hop_masks - k.masks;
ret = hop ?
cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
cpumask_nth_and(cpu, cpus, k.masks[0][node]);
unlock:
rcu_read_unlock(); return ret;
}
EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
/** * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from * @node * @node: The node to count hops from. * @hops: Include CPUs up to that many hops away. 0 means local node. * * Return: On success, a pointer to a cpumask of CPUs at most @hops away from * @node, an error value otherwise. * * Requires rcu_lock to be held. Returned cpumask is only valid within that * read-side section, copy it if required beyond that. * * Note that not all hops are equal in distance; see sched_init_numa() for how * distances and masks are handled. * Also note that this is a reflection of sched_domains_numa_masks, which may change * during the lifetime of the system (offline nodes are taken out of the masks).
 */ conststruct cpumask *sched_numa_hop_mask(unsignedint node, unsignedint hops)
{ struct cpumask ***masks;
if (node >= nr_node_ids || hops >= sched_domains_numa_levels) return ERR_PTR(-EINVAL);
masks = rcu_dereference(sched_domains_numa_masks); if (!masks) return ERR_PTR(-EBUSY);
/*
 * NOTE(review): the chunk is spliced here. sched_numa_hop_mask() loses its
 * return statement and closing brace, and the lines below belong to the
 * tail of a DIFFERENT function (a sched-domain builder: it reads 'child',
 * 'sd' and 'attr', none of which are declared above). Both fragments are
 * incomplete; recover them from the original file.
 */
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
pr_err("BUG: arch topology borken\n");
pr_err("     the %s domain not a subset of the %s domain\n",
child->name, sd->name); /* Fixup, ensure @sd has at least @child CPUs. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
sched_domain_span(child));
}
}
set_domain_attribute(sd, attr);
return sd;
}
/* * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for * any two given CPUs on non-NUMA topology levels.
 */ staticbool topology_span_sane(conststruct cpumask *cpu_map)
/*
 * NOTE(review): 'staticbool' / 'conststruct' / 'returnfalse' below are
 * whitespace-mangled tokens from the extraction; they need spaces restored.
 * Also, 'covered' and 'id_seen' are declared but never assigned in the
 * visible text - their initialization was lost in the mangling.
 */
{ struct sched_domain_topology_level *tl; struct cpumask *covered, *id_seen; int cpu;
for_each_sd_topology(tl) { int tl_common_flags = 0;
if (tl->sd_flags)
tl_common_flags = (*tl->sd_flags)();
/* NUMA levels are allowed to overlap */ if (tl_common_flags & SD_NUMA) continue;
cpumask_clear(covered);
cpumask_clear(id_seen);
/* * Non-NUMA levels cannot partially overlap - they must be either * completely equal or completely disjoint. Otherwise we can end up * breaking the sched_group lists - i.e. a later get_group() pass * breaks the linking done for an earlier span.
 */
for_each_cpu(cpu, cpu_map) { conststruct cpumask *tl_cpu_mask = tl->mask(tl, cpu); int id;
/* lowest bit set in this mask is used as a unique id */
id = cpumask_first(tl_cpu_mask);
if (cpumask_test_cpu(id, id_seen)) { /* First CPU has already been seen, ensure identical spans */ if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask)) returnfalse;
} else { /* First CPU hasn't been seen before, ensure it's a completely new span */ if (cpumask_intersects(tl_cpu_mask, covered)) returnfalse;
/*
 * NOTE(review): truncated here. The else-branch that records the new id
 * and span, the loop closers, the success return and the closing brace
 * are all missing from this chunk.
 */
/* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs
 */ staticint
build_sched_domains(conststruct cpumask *cpu_map, struct sched_domain_attr *attr)
{ enum s_alloc alloc_state = sa_none; struct sched_domain *sd; struct s_data d; struct rq *rq = NULL; int i, ret = -ENOMEM; bool has_asym = false; bool has_cluster = false;
if (WARN_ON(cpumask_empty(cpu_map))) goto error;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error;
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl;
/*
 * NOTE(review): lines are missing here - 'tl' is never advanced and 'sd'
 * is used below without ever being assigned. The per-topology-level loop
 * that builds each domain was lost in extraction; recover it from the
 * original file.
 */
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break;
}
}
if (WARN_ON(!topology_span_sane(cpu_map))) goto error;
/* Build the groups for the domains */
for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd)); if (sd->flags & SD_NUMA) { if (build_overlap_sched_groups(sd, i)) goto error;
} else { if (build_sched_groups(sd, i)) goto error;
}
}
}
/* * Calculate an allowed NUMA imbalance such that LLCs do not get * imbalanced.
 */
for_each_cpu(i, cpu_map) { unsignedint imb = 0; unsignedint imb_span = 1;
/* * For a single LLC per node, allow an * imbalance up to 12.5% of the node. This is * arbitrary cutoff based two factors -- SMT and * memory channels. For SMT-2, the intent is to * avoid premature sharing of HT resources but * SMT-4 or SMT-8 *may* benefit from a different * cutoff. For memory channels, this is a very * rough estimate of how many channels may be * active and is based on recent CPUs with * many cores. * * For multiple LLCs, allow an imbalance * until multiple tasks would share an LLC * on one node while LLCs on another node * remain idle. This assumes that there are * enough logical CPUs per LLC to avoid SMT * factors and that there is a correlation * between LLCs and memory channels.
 */
/*
 * NOTE(review): more lines are missing here - 'sd', 'child', 'nr_llcs'
 * and 'top_p' are used below but never declared or initialized in the
 * visible text (the inner domain walk that sets them up was dropped).
 * The function is also truncated at the end: no error label, no return,
 * no closing brace (the source page hit its size limit).
 */
nr_llcs = sd->span_weight / child->span_weight; if (nr_llcs == 1)
imb = sd->span_weight >> 3; else
imb = nr_llcs;
imb = max(1U, imb);
sd->imb_numa_nr = imb;
/* Set span based on the first NUMA domain. */
top_p = sd->parent; while (top_p && !(top_p->flags & SD_NUMA)) {
top_p = top_p->parent;
}
imb_span = top_p ? top_p->span_weight : sd->span_weight;
} else { int factor = max(1U, (sd->span_weight / imb_span));
sd->imb_numa_nr = imb * factor;
}
}
}
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.24 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.