/*
 * For asym packing, by default the lower numbered CPU has higher priority.
 *
 * __weak default: presumably overridden by architectures with asymmetric
 * core priorities — only the relative ordering of returned values matters.
 */
int __weak arch_asym_cpu_priority(int cpu)
{ return -cpu;
}
/*
 * The margin used when comparing utilization with CPU capacity.
 *
 * 1280/1024 == 1.25, i.e. utilization "fits" only while it stays below
 * ~80% of the capacity, keeping ~20% headroom.
 *
 * (default: ~20%)
 */
#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
/*
 * The margin used when comparing CPU capacities.
 * is 'cap1' noticeably greater than 'cap2'
 *
 * 1078/1024 is ~1.05: cap1 must exceed cap2 by ~5% to count, which
 * filters out jitter-sized capacity differences.
 *
 * (default: ~5%)
 */
#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * (default: 5 msec, units: microseconds)
 */
static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
#ifdef CONFIG_NUMA_BALANCING
/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
#endif
/* * Increase the granularity value when there are more CPUs, * because with more CPUs the 'effective latency' as visible * to users decreases. But the relationship is not linear, * so pick a second-best guess by going with the log2 of the * number of CPUs. * * This idea comes from the SD scheduler of Con Kolivas:
*/ staticunsignedint get_update_sysctl_factor(void)
{ unsignedint cpus = min_t(unsignedint, num_online_cpus(), 8); unsignedint factor;
switch (sysctl_sched_tunable_scaling) { case SCHED_TUNABLESCALING_NONE:
factor = 1; break; case SCHED_TUNABLESCALING_LINEAR:
factor = cpus; break; case SCHED_TUNABLESCALING_LOG: default:
factor = 1 + ilog2(cpus); break;
}
/************************************************************** * CFS operations on generic schedulable entities:
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Walk up scheduling entities hierarchy (bottom-up towards the root tg). */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)
staticinlinebool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{ struct rq *rq = rq_of(cfs_rq); int cpu = cpu_of(rq);
if (cfs_rq->on_list) return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
cfs_rq->on_list = 1;
/* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is * enqueued. The fact that we always enqueue bottom-up * reduces this to two cases and a special case for the root * cfs_rq. Furthermore, it also means that we will always reset * tmp_alone_branch either when the branch is connected * to a tree or when we reach the top of the tree
*/ if (cfs_rq->tg->parent &&
cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { /* * If parent is already on the list, we add the child * just before. Thanks to circular linked property of * the list, this means to put the child at the tail * of the list that starts by parent.
*/
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); /* * The branch is now connected to its tree so we can * reset tmp_alone_branch to the beginning of the * list.
*/
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; returntrue;
}
if (!cfs_rq->tg->parent) { /* * cfs rq without parent should be put * at the tail of the list.
*/
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
&rq->leaf_cfs_rq_list); /* * We have reach the top of a tree so we can reset * tmp_alone_branch to the beginning of the list.
*/
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; returntrue;
}
/* * The parent has not already been added so we want to * make sure that it will be put after us. * tmp_alone_branch points to the begin of the branch * where we will add parent.
*/
list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch); /* * update tmp_alone_branch to points to the new begin * of the branch
*/
rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; returnfalse;
}
/* * With cfs_rq being unthrottled/throttled during an enqueue, * it can happen the tmp_alone_branch points to the leaf that * we finally want to delete. In this case, tmp_alone_branch moves * to the prev element but it will point to rq->leaf_cfs_rq_list * at the end of the enqueue.
*/ if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
/* Iterate through all leaf cfs_rq's on a runqueue (deletion-safe). */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
				 leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */ staticinlinestruct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{ if (se->cfs_rq == pse->cfs_rq) return se->cfs_rq;
/* * preemption test can be made between sibling entities who are in the * same cfs_rq i.e who have a common parent. Walk up the hierarchy of * both tasks until we find their ancestors who are siblings of common * parent.
*/
/* First walk up until both entities are at same depth */
se_depth = (*se)->depth;
pse_depth = (*pse)->depth;
while (se_depth > pse_depth) {
se_depth--;
*se = parent_entity(*se);
}
while (pse_depth > se_depth) {
pse_depth--;
*pse = parent_entity(*pse);
}
/* * Compute virtual time from the per-task service numbers: * * Fair schedulers conserve lag: * * \Sum lag_i = 0 * * Where lag_i is given by: * * lag_i = S - s_i = w_i * (V - v_i) * * Where S is the ideal service time and V is it's virtual time counterpart. * Therefore: * * \Sum lag_i = 0 * \Sum w_i * (V - v_i) = 0 * \Sum w_i * V - w_i * v_i = 0 * * From which we can solve an expression for V in v_i (which we have in * se->vruntime): * * \Sum v_i * w_i \Sum v_i * w_i * V = -------------- = -------------- * \Sum w_i W * * Specifically, this is the weighted average of all entity virtual runtimes. * * [[ NOTE: this is only equal to the ideal scheduler under the condition * that join/leave operations happen at lag_i = 0, otherwise the * virtual time has non-contiguous motion equivalent to: * * V +-= lag_i / W * * Also see the comment in place_entity() that deals with this. ]] * * However, since v_i is u64, and the multiplication could easily overflow * transform it into a relative form that uses smaller quantities: * * Substitute: v_i == (v_i - v0) + v0 * * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i * V = ---------------------------- = --------------------- + v0 * W W * * Which we track using: * * v0 := cfs_rq->min_vruntime * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime * \Sum w_i := cfs_rq->avg_load * * Since min_vruntime is a monotonic increasing variable that closely tracks * the per-task service, these deltas: (v_i - v), will be in the order of the * maximal (virtual) lag induced in the system due to quantisation. * * Also, we use scale_load_down() to reduce the size. * * As measured, the max (key * weight) value was ~44 bits for a kernel build.
*/ staticvoid
avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ unsignedlong weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
/* * Specifically: avg_runtime() + 0 must result in entity_eligible() := true * For this to be so, the result of this function must have a left bias.
*/
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{ struct sched_entity *curr = cfs_rq->curr;
s64 avg = cfs_rq->avg_vruntime; long load = cfs_rq->avg_load;
if (curr && curr->on_rq) { unsignedlong weight = scale_load_down(curr->load.weight);
/* * lag_i = S - s_i = w_i * (V - v_i) * * However, since V is approximated by the weighted average of all entities it * is possible -- by addition/removal/reweight to the tree -- to move V around * and end up with a larger lag than we started with. * * Limit this to either double the slice length with a minimum of TICK_NSEC * since that is the timing granularity. * * EEVDF gives the following limit for a steady state system: * * -r_max < lag < max(r_max, q) * * XXX could add max_slice to the augmented data to track this.
*/ staticvoid update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
s64 vlag, limit;
/* * Entity is eligible once it received less service than it ought to have, * eg. lag >= 0. * * lag_i = S - s_i = w_i*(V - v_i) * * lag_i >= 0 -> V >= v_i * * \Sum (v_i - v)*w_i * V = ------------------ + v * \Sum w_i * * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) * * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due * to the loss in precision caused by the division.
*/ staticint vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{ struct sched_entity *curr = cfs_rq->curr;
s64 avg = cfs_rq->avg_vruntime; long load = cfs_rq->avg_load;
if (curr && curr->on_rq) { unsignedlong weight = scale_load_down(curr->load.weight);
/* * Set the vruntime up to which an entity can run before looking * for another entity to pick. * In case of run to parity, we use the shortest slice of the enqueued * entities to set the protected period. * When run to parity is disabled, we give a minimum quantum to the running * entity to ensure progress.
*/ staticinlinevoid set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u64 slice = normalized_sysctl_sched_base_slice;
u64 vprot = se->deadline;
if (sched_feat(RUN_TO_PARITY))
slice = cfs_rq_min_slice(cfs_rq);
/* * Earliest Eligible Virtual Deadline First * * In order to provide latency guarantees for different request sizes * EEVDF selects the best runnable task from two criteria: * * 1) the task must be eligible (must be owed service) * * 2) from those tasks that meet 1), we select the one * with the earliest virtual deadline. * * We can do this in O(log n) time due to an augmented RB-tree. The * tree keeps the entries sorted on deadline, but also functions as a * heap based on the vruntime by keeping: * * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime) * * Which allows tree pruning through eligibility.
*/ staticstruct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
{ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; struct sched_entity *best = NULL;
/* * We can safely skip eligibility check if there is only one entity * in this cfs_rq, saving some cycles.
*/ if (cfs_rq->nr_queued == 1) return curr && curr->on_rq ? curr : se;
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
if (curr && protect && protect_slice(curr)) return curr;
/* Pick the leftmost entity if it's eligible */ if (se && entity_eligible(cfs_rq, se)) {
best = se; goto found;
}
/* Heap search for the EEVD entity */ while (node) { struct rb_node *left = node->rb_left;
/* * Eligible entities in left subtree are always better * choices, since they have earlier deadlines.
*/ if (left && vruntime_eligible(cfs_rq,
__node_2_se(left)->min_vruntime)) {
node = left; continue;
}
se = __node_2_se(node);
/* * The left subtree either is empty or has no eligible * entity, so check the current node since it is the one * with earliest deadline that might be eligible.
*/ if (entity_eligible(cfs_rq, se)) {
best = se; break;
}
node = node->rb_right;
}
found: if (!best || (curr && entity_before(curr, best)))
best = curr;
/* * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i * this is probably good enough.
*/ staticbool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ if ((s64)(se->vruntime - se->deadline) < 0) returnfalse;
/* * For EEVDF the virtual time slope is determined by w_i (iow. * nice) while the request time r_i is determined by * sysctl_sched_base_slice.
*/ if (!se->custom_slice)
se->slice = sysctl_sched_base_slice;
/* * The task has consumed its request, reschedule.
*/ returntrue;
}
#include"pelt.h"
staticint select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); staticunsignedlong task_h_load(struct task_struct *p); staticunsignedlong capacity_of(int cpu);
/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
{
	struct sched_avg *sa = &se->avg;

	memset(sa, 0, sizeof(*sa));

	/*
	 * Tasks are initialized with full load to be seen as heavy tasks until
	 * they get a chance to stabilize to their real load level.
	 * Group entities are initialized with zero load to reflect the fact that
	 * nothing has been attached to the task group yet.
	 */
	if (entity_is_task(se))
		sa->load_avg = scale_load_down(se->load.weight);

	/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
/* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: * * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1) * * se_weight(se) * * However, in many cases, the above util_avg does not give a desired * value. Moreover, the sum of the util_avgs may be divergent, such * as when the series is a harmonic series. * * To solve this problem, we also cap the util_avg of successive tasks to * only 1/2 of the left utilization budget: * * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n * * where n denotes the nth task and cpu_scale the CPU capacity. * * For example, for a CPU with 1024 of capacity, a simplest series from * the beginning would be like: * * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... * * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) * if util_avg > util_avg_cap.
*/ void post_init_entity_util_avg(struct task_struct *p)
{ struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
if (p->sched_class != &fair_sched_class) { /* * For !fair tasks do: * update_cfs_rq_load_avg(now, cfs_rq); attach_entity_load_avg(cfs_rq, se); switched_from_fair(rq, p); * * such that the next switched_to_fair() has the * expected state.
*/
se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); return;
}
if (cap > 0) { if (cfs_rq->avg.util_avg != 0) {
sa->util_avg = cfs_rq->avg.util_avg * se_weight(se);
sa->util_avg /= (cfs_rq->avg.load_avg + 1);
delta_exec = now - se->exec_start; if (unlikely(delta_exec <= 0)) return delta_exec;
se->exec_start = now; if (entity_is_task(se)) { struct task_struct *donor = task_of(se); struct task_struct *running = rq->curr; /* * If se is a task, we account the time against the running * task, as w/ proxy-exec they may not be the same.
*/
running->se.exec_start = now;
running->se.sum_exec_runtime += delta_exec;
/* cgroup time is always accounted against the donor */
cgroup_account_cputime(donor, delta_exec);
} else { /* If not task, account the time against donor se */
se->sum_exec_runtime += delta_exec;
}
if (schedstat_enabled()) { struct sched_statistics *stats;
/*
 * Used by other classes to account runtime.
 *
 * Returns the execution-time delta accounted to the donor entity;
 * values <= 0 mean no time has elapsed.
 */
s64 update_curr_common(struct rq *rq)
{ return update_se(rq, &rq->donor->se);
}
/* * Update the current task's runtime statistics.
*/ staticvoid update_curr(struct cfs_rq *cfs_rq)
{ /* * Note: cfs_rq->curr corresponds to the task picked to * run (ie: rq->donor.se) which due to proxy-exec may * not necessarily be the actual task running * (rq->curr.se). This is easy to confuse!
*/ struct sched_entity *curr = cfs_rq->curr; struct rq *rq = rq_of(cfs_rq);
s64 delta_exec; bool resched;
if (unlikely(!curr)) return;
delta_exec = update_se(rq, curr); if (unlikely(delta_exec <= 0)) return;
if (entity_is_task(curr)) { /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on * behalf of fair_server or not: * - If the task is running on behalf of fair_server, we need * to limit its time based on the assigned runtime. * - Fair task that runs outside of fair_server should account * against fair_server such that it can account for this time * and possibly avoid running this period.
*/ if (dl_server_active(&rq->fair_server))
dl_server_update(&rq->fair_server, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
if (cfs_rq->nr_queued == 1) return;
if (resched || !protect_slice(curr)) {
resched_curr_lazy(rq);
clear_buddies(cfs_rq, curr);
}
}
/* * When the sched_schedstat changes from 0 to 1, some sched se * maybe already in the runqueue, the se->statistics.wait_start * will be 0.So it will let the delta wrong. We need to avoid this * scenario.
*/ if (unlikely(!schedstat_val(stats->wait_start))) return;
/*
 * Task is being enqueued - update stats:
 */
static inline void
update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	if (!schedstat_enabled())
		return;

	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start_fair(cfs_rq, se);

	if (flags & ENQUEUE_WAKEUP)
		update_stats_enqueue_sleeper_fair(cfs_rq, se);
}
staticinlinevoid
update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
if (!schedstat_enabled()) return;
/* * Mark the end of the wait period if dequeueing a * waiting task:
*/ if (se != cfs_rq->curr)
update_stats_wait_end_fair(cfs_rq, se);
/* XXX racy against TTWU */
state = READ_ONCE(tsk->__state); if (state & TASK_INTERRUPTIBLE)
__schedstat_set(tsk->stats.sleep_start,
rq_clock(rq_of(cfs_rq))); if (state & TASK_UNINTERRUPTIBLE)
__schedstat_set(tsk->stats.block_start,
rq_clock(rq_of(cfs_rq)));
}
}
/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_clock_task(rq_of(cfs_rq));
}
/************************************************** * Scheduling class queueing methods:
*/
staticinlinebool is_core_idle(int cpu)
{ #ifdef CONFIG_SCHED_SMT int sibling;
for_each_cpu(sibling, cpu_smt_mask(cpu)) { if (cpu == sibling) continue;
if (!idle_cpu(sibling)) returnfalse;
} #endif
returntrue;
}
#ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2

static inline long
adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
{
	/*
	 * Allow a NUMA imbalance if busy CPUs is less than the maximum
	 * threshold. Above this threshold, individual tasks may be contending
	 * for both memory bandwidth and any shared HT resources. This is an
	 * approximation as the number of running tasks may not be related to
	 * the number of busy CPUs due to sched_setaffinity.
	 */
	if (dst_running > imb_numa_nr)
		return imbalance;

	/*
	 * Allow a small imbalance based on a simple pair of communicating
	 * tasks that remain local when the destination is lightly loaded.
	 */
	if (imbalance <= NUMA_IMBALANCE_MIN)
		return 0;

	return imbalance;
}
#endif /* CONFIG_NUMA */
#ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the tasks virtual memory size and
 * numa_balancing_scan_size.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

/* The page with hint page fault latency < threshold in ms is considered hot */
unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
/* Shared fault statistics for a group of tasks accessing the same memory. */
struct numa_group {
	refcount_t refcount;

	spinlock_t lock; /* nr_tasks, tasks */
	int nr_tasks;
	pid_t gid;
	int active_nodes;

	struct rcu_head rcu;
	unsigned long total_faults;
	unsigned long max_faults_cpu;
	/*
	 * faults[] array is split into two regions: faults_mem and faults_cpu.
	 *
	 * Faults_cpu is used to decide whether memory should move
	 * towards the CPU. As a consequence, these stats are weighted
	 * more by CPU use than by memory faults.
	 */
	unsigned long faults[];
};
/* * For functions that can be called in multiple contexts that permit reading * ->numa_group (see struct task_struct for locking rules).
*/ staticstruct numa_group *deref_task_numa_group(struct task_struct *p)
{ return rcu_dereference_check(p->numa_group, p == current ||
(lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
}
/* * Calculations based on RSS as non-present and empty pages are skipped * by the PTE scanner and NUMA hinting faults should be trapped based * on resident pages
*/
nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
rss = get_mm_rss(p->mm); if (!rss)
rss = nr_scan_pages;
/* Scale the maximum scan period with the amount of shared memory. */
rcu_read_lock();
ng = rcu_dereference(p->numa_group); if (ng) { unsignedlong shared = group_faults_shared(ng); unsignedlongprivate = group_faults_priv(ng);
period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
}
rcu_read_unlock();
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
/* Scale the maximum scan period with the amount of shared memory. */
ng = deref_curr_numa_group(p); if (ng) { unsignedlong shared = group_faults_shared(ng); unsignedlongprivate = group_faults_priv(ng); unsignedlong period = smax;
period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
rcu_read_lock();
ng = rcu_dereference(p->numa_group); if (ng)
gid = ng->gid;
rcu_read_unlock();
return gid;
}
/*
 * The averaged statistics, shared & private, memory & CPU,
 * occupy the first half of the array. The second half of the
 * array is for current counters, which are averaged into the
 * first set by task_numa_placement.
 */
static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}
staticinlineunsignedlong task_faults(struct task_struct *p, int nid)
{ if (!p->numa_faults) return 0;
/*
 * A node triggering more than 1/3 as many NUMA faults as the maximum is
 * considered part of a numa group's pseudo-interleaving set. Migrations
 * between these nodes are slowed down, to allow things to settle down.
 */
#define ACTIVE_NODE_FRACTION 3
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
					int lim_dist, bool task)
{
	unsigned long score = 0;
	int node, max_dist;

	/*
	 * All nodes are directly connected, and the same distance
	 * from each other. No need for fancy placement algorithms.
	 */
	if (sched_numa_topology_type == NUMA_DIRECT)
		return 0;

	/* sched_max_numa_distance may be changed in parallel. */
	max_dist = READ_ONCE(sched_max_numa_distance);
	/*
	 * This code is called for each node, introducing N^2 complexity,
	 * which should be OK given the number of nodes rarely exceeds 8.
	 */
	for_each_online_node(node) {
		unsigned long faults;
		int dist = node_distance(nid, node);

		/*
		 * The furthest away nodes in the system are not interesting
		 * for placement; nid was already counted.
		 */
		if (dist >= max_dist || node == nid)
			continue;

		/*
		 * On systems with a backplane NUMA topology, compare groups
		 * of nodes, and move tasks towards the group with the most
		 * memory accesses. When comparing two nodes at distance
		 * "hoplimit", only nodes closer by than "hoplimit" are part
		 * of each group. Skip other nodes.
		 */
		if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
			continue;

		/* Add up the faults from nearby nodes. */
		if (task)
			faults = task_faults(p, node);
		else
			faults = group_faults(p, node);

		/*
		 * On systems with a glueless mesh NUMA topology, there are
		 * no fixed "groups of nodes". Instead, nodes that are not
		 * directly connected bounce traffic through intermediate
		 * nodes; a numa_group can occupy any set of nodes.
		 * The further away a node is, the less the faults count.
		 * This seems to result in good task placement.
		 */
		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
			faults *= (max_dist - dist);
			faults /= (max_dist - LOCAL_DISTANCE);
		}

		score += faults;
	}

	return score;
}
/* * These return the fraction of accesses done by a particular task, or * task group, on a particular numa node. The group weight is given a * larger multiplier, in order to group tasks together that are almost * evenly spread out between numa nodes.
*/ staticinlineunsignedlong task_weight(struct task_struct *p, int nid, int dist)
{ unsignedlong faults, total_faults;
/* * If memory tiering mode is enabled, cpupid of slow memory page is * used to record scan time instead of CPU and PID. When tiering mode * is disabled at run time, the scan time (in cpupid) will be * interpreted as CPU and PID. So CPU needs to be checked to avoid to * access out of array bound.
*/ staticinlinebool cpupid_valid(int cpupid)
{ return cpupid_to_cpu(cpupid) < nr_cpu_ids;
}
/* * For memory tiering mode, if there are enough free pages (more than * enough watermark defined here) in fast memory node, to take full * advantage of fast memory capacity, all recently accessed slow * memory pages will be migrated to fast memory node without * considering hot threshold.
*/ staticbool pgdat_free_space_enough(struct pglist_data *pgdat)
{ int z; unsignedlong enough_wmark;
enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
pgdat->node_present_pages >> 4); for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z;
/* * For memory tiering mode, when page tables are scanned, the scan * time will be recorded in struct page in addition to make page * PROT_NONE for slow memory page. So when the page is accessed, in * hint page fault handler, the hint page fault latency is calculated * via, * * hint page fault latency = hint page fault time - scan time * * The smaller the hint page fault latency, the higher the possibility * for the page to be hot.
*/ staticint numa_hint_fault_latency(struct folio *folio)
{ int last_time, time;
time = jiffies_to_msecs(jiffies);
last_time = folio_xchg_access_time(folio, time);
/* * For memory tiering mode, too high promotion/demotion throughput may * hurt application latency. So we provide a mechanism to rate limit * the number of pages that are tried to be promoted.
*/ staticbool numa_promotion_rate_limit(struct pglist_data *pgdat, unsignedlong rate_limit, int nr)
{ unsignedlong nr_cand; unsignedint now, start;
bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, int src_nid, int dst_cpu)
{ struct numa_group *ng = deref_curr_numa_group(p); int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid;
/* * Cannot migrate to memoryless nodes.
*/ if (!node_state(dst_nid, N_MEMORY)) returnfalse;
/* * The pages in slow memory node should be migrated according * to hot/cold instead of private/shared.
*/ if (folio_use_access_time(folio)) { struct pglist_data *pgdat; unsignedlong rate_limit; unsignedint latency, th, def_th;
pgdat = NODE_DATA(dst_nid); if (pgdat_free_space_enough(pgdat)) { /* workload changed, reset hot threshold */
pgdat->nbp_threshold = 0; returntrue;
}
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
!node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) returnfalse;
/* * Allow first faults or private faults to migrate immediately early in * the lifetime of a task. The magic number 4 is based on waiting for * two full passes of the "multi-stage node selection" test that is * executed below.
*/ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) returntrue;
/* * Multi-stage node selection is used in conjunction with a periodic * migration fault to build a temporal task<->page relation. By using * a two-stage filter we remove short/unlikely relations. * * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate * a task's usage of a particular page (n_p) per total usage of this * page (n_t) (in a given time-span) to a probability. * * Our periodic faults will sample this probability and getting the * same result twice in a row, given these samples are fully * independent, is then given by P(n)^2, provided our sample period * is sufficiently short compared to the usage pattern. * * This quadric squishes small probabilities, making it less likely we * act on an unlikely task<->page relation.
*/ if (!cpupid_pid_unset(last_cpupid) &&
cpupid_to_nid(last_cpupid) != dst_nid) returnfalse;
/* Always allow migrate on private faults */ if (cpupid_match_pid(p, last_cpupid)) returntrue;
/* A shared fault, but p->numa_group has not been set up yet. */ if (!ng) returntrue;
/* * Destination node is much more heavily used than the source * node? Allow migration.
*/ if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
ACTIVE_NODE_FRACTION) returntrue;
/* * Distribute memory according to CPU & memory use on each node, * with 3/4 hysteresis to avoid unnecessary memory migrations: * * faults_cpu(dst) 3 faults_cpu(src) * --------------- * - > --------------- * faults_mem(dst) 4 faults_mem(src)
*/ return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
/*
 * 'numa_type' describes the node at the moment of load balancing.
 */
enum numa_type {
	/* The node has spare capacity that can be used to run more tasks. */
	node_has_spare = 0,
	/*
	 * The node is fully used and the tasks don't compete for more CPU
	 * cycles. Nevertheless, some tasks might wait before running.
	 */
	node_fully_busy,
	/*
	 * The node is overloaded and can't provide expected CPU cycles to all
	 * tasks.
	 */
	node_overloaded
};
/* Cached statistics for all CPUs within a node */
struct numa_stats {
	unsigned long load;
	unsigned long runnable;
	unsigned long util;
	/* Total compute capacity of CPUs on a node */
	unsigned long compute_capacity;
	unsigned int nr_running;
	unsigned int weight;
	enum numa_type node_type;
	int idle_cpu;
};
/* Working state for one NUMA-balancing placement decision. */
struct task_numa_env {
	struct task_struct *p;		/* task being placed */

	int src_cpu, src_nid;		/* source of the move */
	int dst_cpu, dst_nid;		/* candidate destination */
	int imb_numa_nr;		/* allowed NUMA imbalance */

	struct numa_stats src_stats, dst_stats;

	int imbalance_pct;
	int dist;			/* node distance src<->dst */

	struct task_struct *best_task;	/* best swap candidate so far */
	long best_imp;			/* improvement of best candidate */
	int best_cpu;
};
/* * Gather all necessary information to make NUMA balancing placement * decisions that are compatible with standard load balancer. This * borrows code and logic from update_sg_lb_stats but sharing a * common implementation is impractical.
*/ staticvoid update_numa_stats(struct task_numa_env *env, struct numa_stats *ns, int nid, bool find_idle)
{ int cpu, idle_core = -1;
/* Check if run-queue part of active NUMA balance. */ if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) { int cpu; int start = env->dst_cpu;
/* Failed to find an alternative idle CPU */ return;
}
assign: /* * Clear previous best_cpu/rq numa-migrate flag, since task now * found a better CPU to move/swap.
*/ if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
rq = cpu_rq(env->best_cpu);
WRITE_ONCE(rq->numa_migrate_on, 0);
}
if (env->best_task)
put_task_struct(env->best_task); if (p)
get_task_struct(p);
staticbool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env)
{ long imb, old_imb; long orig_src_load, orig_dst_load; long src_capacity, dst_capacity;
/* * The load is corrected for the CPU capacity available on each node. * * src_load dst_load * ------------ vs --------- * src_capacity dst_capacity
*/
src_capacity = env->src_stats.compute_capacity;
dst_capacity = env->dst_stats.compute_capacity;
/* Would this change make things worse? */ return (imb > old_imb);
}
/*
 * NUMA importance scores run up to 1998 (2 * 999). SMALLIMP is roughly
 * 1998/64; importance differentials below it are treated as noise so that
 * tasks are not migrated for negligible gain.
 */
#define SMALLIMP	30
/*
 * This checks if the overall compute and NUMA accesses of the system would
 * be improved if the source task was migrated to the target dst_cpu, taking
 * into account that it might be best if the task running on the dst_cpu
 * should be exchanged with the source task.
 *
 * Runs under RCU to safely inspect dst_rq->curr. On a successful candidate,
 * records it via task_numa_assign(). Returns true when the caller should
 * stop scanning further candidate CPUs.
 *
 * NOTE(review): the extracted text called load_too_imbalanced(src_load,
 * dst_load, ...) without ever assigning src_load/dst_load (use of
 * uninitialized locals). The post-move load projection has been restored
 * just before that call.
 */
static bool task_numa_compare(struct task_numa_env *env,
			      long taskimp, long groupimp, bool maymove)
{
	struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
	struct rq *dst_rq = cpu_rq(env->dst_cpu);
	long imp = p_ng ? groupimp : taskimp;
	struct task_struct *cur;
	long src_load, dst_load;
	int dist = env->dist;
	long moveimp = imp;
	long load;
	bool stopsearch = false;

	/* The destination run-queue is already part of an active migration. */
	if (READ_ONCE(dst_rq->numa_migrate_on))
		return false;

	rcu_read_lock();
	cur = rcu_dereference(dst_rq->curr);
	/* Exiting tasks and kernel threads are not swap candidates. */
	if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) || !cur->mm))
		cur = NULL;

	/*
	 * Because we have preemption enabled we can get migrated around and
	 * end try selecting ourselves (current == env->p) as a swap candidate.
	 */
	if (cur == env->p) {
		stopsearch = true;
		goto unlock;
	}

	/* Destination CPU is idle: a plain move, no swap partner needed. */
	if (!cur) {
		if (maymove && moveimp >= env->best_imp)
			goto assign;
		else
			goto unlock;
	}

	/* Skip this swap candidate if cannot move to the source cpu. */
	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
		goto unlock;

	/*
	 * Skip this swap candidate if it is not moving to its preferred
	 * node and the best task is.
	 */
	if (env->best_task &&
	    env->best_task->numa_preferred_nid == env->src_nid &&
	    cur->numa_preferred_nid != env->src_nid) {
		goto unlock;
	}

	/*
	 * "imp" is the fault differential for the source task between the
	 * source and destination node. Calculate the total differential for
	 * the source task and potential destination task. The more negative
	 * the value is, the more remote accesses that would be expected to
	 * be incurred if the tasks were swapped.
	 *
	 * If dst and source tasks are in the same NUMA group, or not
	 * in any group then look only at task weights.
	 */
	cur_ng = rcu_dereference(cur->numa_group);
	if (cur_ng == p_ng) {
		/*
		 * Do not swap within a group or between tasks that have
		 * no group if there is spare capacity. Swapping does
		 * not address the load imbalance and helps one task at
		 * the cost of punishing another.
		 */
		if (env->dst_stats.node_type == node_has_spare)
			goto unlock;

		imp = taskimp + task_weight(cur, env->src_nid, dist) -
		      task_weight(cur, env->dst_nid, dist);
		/*
		 * Add some hysteresis to prevent swapping the
		 * tasks within a group over tiny differences.
		 */
		if (cur_ng)
			imp -= imp / 16;
	} else {
		/*
		 * Compare the group weights. If a task is all by itself
		 * (not part of a group), use the task weight instead.
		 */
		if (cur_ng && p_ng)
			imp += group_weight(cur, env->src_nid, dist) -
			       group_weight(cur, env->dst_nid, dist);
		else
			imp += task_weight(cur, env->src_nid, dist) -
			       task_weight(cur, env->dst_nid, dist);
	}

	/* Discourage picking a task already on its preferred node */
	if (cur->numa_preferred_nid == env->dst_nid)
		imp -= imp / 16;

	/*
	 * Encourage picking a task that moves to its preferred node.
	 * This potentially makes imp larger than it's maximum of
	 * 1998 (see SMALLIMP and task_weight for why) but in this
	 * case, it does not matter.
	 */
	if (cur->numa_preferred_nid == env->src_nid)
		imp += imp / 8;

	/* A plain move beats this swap candidate: record it as a move. */
	if (maymove && moveimp > imp && moveimp > env->best_imp) {
		imp = moveimp;
		cur = NULL;
		goto assign;
	}

	/*
	 * Prefer swapping with a task moving to its preferred node over a
	 * task that is not.
	 */
	if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
	    env->best_task->numa_preferred_nid != env->src_nid) {
		goto assign;
	}

	/*
	 * If the NUMA importance is less than SMALLIMP,
	 * task migration might only result in ping pong
	 * of tasks and also hurt performance due to cache
	 * misses.
	 */
	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
		goto unlock;

	/*
	 * In the overloaded case, try and keep the load balanced.
	 */
	load = task_h_load(env->p) - task_h_load(cur);
	if (!load)
		goto assign;

	/* Project the per-node loads as they would be after the swap. */
	dst_load = env->dst_stats.load + load;
	src_load = env->src_stats.load - load;

	if (load_too_imbalanced(src_load, dst_load, env))
		goto unlock;

assign:
	/* Evaluate an idle CPU for a task numa move. */
	if (!cur) {
		int cpu = env->dst_stats.idle_cpu;

		/* Nothing cached so current CPU went idle since the search. */
		if (cpu < 0)
			cpu = env->dst_cpu;

		/*
		 * If the CPU is no longer truly idle and the previous best CPU
		 * is, keep using it.
		 */
		if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
		    idle_cpu(env->best_cpu)) {
			cpu = env->best_cpu;
		}

		env->dst_cpu = cpu;
	}

	task_numa_assign(env, cur, imp);

	/*
	 * If a move to idle is allowed because there is capacity or load
	 * balance improves then stop the search. While a better swap
	 * candidate may exist, a search is not free.
	 */
	if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
		stopsearch = true;

	/*
	 * If a swap candidate must be identified and the current best task
	 * moves its preferred node then stop the search.
	 */
	if (!maymove && env->best_task &&
	    env->best_task->numa_preferred_nid == env->src_nid) {
		stopsearch = true;
	}

unlock:
	rcu_read_unlock();

	return stopsearch;
}
staticvoid task_numa_find_cpu(struct task_numa_env *env, long taskimp, long groupimp)
{ bool maymove = false; int cpu;
/* * If dst node has spare capacity, then check if there is an * imbalance that would be overruled by the load balancer.
*/ if (env->dst_stats.node_type == node_has_spare) { unsignedint imbalance; int src_running, dst_running;
/* * Would movement cause an imbalance? Note that if src has * more running tasks that the imbalance is ignored as the * move improves the imbalance from the perspective of the * CPU load balancer.
* */
src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
imbalance = adjust_numa_imbalance(imbalance, dst_running,
env->imb_numa_nr);
/* Use idle CPU if there is no imbalance */ if (!imbalance) {
maymove = true; if (env->dst_stats.idle_cpu >= 0) {
env->dst_cpu = env->dst_stats.idle_cpu;
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Diese beiden folgenden Angebotsgruppen bietet das Unternehmen0.34Angebot
Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.