/* * For asym packing, by default the lower numbered CPU has higher priority.
*/ int __weak arch_asym_cpu_priority(int cpu)
{ return -cpu;
}
/* * The margin used when comparing utilization with CPU capacity. * * (default: ~20%)
*/ #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
/* * The margin used when comparing CPU capacities. * is 'cap1' noticeably greater than 'cap2' * * (default: ~5%)
*/ #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
#ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool * each time a cfs_rq requests quota. * * Note: in the case that the slice exceeds the runtime remaining (either due * to consumption or the quota being specified to be smaller than the slice) * we will always only issue the remaining available time. * * (default: 5 msec, units: microseconds)
*/ staticunsignedint sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif
#ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ staticunsignedint sysctl_numa_balancing_promote_rate_limit = 65536; #endif
/* * Increase the granularity value when there are more CPUs, * because with more CPUs the 'effective latency' as visible * to users decreases. But the relationship is not linear, * so pick a second-best guess by going with the log2 of the * number of CPUs. * * This idea comes from the SD scheduler of Con Kolivas:
*/ staticunsignedint get_update_sysctl_factor(void)
{ unsignedint cpus = min_t(unsignedint, num_online_cpus(), 8); unsignedint factor;
switch (sysctl_sched_tunable_scaling) { case SCHED_TUNABLESCALING_NONE:
factor = 1; break; case SCHED_TUNABLESCALING_LINEAR:
factor = cpus; break; case SCHED_TUNABLESCALING_LOG: default:
factor = 1 + ilog2(cpus); break;
}
/************************************************************** * CFS operations on generic schedulable entities:
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent)
staticinlinebool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{ struct rq *rq = rq_of(cfs_rq); int cpu = cpu_of(rq);
if (cfs_rq->on_list) return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
cfs_rq->on_list = 1;
/* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is * enqueued. The fact that we always enqueue bottom-up * reduces this to two cases and a special case for the root * cfs_rq. Furthermore, it also means that we will always reset * tmp_alone_branch either when the branch is connected * to a tree or when we reach the top of the tree
*/ if (cfs_rq->tg->parent &&
cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { /* * If parent is already on the list, we add the child * just before. Thanks to circular linked property of * the list, this means to put the child at the tail * of the list that starts by parent.
*/
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); /* * The branch is now connected to its tree so we can * reset tmp_alone_branch to the beginning of the * list.
*/
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; returntrue;
}
if (!cfs_rq->tg->parent) { /* * cfs rq without parent should be put * at the tail of the list.
*/
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
&rq->leaf_cfs_rq_list); /* * We have reach the top of a tree so we can reset * tmp_alone_branch to the beginning of the list.
*/
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; returntrue;
}
/* * The parent has not already been added so we want to * make sure that it will be put after us. * tmp_alone_branch points to the begin of the branch * where we will add parent.
*/
list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch); /* * update tmp_alone_branch to points to the new begin * of the branch
*/
rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; returnfalse;
}
/* * With cfs_rq being unthrottled/throttled during an enqueue, * it can happen the tmp_alone_branch points to the leaf that * we finally want to delete. In this case, tmp_alone_branch moves * to the prev element but it will point to rq->leaf_cfs_rq_list * at the end of the enqueue.
*/ if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
/* Iterate through all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */ staticinlinestruct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{ if (se->cfs_rq == pse->cfs_rq) return se->cfs_rq;
/* * preemption test can be made between sibling entities who are in the * same cfs_rq i.e who have a common parent. Walk up the hierarchy of * both tasks until we find their ancestors who are siblings of common * parent.
*/
/* First walk up until both entities are at same depth */
se_depth = (*se)->depth;
pse_depth = (*pse)->depth;
while (se_depth > pse_depth) {
se_depth--;
*se = parent_entity(*se);
}
while (pse_depth > se_depth) {
pse_depth--;
*pse = parent_entity(*pse);
}
/* * Compute virtual time from the per-task service numbers: * * Fair schedulers conserve lag: * * \Sum lag_i = 0 * * Where lag_i is given by: * * lag_i = S - s_i = w_i * (V - v_i) * * Where S is the ideal service time and V is it's virtual time counterpart. * Therefore: * * \Sum lag_i = 0 * \Sum w_i * (V - v_i) = 0 * \Sum w_i * V - w_i * v_i = 0 * * From which we can solve an expression for V in v_i (which we have in * se->vruntime): * * \Sum v_i * w_i \Sum v_i * w_i * V = -------------- = -------------- * \Sum w_i W * * Specifically, this is the weighted average of all entity virtual runtimes. * * [[ NOTE: this is only equal to the ideal scheduler under the condition * that join/leave operations happen at lag_i = 0, otherwise the * virtual time has non-contiguous motion equivalent to: * * V +-= lag_i / W * * Also see the comment in place_entity() that deals with this. ]] * * However, since v_i is u64, and the multiplication could easily overflow * transform it into a relative form that uses smaller quantities: * * Substitute: v_i == (v_i - v0) + v0 * * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i * V = ---------------------------- = --------------------- + v0 * W W * * Which we track using: * * v0 := cfs_rq->min_vruntime * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime * \Sum w_i := cfs_rq->avg_load * * Since min_vruntime is a monotonic increasing variable that closely tracks * the per-task service, these deltas: (v_i - v), will be in the order of the * maximal (virtual) lag induced in the system due to quantisation. * * Also, we use scale_load_down() to reduce the size. * * As measured, the max (key * weight) value was ~44 bits for a kernel build.
*/ staticvoid
avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ unsignedlong weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
/* * Specifically: avg_runtime() + 0 must result in entity_eligible() := true * For this to be so, the result of this function must have a left bias.
*/
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{ struct sched_entity *curr = cfs_rq->curr;
s64 avg = cfs_rq->avg_vruntime; long load = cfs_rq->avg_load;
if (curr && curr->on_rq) { unsignedlong weight = scale_load_down(curr->load.weight);
/* * lag_i = S - s_i = w_i * (V - v_i) * * However, since V is approximated by the weighted average of all entities it * is possible -- by addition/removal/reweight to the tree -- to move V around * and end up with a larger lag than we started with. * * Limit this to either double the slice length with a minimum of TICK_NSEC * since that is the timing granularity. * * EEVDF gives the following limit for a steady state system: * * -r_max < lag < max(r_max, q) * * XXX could add max_slice to the augmented data to track this.
*/ staticvoid update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
s64 vlag, limit;
/* * Entity is eligible once it received less service than it ought to have, * eg. lag >= 0. * * lag_i = S - s_i = w_i*(V - v_i) * * lag_i >= 0 -> V >= v_i * * \Sum (v_i - v)*w_i * V = ------------------ + v * \Sum w_i * * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) * * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due * to the loss in precision caused by the division.
*/ staticint vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{ struct sched_entity *curr = cfs_rq->curr;
s64 avg = cfs_rq->avg_vruntime; long load = cfs_rq->avg_load;
if (curr && curr->on_rq) { unsignedlong weight = scale_load_down(curr->load.weight);
/* * Set the vruntime up to which an entity can run before looking * for another entity to pick. * In case of run to parity, we use the shortest slice of the enqueued * entities to set the protected period. * When run to parity is disabled, we give a minimum quantum to the running * entity to ensure progress.
*/ staticinlinevoid set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u64 slice = normalized_sysctl_sched_base_slice;
u64 vprot = se->deadline;
if (sched_feat(RUN_TO_PARITY))
slice = cfs_rq_min_slice(cfs_rq);
/* * Earliest Eligible Virtual Deadline First * * In order to provide latency guarantees for different request sizes * EEVDF selects the best runnable task from two criteria: * * 1) the task must be eligible (must be owed service) * * 2) from those tasks that meet 1), we select the one * with the earliest virtual deadline. * * We can do this in O(log n) time due to an augmented RB-tree. The * tree keeps the entries sorted on deadline, but also functions as a * heap based on the vruntime by keeping: * * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime) * * Which allows tree pruning through eligibility.
*/ staticstruct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
{ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; struct sched_entity *best = NULL;
/* * We can safely skip eligibility check if there is only one entity * in this cfs_rq, saving some cycles.
*/ if (cfs_rq->nr_queued == 1) return curr && curr->on_rq ? curr : se;
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
if (curr && protect && protect_slice(curr)) return curr;
/* Pick the leftmost entity if it's eligible */ if (se && entity_eligible(cfs_rq, se)) {
best = se; goto found;
}
/* Heap search for the EEVD entity */ while (node) { struct rb_node *left = node->rb_left;
/* * Eligible entities in left subtree are always better * choices, since they have earlier deadlines.
*/ if (left && vruntime_eligible(cfs_rq,
__node_2_se(left)->min_vruntime)) {
node = left; continue;
}
se = __node_2_se(node);
/* * The left subtree either is empty or has no eligible * entity, so check the current node since it is the one * with earliest deadline that might be eligible.
*/ if (entity_eligible(cfs_rq, se)) {
best = se; break;
}
node = node->rb_right;
}
found: if (!best || (curr && entity_before(curr, best)))
best = curr;
/* * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i * this is probably good enough.
*/ staticbool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ if ((s64)(se->vruntime - se->deadline) < 0) returnfalse;
/* * For EEVDF the virtual time slope is determined by w_i (iow. * nice) while the request time r_i is determined by * sysctl_sched_base_slice.
*/ if (!se->custom_slice)
se->slice = sysctl_sched_base_slice;
/* * The task has consumed its request, reschedule.
*/ returntrue;
}
#include"pelt.h"
staticint select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); staticunsignedlong task_h_load(struct task_struct *p); staticunsignedlong capacity_of(int cpu);
/* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se)
{ struct sched_avg *sa = &se->avg;
memset(sa, 0, sizeof(*sa));
/* * Tasks are initialized with full load to be seen as heavy tasks until * they get a chance to stabilize to their real load level. * Group entities are initialized with zero load to reflect the fact that * nothing has been attached to the task group yet.
*/ if (entity_is_task(se))
sa->load_avg = scale_load_down(se->load.weight);
/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
/* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: * * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1) * * se_weight(se) * * However, in many cases, the above util_avg does not give a desired * value. Moreover, the sum of the util_avgs may be divergent, such * as when the series is a harmonic series. * * To solve this problem, we also cap the util_avg of successive tasks to * only 1/2 of the left utilization budget: * * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n * * where n denotes the nth task and cpu_scale the CPU capacity. * * For example, for a CPU with 1024 of capacity, a simplest series from * the beginning would be like: * * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... * * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) * if util_avg > util_avg_cap.
*/ void post_init_entity_util_avg(struct task_struct *p)
{ struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
if (p->sched_class != &fair_sched_class) { /* * For !fair tasks do: * update_cfs_rq_load_avg(now, cfs_rq); attach_entity_load_avg(cfs_rq, se); switched_from_fair(rq, p); * * such that the next switched_to_fair() has the * expected state.
*/
se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); return;
}
if (cap > 0) { if (cfs_rq->avg.util_avg != 0) {
sa->util_avg = cfs_rq->avg.util_avg * se_weight(se);
sa->util_avg /= (cfs_rq->avg.load_avg + 1);
delta_exec = now - se->exec_start; if (unlikely(delta_exec <= 0)) return delta_exec;
se->exec_start = now; if (entity_is_task(se)) { struct task_struct *donor = task_of(se); struct task_struct *running = rq->curr; /* * If se is a task, we account the time against the running * task, as w/ proxy-exec they may not be the same.
*/
running->se.exec_start = now;
running->se.sum_exec_runtime += delta_exec;
/* cgroup time is always accounted against the donor */
cgroup_account_cputime(donor, delta_exec);
} else { /* If not task, account the time against donor se */
se->sum_exec_runtime += delta_exec;
}
if (schedstat_enabled()) { struct sched_statistics *stats;
/* * Used by other classes to account runtime.
*/
s64 update_curr_common(struct rq *rq)
{ return update_se(rq, &rq->donor->se);
}
/* * Update the current task's runtime statistics.
*/ staticvoid update_curr(struct cfs_rq *cfs_rq)
{ /* * Note: cfs_rq->curr corresponds to the task picked to * run (ie: rq->donor.se) which due to proxy-exec may * not necessarily be the actual task running * (rq->curr.se). This is easy to confuse!
*/ struct sched_entity *curr = cfs_rq->curr; struct rq *rq = rq_of(cfs_rq);
s64 delta_exec; bool resched;
if (unlikely(!curr)) return;
delta_exec = update_se(rq, curr); if (unlikely(delta_exec <= 0)) return;
if (entity_is_task(curr)) { /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on * behalf of fair_server or not: * - If the task is running on behalf of fair_server, we need * to limit its time based on the assigned runtime. * - Fair task that runs outside of fair_server should account * against fair_server such that it can account for this time * and possibly avoid running this period.
*/ if (dl_server_active(&rq->fair_server))
dl_server_update(&rq->fair_server, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
if (cfs_rq->nr_queued == 1) return;
if (resched || !protect_slice(curr)) {
resched_curr_lazy(rq);
clear_buddies(cfs_rq, curr);
}
}
/* * When the sched_schedstat changes from 0 to 1, some sched se * maybe already in the runqueue, the se->statistics.wait_start * will be 0.So it will let the delta wrong. We need to avoid this * scenario.
*/ if (unlikely(!schedstat_val(stats->wait_start))) return;
/* * Task is being enqueued - update stats:
*/ staticinlinevoid
update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{ if (!schedstat_enabled()) return;
/* * Are we enqueueing a waiting task? (for current tasks * a dequeue/enqueue event is a NOP)
*/ if (se != cfs_rq->curr)
update_stats_wait_start_fair(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
update_stats_enqueue_sleeper_fair(cfs_rq, se);
}
staticinlinevoid
update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
if (!schedstat_enabled()) return;
/* * Mark the end of the wait period if dequeueing a * waiting task:
*/ if (se != cfs_rq->curr)
update_stats_wait_end_fair(cfs_rq, se);
/* XXX racy against TTWU */
state = READ_ONCE(tsk->__state); if (state & TASK_INTERRUPTIBLE)
__schedstat_set(tsk->stats.sleep_start,
rq_clock(rq_of(cfs_rq))); if (state & TASK_UNINTERRUPTIBLE)
__schedstat_set(tsk->stats.block_start,
rq_clock(rq_of(cfs_rq)));
}
}
/* * We are picking a new current task - update its stats:
*/ staticinlinevoid
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ /* * We are starting a new run period:
*/
se->exec_start = rq_clock_task(rq_of(cfs_rq));
}
/************************************************** * Scheduling class queueing methods:
*/
staticinlinebool is_core_idle(int cpu)
{ #ifdef CONFIG_SCHED_SMT int sibling;
for_each_cpu(sibling, cpu_smt_mask(cpu)) { if (cpu == sibling) continue;
if (!idle_cpu(sibling)) returnfalse;
} #endif
returntrue;
}
#ifdef CONFIG_NUMA #define NUMA_IMBALANCE_MIN 2
staticinlinelong
adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
{ /* * Allow a NUMA imbalance if busy CPUs is less than the maximum * threshold. Above this threshold, individual tasks may be contending * for both memory bandwidth and any shared HT resources. This is an * approximation as the number of running tasks may not be related to * the number of busy CPUs due to sched_setaffinity.
*/ if (dst_running > imb_numa_nr) return imbalance;
/* * Allow a small imbalance based on a simple pair of communicating * tasks that remain local when the destination is lightly loaded.
*/ if (imbalance <= NUMA_IMBALANCE_MIN) return 0;
return imbalance;
} #endif/* CONFIG_NUMA */
#ifdef CONFIG_NUMA_BALANCING /* * Approximate time to scan a full NUMA task in ms. The task scan period is * calculated based on the tasks virtual memory size and * numa_balancing_scan_size.
*/ unsignedint sysctl_numa_balancing_scan_period_min = 1000; unsignedint sysctl_numa_balancing_scan_period_max = 60000;
/* Portion of address space to scan in MB */ unsignedint sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsignedint sysctl_numa_balancing_scan_delay = 1000;
/* The page with hint page fault latency < threshold in ms is considered hot */ unsignedint sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
struct numa_group {
refcount_t refcount;
spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks;
pid_t gid; int active_nodes;
struct rcu_head rcu; unsignedlong total_faults; unsignedlong max_faults_cpu; /* * faults[] array is split into two regions: faults_mem and faults_cpu. * * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted * more by CPU use than by memory faults.
*/ unsignedlong faults[];
};
/* * For functions that can be called in multiple contexts that permit reading * ->numa_group (see struct task_struct for locking rules).
*/ staticstruct numa_group *deref_task_numa_group(struct task_struct *p)
{ return rcu_dereference_check(p->numa_group, p == current ||
(lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
}
/* * Calculations based on RSS as non-present and empty pages are skipped * by the PTE scanner and NUMA hinting faults should be trapped based * on resident pages
*/
nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
rss = get_mm_rss(p->mm); if (!rss)
rss = nr_scan_pages;
/* Scale the maximum scan period with the amount of shared memory. */
rcu_read_lock();
ng = rcu_dereference(p->numa_group); if (ng) { unsignedlong shared = group_faults_shared(ng); unsignedlongprivate = group_faults_priv(ng);
period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
}
rcu_read_unlock();
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
/* Scale the maximum scan period with the amount of shared memory. */
ng = deref_curr_numa_group(p); if (ng) { unsignedlong shared = group_faults_shared(ng); unsignedlongprivate = group_faults_priv(ng); unsignedlong period = smax;
period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
rcu_read_lock();
ng = rcu_dereference(p->numa_group); if (ng)
gid = ng->gid;
rcu_read_unlock();
return gid;
}
/* * The averaged statistics, shared & private, memory & CPU, * occupy the first half of the array. The second half of the * array is for current counters, which are averaged into the * first set by task_numa_placement.
*/ staticinlineint task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{ return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}
staticinlineunsignedlong task_faults(struct task_struct *p, int nid)
{ if (!p->numa_faults) return 0;
/* * A node triggering more than 1/3 as many NUMA faults as the maximum is * considered part of a numa group's pseudo-interleaving set. Migrations * between these nodes are slowed down, to allow things to settle down.
*/ #define ACTIVE_NODE_FRACTION 3
/* Handle placement on systems where not all nodes are directly connected. */ staticunsignedlong score_nearby_nodes(struct task_struct *p, int nid, int lim_dist, bool task)
{ unsignedlong score = 0; int node, max_dist;
/* * All nodes are directly connected, and the same distance * from each other. No need for fancy placement algorithms.
*/ if (sched_numa_topology_type == NUMA_DIRECT) return 0;
/* sched_max_numa_distance may be changed in parallel. */
max_dist = READ_ONCE(sched_max_numa_distance); /* * This code is called for each node, introducing N^2 complexity, * which should be OK given the number of nodes rarely exceeds 8.
*/
for_each_online_node(node) { unsignedlong faults; int dist = node_distance(nid, node);
/* * The furthest away nodes in the system are not interesting * for placement; nid was already counted.
*/ if (dist >= max_dist || node == nid) continue;
/* * On systems with a backplane NUMA topology, compare groups * of nodes, and move tasks towards the group with the most * memory accesses. When comparing two nodes at distance * "hoplimit", only nodes closer by than "hoplimit" are part * of each group. Skip other nodes.
*/ if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist) continue;
/* Add up the faults from nearby nodes. */ if (task)
faults = task_faults(p, node); else
faults = group_faults(p, node);
/* * On systems with a glueless mesh NUMA topology, there are * no fixed "groups of nodes". Instead, nodes that are not * directly connected bounce traffic through intermediate * nodes; a numa_group can occupy any set of nodes. * The further away a node is, the less the faults count. * This seems to result in good task placement.
*/ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
faults *= (max_dist - dist);
faults /= (max_dist - LOCAL_DISTANCE);
}
score += faults;
}
return score;
}
/* * These return the fraction of accesses done by a particular task, or * task group, on a particular numa node. The group weight is given a * larger multiplier, in order to group tasks together that are almost * evenly spread out between numa nodes.
*/ staticinlineunsignedlong task_weight(struct task_struct *p, int nid, int dist)
{ unsignedlong faults, total_faults;
/* * If memory tiering mode is enabled, cpupid of slow memory page is * used to record scan time instead of CPU and PID. When tiering mode * is disabled at run time, the scan time (in cpupid) will be * interpreted as CPU and PID. So CPU needs to be checked to avoid to * access out of array bound.
*/ staticinlinebool cpupid_valid(int cpupid)
{ return cpupid_to_cpu(cpupid) < nr_cpu_ids;
}
/* * For memory tiering mode, if there are enough free pages (more than * enough watermark defined here) in fast memory node, to take full * advantage of fast memory capacity, all recently accessed slow * memory pages will be migrated to fast memory node without * considering hot threshold.
*/ staticbool pgdat_free_space_enough(struct pglist_data *pgdat)
{ int z; unsignedlong enough_wmark;
enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
pgdat->node_present_pages >> 4); for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z;
/* * For memory tiering mode, when page tables are scanned, the scan * time will be recorded in struct page in addition to make page * PROT_NONE for slow memory page. So when the page is accessed, in * hint page fault handler, the hint page fault latency is calculated * via, * * hint page fault latency = hint page fault time - scan time * * The smaller the hint page fault latency, the higher the possibility * for the page to be hot.
*/ staticint numa_hint_fault_latency(struct folio *folio)
{ int last_time, time;
time = jiffies_to_msecs(jiffies);
last_time = folio_xchg_access_time(folio, time);
/* * For memory tiering mode, too high promotion/demotion throughput may * hurt application latency. So we provide a mechanism to rate limit * the number of pages that are tried to be promoted.
*/ staticbool numa_promotion_rate_limit(struct pglist_data *pgdat, unsignedlong rate_limit, int nr)
{ unsignedlong nr_cand; unsignedint now, start;
bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, int src_nid, int dst_cpu)
{ struct numa_group *ng = deref_curr_numa_group(p); int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid;
/* * Cannot migrate to memoryless nodes.
*/ if (!node_state(dst_nid, N_MEMORY)) returnfalse;
/* * The pages in slow memory node should be migrated according * to hot/cold instead of private/shared.
*/ if (folio_use_access_time(folio)) { struct pglist_data *pgdat; unsignedlong rate_limit; unsignedint latency, th, def_th;
pgdat = NODE_DATA(dst_nid); if (pgdat_free_space_enough(pgdat)) { /* workload changed, reset hot threshold */
pgdat->nbp_threshold = 0; returntrue;
}
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
!node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) returnfalse;
/* * Allow first faults or private faults to migrate immediately early in * the lifetime of a task. The magic number 4 is based on waiting for * two full passes of the "multi-stage node selection" test that is * executed below.
*/ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) returntrue;
/* * Multi-stage node selection is used in conjunction with a periodic * migration fault to build a temporal task<->page relation. By using * a two-stage filter we remove short/unlikely relations. * * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate * a task's usage of a particular page (n_p) per total usage of this * page (n_t) (in a given time-span) to a probability. * * Our periodic faults will sample this probability and getting the * same result twice in a row, given these samples are fully * independent, is then given by P(n)^2, provided our sample period * is sufficiently short compared to the usage pattern. * * This quadric squishes small probabilities, making it less likely we * act on an unlikely task<->page relation.
*/ if (!cpupid_pid_unset(last_cpupid) &&
cpupid_to_nid(last_cpupid) != dst_nid) returnfalse;
/* Always allow migrate on private faults */ if (cpupid_match_pid(p, last_cpupid)) returntrue;
/* A shared fault, but p->numa_group has not been set up yet. */ if (!ng) returntrue;
/* * Destination node is much more heavily used than the source * node? Allow migration.
*/ if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
ACTIVE_NODE_FRACTION) returntrue;
/* * Distribute memory according to CPU & memory use on each node, * with 3/4 hysteresis to avoid unnecessary memory migrations: * * faults_cpu(dst) 3 faults_cpu(src) * --------------- * - > --------------- * faults_mem(dst) 4 faults_mem(src)
*/ return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
/* * 'numa_type' describes the node at the moment of load balancing.
*/ enum numa_type { /* The node has spare capacity that can be used to run more tasks. */
node_has_spare = 0, /* * The node is fully used and the tasks don't compete for more CPU * cycles. Nevertheless, some tasks might wait before running.
*/
node_fully_busy, /* * The node is overloaded and can't provide expected CPU cycles to all * tasks.
*/
node_overloaded
};
/* Cached statistics for all CPUs within a node */ struct numa_stats { unsignedlong load; unsignedlong runnable; unsignedlong util; /* Total compute capacity of CPUs on a node */ unsignedlong compute_capacity; unsignedint nr_running; unsignedint weight; enum numa_type node_type; int idle_cpu;
};
struct task_numa_env { struct task_struct *p;
int src_cpu, src_nid; int dst_cpu, dst_nid; int imb_numa_nr;
struct numa_stats src_stats, dst_stats;
int imbalance_pct; int dist;
struct task_struct *best_task; long best_imp; int best_cpu;
};
/* * Gather all necessary information to make NUMA balancing placement * decisions that are compatible with standard load balancer. This * borrows code and logic from update_sg_lb_stats but sharing a * common implementation is impractical.
*/ staticvoid update_numa_stats(struct task_numa_env *env, struct numa_stats *ns, int nid, bool find_idle)
{ int cpu, idle_core = -1;
/* Check if run-queue part of active NUMA balance. */ if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) { int cpu; int start = env->dst_cpu;
/* Failed to find an alternative idle CPU */ return;
}
assign: /* * Clear previous best_cpu/rq numa-migrate flag, since task now * found a better CPU to move/swap.
*/ if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
rq = cpu_rq(env->best_cpu);
WRITE_ONCE(rq->numa_migrate_on, 0);
}
if (env->best_task)
put_task_struct(env->best_task); if (p)
get_task_struct(p);
staticbool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env)
{ long imb, old_imb; long orig_src_load, orig_dst_load; long src_capacity, dst_capacity;
/* * The load is corrected for the CPU capacity available on each node. * * src_load dst_load * ------------ vs --------- * src_capacity dst_capacity
*/
src_capacity = env->src_stats.compute_capacity;
dst_capacity = env->dst_stats.compute_capacity;
/* Would this change make things worse? */ return (imb > old_imb);
}
/* * Maximum NUMA importance can be 1998 (2*999); * SMALLIMP @ 30 would be close to 1998/64. * Used to deter task migration.
*/ #define SMALLIMP 30
/* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking * into account that it might be best if task running on the dst_cpu should * be exchanged with the source task
*/ staticbool task_numa_compare(struct task_numa_env *env, long taskimp, long groupimp, bool maymove)
{ struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); struct rq *dst_rq = cpu_rq(env->dst_cpu); long imp = p_ng ? groupimp : taskimp; struct task_struct *cur; long src_load, dst_load; int dist = env->dist; long moveimp = imp; long load; bool stopsearch = false;
if (READ_ONCE(dst_rq->numa_migrate_on)) returnfalse;
rcu_read_lock();
cur = rcu_dereference(dst_rq->curr); if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
!cur->mm))
cur = NULL;
/* * Because we have preemption enabled we can get migrated around and * end try selecting ourselves (current == env->p) as a swap candidate.
*/ if (cur == env->p) {
stopsearch = true; goto unlock;
}
if (!cur) { if (maymove && moveimp >= env->best_imp) goto assign; else goto unlock;
}
/* Skip this swap candidate if cannot move to the source cpu. */ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) goto unlock;
/* * Skip this swap candidate if it is not moving to its preferred * node and the best task is.
*/ if (env->best_task &&
env->best_task->numa_preferred_nid == env->src_nid &&
cur->numa_preferred_nid != env->src_nid) { goto unlock;
}
/* * "imp" is the fault differential for the source task between the * source and destination node. Calculate the total differential for * the source task and potential destination task. The more negative * the value is, the more remote accesses that would be expected to * be incurred if the tasks were swapped. * * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights.
*/
cur_ng = rcu_dereference(cur->numa_group); if (cur_ng == p_ng) { /* * Do not swap within a group or between tasks that have * no group if there is spare capacity. Swapping does * not address the load imbalance and helps one task at * the cost of punishing another.
*/ if (env->dst_stats.node_type == node_has_spare) goto unlock;
imp = taskimp + task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist); /* * Add some hysteresis to prevent swapping the * tasks within a group over tiny differences.
*/ if (cur_ng)
imp -= imp / 16;
} else { /* * Compare the group weights. If a task is all by itself * (not part of a group), use the task weight instead.
*/ if (cur_ng && p_ng)
imp += group_weight(cur, env->src_nid, dist) -
group_weight(cur, env->dst_nid, dist); else
imp += task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
}
/* Discourage picking a task already on its preferred node */ if (cur->numa_preferred_nid == env->dst_nid)
imp -= imp / 16;
/* * Encourage picking a task that moves to its preferred node. * This potentially makes imp larger than it's maximum of * 1998 (see SMALLIMP and task_weight for why) but in this * case, it does not matter.
*/ if (cur->numa_preferred_nid == env->src_nid)
imp += imp / 8;
if (maymove && moveimp > imp && moveimp > env->best_imp) {
imp = moveimp;
cur = NULL; goto assign;
}
/* * Prefer swapping with a task moving to its preferred node over a * task that is not.
*/ if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
env->best_task->numa_preferred_nid != env->src_nid) { goto assign;
}
/* * If the NUMA importance is less than SMALLIMP, * task migration might only result in ping pong * of tasks and also hurt performance due to cache * misses.
*/ if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) goto unlock;
/* * In the overloaded case, try and keep the load balanced.
*/
load = task_h_load(env->p) - task_h_load(cur); if (!load) goto assign;
if (load_too_imbalanced(src_load, dst_load, env)) goto unlock;
assign: /* Evaluate an idle CPU for a task numa move. */ if (!cur) { int cpu = env->dst_stats.idle_cpu;
/* Nothing cached so current CPU went idle since the search. */ if (cpu < 0)
cpu = env->dst_cpu;
/* * If the CPU is no longer truly idle and the previous best CPU * is, keep using it.
*/ if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
idle_cpu(env->best_cpu)) {
cpu = env->best_cpu;
}
env->dst_cpu = cpu;
}
task_numa_assign(env, cur, imp);
/* * If a move to idle is allowed because there is capacity or load * balance improves then stop the search. While a better swap * candidate may exist, a search is not free.
*/ if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
stopsearch = true;
/* * If a swap candidate must be identified and the current best task * moves its preferred node then stop the search.
*/ if (!maymove && env->best_task &&
env->best_task->numa_preferred_nid == env->src_nid) {
stopsearch = true;
}
unlock:
rcu_read_unlock();
return stopsearch;
}
staticvoid task_numa_find_cpu(struct task_numa_env *env, long taskimp, long groupimp)
{ bool maymove = false; int cpu;
/* * If dst node has spare capacity, then check if there is an * imbalance that would be overruled by the load balancer.
*/ if (env->dst_stats.node_type == node_has_spare) { unsignedint imbalance; int src_running, dst_running;
/* * Would movement cause an imbalance? Note that if src has * more running tasks that the imbalance is ignored as the * move improves the imbalance from the perspective of the * CPU load balancer.
* */
src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
imbalance = adjust_numa_imbalance(imbalance, dst_running,
env->imb_numa_nr);
/* Use idle CPU if there is no imbalance */ if (!imbalance) {
maymove = true; if (env->dst_stats.idle_cpu >= 0) {
env->dst_cpu = env->dst_stats.idle_cpu;
task_numa_assign(env, NULL, 0); return;
}
}
} else { long src_load, dst_load, load; /* * If the improvement from just moving env->p direction is better * than swapping tasks around, check if a move is possible.
*/
load = task_h_load(env->p);
dst_load = env->dst_stats.load + load;
src_load = env->src_stats.load - load;
maymove = !load_too_imbalanced(src_load, dst_load, env);
}
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) continue;
/* * Pick the lowest SD_NUMA domain, as that would have the smallest * imbalance and would be the first to start moving tasks about. * * And we want to avoid any moving of tasks about, as that would create * random movement of tasks -- counter the numa conditions we're trying * to satisfy here.
*/
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
env.imb_numa_nr = sd->imb_numa_nr;
}
rcu_read_unlock();
/* * Cpusets can break the scheduler domain tree into smaller * balance domains, some of which do not cross NUMA boundaries. * Tasks that are "trapped" in such domains cannot be migrated * elsewhere, so there is no point in (re)trying.
*/ if (unlikely(!sd)) {
sched_setnuma(p, task_node(p)); return -EINVAL;
}
/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
/* * Look at other nodes in these cases: * - there is no space available on the preferred_nid * - the task is part of a numa_group that is interleaved across * multiple NUMA nodes; in order to better consolidate the group, * we need to check other locations.
*/
ng = deref_curr_numa_group(p); if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
for_each_node_state(nid, N_CPU) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue;
/* * If the task is part of a workload that spans multiple NUMA nodes, * and is migrating into one of the workload's active nodes, remember * this node as the task's preferred numa node, so the workload can * settle down. * A task that migrated to a second choice node will be better off * trying for a better one later. Do not set the preferred node here.
*/ if (ng) { if (env.best_cpu == -1)
nid = env.src_nid; else
nid = cpu_to_node(env.best_cpu);
if (nid != p->numa_preferred_nid)
sched_setnuma(p, nid);
}
/* No better CPU than the current one was found. */ if (env.best_cpu == -1) {
trace_sched_stick_numa(p, env.src_cpu, NULL, -1); return -EAGAIN;
}
best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu); return ret;
}
ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
WRITE_ONCE(best_rq->numa_migrate_on, 0);
/* Attempt to migrate a task to a CPU on the preferred node. */ staticvoid numa_migrate_preferred(struct task_struct *p)
{ unsignedlong interval = HZ;
/* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) return;
/* Periodically retry migrating the task to the preferred node */
interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
p->numa_migrate_retry = jiffies + interval;
/* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) return;
/* Otherwise, try migrate to a CPU on the preferred node */
task_numa_migrate(p);
}
/* * Find out how many nodes the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located.
*/ staticvoid numa_group_count_active_nodes(struct numa_group *numa_group)
{ unsignedlong faults, max_faults = 0; int nid, active_nodes = 0;
/* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan * period will be for the next scan window. If local/(local+remote) ratio is * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) * the scan period will decrease. Aim for 70% local accesses.
*/ #define NUMA_PERIOD_SLOTS 10 #define NUMA_PERIOD_THRESHOLD 7
/* * Increase the scan period (slow down scanning) if the majority of * our memory is already on our local node, or if the majority of * the page accesses are shared with other processes. * Otherwise, decrease the scan period.
*/ staticvoid update_task_scan_period(struct task_struct *p, unsignedlong shared, unsignedlongprivate)
{ unsignedint period_slot; int lr_ratio, ps_ratio; int diff;
unsignedlong remote = p->numa_faults_locality[0]; unsignedlong local = p->numa_faults_locality[1];
/* * If there were no record hinting faults then either the task is * completely idle or all activity is in areas that are not of interest * to automatic numa balancing. Related to that, if there were failed * migration then it implies we are migrating too quickly or the local * node is overloaded. In either case, scan slower
*/ if (local + shared == 0 || p->numa_faults_locality[2]) {
p->numa_scan_period = min(p->numa_scan_period_max,
p->numa_scan_period << 1);
/* * Prepare to scale scan period relative to the current period. * == NUMA_PERIOD_THRESHOLD scan period stays the same * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
*/
period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
if (ps_ratio >= NUMA_PERIOD_THRESHOLD) { /* * Most memory accesses are local. There is no need to * do fast NUMA scanning, since memory is already local.
*/ int slot = ps_ratio - NUMA_PERIOD_THRESHOLD; if (!slot)
slot = 1;
diff = slot * period_slot;
} elseif (lr_ratio >= NUMA_PERIOD_THRESHOLD) { /* * Most memory accesses are shared with other tasks. * There is no point in continuing fast NUMA scanning, * since other tasks may just move the memory elsewhere.
*/ int slot = lr_ratio - NUMA_PERIOD_THRESHOLD; if (!slot)
slot = 1;
diff = slot * period_slot;
} else { /* * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS, * yet they are not on the local NUMA node. Speed up * NUMA scanning to get the memory moved over.
*/ int ratio = max(lr_ratio, ps_ratio);
diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
}
/* * Get the fraction of time the task has been running since the last * NUMA placement cycle. The scheduler keeps similar statistics, but * decays those on a 32ms period, which is orders of magnitude off * from the dozens-of-seconds NUMA balancing period. Use the scheduler * stats only if the task is so new there are no NUMA statistics yet.
*/ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
{
u64 runtime, delta, now; /* Use the start of this time slice to avoid calculations. */
now = p->se.exec_start;
runtime = p->se.sum_exec_runtime;
if (p->last_task_numa_placement) {
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
/* * Determine the preferred nid for a task in a numa_group. This needs to * be done in a way that produces consistent results with group_weight, * otherwise workloads might not converge.
*/ staticint preferred_group_nid(struct task_struct *p, int nid)
{
nodemask_t nodes; int dist;
/* Direct connections between all NUMA nodes. */ if (sched_numa_topology_type == NUMA_DIRECT) return nid;
/* * On a system with glueless mesh NUMA topology, group_weight * scores nodes according to the number of NUMA hinting faults on * both the node itself, and on nearby nodes.
*/ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { unsignedlong score, max_score = 0; int node, max_node = nid;
/* * Finding the preferred nid in a system with NUMA backplane * interconnect topology is more involved. The goal is to locate * tasks from numa_groups near each other in the system, and * untangle workloads from different sides of the system. This requires * searching down the hierarchy of node groups, recursively searching * inside the highest scoring group of nodes. The nodemask tricks * keep the complexity of the search down.
*/
nodes = node_states[N_CPU]; for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { unsignedlong max_faults = 0;
nodemask_t max_group = NODE_MASK_NONE; int a, b;
/* Are there nodes at this distance from each other? */ if (!find_numa_distance(dist)) continue;
/* Sum group's NUMA faults; includes a==b case. */
for_each_node_mask(b, nodes) { if (node_distance(a, b) < dist) {
faults += group_faults(p, b);
node_set(b, this_group);
node_clear(b, nodes);
}
}
/* Remember the top group. */ if (faults > max_faults) {
max_faults = faults;
max_group = this_group; /* * subtle: at the smallest distance there is * just one node left in each "group", the * winner is the preferred nid.
*/
nid = a;
}
} /* Next round, evaluate the nodes within max_group. */ if (!max_faults) break;
nodes = max_group;
} return nid;
}
/* * The p->mm->numa_scan_seq field gets updated without * exclusive access. Use READ_ONCE() here to ensure * that the field is read in a single access:
*/
seq = READ_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return;
p->numa_scan_seq = seq;
p->numa_scan_period_max = task_scan_max(p);
/* If the task is part of a group prevent parallel updates to group stats */
ng = deref_curr_numa_group(p); if (ng) {
group_lock = &ng->lock;
spin_lock_irq(group_lock);
}
/* Find the node with the highest number of faults */
for_each_online_node(nid) { /* Keep track of the offsets in numa_faults array */ int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; unsignedlong faults = 0, group_faults = 0; int priv;
for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { long diff, f_diff, f_weight;
/* * Normalize the faults_from, so all tasks in a group * count according to CPU use, instead of by the raw * number of faults. Tasks with little runtime have * little over-all impact on throughput, and thus their * faults are less important.
*/
f_weight = div64_u64(runtime << 16, period + 1);
f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
(total_faults + 1);
f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
p->numa_faults[cpubuf_idx] = 0;
p->numa_faults[mem_idx] += diff;
p->numa_faults[cpu_idx] += f_diff;
faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff; if (ng) { /* * safe because we can only change our own group * * mem_idx represents the offset for a given * nid and priv in a specific region because it * is at the beginning of the numa_faults array.
*/
ng->faults[mem_idx] += diff;
ng->faults[cpu_idx] += f_diff;
ng->total_faults += diff;
group_faults += ng->faults[mem_idx];
}
}
staticinlinevoid put_numa_group(struct numa_group *grp)
{ if (refcount_dec_and_test(&grp->refcount))
kfree_rcu(grp, rcu);
}
staticvoid task_numa_group(struct task_struct *p, int cpupid, int flags, int *priv)
{ struct numa_group *grp, *my_grp; struct task_struct *tsk; bool join = false; int cpu = cpupid_to_cpu(cpupid); int i;
grp = rcu_dereference(tsk->numa_group); if (!grp) goto no_join;
my_grp = deref_curr_numa_group(p); if (grp == my_grp) goto no_join;
/* * Only join the other group if its bigger; if we're the bigger group, * the other task will join us.
*/ if (my_grp->nr_tasks > grp->nr_tasks) goto no_join;
/* * Tie-break on the grp address.
*/ if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) goto no_join;
/* Always join threads in the same process. */ if (tsk->mm == current->mm)
join = true;
/* Simple filter to avoid false positives due to PID collisions */ if (flags & TNF_SHARED)
join = true;
/* Update priv based on whether false sharing was detected */
*priv = !join;
/* * Get rid of NUMA statistics associated with a task (either current or dead). * If @final is set, the task is dead and has reached refcount zero, so we can * safely free all relevant data structures. Otherwise, there might be * concurrent reads from places like load balancing and procfs, and we should * reset the data back to default state without freeing ->numa_faults.
*/ void task_numa_free(struct task_struct *p, bool final)
{ /* safe: p either is current or is being freed by current */ struct numa_group *grp = rcu_dereference_raw(p->numa_group); unsignedlong *numa_faults = p->numa_faults; unsignedlong flags; int i;
if (!numa_faults) return;
if (grp) {
spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] -= p->numa_faults[i];
grp->total_faults -= p->total_numa_faults;
if (final) {
p->numa_faults = NULL;
kfree(numa_faults);
} else {
p->total_numa_faults = 0; for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
numa_faults[i] = 0;
}
}
/* * Got a PROT_NONE fault for a page on @node.
*/ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
{ struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); int local = !!(flags & TNF_FAULT_LOCAL); struct numa_group *ng; int priv;
if (!static_branch_likely(&sched_numa_balancing)) return;
/* for example, ksmd faulting in a user's mm */ if (!p->mm) return;
/* * NUMA faults statistics are unnecessary for the slow memory * node for memory tiering mode.
*/ if (!node_is_toptier(mem_node) &&
(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
!cpupid_valid(last_cpupid))) return;
/* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) *
NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); if (!p->numa_faults) return;
/* * First accesses are treated as private, otherwise consider accesses * to be private if the accessing pid has not changed
*/ if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
priv = 1;
} else {
priv = cpupid_match_pid(p, last_cpupid); if (!priv && !(flags & TNF_NO_GROUP))
task_numa_group(p, last_cpupid, flags, &priv);
}
/* * If a workload spans multiple NUMA nodes, a shared fault that * occurs wholly within the set of nodes that the workload is * actively using should be counted as local. This allows the * scan rate to slow down when a workload has settled down.
*/
ng = deref_curr_numa_group(p); if (!priv && !local && ng && ng->active_nodes > 1 &&
numa_is_active_node(cpu_node, ng) &&
numa_is_active_node(mem_node, ng))
local = 1;
/* * Retry to migrate task to preferred node periodically, in case it * previously failed, or the scheduler moved us.
*/ if (time_after(jiffies, p->numa_migrate_retry)) {
task_numa_placement(p);
numa_migrate_preferred(p);
}
if (migrated)
p->numa_pages_migrated += pages; if (flags & TNF_MIGRATE_FAIL)
p->numa_faults_locality[2] += pages;
staticvoid reset_ptenuma_scan(struct task_struct *p)
{ /* * We only did a read acquisition of the mmap sem, so * p->mm->numa_scan_seq is written to without exclusive access * and the update is not guaranteed to be atomic. That's not * much of an issue though, since this is just used for * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not * expensive, to avoid any form of compiler optimizations:
*/
WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
p->mm->numa_scan_offset = 0;
}
staticbool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
{ unsignedlong pids; /* * Allow unconditional access first two times, so that all the (pages) * of VMAs get prot_none fault introduced irrespective of accesses. * This is also done to avoid any side effect of task scanning * amplifying the unfairness of disjoint set of VMAs' access.
*/ if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2) returntrue;
pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids)) returntrue;
/* * Complete a scan that has already started regardless of PID access, or * some VMAs may never be scanned in multi-threaded applications:
*/ if (mm->numa_scan_offset > vma->vm_start) {
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID); returntrue;
}
/* * This vma has not been accessed for a while, and if the number * the threads in the same process is low, which means no other * threads can help scan this vma, force a vma scan.
*/ if (READ_ONCE(mm->numa_scan_seq) >
(vma->numab_state->prev_scan_seq + get_nr_threads(current))) returntrue;
work->next = work; /* * Who cares about NUMA placement when they're dying. * * NOTE: make sure not to dereference p->mm before this check, * exit_task_work() happens _after_ exit_mm() so we could be called * without p->mm even though we still had it when we enqueued this * work.
*/ if (p->flags & PF_EXITING) return;
/* * Memory is pinned to only one NUMA node via cpuset.mems, naturally * no page can be migrated.
*/ if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) {
trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed); return;
}
if (!mm->numa_next_scan) {
mm->numa_next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period); if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan)) return;
/* * Delay this task enough that another task of this mm will likely win * the next time around.
*/
p->node_stamp += 2 * TICK_NSEC;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
virtpages = pages * 8; /* Scan up to this much virtual space */ if (!pages) return;
if (!mmap_read_trylock(mm)) return;
/* * VMAs are skipped if the current PID has not trapped a fault within * the VMA recently. Allow scanning to be forced if there is no * suitable VMA remaining.
*/
vma_pids_skipped = false;
/* * Shared library pages mapped by multiple processes are not * migrated as it is expected they are cache replicated. Avoid * hinting faults in read-only file-backed mappings or the vDSO * as migrating the pages will be of marginal benefit.
*/ if (!vma->vm_mm ||
(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO); continue;
}
/* * Skip inaccessible VMAs to avoid any confusion between * PROT_NONE and NUMA hinting PTEs
*/ if (!vma_is_accessible(vma)) {
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); continue;
}
/* Initialise new per-VMA NUMAB state. */ if (!vma->numab_state) { struct vma_numab_state *ptr;
ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); if (!ptr) continue;
if (cmpxchg(&vma->numab_state, NULL, ptr)) {
kfree(ptr); continue;
}
vma->numab_state->next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
/* Reset happens after 4 times scan delay of scan start */
vma->numab_state->pids_active_reset = vma->numab_state->next_scan +
msecs_to_jiffies(VMA_PID_RESET_PERIOD);
/* * Ensure prev_scan_seq does not match numa_scan_seq, * to prevent VMAs being skipped prematurely on the * first scan:
*/
vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
}
/* * Scanning the VMAs of short lived tasks add more overhead. So * delay the scan for new VMAs.
*/ if (mm->numa_scan_seq && time_before(jiffies,
vma->numab_state->next_scan)) {
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY); continue;
}
/* RESET access PIDs regularly for old VMAs. */ if (mm->numa_scan_seq &&
time_after(jiffies, vma->numab_state->pids_active_reset)) {
vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
msecs_to_jiffies(VMA_PID_RESET_PERIOD);
vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
vma->numab_state->pids_active[1] = 0;
}
/* Do not rescan VMAs twice within the same sequence. */ if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
mm->numa_scan_offset = vma->vm_end;
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED); continue;
}
/* * Do not scan the VMA if task has not accessed it, unless no other * VMA candidate exists.
*/ if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
vma_pids_skipped = true;
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); continue;
}
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
end = min(end, vma->vm_end);
nr_pte_updates = change_prot_numa(vma, start, end);
/* * Try to scan sysctl_numa_balancing_size worth of * hpages that have at least one present PTE that * is not already PTE-numa. If the VMA contains * areas that are unused or already full of prot_numa * PTEs, scan up to virtpages, to skip through those * areas faster.
*/ if (nr_pte_updates)
pages -= (end - start) >> PAGE_SHIFT;
virtpages -= (end - start) >> PAGE_SHIFT;
/* VMA scan is complete, do not scan until next sequence. */
vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
/* * Only force scan within one VMA at a time, to limit the * cost of scanning a potentially uninteresting VMA.
*/ if (vma_pids_forced) break;
}
/* * If no VMAs are remaining and VMAs were skipped due to the PID * not accessing the VMA previously, then force a scan to ensure * forward progress:
*/ if (!vma && !vma_pids_forced && vma_pids_skipped) {
vma_pids_forced = true; goto retry_pids;
}
out: /* * It is possible to reach the end of the VMA list but the last few * VMAs are not guaranteed to the vma_migratable. If they are not, we * would find the !migratable VMA on the next scan but not reset the * scanner to the start so check it now.
*/ if (vma)
mm->numa_scan_offset = start; else
reset_ptenuma_scan(p);
mmap_read_unlock(mm);
/* * Make sure tasks use at least 32x as much time to run other code * than they used here, to limit NUMA PTE scanning overhead to 3% max. * Usually update_task_scan_period slows down scanning enough; on an * overloaded system we need to limit overhead on a per task basis.
*/ if (unlikely(p->se.sum_exec_runtime != runtime)) {
u64 diff = p->se.sum_exec_runtime - runtime;
p->node_stamp += 32 * diff;
}
}
/* New address space, reset the preferred nid */ if (!(clone_flags & CLONE_VM)) {
p->numa_preferred_nid = NUMA_NO_NODE; return;
}
/* * New thread, keep existing numa_preferred_nid which should be copied * already by arch_dup_task_struct but stagger when scans start.
*/ if (mm) { unsignedint delay;
/* * We don't care about NUMA placement if we don't have memory.
*/ if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return;
/* * Using runtime rather than walltime has the dual advantage that * we (mostly) drive the selection from busy threads and that the * task needs to have done some actual work before we bother with * NUMA placement.
*/
now = curr->se.sum_exec_runtime;
period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
if (now > curr->node_stamp + period) { if (!curr->node_stamp)
curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan))
task_work_add(curr, work, TWA_RESUME);
}
}
staticvoid update_scan_period(struct task_struct *p, int new_cpu)
{ int src_nid = cpu_to_node(task_cpu(p)); int dst_nid = cpu_to_node(new_cpu);
if (!static_branch_likely(&sched_numa_balancing)) return;
if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) return;
if (src_nid == dst_nid) return;
/* * Allow resets if faults have been trapped before one scan * has completed. This is most likely due to a new task that * is pulled cross-node due to wakeups or load balancing.
*/ if (p->numa_scan_seq) { /* * Avoid scan adjustments if moving to the preferred * node or if the task was not previously running on * the preferred node.
*/ if (dst_nid == p->numa_preferred_nid ||
(p->numa_preferred_nid != NUMA_NO_NODE &&
src_nid != p->numa_preferred_nid)) return;
}
/* * Signed add and clamp on underflow. * * Explicitly do a load-store to ensure the intermediate value never hits * memory. This allows lockless observations without ever seeing the negative * values.
*/ #define add_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
typeof(_val) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
\
res = var + val; \
\ if (val < 0 && res > var) \
res = 0; \
\
WRITE_ONCE(*ptr, res); \
} while (0)
/* * Unsigned subtract and clamp on underflow. * * Explicitly do a load-store to ensure the intermediate value never hits * memory. This allows lockless observations without ever seeing the negative * values.
*/ #define sub_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
typeof(*ptr) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
res = var - val; \ if (res > var) \
res = 0; \
WRITE_ONCE(*ptr, res); \
} while (0)
/* * Remove and clamp on negative, from a local variable. * * A variant of sub_positive(), which does not use explicit load-store * and is thus optimized for local variable updates.
*/ #define lsub_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
*ptr -= min_t(typeof(*ptr), *ptr, _val); \
} while (0)
if (se->on_rq) { /* commit outstanding execution time */
update_curr(cfs_rq);
update_entity_lag(cfs_rq, se);
se->deadline -= se->vruntime;
se->rel_deadline = 1;
cfs_rq->nr_queued--; if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
/* * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), * we need to scale se->vlag when w_i changes.
*/
se->vlag = div_s64(se->vlag * se->load.weight, weight); if (se->rel_deadline)
se->deadline = div_s64(se->deadline * se->load.weight, weight);
update_load_set(&se->load, weight);
do {
u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
enqueue_load_avg(cfs_rq, se); if (se->on_rq) {
place_entity(cfs_rq, se, 0);
update_load_add(&cfs_rq->load, se->load.weight); if (!curr)
__enqueue_entity(cfs_rq, se);
cfs_rq->nr_queued++;
/* * The entity's vruntime has been adjusted, so let's check * whether the rq-wide min_vruntime needs updated too. Since * the calculations above require stable min_vruntime rather * than up-to-date one, we do the update at the end of the * reweight process.
*/
update_min_vruntime(cfs_rq);
}
}
#ifdef CONFIG_FAIR_GROUP_SCHED /* * All this does is approximate the hierarchical proportion which includes that * global sum we all love to hate. * * That is, the weight of a group entity, is the proportional share of the * group weight based on the group runqueue weights. That is: * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (1) * \Sum grq->load.weight * * Now, because computing that sum is prohibitively expensive to compute (been * there, done that) we approximate it with this average stuff. The average * moves slower and therefore the approximation is cheaper and more stable. * * So instead of the above, we substitute: * * grq->load.weight -> grq->avg.load_avg (2) * * which yields the following: * * tg->weight * grq->avg.load_avg * ge->load.weight = ------------------------------ (3) * tg->load_avg * * Where: tg->load_avg ~= \Sum grq->avg.load_avg * * That is shares_avg, and it is right (given the approximation (2)). * * The problem with it is that because the average is slow -- it was designed * to be exactly that of course -- this leads to transients in boundary * conditions. In specific, the case where the group was idle and we start the * one task. It takes time for our CPU's grq->avg.load_avg to build up, * yielding bad latency etc.. * * Now, in that special case (1) reduces to: * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- = tg->weight (4) * grp->load.weight * * That is, the sum collapses because all other CPUs are idle; the UP scenario. * * So what we do is modify our approximation (3) to approach (4) in the (near) * UP case, like: * * ge->load.weight = * * tg->weight * grq->load.weight * --------------------------------------------------- (5) * tg->load_avg - grq->avg.load_avg + grq->load.weight * * But because grq->load.weight can drop to 0, resulting in a divide by zero, * we need to use grq->avg.load_avg as its lower bound, which then gives: * * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (6) * tg_load_avg' * * Where: * * tg_load_avg' = tg->load_avg - grq->avg.load_avg + * max(grq->load.weight, grq->avg.load_avg) * * And that is shares_weight and is icky. In the (near) UP case it approaches * (4) while in the normal case it approaches (3). It consistently * overestimates the ge->load.weight and therefore: * * \Sum ge->load.weight >= tg->weight * * hence icky!
*/ staticlong calc_group_shares(struct cfs_rq *cfs_rq)
{ long tg_weight, tg_shares, load, shares; struct task_group *tg = cfs_rq->tg;
shares = (tg_shares * load); if (tg_weight)
shares /= tg_weight;
/* * MIN_SHARES has to be unscaled here to support per-CPU partitioning * of a group with small tg->shares value. It is a floor value which is * assigned as a minimum load.weight to the sched_entity representing * the group on a CPU. * * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024 * on an 8-core system with 8 tasks each runnable on one CPU shares has * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In * case no task is runnable on a CPU MIN_SHARES=2 should be returned * instead of 0.
*/ return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
/* * Recomputes the group entity based on the current state of its group * runqueue.
*/ staticvoid update_cfs_group(struct sched_entity *se)
{ struct cfs_rq *gcfs_rq = group_cfs_rq(se); long shares;
/* * When a group becomes empty, preserve its weight. This matters for * DELAY_DEQUEUE.
*/ if (!gcfs_rq || !gcfs_rq->load.weight) return;
if (throttled_hierarchy(gcfs_rq)) return;
shares = calc_group_shares(gcfs_rq); if (unlikely(se->load.weight != shares))
reweight_entity(cfs_rq_of(se), se, shares);
}
if (&rq->cfs == cfs_rq) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be * a real problem. * * It will not get called when we go idle, because the idle * thread is a different class (!fair), nor will the utilization * number include things like RT tasks. * * As is, the util number is not freq-invariant (we'd have to * implement arch_scale_freq_capacity() for that). * * See cpu_util_cfs().
*/
cpufreq_update_util(rq, flags);
}
}
staticinlinebool load_avg_is_decayed(struct sched_avg *sa)
{ if (sa->load_sum) returnfalse;
if (sa->util_sum) returnfalse;
if (sa->runnable_sum) returnfalse;
/* * _avg must be null when _sum are null because _avg = _sum / divider * Make sure that rounding and/or propagation of PELT values never * break this.
*/
WARN_ON_ONCE(sa->load_avg ||
sa->util_avg ||
sa->runnable_avg);
returntrue;
}
staticinline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{ return u64_u32_load_copy(cfs_rq->avg.last_update_time,
cfs_rq->last_update_time_copy);
} #ifdef CONFIG_FAIR_GROUP_SCHED /* * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list * immediately before a parent cfs_rq, and cfs_rqs are removed from the list * bottom-up, we only have to test whether the cfs_rq before us on the list * is our child. * If cfs_rq is not on the list, test whether a child needs its to be added to * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details).
*/ staticinlinebool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
{ struct cfs_rq *prev_cfs_rq; struct list_head *prev; struct rq *rq = rq_of(cfs_rq);
staticinlinebool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{ if (cfs_rq->load.weight) returnfalse;
if (!load_avg_is_decayed(&cfs_rq->avg)) returnfalse;
if (child_cfs_rq_on_list(cfs_rq)) returnfalse;
returntrue;
}
/** * update_tg_load_avg - update the tg's load avg * @cfs_rq: the cfs_rq whose avg changed * * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. * However, because tg->load_avg is a global value there are performance * considerations. * * In order to avoid having to look at the other cfs_rq's, we use a * differential update where we store the last value we propagated. This in * turn allows skipping updates if the differential is 'small'. * * Updating tg's load_avg is necessary before update_cfs_share().
*/ staticinlinevoid update_tg_load_avg(struct cfs_rq *cfs_rq)
{ long delta;
u64 now;
/* * No need to update load_avg for root_task_group as it is not used.
*/ if (cfs_rq->tg == &root_task_group) return;
/* rq has been offline and doesn't contribute to the share anymore: */ if (!cpu_active(cpu_of(rq_of(cfs_rq)))) return;
/* * For migration heavy workloads, access to tg->load_avg can be * unbound. Limit the update rate to at most once per ms.
*/
now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC) return;
/* * The rq clock has already been updated in * set_rq_offline(), so we should skip updating * the rq clock again in unthrottle_cfs_rq().
*/
rq_clock_start_loop_update(rq);
/* * Called within set_task_rq() right before setting a task's CPU. The * caller only guarantees p->pi_lock is held; no other assumptions, * including the state of rq->lock, should be made.
*/ void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next)
{
u64 p_last_update_time;
u64 n_last_update_time;
if (!sched_feat(ATTACH_AGE_LOAD)) return;
/* * We are supposed to update the task to "current" time, then its up to * date and ready to go to new CPU/cfs_rq. But we have difficulty in * getting what current time is, so simply throw away the out-of-date * time. This will result in the wakee task is less decayed, but giving * the wakee more load sounds not bad.
*/ if (!(se->avg.last_update_time && prev)) return;
/* * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to * propagate its contribution. The key to this propagation is the invariant * that for each group: * * ge->avg == grq->avg (1) * * _IFF_ we look at the pure running and runnable sums. Because they * represent the very same entity, just at different points in the hierarchy. * * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial * and simply copies the running/runnable sum over (but still wrong, because * the group entity and group rq do not have their PELT windows aligned). * * However, update_tg_cfs_load() is more complex. So we have: * * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) * * And since, like util, the runnable part should be directly transferable, * the following would _appear_ to be the straight forward approach: * * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3) * * And per (1) we have: * * ge->avg.runnable_avg == grq->avg.runnable_avg * * Which gives: * * ge->load.weight * grq->avg.load_avg * ge->avg.load_avg = ----------------------------------- (4) * grq->load.weight * * Except that is wrong! * * Because while for entities historical weight is not important and we * really only care about our future and therefore can consider a pure * runnable sum, runqueues can NOT do this. * * We specifically want runqueues to have a load_avg that includes * historical weights. Those represent the blocked load, the load we expect * to (shortly) return to us. This only works by keeping the weights as * integral part of the sum. We therefore cannot decompose as per (3). * * Another reason this doesn't work is that runnable isn't a 0-sum entity. * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the * rq itself is runnable anywhere between 2/3 and 1 depending on how the * runnable section of these tasks overlap (or not). If they were to perfectly * align the rq as a whole would be runnable 2/3 of the time. If however we * always have at least 1 runnable task, the rq as a whole is always runnable. * * So we'll have to approximate.. :/ * * Given the constraint: * * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX * * We can construct a rule that adds runnable to a rq by assuming minimal * overlap. * * On removal, we'll assume each task is equally runnable; which yields: * * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight * * XXX: only do this for the part of runnable > running ? *
*/ staticinlinevoid
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{ long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
u32 new_sum, divider;
/* Nothing to update */ if (!delta_avg) return;
/* * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details.
*/
divider = get_pelt_divider(&cfs_rq->avg);
/* * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details.
*/
divider = get_pelt_divider(&cfs_rq->avg);
/* * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details.
*/
divider = get_pelt_divider(&cfs_rq->avg);
if (runnable_sum >= 0) { /* * Add runnable; clip at LOAD_AVG_MAX. Reflects that until * the CPU is saturated running == runnable.
*/
runnable_sum += se->avg.load_sum;
runnable_sum = min_t(long, runnable_sum, divider);
} else { /* * Estimate the new unweighted runnable_sum of the gcfs_rq by * assuming all tasks are equally runnable.
*/ if (scale_load_down(gcfs_rq->load.weight)) {
load_sum = div_u64(gcfs_rq->avg.load_sum,
scale_load_down(gcfs_rq->load.weight));
}
/* But make sure to not inflate se's runnable */
runnable_sum = min(se->avg.load_sum, load_sum);
}
/* * runnable_sum can't be lower than running_sum * Rescale running sum to be in the same range as runnable sum * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT] * runnable_sum is in [0 : LOAD_AVG_MAX]
*/
running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
update_tg_cfs_util(cfs_rq, se, gcfs_rq);
update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
update_tg_cfs_load(cfs_rq, se, gcfs_rq);
trace_pelt_cfs_tp(cfs_rq);
trace_pelt_se_tp(se);
return 1;
}
/* * Check if we need to update the load and the utilization of a blocked * group_entity:
*/ staticinlinebool skip_blocked_update(struct sched_entity *se)
{ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
/* * If sched_entity still have not zero load or utilization, we have to * decay it:
*/ if (se->avg.load_avg || se->avg.util_avg) returnfalse;
/* * If there is a pending propagation, we have to update the load and * the utilization of the sched_entity:
*/ if (gcfs_rq->propagate) returnfalse;
/* * Otherwise, the load and the utilization of the sched_entity is * already zero and there is no pending propagation, so it will be a * waste of time to try to decay it:
*/ returntrue;
}
/* * The lag estimation comes with a cost we don't want to pay all the * time. Hence, limiting to the case where the source CPU is idle and * we know we are at the greatest risk to have an outdated clock.
*/ if (!is_idle) return;
/* * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where: * * last_update_time (the cfs_rq's last_update_time) * = cfs_rq_clock_pelt()@cfs_rq_idle * = rq_clock_pelt()@cfs_rq_idle * - cfs->throttled_clock_pelt_time@cfs_rq_idle * * cfs_idle_lag (delta between rq's update and cfs_rq's update) * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle * * rq_idle_lag (delta between now and rq's update) * = sched_clock_cpu() - rq_clock()@rq_idle * * We can then write: * * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time + * sched_clock_cpu() - rq_clock()@rq_idle * Where: * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle * rq_clock()@rq_idle is rq->clock_idle * cfs->throttled_clock_pelt_time@cfs_rq_idle * is cfs_rq->throttled_pelt_idle
*/
#ifdef CONFIG_CFS_BANDWIDTH
throttled = u64_u32_load(cfs_rq->throttled_pelt_idle); /* The clock has been stopped for throttling */ if (throttled == U64_MAX) return; #endif
now = u64_u32_load(rq->clock_pelt_idle); /* * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case * is observed the old clock_pelt_idle value and the new clock_idle, * which lead to an underestimation. The opposite would lead to an * overestimation.
*/
smp_rmb();
lut = cfs_rq_last_update_time(cfs_rq);
now -= throttled; if (now < lut) /* * cfs_rq->avg.last_update_time is more recent than our * estimation, let's use it.
*/
now = lut; else
now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);
/** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_pelt() * @cfs_rq: cfs_rq to update * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) * avg. The immediate corollary is that all (fair) tasks must be attached. * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * * Return: true if the load decayed or we removed load. * * Since both these conditions indicate a changed cfs_rq->avg.load we should * call update_tg_load_avg() when this function returns true.
*/ staticinlineint
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{ unsignedlong removed_load = 0, removed_util = 0, removed_runnable = 0; struct sched_avg *sa = &cfs_rq->avg; int decayed = 0;
if (cfs_rq->removed.nr) { unsignedlong r;
u32 divider = get_pelt_divider(&cfs_rq->avg);
r = removed_load;
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * divider); /* See sa->util_sum below */
sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
r = removed_util;
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * divider); /* * Because of rounding, se->util_sum might ends up being +1 more than * cfs->util_sum. Although this is not a problem by itself, detaching * a lot of tasks with the rounding problem between 2 updates of * util_avg (~1ms) can make cfs->util_sum becoming null whereas * cfs_util_avg is not. * Check that util_sum is still above its lower bound for the new * util_avg. Given that period_contrib might have moved since the last * sync, we are only sure that util_sum must be above or equal to * util_avg * minimum possible divider
*/
sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
r = removed_runnable;
sub_positive(&sa->runnable_avg, r);
sub_positive(&sa->runnable_sum, r * divider); /* See sa->util_sum above */
sa->runnable_sum = max_t(u32, sa->runnable_sum,
sa->runnable_avg * PELT_MIN_DIVIDER);
/* * removed_runnable is the unweighted version of removed_load so we * can use it to estimate removed_load_sum.
*/
add_tg_cfs_propagate(cfs_rq,
-(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
/** * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current.
*/ staticvoid attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ /* * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details.
*/
u32 divider = get_pelt_divider(&cfs_rq->avg);
/* * When we attach the @se to the @cfs_rq, we must align the decay * window because without that, really weird and wonderful things can * happen. * * XXX illustrate
*/
se->avg.last_update_time = cfs_rq->avg.last_update_time;
se->avg.period_contrib = cfs_rq->avg.period_contrib;
/* * Hell(o) Nasty stuff.. we need to recompute _sum based on the new * period_contrib. This isn't strictly correct, but since we're * entirely outside of the PELT hierarchy, nobody cares if we truncate * _sum a little.
*/
se->avg.util_sum = se->avg.util_avg * divider;
/** * detach_entity_load_avg - detach this entity from its cfs_rq load avg * @cfs_rq: cfs_rq to detach from * @se: sched_entity to detach * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current.
*/ staticvoid detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); /* See update_cfs_rq_load_avg() */
cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
/* * Optional action to be done while updating the load average
*/ #define UPDATE_TG 0x1 #define SKIP_AGE_LOAD 0x2 #define DO_ATTACH 0x4 #define DO_DETACH 0x8
/* Update task and its cfs_rq load average */ staticinlinevoid update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 now = cfs_rq_clock_pelt(cfs_rq); int decayed;
/* * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calculation in migration
*/ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cfs_rq, se);
if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
/* * DO_ATTACH means we're here from enqueue_entity(). * !last_update_time means we've passed through * migrate_task_rq_fair() indicating we migrated. * * IOW we're enqueueing a task on a new CPU.
*/
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq);
} elseif (flags & DO_DETACH) { /* * DO_DETACH means we're here from dequeue_entity() * and we are migrating task out of the CPU.
*/
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq);
} elseif (decayed) {
cfs_rq_util_change(cfs_rq, 0);
if (flags & UPDATE_TG)
update_tg_load_avg(cfs_rq);
}
}
/* * Synchronize entity load avg of dequeued entity without locking * the previous rq.
*/ staticvoid sync_entity_load_avg(struct sched_entity *se)
{ struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 last_update_time;
/* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now).
*/ staticvoid remove_entity_load_avg(struct sched_entity *se)
{ struct cfs_rq *cfs_rq = cfs_rq_of(se); unsignedlong flags;
/* * tasks cannot exit without having gone through wake_up_new_task() -> * enqueue_task_fair() which will have added things to the cfs_rq, * so we can remove unconditionally.
*/
/* * Skip update of task's estimated utilization when the task has not * yet completed an activation, e.g. being migrated.
*/ if (!task_sleep) return;
/* Get current estimate of utilization */
ewma = READ_ONCE(p->se.avg.util_est);
/* * If the PELT values haven't changed since enqueue time, * skip the util_est update.
*/ if (ewma & UTIL_AVG_UNCHANGED) return;
/* Get utilization at dequeue */
dequeued = task_util(p);
/* * Reset EWMA on utilization increases, the moving average is used only * to smooth utilization decreases.
*/ if (ewma <= dequeued) {
ewma = dequeued; goto done;
}
/* * Skip update of task's estimated utilization when its members are * already ~1% close to its last activation value.
*/
last_ewma_diff = ewma - dequeued; if (last_ewma_diff < UTIL_EST_MARGIN) goto done;
/* * To avoid underestimate of task utilization, skip updates of EWMA if * we cannot grant that thread got all CPU time it wanted.
*/ if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p)) goto done;
/* * Update Task's estimated utilization * * When *p completes an activation we can consolidate another sample * of the task size. This is done by using this value to update the * Exponential Weighted Moving Average (EWMA): * * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) * = w * ( -last_ewma_diff ) + ewma(t-1) * = w * (-last_ewma_diff + ewma(t-1) / w) * * Where 'w' is the weight of new samples, which is configured to be * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
*/
ewma <<= UTIL_EST_WEIGHT_SHIFT;
ewma -= last_ewma_diff;
ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
ewma |= UTIL_AVG_UNCHANGED;
WRITE_ONCE(p->se.avg.util_est, ewma);
/* * Check if the real util fits without any uclamp boost/cap applied.
*/
fits = fits_capacity(util, capacity);
if (!uclamp_is_used()) return fits;
/* * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and * uclamp_max. We only care about capacity pressure (by using * capacity_of()) for comparing against the real util. * * If a task is boosted to 1024 for example, we don't want a tiny * pressure to skew the check whether it fits a CPU or not. * * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it * should fit a little cpu even if there's some pressure. * * Only exception is for HW or cpufreq pressure since it has a direct impact * on available OPP of the system. * * We honour it for uclamp_min only as a drop in performance level * could result in not getting the requested minimum performance level. * * For uclamp_max, we can tolerate a drop in performance level as the * goal is to cap the task. So it's okay if it's getting less.
*/
capacity_orig = arch_scale_cpu_capacity(cpu);
/* * We want to force a task to fit a cpu as implied by uclamp_max. * But we do have some corner cases to cater for.. * * * C=z * | ___ * | C=y | | * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max * | C=x | | | | * | ___ | | | | * | | | | | | | (util somewhere in this region) * | | | | | | | * | | | | | | | * +---------------------------------------- * CPU0 CPU1 CPU2 * * In the above example if a task is capped to a specific performance * point, y, then when: * * * util = 80% of x then it does not fit on CPU0 and should migrate * to CPU1 * * util = 80% of y then it is forced to fit on CPU1 to honour * uclamp_max request. * * which is what we're enforcing here. A task always fits if * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig, * the normal upmigration rules should withhold still. * * Only exception is when we are on max capacity, then we need to be * careful not to block overutilized state. This is so because: * * 1. There's no concept of capping at max_capacity! We can't go * beyond this performance level anyway. * 2. The system is being saturated when we're operating near * max capacity, it doesn't make sense to block overutilized.
*/
uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
fits = fits || uclamp_max_fits;
/* * * C=z * | ___ (region a, capped, util >= uclamp_max) * | C=y | | * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max * | C=x | | | | * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max) * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min * | | | | | | | * | | | | | | | (region c, boosted, util < uclamp_min) * +---------------------------------------- * CPU0 CPU1 CPU2 * * a) If util > uclamp_max, then we're capped, we don't care about * actual fitness value here. We only care if uclamp_max fits * capacity without taking margin/pressure into account. * See comment above. * * b) If uclamp_min <= util <= uclamp_max, then the normal * fits_capacity() rules apply. Except we need to ensure that we * enforce we remain within uclamp_max, see comment above. * * c) If util < uclamp_min, then we are boosted. Same as (b) but we * need to take into account the boosted value fits the CPU without * taking margin/pressure into account. * * Cases (a) and (b) are handled in the 'fits' variable already. We * just need to consider an extra check for case (c) after ensuring we * handle the case uclamp_min > uclamp_max.
*/
uclamp_min = min(uclamp_min, uclamp_max); if (fits && (util < uclamp_min) &&
(uclamp_min > get_actual_cpu_capacity(cpu))) return -1;
return fits;
}
staticinlineint task_fits_cpu(struct task_struct *p, int cpu)
{ unsignedlong uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); unsignedlong uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); unsignedlong util = task_util_est(p); /* * Return true only if the cpu fully fits the task requirements, which * include the utilization but also the performance hints.
*/ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
}
staticinlinevoid update_misfit_status(struct task_struct *p, struct rq *rq)
{ int cpu = cpu_of(rq);
if (!sched_asym_cpucap_active()) return;
/* * Affinity allows us to go somewhere higher? Or are we on biggest * available CPU already? Or do we fit into this CPU ?
*/ if (!p || (p->nr_cpus_allowed == 1) ||
(arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
task_fits_cpu(p, cpu)) {
rq->misfit_task_load = 0; return;
}
/* * Make sure that misfit_task_load will not be null even if * task_h_load() returns 0.
*/
rq->misfit_task_load = max_t(unsignedlong, task_h_load(p), 1);
}
staticvoid
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 vslice, vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;
if (!se->custom_slice)
se->slice = sysctl_sched_base_slice;
vslice = calc_delta_fair(se->slice, se);
/* * Due to how V is constructed as the weighted average of entities, * adding tasks with positive lag, or removing tasks with negative lag * will move 'time' backwards, this can screw around with the lag of * other tasks. * * EEVDF: placement strategy #1 / #2
*/ if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { struct sched_entity *curr = cfs_rq->curr; unsignedlong load;
lag = se->vlag;
/* * If we want to place a task and preserve lag, we have to * consider the effect of the new entity on the weighted * average and compensate for this, otherwise lag can quickly * evaporate. * * Lag is defined as: * * lag_i = S - s_i = w_i * (V - v_i) * * To avoid the 'w_i' term all over the place, we only track * the virtual lag: * * vl_i = V - v_i <=> v_i = V - vl_i * * And we take V to be the weighted average of all v: * * V = (\Sum w_j*v_j) / W * * Where W is: \Sum w_j * * Then, the weighted average after adding an entity with lag * vl_i is given by: * * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) * = (W*V + w_i*(V - vl_i)) / (W + w_i) * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) * = (V*(W + w_i) - w_i*vl_i) / (W + w_i) * = V - w_i*vl_i / (W + w_i) * * And the actual lag after adding an entity with vl_i is: * * vl'_i = V' - v_i * = V - w_i*vl_i / (W + w_i) - (V - vl_i) * = vl_i - w_i*vl_i / (W + w_i) * * Which is strictly less than vl_i. So in order to preserve lag * we should inflate the lag before placement such that the * effective lag after placement comes out right. * * As such, invert the above relation for vl'_i to get the vl_i * we need to use such that the lag after placement is the lag * we computed before dequeue. * * vl'_i = vl_i - w_i*vl_i / (W + w_i) * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i) * * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i * = W*vl_i * * vl_i = (W + w_i)*vl'_i / W
*/
load = cfs_rq->avg_load; if (curr && curr->on_rq)
load += scale_load_down(curr->load.weight);
lag *= load + scale_load_down(se->load.weight); if (WARN_ON_ONCE(!load))
load = 1;
lag = div_s64(lag, load);
}
/* * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks * off with half a slice to ease into the competition.
*/ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
vslice /= 2;
/* * If we're the current task, we must renormalise before calling * update_curr().
*/ if (curr)
place_entity(cfs_rq, se, flags);
update_curr(cfs_rq);
/* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new * h_nr_runnable of its group cfs_rq. * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
se_update_runnable(se); /* * XXX update_load_avg() above will have attached us to the pelt sum; * but update_cfs_group() here will re-adjust the weight and have to * undo/redo all that. Seems wasteful.
*/
update_cfs_group(se);
/* * XXX now that the entity has been re-weighted, and it's lag adjusted, * we can place the entity.
*/ if (!curr)
place_entity(cfs_rq, se, flags);
account_entity_enqueue(cfs_rq, se);
/* Entity has migrated, no longer consider this task hot */ if (flags & ENQUEUE_MIGRATED)
se->exec_start = 0;
check_schedstat_required();
update_stats_enqueue_fair(cfs_rq, se, flags); if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq); if (!throttled_hierarchy(cfs_rq)) {
list_add_leaf_cfs_rq(cfs_rq);
} else { #ifdef CONFIG_CFS_BANDWIDTH struct rq *rq = rq_of(cfs_rq);
if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
cfs_rq->throttled_clock = rq_clock(rq); if (!cfs_rq->throttled_clock_self)
cfs_rq->throttled_clock_self = rq_clock(rq); #endif
}
}
}
/* * Delayed se of cfs_rq have no tasks queued on them. * Do not adjust h_nr_runnable since dequeue_entities() * will account it for blocked tasks.
*/ if (!entity_is_task(se)) return;
/* * Delayed se of cfs_rq have no tasks queued on them. * Do not adjust h_nr_runnable since a dequeue has * already accounted for it or an enqueue of a task * below it will account for it in enqueue_task_fair().
*/ if (!entity_is_task(se)) return;
staticbool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{ bool sleep = flags & DEQUEUE_SLEEP; int action = UPDATE_TG;
update_curr(cfs_rq);
clear_buddies(cfs_rq, se);
if (flags & DEQUEUE_DELAYED) {
WARN_ON_ONCE(!se->sched_delayed);
} else { bool delay = sleep; /* * DELAY_DEQUEUE relies on spurious wakeups, special task * states must not suffer spurious wakeups, excempt them.
*/ if (flags & DEQUEUE_SPECIAL)
delay = false;
WARN_ON_ONCE(delay && se->sched_delayed);
if (sched_feat(DELAY_DEQUEUE) && delay &&
!entity_eligible(cfs_rq, se)) {
update_load_avg(cfs_rq, se, 0);
set_delayed(se); returnfalse;
}
}
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
action |= DO_DETACH;
/* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new * h_nr_runnable of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq.
*/
update_load_avg(cfs_rq, se, action);
se_update_runnable(se);
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
update_cfs_group(se);
/* * Now advance min_vruntime if @se was the entity holding it back, * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be * put back on, and if we advance min_vruntime, we'll be placed back * further than we started -- i.e. we'll be penalized.
*/ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
if (cfs_rq->nr_queued == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
/* 'current' is not kept within the tree. */ if (se->on_rq) { /* * Any task has to be enqueued before it get to execute on * a CPU. So account for the time it spent waiting on the * runqueue.
*/
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
/* * Track our maximum slice length, if the CPU's load is at * least twice that of our own weight (i.e. don't track it * when there are only lesser-weight tasks around):
*/ if (schedstat_enabled() &&
rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { struct sched_statistics *stats;
staticint dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
/* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups * 2) pick the "next" process, since someone really wants that to run * 3) pick the "last" process, for cache locality * 4) do not run the "skip" process, if something else is available
*/ staticstruct sched_entity *
pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
{ struct sched_entity *se;
/* * Picking the ->next buddy will affect latency but not fairness.
*/ if (sched_feat(PICK_BUDDY) &&
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { /* ->next will never be delayed */
WARN_ON_ONCE(cfs_rq->next->sched_delayed); return cfs_rq->next;
}
se = pick_eevdf(cfs_rq); if (se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); /* * Must not reference @se again, see __block_task().
*/ return NULL;
} return se;
}
staticvoid put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{ /* * If still on the runqueue then deactivate_task() * was not called and update_curr() has to be done:
*/ if (prev->on_rq)
update_curr(cfs_rq);
if (prev->on_rq) {
update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
WARN_ON_ONCE(cfs_rq->curr != prev);
cfs_rq->curr = NULL;
}
staticvoid
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{ /* * Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
/* * Ensure that runnable average is periodically updated.
*/
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
#ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother * validating it and just reschedule.
*/ if (queued) {
resched_curr_lazy(rq_of(cfs_rq)); return;
} #endif
}
/************************************************** * CFS bandwidth control machinery
*/
/* returns 0 on failure to allocate runtime */ staticint assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); int ret;
raw_spin_lock(&cfs_b->lock);
ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
raw_spin_unlock(&cfs_b->lock);
return ret;
}
staticvoid __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{ /* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
if (likely(cfs_rq->runtime_remaining > 0)) return;
if (cfs_rq->throttled) return; /* * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled
*/ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
resched_curr(rq_of(cfs_rq));
}
/* check whether cfs_rq, or any parent, is throttled */ staticinlineint throttled_hierarchy(struct cfs_rq *cfs_rq)
{ return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
/* * Ensure that neither of the group entities corresponding to src_cpu or * dest_cpu are members of a throttled hierarchy when performing group * load-balance operations.
*/ staticinlineint throttled_lb_pair(struct task_group *tg, int src_cpu, int dest_cpu)
{ struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
/* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) {
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
WARN_ON_ONCE(cfs_rq->throttled_clock_self); if (cfs_rq->nr_queued)
cfs_rq->throttled_clock_self = rq_clock(rq);
}
cfs_rq->throttle_count++;
raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { /* * We have raced with bandwidth becoming available, and if we * actually throttled the timer might not unthrottle us for an * entire period. We additionally needed to make sure that any * subsequent check_cfs_rq_runtime calls agree not to throttle * us, as we may commit to do cfs put_prev+pick_next, so we ask * for 1ns of runtime rather than just check cfs_b.
*/
dequeue = 0;
} else {
list_add_tail_rcu(&cfs_rq->throttled_list,
&cfs_b->throttled_cfs_rq);
}
raw_spin_unlock(&cfs_b->lock);
if (!dequeue) returnfalse; /* Throttle no longer required. */
/* throttled entity or throttle-on-deactivate */ if (!se->on_rq) goto done;
/* * Abuse SPECIAL to avoid delayed dequeue in this instance. * This avoids teaching dequeue_entities() about throttled * entities and keeps things relatively simple.
*/
flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; if (se->sched_delayed)
flags |= DEQUEUE_DELAYED;
dequeue_entity(qcfs_rq, se, flags);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_delta = cfs_rq->h_nr_queued;
/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, queued_delta);
done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
WARN_ON_ONCE(cfs_rq->throttled_clock); if (cfs_rq->nr_queued)
cfs_rq->throttled_clock = rq_clock(rq); returntrue;
}
if (!cfs_rq->load.weight) { if (!cfs_rq->on_list) return; /* * Nothing to run but something to decay (on_list)? * Complete the branch.
*/
for_each_sched_entity(se) { if (list_add_leaf_cfs_rq(cfs_rq_of(se))) break;
} goto unthrottle_throttle;
}
/* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle;
}
/* Start the fair server if un-throttling resulted in new runnable tasks */ if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, queued_delta);
unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_queued)
resched_curr(rq);
}
/* * Iterating over the list can trigger several call to * update_rq_clock() in unthrottle_cfs_rq(). * Do it once and skip the potential next ones.
*/
update_rq_clock(rq);
rq_clock_start_loop_update(rq);
/* * Since we hold rq lock we're safe from concurrent manipulation of * the CSD list. However, this RCU critical section annotates the * fact that we pair with sched_free_group_rcu(), so that we cannot * race with group being freed in the window between removing it * from the list and advancing to the next entry in the list.
*/
rcu_read_lock();
if (rq == this_rq()) {
unthrottle_cfs_rq(cfs_rq); return;
}
/* Already enqueued */ if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list))) return;
first = list_empty(&rq->cfsb_csd_list);
list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); if (first)
smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
}
/* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) { if (cpu_of(rq) != this_cpu) {
unthrottle_cfs_rq_async(cfs_rq);
} else { /* * We currently only expect to be unthrottling * a single cfs_rq locally.
*/
WARN_ON_ONCE(!list_empty(&local_unthrottle));
list_add_tail(&cfs_rq->throttled_csd_list,
&local_unthrottle);
}
} else {
throttled = true;
}
/* * Responsible for refilling a task_group's bandwidth and unthrottling its * cfs_rqs as appropriate. If there has been no activity within the last * period the timer is deactivated until scheduling resumes; cfs_b->idle is * used to track this state.
*/ staticint do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsignedlong flags)
{ int throttled;
/* no need to continue the timer with no bandwidth constraint */ if (cfs_b->quota == RUNTIME_INF) goto out_deactivate;
/* Refill extra burst quota even if cfs_b->idle */
__refill_cfs_bandwidth_runtime(cfs_b);
/* * idle depends on !throttled (for the case of a large deficit), and if * we're going inactive then everything else can be deferred
*/ if (cfs_b->idle && !throttled) goto out_deactivate;
if (!throttled) { /* mark as potentially idle for the upcoming period */
cfs_b->idle = 1; return 0;
}
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
/* * This check is repeated as we release cfs_b->lock while we unthrottle.
*/ while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */
throttled = distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
}
/* * While we are ensured activity in the period following an * unthrottle, this also covers the case in which the new bandwidth is * insufficient to cover the existing bandwidth deficit. (Forcing the * timer to remain active while there are any throttled entities.)
*/
cfs_b->idle = 0;
return 0;
out_deactivate: return 1;
}
/* a cfs_rq won't donate quota below this amount */ staticconst u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; /* minimum remaining period time to redistribute slack quota */ staticconst u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; /* how long we wait to gather additional slack before distributing */ staticconst u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
/* * Are we near the end of the current quota period? * * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the * hrtimer base being cleared by hrtimer_start. In the case of * migrate_hrtimers, base is never cleared, so we are fine.
*/ staticint runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{ struct hrtimer *refresh_timer = &cfs_b->period_timer;
s64 remaining;
/* if the call-back is running a quota refresh is already occurring */ if (hrtimer_callback_running(refresh_timer)) return 1;
/* is a quota refresh about to occur? */
remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); if (remaining < (s64)min_expire) return 1;
/* we know any runtime found here is valid as update_curr() precedes return */ staticvoid __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
if (slack_runtime <= 0) return;
raw_spin_lock(&cfs_b->lock); if (cfs_b->quota != RUNTIME_INF) {
cfs_b->runtime += slack_runtime;
/* we are under rq->lock, defer unthrottling using a timer */ if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
!list_empty(&cfs_b->throttled_cfs_rq))
start_cfs_slack_bandwidth(cfs_b);
}
raw_spin_unlock(&cfs_b->lock);
/* even if it's not valid for return we don't want to try again */
cfs_rq->runtime_remaining -= slack_runtime;
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{ if (!cfs_bandwidth_used()) return;
if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued) return;
__return_cfs_rq_runtime(cfs_rq);
}
/* * This is done with a timer (instead of inline with bandwidth return) since * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
*/ staticvoid do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); unsignedlong flags;
/* confirm we're still not at a refresh boundary */
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->slack_started = false;
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return;
}
/* * When a group wakes up we want to make sure that its quota is not already * expired/exceeded, otherwise it may be allowed to steal additional ticks of * runtime as update_curr() throttling can not trigger until it's on-rq.
*/ staticvoid check_enqueue_throttle(struct cfs_rq *cfs_rq)
{ if (!cfs_bandwidth_used()) return;
/* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return;
/* ensure the group is not already throttled */ if (cfs_rq_throttled(cfs_rq)) return;
/* conditionally throttle active cfs_rq's from put_prev_entity() */ staticbool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{ if (!cfs_bandwidth_used()) returnfalse;
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) returnfalse;
/* * it's possible for a throttled entity to be forced into a running * state (e.g. set_curr_task), in this case we're finished.
*/ if (cfs_rq_throttled(cfs_rq)) returntrue;
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
/* * Grow period by a factor of 2 to avoid losing precision. * Precision loss in the quota/period ratio can cause __cfs_schedulable * to fail.
*/ new = old * 2; if (new < max_bw_quota_period_us * NSEC_PER_USEC) {
cfs_b->period = ns_to_ktime(new);
cfs_b->quota *= 2;
cfs_b->burst *= 2;
pr_warn_ratelimited( "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
smp_processor_id(),
div_u64(new, NSEC_PER_USEC),
div_u64(cfs_b->quota, NSEC_PER_USEC));
} else {
pr_warn_ratelimited( "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
smp_processor_id(),
div_u64(old, NSEC_PER_USEC),
div_u64(cfs_b->quota, NSEC_PER_USEC));
}
/* reset count so we don't come right back in here */
count = 0;
}
} if (idle)
cfs_b->period_active = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* Add a random offset so that timers interleave */
hrtimer_set_expires(&cfs_b->period_timer,
get_random_u32_below(cfs_b->period));
hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL);
cfs_b->slack_started = false;
}
/* * It is possible that we still have some cfs_rq's pending on a CSD * list, though this race is very rare. In order for this to occur, we * must have raced with the last task leaving the group while there * exist throttled cfs_rq(s), and the period_timer must have queued the * CSD item but the remote cpu has not yet processed it. To handle this, * we can simply flush all pending CSD work inline here. We're * guaranteed at this point that no additional cfs_rq of this group can * join a CSD list.
*/
for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); unsignedlong flags;
/* * Both these CPU hotplug callbacks race against unregister_fair_sched_group() * * The race is harmless, since modifying bandwidth settings of unhooked group * bits doesn't do much.
*/
// Do not unthrottle for an active CPU if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask)) return;
/* * The rq clock has already been updated in the * set_rq_offline(), so we should skip updating * the rq clock again in unthrottle_cfs_rq().
*/
rq_clock_start_loop_update(rq);
/* * Offline rq is schedulable till CPU is completely disabled * in take_cpu_down(), so we prevent new cfs throttling here.
*/
cfs_rq->runtime_enabled = 0;
if (!cfs_rq_throttled(cfs_rq)) continue;
/* * clock_task is not advancing so we just need to make sure * there's some valid quota amount
*/
cfs_rq->runtime_remaining = 1;
unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
if (cfs_rq->runtime_enabled ||
tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF) returntrue;
returnfalse;
}
#ifdef CONFIG_NO_HZ_FULL /* called from pick_next_task_fair() */ staticvoid sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
{ int cpu = cpu_of(rq);
if (!cfs_bandwidth_used()) return;
if (!tick_nohz_full_cpu(cpu)) return;
if (rq->nr_running != 1) return;
/* * We know there is only one task runnable and we've just picked it. The * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will * be otherwise able to stop the tick. Just need to check if we are using * bandwidth control.
*/ if (cfs_task_bw_constrained(p))
tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
} #endif/* CONFIG_NO_HZ_FULL */
if (delta < 0) { if (task_current_donor(rq, p))
resched_curr(rq); return;
}
hrtick_start(rq, delta);
}
}
/* * called from enqueue/dequeue and updates the hrtick when the * current task is from our class and nr_running is low enough * to matter.
*/ staticvoid hrtick_update(struct rq *rq)
{ struct task_struct *donor = rq->donor;
if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class) return;
/* Return true only if the utilization doesn't fit CPU's capacity */ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
/* * overutilized value make sense only if EAS is enabled
*/ staticinlinebool is_rd_overutilized(struct root_domain *rd)
{ return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
}
staticinlinevoid set_rd_overutilized(struct root_domain *rd, bool flag)
{ if (!sched_energy_enabled()) return;
staticinlinevoid check_update_overutilized_status(struct rq *rq)
{ /* * overutilized field is used for load balancing decisions only * if energy aware scheduler is being used
*/
if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
set_rd_overutilized(rq->rd, 1);
}
/* Runqueue only has SCHED_IDLE tasks enqueued */ staticint sched_idle_rq(struct rq *rq)
{ return unlikely(rq->nr_running == rq->cfs.h_nr_idle &&
rq->nr_running);
}
/* * se->sched_delayed should imply: se->on_rq == 1. * Because a delayed entity is one that is still on * the runqueue competing until elegibility.
*/
WARN_ON_ONCE(!se->sched_delayed);
WARN_ON_ONCE(!se->on_rq);
if (sched_feat(DELAY_ZERO)) {
update_entity_lag(cfs_rq, se); if (se->vlag > 0) {
cfs_rq->nr_queued--; if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->vlag = 0;
place_entity(cfs_rq, se, 0); if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
cfs_rq->nr_queued++;
}
}
update_load_avg(cfs_rq, se, 0);
clear_delayed(se);
}
/* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and * then put the task into the rbtree:
*/ staticvoid
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{ struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int h_nr_idle = task_has_idle_policy(p); int h_nr_runnable = 1; int task_new = !(flags & ENQUEUE_WAKEUP); int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
/* * The code below (indirectly) updates schedutil which looks at * the cfs_rq utilization to select a frequency. * Let's add the task's estimated utilization to the cfs_rq's * estimated utilization, before we update schedutil.
*/ if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED))
util_est_enqueue(&rq->cfs, p);
if (flags & ENQUEUE_DELAYED) {
requeue_delayed_entity(se); return;
}
/* * If in_iowait is set, the code below may not trigger any cpufreq * utilization updates, so do it here explicitly with the IOWAIT flag * passed.
*/ if (p->in_iowait)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
if (task_new && se->sched_delayed)
h_nr_runnable = 0;
for_each_sched_entity(se) { if (se->on_rq) { if (se->sched_delayed)
requeue_delayed_entity(se); break;
}
cfs_rq = cfs_rq_of(se);
/* * Basically set the slice of group entries to the min_slice of * their respective cfs_rq. This ensures the group can service * its entities in the desired time-frame.
*/ if (slice) {
se->slice = slice;
se->custom_slice = 1;
}
enqueue_entity(cfs_rq, se, flags);
slice = cfs_rq_min_slice(cfs_rq);
/* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle;
}
if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { /* Account for idle runtime */ if (!rq->nr_running)
dl_server_update_idle_time(rq, rq->curr);
dl_server_start(&rq->fair_server);
}
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, 1);
/* * Since new tasks are assigned an initial util_avg equal to * half of the spare capacity of their CPU, tiny tasks have the * ability to cross the overutilized threshold, which will * result in the load balancer ruining all the task placement * done by EAS. As a way to mitigate that effect, do not account * for the first enqueue operation of new tasks during the * overutilized flag detection. * * A better way of solving this problem would be to wait for * the PELT signals of tasks to converge before taking them * into account, but that is not straightforward to implement, * and the following generally works well enough in practice.
*/ if (!task_new)
check_update_overutilized_status(rq);
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
/* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto out;
/* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) {
slice = cfs_rq_min_slice(cfs_rq);
/* Avoid re-evaluating load for this entity: */
se = parent_entity(se); /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice.
*/ if (task_sleep && se && !throttled_hierarchy(cfs_rq))
set_next_buddy(se); break;
}
flags |= DEQUEUE_SLEEP;
flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
}
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
/* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto out;
}
sub_nr_running(rq, h_nr_queued);
/* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
ret = 1;
out: if (p && task_delayed) {
WARN_ON_ONCE(!task_sleep);
WARN_ON_ONCE(p->on_rq != 1);
/* Fix-up what dequeue_task_fair() skipped */
hrtick_update(rq);
/* * Fix-up what block_task() skipped. * * Must be last, @p might not be valid after this.
*/
__block_task(rq, p);
}
return ret;
}
/* * The dequeue_task method is called before nr_running is * decreased. We remove the task from the rbtree and * update the fair scheduling stats:
*/ staticbool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{ if (!p->se.sched_delayed)
util_est_dequeue(&rq->cfs, p);
staticstruct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus; int has_blocked; /* Idle CPUS has blocked load */ int needs_update; /* Newly idle CPUs need their next_balance collated */ unsignedlong next_balance; /* in jiffy units */ unsignedlong next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
/* * cpu_load_without - compute CPU load without any contributions from *p * @cpu: the CPU which load is requested * @p: the task which load should be discounted * * The load of a CPU is defined by the load of tasks currently enqueued on that * CPU as well as tasks which are currently sleeping after an execution on that * CPU. * * This method returns the load of the specified CPU by discounting the load of * the specified task, whenever the task is currently contributing to the CPU * load.
*/ staticunsignedlong cpu_load_without(struct rq *rq, struct task_struct *p)
{ struct cfs_rq *cfs_rq; unsignedint load;
/* Task has no contribution or is new */ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) return cpu_load(rq);
staticvoid record_wakee(struct task_struct *p)
{ /* * Only decay a single time; tasks that have less then 1 wakeup per * jiffy will not have built up many flips.
*/ if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
current->wakee_flips >>= 1;
current->wakee_flip_decay_ts = jiffies;
}
/* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * * A waker of many should wake a different task than the one last awakened * at a frequency roughly N times higher than one of its wakees. * * In order to determine whether we should let the load spread vs consolidating * to shared cache, we look for a minimum 'flip' frequency of llc_size in one * partner, and a factor of lls_size higher frequency in the other. * * With both conditions met, we can be relatively sure that the relationship is * non-monogamous, with partner count exceeding socket size. * * Waker/wakee being client/server, worker/dispatcher, interrupt source or * whatever is irrelevant, spread criteria is apparent partner count exceeds * socket size.
*/ staticint wake_wide(struct task_struct *p)
{ unsignedint master = current->wakee_flips; unsignedint slave = p->wakee_flips; int factor = __this_cpu_read(sd_llc_size);
/* * The purpose of wake_affine() is to quickly determine on which CPU we can run * soonest. For the purpose of speed we only consider the waking and previous * CPU. * * wake_affine_idle() - only considers 'now', it check if the waking CPU is * cache-affine and is (or will be) idle. * * wake_affine_weight() - considers the weight to reflect the average * scheduling latency of the CPUs. This seems to work * for the overloaded case.
*/ staticint
wake_affine_idle(int this_cpu, int prev_cpu, int sync)
{ /* * If this_cpu is idle, it implies the wakeup is from interrupt * context. Only allow the move if cache is shared. Otherwise an * interrupt intensive workload could force all tasks onto one * node depending on the IO topology or IRQ affinity settings. * * If the prev_cpu is idle and cache affine then avoid a migration. * There is no guarantee that the cache hot data from an interrupt * is more important than cache hot data on the prev_cpu and from * a cpufreq perspective, it's better to have higher utilisation * on one CPU.
*/ if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
if (sync) { struct rq *rq = cpu_rq(this_cpu);
if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1) return this_cpu;
}
if (available_idle_cpu(prev_cpu)) return prev_cpu;
return nr_cpumask_bits;
}
staticint
wake_affine_weight(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync)
{
s64 this_eff_load, prev_eff_load; unsignedlong task_load;
this_eff_load = cpu_load(cpu_rq(this_cpu));
if (sync) { unsignedlong current_load = task_h_load(current);
if (current_load > this_eff_load) return this_cpu;
/* * If sync, adjust the weight of prev_eff_load such that if * prev_eff == this_eff that select_idle_sibling() will consider * stacking the wakee on top of the waker if no other CPU is * idle.
*/ if (sync)
prev_eff_load += 1;
/* * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group.
*/ staticint
sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, intthis_cpu)
{ unsignedlong load, min_load = ULONG_MAX; unsignedint min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0; int least_loaded_cpu = this_cpu; int shallowest_idle_cpu = -1; int i;
/* Check if we have any choice: */ if (group->group_weight == 1) return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { struct rq *rq = cpu_rq(i);
if (!sched_core_cookie_match(rq, p)) continue;
if (sched_idle_cpu(i)) return i;
if (available_idle_cpu(i)) { struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { /* * We give priority to a CPU whose idle state * has the smallest exit latency irrespective * of any idle timestamp.
*/
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
} elseif ((!idle || idle->exit_latency == min_exit_latency) &&
rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then * the most recently idled CPU might have * a warmer cache.
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
} elseif (shallowest_idle_cpu == -1) {
load = cpu_load(cpu_rq(i)); if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
}
}
}
staticinlineint sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag)
{ int new_cpu = cpu;
if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) return prev_cpu;
/* * We need task's util for cpu_util_without, sync it up to * prev_cpu's last_update_time.
*/ if (!(sd_flag & SD_BALANCE_FORK))
sync_entity_load_avg(&p->se);
while (sd) { struct sched_group *group; struct sched_domain *tmp; int weight;
if (!(sd->flags & sd_flag)) {
sd = sd->child; continue;
}
group = sched_balance_find_dst_group(sd, p, cpu); if (!group) {
sd = sd->child; continue;
}
new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu); if (new_cpu == cpu) { /* Now try balancing at a lower domain level of 'cpu': */
sd = sd->child; continue;
}
/* Now try balancing at a lower domain level of 'new_cpu': */
cpu = new_cpu;
weight = sd->span_weight;
sd = NULL;
for_each_domain(cpu, tmp) { if (weight <= tmp->span_weight) break; if (tmp->flags & sd_flag)
sd = tmp;
}
}
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); if (sds) return READ_ONCE(sds->has_idle_cores);
returnfalse;
}
/* * Scans the local SMT mask to see if the entire core is idle, and records this * information in sd_llc_shared->has_idle_cores. * * Since SMT siblings share all cache levels, inspecting this limited remote * state should be fairly cheap.
*/ void __update_idle_core(struct rq *rq)
{ int core = cpu_of(rq); int cpu;
rcu_read_lock(); if (test_idle_cores(core)) goto unlock;
for_each_cpu(cpu, cpu_smt_mask(core)) { if (cpu == core) continue;
/* * Scan the entire LLC domain for idle cores; this dynamically switches off if * there are no idle cores left in the system; tracked through * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
*/ staticint select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{ bool idle = true; int cpu;
for_each_cpu(cpu, cpu_smt_mask(core)) { if (!available_idle_cpu(cpu)) {
idle = false; if (*idle_cpu == -1) { if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
*idle_cpu = cpu; break;
} continue;
} break;
} if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus))
*idle_cpu = cpu;
}
/* * Scan the local SMT mask for idle CPUs.
*/ staticint select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{ int cpu;
for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { if (cpu == target) continue; /* * Check if the CPU is in the LLC scheduling domain of @target. * Due to isolcpus, there is no guarantee that all the siblings are in the domain.
*/ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu;
}
return -1;
}
#else/* !CONFIG_SCHED_SMT: */
staticinlinevoid set_idle_cores(int cpu, int val)
{
}
/* * Scan the LLC domain for idle CPUs; this is dynamically regulated by * comparing the average scan cost (tracked in sd->avg_scan_cost) against the * average idle time for this rq (as found in rq->avg_idle).
*/ staticint select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
{ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; struct sched_domain_shared *sd_share;
if (sched_feat(SIS_UTIL)) {
sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); if (sd_share) { /* because !--nr is the condition to stop scan */
nr = READ_ONCE(sd_share->nr_idle_scan) + 1; /* overloaded LLC is unlikely to have idle cpu/core */ if (nr == 1) return -1;
}
}
if (static_branch_unlikely(&sched_cluster_active)) { struct sched_group *sg = sd->groups;
if (sg->flags & SD_CLUSTER) {
for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) { if (!cpumask_test_cpu(cpu, cpus)) continue;
if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsignedint)i < nr_cpumask_bits) return i;
} else { if (--nr <= 0) return -1;
idle_cpu = __select_idle_cpu(cpu, p); if ((unsignedint)idle_cpu < nr_cpumask_bits) return idle_cpu;
}
}
cpumask_andnot(cpus, cpus, sched_group_span(sg));
}
}
for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsignedint)i < nr_cpumask_bits) return i;
} else { if (--nr <= 0) return -1;
idle_cpu = __select_idle_cpu(cpu, p); if ((unsignedint)idle_cpu < nr_cpumask_bits) break;
}
}
if (has_idle_core)
set_idle_cores(target, false);
return idle_cpu;
}
/* * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which * the task fits. If no CPU is big enough, but there are idle ones, try to * maximize capacity.
*/ staticint
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{ unsignedlong task_util, util_min, util_max, best_cap = 0; int fits, best_fits = 0; int cpu, best_cpu = -1; struct cpumask *cpus;
/* This CPU fits with all requirements */ if (fits > 0) return cpu; /* * Only the min performance hint (i.e. uclamp_min) doesn't fit. * Look for the CPU with best capacity.
*/ elseif (fits < 0)
cpu_cap = get_actual_cpu_capacity(cpu);
/* * First, select CPU which fits better (-1 being better than 0). * Then, select the one with best capacity at same level.
*/ if ((fits < best_fits) ||
((fits == best_fits) && (cpu_cap > best_cap))) {
best_cap = cpu_cap;
best_cpu = cpu;
best_fits = fits;
}
}
return best_cpu;
}
staticinlinebool asym_fits_cpu(unsignedlong util, unsignedlong util_min, unsignedlong util_max, int cpu)
{ if (sched_asym_cpucap_active()) /* * Return true only if the cpu fully fits the task requirements * which include the utilization and the performance hints.
*/ return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
returntrue;
}
/* * Try and locate an idle core/thread in the LLC cache domain.
*/ staticint select_idle_sibling(struct task_struct *p, int prev, int target)
{ bool has_idle_core = false; struct sched_domain *sd; unsignedlong task_util, util_min, util_max; int i, recent_used_cpu, prev_aff = -1;
/* * On asymmetric system, update task utilization because we will check * that the task fits with CPU's capacity.
*/ if (sched_asym_cpucap_active()) {
sync_entity_load_avg(&p->se);
task_util = task_util_est(p);
util_min = uclamp_eff_value(p, UCLAMP_MIN);
util_max = uclamp_eff_value(p, UCLAMP_MAX);
}
/* * If the previous CPU is cache affine and idle, don't be stupid:
*/ if (prev != target && cpus_share_cache(prev, target) &&
(available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
asym_fits_cpu(task_util, util_min, util_max, prev)) {
if (!static_branch_unlikely(&sched_cluster_active) ||
cpus_share_resources(prev, target)) return prev;
prev_aff = prev;
}
/* * Allow a per-cpu kthread to stack with the wakee if the * kworker thread and the tasks previous CPUs are the same. * The assumption is that the wakee queued work for the * per-cpu kthread that is now complete and the wakeup is * essentially a sync wakeup. An obvious example of this * pattern is IO completions.
*/ if (is_per_cpu_kthread(current) &&
in_task() &&
prev == smp_processor_id() &&
this_rq()->nr_running <= 1 &&
asym_fits_cpu(task_util, util_min, util_max, prev)) { return prev;
}
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
p->recent_used_cpu = prev; if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
if (!static_branch_unlikely(&sched_cluster_active) ||
cpus_share_resources(recent_used_cpu, target)) return recent_used_cpu;
} else {
recent_used_cpu = -1;
}
/* * For asymmetric CPU capacity systems, our domain of interest is * sd_asym_cpucapacity rather than sd_llc.
*/ if (sched_asym_cpucap_active()) {
sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); /* * On an asymmetric CPU capacity system where an exclusive * cpuset defines a symmetric island (i.e. one unique * capacity_orig value through the cpuset), the key will be set * but the CPUs within that cpuset will not have a domain with * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric * capacity path.
*/ if (sd) {
i = select_idle_capacity(p, sd, target); return ((unsigned)i < nr_cpumask_bits) ? i : target;
}
}
sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target;
if (sched_smt_active()) {
has_idle_core = test_idle_cores(target);
if (!has_idle_core && cpus_share_cache(prev, target)) {
i = select_idle_smt(p, sd, prev); if ((unsignedint)i < nr_cpumask_bits) return i;
}
}
i = select_idle_cpu(p, sd, has_idle_core, target); if ((unsigned)i < nr_cpumask_bits) return i;
/* * For cluster machines which have lower sharing cache like L2 or * LLC Tag, we tend to find an idle CPU in the target's cluster * first. But prev_cpu or recent_used_cpu may also be a good candidate, * use them if possible when no idle CPU found in select_idle_cpu().
*/ if ((unsignedint)prev_aff < nr_cpumask_bits) return prev_aff; if ((unsignedint)recent_used_cpu < nr_cpumask_bits) return recent_used_cpu;
return target;
}
/** * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks. * @cpu: the CPU to get the utilization for * @p: task for which the CPU utilization should be predicted or NULL * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL * @boost: 1 to enable boosting, otherwise 0 * * The unit of the return value must be the same as the one of CPU capacity * so that CPU utilization can be compared with CPU capacity. * * CPU utilization is the sum of running time of runnable tasks plus the * recent utilization of currently non-runnable tasks on that CPU. * It represents the amount of CPU capacity currently used by CFS tasks in * the range [0..max CPU capacity] with max CPU capacity being the CPU * capacity at f_max. * * The estimated CPU utilization is defined as the maximum between CPU * utilization and sum of the estimated utilization of the currently * runnable tasks on that CPU. It preserves a utilization "snapshot" of * previously-executed tasks, which helps better deduce how busy a CPU will * be when a long-sleeping task wakes up. The contribution to CPU utilization * of such a task would be significantly decayed at this point of time. * * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization). * CPU contention for CFS tasks can be detected by CPU runnable > CPU * utilization. Boosting is implemented in cpu_util() so that internal * users (e.g. EAS) can use it next to external users (e.g. schedutil), * latter via cpu_util_cfs_boost(). * * CPU utilization can be higher than the current CPU capacity * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because * of rounding errors as well as task migrations or wakeups of new tasks. * CPU utilization has to be capped to fit into the [0..max CPU capacity] * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) * could be seen as over-utilized even though CPU1 has 20% of spare CPU * capacity. CPU utilization is allowed to overshoot current CPU capacity * though since this is useful for predicting the CPU capacity required * after task migrations (scheduler-driven DVFS). * * Return: (Boosted) (estimated) utilization for the specified CPU.
*/ staticunsignedlong
cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
{ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; unsignedlong util = READ_ONCE(cfs_rq->avg.util_avg); unsignedlong runnable;
/* * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its * contribution. If @p migrates from another CPU to @cpu add its * contribution. In all the other cases @cpu is not impacted by the * migration so its util_avg is already correct.
*/ if (p && task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util, task_util(p)); elseif (p && task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) { unsignedlong util_est;
util_est = READ_ONCE(cfs_rq->avg.util_est);
/* * During wake-up @p isn't enqueued yet and doesn't contribute * to any cpu_rq(cpu)->cfs.avg.util_est. * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p * has been enqueued. * * During exec (@dst_cpu = -1) @p is enqueued and does * contribute to cpu_rq(cpu)->cfs.util_est. * Remove it to "simulate" cpu_util without @p's contribution. * * Despite the task_on_rq_queued(@p) check there is still a * small window for a possible race when an exec * select_task_rq_fair() races with LB's detach_task(). * * detach_task() * deactivate_task() * p->on_rq = TASK_ON_RQ_MIGRATING; * -------------------------------- A * dequeue_task() \ * dequeue_task_fair() + Race Time * util_est_dequeue() / * -------------------------------- B * * The additional check "current == p" is required to further * reduce the race window.
*/ if (dst_cpu == cpu)
util_est += _task_util_est(p); elseif (p && unlikely(task_on_rq_queued(p) || current == p))
lsub_positive(&util_est, _task_util_est(p));
/* * cpu_util_without: compute cpu utilization without any contributions from *p * @cpu: the CPU which utilization is requested * @p: the task which utilization should be discounted * * The utilization of a CPU is defined by the utilization of tasks currently * enqueued on that CPU as well as tasks which are currently sleeping after an * execution on that CPU. * * This method returns the utilization of the specified CPU by discounting the * utilization of the specified task, whenever the task is currently * contributing to the CPU utilization.
*/ staticunsignedlong cpu_util_without(int cpu, struct task_struct *p)
{ /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
p = NULL;
return cpu_util(cpu, p, -1, 0);
}
/* * This function computes an effective utilization for the given CPU, to be * used for frequency selection given the linear relation: f = u * f_max. * * The scheduler tracks the following metrics: * * cpu_util_{cfs,rt,dl,irq}() * cpu_bw_dl() * * Where the cfs,rt and dl util numbers are tracked with the same metric and * synchronized windows and are thus directly comparable. * * The cfs,rt,dl utilization are the running times measured with rq->clock_task * which excludes things like IRQ and steal-time. These latter are then accrued * in the IRQ utilization. * * The DL bandwidth number OTOH is not a measured metric but a value computed * based on the task model parameters and gives the minimal utilization * required to meet deadlines.
*/ unsignedlong effective_cpu_util(int cpu, unsignedlong util_cfs, unsignedlong *min, unsignedlong *max)
{ unsignedlong util, irq, scale; struct rq *rq = cpu_rq(cpu);
scale = arch_scale_cpu_capacity(cpu);
/* * Early check to see if IRQ/steal time saturates the CPU, can be * because of inaccuracies in how we track these -- see * update_irq_load_avg().
*/
irq = cpu_util_irq(rq); if (unlikely(irq >= scale)) { if (min)
*min = scale; if (max)
*max = scale; return scale;
}
if (min) { /* * The minimum utilization returns the highest level between: * - the computed DL bandwidth needed with the IRQ pressure which * steals time to the deadline task. * - The minimum performance requirement for CFS and/or RT.
*/
*min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
/* * When an RT task is runnable and uclamp is not used, we must * ensure that the task will run at maximum compute capacity.
*/ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
*min = max(*min, scale);
}
/* * Because the time spend on RT/DL tasks is visible as 'lost' time to * CFS tasks and we use the same metric to track the effective * utilization (PELT windows are synchronized) we can directly add them * to obtain the CPU's actual utilization.
*/
util = util_cfs + cpu_util_rt(rq);
util += cpu_util_dl(rq);
/* * The maximum hint is a soft bandwidth requirement, which can be lower * than the actual utilization because of uclamp_max requirements.
*/ if (max)
*max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
if (util >= scale) return scale;
/* * There is still idle time; further improve the number by using the * IRQ metric. Because IRQ/steal time is hidden from the task clock we * need to scale the task numbers: * * max - irq * U' = irq + --------- * U * max
*/
util = scale_irq_capacity(util, irq, scale);
util += irq;
/* * energy_env - Utilization landscape for energy estimation. * @task_busy_time: Utilization contribution by the task for which we test the * placement. Given by eenv_task_busy_time(). * @pd_busy_time: Utilization of the whole perf domain without the task * contribution. Given by eenv_pd_busy_time(). * @cpu_cap: Maximum CPU capacity for the perf domain. * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
*/ struct energy_env { unsignedlong task_busy_time; unsignedlong pd_busy_time; unsignedlong cpu_cap; unsignedlong pd_cap;
};
/* * Compute the task busy time for compute_energy(). This time cannot be * injected directly into effective_cpu_util() because of the IRQ scaling. * The latter only makes sense with the most recent CPUs where the task has * run.
*/ staticinlinevoid eenv_task_busy_time(struct energy_env *eenv, struct task_struct *p, int prev_cpu)
{ unsignedlong busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu); unsignedlong irq = cpu_util_irq(cpu_rq(prev_cpu));
/* * Compute the perf_domain (PD) busy time for compute_energy(). Based on the * utilization for each @pd_cpus, it however doesn't take into account * clamping since the ratio (utilization / cpu_capacity) is already enough to * scale the EM reported power consumption at the (eventually clamped) * cpu_capacity. * * The contribution of the task @p for which we want to estimate the * energy cost is removed (by cpu_util()) and must be calculated * separately (see eenv_task_busy_time). This ensures: * * - A stable PD utilization, no matter which CPU of that PD we want to place * the task on. * * - A fair comparison between CPUs as the task contribution (task_util()) * will always be the same no matter which CPU utilization we rely on * (util_avg or util_est). * * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't * exceed @eenv->pd_cap.
*/ staticinlinevoid eenv_pd_busy_time(struct energy_env *eenv, struct cpumask *pd_cpus, struct task_struct *p)
{ unsignedlong busy_time = 0; int cpu;
/* * Compute the maximum utilization for compute_energy() when the task @p * is placed on the cpu @dst_cpu. * * Returns the maximum utilization among @eenv->cpus. This utilization can't * exceed @eenv->cpu_cap.
*/ staticinlineunsignedlong
eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
{ unsignedlong max_util = 0; int cpu;
/* * Performance domain frequency: utilization clamping * must be considered since it affects the selection * of the performance domain frequency. * NOTE: in case RT tasks are running, by default the min * utilization can be max OPP.
*/
eff_util = effective_cpu_util(cpu, util, &min, &max);
/* Task's uclamp can modify min and max value */ if (tsk && uclamp_is_used()) {
min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
/* * If there is no active max uclamp constraint, * directly use task's one, otherwise keep max.
*/ if (uclamp_rq_is_idle(cpu_rq(cpu)))
max = uclamp_eff_value(p, UCLAMP_MAX); else
max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
}
/* * compute_energy(): Use the Energy Model to estimate the energy that @pd would * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task * contribution is ignored.
*/ staticinlineunsignedlong
compute_energy(struct energy_env *eenv, struct perf_domain *pd, struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
{ unsignedlong max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); unsignedlong busy_time = eenv->pd_busy_time; unsignedlong energy;
if (dst_cpu >= 0)
busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
/* * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the * waking task. find_energy_efficient_cpu() looks for the CPU with maximum * spare capacity in each performance domain and uses it as a potential * candidate to execute the task. Then, it uses the Energy Model to figure * out which of the CPU candidates is the most energy-efficient. * * The rationale for this heuristic is as follows. In a performance domain, * all the most energy efficient CPU candidates (according to the Energy * Model) are those for which we'll request a low frequency. When there are * several CPUs for which the frequency request will be the same, we don't * have enough data to break the tie between them, because the Energy Model * only includes active power costs. With this model, if we assume that * frequency requests follow utilization (e.g. using schedutil), the CPU with * the maximum spare capacity in a performance domain is guaranteed to be among * the best candidates of the performance domain. * * In practice, it could be preferable from an energy standpoint to pack * small tasks on a CPU in order to let other CPUs go in deeper idle states, * but that could also hurt our chances to go cluster idle, and we have no * ways to tell with the current Energy Model if this is actually a good * idea or not. So, find_energy_efficient_cpu() basically favors * cluster-packing, and spreading inside a cluster. That should at least be * a good thing for latency, and this is consistent with the idea that most * of the energy savings of EAS come from the asymmetry of the system, and * not so much from breaking the tie between identical CPUs. That's also the * reason why EAS is enabled in the topology code only for systems where * SD_ASYM_CPUCAPACITY is set. * * NOTE: Forkees are not accepted in the energy-aware wake-up path because * they don't have any useful utilization data yet and it's not possible to * forecast their impact on energy consumption. Consequently, they will be * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out * to be energy-inefficient in some use-cases. The alternative would be to * bias new tasks towards specific types of CPUs first, or to try to infer * their util_avg from the parent task, but those heuristics could hurt * other use-cases too. So, until someone finds a better way to solve this, * let's keep things simple by re-using the existing slow path.
*/ staticint find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); unsignedlong prev_delta = ULONG_MAX, best_delta = ULONG_MAX; unsignedlong p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0; unsignedlong p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024; struct root_domain *rd = this_rq()->rd; int cpu, best_energy_cpu, target = -1; int prev_fits = -1, best_fits = -1; unsignedlong best_actual_cap = 0; unsignedlong prev_actual_cap = 0; struct sched_domain *sd; struct perf_domain *pd; struct energy_env eenv;
rcu_read_lock();
pd = rcu_dereference(rd->pd); if (!pd) goto unlock;
/* * Energy-aware wake-up happens on the lowest sched_domain starting * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
*/
sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent; if (!sd) goto unlock;
target = prev_cpu;
sync_entity_load_avg(&p->se); if (!task_util_est(p) && p_util_min == 0) goto unlock;
eenv_task_busy_time(&eenv, p, prev_cpu);
for (; pd; pd = pd->next) { unsignedlong util_min = p_util_min, util_max = p_util_max; unsignedlong cpu_cap, cpu_actual_cap, util; long prev_spare_cap = -1, max_spare_cap = -1; unsignedlong rq_util_min, rq_util_max; unsignedlong cur_delta, base_energy; int max_spare_cap_cpu = -1; int fits, max_fits = -1;
/* * Skip CPUs that cannot satisfy the capacity request. * IOW, placing the task there would make the CPU * overutilized. Take uclamp into account to see how * much capacity we can get out of the CPU; this is * aligned with sched_cpu_util().
*/ if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { /* * Open code uclamp_rq_util_with() except for * the clamp() part. I.e.: apply max aggregation * only. util_fits_cpu() logic requires to * operate on non clamped util but must use the * max-aggregated uclamp_{min, max}.
*/
rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
fits = util_fits_cpu(util, util_min, util_max, cpu); if (!fits) continue;
lsub_positive(&cpu_cap, util);
if (cpu == prev_cpu) { /* Always use prev_cpu as a candidate. */
prev_spare_cap = cpu_cap;
prev_fits = fits;
} elseif ((fits > max_fits) ||
((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance * domain.
*/
max_spare_cap = cpu_cap;
max_spare_cap_cpu = cpu;
max_fits = fits;
}
}
if (max_spare_cap_cpu < 0 && prev_spare_cap < 0) continue;
eenv_pd_busy_time(&eenv, cpus, p); /* Compute the 'base' energy of the pd, without @p */
base_energy = compute_energy(&eenv, pd, cpus, p, -1);
/* Evaluate the energy impact of using prev_cpu. */ if (prev_spare_cap > -1) {
prev_delta = compute_energy(&eenv, pd, cpus, p,
prev_cpu); /* CPU utilization has changed */ if (prev_delta < base_energy) goto unlock;
prev_delta -= base_energy;
prev_actual_cap = cpu_actual_cap;
best_delta = min(best_delta, prev_delta);
}
/* Evaluate the energy impact of using max_spare_cap_cpu. */ if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { /* Current best energy cpu fits better */ if (max_fits < best_fits) continue;
/* * Both don't fit performance hint (i.e. uclamp_min) * but best energy cpu has better capacity.
*/ if ((max_fits < 0) &&
(cpu_actual_cap <= best_actual_cap)) continue;
cur_delta = compute_energy(&eenv, pd, cpus, p,
max_spare_cap_cpu); /* CPU utilization has changed */ if (cur_delta < base_energy) goto unlock;
cur_delta -= base_energy;
/* * Both fit for the task but best energy cpu has lower * energy impact.
*/ if ((max_fits > 0) && (best_fits > 0) &&
(cur_delta >= best_delta)) continue;
/* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * * Balances load by selecting the idlest CPU in the idlest group, or under * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. * * Returns the target CPU number.
*/ staticint
select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
{ int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; /* SD_flags and WF_flags share the first nibble */ int sd_flag = wake_flags & 0xF;
/* * required for stable ->cpus_allowed
*/
lockdep_assert_held(&p->pi_lock); if (wake_flags & WF_TTWU) {
record_wakee(p);
if ((wake_flags & WF_CURRENT_CPU) &&
cpumask_test_cpu(cpu, p->cpus_ptr)) return cpu;
if (!is_rd_overutilized(this_rq()->rd)) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) return new_cpu;
new_cpu = prev_cpu;
}
rcu_read_lock();
for_each_domain(cpu, tmp) { /* * If both 'cpu' and 'prev_cpu' are part of this domain, * cpu is a valid SD_WAKE_AFFINE target.
*/ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { if (cpu != prev_cpu)
new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
/* * Usually only true for WF_EXEC and WF_FORK, as sched_domains * usually do not have SD_BALANCE_WAKE set. That means wakeup * will usually go to the fast path.
*/ if (tmp->flags & sd_flag)
sd = tmp; elseif (!want_affine) break;
}
/* * Called immediately before a task is migrated to a new CPU; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/ staticvoid migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{ struct sched_entity *se = &p->se;
if (!task_on_rq_migrating(p)) {
remove_entity_load_avg(se);
/* * Here, the task's PELT values have been updated according to * the current rq's clock. But if that clock hasn't been * updated in a while, a substantial idle time will be missed, * leading to an inflation after wake-up on the new rq. * * Estimate the missing time from the cfs_rq last_update_time * and update sched_avg to improve the PELT continuity after * migration.
*/
migrate_se_pelt_lag(se);
}
/* Tell new CPU we are migrated */
se->avg.last_update_time = 0;
if (se->sched_delayed) { struct rq_flags rf; struct rq *rq;
rq = task_rq_lock(p, &rf); if (se->sched_delayed) {
update_rq_clock(rq);
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
}
task_rq_unlock(rq, p, &rf);
}
remove_entity_load_avg(se);
}
/* * Set the max capacity the task is allowed to run at for misfit detection.
*/ staticvoid set_task_max_allowed_capacity(struct task_struct *p)
{ struct asym_cap_data *entry;
staticvoid set_next_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) { if (WARN_ON_ONCE(!se->on_rq)) return; if (se_is_idle(se)) return;
cfs_rq_of(se)->next = se;
}
}
/* * Preempt the current task with a newly woken task if needed:
*/ staticvoid check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{ struct task_struct *donor = rq->donor; struct sched_entity *se = &donor->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; bool do_preempt_short = false;
if (unlikely(se == pse)) return;
/* * This is possible from callers such as attach_tasks(), in which we * unconditionally wakeup_preempt() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below.
*/ if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return;
/* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. * * Note: this also catches the edge-case of curr being in a throttled * group (e.g. via set_curr_task), since update_curr() (in the * enqueue of curr) will have resulted in resched being set. This * prevents us from potentially nominating it as a false LAST_BUDDY * below.
*/ if (test_tsk_need_resched(rq->curr)) return;
/* * Preempt an idle entity in favor of a non-idle entity (and don't preempt * in the inverse case).
*/ if (cse_is_idle && !pse_is_idle) { /* * When non-idle entity preempt an idle entity, * don't give idle entity slice protection.
*/
do_preempt_short = true; goto preempt;
}
if (cse_is_idle != pse_is_idle) return;
/* * BATCH and IDLE tasks do not preempt others.
*/ if (unlikely(!normal_policy(p->policy))) return;
cfs_rq = cfs_rq_of(se);
update_curr(cfs_rq); /* * If @p has a shorter slice than current and @p is eligible, override * current's slice protection in order to allow preemption.
*/
do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice);
/* * If @p has become the most eligible task, force preemption.
*/ if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) goto preempt;
if (sched_feat(RUN_TO_PARITY) && do_preempt_short)
update_protect_slice(cfs_rq, se);
return;
preempt: if (do_preempt_short)
cancel_protect_slice(se);
again:
p = pick_task_fair(rq); if (!p) goto idle;
se = &p->se;
#ifdef CONFIG_FAIR_GROUP_SCHED if (prev->sched_class != &fair_sched_class) goto simple;
__put_prev_set_next_dl_server(rq, prev, p);
/* * Because of the set_next_buddy() in dequeue_task_fair() it is rather * likely that a next task is from the same cgroup as the current. * * Therefore attempt to avoid putting and setting the entire cgroup * hierarchy, only change the part that actually changes. * * Since we haven't yet done put_prev_entity and if the selected task * is a different task than we started out with, try and touch the * least amount of cfs_rqs.
*/ if (prev != p) { struct sched_entity *pse = &prev->se; struct cfs_rq *cfs_rq;
while (!(cfs_rq = is_same_group(se, pse))) { int se_depth = se->depth; int pse_depth = pse->depth;
if (se_depth <= pse_depth) {
put_prev_entity(cfs_rq_of(pse), pse);
pse = parent_entity(pse);
} if (se_depth >= pse_depth) {
set_next_entity(cfs_rq_of(se), se);
se = parent_entity(se);
}
}
idle: if (rf) {
new_tasks = sched_balance_newidle(rq, rf);
/* * Because sched_balance_newidle() releases (and re-acquires) * rq->lock, it is possible for any higher priority task to * appear. In that case we must re-start the pick_next_entity() * loop.
*/ if (new_tasks < 0) return RETRY_TASK;
if (new_tasks > 0) goto again;
}
/* * rq is about to be idle, check if we need to update the * lost_idle_time of clock_pelt
*/
update_idle_rq_clock_pelt(rq);
/* * Are we the only task in the tree?
*/ if (unlikely(rq->nr_running == 1)) return;
clear_buddies(cfs_rq, se);
update_rq_clock(rq); /* * Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq); /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() * and double the fastpath cost.
*/
rq_clock_skip_update(rq);
/* throttled hierarchies are not runnable */ if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) returnfalse;
/* Tell the scheduler that we'd really like se to run next. */
set_next_buddy(se);
yield_task_fair(rq);
returntrue;
}
/************************************************** * Fair scheduling class load-balancing methods. * * BASICS * * The purpose of load-balancing is to achieve the same basic fairness the * per-CPU scheduler provides, namely provide a proportional amount of compute * time to each task. This is expressed in the following equation: * * W_i,n/P_i == W_j,n/P_j for all i,j (1) * * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight * W_i,0 is defined as: * * W_i,0 = \Sum_j w_i,j (2) * * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight * is derived from the nice value as per sched_prio_to_weight[]. * * The weight average is an exponential decay average of the instantaneous * weight: * * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) * * C_i is the compute capacity of CPU i, typically it is the * fraction of 'recent' time available for SCHED_OTHER task execution. But it * can also include other factors [XXX]. * * To achieve this balance we define a measure of imbalance which follows * directly from (1): * * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4) * * We them move tasks around to minimize the imbalance. In the continuous * function space it is obvious this converges, in the discrete case we get * a few fun cases generally called infeasible weight scenarios. * * [XXX expand on: * - infeasible weights; * - local vs global optima in the discrete case. ] * * * SCHED DOMAINS * * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) * for all i,j solution, we create a tree of CPUs that follows the hardware * topology where each level pairs two lower groups (or better). This results * in O(log n) layers. Furthermore we reduce the number of CPUs going up the * tree to only the first of the previous level and we decrease the frequency * of load-balance at each level inversely proportional to the number of CPUs in * the groups. * * This yields: * * log_2 n 1 n * \Sum { --- * --- * 2^i } = O(n) (5) * i = 0 2^i 2^i * `- size of each group * | | `- number of CPUs doing load-balance * | `- freq * `- sum over all levels * * Coupled with a limit on how many tasks we can migrate every balance pass, * this makes (5) the runtime complexity of the balancer. * * An important property here is that each CPU is still (indirectly) connected * to every other CPU in at most O(log n) steps: * * The adjacency matrix of the resulting graph is given by: * * log_2 n * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) * k = 0 * * And you'll find that: * * A^(log_2 n)_i,j != 0 for all i,j (7) * * Showing there's indeed a path between every CPU in at most O(log n) steps. * The task movement gives a factor of O(m), giving a convergence complexity * of: * * O(nm log n), n := nr_cpus, m := nr_tasks (8) * * * WORK CONSERVING * * In order to avoid CPUs going idle while there's still work to do, new idle * balancing is more aggressive and has the newly idle CPU iterate up the domain * tree itself instead of relying on other CPUs to bring it work. * * This adds some complexity to both (5) and (8) but it reduces the total idle * time. * * [XXX more?] * * * CGROUPS * * Cgroups make a horror show out of (2), instead of a simple sum we get: * * s_k,i * W_i,0 = \Sum_j \Prod_k w_k * ----- (9) * S_k * * Where * * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) * * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i. * * The big problem is S_k, its a global sum needed to compute a local (W_i) * property. * * [XXX write more on how we solve this.. _after_ merging pjt's patches that * rewrite all of this once again.]
*/
/* * 'group_type' describes the group of CPUs at the moment of load balancing. * * The enum is ordered by pulling priority, with the group with lowest priority * first so the group_type can simply be compared when selecting the busiest * group. See update_sd_pick_busiest().
*/ enum group_type { /* The group has spare capacity that can be used to run more tasks. */
group_has_spare = 0, /* * The group is fully used and the tasks don't compete for more CPU * cycles. Nevertheless, some tasks might wait before running.
*/
group_fully_busy, /* * One task doesn't fit with CPU's capacity and must be migrated to a * more powerful CPU.
*/
group_misfit_task, /* * Balance SMT group that's fully busy. Can benefit from migration * a task on SMT with busy sibling to another CPU on idle core.
*/
group_smt_balance, /* * SD_ASYM_PACKING only: One local CPU with higher capacity is available, * and the task should be migrated to it instead of running on the * current CPU.
*/
group_asym_packing, /* * The tasks' affinity constraints previously prevented the scheduler * from balancing the load across the system.
*/
group_imbalanced, /* * The CPU is overloaded and can't provide expected CPU cycles to all * tasks.
*/
group_overloaded
};
struct cpumask *dst_grpmask; int new_dst_cpu; enum cpu_idle_type idle; long imbalance; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus;
/* * Buddy candidates are cache hot:
*/ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
(&p->se == cfs_rq_of(&p->se)->next)) return 1;
if (sysctl_sched_migration_cost == -1) return 1;
/* * Don't migrate task if the task's cookie does not match * with the destination CPU's core cookie.
*/ if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p)) return 1;
/* * Check whether the task is ineligible on the destination cpu * * When the PLACE_LAG scheduling feature is enabled and * dst_cfs_rq->nr_queued is greater than 1, if the task * is ineligible, it will also be ineligible when * it is migrated to the destination cpu.
*/ staticinlineint task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu)
{ struct cfs_rq *dst_cfs_rq;
/* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/ static int can_migrate_task(struct task_struct *p, struct lb_env *env)
{ long degrades, hot;
lockdep_assert_rq_held(env->src_rq); if (p->sched_task_hot)
p->sched_task_hot = 0;
/* * We do not migrate tasks that are: * 1) delayed dequeued unless we migrate load, or * 2) throttled_lb_pair, or * 3) cannot be migrated to this CPU due to cpus_ptr, or * 4) running (obviously), or * 5) are cache-hot on their current CPU, or * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
*/ if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0;
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0;
/* * We want to prioritize the migration of eligible tasks. * For ineligible tasks we soft-limit them and only allow * them to migrate when nr_balance_failed is non-zero to * avoid load-balancing trying very hard to balance the load.
*/ if (!env->sd->nr_balance_failed &&
task_is_ineligible_on_dst_cpu(p, env->dst_cpu)) return 0;
/* Disregard percpu kthreads; they are where they need to be. */ if (kthread_is_per_cpu(p)) return 0;
if (task_is_blocked(p)) return 0;
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu;
/* * Remember if this task can be migrated to any other CPU in * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * * Avoid computing new_dst_cpu * - for NEWLY_IDLE * - if we have already computed one in current iteration * - if it's an active balance
*/ if (env->idle == CPU_NEWLY_IDLE ||
env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB)) return 0;
/* Prevent to re-select dst_cpu via env's CPUs: */
cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr);
/* * Aggressive migration if: * 1) active balance * 2) destination numa is preferred * 3) task is cache cold, or * 4) too many balance attempts have failed.
*/ if (env->flags & LBF_ACTIVE_LB) return 1;
degrades = migrate_degrades_locality(p, env); if (!degrades)
hot = task_hot(p, env); else
hot = degrades > 0;
if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (hot)
p->sched_task_hot = 1; return 1;
}
/* * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as * part of active balancing operations within "domain". * * Returns a task if successful and NULL otherwise.
*/ staticstruct task_struct *detach_one_task(struct lb_env *env)
{ struct task_struct *p;
lockdep_assert_rq_held(env->src_rq);
list_for_each_entry_reverse(p,
&env->src_rq->cfs_tasks, se.group_node) { if (!can_migrate_task(p, env)) continue;
detach_task(p, env);
/* * Right now, this is only the second place where * lb_gained[env->idle] is updated (other is detach_tasks) * so we can safely collect stats here rather than * inside detach_tasks().
*/
schedstat_inc(env->sd->lb_gained[env->idle]); return p;
} return NULL;
}
/* * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". * * Returns number of detached tasks if successful and 0 otherwise.
*/ staticint detach_tasks(struct lb_env *env)
{ struct list_head *tasks = &env->src_rq->cfs_tasks; unsignedlong util, load; struct task_struct *p; int detached = 0;
lockdep_assert_rq_held(env->src_rq);
/* * Source run queue has been emptied by another CPU, clear * LBF_ALL_PINNED flag as we will not test any task.
*/ if (env->src_rq->nr_running <= 1) {
env->flags &= ~LBF_ALL_PINNED; return 0;
}
if (env->imbalance <= 0) return 0;
while (!list_empty(tasks)) { /* * We don't want to steal all, otherwise we may be treated likewise, * which could at worst lead to a livelock crash.
*/ if (env->idle && env->src_rq->nr_running <= 1) break;
env->loop++; /* We've more or less seen every task there is, call it quits */ if (env->loop > env->loop_max) break;
/* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) {
env->loop_break += SCHED_NR_MIGRATE_BREAK;
env->flags |= LBF_NEED_BREAK; break;
}
p = list_last_entry(tasks, struct task_struct, se.group_node);
if (!can_migrate_task(p, env)) goto next;
switch (env->migration_type) { case migrate_load: /* * Depending of the number of CPUs and tasks and the * cgroup hierarchy, task_h_load() can return a null * value. Make sure that env->imbalance decreases * otherwise detach_tasks() will stop only after * detaching up to loop_max tasks.
*/
load = max_t(unsignedlong, task_h_load(p), 1);
if (sched_feat(LB_MIN) &&
load < 16 && !env->sd->nr_balance_failed) goto next;
/* * Make sure that we don't migrate too much load. * Nevertheless, let relax the constraint if * scheduler fails to find a good waiting task to * migrate.
*/ if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance) goto next;
env->imbalance -= load; break;
case migrate_util:
util = task_util_est(p);
if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance) goto next;
env->imbalance -= util; break;
case migrate_task:
env->imbalance--; break;
case migrate_misfit: /* This is not a misfit task */ if (task_fits_cpu(p, env->src_cpu)) goto next;
#ifdef CONFIG_PREEMPTION /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is detached to minimize * the critical section.
*/ if (env->idle == CPU_NEWLY_IDLE) break; #endif
/* * We only want to steal up to the prescribed amount of * load/util/tasks.
*/ if (env->imbalance <= 0) break;
continue;
next: if (p->sched_task_hot)
schedstat_inc(p->stats.nr_failed_migrations_hot);
list_move(&p->se.group_node, tasks);
}
/* * Right now, this is one of only two places we collect this stat * so we can safely collect detach_one_task() stats here rather * than inside detach_one_task().
*/
schedstat_add(env->sd->lb_gained[env->idle], detached);
return detached;
}
/* * attach_task() -- attach the task detached by detach_task() to its new rq.
*/ staticvoid attach_task(struct rq *rq, struct task_struct *p)
{
lockdep_assert_rq_held(rq);
/* * attach_one_task() -- attaches the task returned from detach_one_task() to * its new rq.
*/ staticvoid attach_one_task(struct rq *rq, struct task_struct *p)
{ struct rq_flags rf;
/* * update_load_avg() can call cpufreq_update_util(). Make sure that RT, * DL and IRQ signals have been updated before updating CFS.
*/
updated = update_other_load_avgs(rq);
if (others_have_blocked(rq))
*done = false;
return updated;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
staticbool __update_blocked_fair(struct rq *rq, bool *done)
{ struct cfs_rq *cfs_rq, *pos; bool decayed = false; int cpu = cpu_of(rq);
/* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se;
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq);
if (cfs_rq->nr_queued == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
if (cfs_rq == &rq->cfs)
decayed = true;
}
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se))
update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
/* * There can be a lot of idle CPU cgroups. Don't let fully * decayed cfs_rqs linger on the list.
*/ if (cfs_rq_is_decayed(cfs_rq))
list_del_leaf_cfs_rq(cfs_rq);
/* Don't need periodic decay once load/util_avg are null */ if (cfs_rq_has_blocked(cfs_rq))
*done = false;
}
return decayed;
}
/* * Compute the hierarchical load factor for cfs_rq and all its ascendants. * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load.
*/ staticvoid update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{ struct rq *rq = rq_of(cfs_rq); struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; unsignedlong now = jiffies; unsignedlong load;
update_blocked_load_status(rq, !done); if (decayed)
cpufreq_update_util(rq, 0);
rq_unlock_irqrestore(rq, &rf);
}
/********** Helpers for sched_balance_find_src_group ************************/
/* * sg_lb_stats - stats of a sched_group required for load-balancing:
*/ struct sg_lb_stats { unsignedlong avg_load; /* Avg load over the CPUs of the group */ unsignedlong group_load; /* Total load over the CPUs of the group */ unsignedlong group_capacity; /* Capacity over the CPUs of the group */ unsignedlong group_util; /* Total utilization over the CPUs of the group */ unsignedlong group_runnable; /* Total runnable time over the CPUs of the group */ unsignedint sum_nr_running; /* Nr of all tasks running in the group */ unsignedint sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsignedint idle_cpus; /* Nr of idle CPUs in the group */ unsignedint group_weight; enum group_type group_type; unsignedint group_asym_packing; /* Tasks should be moved to preferred CPU */ unsignedint group_smt_balance; /* Task on busy SMT be moved */ unsignedlong group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsignedint nr_numa_running; unsignedint nr_preferred_running; #endif
};
/* * sd_lb_stats - stats of a sched_domain required for load-balancing:
*/ struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ unsignedlong total_load; /* Total load of all groups in sd */ unsignedlong total_capacity; /* Total capacity of all groups in sd */ unsignedlong avg_load; /* Average load across all groups in sd */ unsignedint prefer_sibling; /* Tasks should go to sibling first */
struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */ struct sg_lb_stats local_stat; /* Statistics of the local group */
};
staticinlinevoid init_sd_lb_stats(struct sd_lb_stats *sds)
{ /* * Skimp on the clearing to avoid duplicate work. We can avoid clearing * local_stat because update_sg_lb_stats() does a full clear/assignment. * We must however set busiest_stat::group_type and * busiest_stat::idle_cpus to the worst busiest group because * update_sd_pick_busiest() reads these before assignment.
*/
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
.idle_cpus = UINT_MAX,
.group_type = group_has_spare,
},
};
}
/* * avg_rt.util_avg and avg_dl.util_avg track binary signals * (running and not running) with weights 0 and 1024 respectively.
*/
used = cpu_util_rt(rq);
used += cpu_util_dl(rq);
/* * Check whether the capacity of the rq has been noticeably reduced by side * activity. The imbalance_pct is used for the threshold. * Return true is the capacity is reduced
*/ staticinlineint
check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{ return ((rq->cpu_capacity * sd->imbalance_pct) <
(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}
/* Check if the rq has a misfit task */ staticinlinebool check_misfit_status(struct rq *rq)
{ return rq->misfit_task_load;
}
/* * Group imbalance indicates (and tries to solve) the problem where balancing * groups is inadequate due to ->cpus_ptr constraints. * * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. * Something like: * * { 0 1 2 3 } { 4 5 6 7 } * * * * * * * If we were to balance group-wise we'd place two tasks in the first group and * two tasks in the second group. Clearly this is undesired as it will overload * cpu 3 and leave one of the CPUs in the second group unused. * * The current solution to this issue is detecting the skew in the first group * by noticing the lower domain failed to reach balance and had difficulty * moving tasks due to affinity constraints. * * When this is so detected; this group becomes a candidate for busiest; see * update_sd_pick_busiest(). And calculate_imbalance() and * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the * group imbalance and decide the groups need to be balanced again. A most * subtle and fragile situation.
*/
/* * group_has_capacity returns true if the group has spare capacity that could * be used by some tasks. * We consider that a group has spare capacity if the number of task is * smaller than the number of CPUs or if the utilization is lower than the * available capacity for CFS tasks. * For the latter, we use a threshold to stabilize the state, to take into * account the variance of the tasks' load and to return true if the available * capacity in meaningful for the load balancer. * As an example, an available capacity of 1% can appear but it doesn't make * any benefit for the load balance.
*/ staticinlinebool
group_has_capacity(unsignedint imbalance_pct, struct sg_lb_stats *sgs)
{ if (sgs->sum_nr_running < sgs->group_weight) returntrue;
if ((sgs->group_capacity * imbalance_pct) <
(sgs->group_runnable * 100)) returnfalse;
if ((sgs->group_capacity * 100) >
(sgs->group_util * imbalance_pct)) returntrue;
returnfalse;
}
/* * group_is_overloaded returns true if the group has more tasks than it can * handle. * group_is_overloaded is not equals to !group_has_capacity because a group * with the exact right number of tasks, has no more spare capacity but is not * overloaded so both group_has_capacity and group_is_overloaded return * false.
*/ staticinlinebool
group_is_overloaded(unsignedint imbalance_pct, struct sg_lb_stats *sgs)
{ if (sgs->sum_nr_running <= sgs->group_weight) returnfalse;
if ((sgs->group_capacity * 100) <
(sgs->group_util * imbalance_pct)) returntrue;
if ((sgs->group_capacity * imbalance_pct) <
(sgs->group_runnable * 100)) returntrue;
if (sg_imbalanced(group)) return group_imbalanced;
if (sgs->group_asym_packing) return group_asym_packing;
if (sgs->group_smt_balance) return group_smt_balance;
if (sgs->group_misfit_task_load) return group_misfit_task;
if (!group_has_capacity(imbalance_pct, sgs)) return group_fully_busy;
return group_has_spare;
}
/** * sched_use_asym_prio - Check whether asym_packing priority must be used * @sd: The scheduling domain of the load balancing * @cpu: A CPU * * Always use CPU priority when balancing load between SMT siblings. When * balancing load between cores, it is not sufficient that @cpu is idle. Only * use CPU priority if the whole core is idle. * * Returns: True if the priority of @cpu must be followed. False otherwise.
*/ staticbool sched_use_asym_prio(struct sched_domain *sd, int cpu)
{ if (!(sd->flags & SD_ASYM_PACKING)) returnfalse;
staticinlinebool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
{ /* * First check if @dst_cpu can do asym_packing load balance. Only do it * if it has higher priority than @src_cpu.
*/ return sched_use_asym_prio(sd, dst_cpu) &&
sched_asym_prefer(dst_cpu, src_cpu);
}
/** * sched_group_asym - Check if the destination CPU can do asym_packing balance * @env: The load balancing environment * @sgs: Load-balancing statistics of the candidate busiest group * @group: The candidate busiest group * * @env::dst_cpu can do asym_packing if it has higher priority than the * preferred CPU of @group. * * Return: true if @env::dst_cpu can do with asym_packing load balance. False * otherwise.
*/ staticinlinebool
sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group)
{ /* * CPU priorities do not make sense for SMT cores with more than one * busy sibling.
*/ if ((group->flags & SD_SHARE_CPUCAPACITY) &&
(sgs->group_weight - sgs->idle_cpus != 1)) returnfalse;
/* One group has more than one SMT CPU while the other group does not */ staticinlinebool smt_vs_nonsmt_groups(struct sched_group *sg1, struct sched_group *sg2)
{ if (!sg1 || !sg2) returnfalse;
/* * For SMT source group, it is better to move a task * to a CPU that doesn't have multiple tasks sharing its CPU capacity. * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY * will not be on.
*/ if (group->flags & SD_SHARE_CPUCAPACITY &&
sgs->sum_h_nr_running > 1) returntrue;
returnfalse;
}
staticinlinelong sibling_imbalance(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *busiest, struct sg_lb_stats *local)
{ int ncores_busiest, ncores_local; long imbalance;
if (!env->idle || !busiest->sum_nr_running) return 0;
/* Balance such that nr_running/ncores ratio are same on both groups */
imbalance = ncores_local * busiest->sum_nr_running;
lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running); /* Normalize imbalance and do rounding on normalization */
imbalance = 2 * imbalance + ncores_local + ncores_busiest;
imbalance /= ncores_local + ncores_busiest;
/* Take advantage of resource in an empty sched group */ if (imbalance <= 1 && local->sum_nr_running == 0 &&
busiest->sum_nr_running > 1)
imbalance = 2;
return imbalance;
}
staticinlinebool
sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
{ /* * When there is more than 1 task, the group_overloaded case already * takes care of cpu with reduced capacity
*/ if (rq->cfs.h_nr_runnable != 1) returnfalse;
return check_cpu_capacity(rq, sd);
}
/** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. * @sg_overloaded: sched_group is overloaded * @sg_overutilized: sched_group is overutilized
*/ staticinlinevoid update_sg_lb_stats(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, bool *sg_overloaded, bool *sg_overutilized)
{ int i, nr_running, local_group, sd_flags = env->sd->flags; bool balancing_at_rd = !env->sd->parent;
/* * No need to call idle_cpu() if nr_running is not 0
*/ if (!nr_running && idle_cpu(i)) {
sgs->idle_cpus++; /* Idle cpu can't have misfit task */ continue;
}
/* Overload indicator is only updated at root domain */ if (balancing_at_rd && nr_running > 1)
*sg_overloaded = 1;
#ifdef CONFIG_NUMA_BALANCING /* Only fbq_classify_group() uses this to classify NUMA groups */ if (sd_flags & SD_NUMA) {
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
} #endif if (local_group) continue;
if (sd_flags & SD_ASYM_CPUCAPACITY) { /* Check for a misfit task on the cpu */ if (sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
*sg_overloaded = 1;
}
} elseif (env->idle && sched_reduced_capacity(rq, env->sd)) { /* Check for a task running on a CPU with reduced capacity */ if (sgs->group_misfit_task_load < load)
sgs->group_misfit_task_load = load;
}
}
sgs->group_capacity = group->sgc->capacity;
sgs->group_weight = group->group_weight;
/* Check if dst CPU is idle and preferred to this group */ if (!local_group && env->idle && sgs->sum_h_nr_running &&
sched_group_asym(env, sgs, group))
sgs->group_asym_packing = 1;
/* Check for loaded SMT group to be balanced to dst CPU */ if (!local_group && smt_balance(env, sgs, group))
sgs->group_smt_balance = 1;
/* Computing avg_load makes sense only when group is overloaded */ if (sgs->group_type == group_overloaded)
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
sgs->group_capacity;
}
/** * update_sd_pick_busiest - return 1 on busiest group * @env: The load balancing environment. * @sds: sched_domain statistics * @sg: sched_group candidate to be checked for being the busiest * @sgs: sched_group statistics * * Determine if @sg is a busier group than the previously selected * busiest group. * * Return: %true if @sg is a busier group than the previously selected * busiest group. %false otherwise.
*/ staticbool update_sd_pick_busiest(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *sg, struct sg_lb_stats *sgs)
{ struct sg_lb_stats *busiest = &sds->busiest_stat;
/* Make sure that there is at least one task to pull */ if (!sgs->sum_h_nr_running) returnfalse;
/* * Don't try to pull misfit tasks we can't help. * We can use max_capacity here as reduction in capacity on some * CPUs in the group should either be possible to resolve * internally or be covered by avg_load imbalance (eventually).
*/ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
(sgs->group_type == group_misfit_task) &&
(!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
sds->local_stat.group_type != group_has_spare)) returnfalse;
if (sgs->group_type > busiest->group_type) returntrue;
if (sgs->group_type < busiest->group_type) returnfalse;
/* * The candidate and the current busiest group are the same type of * group. Let check which one is the busiest according to the type.
*/
switch (sgs->group_type) { case group_overloaded: /* Select the overloaded group with highest avg_load. */ return sgs->avg_load > busiest->avg_load;
case group_imbalanced: /* * Select the 1st imbalanced group as we don't have any way to * choose one more than another.
*/ returnfalse;
case group_asym_packing: /* Prefer to move from lowest priority CPU's work */ return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu),
READ_ONCE(sg->asym_prefer_cpu));
case group_misfit_task: /* * If we have more than one misfit sg go with the biggest * misfit.
*/ return sgs->group_misfit_task_load > busiest->group_misfit_task_load;
case group_smt_balance: /* * Check if we have spare CPUs on either SMT group to * choose has spare or fully busy handling.
*/ if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) goto has_spare;
fallthrough;
case group_fully_busy: /* * Select the fully busy group with highest avg_load. In * theory, there is no need to pull task from such kind of * group because tasks have all compute capacity that they need * but we can still improve the overall throughput by reducing * contention when accessing shared HW resources. * * XXX for now avg_load is not computed and always 0 so we * select the 1st one, except if @sg is composed of SMT * siblings.
*/
if (sgs->avg_load < busiest->avg_load) returnfalse;
if (sgs->avg_load == busiest->avg_load) { /* * SMT sched groups need more help than non-SMT groups. * If @sg happens to also be SMT, either choice is good.
*/ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) returnfalse;
}
break;
case group_has_spare: /* * Do not pick sg with SMT CPUs over sg with pure CPUs, * as we do not want to pull task off SMT core with one task * and make the core idle.
*/ if (smt_vs_nonsmt_groups(sds->busiest, sg)) { if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1) returnfalse; else returntrue;
}
has_spare:
/* * Select not overloaded group with lowest number of idle CPUs * and highest number of running tasks. We could also compare * the spare capacity which is more stable but it can end up * that the group has less spare capacity but finally more idle * CPUs which means less opportunity to pull tasks.
*/ if (sgs->idle_cpus > busiest->idle_cpus) returnfalse; elseif ((sgs->idle_cpus == busiest->idle_cpus) &&
(sgs->sum_nr_running <= busiest->sum_nr_running)) returnfalse;
break;
}
/* * Candidate sg has no more than one task per CPU and has higher * per-CPU capacity. Migrating tasks to less capable CPUs may harm * throughput. Maximize throughput, power/energy consequences are not * considered.
*/ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
(sgs->group_type <= group_fully_busy) &&
(capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu)))) returnfalse;
/* * task_running_on_cpu - return 1 if @p is running on @cpu.
*/
staticunsignedint task_running_on_cpu(int cpu, struct task_struct *p)
{ /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) return 0;
if (task_on_rq_queued(p)) return 1;
return 0;
}
/** * idle_cpu_without - would a given CPU be idle without p ? * @cpu: the processor on which idleness is tested. * @p: task which should be ignored. * * Return: 1 if the CPU would be idle. 0 otherwise.
*/ staticint idle_cpu_without(int cpu, struct task_struct *p)
{ struct rq *rq = cpu_rq(cpu);
if (rq->curr != rq->idle && rq->curr != p) return 0;
/* * rq->nr_running can't be used but an updated version without the * impact of p on cpu must be used instead. The updated nr_running * be computed and tested before calling idle_cpu_without().
*/
if (rq->ttwu_pending) return 0;
return 1;
}
/* * update_sg_wakeup_stats - Update sched_group's statistics for wakeup. * @sd: The sched_domain level to look for idlest group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. * @p: The task for which we look for the idlest group/CPU.
*/ staticinlinevoid update_sg_wakeup_stats(struct sched_domain *sd, struct sched_group *group, struct sg_lb_stats *sgs, struct task_struct *p)
{ int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
/* Assume that task can't fit any CPU of the group */ if (sd->flags & SD_ASYM_CPUCAPACITY)
sgs->group_misfit_task_load = 1;
/* * No need to call idle_cpu_without() if nr_running is not 0
*/ if (!nr_running && idle_cpu_without(i, p))
sgs->idle_cpus++;
/* Check if task fits in the CPU */ if (sd->flags & SD_ASYM_CPUCAPACITY &&
sgs->group_misfit_task_load &&
task_fits_cpu(p, i))
sgs->group_misfit_task_load = 0;
/* * Computing avg_load makes sense only when group is fully busy or * overloaded
*/ if (sgs->group_type == group_fully_busy ||
sgs->group_type == group_overloaded)
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
sgs->group_capacity;
}
if (sgs->group_type > idlest_sgs->group_type) returnfalse;
/* * The candidate and the current idlest group are the same type of * group. Let check which one is the idlest according to the type.
*/
switch (sgs->group_type) { case group_overloaded: case group_fully_busy: /* Select the group with lowest avg_load. */ if (idlest_sgs->avg_load <= sgs->avg_load) returnfalse; break;
case group_imbalanced: case group_asym_packing: case group_smt_balance: /* Those types are not used in the slow wakeup path */ returnfalse;
case group_misfit_task: /* Select group with the highest max capacity */ if (idlest->sgc->max_capacity >= group->sgc->max_capacity) returnfalse; break;
case group_has_spare: /* Select group with most idle CPUs */ if (idlest_sgs->idle_cpus > sgs->idle_cpus) returnfalse;
/* Select group with lowest group_util */ if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
idlest_sgs->group_util <= sgs->group_util) returnfalse;
break;
}
returntrue;
}
/* * sched_balance_find_dst_group() finds and returns the least busy CPU group within the * domain. * * Assumes p is allowed on at least one CPU in sd.
*/ staticstruct sched_group *
sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{ struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; struct sg_lb_stats *sgs; unsignedlong imbalance; struct sg_lb_stats idlest_sgs = {
.avg_load = UINT_MAX,
.group_type = group_overloaded,
};
do { int local_group;
/* Skip over this group if it has no CPUs allowed */ if (!cpumask_intersects(sched_group_span(group),
p->cpus_ptr)) continue;
/* Skip over this group if no cookie matched */ if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group)) continue;
} while (group = group->next, group != sd->groups);
/* There is no idlest group to push tasks to */ if (!idlest) return NULL;
/* The local group has been skipped because of CPU affinity */ if (!local) return idlest;
/* * If the local group is idler than the selected idlest group * don't try and push the task.
*/ if (local_sgs.group_type < idlest_sgs.group_type) return NULL;
/* * If the local group is busier than the selected idlest group * try and push the task.
*/ if (local_sgs.group_type > idlest_sgs.group_type) return idlest;
switch (local_sgs.group_type) { case group_overloaded: case group_fully_busy:
/* Calculate allowed imbalance based on load */
imbalance = scale_load_down(NICE_0_LOAD) *
(sd->imbalance_pct-100) / 100;
/* * When comparing groups across NUMA domains, it's possible for * the local domain to be very lightly loaded relative to the * remote domains but "imbalance" skews the comparison making * remote CPUs look much more favourable. When considering * cross-domain, add imbalance to the load on the remote node * and consider staying local.
*/
/* * If the local group is less loaded than the selected * idlest group don't try and push any tasks.
*/ if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance)) return NULL;
case group_imbalanced: case group_asym_packing: case group_smt_balance: /* Those type are not used in the slow wakeup path */ return NULL;
case group_misfit_task: /* Select group with the highest max capacity */ if (local->sgc->max_capacity >= idlest->sgc->max_capacity) return NULL; break;
case group_has_spare: #ifdef CONFIG_NUMA if (sd->flags & SD_NUMA) { int imb_numa_nr = sd->imb_numa_nr; #ifdef CONFIG_NUMA_BALANCING int idlest_cpu; /* * If there is spare capacity at NUMA, try to select * the preferred node
*/ if (cpu_to_node(this_cpu) == p->numa_preferred_nid) return NULL;
idlest_cpu = cpumask_first(sched_group_span(idlest)); if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) return idlest; #endif/* CONFIG_NUMA_BALANCING */ /* * Otherwise, keep the task close to the wakeup source * and improve locality if the number of running tasks * would remain below threshold where an imbalance is * allowed while accounting for the possibility the * task is pinned to a subset of CPUs. If there is a * real need of migration, periodic load balance will * take care of it.
*/ if (p->nr_cpus_allowed != NR_CPUS) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
/* * Select group with highest number of idle CPUs. We could also * compare the utilization which is more stable but it can end * up that the group has less spare capacity but finally more * idle CPUs which means more opportunity to run task.
*/ if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus) return NULL; break;
}
return idlest;
}
staticvoid update_idle_cpu_scan(struct lb_env *env, unsignedlong sum_util)
{ struct sched_domain_shared *sd_share; int llc_weight, pct;
u64 x, y, tmp; /* * Update the number of CPUs to scan in LLC domain, which could * be used as a hint in select_idle_cpu(). The update of sd_share * could be expensive because it is within a shared cache line. * So the write of this hint only occurs during periodic load * balancing, rather than CPU_NEWLY_IDLE, because the latter * can fire way more frequently than the former.
*/ if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE) return;
llc_weight = per_cpu(sd_llc_size, env->dst_cpu); if (env->sd->span_weight != llc_weight) return;
sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); if (!sd_share) return;
/* * The number of CPUs to search drops as sum_util increases, when * sum_util hits 85% or above, the scan stops. * The reason to choose 85% as the threshold is because this is the * imbalance_pct(117) when a LLC sched group is overloaded. * * let y = SCHED_CAPACITY_SCALE - p * x^2 [1] * and y'= y / SCHED_CAPACITY_SCALE * * x is the ratio of sum_util compared to the CPU capacity: * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE) * y' is the ratio of CPUs to be scanned in the LLC domain, * and the number of CPUs to scan is calculated by: * * nr_scan = llc_weight * y' [2] * * When x hits the threshold of overloaded, AKA, when * x = 100 / pct, y drops to 0. According to [1], * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000 * * Scale x by SCHED_CAPACITY_SCALE: * x' = sum_util / llc_weight; [3] * * and finally [1] becomes: * y = SCHED_CAPACITY_SCALE - * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4] *
*/ /* equation [3] */
x = sum_util;
do_div(x, llc_weight);
/* equation [4] */
pct = env->sd->imbalance_pct;
tmp = x * x * pct * pct;
do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
y = SCHED_CAPACITY_SCALE - tmp;
/* equation [2] */
y *= llc_weight;
do_div(y, SCHED_CAPACITY_SCALE); if ((int)y != sd_share->nr_idle_scan)
WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
}
/** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. * @sds: variable to hold the statistics for this sched_domain.
*/
/* * Indicate that the child domain of the busiest group prefers tasks * go to a child's sibling domains first. NB the flags of a sched group * are those of the child domain.
*/ if (sds->busiest)
sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
if (!env->sd->parent) { /* update overload indicator if we are at root domain */
set_rd_overloaded(env->dst_rq->rd, sg_overloaded);
/* Update over-utilization (tipping point, U >= 0) indicator */
set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
} elseif (sg_overutilized) {
set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
}
update_idle_cpu_scan(env, sum_util);
}
/** * calculate_imbalance - Calculate the amount of imbalance present within the * groups of a given sched_domain during load balance. * @env: load balance environment * @sds: statistics of the sched_domain whose imbalance is to be calculated.
*/ staticinlinevoid calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{ struct sg_lb_stats *local, *busiest;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
if (busiest->group_type == group_misfit_task) { if (env->sd->flags & SD_ASYM_CPUCAPACITY) { /* Set imbalance to allow misfit tasks to be balanced. */
env->migration_type = migrate_misfit;
env->imbalance = 1;
} else { /* * Set load imbalance to allow moving task from cpu * with reduced capacity.
*/
env->migration_type = migrate_load;
env->imbalance = busiest->group_misfit_task_load;
} return;
}
if (busiest->group_type == group_asym_packing) { /* * In case of asym capacity, we will try to migrate all load to * the preferred CPU.
*/
env->migration_type = migrate_task;
env->imbalance = busiest->sum_h_nr_running; return;
}
if (busiest->group_type == group_smt_balance) { /* Reduce number of tasks sharing CPU capacity */
env->migration_type = migrate_task;
env->imbalance = 1; return;
}
if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages * to ensure CPU-load equilibrium, try to move any task to fix * the imbalance. The next load balance will take care of * balancing back the system.
*/
env->migration_type = migrate_task;
env->imbalance = 1; return;
}
/* * Try to use spare capacity of local group without overloading it or * emptying busiest.
*/ if (local->group_type == group_has_spare) { if ((busiest->group_type > group_fully_busy) &&
!(env->sd->flags & SD_SHARE_LLC)) { /* * If busiest is overloaded, try to fill spare * capacity. This might end up creating spare capacity * in busiest or busiest still being overloaded but * there is no simple way to directly compute the * amount of load to migrate in order to balance the * system.
*/
env->migration_type = migrate_util;
env->imbalance = max(local->group_capacity, local->group_util) -
local->group_util;
/* * In some cases, the group's utilization is max or even * higher than capacity because of migrations but the * local CPU is (newly) idle. There is at least one * waiting task in this overloaded busiest group. Let's * try to pull it.
*/ if (env->idle && env->imbalance == 0) {
env->migration_type = migrate_task;
env->imbalance = 1;
}
/* * If there is no overload, we just want to even the number of * idle CPUs.
*/
env->migration_type = migrate_task;
env->imbalance = max_t(long, 0,
(local->idle_cpus - busiest->idle_cpus));
}
#ifdef CONFIG_NUMA /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
local->sum_nr_running + 1,
env->sd->imb_numa_nr);
} #endif
/* Number of tasks to move to restore balance */
env->imbalance >>= 1;
return;
}
/* * Local is fully busy but has to take more load to relieve the * busiest group
*/ if (local->group_type < group_overloaded) { /* * Local will become overloaded so the avg_load metrics are * finally needed.
*/
/* * If the local group is more loaded than the selected * busiest group don't try to pull any tasks.
*/ if (local->avg_load >= busiest->avg_load) {
env->imbalance = 0; return;
}
/* * If the local group is more loaded than the average system * load, don't try to pull any tasks.
*/ if (local->avg_load >= sds->avg_load) {
env->imbalance = 0; return;
}
}
/* * Both group are or will become overloaded and we're trying to get all * the CPUs to the average_load, so we don't want to push ourselves * above the average load, nor do we wish to reduce the max loaded CPU * below the average load. At the same time, we also don't want to * reduce the group load below the group capacity. Thus we look for * the minimum possible imbalance.
*/
env->migration_type = migrate_load;
env->imbalance = min(
(busiest->avg_load - sds->avg_load) * busiest->group_capacity,
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
}
/******* sched_balance_find_src_group() helpers end here *********************/
/* * Decision matrix according to the local and busiest group type: * * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded * has_spare nr_idle balanced N/A N/A balanced balanced * fully_busy nr_idle nr_idle N/A N/A balanced balanced * misfit_task force N/A N/A N/A N/A N/A * asym_packing force force N/A N/A force force * imbalanced force force N/A N/A force force * overloaded force force N/A N/A force avg_load * * N/A : Not Applicable because already filtered while updating * statistics. * balanced : The system is balanced for these 2 groups. * force : Calculate the imbalance as load migration is probably needed. * avg_load : Only if imbalance is significant enough. * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite * different in groups.
*/
/** * sched_balance_find_src_group - Returns the busiest group within the sched_domain * if there is an imbalance. * @env: The load balancing environment. * * Also calculates the amount of runnable load which should be moved * to restore balance. * * Return: - The busiest group if imbalance exists.
*/ staticstruct sched_group *sched_balance_find_src_group(struct lb_env *env)
{ struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds;
init_sd_lb_stats(&sds);
/* * Compute the various statistics relevant for load balancing at * this level.
*/
update_sd_lb_stats(env, &sds);
/* There is no busy sibling group to pull tasks from */ if (!sds.busiest) goto out_balanced;
busiest = &sds.busiest_stat;
/* Misfit tasks should be dealt with regardless of the avg load */ if (busiest->group_type == group_misfit_task) goto force_balance;
if (!is_rd_overutilized(env->dst_rq->rd) &&
rcu_dereference(env->dst_rq->rd->pd)) goto out_balanced;
/* * If the busiest group is imbalanced the below checks don't * work because they assume all things are equal, which typically * isn't true due to cpus_ptr constraints and the like.
*/ if (busiest->group_type == group_imbalanced) goto force_balance;
local = &sds.local_stat; /* * If the local group is busier than the selected busiest group * don't try and pull any tasks.
*/ if (local->group_type > busiest->group_type) goto out_balanced;
/* * When groups are overloaded, use the avg_load to ensure fairness * between tasks.
*/ if (local->group_type == group_overloaded) { /* * If the local group is more loaded than the selected * busiest group don't try to pull any tasks.
*/ if (local->avg_load >= busiest->avg_load) goto out_balanced;
/* XXX broken for overlapping NUMA groups */
sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
sds.total_capacity;
/* * Don't pull any tasks if this group is already above the * domain average load.
*/ if (local->avg_load >= sds.avg_load) goto out_balanced;
/* * If the busiest group is more loaded, use imbalance_pct to be * conservative.
*/ if (100 * busiest->avg_load <=
env->sd->imbalance_pct * local->avg_load) goto out_balanced;
}
/* * Try to move all excess tasks to a sibling domain of the busiest * group's child domain.
*/ if (sds.prefer_sibling && local->group_type == group_has_spare &&
sibling_imbalance(env, &sds, busiest, local) > 1) goto force_balance;
if (busiest->group_type != group_overloaded) { if (!env->idle) { /* * If the busiest group is not overloaded (and as a * result the local one too) but this CPU is already * busy, let another idle CPU try to pull task.
*/ goto out_balanced;
}
if (busiest->group_type == group_smt_balance &&
smt_vs_nonsmt_groups(sds.local, sds.busiest)) { /* Let non SMT CPU pull from SMT CPU sharing with sibling */ goto force_balance;
}
if (busiest->group_weight > 1 &&
local->idle_cpus <= (busiest->idle_cpus + 1)) { /* * If the busiest group is not overloaded * and there is no imbalance between this and busiest * group wrt idle CPUs, it is balanced. The imbalance * becomes significant if the diff is greater than 1 * otherwise we might end up to just move the imbalance * on another group. Of course this applies only if * there is more than 1 CPU per group.
*/ goto out_balanced;
}
if (busiest->sum_h_nr_running == 1) { /* * busiest doesn't have any tasks waiting to run
*/ goto out_balanced;
}
}
force_balance: /* Looks like there is an imbalance. Compute it */
calculate_imbalance(env, &sds); return env->imbalance ? sds.busiest : NULL;
out_balanced:
env->imbalance = 0; return NULL;
}
/* * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group.
*/ staticstruct rq *sched_balance_find_src_rq(struct lb_env *env, struct sched_group *group)
{ struct rq *busiest = NULL, *rq; unsignedlong busiest_util = 0, busiest_load = 0, busiest_capacity = 1; unsignedint busiest_nr = 0; int i;
/* * We classify groups/runqueues into three groups: * - regular: there are !numa tasks * - remote: there are numa tasks that run on the 'wrong' node * - all: there is no distinction * * In order to avoid migrating ideally placed numa tasks, * ignore those when there's better options. * * If we ignore the actual busiest queue to migrate another * task, the next balance pass can still reduce the busiest * queue by moving tasks around inside the node. * * If we cannot move enough load due to this classification * the next pass will adjust the group classification and * allow migration of more tasks. * * Both cases only affect the total convergence complexity.
*/ if (rt > env->fbq_type) continue;
nr_running = rq->cfs.h_nr_runnable; if (!nr_running) continue;
capacity = capacity_of(i);
/* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could * eventually lead to active_balancing high->low capacity. * Higher per-CPU capacity is considered better than balancing * average load.
*/ if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
!capacity_greater(capacity_of(env->dst_cpu), capacity) &&
nr_running == 1) continue;
/* * Make sure we only pull tasks from a CPU of lower priority * when balancing between SMT siblings. * * If balancing between cores, let lower priority CPUs help * SMT cores with more than one busy sibling.
*/ if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1) continue;
switch (env->migration_type) { case migrate_load: /* * When comparing with load imbalance, use cpu_load() * which is not scaled with the CPU capacity.
*/
load = cpu_load(rq);
/* * For the load comparisons with the other CPUs, * consider the cpu_load() scaled with the CPU * capacity, so that the load can be moved away * from the CPU that is potentially running at a * lower capacity. * * Thus we're looking for max(load_i / capacity_i), * crosswise multiplication to rid ourselves of the * division works out to: * load_i * capacity_j > load_j * capacity_i; * where j is our previous maximum.
*/ if (load * busiest_capacity > busiest_load * capacity) {
busiest_load = load;
busiest_capacity = capacity;
busiest = rq;
} break;
case migrate_util:
util = cpu_util_cfs_boost(i);
/* * Don't try to pull utilization from a CPU with one * running task. Whatever its utilization, we will fail * detach the task.
*/ if (nr_running <= 1) continue;
case migrate_task: if (busiest_nr < nr_running) {
busiest_nr = nr_running;
busiest = rq;
} break;
case migrate_misfit: /* * For ASYM_CPUCAPACITY domains with misfit tasks we * simply seek the "biggest" misfit task.
*/ if (rq->misfit_task_load > busiest_load) {
busiest_load = rq->misfit_task_load;
busiest = rq;
}
break;
}
}
return busiest;
}
/* * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but * so long as it is large enough.
*/ #define MAX_PINNED_INTERVAL 512
staticinlinebool
asym_active_balance(struct lb_env *env)
{ /* * ASYM_PACKING needs to force migrate tasks from busy but lower * priority CPUs in order to pack all tasks in the highest priority * CPUs. When done between cores, do it only if the whole core if the * whole core is idle. * * If @env::src_cpu is an SMT core with busy siblings, let * the lower priority @env::dst_cpu help it. Do not follow * CPU priority.
*/ return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) &&
(sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
!sched_use_asym_prio(env->sd, env->src_cpu));
}
/* * The imbalanced case includes the case of pinned tasks preventing a fair * distribution of the load on the system but also the even distribution of the * threads on a system with spare capacity
*/ if ((env->migration_type == migrate_task) &&
(sd->nr_balance_failed > sd->cache_nice_tries+2)) return 1;
/* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. * It's worth migrating the task if the src_cpu's capacity is reduced * because of other sched_class or IRQs if more capacity stays * available on dst_cpu.
*/ if (env->idle &&
(env->src_rq->cfs.h_nr_runnable == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) return 1;
}
if (env->migration_type == migrate_misfit) return 1;
/* * Ensure the balancing environment is consistent; can happen * when the softirq triggers 'during' hotplug.
*/ if (!cpumask_test_cpu(env->dst_cpu, env->cpus)) return 0;
/* * In the newly idle case, we will allow all the CPUs * to do the newly idle load balance. * * However, we bail out if we already have tasks or a wakeup pending, * to optimize wakeup latency.
*/ if (env->idle == CPU_NEWLY_IDLE) { if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending) return 0; return 1;
}
cpumask_copy(swb_cpus, group_balance_mask(sg)); /* Try to find first idle CPU */
for_each_cpu_and(cpu, swb_cpus, env->cpus) { if (!idle_cpu(cpu)) continue;
/* * Don't balance to idle SMT in busy core right away when * balancing cores, but remember the first idle SMT CPU for * later consideration. Find CPU on an idle core first.
*/ if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { if (idle_smt == -1)
idle_smt = cpu; /* * If the core is not idle, and first SMT sibling which is * idle has been found, then its not needed to check other * SMT siblings for idleness:
*/ #ifdef CONFIG_SCHED_SMT
cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu)); #endif continue;
}
/* * Are we the first idle core in a non-SMT domain or higher, * or the first idle CPU in a SMT domain?
*/ return cpu == env->dst_cpu;
}
/* Are we the first idle CPU with busy siblings? */ if (idle_smt != -1) return idle_smt == env->dst_cpu;
/* Are we the first CPU of this group ? */ return group_balance_cpu(sg) == env->dst_cpu;
}
ld_moved = 0; /* Clear this flag as soon as we find a pullable task */
env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If sched_balance_find_src_group has found * an imbalance but busiest->nr_running <= 1, the group is * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance.
*/
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
/* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env);
/* * We've detached some tasks from busiest_rq. Every * task is masked "TASK_ON_RQ_MIGRATING", so we can safely * unlock busiest->lock, and we are able to be sure * that nobody can manipulate the tasks in parallel. * See task_rq_lock() family for the details.
*/
rq_unlock(busiest, &rf);
if (cur_ld_moved) {
attach_tasks(&env);
ld_moved += cur_ld_moved;
}
/* * Revisit (affine) tasks on src_cpu that couldn't be moved to * us and move them to an alternate dst_cpu in our sched_group * where they can run. The upper limit on how many times we * iterate on same src_cpu is dependent on number of CPUs in our * sched_group. * * This changes load balance semantics a bit on who can move * load to a given_cpu. In addition to the given_cpu itself * (or a ilb_cpu acting on its behalf where given_cpu is * nohz-idle), we now have balance_cpu in a position to move * load to given_cpu. In rare situations, this may cause * conflicts (balance_cpu and given_cpu/ilb_cpu deciding * _independently_ and at _same_ time to move some load to * given_cpu) causing excess load to be moved to given_cpu. * This however should not happen so much in practice and * moreover subsequent load balance cycles should correct the * excess load moved.
*/ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
/* Prevent to re-select dst_cpu via env's CPUs */
__cpumask_clear_cpu(env.dst_cpu, env.cpus);
/* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) {
__cpumask_clear_cpu(cpu_of(busiest), cpus); /* * Attempting to continue load balancing at the current * sched_domain level only makes sense if there are * active CPUs remaining as possible busiest CPUs to * pull load from which are not contained within the * destination group that is receiving any migrated * load.
*/ if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
env.loop_break = SCHED_NR_MIGRATE_BREAK; goto redo;
} goto out_all_pinned;
}
}
if (!ld_moved) {
schedstat_inc(sd->lb_failed[idle]); /* * Increment the failure counter only on periodic balance. * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. * * Similarly for migration_misfit which is not related to * load/util migration, don't pollute nr_balance_failed.
*/ if (idle != CPU_NEWLY_IDLE &&
env.migration_type != migrate_misfit)
sd->nr_balance_failed++;
if (need_active_balance(&env)) { unsignedlong flags;
raw_spin_rq_lock_irqsave(busiest, flags);
/* * Don't kick the active_load_balance_cpu_stop, * if the curr task on busiest CPU can't be * moved to this_cpu:
*/ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
raw_spin_rq_unlock_irqrestore(busiest, flags); goto out_one_pinned;
}
/* Record that we found at least one task that could run on this_cpu */
env.flags &= ~LBF_ALL_PINNED;
/* * ->active_balance synchronizes accesses to * ->active_balance_work. Once set, it's cleared * only after active load balance is finished.
*/ if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
}
if (likely(!active_balance) || need_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
}
goto out;
out_balanced: /* * We reach balance although we may have faced some affinity * constraints. Clear the imbalance flag only if other tasks got * a chance to move and fix the imbalance.
*/ if (sd_parent && !(env.flags & LBF_ALL_PINNED)) { int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
*group_imbalance = 0;
}
out_all_pinned: /* * We reach balance because all tasks are pinned at this level so * we can't migrate them. Let the imbalance flag set so parent level * can try to migrate them.
*/
schedstat_inc(sd->lb_balanced[idle]);
sd->nr_balance_failed = 0;
out_one_pinned:
ld_moved = 0;
/* * sched_balance_newidle() disregards balance intervals, so we could * repeatedly reach this code, which would lead to balance_interval * skyrocketing in a short amount of time. Skip the balance_interval * increase logic to avoid that. * * Similarly misfit migration which is not necessarily an indication of * the system being busy and requires lb to backoff to let it settle * down.
*/ if (env.idle == CPU_NEWLY_IDLE ||
env.migration_type == migrate_misfit) goto out;
/* tune up the balancing interval */ if ((env.flags & LBF_ALL_PINNED &&
sd->balance_interval < MAX_PINNED_INTERVAL) ||
sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
out: return ld_moved;
}
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
/* * Reduce likelihood of busy balancing at higher domains racing with * balancing at lower domains by preventing their balancing periods * from being multiples of each other.
*/ if (cpu_busy)
interval -= 1;
/* used by idle balance, so cpu_busy = 0 */
interval = get_sd_balance_interval(sd, 0);
next = sd->last_balance + interval;
if (time_after(*next_balance, next))
*next_balance = next;
}
/* * active_load_balance_cpu_stop is run by the CPU stopper. It pushes * running tasks off the busiest CPU onto idle CPUs. It requires at * least 1 task to be running on each physical CPU where possible, and * avoids physical / logical imbalances.
*/ staticint active_load_balance_cpu_stop(void *data)
{ struct rq *busiest_rq = data; int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; struct task_struct *p = NULL; struct rq_flags rf;
rq_lock_irq(busiest_rq, &rf); /* * Between queueing the stop-work and running it is a hole in which * CPUs can become inactive. We should not move tasks from or to * inactive CPUs.
*/ if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) goto out_unlock;
/* Make sure the requested CPU hasn't gone down in the meantime: */ if (unlikely(busiest_cpu != smp_processor_id() ||
!busiest_rq->active_balance)) goto out_unlock;
/* Is there any task to move? */ if (busiest_rq->nr_running <= 1) goto out_unlock;
/* * This condition is "impossible", if it occurs * we need to fix it. Originally reported by * Bjorn Helgaas on a 128-CPU setup.
*/
WARN_ON_ONCE(busiest_rq == target_rq);
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) { if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break;
}
p = detach_one_task(&env); if (p) {
schedstat_inc(sd->alb_pushed); /* Active balancing done, reset the failure counter. */
sd->nr_balance_failed = 0;
} else {
schedstat_inc(sd->alb_failed);
}
}
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
rq_unlock(busiest_rq, &rf);
if (p)
attach_one_task(target_rq, p);
local_irq_enable();
return 0;
}
/* * This flag serializes load-balancing passes over large domains * (above the NODE topology level) - only one load-balancing instance * may run at a time, to reduce overhead on very large systems with * lots of CPUs and large NUMA distances. * * - Note that load-balancing passes triggered while another one * is executing are skipped and not re-tried. * * - Also note that this does not serialize rebalance_domains() * execution, as non-SD_SERIALIZE domains will still be * load-balanced in parallel.
*/ static atomic_t sched_balance_running = ATOMIC_INIT(0);
/* * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk.
*/ void update_max_interval(void)
{
max_load_balance_interval = HZ*num_online_cpus()/10;
}
staticinlinebool update_newidle_cost(struct sched_domain *sd, u64 cost)
{ if (cost > sd->max_newidle_lb_cost) { /* * Track max cost of a domain to make sure to not delay the * next wakeup on the CPU. * * sched_balance_newidle() bumps the cost whenever newidle * balance fails, and we don't want things to grow out of * control. Use the sysctl_sched_migration_cost as the upper * limit, plus a litle extra to avoid off by ones.
*/
sd->max_newidle_lb_cost =
min(cost, sysctl_sched_migration_cost + 200);
sd->last_decay_max_lb_cost = jiffies;
} elseif (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { /* * Decay the newidle max times by ~1% per second to ensure that * it is not outdated and the current max cost is actually * shorter.
*/
sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
sd->last_decay_max_lb_cost = jiffies;
returntrue;
}
returnfalse;
}
/* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * * Balancing parameters are set up in init_sched_domains.
*/ staticvoid sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{ int continue_balancing = 1; int cpu = rq->cpu; int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); unsignedlong interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ unsignedlong next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize, need_decay = 0;
u64 max_cost = 0;
rcu_read_lock();
for_each_domain(cpu, sd) { /* * Decay the newidle max times here because this is a regular * visit to all the domains.
*/
need_decay = update_newidle_cost(sd, 0);
max_cost += sd->max_newidle_lb_cost;
/* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more * actively.
*/ if (!continue_balancing) { if (need_decay) continue; break;
}
interval = get_sd_balance_interval(sd, busy);
need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) goto out;
}
if (time_after_eq(jiffies, sd->last_balance + interval)) { if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu);
busy = !idle && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
} if (need_serialize)
atomic_set_release(&sched_balance_running, 0);
out: if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
update_next_balance = 1;
}
} if (need_decay) { /* * Ensure the rq-wide value also decays but keep it at a * reasonable floor to avoid funnies with rq->avg_idle.
*/
rq->max_idle_balance_cost =
max((u64)sysctl_sched_migration_cost, max_cost);
}
rcu_read_unlock();
/* * next_balance will be updated only when there is a need. * When the cpu is attached to null domain for ex, it will not be * updated.
*/ if (likely(update_next_balance))
rq->next_balance = next_balance;
#ifdef CONFIG_NO_HZ_COMMON /* * NOHZ idle load balancing (ILB) details: * * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs.
*/ staticinlineint find_new_ilb(void)
{ conststruct cpumask *hk_mask; int ilb_cpu;
/* * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU * SMP function call (IPI). * * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set * (if there is one).
*/ staticvoid kick_ilb(unsignedint flags)
{ int ilb_cpu;
/* * Increase nohz.next_balance only when if full ilb is triggered but * not if we only update stats.
*/ if (flags & NOHZ_BALANCE_KICK)
nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb(); if (ilb_cpu < 0) return;
/* * Don't bother if no new NOHZ balance work items for ilb_cpu, * i.e. all bits in flags are already set in ilb_cpu.
*/ if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags) return;
/* * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets * the first flag owns it; cleared by nohz_csd_func().
*/
flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); if (flags & NOHZ_KICK_MASK) return;
/* * This way we generate an IPI on the target CPU which * is idle, and the softirq performing NOHZ idle load balancing * will be run before returning from the IPI.
*/
smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
}
/* * Current decision point for kicking the idle load balancer in the presence * of idle CPUs in the system.
*/ staticvoid nohz_balancer_kick(struct rq *rq)
{ unsignedlong now = jiffies; struct sched_domain_shared *sds; struct sched_domain *sd; int nr_busy, i, cpu = rq->cpu; unsignedint flags = 0;
if (unlikely(rq->idle_balance)) return;
/* * We may be recently in ticked or tickless idle mode. At the first * busy tick after returning from idle, we will update the busy stats.
*/
nohz_balance_exit_idle(rq);
/* * None are in tickless mode and hence no need for NOHZ idle load * balancing:
*/ if (likely(!atomic_read(&nohz.nr_cpus))) return;
if (READ_ONCE(nohz.has_blocked) &&
time_after(now, READ_ONCE(nohz.next_blocked)))
flags = NOHZ_STATS_KICK;
if (time_before(now, nohz.next_balance)) goto out;
sd = rcu_dereference(rq->sd); if (sd) { /* * If there's a runnable CFS task and the current CPU has reduced * capacity, kick the ILB to see if there's a better CPU to run on:
*/ if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock;
}
}
sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); if (sd) { /* * When ASYM_PACKING; see if there's a more preferred CPU * currently idle; in which case, kick the ILB to move tasks * around. * * When balancing between cores, all the SMT siblings of the * preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { if (sched_asym(sd, i, cpu)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock;
}
}
}
sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); if (sd) { /* * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU * to run the misfit task on.
*/ if (check_misfit_status(rq)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock;
}
/* * For asymmetric systems, we do not want to nicely balance * cache use, instead we want to embrace asymmetry and only * ensure tasks have enough CPU capacity. * * Skip the LLC logic because it's not relevant in that case.
*/ goto unlock;
}
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); if (sds) { /* * If there is an imbalance between LLC domains (IOW we could * increase the overall cache utilization), we need a less-loaded LLC * domain to pull some load from. Likewise, we may need to spread * load within the current LLC domain (e.g. packed SMT cores but * other CPUs are idle). We can't really know from here how busy * the others are - so just get a NOHZ balance going if it looks * like this LLC domain has tasks we could move.
*/
nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock;
}
}
unlock:
rcu_read_unlock();
out: if (READ_ONCE(nohz.needs_update))
flags |= NOHZ_NEXT_KICK;
/* * This routine will record that the CPU is going idle with tick stopped. * This info will be used in performing idle load balancing in the future.
*/ void nohz_balance_enter_idle(int cpu)
{ struct rq *rq = cpu_rq(cpu);
WARN_ON_ONCE(cpu != smp_processor_id());
/* If this CPU is going down, then nothing needs to be done: */ if (!cpu_active(cpu)) return;
/* * Can be set safely without rq->lock held * If a clear happens, it will have evaluated last additions because * rq->lock is held during the check and the clear
*/
rq->has_blocked_load = 1;
/* * The tick is still stopped but load could have been added in the * meantime. We set the nohz.has_blocked flag to trig a check of the * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear * of nohz.has_blocked can only happen after checking the new load
*/ if (rq->nohz_tick_stopped) goto out;
/* If we're a completely isolated CPU, we don't play: */ if (on_null_domain(rq)) return;
/* * Ensures that if nohz_idle_balance() fails to observe our * @idle_cpus_mask store, it must observe the @has_blocked * and @needs_update stores.
*/
smp_mb__after_atomic();
set_cpu_sd_state_idle(cpu);
WRITE_ONCE(nohz.needs_update, 1);
out: /* * Each time a cpu enter idle, we assume that it has blocked load and * enable the periodic update of the load of idle CPUs
*/
WRITE_ONCE(nohz.has_blocked, 1);
}
staticbool update_nohz_stats(struct rq *rq)
{ unsignedint cpu = rq->cpu;
if (!rq->has_blocked_load) returnfalse;
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) returnfalse;
if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) returntrue;
sched_balance_update_blocked_averages(cpu);
return rq->has_blocked_load;
}
/* * Internal function that runs load balance for all idle CPUs. The load balance * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags.
*/ staticvoid _nohz_idle_balance(struct rq *this_rq, unsignedint flags)
{ /* Earliest time when we have to do rebalance again */ unsignedlong now = jiffies; unsignedlong next_balance = now + 60*HZ; bool has_blocked_load = false; int update_next_balance = 0; int this_cpu = this_rq->cpu; int balance_cpu; struct rq *rq;
/* * We assume there will be no idle load after this update and clear * the has_blocked flag. If a cpu enters idle in the mean time, it will * set the has_blocked flag and trigger another update of idle load. * Because a cpu that becomes idle, is added to idle_cpus_mask before * setting the flag, we are sure to not clear the state and not * check the load of an idle cpu. * * Same applies to idle_cpus_mask vs needs_update.
*/ if (flags & NOHZ_STATS_KICK)
WRITE_ONCE(nohz.has_blocked, 0); if (flags & NOHZ_NEXT_KICK)
WRITE_ONCE(nohz.needs_update, 0);
/* * Ensures that if we miss the CPU, we must see the has_blocked * store from nohz_balance_enter_idle().
*/
smp_mb();
/* * Start with the next CPU after this_cpu so we will end with this_cpu and let a * chance for other idle cpu to pull load.
*/
for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) { if (!idle_cpu(balance_cpu)) continue;
/* * If this CPU gets work to do, stop the load balancing * work being done for other CPUs. Next load * balancing owner will pick it up.
*/ if (!idle_cpu(this_cpu) && need_resched()) { if (flags & NOHZ_STATS_KICK)
has_blocked_load = true; if (flags & NOHZ_NEXT_KICK)
WRITE_ONCE(nohz.needs_update, 1); goto abort;
}
rq = cpu_rq(balance_cpu);
if (flags & NOHZ_STATS_KICK)
has_blocked_load |= update_nohz_stats(rq);
/* * If time for next balance is due, * do the balance.
*/ if (time_after_eq(jiffies, rq->next_balance)) { struct rq_flags rf;
/* * next_balance will be updated only when there is a need. * When the CPU is attached to null domain for ex, it will not be * updated.
*/ if (likely(update_next_balance))
nohz.next_balance = next_balance;
if (flags & NOHZ_STATS_KICK)
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
abort: /* There is still blocked load, enable periodic update */ if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
}
/* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the CPUs for whom scheduler ticks are stopped.
*/ staticbool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{ unsignedint flags = this_rq->nohz_idle_balance;
if (!flags) returnfalse;
this_rq->nohz_idle_balance = 0;
if (idle != CPU_IDLE) returnfalse;
_nohz_idle_balance(this_rq, flags);
returntrue;
}
/* * Check if we need to directly run the ILB for updating blocked load before * entering idle state. Here we run ILB directly without issuing IPIs. * * Note that when this function is called, the tick may not yet be stopped on * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is * called from this function on (this) CPU that's not yet in the mask. That's * OK because the goal of nohz_run_idle_balance() is to run ILB only for * updating the blocked load of already idle CPUs without waking up one of * those idle CPUs and outside the preempt disable / IRQ off phase of the local * cpu about to enter idle, because it can take a long time.
*/ void nohz_run_idle_balance(int cpu)
{ unsignedint flags;
/* * Update the blocked load only if no SCHED_SOFTIRQ is about to happen * (i.e. NOHZ_STATS_KICK set) and will do the same.
*/ if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
_nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
}
staticvoid nohz_newidle_balance(struct rq *this_rq)
{ int this_cpu = this_rq->cpu;
/* Will wake up very soon. No time for doing anything else*/ if (this_rq->avg_idle < sysctl_sched_migration_cost) return;
/* Don't need to update blocked load of idle CPUs*/ if (!READ_ONCE(nohz.has_blocked) ||
time_before(jiffies, READ_ONCE(nohz.next_blocked))) return;
/* * Set the need to trigger ILB in order to update blocked load * before entering idle state.
*/
atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
}
/* * sched_balance_newidle is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. * * Returns: * < 0 - we released the lock and there are !fair tasks present * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present
*/ staticint sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
{ unsignedlong next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; int continue_balancing = 1;
u64 t0, t1, curr_cost = 0; struct sched_domain *sd; int pulled_task = 0;
update_misfit_status(NULL, this_rq);
/* * There is a task waiting to run. No need to search for one. * Return 0; the task will be enqueued when switching to idle.
*/ if (this_rq->ttwu_pending) return 0;
/* * We must set idle_stamp _before_ calling sched_balance_rq() * for CPU_NEWLY_IDLE, such that we measure the this duration * as idle time.
*/
this_rq->idle_stamp = rq_clock(this_rq);
/* * Do not pull tasks towards !active CPUs...
*/ if (!cpu_active(this_cpu)) return 0;
/* * This is OK, because current is on_cpu, which avoids it being picked * for load-balance and preemption/IRQs are still disabled avoiding * further scheduler activity on it and we're being very careful to * re-start the picking loop.
*/
rq_unpin_lock(this_rq, rf);
/* * Failing newidle means it is not effective; * bump the cost so we end up doing less of it.
*/ if (!pulled_task)
domain_cost = (3 * sd->max_newidle_lb_cost) / 2;
update_newidle_cost(sd, domain_cost);
}
/* * Stop searching for tasks to pull if there are * now runnable tasks on this rq.
*/ if (pulled_task || !continue_balancing) break;
}
rcu_read_unlock();
raw_spin_rq_lock(this_rq);
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
/* * While browsing the domains, we released the rq lock, a task could * have been enqueued in the meantime. Since we're not going idle, * pretend we pulled a task.
*/ if (this_rq->cfs.h_nr_queued && !pulled_task)
pulled_task = 1;
/* Is there a task of a high priority class? */ if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
pulled_task = -1;
out: /* Move the next balance forward */ if (time_after(this_rq->next_balance, next_balance))
this_rq->next_balance = next_balance;
if (pulled_task)
this_rq->idle_stamp = 0; else
nohz_newidle_balance(this_rq);
rq_repin_lock(this_rq, rf);
return pulled_task;
}
/* * This softirq handler is triggered via SCHED_SOFTIRQ from two places: * * - directly from the local sched_tick() for periodic load balancing * * - indirectly from a remote sched_tick() for NOHZ idle balancing * through the SMP cross-call nohz_csd_func()
*/ static __latent_entropy void sched_balance_softirq(void)
{ struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance; /* * If this CPU has a pending NOHZ_BALANCE_KICK, then do the * balancing on behalf of the other idle CPUs whose ticks are * stopped. Do nohz_idle_balance *before* sched_balance_domains to * give the idle CPUs a chance to load balance. Else we may * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load.
*/ if (nohz_idle_balance(this_rq, idle)) return;
/* normal load balance */
sched_balance_update_blocked_averages(this_rq->cpu);
sched_balance_domains(this_rq, idle);
}
/* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/ void sched_balance_trigger(struct rq *rq)
{ /* * Don't need to rebalance while attached to NULL domain or * runqueue CPU is not active
*/ if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq)))) return;
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
/* * If runqueue has only one task which used up its slice and * if the sibling is forced idle, then trigger schedule to * give forced idle task a chance. * * sched_slice() considers only this active rq and it gets the * whole slice. But during force idle, we have siblings acting * like a single runqueue and hence we need to consider runnable * tasks on this CPU and the forced idle CPU. Ideally, we should * go through the forced idle rq, but that would be a perf hit. * We can assume that the forced idle CPU has at least * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check * if we need to give up the CPU.
*/ if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
/* * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
*/ staticvoid se_fi_update(conststruct sched_entity *se, unsignedint fi_seq, bool forceidle)
{
for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (forceidle) { if (cfs_rq->forceidle_seq == fi_seq) break;
cfs_rq->forceidle_seq = fi_seq;
}
#ifdef CONFIG_FAIR_GROUP_SCHED /* * Find an se in the hierarchy for tasks a and b, such that the se's * are immediate siblings.
*/ while (sea->cfs_rq->tg != seb->cfs_rq->tg) { int sea_depth = sea->depth; int seb_depth = seb->depth;
if (sea_depth >= seb_depth)
sea = parent_entity(sea); if (sea_depth <= seb_depth)
seb = parent_entity(seb);
}
/* * Find delta after normalizing se's vruntime with its cfs_rq's * min_vruntime_fi, which would have been updated in prior calls * to se_fi_update().
*/
delta = (s64)(sea->vruntime - seb->vruntime) +
(s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
return delta > 0;
}
staticint task_is_throttled_fair(struct task_struct *p, int cpu)
{ struct cfs_rq *cfs_rq;
/* * scheduler tick hitting a task of our scheduling class. * * NOTE: This function can be called remotely by the tick offload that * goes along full dynticks. Therefore no local assumption can be made * and everything must be accessed through the @rq and @curr passed in * parameters.
*/ staticvoid task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{ struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
/* * called on fork with the child task as argument from the parent's context * - child not yet on the tasklist * - preemption disabled
*/ staticvoid task_fork_fair(struct task_struct *p)
{
set_task_max_allowed_capacity(p);
}
/* * Priority of the task has changed. Check to see if we preempt * the current task.
*/ staticvoid
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{ if (!task_on_rq_queued(p)) return;
if (rq->cfs.nr_queued == 1) return;
/* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's
*/ if (task_current_donor(rq, p)) { if (p->prio > oldprio)
resched_curr(rq);
} else
wakeup_preempt(rq, p, 0);
}
#ifdef CONFIG_FAIR_GROUP_SCHED /* * Propagate the changes of the sched_entity across the tg tree to make it * visible to the root
*/ staticvoid propagate_entity_cfs_rq(struct sched_entity *se)
{ struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (cfs_rq_throttled(cfs_rq)) return;
if (!throttled_hierarchy(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
/* Start to propagate at parent */
se = se->parent;
/* * In case the task sched_avg hasn't been attached: * - A forked task which hasn't been woken up by wake_up_new_task(). * - A task which has been woken up by try_to_wake_up() but is * waiting for actually being woken up by sched_ttwu_pending().
*/ if (!se->avg.last_update_time) return;
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
if (task_on_rq_queued(p)) { /* * We were most likely switched from sched_rt, so * kick off the schedule if running, otherwise just see * if we can still preempt the current task.
*/ if (task_current_donor(rq, p))
resched_curr(rq); else
wakeup_preempt(rq, p, 0);
}
}
if (task_on_rq_queued(p)) { /* * Move the next running task to the front of the list, so our * cfs_tasks list becomes MRU one.
*/
list_move(&se->group_node, &rq->cfs_tasks);
} if (!first) return;
WARN_ON_ONCE(se->sched_delayed);
if (hrtick_enabled_fair(rq))
hrtick_start_fair(rq, p);
/* * Account for a task changing its policy or group. * * This routine is mostly called to set cfs_rq->curr field when a task * migrates between groups/classes.
*/ staticvoid set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{ struct sched_entity *se = &p->se;
#ifdef CONFIG_FAIR_GROUP_SCHED staticvoid task_change_group_fair(struct task_struct *p)
{ /* * We couldn't detach or attach a forked task which * hasn't been woken up by wake_up_new_task().
*/ if (READ_ONCE(p->__state) == TASK_NEW) return;
detach_task_cfs_rq(p);
/* Tell se's cfs_rq has been changed -- migrated */
p->se.avg.last_update_time = 0;
set_task_rq(p, task_cpu(p));
attach_task_cfs_rq(p);
}
void free_fair_sched_group(struct task_group *tg)
{ int i;
for_each_possible_cpu(i) { if (tg->cfs_rq)
kfree(tg->cfs_rq[i]); if (tg->se)
kfree(tg->se[i]);
}
kfree(tg->cfs_rq);
kfree(tg->se);
}
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{ struct sched_entity *se; struct cfs_rq *cfs_rq; int i;
tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL); if (!tg->cfs_rq) goto err;
tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL); if (!tg->se) goto err;
if (se) { if (se->sched_delayed) {
guard(rq_lock_irqsave)(rq); if (se->sched_delayed) {
update_rq_clock(rq);
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
}
list_del_leaf_cfs_rq(cfs_rq);
}
remove_entity_load_avg(se);
}
/* * Only empty task groups can be destroyed; so we can speculatively * check on_list without danger of it being re-added.
*/ if (cfs_rq->on_list) {
guard(rq_lock_irqsave)(rq);
list_del_leaf_cfs_rq(cfs_rq);
}
}
}
/* Already accounted at parent level and above. */ if (cfs_rq_is_idle(cfs_rq)) break;
}
next_cpu:
rq_unlock_irqrestore(rq, &rf);
}
/* Idle groups have minimum weight. */ if (tg_is_idle(tg))
__sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO)); else
__sched_group_set_shares(tg, NICE_0_LOAD);
/* * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise * idle runqueue:
*/ if (rq->cfs.load.weight)
rr_interval = NS_TO_JIFFIES(se->slice);
return rr_interval;
}
/* * All the scheduling class methods:
*/
DEFINE_SCHED_CLASS(fair) = {
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.478Bemerkung:
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.