/*
 * Export tracepoints that act as a bare tracehook (ie: have no trace event
 * associated with them) to allow external modules to probe them.
 */
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
if (*str && kstrtobool(str + 1, &proxy_enable)) {
pr_warn("Unable to parse sched_proxy_exec=\n"); return 0;
}
if (proxy_enable) {
pr_info("sched_proxy_exec enabled via boot arg\n");
static_branch_enable(&__sched_proxy_exec);
} else {
pr_info("sched_proxy_exec disabled via boot arg\n");
static_branch_disable(&__sched_proxy_exec);
} return 1;
} #else staticint __init setup_proxy_exec(char *str)
{
pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n"); return 0;
} #endif
__setup("sched_proxy_exec", setup_proxy_exec);
/*
 * Debugging: various feature bits
 *
 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
 * sysctl_sched_features, defined in sched.h, to allow constants propagation
 * at compile time and compiler optimization based on features default.
 */
/* Each SCHED_FEAT(name, enabled) expands to one OR-ed bit term ... */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |
/* ... so including features.h builds the default feature bitmask. */
__read_mostly unsigned int sysctl_sched_features =
#include "features.h"
	0;	/* terminates the trailing '|' from the last SCHED_FEAT() */
#undef SCHED_FEAT
/*
 * Print a warning if need_resched is set for the given duration (if
 * LATENCY_WARN is enabled).
 *
 * If sysctl_resched_latency_warn_once is set, only one warning will be shown
 * per boot.
 */
__read_mostly int sysctl_resched_latency_warn_ms = 100;
__read_mostly int sysctl_resched_latency_warn_once = 1;

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;

/* NOTE(review): presumably set once the scheduler is up — confirm at writer. */
__read_mostly int scheduler_running;

#ifdef CONFIG_SCHED_CORE

/* Static key gating the core-scheduling paths below; defaults to off. */
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
/* kernel prio, less is more */ staticinlineint __task_prio(conststruct task_struct *p)
{ if (p->sched_class == &stop_sched_class) /* trumps deadline */ return -2;
if (p->dl_server) return -1; /* deadline */
if (rt_or_dl_prio(p->prio)) return p->prio; /* [-1, 99] */
/* real prio, less is less */ staticinlinebool prio_less(conststruct task_struct *a, conststruct task_struct *b, bool in_fi)
{
int pa = __task_prio(a), pb = __task_prio(b);
if (-pa < -pb) returntrue;
if (-pb < -pa) returnfalse;
if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */ conststruct sched_dl_entity *a_dl, *b_dl;
a_dl = &a->dl; /* * Since,'a' and 'b' can be CFS tasks served by DL server, * __task_prio() can return -1 (for DL) even for those. In that * case, get to the dl_server's DL entity.
*/ if (a->dl_server)
a_dl = a->dl_server;
b_dl = &b->dl; if (b->dl_server)
b_dl = b->dl_server;
void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{ if (p->se.sched_delayed) return;
rq->core->core_task_seq++;
if (sched_core_enqueued(p)) {
rb_erase(&p->core_node, &rq->core_tree);
RB_CLEAR_NODE(&p->core_node);
}
/* * Migrating the last task off the cpu, with the cpu in forced idle * state. Reschedule to create an accounting edge for forced idle, * and re-examine whether the core is still in forced idle state.
*/ if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
rq->core->core_forceidle_count && rq->curr == rq->idle)
resched_curr(rq);
}
staticint sched_task_is_throttled(struct task_struct *p, int cpu)
{ if (p->sched_class->task_is_throttled) return p->sched_class->task_is_throttled(p, cpu);
return 0;
}
staticstruct task_struct *sched_core_next(struct task_struct *p, unsignedlong cookie)
{ struct rb_node *node = &p->core_node; int cpu = task_cpu(p);
do {
node = rb_next(node); if (!node) return NULL;
p = __node_2_sc(node); if (p->core_cookie != cookie) return NULL;
} while (sched_task_is_throttled(p, cpu));
return p;
}
/* * Find left-most (aka, highest priority) and unthrottled task matching @cookie. * If no suitable task is found, NULL will be returned.
*/ staticstruct task_struct *sched_core_find(struct rq *rq, unsignedlong cookie)
{ struct task_struct *p; struct rb_node *node;
node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); if (!node) return NULL;
p = __node_2_sc(node); if (!sched_task_is_throttled(p, rq->cpu)) return p;
return sched_core_next(p, cookie);
}
/* * Magic required such that: * * raw_spin_rq_lock(rq); * ... * raw_spin_rq_unlock(rq); * * ends up locking and unlocking the _same_ lock, and all CPUs * always agree on what rq has what lock. * * XXX entirely possible to selectively enable cores, don't bother for now.
*/
/* * "There can be only one" * * Either this is the last one, or we don't actually need to do any * 'work'. If it is the last *again*, we rely on * WORK_STRUCT_PENDING_BIT.
*/ if (!atomic_add_unless(&sched_core_count, -1, 1))
schedule_work(&_work);
}
/* need a wrapper since we may need to trace from modules */
EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);

/*
 * Call via the helper macro trace_set_current_state.
 * Fires the sched_set_state tracepoint for the current task.
 */
void __trace_set_current_state(int state_value)
{
	trace_sched_set_state_tp(current, state_value);
}
EXPORT_SYMBOL(__trace_set_current_state);
/* * Serialization rules: * * Lock order: * * p->pi_lock * rq->lock * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) * * rq1->lock * rq2->lock where: rq1 < rq2 * * Regular state: * * Normal scheduling state is serialized by rq->lock. __schedule() takes the * local CPU's rq->lock, it optionally removes the task from the runqueue and * always looks at the local rq data structures to find the most eligible task * to run next. * * Task enqueue is also under rq->lock, possibly taken from another CPU. * Wakeups from another LLC domain might use an IPI to transfer the enqueue to * the local CPU to avoid bouncing the runqueue state around [ see * ttwu_queue_wakelist() ] * * Task wakeup, specifically wakeups that involve migration, are horribly * complicated to avoid having to take two rq->locks. * * Special state: * * System-calls and anything external will use task_rq_lock() which acquires * both p->pi_lock and rq->lock. As a consequence the state they change is * stable while holding either lock: * * - sched_setaffinity()/ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed * - set_user_nice(): p->se.load, p->*prio * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, * p->se.load, p->rt_priority, * p->dl.dl_{runtime, deadline, period, flags, bw, density} * - sched_setnuma(): p->numa_preferred_nid * - sched_move_task(): p->sched_task_group * - uclamp_update_active() p->uclamp* * * p->state <- TASK_*: * * is changed locklessly using set_current_state(), __set_current_state() or * set_special_state(), see their respective comments, or by * try_to_wake_up(). This latter uses p->pi_lock to serialize against * concurrent self. * * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: * * is set by activate_task() and cleared by deactivate_task(), under * rq->lock. Non-zero indicates the task is runnable, the special * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. 
It indicates task_cpu() is not stable, see task_rq_lock(). * * Additionally it is possible to be ->on_rq but still be considered not * runnable when p->se.sched_delayed is true. These tasks are on the runqueue * but will be dequeued as soon as they get picked again. See the * task_is_runnable() helper. * * p->on_cpu <- { 0, 1 }: * * is set by prepare_task() and cleared by finish_task() such that it will be * set before p is scheduled-in and cleared after p is scheduled-out, both * under rq->lock. Non-zero indicates the task is running on its CPU. * * [ The astute reader will observe that it is possible for two tasks on one * CPU to have ->on_cpu = 1 at the same time. ] * * task_cpu(p): is changed by set_task_cpu(), the rules are: * * - Don't call set_task_cpu() on a blocked task: * * We don't care what CPU we're not running on, this simplifies hotplug, * the CPU assignment of blocked tasks isn't required to be valid. * * - for try_to_wake_up(), called under p->pi_lock: * * This allows try_to_wake_up() to only take one rq->lock, see its comment. * * - for migration called under rq->lock: * [ see task_on_rq_migrating() in task_rq_lock() ] * * o move_queued_task() * o detach_task() * * - for migration called under double_rq_lock(): * * o __migrate_swap_task() * o push_rt_task() / pull_rt_task() * o push_dl_task() / pull_dl_task() * o dl_task_offline_migration() *
*/
/*
 * Acquire @rq's lock at lockdep @subclass, handling the core-scheduling
 * case where __rq_lockp() may return a lock shared with sibling rqs.
 */
void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
{
	raw_spinlock_t *lock;

	/* Matches synchronize_rcu() in __sched_core_enable() */
	preempt_disable();
	if (sched_core_disabled()) {
		raw_spin_lock_nested(&rq->__lock, subclass);
		/* preempt_count *MUST* be > 1 */
		preempt_enable_no_resched();
		return;
	}

	for (;;) {
		lock = __rq_lockp(rq);
		raw_spin_lock_nested(lock, subclass);
		/*
		 * __rq_lockp() can change until we hold the lock;
		 * re-check and retry if it moved under us.
		 */
		if (likely(lock == __rq_lockp(rq))) {
			/* preempt_count *MUST* be > 1 */
			preempt_enable_no_resched();
			return;
		}
		raw_spin_unlock(lock);
	}
}
for (;;) {
rq = task_rq(p);
raw_spin_rq_lock(rq); if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf); return rq;
}
raw_spin_rq_unlock(rq);
while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
}
}
/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
		rq = task_rq(p);
		raw_spin_rq_lock(rq);
		/*
		 *	move_queued_task()		task_rq_lock()
		 *
		 *	ACQUIRE (rq->lock)
		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
		 *	[S] ->cpu = new_cpu		[L] task_rq()
		 *					[L] ->on_rq
		 *	RELEASE (rq->lock)
		 *
		 * If we observe the old CPU in task_rq_lock(), the acquire of
		 * the old rq->lock will fully serialize against the stores.
		 *
		 * If we observe the new CPU in task_rq_lock(), the address
		 * dependency headed by '[L] rq = task_rq()' and the acquire
		 * will pair with the WMB to ensure we then also see migrating.
		 */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_rq_unlock(rq);
		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

		/* Wait for the migration to settle before retrying. */
		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}
/* * RQ-clock updating methods:
*/
staticvoid update_rq_clock_task(struct rq *rq, s64 delta)
{ /* * In theory, the compile should just see 0 here, and optimize out the call * to sched_rt_avg_update. But I don't trust it...
*/
s64 __maybe_unused steal = 0, irq_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING if (irqtime_enabled()) {
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
/* * Since irq_time is only updated on {soft,}irq_exit, we might run into * this case when a previous update_rq_clock() happened inside a * {soft,}IRQ region. * * When this happens, we stop ->clock_task and only update the * prev_irq_time stamp to account for the part that fit, so that a next * update will consume the rest. This ensures ->clock_task is * monotonic. * * It does however cause some slight miss-attribution of {soft,}IRQ * time, a more accurate solution would be to update the irq_time using * the current rq->clock timestamp, except that would require using * atomic ops.
*/ if (irq_delta > delta)
irq_delta = delta;
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and IRQs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	s64 delta;

	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delta = max_t(s64, delay, 10000LL);
	rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);

	/* Arm locally when possible, otherwise kick the owning CPU. */
	if (rq == this_rq())
		__hrtick_restart(rq);
	else
		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}
/* * try_cmpxchg based fetch_or() macro so it works for different integer types:
*/ #define fetch_or(ptr, mask) \
({ \
typeof(ptr) _ptr = (ptr); \
typeof(mask) _mask = (mask); \
typeof(*_ptr) _val = *_ptr; \
\ do { \
} while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \
_val; \
})
#ifdef TIF_POLLING_NRFLAG
/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 *
 * Returns true when the target was NOT polling, i.e. an IPI is required.
 */
static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
	return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG);
}
/* * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. * * If this returns true, then the idle task promises to call * sched_ttwu_pending() and reschedule soon.
*/ staticbool set_nr_if_polling(struct task_struct *p)
{ struct thread_info *ti = task_thread_info(p);
typeof(ti->flags) val = READ_ONCE(ti->flags);
do { if (!(val & _TIF_POLLING_NRFLAG)) returnfalse; if (val & _TIF_NEED_RESCHED) returntrue;
} while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED));
/* * Atomically grab the task, if ->wake_q is !nil already it means * it's already queued (either by us or someone else) and will get the * wakeup due to that. * * In order to ensure that a pending wakeup will observe our pending * state, even in the failed case, an explicit smp_mb() must be used.
*/
smp_mb__before_atomic(); if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) returnfalse;
/* * The head is context local, there can be no concurrency.
*/
*head->lastp = node;
head->lastp = &node->next; returntrue;
}
/** * wake_q_add() - queue a wakeup for 'later' waking. * @head: the wake_q_head to add @task to * @task: the task to queue for 'later' wakeup * * Queue a task for later wakeup, most likely by the wake_up_q() call in the * same context, _HOWEVER_ this is not guaranteed, the wakeup can come * instantly. * * This function must be used as-if it were wake_up_process(); IOW the task * must be ready to be woken at this location.
*/ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{ if (__wake_q_add(head, task))
get_task_struct(task);
}
/** * wake_q_add_safe() - safely queue a wakeup for 'later' waking. * @head: the wake_q_head to add @task to * @task: the task to queue for 'later' wakeup * * Queue a task for later wakeup, most likely by the wake_up_q() call in the * same context, _HOWEVER_ this is not guaranteed, the wakeup can come * instantly. * * This function must be used as-if it were wake_up_process(); IOW the task * must be ready to be woken at this location. * * This function is essentially a task-safe equivalent to wake_q_add(). Callers * that already hold reference to @task can call the 'safe' version and trust * wake_q to do the right thing depending whether or not the @task is already * queued for wakeup.
*/ void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
{ if (!__wake_q_add(head, task))
put_task_struct(task);
}
while (node != WAKE_Q_TAIL) { struct task_struct *task;
task = container_of(node, struct task_struct, wake_q);
node = node->next; /* pairs with cmpxchg_relaxed() in __wake_q_add() */
WRITE_ONCE(task->wake_q.next, NULL); /* Task can safely be re-inserted now. */
/* * wake_up_process() executes a full barrier, which pairs with * the queueing in wake_q_add() so as not to miss wakeups.
*/
wake_up_process(task);
put_task_struct(task);
}
}
/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
static void __resched_curr(struct rq *rq, int tif)
{
	struct task_struct *curr = rq->curr;
	struct thread_info *cti = task_thread_info(curr);
	int cpu;

	lockdep_assert_rq_held(rq);

	/*
	 * Always immediately preempt the idle task; no point in delaying doing
	 * actual work.
	 */
	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
		tif = TIF_NEED_RESCHED;

	/* Already pending (this flag, or a full need_resched)? Nothing to do. */
	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
		return;

	cpu = cpu_of(rq);

	trace_sched_set_need_resched_tp(curr, cpu, tif);
	if (cpu == smp_processor_id()) {
		/* Local CPU: set the flag directly, no IPI needed. */
		set_ti_thread_flag(cti, tif);
		if (tif == TIF_NEED_RESCHED)
			set_preempt_need_resched();
		return;
	}

	if (set_nr_and_not_polling(cti, tif)) {
		/* Remote CPU was not polling: IPI only for a full resched. */
		if (tif == TIF_NEED_RESCHED)
			smp_send_reschedule(cpu);
	} else {
		trace_sched_wake_idle_without_ipi(cpu);
	}
}
raw_spin_rq_lock_irqsave(rq, flags); if (cpu_online(cpu) || cpu == smp_processor_id())
resched_curr(rq);
raw_spin_rq_unlock_irqrestore(rq, flags);
}
#ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy CPU for migrating timers * from an idle CPU. This is good for power-savings. * * We don't do similar optimization for completely idle system, as * selecting an idle CPU will add more delays to the timers than intended * (as that CPU's timer base may not be up to date wrt jiffies etc).
*/ int get_nohz_timer_target(void)
{ int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; conststruct cpumask *hk_mask;
if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) { if (!idle_cpu(cpu)) return cpu;
default_cpu = cpu;
}
if (default_cpu == -1)
default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
return default_cpu;
}
/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	/* The local CPU is obviously not idle in this path. */
	if (cpu == smp_processor_id())
		return;

	/*
	 * Set TIF_NEED_RESCHED and send an IPI if in the non-polling
	 * part of the idle loop. This forces an exit from the idle loop
	 * and a round trip to schedule(). Now this could be optimized
	 * because a simple new idle loop iteration is enough to
	 * re-evaluate the next tick. Provided some re-ordering of tick
	 * nohz functions that would need to follow TIF_NR_POLLING
	 * clearing:
	 *
	 * - On most architectures, a simple fetch_or on ti::flags with a
	 *   "0" value would be enough to know if an IPI needs to be sent.
	 *
	 * - x86 needs to perform a last need_resched() check between
	 *   monitor and mwait which doesn't take timers into account.
	 *   There a dedicated TIF_TIMER flag would be required to
	 *   fetch_or here and be checked along with TIF_NEED_RESCHED
	 *   before mwait().
	 *
	 * However, remote timer enqueue is not such a frequent event
	 * and testing of the above solutions didn't appear to report
	 * much benefits.
	 */
	if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}
staticbool wake_up_full_nohz_cpu(int cpu)
{ /* * We just need the target to call irq_exit() and re-evaluate * the next tick. The nohz full kick at least implies that. * If needed we can still optimize that later with an * empty IRQ.
*/ if (cpu_is_offline(cpu)) returntrue; /* Don't try to wake offline CPUs. */ if (tick_nohz_full_cpu(cpu)) { if (cpu != smp_processor_id() ||
tick_nohz_tick_stopped())
tick_nohz_full_kick_cpu(cpu); returntrue;
}
returnfalse;
}
/*
 * Wake up the specified CPU. If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
	/* Prefer the nohz_full kick; fall back to the idle wakeup. */
	if (wake_up_full_nohz_cpu(cpu))
		return;

	wake_up_idle_cpu(cpu);
}
staticvoid nohz_csd_func(void *info)
{ struct rq *rq = info; int cpu = cpu_of(rq); unsignedint flags;
if (p->sched_class != &fair_sched_class) returnfalse;
if (!task_on_rq_queued(p)) returnfalse;
returntrue;
}
bool sched_can_stop_tick(struct rq *rq)
{ int fifo_nr_running;
/* Deadline tasks, even if single, need the tick */ if (rq->dl.dl_nr_running) returnfalse;
/* * If there are more than one RR tasks, we need the tick to affect the * actual RR behaviour.
*/ if (rq->rt.rr_nr_running) { if (rq->rt.rr_nr_running == 1) returntrue; else returnfalse;
}
/* * If there's no RR tasks, but FIFO tasks, we can skip the tick, no * forced preemption between FIFO tasks.
*/
fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; if (fifo_nr_running) returntrue;
/* * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks * left. For CFS, if there's more than one we need the tick for * involuntary preemption. For SCX, ask.
*/ if (scx_enabled() && !scx_can_stop_tick(rq)) returnfalse;
if (rq->cfs.h_nr_queued > 1) returnfalse;
/* * If there is one task and it has CFS runtime bandwidth constraints * and it's on the cpu now we don't want to stop the tick. * This check prevents clearing the bit if a newly enqueued task here is * dequeued by migrating while the constrained task continues to run. * E.g. going from 2->1 without going through pick_next_task().
*/ if (__need_bw_check(rq, rq->curr)) { if (cfs_task_bw_constrained(rq->curr)) returnfalse;
}
returntrue;
} #endif/* CONFIG_NO_HZ_FULL */
#ifdefined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED) /* * Iterate task_group tree rooted at *from, calling @down when first entering a * node and @up when leaving it for the final time. * * Caller must hold rcu_lock or sufficient equivalent.
*/ int walk_tg_tree_from(struct task_group *from,
tg_visitor down, tg_visitor up, void *data)
{ struct task_group *parent, *child; int ret;
parent = from;
down:
ret = (*down)(parent, data); if (ret) goto out;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child; goto down;
up: continue;
}
ret = (*up)(parent, data); if (ret || parent == from) goto out;
/* * SCHED_OTHER tasks have to update their load when changing their * weight
*/ if (update_load && p->sched_class->reweight_task)
p->sched_class->reweight_task(task_rq(p), p, &lw); else
p->se.load = lw;
}
#ifdef CONFIG_UCLAMP_TASK
/*
 * Serializes updates of utilization clamp values
 *
 * The (slow-path) user-space triggers utilization clamp value updates which
 * can require updates on (fast-path) scheduler's data structures used to
 * support enqueue/dequeue operations.
 * While the per-CPU rq lock protects fast-path update operations, user-space
 * requests are serialized using a mutex to reduce the risk of conflicting
 * updates or API abuses.
 */
static __maybe_unused DEFINE_MUTEX(uclamp_mutex);

/* Max allowed maximum utilization */
static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;

/*
 * By default RT tasks run at the maximum performance point/capacity of the
 * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
 * SCHED_CAPACITY_SCALE.
 *
 * This knob allows admins to change the default behavior when uclamp is being
 * used. In battery powered devices, particularly, running at the maximum
 * capacity and frequency will increase energy consumption and shorten the
 * battery life.
 *
 * This knob only affects RT tasks that their uclamp_se->user_defined == false.
 *
 * This knob will not override the system default sched_util_clamp_min defined
 * above.
 */
unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;

/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];

/*
 * This static key is used to reduce the uclamp overhead in the fast path. It
 * primarily disables the call to uclamp_rq_{inc, dec}() in
 * enqueue/dequeue_task().
 *
 * This allows users to continue to enable uclamp in their kernel config with
 * minimum uclamp overhead in the fast path.
 *
 * As soon as userspace modifies any of the uclamp knobs, the static key is
 * enabled, since we have an actual users that make use of uclamp
 * functionality.
 *
 * The knobs that would enable this static key are:
 *
 *   * A task modifying its uclamp value with sched_setattr().
 *   * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
 *   * An admin modifying the cgroup cpu.uclamp.{min, max}
 */
DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
staticinlineunsignedint
uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id, unsignedint clamp_value)
{ /* * Avoid blocked utilization pushing up the frequency when we go * idle (which drops the max-clamp) by retaining the last known * max-clamp.
*/ if (clamp_id == UCLAMP_MAX) {
rq->uclamp_flags |= UCLAMP_FLAG_IDLE; return clamp_value;
}
return uclamp_none(UCLAMP_MIN);
}
staticinlinevoid uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, unsignedint clamp_value)
{ /* Reset max-clamp retention only on idle exit */ if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) return;
/* * Since both min and max clamps are max aggregated, find the * top most bucket with tasks in.
*/ for ( ; bucket_id >= 0; bucket_id--) { if (!bucket[bucket_id].tasks) continue; return bucket[bucket_id].value;
}
/* Re-sync @p's uclamp-min with the RT default sysctl; no-op for non-RT tasks. */
static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
	if (!rt_task(p))
		return;

	/* Protect updates to p->uclamp_* */
	guard(task_rq_lock)(p);
	__uclamp_update_util_min_rt_default(p);
}
staticinlinestruct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{ /* Copy by value as we could modify it */ struct uclamp_se uc_req = p->uclamp_req[clamp_id]; #ifdef CONFIG_UCLAMP_TASK_GROUP unsignedint tg_min, tg_max, value;
/* * Tasks in autogroups or root task group will be * restricted by system defaults.
*/ if (task_group_is_autogroup(task_group(p))) return uc_req; if (task_group(p) == &root_task_group) return uc_req;
tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
value = uc_req.value;
value = clamp(value, tg_min, tg_max);
uclamp_se_set(&uc_req, value, false); #endif
return uc_req;
}
/* * The effective clamp bucket index of a task depends on, by increasing * priority: * - the task specific clamp value, when explicitly requested from userspace * - the task group effective clamp value, for tasks not either in the root * group or in an autogroup * - the system default clamp value, defined by the sysadmin
*/ staticinlinestruct uclamp_se
uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{ struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id); struct uclamp_se uc_max = uclamp_default[clamp_id];
/* System default restrictions always apply */ if (unlikely(uc_req.value > uc_max.value)) return uc_max;
/* Task currently refcounted: use back-annotated (effective) value */ if (p->uclamp[clamp_id].active) return (unsignedlong)p->uclamp[clamp_id].value;
uc_eff = uclamp_eff_get(p, clamp_id);
return (unsignedlong)uc_eff.value;
}
/* * When a task is enqueued on a rq, the clamp bucket currently defined by the * task's uclamp::bucket_id is refcounted on that rq. This also immediately * updates the rq's clamp value if required. * * Tasks can have a task-specific value requested from user-space, track * within each bucket the maximum value for tasks refcounted in it. * This "local max aggregation" allows to track the exact "requested" value * for each bucket when all its RUNNABLE tasks require the same clamp.
*/ staticinlinevoid uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, enum uclamp_id clamp_id)
{ struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; struct uclamp_se *uc_se = &p->uclamp[clamp_id]; struct uclamp_bucket *bucket;
/* * Local max aggregation: rq buckets always track the max * "requested" clamp value of its RUNNABLE tasks.
*/ if (bucket->tasks == 1 || uc_se->value > bucket->value)
bucket->value = uc_se->value;
if (uc_se->value > uclamp_rq_get(rq, clamp_id))
uclamp_rq_set(rq, clamp_id, uc_se->value);
}
/*
 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
 * is released. If this is the last task reference counting the rq's max
 * active clamp value, then the rq's clamp value is updated.
 *
 * Both refcounted tasks and rq's cached clamp values are expected to be
 * always valid. If it's detected they are not, as defensive programming,
 * enforce the expected state and warn.
 */
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
				    enum uclamp_id clamp_id)
{
	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
	struct uclamp_bucket *bucket;
	unsigned int bkt_clamp;
	unsigned int rq_clamp;

	lockdep_assert_rq_held(rq);

	/*
	 * If sched_uclamp_used was enabled after task @p was enqueued,
	 * we could end up with unbalanced call to uclamp_rq_dec_id().
	 *
	 * In this case the uc_se->active flag should be false since no uclamp
	 * accounting was performed at enqueue time and we can just return
	 * here.
	 *
	 * Need to be careful of the following enqueue/dequeue ordering
	 * problem too
	 *
	 *	enqueue(taskA)
	 *	// sched_uclamp_used gets enabled
	 *	enqueue(taskB)
	 *	dequeue(taskA)
	 *	// Must not decrement bucket->tasks here
	 *	dequeue(taskB)
	 *
	 * where we could end up with stale data in uc_se and
	 * bucket[uc_se->bucket_id].
	 *
	 * The following check here eliminates the possibility of such race.
	 */
	if (unlikely(!uc_se->active))
		return;

	bucket = &uc_rq->bucket[uc_se->bucket_id];

	/* Defensive: never underflow the bucket refcount. */
	WARN_ON_ONCE(!bucket->tasks);
	if (likely(bucket->tasks))
		bucket->tasks--;

	uc_se->active = false;

	/*
	 * Keep "local max aggregation" simple and accept to (possibly)
	 * overboost some RUNNABLE tasks in the same bucket.
	 * The rq clamp bucket value is reset to its base value whenever
	 * there are no more RUNNABLE tasks refcounting it.
	 */
	if (likely(bucket->tasks))
		return;

	rq_clamp = uclamp_rq_get(rq, clamp_id);
	/*
	 * Defensive programming: this should never happen. If it happens,
	 * e.g. due to future modification, warn and fix up the expected value.
	 */
	WARN_ON_ONCE(bucket->value > rq_clamp);
	if (bucket->value >= rq_clamp) {
		bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
		uclamp_rq_set(rq, clamp_id, bkt_clamp);
	}
}
/* * Avoid any overhead until uclamp is actually used by the userspace. * * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled.
*/ if (!uclamp_is_used()) return;
if (unlikely(!p->sched_class->uclamp_enabled)) return;
/* Only inc the delayed task which being woken up. */ if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)) return;
/* * Avoid any overhead until uclamp is actually used by the userspace. * * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled.
*/ if (!uclamp_is_used()) return;
if (unlikely(!p->sched_class->uclamp_enabled)) return;
/* * Make sure to clear the idle flag if we've transiently reached 0 * active tasks on rq.
*/ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}
/* * Lock the task and the rq where the task is (or was) queued. * * We might lock the (previous) rq of a !RUNNABLE task, but that's the * price to pay to safely serialize util_{min,max} updates with * enqueues, dequeues and migration operations. * This is the same locking schema used by __set_cpus_allowed_ptr().
*/
rq = task_rq_lock(p, &rf);
/* * Setting the clamp bucket is serialized by task_rq_lock(). * If the task is not yet RUNNABLE and its task_struct is not * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value.
*/
for_each_clamp_id(clamp_id)
uclamp_rq_reinc_id(rq, p, clamp_id);
/* * copy_process() sysctl_uclamp * uclamp_min_rt = X; * write_lock(&tasklist_lock) read_lock(&tasklist_lock) * // link thread smp_mb__after_spinlock() * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); * sched_post_fork() for_each_process_thread() * __uclamp_sync_rt() __uclamp_sync_rt() * * Ensures that either sched_post_fork() will observe the new * uclamp_min_rt or for_each_process_thread() will observe the new * task.
*/
read_lock(&tasklist_lock);
smp_mb__after_spinlock();
read_unlock(&tasklist_lock);
if (update_root_tg) {
sched_uclamp_enable();
uclamp_update_root_tg();
}
if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
sched_uclamp_enable();
uclamp_sync_util_min_rt_default();
}
/* * We update all RUNNABLE tasks only when task groups are in use. * Otherwise, keep it simple and do just a lazy update at each next * task enqueue time.
*/ return 0;
/* * We don't need to hold task_rq_lock() when updating p->uclamp_* here * as the task is still at its early fork stages.
*/
for_each_clamp_id(clamp_id)
p->uclamp[clamp_id].active = false;
/* Only get wchan if task is blocked and we can keep it that way. */
raw_spin_lock_irq(&p->pi_lock);
state = READ_ONCE(p->__state);
smp_rmb(); /* see try_to_wake_up() */ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
ip = __get_wchan(p);
raw_spin_unlock_irq(&p->pi_lock);
return ip;
}
/*
 * enqueue_task - add @p to @rq's runqueue.
 *
 * @flags carries ENQUEUE_* bits that modify the operation (clock handling,
 * restore vs. fresh enqueue, etc.). Callers hold the rq lock.
 */
void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	/* ENQUEUE_NOCLOCK: caller already updated the rq clock. */
	if (!(flags & ENQUEUE_NOCLOCK))
		update_rq_clock(rq);

	/*
	 * Can be before ->enqueue_task() because uclamp considers the
	 * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared
	 * in ->enqueue_task().
	 */
	uclamp_rq_inc(rq, p, flags);

	p->sched_class->enqueue_task(rq, p, flags);

	psi_enqueue(p, flags);

	/* RESTORE is a paired save/restore cycle, not a real state change. */
	if (!(flags & ENQUEUE_RESTORE))
		sched_info_enqueue(rq, p);

	if (sched_core_enabled(rq))
		sched_core_enqueue(rq, p);
}
/* * Must only return false when DEQUEUE_SLEEP.
*/ inlinebool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{ if (sched_core_enabled(rq))
sched_core_dequeue(rq, p, flags);
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & DEQUEUE_SAVE))
sched_info_dequeue(rq, p);
psi_dequeue(p, flags);
/* * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' * and mark the task ->sched_delayed.
*/
uclamp_rq_dec(rq, p); return p->sched_class->dequeue_task(rq, p, flags);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{ if (task_on_rq_migrating(p))
flags |= ENQUEUE_MIGRATED; if (flags & ENQUEUE_MIGRATED)
sched_mm_cid_migrate_to(rq, p);
/* * Code explicitly relies on TASK_ON_RQ_MIGRATING begin set *before* * dequeue_task() and cleared *after* enqueue_task().
*/
dequeue_task(rq, p, flags);
}
/*
 * block_task - dequeue @p for sleep and, if the dequeue succeeded (i.e. the
 * task was not left ->sched_delayed on the queue), fully block it.
 */
static void block_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags))
		__block_task(rq, p);
}
/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}
/* * ->switching_to() is called with the pi_lock and rq_lock held and must not * mess with locking.
*/ void check_class_changing(struct rq *rq, struct task_struct *p, conststruct sched_class *prev_class)
{ if (prev_class != p->sched_class && p->sched_class->switching_to)
p->sched_class->switching_to(rq, p);
}
/* * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, * use the balance_callback list if you want balancing. * * this means any call to check_class_changed() must be followed by a call to * balance_callback().
*/ void check_class_changed(struct rq *rq, struct task_struct *p, conststruct sched_class *prev_class, int oldprio)
{ if (prev_class != p->sched_class) { if (prev_class->switched_from)
prev_class->switched_from(rq, p);
/* * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update.
*/ if (task_on_rq_queued(donor) && test_tsk_need_resched(rq->curr))
rq_clock_skip_update(rq);
}
static __always_inline int __task_state_match(struct task_struct *p, unsignedint state)
{ if (READ_ONCE(p->__state) & state) return 1;
if (READ_ONCE(p->saved_state) & state) return -1;
return 0;
}
static __always_inline int task_state_match(struct task_struct *p, unsignedint state)
{ /* * Serialize against current_save_and_set_rtlock_wait_state(), * current_restore_rtlock_saved_state(), and __refrigerator().
*/
guard(raw_spinlock_irq)(&p->pi_lock); return __task_state_match(p, state);
}
/* * wait_task_inactive - wait for a thread to unschedule. * * Wait for the thread to block in any of the states set in @match_state. * If it changes, i.e. @p might have woken up, then return zero. When we * succeed in waiting for @p to be off its CPU, we return a positive number * (its total switch count). If a second call a short while later returns the * same number, the caller can be sure that @p has remained unscheduled the * whole time. * * The caller must ensure that the task *will* unschedule sometime soon, * else this function might spin for a *long* time. This function can't * be called with interrupts off, or it may introduce deadlock with * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive.
*/ unsignedlong wait_task_inactive(struct task_struct *p, unsignedint match_state)
{ int running, queued, match; struct rq_flags rf; unsignedlong ncsw; struct rq *rq;
for (;;) { /* * We do the initial early heuristics without holding * any task-queue locks at all. We'll only try to get * the runqueue lock when things look like they will * work out!
*/
rq = task_rq(p);
/* * If the task is actively running on another CPU * still, just relax and busy-wait without holding * any locks. * * NOTE! Since we don't hold any locks, it's not * even sure that "rq" stays as the right runqueue! * But we don't care, since "task_on_cpu()" will * return false if the runqueue has changed and p * is actually now running somewhere else!
*/ while (task_on_cpu(rq, p)) { if (!task_state_match(p, match_state)) return 0;
cpu_relax();
}
/* * Ok, time to look more closely! We need the rq * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat.
*/
rq = task_rq_lock(p, &rf); /* * If task is sched_delayed, force dequeue it, to avoid always * hitting the tick timeout in the queued case
*/ if (p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
trace_sched_wait_task(p);
running = task_on_cpu(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0; if ((match = __task_state_match(p, match_state))) { /* * When matching on p->saved_state, consider this task * still queued so it will wait.
*/ if (match < 0)
queued = 1;
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
}
task_rq_unlock(rq, p, &rf);
/* * If it changed from the expected state, bail out now.
*/ if (unlikely(!ncsw)) break;
/* * Was it really running after all now that we * checked with the proper locks actually held? * * Oops. Go back and try again..
*/ if (unlikely(running)) {
cpu_relax(); continue;
}
/* * It's not enough that it's not actively running, * it must be off the runqueue _entirely_, and not * preempted! * * So if it was still runnable (but just not actively * running right now), it's preempted, and we should * yield - it could be a while.
*/ if (unlikely(queued)) {
ktime_t to = NSEC_PER_SEC / HZ;
/* * Ahh, all good. It wasn't running, and it wasn't * runnable, which means that it will never become * running in the future either. We're all done!
*/ break;
}
#ifdef CONFIG_DEBUG_PREEMPT /* * Check both overflow from migrate_disable() and superfluous * migrate_enable().
*/ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) return; #endif
if (p->migration_disabled > 1) {
p->migration_disabled--; return;
}
/* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
*/
guard(preempt)(); if (p->cpus_ptr != &p->cpus_mask)
__set_cpus_allowed_ptr(p, &ac); /* * Mustn't clear migration_disabled() until cpus_ptr points back at the * regular cpus_mask, otherwise things that race (eg. * select_fallback_rq) get confused.
*/
barrier();
p->migration_disabled = 0;
this_rq()->nr_pinned--;
}
EXPORT_SYMBOL_GPL(migrate_enable);
/* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq().
*/ staticinlinebool is_cpu_allowed(struct task_struct *p, int cpu)
{ /* When not in the task's cpumask, no point in looking further. */ if (!task_allowed_on_cpu(p, cpu)) returnfalse;
/* migrate_disabled() must be allowed to finish. */ if (is_migration_disabled(p)) return cpu_online(cpu);
/* Non kernel threads are not allowed during either online or offline. */ if (!(p->flags & PF_KTHREAD)) return cpu_active(cpu);
/* KTHREAD_IS_PER_CPU is always allowed. */ if (kthread_is_per_cpu(p)) return cpu_online(cpu);
/* Regular kernel threads don't get to stay during offline. */ if (cpu_dying(cpu)) returnfalse;
/* But are allowed during online. */ return cpu_online(cpu);
}
/* * This is how migration works: * * 1) we invoke migration_cpu_stop() on the target CPU using * stop_one_cpu(). * 2) stopper starts to run (implicitly forcing the migrated thread * off the CPU) * 3) it checks whether the migrated task is still in the wrong runqueue. * 4) if it's in the wrong runqueue then the migration thread removes * it and puts it into the right queue. * 5) stopper completes and stop_one_cpu() returns and the migration * is done.
*/
/* * move_queued_task - move a queued task to new rq. * * Returns (locked) new rq. Old rq's lock is released.
*/ staticstruct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int new_cpu)
{
lockdep_assert_rq_held(rq);
	/*
	 * NOTE(review): five lines of unrelated German web-page disclaimer
	 * text ("Die Informationen auf dieser Webseite ...") were appended
	 * here by whatever tool extracted this file; they are not part of
	 * the kernel source and have been removed. The remainder of
	 * move_queued_task()'s body is missing from this chunk and must be
	 * restored from the upstream tree.
	 */