/* SPDX-License-Identifier: GPL-2.0 */ /* * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst * * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. * Copyright (c) 2022 Tejun Heo <tj@kernel.org> * Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/ #include <linux/btf_ids.h> #include"ext_idle.h"
/* * NOTE: sched_ext is in the process of growing multiple scheduler support and * scx_root usage is in a transitional state. Naked dereferences are safe if the * caller is one of the tasks attached to SCX and explicit RCU dereference is * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but * are used as temporary markers to indicate that the dereferences need to be * updated to point to the associated scheduler instances rather than scx_root.
*/ staticstruct scx_sched __rcu *scx_root;
/* * During exit, a task may schedule after losing its PIDs. When disabling the * BPF scheduler, we need to be able to iterate tasks in every state to * guarantee system safety. Maintain a dedicated task list which contains every * task between its fork and eventual free.
*/ static DEFINE_SPINLOCK(scx_tasks_lock); static LIST_HEAD(scx_tasks);
/* * A monotically increasing sequence number that is incremented every time a * scheduler is enabled. This can be used by to check if any custom sched_ext * scheduler has ever been used in the system.
*/ static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
/* * The maximum amount of time in jiffies that a task may be runnable without * being scheduled on a CPU. If this timeout is exceeded, it will trigger * scx_error().
*/ staticunsignedlong scx_watchdog_timeout;
/* * The last time the delayed work was run. This delayed work relies on * ksoftirqd being able to run to service timer interrupts, so it's possible * that this work itself could get wedged. To account for this, we check that * it's not stalled in the timer tick, and trigger an error if it is.
*/ staticunsignedlong scx_watchdog_timestamp = INITIAL_JIFFIES;
staticstruct delayed_work scx_watchdog_work;
/* for %SCX_KICK_WAIT */ staticunsignedlong __percpu *scx_kick_cpus_pnt_seqs;
/* * Direct dispatch marker. * * Non-NULL values are used for direct dispatch from enqueue path. A valid * pointer points to the task currently being enqueued. An ERR_PTR value is used * to indicate that direct dispatch has already happened.
*/ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
/*
 * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
 * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
 * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check
 * whether it's running from an allowed context.
 *
 * scx_kf_allow() adds @mask to current->scx.kf_mask. It warns once if @mask
 * or any bit above it is already set in the current task's mask.
 *
 * @mask is constant, always inline to cull the mask calculations.
 */
static __always_inline void scx_kf_allow(u32 mask)
{
	/* @mask itself plus every bit above it must currently be clear */
	u32 conflicting = mask | higher_bits(mask);

	/* nesting is allowed only in increasing scx_kf_mask order */
	WARN_ONCE(current->scx.kf_mask & conflicting,
		  "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
		  current->scx.kf_mask, mask);

	current->scx.kf_mask |= mask;
	barrier();
}
/*
 * Track the rq currently locked.
 *
 * This allows kfuncs to safely operate on rq from any scx ops callback,
 * knowing which rq is already locked.
 *
 * The per-CPU slot is written through update_locked_rq(): SCX_CALL_OP() stores
 * the rq before invoking the op when a non-NULL rq is given and stores NULL
 * again once the op has returned.
 *
 * NOTE(review): not declared static - presumably referenced from another
 * translation unit; confirm against the matching declaration.
 */
DEFINE_PER_CPU(struct rq *, scx_locked_rq_state);
/*
 * update_locked_rq - Record which rq this CPU currently holds for an SCX op
 * @rq: rq that is locked by the caller, or %NULL to clear the record
 *
 * Stores @rq in this CPU's scx_locked_rq_state slot.
 */
static inline void update_locked_rq(struct rq *rq)
{
	/*
	 * Check whether @rq is actually locked. This can help expose bugs
	 * or incorrect assumptions about the context in which a kfunc or
	 * callback is executed.
	 */
	if (rq)
		lockdep_assert_rq_held(rq);

	__this_cpu_write(scx_locked_rq_state, rq);
}
/*
 * SCX_CALL_OP - Invoke (sch)->ops.op(args) with kfunc-mask and rq bookkeeping
 * @sch: scheduler instance whose ops table is used
 * @mask: scx_kf_mask bits to allow for the duration of the call, 0 for none
 * @op: name of the ops member to call
 * @rq: rq locked by the caller, or NULL
 * @args: arguments forwarded to the op
 *
 * When @rq is non-NULL it is recorded through update_locked_rq() before the
 * call and cleared afterwards. When @mask is non-zero the call is bracketed by
 * scx_kf_allow()/scx_kf_disallow().
 *
 * @mask and @rq appear more than once in the expansion, so pass expressions
 * without side effects.
 */
#define SCX_CALL_OP(sch, mask, op, rq, args...)					\
do {										\
	if (rq)									\
		update_locked_rq(rq);						\
	if (mask) {								\
		scx_kf_allow(mask);						\
		(sch)->ops.op(args);						\
		scx_kf_disallow(mask);						\
	} else {								\
		(sch)->ops.op(args);						\
	}									\
	if (rq)									\
		update_locked_rq(NULL);						\
} while (0)
/*
 * Some kfuncs are allowed only on the tasks that are subjects of the
 * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such
 * restrictions, the following SCX_CALL_OP_*() variants should be used when
 * invoking scx_ops operations that take task arguments. These can only be used
 * for non-nesting operations due to the way the tasks are tracked.
 *
 * kfuncs which can only operate on such tasks can in turn use
 * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
 * the specific task.
 *
 * SCX_CALL_OP_TASK() records @task in current->scx.kf_tasks[0] for the
 * duration of the call and passes it to the op as the first argument. @mask
 * must only contain bits in %__SCX_KF_TERMINAL; anything else fails the build.
 */
#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...)			\
do {										\
	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
	current->scx.kf_tasks[0] = task;					\
	SCX_CALL_OP((sch), mask, op, rq, task, ##args);				\
	current->scx.kf_tasks[0] = NULL;					\
} while (0)
/*
 * scx_kf_allowed - Test whether a kfunc may run in the current SCX context
 * @mask: scx_kf_mask bits from which the calling kfunc may be invoked
 *
 * Returns %true if current->scx.kf_mask shares at least one bit with @mask and
 * no nesting boundary is violated. Otherwise reports the problem through
 * scx_kf_error() and returns %false.
 *
 * @mask is constant, always inline to cull unnecessary branches.
 */
static __always_inline bool scx_kf_allowed(u32 mask)
{
	if (unlikely(!(current->scx.kf_mask & mask))) {
		scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
			     mask, current->scx.kf_mask);
		return false;
	}

	/*
	 * Enforce nesting boundaries. e.g. A kfunc which can be called from
	 * DISPATCH must not be called if we're running DEQUEUE which is nested
	 * inside ops.dispatch(). We don't need to check boundaries for any
	 * blocking kfuncs as the verifier ensures they're only called from
	 * sleepable progs.
	 */
	if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
		     (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
		scx_kf_error("cpu_release kfunc called from a nested operation");
		return false;
	}

	if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
		     (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
		scx_kf_error("dispatch kfunc called from a nested operation");
		return false;
	}

	return true;
}
/*
 * scx_kf_allowed_on_arg_tasks - scx_kf_allowed() plus a subject-task check
 * @mask: scx_kf_mask bits from which the calling kfunc may be invoked
 * @p: task the kfunc wants to operate on
 *
 * Returns %true only if scx_kf_allowed(@mask) passes and @p is one of the
 * tasks recorded in current->scx.kf_tasks[0..1]. Otherwise an error is
 * reported and %false is returned. See SCX_CALL_OP_TASK() for how slot 0 is
 * populated.
 *
 * NOTE(review): slot 1 is presumably filled by a two-task call variant that is
 * not visible here - confirm.
 */
static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
							struct task_struct *p)
{
	if (!scx_kf_allowed(mask))
		return false;

	if (unlikely((p != current->scx.kf_tasks[0] &&
		      p != current->scx.kf_tasks[1]))) {
		scx_kf_error("called on a task not being operated on");
		return false;
	}

	return true;
}
/** * nldsq_next_task - Iterate to the next task in a non-local DSQ * @dsq: user dsq being iterated * @cur: current position, %NULL to start iteration * @rev: walk backwards * * Returns %NULL when iteration is finished.
*/ staticstruct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, struct task_struct *cur, bool rev)
{ struct list_head *list_node; struct scx_dsq_list_node *dsq_lnode;
lockdep_assert_held(&dsq->lock);
if (cur)
list_node = &cur->scx.dsq_list.node; else
list_node = &dsq->list;
/* find the next task, need to skip BPF iteration cursors */ do { if (rev)
list_node = list_node->prev; else
list_node = list_node->next;
/* * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] * dispatch order. BPF-visible iterator is opaque and larger to allow future * changes without breaking backward compatibility. Can be used with * bpf_for_each(). See bpf_iter_scx_dsq_*().
*/ enum scx_dsq_iter_flags { /* iterate in the reverse dispatch order */
SCX_DSQ_ITER_REV = 1U << 16,
/** * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration * @iter: iterator to init * * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter * must eventually be stopped with scx_task_iter_stop(). * * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() * between this and the first next() call or between any two next() calls. If * the locks are released between two next() calls, the caller is responsible * for ensuring that the task being iterated remains accessible either through * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be * visited as long as they still exist.
*/ staticvoid scx_task_iter_start(struct scx_task_iter *iter)
{
BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
/**
 * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
 * @iter: iterator to unlock
 *
 * If @iter is in the middle of a locked iteration, it may be locking the rq of
 * the task currently being visited in addition to scx_tasks_lock. Unlock both.
 * This function can be safely called anytime during an iteration.
 */
static void scx_task_iter_unlock(struct scx_task_iter *iter)
{
	/* drop the visited task's rq lock first, then the list lock */
	__scx_task_iter_rq_unlock(iter);
	spin_unlock_irq(&scx_tasks_lock);
}
/**
 * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
 * @iter: iterator to re-lock
 *
 * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
 * doesn't re-lock the rq lock. Must be called before other iterator operations.
 *
 * @iter itself is not accessed; only the global scx_tasks_lock is taken.
 */
static void scx_task_iter_relock(struct scx_task_iter *iter)
{
	spin_lock_irq(&scx_tasks_lock);
}
/**
 * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
 * @iter: iterator to exit
 *
 * Exit a previously initialized @iter. Must be called with scx_tasks_lock held
 * which is released on return. If the iterator holds a task's rq lock, that rq
 * lock is also released. See scx_task_iter_start() for details.
 */
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
	/* take the cursor off the task list while scx_tasks_lock is still held */
	list_del_init(&iter->cursor.tasks_node);
	scx_task_iter_unlock(iter);
}
/** * scx_task_iter_next - Next task * @iter: iterator to walk * * Visit the next task. See scx_task_iter_start() for details. Locks are dropped * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls * by holding scx_tasks_lock for too long.
*/ staticstruct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{ struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos;
if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
scx_task_iter_relock(iter);
}
/* can't happen, should always terminate at scx_tasks above */
BUG();
}
/** * scx_task_iter_next_locked - Next non-idle task with its rq locked * @iter: iterator to walk * * Visit the non-idle task with its rq lock held. Allows callers to specify * whether they would like to filter out dead tasks. See scx_task_iter_start() * for details.
*/ staticstruct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
{ struct task_struct *p;
__scx_task_iter_rq_unlock(iter);
while ((p = scx_task_iter_next(iter))) { /* * scx_task_iter is used to prepare and move tasks into SCX * while loading the BPF scheduler and vice-versa while * unloading. The init_tasks ("swappers") should be excluded * from the iteration because: * * - It's unsafe to use __setschduler_prio() on an init_task to * determine the sched_class to use as it won't preserve its * idle_sched_class. * * - ops.init/exit_task() can easily be confused if called with * init_tasks as they, e.g., share PID 0. * * As init_tasks are never scheduled through SCX, they can be * skipped safely. Note that is_idle_task() which tests %PF_IDLE * doesn't work here: * * - %PF_IDLE may not be set for an init_task whose CPU hasn't * yet been onlined. * * - %PF_IDLE can be set on tasks that are not init_tasks. See * play_idle_precise() used by CONFIG_IDLE_INJECT. * * Test for idle_sched_class as only init_tasks are on it.
*/ if (p->sched_class != &idle_sched_class) break;
} if (!p) return NULL;
/**
 * scx_add_event - Increase an event counter for 'name' by 'cnt'
 * @sch: scx_sched to account events for
 * @name: an event name defined in struct scx_event_stats
 * @cnt: the number of the event occurred
 *
 * This can be used when preemption is not disabled.
 *
 * @cnt is evaluated exactly once so that the counter update and the
 * tracepoint always see the same value, even when @cnt has side effects or
 * reads a clock.
 */
#define scx_add_event(sch, name, cnt) do {					\
	__typeof__(cnt) __scx_ev_cnt = (cnt);					\
	this_cpu_add((sch)->pcpu->event_stats.name, __scx_ev_cnt);		\
	trace_sched_ext_event(#name, __scx_ev_cnt);				\
} while(0)
/**
 * __scx_add_event - Increase an event counter for 'name' by 'cnt'
 * @sch: scx_sched to account events for
 * @name: an event name defined in struct scx_event_stats
 * @cnt: the number of the event occurred
 *
 * This should be used only when preemption is disabled.
 *
 * @cnt is evaluated exactly once so that the counter update and the
 * tracepoint always see the same value, even when @cnt has side effects.
 */
#define __scx_add_event(sch, name, cnt) do {					\
	__typeof__(cnt) __scx_ev_cnt = (cnt);					\
	__this_cpu_add((sch)->pcpu->event_stats.name, __scx_ev_cnt);		\
	trace_sched_ext_event(#name, __scx_ev_cnt);				\
} while(0)
/**
 * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
 * @dst_e: destination event stats
 * @src_e: source event stats
 * @kind: a kind of event to be aggregated
 *
 * Adds the @kind member of @src_e to the same member of @dst_e. The source is
 * read with READ_ONCE(); the destination is updated with a plain "+=".
 */
#define scx_agg_event(dst_e, src_e, kind) do {					\
	(dst_e)->kind += READ_ONCE((src_e)->kind);				\
} while(0)
/**
 * scx_dump_event - Dump an event 'kind' in 'events' to 's'
 * @s: output seq_buf
 * @events: event stats
 * @kind: a kind of event to dump
 *
 * Emits one line through dump_line(): the stringified @kind right-aligned in
 * 40 columns, then the counter value in 16 columns. @s is passed by address,
 * so it must be an lvalue.
 */
#define scx_dump_event(s, events, kind) do {					\
	dump_line(&(s), "%40s: %16lld", #kind, (events)->kind);			\
} while (0)
/** * wait_ops_state - Busy-wait the specified ops state to end * @p: target task * @opss: state to wait the end of * * Busy-wait for @p to transition out of @opss. This can only be used when the * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also * has load_acquire semantics to ensure that the caller can see the updates made * in the enqueueing and dispatching paths.
*/ staticvoid wait_ops_state(struct task_struct *p, unsignedlong opss)
{ do {
cpu_relax();
} while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
}
/** * ops_cpu_valid - Verify a cpu number, to be used on ops input args * @sch: scx_sched to abort on error * @cpu: cpu number which came from a BPF ops * @where: extra information reported on error * * @cpu is a cpu number which came from the BPF scheduler and can be any value. * Verify that it is in range and one of the possible cpus. If invalid, trigger * an ops error.
*/ staticbool ops_cpu_valid(struct scx_sched *sch, s32 cpu, constchar *where)
{ if (__cpu_valid(cpu)) { returntrue;
} else {
scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); returnfalse;
}
}
/** * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args * @cpu: cpu number which came from a BPF ops * @where: extra information reported on error * * The same as ops_cpu_valid() but @sch is implicit.
*/ staticbool kf_cpu_valid(u32 cpu, constchar *where)
{ if (__cpu_valid(cpu)) { returntrue;
} else {
scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); returnfalse;
}
}
/**
 * ops_sanitize_err - Sanitize a -errno value
 * @sch: scx_sched to error out on error
 * @ops_name: operation to blame on failure
 * @err: -errno value to sanitize
 *
 * Verify @err is a valid -errno. If not, trigger scx_error() and return
 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
 * cause misbehaviors. For an example, a large negative return from
 * ops.init_task() triggers an oops when passed up the call chain because the
 * value fails IS_ERR() test after being encoded with ERR_PTR() and then is
 * handled as a pointer.
 *
 * Only values in [-MAX_ERRNO, -1] are passed through unchanged. Everything
 * else, including zero and positive values, is reported and replaced.
 */
static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err)
{
	if (err < 0 && err >= -MAX_ERRNO)
		return err;

	scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err);
	return -EPROTO;
}
/**
 * schedule_deferred - Schedule execution of deferred actions on an rq
 * @rq: target rq
 *
 * Schedule execution of deferred actions on @rq. Must be called with @rq
 * locked. Deferred actions are executed with @rq locked but unpinned, and thus
 * can unlock @rq to e.g. migrate tasks to other rqs.
 */
static void schedule_deferred(struct rq *rq)
{
	lockdep_assert_rq_held(rq);

	/*
	 * If in the middle of waking up a task, task_woken_scx() will be called
	 * afterwards which will then run the deferred actions, no need to
	 * schedule anything.
	 */
	if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
		return;

	/*
	 * If in balance, the balance callbacks will be called before rq lock is
	 * released. Schedule one.
	 */
	if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
		queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
				       deferred_bal_cb_workfn);
		return;
	}

	/*
	 * No scheduler hooks available. Queue an irq work. They are executed on
	 * IRQ re-enable which may take a bit longer than the scheduler hooks.
	 * The above WAKEUP and BALANCE paths should cover most of the cases and
	 * the time to IRQ re-enable shouldn't be long.
	 */
	irq_work_queue(&rq->scx.deferred_irq_work);
}
/**
 * touch_core_sched - Update timestamp used for core-sched task ordering
 * @rq: rq to read clock from, must be locked
 * @p: task to update the timestamp for
 *
 * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
 * implement global or local-DSQ FIFO ordering for core-sched. Should be called
 * when a task becomes runnable and its turn on the CPU ends (e.g. slice
 * exhaustion).
 *
 * Without %CONFIG_SCHED_CORE only the lock assertion remains.
 */
static void touch_core_sched(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

#ifdef CONFIG_SCHED_CORE
	/*
	 * It's okay to update the timestamp spuriously. Use
	 * sched_core_disabled() which is cheaper than enabled().
	 *
	 * As this is used to determine ordering between tasks of sibling CPUs,
	 * it may be better to use per-core dispatch sequence instead.
	 */
	if (!sched_core_disabled())
		p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq));
#endif
}
/**
 * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
 * @rq: rq to read clock from, must be locked
 * @p: task being dispatched
 *
 * If the BPF scheduler implements custom core-sched ordering via
 * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
 * ordering within each local DSQ. This function is called from dispatch paths
 * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
 *
 * Without %CONFIG_SCHED_CORE only the lock assertion remains.
 */
static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

#ifdef CONFIG_SCHED_CORE
	if (unlikely(SCX_HAS_OP(scx_root, core_sched_before)))
		touch_core_sched(rq, p);
#endif
}
if (!is_local) {
raw_spin_lock(&dsq->lock); if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */
raw_spin_unlock(&dsq->lock);
dsq = find_global_dsq(p);
raw_spin_lock(&dsq->lock);
}
}
if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
(enq_flags & SCX_ENQ_DSQ_PRIQ))) { /* * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from * their FIFO queues. To avoid confusion and accidentally * starving vtime-dispatched tasks by FIFO-dispatched tasks, we * disallow any internal DSQ from doing vtime ordering of * tasks.
*/
scx_error(sch, "cannot use vtime ordering for built-in DSQs");
enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
}
if (enq_flags & SCX_ENQ_DSQ_PRIQ) { struct rb_node *rbp;
/* * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are * linked to both the rbtree and list on PRIQs, this can only be * tested easily when adding the first task.
*/ if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
nldsq_next_task(dsq, NULL, false)))
scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks",
dsq->id);
/* * Find the previous task and insert after it on the list so * that @dsq->list is vtime ordered.
*/
rbp = rb_prev(&p->scx.dsq_priq); if (rbp) { struct task_struct *prev =
container_of(rbp, struct task_struct,
scx.dsq_priq);
list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
} else {
list_add(&p->scx.dsq_list.node, &dsq->list);
}
} else { /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
dsq->id);
/* seq records the order tasks are queued, used by BPF DSQ iterator */
dsq->seq++;
p->scx.dsq_seq = dsq->seq;
dsq_mod_nr(dsq, 1);
p->scx.dsq = dsq;
/* * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the * direct dispatch path, but we clear them here because the direct * dispatch verdict may be overridden on the enqueue path during e.g. * bypass.
*/
p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
p->scx.ddsp_enq_flags = 0;
/* * We're transitioning out of QUEUEING or DISPATCHING. store_release to * match waiters' load_acquire.
*/ if (enq_flags & SCX_ENQ_CLEAR_OPSS)
atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
if (!dsq) { /* * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. * Unlinking is all that's needed to cancel.
*/ if (unlikely(!list_empty(&p->scx.dsq_list.node)))
list_del_init(&p->scx.dsq_list.node);
/* * When dispatching directly from the BPF scheduler to a local * DSQ, the task isn't associated with any DSQ but * @p->scx.holding_cpu may be set under the protection of * %SCX_OPSS_DISPATCHING.
*/ if (p->scx.holding_cpu >= 0)
p->scx.holding_cpu = -1;
return;
}
if (!is_local)
raw_spin_lock(&dsq->lock);
/* * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't * change underneath us.
*/ if (p->scx.holding_cpu < 0) { /* @p must still be on @dsq, dequeue */
task_unlink_from_dsq(p, dsq);
} else { /* * We're racing against dispatch_to_local_dsq() which already * removed @p from @dsq and set @p->scx.holding_cpu. Clear the * holding_cpu which tells dispatch_to_local_dsq() that it lost * the race.
*/
WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
p->scx.holding_cpu = -1;
}
p->scx.dsq = NULL;
if (unlikely(!dsq)) {
scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
dsq_id, p->comm, p->pid); return find_global_dsq(p);
}
return dsq;
}
staticvoid mark_direct_dispatch(struct task_struct *ddsp_task, struct task_struct *p, u64 dsq_id,
u64 enq_flags)
{ /* * Mark that dispatch already happened from ops.select_cpu() or * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value * which can never match a valid task pointer.
*/
__this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH));
/* @p must match the task on the enqueue path */ if (unlikely(p != ddsp_task)) { if (IS_ERR(ddsp_task))
scx_kf_error("%s[%d] already direct-dispatched",
p->comm, p->pid); else
scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
ddsp_task->comm, ddsp_task->pid,
p->comm, p->pid); return;
}
/* * We are in the enqueue path with @rq locked and pinned, and thus can't * double lock a remote rq and enqueue to its local DSQ. For * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer * the enqueue so that it's executed when @rq can be unlocked.
*/ if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { unsignedlong opss;
switch (opss & SCX_OPSS_STATE_MASK) { case SCX_OPSS_NONE: break; case SCX_OPSS_QUEUEING: /* * As @p was never passed to the BPF side, _release is * not strictly necessary. Still do it for consistency.
*/
atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); break; default:
WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()",
p->comm, p->pid, opss);
atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); break;
}
staticbool scx_rq_online(struct rq *rq)
{ /* * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates * the online state as seen from the BPF scheduler. cpu_active() test * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will * stay set until the current scheduling operation is complete even if * we aren't locking @rq.
*/ return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq)));
}
/* rq migration */ if (sticky_cpu == cpu_of(rq)) goto local_norefill;
/* * If !scx_rq_online(), we already told the BPF scheduler that the CPU * is offline and are just running the hotplug path. Don't bother the * BPF scheduler.
*/ if (!scx_rq_online(rq)) goto local;
if (scx_rq_bypassing(rq)) {
__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); goto global;
}
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct;
/* see %SCX_OPS_ENQ_EXITING */ if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) &&
unlikely(p->flags & PF_EXITING)) {
__scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1); goto local;
}
/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
is_migration_disabled(p)) {
__scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); goto local;
}
if (unlikely(!SCX_HAS_OP(sch, enqueue))) goto global;
/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;
*ddsp_taskp = NULL; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct;
/* * If not directly dispatched, QUEUEING isn't clear yet and dispatch or * dequeue may be waiting. The store_release matches their load_acquire.
*/
atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); return;
local: /* * For task-ordering, slice refill must be treated as implying the end * of the current slice. Otherwise, the longer @p stays on the CPU, the * higher priority it becomes from scx_prio_less()'s POV.
*/
touch_core_sched(rq, p);
refill_task_slice_dfl(p);
local_norefill:
dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); return;
global:
touch_core_sched(rq, p); /* see the comment in local: */
refill_task_slice_dfl(p);
dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags);
}
/* * list_add_tail() must be used. scx_bypass() depends on tasks being * appended to the runnable_list.
*/
list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
}
staticvoid enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
{ struct scx_sched *sch = scx_root; int sticky_cpu = p->scx.sticky_cpu;
if (enq_flags & ENQUEUE_WAKEUP)
rq->scx.flags |= SCX_RQ_IN_WAKEUP;
enq_flags |= rq->scx.extra_enq_flags;
if (sticky_cpu >= 0)
p->scx.sticky_cpu = -1;
/* * Restoring a running task will be immediately followed by * set_next_task_scx() which expects the task to not be on the BPF * scheduler as tasks can only start running through local DSQs. Force * direct-dispatch into the local DSQ by setting the sticky_cpu.
*/ if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
sticky_cpu = cpu_of(rq);
if (p->scx.flags & SCX_TASK_QUEUED) {
WARN_ON_ONCE(!task_runnable(p)); goto out;
}
/* acquire ensures that we see the preceding updates on QUEUED */
opss = atomic_long_read_acquire(&p->scx.ops_state);
switch (opss & SCX_OPSS_STATE_MASK) { case SCX_OPSS_NONE: break; case SCX_OPSS_QUEUEING: /* * QUEUEING is started and finished while holding @p's rq lock. * As we're holding the rq lock now, we shouldn't see QUEUEING.
*/
BUG(); case SCX_OPSS_QUEUED: if (SCX_HAS_OP(sch, dequeue))
SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
p, deq_flags);
if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
SCX_OPSS_NONE)) break;
fallthrough; case SCX_OPSS_DISPATCHING: /* * If @p is being dispatched from the BPF scheduler to a DSQ, * wait for the transfer to complete so that @p doesn't get * added to its DSQ after dequeueing is complete. * * As we're waiting on DISPATCHING with the rq locked, the * dispatching side shouldn't try to lock the rq while * DISPATCHING is set. See dispatch_to_local_dsq(). * * DISPATCHING shouldn't have qseq set and control can reach * here with NONE @opss from the above QUEUED case block. * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss.
*/
wait_ops_state(p, SCX_OPSS_DISPATCHING);
BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); break;
}
}
if (!(p->scx.flags & SCX_TASK_QUEUED)) {
WARN_ON_ONCE(task_runnable(p)); returntrue;
}
ops_dequeue(rq, p, deq_flags);
/* * A currently running task which is going off @rq first gets dequeued * and then stops running. As we want running <-> stopping transitions * to be contained within runnable <-> quiescent transitions, trigger * ->stopping() early here instead of in put_prev_task_scx(). * * @p may go through multiple stopping <-> running transitions between * here and put_prev_task_scx() if task attribute changes occur while * balance_scx() leaves @rq unlocked. However, they don't contain any * information meaningful to the BPF scheduler and can be suppressed by * skipping the callbacks if the task is !QUEUED.
*/ if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) {
update_curr_scx(rq);
SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false);
}
/**
 * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
 * @p: task to move
 * @enq_flags: %SCX_ENQ_*
 * @src_rq: rq to move the task from, locked on entry, released on return
 * @dst_rq: rq to move the task into, locked on return
 *
 * Move @p which is currently on @src_rq to @dst_rq's local DSQ.
 */
static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
					  struct rq *src_rq, struct rq *dst_rq)
{
	lockdep_assert_rq_held(src_rq);

	/* the following marks @p MIGRATING which excludes dequeue */
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, cpu_of(dst_rq));
	p->scx.sticky_cpu = cpu_of(dst_rq);

	/*
	 * Hand over the locks as promised by the contract above: @src_rq is
	 * released and @dst_rq is held from here through return. @dst_rq must
	 * be locked before its extra_enq_flags are touched and before @p is
	 * activated on it.
	 */
	raw_spin_rq_unlock(src_rq);
	raw_spin_rq_lock(dst_rq);

	/*
	 * We want to pass scx-specific enq_flags but activate_task() will
	 * truncate the upper 32 bit. As we own @rq, we can pass them through
	 * @rq->scx.extra_enq_flags instead.
	 */
	WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr));
	WARN_ON_ONCE(dst_rq->scx.extra_enq_flags);
	dst_rq->scx.extra_enq_flags = enq_flags;
	activate_task(dst_rq, p, 0);
	dst_rq->scx.extra_enq_flags = 0;
}
/*
 * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two
 * differences:
 *
 * - is_cpu_allowed() asks "Can this task run on this CPU?" while
 *   task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to
 *   this CPU?".
 *
 *   While migration is disabled, is_cpu_allowed() has to say "yes" as the task
 *   must be allowed to finish on the CPU that it's currently on regardless of
 *   the CPU state. However, task_can_run_on_remote_rq() must say "no" as the
 *   BPF scheduler shouldn't attempt to migrate a task which has migration
 *   disabled.
 *
 * - The BPF scheduler is bypassed while the rq is offline and we can always say
 *   no to the BPF scheduler initiated migrations while offline.
 *
 * The caller must ensure that @p and @rq are on different CPUs.
 *
 * @sch: scheduler to report errors and account events against
 * @p: task being considered for migration
 * @rq: candidate destination rq
 * @enforce: if %true, a refusal also raises scx_error() (migration disabled,
 *	     CPU not allowed) or bumps the offline-dispatch event counter
 *
 * Returns %true if @p may be moved to @rq, %false otherwise.
 */
static bool task_can_run_on_remote_rq(struct scx_sched *sch,
				      struct task_struct *p, struct rq *rq,
				      bool enforce)
{
	int cpu = cpu_of(rq);

	WARN_ON_ONCE(task_cpu(p) == cpu);

	/*
	 * If @p has migration disabled, @p->cpus_ptr is updated to contain only
	 * the pinned CPU in migrate_disable_switch() while @p is being switched
	 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
	 * updated and thus another CPU may see @p on a DSQ inbetween leading to
	 * @p passing the below task_allowed_on_cpu() check while migration is
	 * disabled.
	 *
	 * Test the migration disabled state first as the race window is narrow
	 * and the BPF scheduler failing to check migration disabled state can
	 * easily be masked if task_allowed_on_cpu() is done first.
	 */
	if (unlikely(is_migration_disabled(p))) {
		if (enforce)
			scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
				  p->comm, p->pid, task_cpu(p), cpu);
		return false;
	}

	/*
	 * We don't require the BPF scheduler to avoid dispatching to offline
	 * CPUs mostly for convenience but also because CPUs can go offline
	 * between scx_bpf_dsq_insert() calls and here. Trigger error iff the
	 * picked CPU is outside the allowed mask.
	 */
	if (!task_allowed_on_cpu(p, cpu)) {
		if (enforce)
			scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
				  cpu, p->comm, p->pid);
		return false;
	}

	if (!scx_rq_online(rq)) {
		/*
		 * Account against the scheduler instance passed in, like the
		 * scx_error() calls above, rather than the global scx_root.
		 */
		if (enforce)
			__scx_add_event(sch,
					SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
		return false;
	}

	return true;
}
/** * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq * @p: target task * @dsq: locked DSQ @p is currently on * @src_rq: rq @p is currently on, stable with @dsq locked * * Called with @dsq locked but no rq's locked. We want to move @p to a different * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is * required when transferring into a local DSQ. Even when transferring into a * non-local DSQ, it's better to use the same mechanism to protect against * dequeues and maintain the invariant that @p->scx.dsq can only change while * @src_rq is locked, which e.g. scx_dump_task() depends on. * * We want to grab @src_rq but that can deadlock if we try while locking @dsq, * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As * this may race with dequeue, which can't drop the rq lock or fail, do a little * dancing from our side. * * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu * would be cleared to -1. While other cpus may have updated it to different * values afterwards, as this operation can't be preempted or recurse, the * holding_cpu can never become this CPU again before we're done. Thus, we can * tell whether we lost to dequeue by testing whether the holding_cpu still * points to this CPU. See dispatch_dequeue() for the counterpart. * * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is * still valid. %false if lost to dequeue.
*/ staticbool unlink_dsq_and_lock_src_rq(struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *src_rq)
{
s32 cpu = raw_smp_processor_id();
/** * move_task_between_dsqs() - Move a task from one DSQ to another * @sch: scx_sched being operated on * @p: target task * @enq_flags: %SCX_ENQ_* * @src_dsq: DSQ @p is currently on, must not be a local DSQ * @dst_dsq: DSQ @p is being moved to, can be any DSQ * * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq * will change. As @p's task_rq is locked, this function doesn't need to use the * holding_cpu mechanism. * * On return, @src_dsq is unlocked and only @p's new task_rq, which is the * return value, is locked.
*/ staticstruct rq *move_task_between_dsqs(struct scx_sched *sch, struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *src_dsq, struct scx_dispatch_q *dst_dsq)
{ struct rq *src_rq = task_rq(p), *dst_rq;
if (dst_dsq->id == SCX_DSQ_LOCAL) {
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
dst_dsq = find_global_dsq(p);
dst_rq = src_rq;
}
} else { /* no need to migrate if destination is a non-local DSQ */
dst_rq = src_rq;
}
/* * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different * CPU, @p will be migrated.
*/ if (dst_dsq->id == SCX_DSQ_LOCAL) { /* @p is going from a non-local DSQ to a local DSQ */ if (src_rq == dst_rq) {
task_unlink_from_dsq(p, src_dsq);
move_local_task_to_local_dsq(p, enq_flags,
src_dsq, dst_rq);
raw_spin_unlock(&src_dsq->lock);
} else {
raw_spin_unlock(&src_dsq->lock);
move_remote_task_to_local_dsq(p, enq_flags,
src_rq, dst_rq);
}
} else { /* * @p is going from a non-local DSQ to a non-local DSQ. As * $src_dsq is already locked, do an abbreviated dequeue.
*/
task_unlink_from_dsq(p, src_dsq);
p->scx.dsq = NULL;
raw_spin_unlock(&src_dsq->lock);
dispatch_enqueue(sch, dst_dsq, p, enq_flags);
}
return dst_rq;
}
/*
 * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
 * banging on the same DSQ on a large NUMA system to the point where switching
 * to the bypass mode can take a long time. Inject artificial delays while the
 * bypass mode is switching to guarantee timely completion.
 *
 * Must be called with @rq locked. When scx_breather_depth is non-zero, @rq is
 * unlocked, the CPU spins until the depth drops to zero or roughly
 * %NSEC_PER_MSEC has elapsed, and @rq is locked again before returning.
 */
static void scx_breather(struct rq *rq)
{
	u64 until;

	lockdep_assert_rq_held(rq);

	/* fast path: nobody asked for a breather */
	if (likely(!atomic_read(&scx_breather_depth)))
		return;

	raw_spin_rq_unlock(rq);

	until = ktime_get_ns() + NSEC_PER_MSEC;

	do {
		/* poll the depth in short bursts between clock reads */
		int cnt = 1024;

		while (atomic_read(&scx_breather_depth) && --cnt)
			cpu_relax();
	} while (atomic_read(&scx_breather_depth) &&
		 time_before64(ktime_get_ns(), until));

	raw_spin_rq_lock(rq);
}
staticbool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, struct scx_dispatch_q *dsq)
{ struct task_struct *p;
retry: /* * This retry loop can repeatedly race against scx_bypass() dequeueing * tasks from @dsq trying to put the system into the bypass mode. On * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock * the machine into soft lockups. Give a breather.
*/
scx_breather(rq);
/* * The caller can't expect to successfully consume a task if the task's * addition to @dsq isn't guaranteed to be visible somehow. Test * @dsq->list without locking and skip if it seems empty.
*/ if (list_empty(&dsq->list)) returnfalse;
/**
 * dispatch_to_local_dsq - Dispatch a task to a local dsq
 * @sch: scx_sched being operated on
 * @rq: current rq which is locked
 * @dst_dsq: destination DSQ
 * @p: task to dispatch
 * @enq_flags: %SCX_ENQ_*
 *
 * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
 * DSQ. This function performs all the synchronization dancing needed because
 * local DSQs are protected with rq locks.
 *
 * If @dst_dsq belongs to a remote rq that task_can_run_on_remote_rq() rejects,
 * @p is queued on the DSQ returned by find_global_dsq() instead.
 *
 * The caller must have exclusive ownership of @p (e.g. through
 * %SCX_OPSS_DISPATCHING). @rq may be unlocked and relocked; it is locked again
 * on return.
 */
static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
				  struct scx_dispatch_q *dst_dsq,
				  struct task_struct *p, u64 enq_flags)
{
	struct rq *src_rq = task_rq(p);
	struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
	struct rq *locked_rq = rq;

	/*
	 * We're synchronized against dequeue through DISPATCHING. As @p can't
	 * be dequeued, its task_rq and cpus_allowed are stable too.
	 *
	 * If dispatching to @rq that @p is already on, no lock dancing needed.
	 */
	if (rq == src_rq && rq == dst_rq) {
		dispatch_enqueue(sch, dst_dsq, p,
				 enq_flags | SCX_ENQ_CLEAR_OPSS);
		return;
	}

	/*
	 * Migrating @p requires that it can actually run on @dst_rq. Verify
	 * while we still own @p through DISPATCHING and, like
	 * move_task_between_dsqs(), bounce to the global DSQ if it can't.
	 */
	if (src_rq != dst_rq &&
	    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
		dispatch_enqueue(sch, find_global_dsq(p), p,
				 enq_flags | SCX_ENQ_CLEAR_OPSS);
		return;
	}

	/*
	 * @p is on a possibly remote @src_rq which we need to lock to move the
	 * task. If dequeue is in progress, it'd be locking @src_rq and waiting
	 * on DISPATCHING, so we can't grab @src_rq lock while holding
	 * DISPATCHING.
	 *
	 * As DISPATCHING guarantees that @p is wholly ours, we can pretend that
	 * we're moving from a DSQ and use the same mechanism - mark the task
	 * under transfer with holding_cpu, release DISPATCHING and then follow
	 * the same protocol. See unlink_dsq_and_lock_src_rq().
	 */
	p->scx.holding_cpu = raw_smp_processor_id();

	/* store_release ensures that dequeue sees the above */
	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);

	/* switch to @src_rq lock */
	if (locked_rq != src_rq) {
		raw_spin_rq_unlock(locked_rq);
		locked_rq = src_rq;
		raw_spin_rq_lock(src_rq);
	}

	/* task_rq couldn't have changed if we're still the holding cpu */
	if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
	    !WARN_ON_ONCE(src_rq != task_rq(p))) {
		/*
		 * If @p is staying on the same rq, there's no need to go
		 * through the full deactivate/activate cycle. Optimize by
		 * abbreviating move_remote_task_to_local_dsq().
		 */
		if (src_rq == dst_rq) {
			p->scx.holding_cpu = -1;
			dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p,
					 enq_flags);
		} else {
			move_remote_task_to_local_dsq(p, enq_flags,
						      src_rq, dst_rq);
			/* task has been moved to dst_rq, which is now locked */
			locked_rq = dst_rq;
		}

		/* if the destination CPU is idle, wake it up */
		if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
			resched_curr(dst_rq);
	}

	/* switch back to @rq lock */
	if (locked_rq != rq) {
		raw_spin_rq_unlock(locked_rq);
		raw_spin_rq_lock(rq);
	}
}
/** * finish_dispatch - Asynchronously finish dispatching a task * @rq: current rq which is locked * @p: task to finish dispatching * @qseq_at_dispatch: qseq when @p started getting dispatched * @dsq_id: destination DSQ ID * @enq_flags: %SCX_ENQ_* * * Dispatching to local DSQs may need to wait for queueing to complete or * require rq lock dancing. As we don't wanna do either while inside * ops.dispatch() to avoid locking order inversion, we split dispatching into * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the * task and its qseq. Once ops.dispatch() returns, this function is called to * finish up. * * There is no guarantee that @p is still valid for dispatching or even that it * was valid in the first place. Make sure that the task is still owned by the * BPF scheduler and claim the ownership before dispatching.
*/ staticvoid finish_dispatch(struct scx_sched *sch, struct rq *rq, struct task_struct *p, unsignedlong qseq_at_dispatch,
u64 dsq_id, u64 enq_flags)
{ struct scx_dispatch_q *dsq; unsignedlong opss;
touch_core_sched_dispatch(rq, p);
retry: /* * No need for _acquire here. @p is accessed only after a successful * try_cmpxchg to DISPATCHING.
*/
opss = atomic_long_read(&p->scx.ops_state);
switch (opss & SCX_OPSS_STATE_MASK) { case SCX_OPSS_DISPATCHING: case SCX_OPSS_NONE: /* someone else already got to it */ return; case SCX_OPSS_QUEUED: /* * If qseq doesn't match, @p has gone through at least one * dispatch/dequeue and re-enqueue cycle between * scx_bpf_dsq_insert() and here and we have no claim on it.
*/ if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) return;
/* * While we know @p is accessible, we don't yet have a claim on * it - the BPF scheduler is allowed to dispatch tasks * spuriously and there can be a racing dequeue attempt. Let's * claim @p by atomically transitioning it from QUEUED to * DISPATCHING.
*/ if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
SCX_OPSS_DISPATCHING))) break; goto retry; case SCX_OPSS_QUEUEING: /* * do_enqueue_task() is in the process of transferring the task * to the BPF scheduler while holding @p's rq lock. As we aren't * holding any kernel or BPF resource that the enqueue path may * depend upon, it's safe to wait.
*/
wait_ops_state(p, opss); goto retry;
}
if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
unlikely(rq->scx.cpu_released)) { /* * If the previous sched_class for the current CPU was not SCX, * notify the BPF scheduler that it again has control of the * core. This callback complements ->cpu_release(), which is * emitted in switch_class().
*/ if (SCX_HAS_OP(sch, cpu_acquire))
SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq,
cpu_of(rq), NULL);
rq->scx.cpu_released = false;
}
if (prev_on_scx) {
update_curr_scx(rq);
/* * If @prev is runnable & has slice left, it has priority and * fetching more just increases latency for the fetched tasks. * Tell pick_task_scx() to keep running @prev. If the BPF * scheduler wants to handle this explicitly, it should * implement ->cpu_release(). * * See scx_disable_workfn() for the explanation on the bypassing * test.
*/ if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
rq->scx.flags |= SCX_RQ_BAL_KEEP; goto has_tasks;
}
}
/* if there already are tasks to run, nothing to do */ if (rq->scx.local_dsq.nr) goto has_tasks;
if (consume_global_dsq(sch, rq)) goto has_tasks;
if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
scx_rq_bypassing(rq) || !scx_rq_online(rq)) goto no_tasks;
dspc->rq = rq;
/* * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, * the local DSQ might still end up empty after a successful * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() * produced some tasks, retry. The BPF scheduler may depend on this * looping behavior to simplify its implementation.
*/ do {
dspc->nr_tasks = 0;
if (prev_on_rq && prev->scx.slice) {
rq->scx.flags |= SCX_RQ_BAL_KEEP; goto has_tasks;
} if (rq->scx.local_dsq.nr) goto has_tasks; if (consume_global_dsq(sch, rq)) goto has_tasks;
/* * ops.dispatch() can trap us in this loop by repeatedly * dispatching ineligible tasks. Break out once in a while to * allow the watchdog to run. As IRQ can't be enabled in * balance(), we want to complete this scheduling cycle and then * start a new one. IOW, we want to call resched_curr() on the * next, most likely idle, task, not the current one. Use * scx_bpf_kick_cpu() for deferred kicking.
*/ if (unlikely(!--nr_loops)) {
scx_bpf_kick_cpu(cpu_of(rq), 0); break;
}
} while (dspc->nr_tasks);
no_tasks: /* * Didn't find another task to run. Keep running @prev unless * %SCX_OPS_ENQ_LAST is in effect.
*/ if (prev_on_rq &&
(!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
__scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1); goto has_tasks;
}
rq->scx.flags &= ~SCX_RQ_IN_BALANCE; returnfalse;
#ifdef CONFIG_SCHED_SMT /* * When core-sched is enabled, this ops.balance() call will be followed * by pick_task_scx() on this CPU and the SMT siblings. Balance the * siblings too.
*/ if (sched_core_enabled(rq)) { conststruct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); int scpu;
/* * Now that @rq can be unlocked, execute the deferred enqueueing of * tasks directly dispatched to the local DSQs of other CPUs. See * direct_dispatch(). Keep popping from the head instead of using * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq * temporarily.
*/ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, struct task_struct, scx.dsq_list.node))) { struct scx_sched *sch = scx_root; struct scx_dispatch_q *dsq;
if (p->scx.flags & SCX_TASK_QUEUED) { /* * Core-sched might decide to execute @p before it is * dispatched. Call ops_dequeue() to notify the BPF scheduler.
*/
ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC);
dispatch_dequeue(rq, p);
}
p->se.exec_start = rq_clock_task(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */ if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p);
clr_task_runnable(p, true);
/* * @p is getting newly scheduled or got kicked after someone updated its * slice. Refresh whether tick can be stopped. See scx_can_stop_tick().
*/ if ((p->scx.slice == SCX_SLICE_INF) !=
(bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { if (p->scx.slice == SCX_SLICE_INF)
rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; else
rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;
sched_update_tick_dependency(rq);
/* * For now, let's refresh the load_avgs just when transitioning * in and out of nohz. In the future, we might want to add a * mechanism which calls the following periodically on * tick-stopped CPUs.
*/
update_other_load_avgs(rq);
}
}
staticenum scx_cpu_preempt_reason
preempt_reason_from_class(conststruct sched_class *class)
{ if (class == &stop_sched_class) return SCX_CPU_PREEMPT_STOP; if (class == &dl_sched_class) return SCX_CPU_PREEMPT_DL; if (class == &rt_sched_class) return SCX_CPU_PREEMPT_RT; return SCX_CPU_PREEMPT_UNKNOWN;
}
/* * Pairs with the smp_load_acquire() issued by a CPU in * kick_cpus_irq_workfn() who is waiting for this CPU to perform a * resched.
*/
smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) return;
/* * The callback is conceptually meant to convey that the CPU is no * longer under the control of SCX. Therefore, don't invoke the callback * if the next class is below SCX (in which case the BPF scheduler has * actively decided not to schedule any tasks on the CPU).
*/ if (sched_class_above(&ext_sched_class, next_class)) return;
/* * At this point we know that SCX was preempted by a higher priority * sched_class, so invoke the ->cpu_release() callback if we have not * done so already. We only send the callback once between SCX being * preempted, and it regaining control of the CPU. * * ->cpu_release() complements ->cpu_acquire(), which is emitted the * next time that balance_scx() is invoked.
*/ if (!rq->scx.cpu_released) { if (SCX_HAS_OP(sch, cpu_release)) { struct scx_cpu_release_args args = {
.reason = preempt_reason_from_class(next_class),
.task = next,
};
/* see dequeue_task_scx() on why we skip when !QUEUED */ if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true);
if (p->scx.flags & SCX_TASK_QUEUED) {
set_task_runnable(rq, p);
/* * If @p has slice left and is being put, @p is getting * preempted by a higher priority scheduler class or core-sched * forcing a different task. Leave it at the head of the local * DSQ.
*/ if (p->scx.slice && !scx_rq_bypassing(rq)) {
dispatch_enqueue(sch, &rq->scx.local_dsq, p,
SCX_ENQ_HEAD); goto switch_class;
}
/* * If @p is runnable but we're about to enter a lower * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell * ops.enqueue() that @p is the only one available for this cpu, * which should trigger an explicit follow-up scheduling event.
*/ if (sched_class_above(&ext_sched_class, next->sched_class)) {
WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST));
do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
} else {
do_enqueue_task(rq, p, 0, -1);
}
}
switch_class: if (next && next->sched_class != &ext_sched_class)
switch_class(rq, next);
}
/* * WORKAROUND: * * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just * have gone through balance_scx(). Unfortunately, there currently is a * bug where fair could say yes on balance() but no on pick_task(), * which then ends up calling pick_task_scx() without preceding * balance_scx(). * * Keep running @prev if possible and avoid stalling from entering idle * without balancing. * * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() * if pick_task_scx() is called without preceding balance_scx().
*/ if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { if (prev->scx.flags & SCX_TASK_QUEUED) {
keep_prev = true;
} else {
keep_prev = false;
kick_idle = true;
}
} elseif (unlikely(keep_prev &&
prev->sched_class != &ext_sched_class)) { /* * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is * conditional on scx_enabled() and may have been skipped.
*/
WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
keep_prev = false;
}
/* * If balance_scx() is telling us to keep running @prev, replenish slice * if necessary and keep running @prev. Otherwise, pop the first one * from the local DSQ.
*/ if (keep_prev) {
p = prev; if (!p->scx.slice)
refill_task_slice_dfl(p);
} else {
p = first_local_task(rq); if (!p) { if (kick_idle)
scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); return NULL;
}
if (unlikely(!p->scx.slice)) { struct scx_sched *sch = scx_root;
if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
p->comm, p->pid, __func__);
sch->warned_zero_slice = true;
}
refill_task_slice_dfl(p);
}
}
return p;
}
#ifdef CONFIG_SCHED_CORE
/**
 * scx_prio_less - Task ordering for core-sched
 * @a: task A
 * @b: task B
 * @in_fi: in forced idle state (not consulted by this function)
 *
 * Core-sched is implemented as an additional scheduling layer on top of the
 * usual sched_class'es and needs to find out the expected task ordering. For
 * SCX, core-sched calls this function to interrogate the task ordering.
 *
 * Unless overridden by ops.core_sched_before(), each task's scx.core_sched_at
 * is used to implement the default task ordering. The older the timestamp, the
 * higher priority the task - the global FIFO ordering matching the default
 * scheduling behavior.
 *
 * When ops.core_sched_before() is enabled, scx.core_sched_at is used to
 * implement FIFO ordering within each local DSQ. See pick_task_scx().
 *
 * Return: the result of ops.core_sched_before(@a, @b) if the op is implemented
 * and @a's rq is not bypassing; otherwise %true iff @a's core_sched_at is
 * later than @b's.
 */
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
		   bool in_fi)
{
	struct scx_sched *sch = scx_root;

	/*
	 * The const qualifiers are dropped from task_struct pointers when
	 * calling ops.core_sched_before(). Accesses are controlled by the
	 * verifier.
	 */
	if (SCX_HAS_OP(sch, core_sched_before) &&
	    !scx_rq_bypassing(task_rq(a)))
		return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before,
					      NULL,
					      (struct task_struct *)a,
					      (struct task_struct *)b);
	else
		return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
}
#endif	/* CONFIG_SCHED_CORE */
staticint select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{ struct scx_sched *sch = scx_root; bool rq_bypass;
/* * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it * can be a good migration opportunity with low cache and memory * footprint. Returning a CPU different than @prev_cpu triggers * immediate rq migration. However, for SCX, as the current rq * association doesn't dictate where the task is going to run, this * doesn't fit well. If necessary, we can later add a dedicated method * which can decide to preempt self to force it through the regular * scheduling path.
*/ if (unlikely(wake_flags & WF_EXEC)) return prev_cpu;
/* * The effective cpumask is stored in @p->cpus_ptr which may temporarily * differ from the configured one in @p->cpus_mask. Always tell the bpf * scheduler the effective one. * * Fine-grained memory write control is enforced by BPF making the const * designation pointless. Cast it away when calling the operation.
*/ if (SCX_HAS_OP(sch, set_cpumask))
SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL,
p, (struct cpumask *)p->cpus_ptr);
}
staticvoid handle_hotplug(struct rq *rq, bool online)
{ struct scx_sched *sch = scx_root; int cpu = cpu_of(rq);
atomic_long_inc(&scx_hotplug_seq);
/* * scx_root updates are protected by cpus_read_lock() and will stay * stable here. Note that we can't depend on scx_enabled() test as the * hotplug ops need to be enabled before __scx_enabled is set.
*/ if (unlikely(!sch)) return;
if (scx_enabled())
scx_idle_update_selcpu_topology(&sch->ops);
/* * While disabling, always resched and refresh core-sched timestamp as * we can't trust the slice management or ops.core_sched_before().
*/ if (scx_rq_bypassing(rq)) {
curr->scx.slice = 0;
touch_core_sched(rq, curr);
} elseif (SCX_HAS_OP(sch, tick)) {
SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr);
}
if (!curr->scx.slice)
resched_curr(rq);
}
#ifdef CONFIG_EXT_GROUP_SCHED
/*
 * Return the cgroup associated with @tg, falling back to the default
 * hierarchy's root cgroup when @tg has none.
 */
static struct cgroup *tg_cgrp(struct task_group *tg)
{
	/*
	 * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup,
	 * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the
	 * root cgroup.
	 */
	if (tg && tg->css.cgroup)
		return tg->css.cgroup;
	else
		return &cgrp_dfl_root.cgrp;
}
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL,
p, &args); if (unlikely(ret)) {
ret = ops_sanitize_err(sch, "init_task", ret); return ret;
}
}
scx_set_task_state(p, SCX_TASK_INIT);
if (p->scx.disallow) { if (!fork) { struct rq *rq; struct rq_flags rf;
rq = task_rq_lock(p, &rf);
/* * We're in the load path and @p->policy will be applied * right after. Reverting @p->policy here and rejecting * %SCHED_EXT transitions from scx_check_setscheduler() * guarantees that if ops.init_task() sets @p->disallow, * @p can never be in SCX.
*/ if (p->policy == SCHED_EXT) {
p->policy = SCHED_NORMAL;
atomic_long_inc(&scx_nr_rejected);
}
task_rq_unlock(rq, p, &rf);
} elseif (p->policy == SCHED_EXT) {
scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
p->comm, p->pid);
}
}
/* * Set the weight before calling ops.enable() so that the scheduler * doesn't see a stale value if they inspect the task struct.
*/ if (task_has_idle_policy(p))
weight = WEIGHT_IDLEPRIO; else
weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO];
p->scx.weight = sched_weight_to_cgroup(weight);
if (SCX_HAS_OP(sch, enable))
SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p);
scx_set_task_state(p, SCX_TASK_ENABLED);
if (SCX_HAS_OP(sch, set_weight))
SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
p, p->scx.weight);
}
/*
 * Take scx_fork_rwsem for reading on behalf of a fork in progress.
 *
 * The BPF scheduler enable/disable paths need to walk and update every task,
 * which gets complicated if forks can race with them. Those paths are very
 * cold, so forks are simply excluded with a percpu_rwsem. @p is not used.
 */
void scx_pre_fork(struct task_struct *p)
{
	percpu_down_read(&scx_fork_rwsem);
}
/*
 * Initialize the forking task @p for sched_ext. Expects scx_fork_rwsem to be
 * held. Returns 0 when task initialization is not enabled, otherwise whatever
 * scx_init_task() returns.
 */
int scx_fork(struct task_struct *p)
{
	percpu_rwsem_assert_held(&scx_fork_rwsem);

	/* nothing to set up unless task init has been switched on */
	if (!scx_init_task_enabled)
		return 0;

	return scx_init_task(p, task_group(p), true);
}
void scx_post_fork(struct task_struct *p)
{ if (scx_init_task_enabled) {
scx_set_task_state(p, SCX_TASK_READY);
/* * Enable the task immediately if it's running on sched_ext. * Otherwise, it'll be enabled in switching_to_scx() if and * when it's ever configured to run with a SCHED_EXT policy.
*/ if (p->sched_class == &ext_sched_class) { struct rq_flags rf; struct rq *rq;
/* * set_cpus_allowed_scx() is not called while @p is associated with a * different scheduler class. Keep the BPF scheduler up-to-date.
*/ if (SCX_HAS_OP(sch, set_cpumask))
SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq,
p, (struct cpumask *)p->cpus_ptr);
}
if (p->sched_class != &ext_sched_class) returntrue;
/* * @rq can dispatch from different DSQs, so we can't tell whether it * needs the tick or not by looking at nr_running. Allow stopping ticks * iff the BPF scheduler indicated so. See set_next_task_scx().
*/ return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
} #endif
/* * sched_move_task() omits identity migrations. Let's match the * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() * always match one-to-one.
*/ if (from == to) continue;
if (SCX_HAS_OP(sch, cgroup_prep_move)) {
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED,
cgroup_prep_move, NULL,
p, from, css->cgroup); if (ret) goto err;
}
/* * @p must have ops.cgroup_prep_move() called on it and thus * cgrp_moving_from set.
*/ if (SCX_HAS_OP(sch, cgroup_move) &&
!WARN_ON_ONCE(!p->scx.cgrp_moving_from))
SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL,
p, p->scx.cgrp_moving_from,
tg_cgrp(task_group(p)));
p->scx.cgrp_moving_from = NULL;
}
/* * Omitted operations: * * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task * isn't tied to the CPU at that point. Preemption is implemented by resetting * the victim task's slice to 0 and triggering reschedule on the target CPU. * * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. * * - task_fork/dead: We need fork/dead notifications for all tasks regardless of * their current sched_class. Call them directly from sched core instead.
*/
DEFINE_SCHED_CLASS(ext) = {
.enqueue_task = enqueue_task_scx,
.dequeue_task = dequeue_task_scx,
.yield_task = yield_task_scx,
.yield_to_task = yield_to_task_scx,
dsq = find_user_dsq(sch, dsq_id); if (!dsq) goto out_unlock_rcu;
raw_spin_lock_irqsave(&dsq->lock, flags);
if (dsq->nr) {
scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)",
dsq->id, dsq->nr); goto out_unlock_dsq;
}
if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node,
dsq_hash_params)) goto out_unlock_dsq;
/* * Mark dead by invalidating ->id to prevent dispatch_enqueue() from * queueing more tasks. As this function can be called from anywhere, * freeing is bounced through an irq work to avoid nesting RCU * operations inside scheduler locks.
*/
dsq->id = SCX_DSQ_INVALID;
llist_add(&dsq->free_node, &dsqs_to_free);
irq_work_queue(&free_dsq_irq_work);
/* * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and exit all the inited ones, all online cgroups are exited.
*/
rcu_read_lock();
css_for_each_descendant_post(css, &root_task_group.css) { struct task_group *tg = css_tg(css);
if (!(tg->scx.flags & SCX_TG_INITED)) continue;
tg->scx.flags &= ~SCX_TG_INITED;
if (!sch->ops.cgroup_exit) continue;
if (WARN_ON_ONCE(!css_tryget(css))) continue;
rcu_read_unlock();
/*
 * Used by sched_fork() and __setscheduler_prio() to pick the matching
 * sched_class. dl/rt are already handled.
 *
 * Returns %false if sched_ext is not enabled or is being disabled. Otherwise
 * returns %true when scx_switching_all is set or @policy is %SCHED_EXT.
 */
bool task_should_scx(int policy)
{
	if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING))
		return false;
	if (READ_ONCE(scx_switching_all))
		return true;
	return policy == SCHED_EXT;
}
/**
 * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
 *
 * While there are various reasons why RCU CPU stalls can occur on a system
 * that may not be caused by the current BPF scheduler, try kicking out the
 * current scheduler in an attempt to recover the system to a good state before
 * issuing panics.
 *
 * Return: %true if a scheduler was loaded in the %SCX_ENABLING or %SCX_ENABLED
 * state and scx_error() was raised against it, %false otherwise.
 */
bool scx_rcu_cpu_stall(void)
{
	struct scx_sched *sch;
	bool handled = false;

	rcu_read_lock();

	sch = rcu_dereference(scx_root);
	if (unlikely(!sch))
		goto out_unlock;

	switch (scx_enable_state()) {
	case SCX_ENABLING:
	case SCX_ENABLED:
		break;
	default:
		goto out_unlock;
	}

	/* @sch is only guaranteed to stay around inside the RCU read section */
	scx_error(sch, "RCU CPU stall detected!");
	handled = true;

out_unlock:
	rcu_read_unlock();
	return handled;
}
/** * scx_softlockup - sched_ext softlockup handler * @dur_s: number of seconds of CPU stuck due to soft lockup * * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can * live-lock the system by making many CPUs target the same DSQ to the point * where soft-lockup detection triggers. This function is called from * soft-lockup watchdog when the triggering point is close and tries to unjam * the system by enabling the breather and aborting the BPF scheduler.
*/ void scx_softlockup(u32 dur_s)
{ struct scx_sched *sch;
rcu_read_lock();
sch = rcu_dereference(scx_root); if (unlikely(!sch)) goto out_unlock;
switch (scx_enable_state()) { case SCX_ENABLING: case SCX_ENABLED: break; default: goto out_unlock;
}
/* allow only one instance, cleared at the end of scx_bypass() */ if (test_and_set_bit(0, &scx_in_softlockup)) goto out_unlock;
/* * Some CPUs may be trapped in the dispatch paths. Enable breather * immediately; otherwise, we might not even be able to get to scx_bypass().
*/
atomic_inc(&scx_breather_depth);
/*
 * Undo the softlockup handling state: if bit 0 of scx_in_softlockup was set,
 * clear it and drop the scx_breather_depth reference that scx_softlockup()
 * took. Does nothing when the bit was not set.
 */
static void scx_clear_softlockup(void)
{
	if (test_and_clear_bit(0, &scx_in_softlockup))
		atomic_dec(&scx_breather_depth);
}
/** * scx_bypass - [Un]bypass scx_ops and guarantee forward progress * @bypass: true for bypass, false for unbypass * * Bypassing guarantees that all runnable tasks make forward progress without * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might * be held by tasks that the BPF scheduler is forgetting to run, which * unfortunately also excludes toggling the static branches. * * Let's work around by overriding a couple ops and modifying behaviors based on * the DISABLING state and then cycling the queued tasks through dequeue/enqueue * to force global FIFO scheduling. * * - ops.select_cpu() is ignored and the default select_cpu() is used. * * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. * %SCX_OPS_ENQ_LAST is also ignored. * * - ops.dispatch() is ignored. * * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice * can't be trusted. Whenever a tick triggers, the running task is rotated to * the tail of the queue with core_sched_at touched. * * - pick_next_task() suppresses zero slice warning. * * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM * operations. * * - scx_prio_less() reverts to the default core_sched_at order.
*/ staticvoid scx_bypass(bool bypass)
{ static DEFINE_RAW_SPINLOCK(bypass_lock); staticunsignedlong bypass_timestamp; struct scx_sched *sch; unsignedlong flags; int cpu;
if (bypass) {
scx_bypass_depth++;
WARN_ON_ONCE(scx_bypass_depth <= 0); if (scx_bypass_depth != 1) goto unlock;
bypass_timestamp = ktime_get_ns(); if (sch)
scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
} else {
scx_bypass_depth--;
WARN_ON_ONCE(scx_bypass_depth < 0); if (scx_bypass_depth != 0) goto unlock; if (sch)
scx_add_event(sch, SCX_EV_BYPASS_DURATION,
ktime_get_ns() - bypass_timestamp);
}
atomic_inc(&scx_breather_depth);
/* * No task property is changing. We just need to make sure all currently * queued tasks are re-queued according to the new scx_rq_bypassing() * state. As an optimization, walk each rq's runnable_list instead of * the scx_tasks list. * * This function can't trust the scheduler and thus can't use * cpus_read_lock(). Walk all possible CPUs instead of online.
*/
for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); struct task_struct *p, *n;
/* * We need to guarantee that no tasks are on the BPF scheduler * while bypassing. Either we see enabled or the enable path * sees scx_rq_bypassing() before moving tasks to SCX.
*/ if (!scx_enabled()) {
raw_spin_rq_unlock(rq); continue;
}
/* * The use of list_for_each_entry_safe_reverse() is required * because each task is going to be removed from and added back * to the runnable_list during iteration. Because they're added * to the tail of the list, safe reverse iteration can still * visit all nodes.
*/
list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
scx.runnable_node) { struct sched_enq_and_set_ctx ctx;
/* cycling deq/enq is enough, see the function comment */
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
sched_enq_and_set_task(&ctx);
}
/* resched to restore ticks and idle state */ if (cpu_online(cpu) || cpu == smp_processor_id())
resched_curr(rq);
staticconstchar *scx_exit_reason(enum scx_exit_kind kind)
{ switch (kind) { case SCX_EXIT_UNREG: return"unregistered from user space"; case SCX_EXIT_UNREG_BPF: return"unregistered from BPF"; case SCX_EXIT_UNREG_KERN: return"unregistered from the main kernel"; case SCX_EXIT_SYSRQ: return"disabled by sysrq-S"; case SCX_EXIT_ERROR: return"runtime error"; case SCX_EXIT_ERROR_BPF: return"scx_bpf_error"; case SCX_EXIT_ERROR_STALL: return"runnable task stall"; default: return"<UNKNOWN>";
}
}
kind = atomic_read(&sch->exit_kind); while (true) { if (kind == SCX_EXIT_DONE) /* already disabled? */ return;
WARN_ON_ONCE(kind == SCX_EXIT_NONE); if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) break;
}
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
/* guarantee forward progress by bypassing scx_ops */
scx_bypass(true);
switch (scx_set_enable_state(SCX_DISABLING)) { case SCX_DISABLING:
WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); break; case SCX_DISABLED:
pr_warn("sched_ext: ops error detected without ops (%s)\n",
sch->exit_info->msg);
WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); goto done; default: break;
}
/* * Here, every runnable task is guaranteed to make forward progress and * we can safely use blocking synchronization constructs. Actually * disable ops.
*/
mutex_lock(&scx_enable_mutex);
/* * Shut down cgroup support before tasks so that the cgroup attach path * doesn't race against scx_exit_task().
*/
scx_cgroup_lock();
scx_cgroup_exit(sch);
scx_cgroup_unlock();
/* * The BPF scheduler is going away. All tasks including %TASK_DEAD ones * must be switched out and exited synchronously.
*/
percpu_down_write(&scx_fork_rwsem);
/* * Invalidate all the rq clocks to prevent getting outdated * rq clocks from a previous scx scheduler.
*/
for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu);
scx_rq_clock_invalidate(rq);
}
/* no task is on scx, turn off all the switches and flush in-progress calls */
static_branch_disable(&__scx_enabled);
bitmap_zero(sch->has_op, SCX_OPI_END);
scx_idle_disable();
synchronize_rcu();
if (sch->ops.exit)
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
cancel_delayed_work_sync(&scx_watchdog_work);
/* * scx_root clearing must be inside cpus_read_lock(). See * handle_hotplug().
*/
cpus_read_lock();
RCU_INIT_POINTER(scx_root, NULL);
cpus_read_unlock();
/* * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs * could observe an object of the same name still in the hierarchy when * the next scheduler is loaded.
*/
kobject_del(&sch->kobj);
trace_sched_ext_dump(line_buf);
} #endif /* @s may be zero sized and seq_buf triggers WARN if so */ if (s->size) {
va_start(args, fmt);
seq_buf_vprintf(s, fmt, args);
va_end(args);
/* * There's something to flush and this is the first line. Insert a blank * line to distinguish ops dump.
*/ if (dd->first) {
dump_newline(dd->s);
dd->first = false;
}
/* * There may be multiple lines in $line. Scan and emit each line * separately.
*/ while (true) { char *end = line; char c;
while (*end != '\n' && *end != '\0')
end++;
/* * If $line overflowed, it may not have newline at the end. * Always emit with a newline.
*/
c = *end;
*end = '\0';
dump_line(dd->s, "%s%s", dd->prefix, line); if (c == '\0') break;
/* move to the next line */
end++; if (*end == '\0') break;
line = end;
}
if (idle && !SCX_HAS_OP(sch, dump_cpu)) goto next;
/* * We don't yet know whether ops.dump_cpu() will produce output * and we may want to skip the default CPU dump if it doesn't. * Use a nested seq_buf to generate the standard dump so that we * can decide whether to commit later.
*/
avail = seq_buf_get_buf(&s, &buf);
seq_buf_init(&ns, buf, avail);
used = seq_buf_used(&ns); if (SCX_HAS_OP(sch, dump_cpu)) {
ops_dump_init(&ns, " ");
SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL,
&dctx, cpu, idle);
ops_dump_exit();
}
/* * If idle && nothing generated by ops.dump_cpu(), there's * nothing interesting. Skip.
*/ if (idle && used == seq_buf_used(&ns)) goto next;
/* * $s may already have overflowed when $ns was created. If so, * calling commit on it will trigger BUG.
*/ if (avail) {
seq_buf_commit(&s, seq_buf_used(&ns)); if (seq_buf_has_overflowed(&ns))
seq_buf_set_overflow(&s);
}
if (rq->curr->sched_class == &ext_sched_class)
scx_dump_task(&s, &dctx, rq->curr, '*');
/* * Set ei->kind and ->reason for scx_dump_state(). They'll be set again * in scx_disable_workfn().
*/
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
/* * If a hotplug event has occurred between when a scheduler was * initialized, and when we were able to attach, exit and notify user * space about it.
*/ if (ops->hotplug_seq) {
global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); if (ops->hotplug_seq != global_hotplug_seq) {
scx_exit(sch, SCX_EXIT_UNREG_KERN,
SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, "expected hotplug seq %llu did not match actual %llu",
ops->hotplug_seq, global_hotplug_seq);
}
}
}
staticint validate_ops(struct scx_sched *sch, conststruct sched_ext_ops *ops)
{ /* * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the * ops.enqueue() callback isn't implemented.
*/ if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); return -EINVAL;
}
/* * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle * selection policy to be enabled.
*/ if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
(ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); return -EINVAL;
}
if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
cpu_possible_mask)) {
pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); return -EINVAL;
}
mutex_lock(&scx_enable_mutex);
if (scx_enable_state() != SCX_DISABLED) {
ret = -EBUSY; goto err_unlock;
}
sch = scx_alloc_and_add_sched(ops); if (IS_ERR(sch)) {
ret = PTR_ERR(sch); goto err_unlock;
}
/* * Transition to ENABLING and clear exit info to arm the disable path. * Failure triggers full disabling from here on.
*/
WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
WARN_ON_ONCE(scx_root);
/* * Keep CPUs stable during enable so that the BPF scheduler can track * online CPUs by watching ->on/offline_cpu() after ->init().
*/
cpus_read_lock();
/* * Make the scheduler instance visible. Must be inside cpus_read_lock(). * See handle_hotplug().
*/
rcu_assign_pointer(scx_root, sch);
scx_idle_enable(ops);
if (sch->ops.init) {
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL); if (ret) {
ret = ops_sanitize_err(sch, "init", ret);
cpus_read_unlock();
scx_error(sch, "ops.init() failed (%d)", ret); goto err_disable;
}
}
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) if (((void (**)(void))ops)[i])
set_bit(i, sch->has_op);
/* * Once __scx_enabled is set, %current can be switched to SCX anytime. * This can lead to stalls as some BPF schedulers (e.g. userspace * scheduling) may not function correctly before all tasks are switched. * Init in bypass mode to guarantee forward progress.
*/
scx_bypass(true);
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i])
set_bit(i, sch->has_op);
if (sch->ops.cpu_acquire || sch->ops.cpu_release)
sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
/* * Lock out forks, cgroup on/offlining and moves before opening the * floodgate so that they don't wander into the operations prematurely.
*/
percpu_down_write(&scx_fork_rwsem);
/* * Enable ops for every task. Fork is excluded by scx_fork_rwsem * preventing new tasks from being added. No need to exclude tasks * leaving as sched_ext_free() can handle both prepped and enabled * tasks. Prep all tasks first and then enable them with preemption * disabled. * * All cgroups should be initialized before scx_init_task() so that the * BPF scheduler can reliably track each task's cgroup membership from * scx_init_task(). Lock out cgroup on/offlining and task migrations * while tasks are being initialized so that scx_cgroup_can_attach() * never sees uninitialized tasks.
*/
scx_cgroup_lock();
ret = scx_cgroup_init(sch); if (ret) goto err_disable_unlock_all;
scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { /* * @p may already be dead, have lost all its usages counts and * be waiting for RCU grace period before being freed. @p can't * be initialized for SCX in such cases and should be ignored.
*/ if (!tryget_task_struct(p)) continue;
scx_task_iter_unlock(&sti);
ret = scx_init_task(p, task_group(p), false); if (ret) {
put_task_struct(p);
scx_task_iter_relock(&sti);
scx_task_iter_stop(&sti);
scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
ret, p->comm, p->pid); goto err_disable_unlock_all;
}
/* * All tasks are READY. It's safe to turn on scx_enabled() and switch * all eligible tasks.
*/
WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
static_branch_enable(&__scx_enabled);
/* * We're fully committed and can't fail. The task READY -> ENABLED * transitions here are synchronized against sched_ext_free() through * scx_tasks_lock.
*/
percpu_down_write(&scx_fork_rwsem);
scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { conststruct sched_class *old_class = p->sched_class; conststruct sched_class *new_class =
__setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx;
err_disable_unlock_all:
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem); /* we'll soon enter disable path, keep bypass on */
err_disable:
mutex_unlock(&scx_enable_mutex); /* * Returning an error code here would not pass all the error information * to userspace. Record errno using scx_error() for cases scx_error() * wasn't already invoked and exit indicating success so that the error * is notified through ops.exit() with all the details. * * Flush scx_disable_work to ensure that error is reported before init * completion. sch's base reference will be put by bpf_scx_unreg().
*/
scx_error(sch, "scx_enable() failed (%d)", ret);
kthread_flush_work(&sch->disable_work); return 0;
}
switch (moff) { case offsetof(struct sched_ext_ops, init_task): #ifdef CONFIG_EXT_GROUP_SCHED case offsetof(struct sched_ext_ops, cgroup_init): case offsetof(struct sched_ext_ops, cgroup_exit): case offsetof(struct sched_ext_ops, cgroup_prep_move): #endif case offsetof(struct sched_ext_ops, cpu_online): case offsetof(struct sched_ext_ops, cpu_offline): case offsetof(struct sched_ext_ops, init): case offsetof(struct sched_ext_ops, exit): break; default: if (prog->sleepable) return -EINVAL;
}
/**
 * bpf_scx_update - Reject updates of the loaded BPF scheduler
 * @kdata: ignored
 * @old_kdata: ignored
 * @link: ignored
 *
 * Returns -%EOPNOTSUPP unconditionally.
 */
static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link)
{
	/*
	 * sched_ext does not support updating the actively-loaded BPF
	 * scheduler, as registering a BPF scheduler can always fail if the
	 * scheduler returns an error code for e.g. ops.init(), ops.init_task(),
	 * etc. Similarly, we can always race with unregistration happening
	 * elsewhere, such as with sysrq.
	 */
	return -EOPNOTSUPP;
}
/* * We can skip idle kicking if @rq is going to go through at least one * full SCX scheduling cycle before going idle. Just checking whether * curr is not idle is insufficient because we could be racing * balance_one() trying to pull the next task from a remote rq, which * may fail, and @rq may become idle afterwards. * * The race window is small and we don't and can't guarantee that @rq is * only kicked while idle anyway. Skip only when sure.
*/ return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE);
}
/* * During CPU hotplug, a CPU may depend on kicking itself to make * forward progress. Allow kicking self regardless of online state.
*/ if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { if (rq->curr->sched_class == &ext_sched_class)
rq->curr->scx.slice = 0;
cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
}
if (cpu != cpu_of(this_rq)) { /* * Pairs with smp_store_release() issued by this CPU in * switch_class() on the resched path. * * We busy-wait here to guarantee that no other task can * be scheduled on our core before the target CPU has * entered the resched path.
*/ while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu])
cpu_relax();
}
/** * print_scx_info - print out sched_ext scheduler state * @log_lvl: the log level to use when printing * @p: target task * * If a sched_ext scheduler is enabled, print the name and state of the * scheduler. If @p is on sched_ext, print further information about the task. * * This function can be safely called on any task as long as the task_struct * itself is accessible. While safe, this function isn't synchronized and may * print out mixups or garbages of limited length.
*/ void print_scx_info(constchar *log_lvl, struct task_struct *p)
{ struct scx_sched *sch = scx_root; enum scx_enable_state state = scx_enable_state(); constchar *all = READ_ONCE(scx_switching_all) ? "+all" : ""; char runnable_at_buf[22] = "?"; struct sched_class *class; unsignedlong runnable_at;
if (state == SCX_DISABLED) return;
/* * Carefully check if the task was running on sched_ext, and then * carefully copy the time it's been runnable, and its state.
*/ if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || class != &ext_sched_class) {
printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name,
scx_enable_state_str[state], all); return;
}
if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, sizeof(runnable_at)))
scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms",
jiffies_delta_msecs(runnable_at, jiffies));
/* print everything onto one line to conserve console space */
printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
log_lvl, sch->ops.name, scx_enable_state_str[state], all,
runnable_at_buf);
}
staticint scx_pm_handler(struct notifier_block *nb, unsignedlong event, void *ptr)
{ /* * SCX schedulers often have userspace components which are sometimes * involved in critical scheduling paths. PM operations involve freezing * userspace which can lead to scheduling misbehaviors including stalls. * Let's bypass while PM operations are in progress.
*/ switch (event) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: case PM_RESTORE_PREPARE:
scx_bypass(true); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: case PM_POST_RESTORE:
scx_bypass(false); break;
}
/* * The following is to prevent the compiler from optimizing out the enum * definitions so that BPF scheduler implementations can use them * through the generated vmlinux.h.
*/
WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
SCX_TG_ONLINE);
/******************************************************************************** * Helpers that can be called from the BPF scheduler.
*/ staticbool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
{ if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) returnfalse;
lockdep_assert_irqs_disabled();
if (unlikely(!p)) {
scx_kf_error("called with NULL task"); returnfalse;
}
/** * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ * @p: task_struct to insert * @dsq_id: DSQ to insert into * @slice: duration @p can run for in nsecs, 0 to keep the current value * @enq_flags: SCX_ENQ_* * * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to * call this function spuriously. Can be called from ops.enqueue(), * ops.select_cpu(), and ops.dispatch(). * * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch * and @p must match the task being enqueued. * * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p * will be directly inserted into the corresponding dispatch queue after * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be * inserted into the local DSQ of the CPU returned by ops.select_cpu(). * @enq_flags are OR'd with the enqueue flags on the enqueue path before the * task is inserted. * * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id * and this function can be called up to ops.dispatch_max_batch times to insert * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the * counter. * * This function doesn't have any locking restrictions and may be called under * BPF locks (in the future when BPF introduces more flexible locking). * * @p is allowed to run for @slice. The scheduling path is triggered on slice * exhaustion. If zero, the current residual slice is maintained. If * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with * scx_bpf_kick_cpu() to trigger scheduling.
*/
__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
u64 enq_flags)
{ if (!scx_dsq_insert_preamble(p, enq_flags)) return;
/** * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ * @p: task_struct to insert * @dsq_id: DSQ to insert into * @slice: duration @p can run for in nsecs, 0 to keep the current value * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ * @enq_flags: SCX_ENQ_* * * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id. * Tasks queued into the priority queue are ordered by @vtime. All other aspects * are identical to scx_bpf_dsq_insert(). * * @vtime ordering is according to time_before64() which considers wrapping. A * numerically larger vtime may indicate an earlier position in the ordering and * vice-versa. * * A DSQ can only be used as a FIFO or priority queue at any given time and this * function must not be called on a DSQ which already has one or more FIFO tasks * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and * SCX_DSQ_GLOBAL) cannot be used as priority queues.
*/
__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
u64 slice, u64 vtime, u64 enq_flags)
{ if (!scx_dsq_insert_preamble(p, enq_flags)) return;
if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) returnfalse;
/* * Can be called from either ops.dispatch() locking this_rq() or any * context where no rq lock is held. If latter, lock @p's task_rq which * we'll likely need anyway.
*/
src_rq = task_rq(p);
if (in_balance) { if (this_rq != src_rq) {
raw_spin_rq_unlock(this_rq);
raw_spin_rq_lock(src_rq);
}
} else {
raw_spin_rq_lock(src_rq);
}
/* * If the BPF scheduler keeps calling this function repeatedly, it can * cause similar live-lock conditions as consume_dispatch_q(). Insert a * breather if necessary.
*/
scx_breather(src_rq);
/* * Did someone else get to it? @p could have already left $src_dsq, got * re-enqueued, or be in the process of being consumed by someone else.
*/ if (unlikely(p->scx.dsq != src_dsq ||
u32_before(kit->cursor.priv, p->scx.dsq_seq) ||
p->scx.holding_cpu >= 0) ||
WARN_ON_ONCE(src_rq != task_rq(p))) {
raw_spin_unlock(&src_dsq->lock); goto out;
}
/* @p is still on $src_dsq and stable, determine the destination */
dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p);
/* * Apply vtime and slice updates before moving so that the new time is * visible before inserting into $dst_dsq. @p is still on $src_dsq but * this is safe as we're locking it.
*/ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
p->scx.dsq_vtime = kit->vtime; if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
p->scx.slice = kit->slice;
/** * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots * * Can only be called from ops.dispatch().
*/
__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
{ if (!scx_kf_allowed(SCX_KF_DISPATCH)) return 0;
/**
 * scx_bpf_dispatch_cancel - Cancel the latest dispatch
 *
 * Undo the most recent dispatch. Each call cancels one more dispatch, so it
 * may be invoked repeatedly. Can only be called from ops.dispatch().
 * Cancelling when nothing is left to cancel is reported as an error.
 */
__bpf_kfunc void scx_bpf_dispatch_cancel(void)
{
	struct scx_dsp_ctx *dsp_ctx = this_cpu_ptr(scx_dsp_ctx);

	if (!scx_kf_allowed(SCX_KF_DISPATCH))
		return;

	/* drop the latest entry if there is one */
	if (dsp_ctx->cursor > 0) {
		dsp_ctx->cursor--;
		return;
	}

	scx_kf_error("dispatch buffer underflow");
}
/**
 * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
 * @dsq_id: DSQ to move task from
 *
 * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
 * local DSQ for execution. Can only be called from ops.dispatch().
 *
 * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
 * before trying to move from the specified DSQ. It may also grab rq locks and
 * thus can't be called under any BPF locks.
 *
 * Returns %true if a task has been moved, %false if there isn't any task to
 * move. %false is also returned when called from a disallowed context or when
 * @dsq_id doesn't name a user DSQ, the latter additionally raising a scheduler
 * error.
 */
__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
{
	struct scx_sched *sch = scx_root;
	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
	struct scx_dispatch_q *dsq;

	if (!scx_kf_allowed(SCX_KF_DISPATCH))
		return false;

	flush_dispatch_buf(sch, dspc->rq);

	dsq = find_user_dsq(sch, dsq_id);
	if (unlikely(!dsq)) {
		scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id);
		return false;
	}

	if (!consume_dispatch_q(sch, dspc->rq, dsq))
		return false;

	/*
	 * A successfully consumed task can be dequeued before it starts
	 * running while the CPU is trying to migrate other dispatched
	 * tasks. Bump nr_tasks to tell balance_scx() to retry on empty
	 * local DSQ.
	 */
	dspc->nr_tasks++;
	return true;
}
/** * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs * @it__iter: DSQ iterator in progress * @slice: duration the moved task can run for in nsecs * * Override the slice of the next task that will be moved from @it__iter using * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous * slice duration is kept.
*/
__bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter,
u64 slice)
{ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
/** * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs * @it__iter: DSQ iterator in progress * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ * * Override the vtime of the next task that will be moved from @it__iter using * scx_bpf_dsq_move_vtime(). If this function is not called, the previous * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the * override is ignored and cleared.
*/
__bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
u64 vtime)
{ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
/**
 * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
 * @it__iter: DSQ iterator in progress
 * @p: task to transfer
 * @dsq_id: DSQ to move @p to
 * @enq_flags: SCX_ENQ_*
 *
 * Transfer @p, which sits on the DSQ that @it__iter is currently walking, to
 * the DSQ named by @dsq_id. Any DSQ - local, global or user - is a valid
 * destination.
 *
 * The transfer succeeds only if @p is still on the iterated DSQ and was queued
 * there before the iteration began. Whether @p was actually obtained through
 * the iterator is irrelevant.
 *
 * @p keeps its slice unless scx_bpf_dsq_move_set_slice() was used to override
 * it.
 *
 * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
 * lock (e.g. BPF timers or SYSCALL programs).
 *
 * Returns %true if @p has been consumed, %false if @p had already been consumed
 * or dequeued.
 */
__bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
				  struct task_struct *p, u64 dsq_id,
				  u64 enq_flags)
{
	struct bpf_iter_scx_dsq_kern *kern_iter =
		(struct bpf_iter_scx_dsq_kern *)it__iter;

	return scx_dsq_move(kern_iter, p, dsq_id, enq_flags);
}
/**
 * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
 * @it__iter: DSQ iterator in progress
 * @p: task to transfer
 * @dsq_id: DSQ to move @p to
 * @enq_flags: SCX_ENQ_*
 *
 * Transfer @p, which sits on the DSQ that @it__iter is currently walking, to
 * the priority queue of the DSQ named by @dsq_id. Only user DSQs support
 * priority queues, so the destination must be one.
 *
 * @p keeps its slice and vtime unless they were overridden with
 * scx_bpf_dsq_move_set_slice() and scx_bpf_dsq_move_set_vtime().
 *
 * Everything else matches scx_bpf_dsq_move(). See scx_bpf_dsq_insert_vtime()
 * for more information on @vtime.
 */
__bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
					struct task_struct *p, u64 dsq_id,
					u64 enq_flags)
{
	struct bpf_iter_scx_dsq_kern *kern_iter =
		(struct bpf_iter_scx_dsq_kern *)it__iter;
	/* same move as scx_bpf_dsq_move(), but target the priority queue */
	u64 priq_flags = enq_flags | SCX_ENQ_DSQ_PRIQ;

	return scx_dsq_move(kern_iter, p, dsq_id, priq_flags);
}
/** * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ * * Iterate over all of the tasks currently enqueued on the local DSQ of the * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of * processed tasks. Can only be called from ops.cpu_release().
*/
__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
{
LIST_HEAD(tasks);
u32 nr_enqueued = 0; struct rq *rq; struct task_struct *p, *n;
if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) return 0;
/* * The BPF scheduler may choose to dispatch tasks back to * @rq->scx.local_dsq. Move all candidate tasks off to a private list * first to avoid processing the same tasks repeatedly.
*/
list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
scx.dsq_list.node) { /* * If @p is being migrated, @p's current CPU may not agree with * its allowed CPUs and the migration_cpu_stop is about to * deactivate and re-activate @p anyway. Skip re-enqueueing. * * While racing sched property changes may also dequeue and * re-enqueue a migrating task while its current CPU and allowed * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler.
*/ if (p->migration_pending) continue;
/** * scx_bpf_create_dsq - Create a custom DSQ * @dsq_id: DSQ to create * @node: NUMA node to allocate from * * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
*/
__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
{ struct scx_dispatch_q *dsq; struct scx_sched *sch;
s32 ret;
/** * scx_bpf_kick_cpu - Trigger reschedule on a CPU * @cpu: cpu to kick * @flags: %SCX_KICK_* flags * * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or * trigger rescheduling on a busy CPU. This can be called from any online * scx_ops operation and the actual kicking is performed asynchronously through * an irq work.
*/
__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
{ struct rq *this_rq; unsignedlong irq_flags;
if (!kf_cpu_valid(cpu, NULL)) return;
local_irq_save(irq_flags);
this_rq = this_rq();
/* * While bypassing for PM ops, IRQ handling may not be online which can * lead to irq_work_queue() malfunction such as infinite busy wait for * IRQ status update. Suppress kicking.
*/ if (scx_rq_bypassing(this_rq)) goto out;
/* * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting * rq locks. We can probably be smarter and avoid bouncing if called * from ops which don't hold a rq lock.
*/ if (flags & SCX_KICK_IDLE) { struct rq *target_rq = cpu_rq(cpu);
if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
if (raw_spin_rq_trylock(target_rq)) { if (can_skip_idle_kick(target_rq)) {
raw_spin_rq_unlock(target_rq); goto out;
}
raw_spin_rq_unlock(target_rq);
}
cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle);
} else {
cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
if (flags & SCX_KICK_PREEMPT)
cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); if (flags & SCX_KICK_WAIT)
cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait);
}
/**
 * scx_bpf_dsq_nr_queued - Return the number of queued tasks
 * @dsq_id: id of the DSQ
 *
 * Return the number of tasks in the DSQ matching @dsq_id. If not found,
 * -%ENOENT is returned. If no scheduler instance is currently published,
 * -%ENODEV is returned.
 */
__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
{
	struct scx_sched *sch;
	struct scx_dispatch_q *dsq;
	s32 ret;

	/* preemption stays off for the whole lookup, incl. the this_rq() read */
	preempt_disable();

	sch = rcu_dereference_sched(scx_root);
	if (unlikely(!sch)) {
		ret = -ENODEV;
		goto out;
	}

	if (dsq_id == SCX_DSQ_LOCAL) {
		/* local DSQ of the calling CPU */
		ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
		goto out;
	} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
		/* local DSQ of the CPU encoded in @dsq_id */
		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;

		if (ops_cpu_valid(sch, cpu, NULL)) {
			ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
			goto out;
		}
	} else {
		dsq = find_user_dsq(sch, dsq_id);
		if (dsq) {
			ret = READ_ONCE(dsq->nr);
			goto out;
		}
	}

	/* invalid CPU or unknown user DSQ */
	ret = -ENOENT;
out:
	preempt_enable();
	return ret;
}
/**
 * scx_bpf_destroy_dsq - Destroy a custom DSQ
 * @dsq_id: DSQ to destroy
 *
 * Tear down the custom DSQ named by @dsq_id. Only DSQs that were created with
 * scx_bpf_create_dsq() can be destroyed. It is the caller's responsibility to
 * make sure the DSQ is empty and that nothing dispatches to it anymore. A
 * request for a DSQ which doesn't exist is ignored. Can be called from any
 * online scx_ops operations.
 */
__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
{
	struct scx_sched *root;

	rcu_read_lock();

	/* nothing to destroy when no scheduler instance is published */
	root = rcu_dereference(scx_root);
	if (root) {
		destroy_dsq(root, dsq_id);
	}

	rcu_read_unlock();
}
/** * bpf_iter_scx_dsq_new - Create a DSQ iterator * @it: iterator to initialize * @dsq_id: DSQ to iterate * @flags: %SCX_DSQ_ITER_* * * Initialize BPF iterator @it which can be used with bpf_for_each() to walk * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes * tasks which are already queued when this function is invoked.
*/
__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
u64 flags)
{ struct bpf_iter_scx_dsq_kern *kit = (void *)it; struct scx_sched *sch;
/** * bpf_iter_scx_dsq_next - Progress a DSQ iterator * @it: iterator to progress * * Return the next task. See bpf_iter_scx_dsq_new().
*/
__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
{ struct bpf_iter_scx_dsq_kern *kit = (void *)it; bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV; struct task_struct *p; unsignedlong flags;
if (!kit->dsq) return NULL;
raw_spin_lock_irqsave(&kit->dsq->lock, flags);
if (list_empty(&kit->cursor.node))
p = NULL; else
p = container_of(&kit->cursor, struct task_struct, scx.dsq_list);
/* * Only tasks which were queued before the iteration started are * visible. This bounds BPF iterations and guarantees that vtime never * jumps in the other direction while iterating.
*/ do {
p = nldsq_next_task(kit->dsq, p, rev);
} while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq)));
if (p) { if (rev)
list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node); else
list_move(&kit->cursor.node, &p->scx.dsq_list.node);
} else {
list_del_init(&kit->cursor.node);
}
/** * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. * @exit_code: Exit value to pass to user space via struct scx_exit_info. * @fmt: error message format string * @data: format string parameters packaged using ___bpf_fill() macro * @data__sz: @data len, must end in '__sz' for the verifier * * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops * disabling.
*/
__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsignedlonglong *data, u32 data__sz)
{ unsignedlong flags;
/**
 * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler
 * @fmt: format string
 * @data: format string parameters packaged using ___bpf_fill() macro
 * @data__sz: @data len, must end in '__sz' for the verifier
 *
 * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and
 * dump_task() to generate extra debug dump specific to the BPF scheduler.
 *
 * The extra dump may be multiple lines. A single line may be split over
 * multiple calls. The last line is automatically terminated.
 */
__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
				   u32 data__sz)
{
	struct scx_dump_data *dd = &scx_dump_data;
	struct scx_bstr_buf *buf = &dd->buf;
	s32 ret;

	if (raw_smp_processor_id() != dd->cpu) {
		scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends");
		return;
	}

	/* append the formatted string to the line buf */
	ret = __bstr_format(buf->data, buf->line + dd->cursor,
			    sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
	if (ret < 0) {
		dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
			  dd->prefix, fmt, data, data__sz, ret);
		return;
	}

	/*
	 * Advance past what was just appended so that the next call continues
	 * the line instead of overwriting it.
	 * NOTE(review): assumes __bstr_format() returns the formatted length
	 * snprintf-style on success, which may exceed the remaining space -
	 * hence the clamp. Confirm against __bstr_format().
	 */
	dd->cursor += ret;
	if (dd->cursor >= sizeof(buf->line))
		dd->cursor = sizeof(buf->line);

	/* nothing buffered; also keeps the [cursor - 1] access below in bounds */
	if (!dd->cursor)
		return;

	/*
	 * If the line buf overflowed or ends in a newline, flush it into the
	 * dump. This is to allow the caller to generate a single line over
	 * multiple calls. As ops_dump_flush() can also handle multiple lines in
	 * the line buf, the only case which can lead to an unexpected
	 * truncation is when the caller keeps generating newlines in the middle
	 * instead of the end consecutively. Don't do that.
	 */
	if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n')
		ops_dump_flush();
}
/**
 * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
 * @cpu: CPU of interest
 *
 * Report how capable @cpu is at most, relative to the most performant CPU in
 * the system. The result lies in the range [1, %SCX_CPUPERF_ONE]. See
 * scx_bpf_cpuperf_cur().
 */
__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
{
	/* a CPU that fails validation is reported as %SCX_CPUPERF_ONE */
	if (!kf_cpu_valid(cpu, NULL))
		return SCX_CPUPERF_ONE;

	return arch_scale_cpu_capacity(cpu);
}
/**
 * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
 * @cpu: CPU of interest
 *
 * Report the performance level @cpu is currently running at, relative to its
 * own maximum. The return value is in the range [1, %SCX_CPUPERF_ONE].
 *
 * To express the current level relative to the maximum performance available
 * in the whole system, combine it with the capacity:
 *
 *   scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE
 *
 * The result is in the range [1, %SCX_CPUPERF_ONE].
 *
 * Return: arch_scale_freq_capacity() of @cpu, or %SCX_CPUPERF_ONE when @cpu
 * does not pass kf_cpu_valid().
 */
__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
{
	/* a @cpu that fails validation is reported as running at full speed */
	if (!kf_cpu_valid(cpu, NULL))
		return SCX_CPUPERF_ONE;

	return arch_scale_freq_capacity(cpu);
}
/**
 * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
 * @cpu: CPU of interest
 * @perf: target performance level [0, %SCX_CPUPERF_ONE]
 *
 * Set the target performance level of @cpu to @perf. @perf is in linear
 * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
 * schedutil cpufreq governor chooses the target frequency.
 *
 * The actual performance level chosen, CPU grouping, and the overhead and
 * latency of the operations are dependent on the hardware and cpufreq driver in
 * use. Consult hardware and cpufreq documentation for more information. The
 * current performance level can be monitored using scx_bpf_cpuperf_cur().
 *
 * A @perf above %SCX_CPUPERF_ONE is rejected through scx_kf_error().
 */
__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{ if (unlikely(perf > SCX_CPUPERF_ONE)) {
scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu); return;
}
/*
 * NOTE(review): `rq`, `locked_rq` and `rf` are used below but no declaration
 * of them is visible in this function, and no visible statement validates
 * @cpu or opens the scope that the extra closing brace at the end would
 * match. Confirm against the complete function before relying on this
 * listing.
 */
/*
 * When called with an rq lock held, restrict the operation
 * to the corresponding CPU to prevent ABBA deadlocks.
 */ if (locked_rq && rq != locked_rq) {
scx_kf_error("Invalid target CPU %d", cpu); return;
}
/*
 * If no rq lock is held, allow to operate on any CPU by
 * acquiring the corresponding rq lock.
 */ if (!locked_rq) {
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
}
/*
 * NOTE(review): nothing visible between taking and dropping the lock stores
 * @perf anywhere - the statement(s) applying the target appear to be missing.
 */
/* drop the rq lock only if it was taken above */
if (!locked_rq)
rq_unlock_irqrestore(rq, &rf);
}
}
/**
 * scx_bpf_nr_node_ids - Return the number of possible node IDs
 *
 * Every valid node ID in the system is strictly smaller than the value
 * returned here.
 *
 * Return: the current value of nr_node_ids.
 */
__bpf_kfunc u32 scx_bpf_nr_node_ids(void)
{
	const u32 nr_ids = nr_node_ids;

	return nr_ids;
}
/**
 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
 *
 * Every valid CPU ID in the system is strictly smaller than the value
 * returned here.
 *
 * Return: the current value of nr_cpu_ids.
 */
__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
{
	const u32 nr_ids = nr_cpu_ids;

	return nr_ids;
}
/** * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
*/
__bpf_kfunc conststruct cpumask *scx_bpf_get_possible_cpumask(void)
{ return cpu_possible_mask;
}
/** * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
*/
__bpf_kfunc conststruct cpumask *scx_bpf_get_online_cpumask(void)
{ return cpu_online_mask;
}
/**
 * scx_bpf_put_cpumask - Release a possible/online cpumask
 * @cpumask: cpumask to release
 */
__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
{
	/*
	 * Intentionally empty: no reference is actually acquired or released
	 * on a global cpumask, which is read-only in the caller and is never
	 * freed. The acquire / release semantics only exist to make the
	 * cpumask a trusted pointer in the caller.
	 */
}
/**
 * scx_bpf_task_running - Is task currently running?
 * @p: task of interest
 *
 * Return: %true if @p is the ->curr task of its own rq, %false otherwise.
 */
__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
{
	return task_rq(p)->curr == p;
}
/**
 * scx_bpf_task_cpu - CPU a task is currently associated with
 * @p: task of interest
 *
 * Return: the value of task_cpu() for @p.
 */
__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
{
	return task_cpu(p);
}
/**
 * scx_bpf_cpu_rq - Fetch the rq of a CPU
 * @cpu: CPU of the rq
 *
 * Return: cpu_rq() of @cpu, or %NULL when @cpu does not pass kf_cpu_valid().
 */
__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
{
	struct rq *rq = NULL;

	/* only look the rq up for a CPU number that passed validation */
	if (kf_cpu_valid(cpu, NULL))
		rq = cpu_rq(cpu);

	return rq;
}
/**
 * scx_bpf_task_cgroup - Return the sched cgroup of a task
 * @p: task of interest
 *
 * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
 * from the scheduler's POV. SCX operations should use this function to
 * determine @p's current cgroup as, unlike following @p->cgroups,
 * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
 * rq-locked operations. Can be called on the parameter tasks of rq-locked
 * operations. The restriction guarantees that @p's rq is locked by the caller.
 *
 * Return: the cgroup of @p's sched task group, or the default root cgroup
 * when the call is not allowed for @p. In both cases cgroup_get() has been
 * called on the returned cgroup.
 */
#ifdef CONFIG_CGROUP_SCHED
__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
{
	struct task_group *tg = p->sched_task_group;
	struct cgroup *cgrp;

	if (scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
		cgrp = tg_cgrp(tg);
	else
		cgrp = &cgrp_dfl_root.cgrp;	/* fall back to the default root */

	/* the caller always receives a referenced cgroup */
	cgroup_get(cgrp);
	return cgrp;
}
#endif
/**
 * scx_bpf_now - Returns a high-performance monotonically non-decreasing
 * clock for the current CPU. The clock returned is in nanoseconds.
 *
 * It provides the following properties:
 *
 * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently
 *  to account for execution time and track tasks' runtime properties.
 *  Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which
 *  eventually reads a hardware timestamp counter -- is neither performant nor
 *  scalable. scx_bpf_now() aims to provide a high-performance clock by
 *  using the rq clock in the scheduler core whenever possible.
 *
 * 2) High enough resolution for the BPF scheduler use cases: In most BPF
 *  scheduler use cases, the required clock resolution is lower than the most
 *  accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically
 *  uses the rq clock in the scheduler core whenever it is valid. It considers
 *  that the rq clock is valid from the time the rq clock is updated
 *  (update_rq_clock) until the rq is unlocked (rq_unpin_lock).
 *
 * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now()
 *  guarantees the clock never goes backward when comparing them in the same
 *  CPU. On the other hand, when comparing clocks in different CPUs, there
 *  is no such guarantee -- the clock can go backward. It provides a
 *  monotonically *non-decreasing* clock so that it would provide the same
 *  clock values in two different scx_bpf_now() calls in the same CPU
 *  during the same period of when the rq clock is valid.
 *
 * Return: the cached rq clock when %SCX_RQ_CLK_VALID is set on this CPU's rq,
 * otherwise a fresh sched_clock_cpu() reading for that CPU.
 */
__bpf_kfunc u64 scx_bpf_now(void)
{
	struct rq *rq;
	u64 clock;

	/* keep this_rq() and the clock read on the same CPU */
	preempt_disable();

	rq = this_rq();
	if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) {
		/*
		 * If the rq clock is valid, use the cached rq clock.
		 *
		 * Note that scx_bpf_now() is re-entrant between a process
		 * context and an interrupt context (e.g., timer interrupt).
		 * However, we don't need to consider the race between them
		 * because such race is not observable from a caller.
		 */
		clock = READ_ONCE(rq->scx.clock);
	} else {
		/*
		 * Otherwise, return a fresh rq clock.
		 *
		 * The rq clock is updated outside of the rq lock.
		 * In this case, keep the updated rq clock invalid so the next
		 * kfunc call outside the rq lock gets a fresh rq clock.
		 */
		clock = sched_clock_cpu(cpu_of(rq));
	}

	/* balance the preempt_disable() above on every path */
	preempt_enable();

	return clock;
}
/**
 * scx_bpf_events - Get a system-wide event counter
 * @events: output buffer from a BPF program
 * @events__sz: @events len, must end in '__sz' for the verifier
 *
 * Copies at most sizeof(*@events) bytes of the event counters into @events.
 */
__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
				size_t events__sz)
{
	struct scx_event_stats e_sys;

	/*
	 * Start from all-zero counters so that no uninitialized kernel stack
	 * bytes can ever be copied into the BPF-provided buffer.
	 *
	 * NOTE(review): no statement visible in this function accumulates the
	 * active scheduler's counters into e_sys; if that step exists
	 * elsewhere it belongs between this memset() and the copy below -
	 * confirm against the complete function.
	 */
	memset(&e_sys, 0, sizeof(e_sys));

	/*
	 * We cannot entirely trust a BPF-provided size since a BPF program
	 * might be compiled against a different vmlinux.h, of which
	 * scx_event_stats would be larger (a newer vmlinux.h) or smaller
	 * (an older vmlinux.h). Hence, we use the smaller size to avoid
	 * memory corruption.
	 */
	events__sz = min(events__sz, sizeof(*events));
	memcpy(events, &e_sys, events__sz);
}
/*
 * kfunc registration can't be done from init_sched_ext_class() as
 * register_btf_kfunc_id_set() needs most of the system to be up.
 *
 * Some kfuncs are context-sensitive and can only be called from
 * specific SCX ops. They are grouped into BTF sets accordingly.
 * Unfortunately, BPF currently doesn't have a way of enforcing such
 * restrictions. Eventually, the verifier should be able to enforce
 * them. For now, register them the same and make each kfunc explicitly
 * check using scx_kf_allowed().
 *
 * The chain below stops at the first registration that fails and leaves
 * its error code in ret.
 *
 * NOTE(review): every failure path from here on returns immediately; none
 * of the steps that already succeeded (kfunc sets, struct_ops, PM notifier,
 * kset) is undone on a later failure. Confirm that this is intended.
 */ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 &scx_kfunc_set_enqueue_dispatch)) ||
 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 &scx_kfunc_set_dispatch)) ||
 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 &scx_kfunc_set_cpu_release)) ||
 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 &scx_kfunc_set_unlocked)) ||
 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
 &scx_kfunc_set_unlocked)) ||
 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 &scx_kfunc_set_any)) ||
 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
 &scx_kfunc_set_any)) ||
 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
 &scx_kfunc_set_any))) {
pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); return ret;
}
/* set up idle tracking */
ret = scx_idle_init(); if (ret) {
pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); return ret;
}
/* register the sched_ext_ops struct_ops type with BPF */
ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); if (ret) {
pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); return ret;
}
/* hook into power-management notifications */
ret = register_pm_notifier(&scx_pm_notifier); if (ret) {
pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); return ret;
}
/* create the "sched_ext" kset under kernel_kobj (/sys/kernel/sched_ext) */
scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); if (!scx_kset) {
pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); return -ENOMEM;
}
/* expose the global attribute group on the kset's kobject */
ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); if (ret < 0) {
pr_err("sched_ext: Failed to add global attributes\n"); return ret;
}
return 0;
}
/* run scx_init() as an initcall */
__initcall(scx_init);
Messung V0.5 in Prozent
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert. 0.118 Bemerkung:
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.