/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.prep_enable().
 * - Using ops.cpu_release() to handle a higher priority scheduling class taking
 *   the CPU away.
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>
enum consts {
ONE_SEC_IN_NS = 1000000000,
SHARED_DSQ = 0,
HIGHPRI_DSQ = 1,
	HIGHPRI_WEIGHT = 8668, /* this is what nice -20 maps to */
};
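
/*
 * The 8668 above comes from the kernel's nice-to-weight table: nice -20 has
 * load weight 88761, and sched_ext scales task weights so that nice 0 maps to
 * 100, giving 88761 * 100 / 1024 ~= 8668.
 */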
/*
 * If enabled, the CPU performance target is set according to the queue index,
 * per the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
[0] = SCX_CPUPERF_ONE * 0 / 4,
[1] = SCX_CPUPERF_ONE * 1 / 4,
[2] = SCX_CPUPERF_ONE * 2 / 4,
[3] = SCX_CPUPERF_ONE * 3 / 4,
[4] = SCX_CPUPERF_ONE * 4 / 4,
};
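
/*
 * SCX_CPUPERF_ONE represents 100% of a CPU's performance range, so the table
 * above requests 0%, 25%, 50%, 75% and 100% of peak performance for queue
 * indices 0-4. The selected target is stored in cpuc->cpuperf_target and
 * applied with scx_bpf_cpuperf_set() from qmap_tick().
 */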
/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and used
 * when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];
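
/*
 * The per-level FIFOs and the queue_arr map-in-map used below are declared
 * earlier in the full source. A minimal sketch of those declarations, assuming
 * illustrative entry counts (the real file may size them differently):
 */
struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);	/* illustrative capacity */
	__type(value, u32);		/* PIDs are queued */
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = { &queue0, &queue1, &queue2, &queue3, &queue4 },
};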
	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}
if (test_error_cnt && !--test_error_cnt)
scx_bpf_error("test triggering error");
	if (!(tctx = lookup_task_ctx(p)))
		return;
	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
	/*
	 * If qmap_select_cpu() is telling us to or this is the last runnable
	 * task on the CPU, enqueue locally.
	 */
	if (tctx->force_local) {
tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
}
	/* if select_cpu() wasn't called, try direct dispatch */
	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
(cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
}
	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the global DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
s32 cpu;
scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
}
	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
}
	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
}
/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
__sync_fetch_and_add(&nr_core_sched_execed, 1);
}
if ((tctx = lookup_task_ctx(p)))
core_sched_head_seqs[idx] = tctx->core_sched_seq;
}
/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes a minor
 * difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
 * non-rq-lock holding BPF programs. As a demonstration, this function is called
 * from qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
s32 this_cpu = bpf_get_smp_processor_id();
		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
__COMPAT_scx_bpf_dsq_move_set_slice(
BPF_FOR_EACH_ITER, slice_ns * 2);
__COMPAT_scx_bpf_dsq_move_set_vtime(
BPF_FOR_EACH_ITER, highpri_seq++);
__COMPAT_scx_bpf_dsq_move_vtime(
BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
}
}
	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
s32 cpu;
		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
		return;
	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
}
}
if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
scx_bpf_error("failed to look up cpu_ctx"); return;
}
	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
}
		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
}
/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;
			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;
if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
}
			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			/* dispatch the popped task and drop the acquired ref */
			__sync_fetch_and_add(&nr_dispatched, 1);
			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
}
cpuc->dsp_cnt = 0;
}
	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (!tctx) {
			scx_bpf_error("task_ctx lookup failed");
			return;
}
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;
if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
scx_bpf_error("failed to look up cpu_ctx"); return;
}
	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}
/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	if (!(tctx = lookup_task_ctx(p)))
		return 0;

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As queue index increments, the priority doubles. The queue w/ index 3
	 * is dispatched twice as frequently as the one w/ index 2. Reflect the
	 * difference by scaling qdists accordingly. Note that the shift amount
	 * needs to be flipped depending on the sign to avoid flipping priority
	 * direction.
	 */
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}
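
/*
 * For example, a task two entries behind the head of queue 0 ends up with a
 * qdist of 2 << 4 = 32, while a task two entries behind the head of queue 4
 * keeps a qdist of 2, so the task in the lower-priority queue is treated as
 * being further from the head when the two are compared.
 */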
/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode about the same ordering as
 * the regular scheduling path. Use the priority-scaled distances from the head
 * of the queues to compare the two tasks which should be consistent with the
 * dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}
void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	u32 cnt;

	/*
	 * Called when @cpu is taken by a higher priority scheduling class. This
	 * makes @cpu no longer available for executing sched_ext tasks. As we
	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
	 * becomes available again, re-enqueue them into the global dsq. See
	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
	 */
	cnt = scx_bpf_reenqueue_local();
	if (cnt)
__sync_fetch_and_add(&nr_reenqueued, cnt);
}
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
			     struct scx_init_task_args *args)
{
	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}
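
/*
 * task_ctx_stor referenced above is a task-local storage map declared earlier
 * in the full source. A sketch of the declaration, showing only the task_ctx
 * fields that this excerpt touches:
 *
 *	struct task_ctx {
 *		u64	core_sched_seq;
 *		bool	force_local;
 *		bool	highpri;
 *	};
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
 *		__uint(map_flags, BPF_F_NO_PREALLOC);
 *		__type(key, int);
 *		__type(value, struct task_ctx);
 *	} task_ctx_stor SEC(".maps");
 */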
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
s32 i, pid;
	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;
scx_bpf_dump("QMAP FIFO[%d]:", i);
		/*
		 * Dump can be invoked anytime and there is no way to iterate in
		 * a non-destructive way. Pop and store in dump_store and then
		 * restore afterwards. If racing against new enqueues, ordering
		 * can get mixed up.
		 */
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
bpf_map_push_elem(&dump_store, &pid, 0);
scx_bpf_dump(" %d", pid);
}
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(&dump_store, &pid))
				break;
bpf_map_push_elem(fifo, &pid, 0);
}
/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;
possible = scx_bpf_get_possible_cpumask();
online = scx_bpf_get_online_cpumask();
		if (!bpf_cpumask_test_cpu(i, online))
			continue;
nr_online_cpus++;
/* collect the capacity and current cpuperf */
cap = scx_bpf_cpuperf_cap(i);
cur = scx_bpf_cpuperf_cur(i);
cur_min = cur < cur_min ? cur : cur_min;
cur_max = cur > cur_max ? cur : cur_max;
		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
cur_sum += cur * cap / SCX_CPUPERF_ONE;
cap_sum += cap;
if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
scx_bpf_error("failed to look up cpu_ctx"); goto out;
}
/* collect target */
cur = cpuc->cpuperf_target;
target_sum += cur;
target_min = cur < target_min ? cur : target_min;
target_max = cur > target_max ? cur : target_max;
}
/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
 * see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
s32 nr;
	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;
bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);