/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(time)	((unsigned long)(time) / (NSEC_PER_SEC/HZ))
/*
 * Increase resolution of nice-level calculations for 64-bit architectures.
 * The extra resolution improves shares distribution and load balancing of
 * low-weight task groups (eg. nice +19 on an autogroup), deeper task-group
 * hierarchies, especially on larger systems. This is not a user-visible change
 * and does not change the user-interface for setting shares/weights.
 *
 * We increase resolution only if we have enough bits to allow this increased
 * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
 * are pretty high and the returns do not justify the increased costs.
 *
 * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
 * increase coverage and consistency always enable it on 64-bit platforms.
 */
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w)		((w) << SCHED_FIXEDPOINT_SHIFT)
# define scale_load_down(w)					\
({								\
	unsigned long __w = (w);				\
								\
	if (__w)						\
		__w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT);	\
	__w;							\
})
#else
# define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w)		(w)
# define scale_load_down(w)	(w)
#endif
/*
 * Task weight (visible to users) and its load (invisible to users) have
 * independent resolution, but they should be well calibrated. We use
 * scale_load() and scale_load_down(w) to convert between them. The
 * following must be true:
 *
 *  scale_load(sched_prio_to_weight[NICE_TO_PRIO(0)-MAX_RT_PRIO]) == NICE_0_LOAD
 *
 */
#define NICE_0_LOAD	(1L << NICE_0_LOAD_SHIFT)
/*
 * Single value that decides SCHED_DEADLINE internal math precision.
 * 10 -> just above 1us
 * 9  -> just above 0.5us
 */
#define DL_SCALE	10
/*
 * Single value that denotes runtime == period, ie unlimited time.
 */
#define RUNTIME_INF	((u64)~0ULL)
/*
 * Shifting a value by an exponent greater *or equal* to the size of said value
 * is UB; cap at size-1.
 *
 * Arguments are fully parenthesized so the macro expands safely when passed
 * compound expressions.
 */
#define shr_bound(val, shift)							\
	((val) >> min_t(typeof(shift), (shift), BITS_PER_TYPE(typeof(val)) - 1))
/* * cgroup weight knobs should use the common MIN, DFL and MAX values which are * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it * maps pretty well onto the shares value used by scheduler and the round-trip * conversions preserve the original value over the entire range.
*/ staticinlineunsignedlong sched_weight_from_cgroup(unsignedlong cgrp_weight)
{ return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL);
}
/*
 * !! For sched_setattr_nocheck() (kernel) only !!
 *
 * This is actually gross. :(
 *
 * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
 * tasks, but still be able to sleep. We need this on platforms that cannot
 * atomically change clock frequency. Remove once fast switching will be
 * available on such platforms.
 *
 * SUGOV stands for SchedUtil GOVernor.
 */
#define SCHED_FLAG_SUGOV	0x10000000
/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
	/* include 1 bit for delimiter */
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);
	struct list_head queue[MAX_RT_PRIO];
};
/*
 * To keep the bandwidth of -deadline tasks under control
 * we need some place where:
 *  - store the maximum -deadline bandwidth of each cpu;
 *  - cache the fraction of bandwidth that is currently allocated in
 *    each root domain;
 *
 * This is all done in the data structure below. It is similar to the
 * one used for RT-throttling (rt_bandwidth), with the main difference
 * that, since here we are only interested in admission control, we
 * do not decrease any runtime while the group "executes", neither we
 * need a timer to replenish it.
 *
 * With respect to SMP, bandwidth is given on a per root domain basis,
 * meaning that:
 *  - bw (< 100%) is the deadline bandwidth of each CPU;
 *  - total_bw is the currently allocated bandwidth in each root domain;
 */
struct dl_bw {
	raw_spinlock_t lock;
	/* Maximum -deadline bandwidth of each CPU (< 100%) */
	u64 bw;
	/* Currently allocated bandwidth in this root domain */
	u64 total_bw;
};
externvoid init_dl_bw(struct dl_bw *dl_b); externint sched_dl_global_validate(void); externvoid sched_dl_do_global(void); externint sched_dl_overflow(struct task_struct *p, int policy, conststruct sched_attr *attr); externvoid __setparam_dl(struct task_struct *p, conststruct sched_attr *attr); externvoid __getparam_dl(struct task_struct *p, struct sched_attr *attr); externbool __checkparam_dl(conststruct sched_attr *attr); externbool dl_param_changed(struct task_struct *p, conststruct sched_attr *attr); externint dl_cpuset_cpumask_can_shrink(conststruct cpumask *cur, conststruct cpumask *trial); externint dl_bw_deactivate(int cpu); extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); /* * SCHED_DEADLINE supports servers (nested scheduling) with the following * interface: * * dl_se::rq -- runqueue we belong to. * * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this * returns NULL. * * dl_server_update() -- called from update_curr_common(), propagates runtime * to the server. * * dl_server_start() -- start the server when it has tasks; it will stop * automatically when there are no more tasks, per * dl_se::server_pick() returning NULL. * * dl_server_stop() -- (force) stop the server; use when updating * parameters. * * dl_server_init() -- initializes the server. * * When started the dl_server will (per dl_defer) schedule a timer for its * zero-laxity point -- that is, unlike regular EDF tasks which run ASAP, a * server will run at the very end of its period. * * This is done such that any runtime from the target class can be accounted * against the server -- through dl_server_update() above -- such that when it * becomes time to run, it might already be out of runtime and get deferred * until the next period. In this case dl_server_timer() will alternate * between defer and replenish but never actually enqueue the server. 
* * Only when the target class does not manage to exhaust the server's runtime * (there's actualy starvation in the given period), will the dl_server get on * the runqueue. Once queued it will pick tasks from the target class and run * them until either its runtime is exhaused, at which point its back to * dl_server_timer, or until there are no more tasks to run, at which point * the dl_server stops itself. * * By stopping at this point the dl_server retains bandwidth, which, if a new * task wakes up imminently (starting the server again), can be used -- * subject to CBS wakeup rules -- without having to wait for the next period. * * Additionally, because of the dl_defer behaviour the start/stop behaviour is * naturally thottled to once per period, avoiding high context switch * workloads from spamming the hrtimer program/cancel paths.
*/ externvoid dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); externvoid dl_server_start(struct sched_dl_entity *dl_se); externvoid dl_server_stop(struct sched_dl_entity *dl_se); externvoid dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_pick_f pick_task); externvoid sched_init_dl_servers(void);
	/*
	 * Statistics:
	 * NOTE(review): this is the tail of an enclosing struct whose opening
	 * is not visible in this chunk (presumably the CFS bandwidth control
	 * struct, per the #endif guard below) -- verify against the full file.
	 */
	int nr_periods;
	int nr_throttled;
	int nr_burst;
	u64 throttled_time;
	u64 burst_time;
#endif /* CONFIG_CFS_BANDWIDTH */
};
/* Task group related information */ struct task_group { struct cgroup_subsys_state css;
#ifdef CONFIG_GROUP_SCHED_WEIGHT /* A positive value indicates that this is a SCHED_IDLE group. */ int idle; #endif
#ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each CPU */ struct sched_entity **se; /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsignedlong shares; /* * load_avg can be heavily contended at clock tick time, so put * it in its own cache-line separated from the fields above which * will also be accessed at each tick.
*/
atomic_long_t load_avg ____cacheline_aligned; #endif/* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_UCLAMP_TASK_GROUP /* The two decimal precision [%] value requested from user-space */ unsignedint uclamp_pct[UCLAMP_CNT]; /* Clamp values requested for a task group */ struct uclamp_se uclamp_req[UCLAMP_CNT]; /* Effective clamp values used for a task group */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif
/* * A weight of 0 or 1 can cause arithmetics problems. * A weight of a cfs_rq is the sum of weights of which entities * are queued on this cfs_rq, so a weight of a entity should not be * too large, so as the shares value of a task group. * (The default weight is 1024 - so there's no practical * limitation from this.)
*/ #define MIN_SHARES (1UL << 1) #define MAX_SHARES (1UL << 18) #endif
/*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
	/* Walk the whole hierarchy starting from the root task group. */
	return walk_tg_tree_from(&root_task_group, down, up, data);
}
/* * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running).
*/ struct sched_entity *curr; struct sched_entity *next;
#ifdef CONFIG_FAIR_GROUP_SCHED
u64 last_update_tg_load_avg; unsignedlong tg_load_avg_contrib; long propagate; long prop_runnable_sum;
/* * h_load = weight * f(tg) * * Where f(tg) is the recursive weight fraction assigned to * this group.
*/ unsignedlong h_load;
u64 last_h_load_update; struct sched_entity *h_load_next; #endif/* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
/* * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in * a hierarchy). Non-leaf lrqs hold other higher schedulable entities * (like users, containers etc.) * * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. * This list is used during load balance.
*/ int on_list; struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */
/* Locally cached copy of our task_group's idle value */ int idle;
#ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled;
s64 runtime_remaining;
#ifdef CONFIG_SCHED_CLASS_EXT
/* scx_rq->flags, protected by the rq lock */
enum scx_rq_flags {
	/*
	 * A hotplugged CPU starts scheduling before rq_online_scx(). Track
	 * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called
	 * only while the BPF scheduler considers the CPU to be online.
	 */
	SCX_RQ_ONLINE		= 1 << 0,
	SCX_RQ_CAN_STOP_TICK	= 1 << 1,
	SCX_RQ_BAL_PENDING	= 1 << 2, /* balance hasn't run yet */
	SCX_RQ_BAL_KEEP		= 1 << 3, /* balance decided to keep current */
	SCX_RQ_BYPASSING	= 1 << 4,
	SCX_RQ_CLK_VALID	= 1 << 5, /* RQ clock is fresh and valid */
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array	active;
	unsigned int		rt_nr_running;
	unsigned int		rr_nr_running;
	struct {
		int		curr;	/* highest queued rt task prio */
		int		next;	/* next highest */
	} highest_prio;
	bool			overloaded;
	struct plist_head	pushable_tasks;

	int			rt_queued;

#ifdef CONFIG_RT_GROUP_SCHED
	int			rt_throttled;
	u64			rt_time;	/* consumed RT time, goes up in update_curr_rt */
	u64			rt_runtime;	/* allotted RT time, "slice" from rt_bandwidth, RT sharing/balancing */
	/* Nests inside the rq lock: */
	raw_spinlock_t		rt_runtime_lock;

	unsigned int		rt_nr_boosted;

	struct rq		*rq;	/* this is always top-level rq, cache? */
#endif
#ifdef CONFIG_CGROUP_SCHED
	struct task_group	*tg;	/* this tg has "this" rt_rq on given CPU for runnable entities */
#endif
};
/* Deadline class' related fields in a runqueue */
struct dl_rq {
	/* runqueue is an rbtree, ordered by deadline */
	struct rb_root_cached	root;

	unsigned int		dl_nr_running;

	/*
	 * Deadline values of the currently executing and the
	 * earliest ready task on this rq. Caching these facilitates
	 * the decision whether or not a ready but not running task
	 * should migrate somewhere else.
	 */
	struct {
		u64		curr;
		u64		next;
	} earliest_dl;

	bool			overloaded;

	/*
	 * Tasks on this rq that can be pushed away. They are kept in
	 * an rb-tree, ordered by tasks' deadlines, with caching
	 * of the leftmost (earliest deadline) element.
	 */
	struct rb_root_cached	pushable_dl_tasks_root;

	/*
	 * "Active utilization" for this runqueue: increased when a
	 * task wakes up (becomes TASK_RUNNING) and decreased when a
	 * task blocks
	 */
	u64			running_bw;

	/*
	 * Utilization of the tasks "assigned" to this runqueue (including
	 * the tasks that are in runqueue and the tasks that executed on this
	 * CPU and blocked). Increased when a task moves to this runqueue, and
	 * decreased when the task moves away (migrates, changes scheduling
	 * policy, or terminates).
	 * This is needed to compute the "inactive utilization" for the
	 * runqueue (inactive utilization = this_bw - running_bw).
	 */
	u64			this_bw;
	u64			extra_bw;

	/*
	 * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM
	 * tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB).
	 */
	u64			max_bw;

	/*
	 * Inverse of the fraction of CPU utilization that can be reclaimed
	 * by the GRUB algorithm.
	 */
	u64			bw_ratio;
};
#ifdef CONFIG_FAIR_GROUP_SCHED
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!(se)->my_q)

/*
 * Whether @se counts as runnable: delayed-dequeue entities are still on the
 * runqueue but no longer considered runnable.
 */
static inline long se_runnable(struct sched_entity *se)
{
	if (se->sched_delayed)
		return false;

	return !!se->on_rq;
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
 * XXX we want to get rid of these helpers and use the full load resolution.
 */
static inline long se_weight(struct sched_entity *se)
{
	return scale_load_down(se->load.weight);
}
staticinlinebool sched_asym_prefer(int a, int b)
{ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
}
/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member CPUs from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
	atomic_t		refcount;
	atomic_t		rto_count;
	struct rcu_head		rcu;
	cpumask_var_t		span;
	cpumask_var_t		online;

	/*
	 * Indicate pullable load on at least one CPU, e.g:
	 * - More than one runnable task
	 * - Running task is misfit
	 */
	bool			overloaded;

	/* Indicate one or more CPUs over-utilized (tipping point) */
	bool			overutilized;

	/*
	 * The bit corresponding to a CPU gets set here if such CPU has more
	 * than one runnable -deadline task (as it is below for RT tasks).
	 */
	cpumask_var_t		dlo_mask;
	atomic_t		dlo_count;
	struct dl_bw		dl_bw;
	struct cpudl		cpudl;

	/*
	 * Indicate whether a root_domain's dl_bw has been checked or
	 * updated. It's a monotonically increasing value.
	 *
	 * Also, some corner cases, like 'wrap around' is dangerous, but given
	 * that u64 is 'big enough'. So that shouldn't be a concern.
	 */
	u64			visit_cookie;

#ifdef HAVE_RT_PUSH_IPI
	/*
	 * For IPI pull requests, loop across the rto_mask.
	 */
	struct irq_work		rto_push_work;
	raw_spinlock_t		rto_lock;
	/* These are only updated and read within rto_lock */
	int			rto_loop;
	int			rto_cpu;
	/* These atomics are updated outside of a lock */
	atomic_t		rto_loop_next;
	atomic_t		rto_loop_start;
#endif /* HAVE_RT_PUSH_IPI */

	/*
	 * The "RT overload" flag: it gets set if a CPU has more than
	 * one runnable RT task.
	 */
	cpumask_var_t		rto_mask;
	struct cpupri		cpupri;

	/*
	 * NULL-terminated list of performance domains intersecting with the
	 * CPUs of the rd. Protected by RCU.
	 */
	struct perf_domain __rcu *pd;
};
#ifdef CONFIG_UCLAMP_TASK
/*
 * struct uclamp_bucket - Utilization clamp bucket
 * @value: utilization clamp value for tasks on this clamp bucket
 * @tasks: number of RUNNABLE tasks on this clamp bucket
 *
 * Keep track of how many tasks are RUNNABLE for a given utilization
 * clamp value.
 */
struct uclamp_bucket {
	unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
	unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
};
/*
 * struct uclamp_rq - rq's utilization clamp
 * @value: currently active clamp values for a rq
 * @bucket: utilization clamp buckets affecting a rq
 *
 * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
 * A clamp value is affecting a rq when there is at least one task RUNNABLE
 * (or actually running) with that value.
 *
 * There are up to UCLAMP_CNT possible different clamp values, currently there
 * are only two: minimum utilization and maximum utilization.
 *
 * All utilization clamping values are MAX aggregated, since:
 * - for util_min: we want to run the CPU at least at the max of the minimum
 *   utilization required by its currently RUNNABLE tasks.
 * - for util_max: we want to allow the CPU to run up to the max of the
 *   maximum utilization allowed by its currently RUNNABLE tasks.
 *
 * Since on each system we expect only a limited number of different
 * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
 * the metrics required to compute all the per-rq utilization clamp values.
 */
struct uclamp_rq {
	unsigned int value;
	struct uclamp_bucket bucket[UCLAMP_BUCKETS];
};
/* * This is the main, per-CPU runqueue data structure. * * Locking rule: those places that want to lock multiple runqueues * (such as the load balancing or the thread migration code), lock * acquire operations must be ordered by ascending &runqueue.
*/ struct rq { /* runqueue lock: */
raw_spinlock_t __lock;
#ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ struct list_head leaf_cfs_rq_list; struct list_head *tmp_alone_branch; #endif/* CONFIG_FAIR_GROUP_SCHED */
/* * This is part of a global counter where only the total sum * over all CPUs matters. A task can increase this counter on * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock:
*/ unsignedlong nr_uninterruptible;
unsignedint clock_update_flags;
u64 clock; /* Ensure that all clocks are in the same cache line */
u64 clock_task ____cacheline_aligned;
u64 clock_pelt; unsignedlong lost_idle_time;
u64 clock_pelt_idle;
u64 clock_idle; #ifndef CONFIG_64BIT
u64 clock_pelt_idle_copy;
u64 clock_idle_copy; #endif
atomic_t nr_iowait;
u64 last_seen_need_resched_ns; int ticks_without_resched;
#ifdef CONFIG_MEMBARRIER int membarrier_state; #endif
/* * Be careful with this function; not for general use. The return value isn't * stable unless you actually hold a relevant rq->__lock.
*/ staticinline raw_spinlock_t *rq_lockp(struct rq *rq)
{ if (sched_core_enabled(rq)) return &rq->core->__lock;
return &rq->__lock;
}
staticinline raw_spinlock_t *__rq_lockp(struct rq *rq)
{ if (rq->core_enabled) return &rq->core->__lock;
/* * Helpers to check if the CPU's core cookie matches with the task's cookie * when core scheduling is enabled. * A special case is that the task's cookie always matches with CPU's core * cookie if the CPU is in an idle core.
*/ staticinlinebool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
{ /* Ignore cookie match if core scheduler is not enabled on the CPU. */ if (!sched_core_enabled(rq)) returntrue;
/* runqueue "owned" by this group */ staticinlinestruct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{ return NULL;
}
#endif /* !CONFIG_FAIR_GROUP_SCHED */
extern void update_rq_clock(struct rq *rq);
/*
 * rq::clock_update_flags bits
 *
 * %RQCF_REQ_SKIP - will request skipping of clock update on the next
 *  call to __schedule(). This is an optimisation to avoid
 *  neighbouring rq clock updates.
 *
 * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is
 *  in effect and calls to update_rq_clock() are being ignored.
 *
 * %RQCF_UPDATED - is a debug flag that indicates whether a call has been
 *  made to update_rq_clock() since the last time rq::lock was pinned.
 *
 * If inside of __schedule(), clock_update_flags will have been
 * shifted left (a left shift is a cheap operation for the fast path
 * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use,
 *
 *	if (rq->clock_update_flags >= RQCF_UPDATED)
 *
 * to check if %RQCF_UPDATED is set. It'll never be shifted more than
 * one position though, because the next rq_unpin_lock() will shift it
 * back.
 */
#define RQCF_REQ_SKIP	0x01
#define RQCF_ACT_SKIP	0x02
#define RQCF_UPDATED	0x04
static inline void assert_clock_updated(struct rq *rq)
{
	/*
	 * The only reason for not seeing a clock update since the
	 * last rq_pin_lock() is if we're currently skipping updates.
	 */
	WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP);
}
/*
 * See rt task throttling, which is the only time a skip
 * request is canceled.
 */
static inline void rq_clock_cancel_skipupdate(struct rq *rq)
{
	lockdep_assert_rq_held(rq);
	rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}
/*
 * During cpu offlining and rq wide unthrottling, we can trigger
 * an update_rq_clock() for several cfs and rt runqueues (Typically
 * when using list_for_each_entry_*)
 * rq_clock_start_loop_update() can be called after updating the clock
 * once and before iterating over the list to prevent multiple update.
 * After the iterative traversal, we need to call rq_clock_stop_loop_update()
 * to clear RQCF_ACT_SKIP of rq->clock_update_flags.
 */
static inline void rq_clock_start_loop_update(struct rq *rq)
{
	lockdep_assert_rq_held(rq);
	WARN_ON_ONCE(rq->clock_update_flags & RQCF_ACT_SKIP);
	rq->clock_update_flags |= RQCF_ACT_SKIP;
}
struct rq_flags {
	unsigned long		flags;
	struct pin_cookie	cookie;
	/*
	 * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
	 * current pin context is stashed here in case it needs to be
	 * restored in rq_repin_lock().
	 */
	unsigned int		clock_update_flags;
};
/*
 * Lockdep annotation that avoids accidental unlocks; it's like a
 * sticky/continuous lockdep_assert_held().
 *
 * This avoids code that has access to 'struct rq *rq' (basically everything in
 * the scheduler) from accidentally unlocking the rq if they do not also have a
 * copy of the (on-stack) 'struct rq_flags rf'.
 *
 * Also see Documentation/locking/lockdep-design.rst.
 */
static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
	rf->cookie = lockdep_pin_lock(__rq_lockp(rq));

	/*
	 * Don't (re)queue an already queued item; nor queue anything when
	 * balance_push() is active, see the comment with
	 * balance_push_callback.
	 *
	 * NOTE(review): 'head' is not declared in this function's visible
	 * scope -- the statement below looks like a stitched-in fragment of
	 * another function (queue_balance_callback?); verify against the
	 * full file.
	 */
	if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
		return;
/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See destroy_sched_domains: call_rcu for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
			__sd; __sd = __sd->parent)
/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ #define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) | staticconstunsignedint SD_SHARED_CHILD_MASK = #include <linux/sched/sd_flags.h>
0; #undef SD_FLAG
/** * highest_flag_domain - Return highest sched_domain containing flag. * @cpu: The CPU whose highest level of sched domain is to * be returned. * @flag: The flag to check for the highest sched_domain * for the given CPU. * * Returns the highest sched_domain of a CPU which contains @flag. If @flag has * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag.
*/ staticinlinestruct sched_domain *highest_flag_domain(int cpu, int flag)
{ struct sched_domain *sd, *hsd = NULL;
struct sched_group_capacity {
	atomic_t		ref;
	/*
	 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
	 * for a single CPU.
	 */
	unsigned long		capacity;
	unsigned long		min_capacity;	/* Min per-CPU capacity in group */
	unsigned long		max_capacity;	/* Max per-CPU capacity in group */
	unsigned long		next_update;
	int			imbalance;	/* XXX unrelated to capacity but shared group state */

	int			id;

	unsigned long		cpumask[];	/* Balance mask */
};
struct sched_group {
	struct sched_group	*next;	/* Must be a circular list */
	atomic_t		ref;

	unsigned int		group_weight;
	unsigned int		cores;
	struct sched_group_capacity *sgc;
	int			asym_prefer_cpu;	/* CPU of highest priority in group */
	int			flags;

	/*
	 * The CPUs this group covers.
	 *
	 * NOTE: this field is variable length. (Allocated dynamically
	 * by attaching extra space to the end of the structure,
	 * depending on how many CPUs the kernel has booted up with)
	 */
	unsigned long		cpumask[];
};
/* * Return the group to which this tasks belongs. * * We cannot use task_css() and friends because the cgroup subsystem * changes that value before the cgroup_subsys::attach() method is called, * therefore we cannot pin it and might observe the wrong value. * * The same is true for autogroup's p->signal->autogroup->tg, the autogroup * core changes this before calling sched_move_task(). * * Instead we use a 'copy' which is updated from sched_move_task() while * holding both task_struct::pi_lock and rq::lock.
*/ staticinlinestruct task_group *task_group(struct task_struct *p)
{ return p->sched_task_group;
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
	struct task_group *tg = task_group(p);
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * p->rt.rt_rq is NULL initially and it is easier to assign
	 * root_task_group's rt_rq than switching in rt_rq_of_se()
	 * Clobbers tg(!)
	 */
	if (!rt_group_sched_enabled())
		tg = &root_task_group;
	p->rt.rt_rq  = tg->rt_rq[cpu];
	p->rt.parent = tg->rt_se[cpu];
#endif /* CONFIG_RT_GROUP_SCHED */
}
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
	 * successfully executed on another CPU. We must ensure that updates of
	 * per-task data have been completed by this moment.
	 */
	smp_wmb();
	WRITE_ONCE(task_thread_info(p)->cpu, cpu);
	p->wake_cpu = cpu;
#endif /* CONFIG_SMP */
}
/* * To support run-time toggling of sched features, all the translation units * (but core.c) reference the sysctl_sched_features defined in core.c.
*/ extern __read_mostly unsignedint sysctl_sched_features;
/*
 * Is p the current execution context?
 */
static inline int task_current(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}
/*
 * Is p the current scheduling context?
 *
 * Note that it might be the current execution context at the same time if
 * rq->curr == rq->donor == p.
 */
static inline int task_current_donor(struct rq *rq, struct task_struct *p)
{
	return rq->donor == p;
}
staticinlinebool task_is_blocked(struct task_struct *p)
{ if (!sched_proxy_exec()) returnfalse;
/* Wake flags. The first three directly map to some SD flag value */
#define WF_EXEC		0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
#define WF_FORK		0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
#define WF_TTWU		0x08 /* Wakeup; maps to SD_BALANCE_WAKE */

#define WF_SYNC		0x10 /* Waker goes to sleep after wakeup */
#define WF_MIGRATED	0x20 /* Internal use, task got migrated */
#define WF_CURRENT_CPU	0x40 /* Prefer to move the wakee to the current CPU. */
#define WF_RQ_SELECTED	0x80 /* ->select_task_rq() was called */
/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs the contribution that
 * each task makes to its run queue's load is weighted according to its
 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
 * scaled version of the new time slice allocation that they receive on time
 * slice expiry etc.
 */
/*
 * {de,en}queue flags:
 *
 * DEQUEUE_SLEEP  - task is no longer runnable
 * ENQUEUE_WAKEUP - task just became runnable
 *
 * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
 *                are in a known state which allows modification. Such pairs
 *                should preserve as much state as possible.
 *
 * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
 *        in the runqueue.
 *
 * NOCLOCK - skip the update_rq_clock() (avoids double updates)
 *
 * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
 *
 * ENQUEUE_HEAD        - place at front of runqueue (tail if not specified)
 * ENQUEUE_REPLENISH   - CBS (replenish runtime and postpone deadline)
 * ENQUEUE_MIGRATED    - the task was migrated during wakeup
 * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
 *
 */
/*
 * NOTE(review): the trailing text below is website boilerplate (originally in
 * German), not kernel source -- it appears to be extraction residue appended
 * to this file. Wrapped in a comment so it cannot break compilation.
 * Translation: "The information on this web page was compiled carefully and
 * to the best of our knowledge. However, neither completeness, nor
 * correctness, nor quality of the provided information is guaranteed.
 * Remark: the colored syntax rendering and the measurement are still
 * experimental."
 */