/*
 * For asym packing, by default the lower numbered CPU has higher priority.
 *
 * __weak default: presumably overridden by architectures with asymmetric
 * core priorities — only the relative ordering of returned values matters.
 */
int __weak arch_asym_cpu_priority(int cpu)
{ return -cpu;
}
/*
 * The margin used when comparing utilization with CPU capacity.
 *
 * 1280/1024 == 1.25, i.e. utilization "fits" only while it stays below
 * ~80% of the capacity, keeping ~20% headroom.
 *
 * (default: ~20%)
 */
#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
/*
 * The margin used when comparing CPU capacities.
 * is 'cap1' noticeably greater than 'cap2'
 *
 * 1078/1024 is ~1.05: cap1 must exceed cap2 by ~5% to count, which
 * filters out jitter-sized capacity differences.
 *
 * (default: ~5%)
 */
#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * (default: 5 msec, units: microseconds)
 */
static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
#ifdef CONFIG_NUMA_BALANCING
/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
#endif
/* * Increase the granularity value when there are more CPUs, * because with more CPUs the 'effective latency' as visible * to users decreases. But the relationship is not linear, * so pick a second-best guess by going with the log2 of the * number of CPUs. * * This idea comes from the SD scheduler of Con Kolivas:
*/ staticunsignedint get_update_sysctl_factor(void)
{ unsignedint cpus = min_t(unsignedint, num_online_cpus(), 8); unsignedint factor;
switch (sysctl_sched_tunable_scaling) { case SCHED_TUNABLESCALING_NONE:
factor = 1; break; case SCHED_TUNABLESCALING_LINEAR:
factor = cpus; break; case SCHED_TUNABLESCALING_LOG: default:
factor = 1 + ilog2(cpus); break;
}
/************************************************************** * CFS operations on generic schedulable entities:
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Walk up scheduling entities hierarchy (bottom-up towards the root tg). */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)
staticinlinebool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{ struct rq *rq = rq_of(cfs_rq); int cpu = cpu_of(rq);
if (cfs_rq->on_list) return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
cfs_rq->on_list = 1;
/* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is * enqueued. The fact that we always enqueue bottom-up * reduces this to two cases and a special case for the root * cfs_rq. Furthermore, it also means that we will always reset * tmp_alone_branch either when the branch is connected * to a tree or when we reach the top of the tree
*/ if (cfs_rq->tg->parent &&
cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { /* * If parent is already on the list, we add the child * just before. Thanks to circular linked property of * the list, this means to put the child at the tail * of the list that starts by parent.
*/
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); /* * The branch is now connected to its tree so we can * reset tmp_alone_branch to the beginning of the * list.
*/
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; returntrue;
}
if (!cfs_rq->tg->parent) { /* * cfs rq without parent should be put * at the tail of the list.
*/
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
&rq->leaf_cfs_rq_list); /* * We have reach the top of a tree so we can reset * tmp_alone_branch to the beginning of the list.
*/
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; returntrue;
}
/* * The parent has not already been added so we want to * make sure that it will be put after us. * tmp_alone_branch points to the begin of the branch * where we will add parent.
*/
list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch); /* * update tmp_alone_branch to points to the new begin * of the branch
*/
rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; returnfalse;
}
/* * With cfs_rq being unthrottled/throttled during an enqueue, * it can happen the tmp_alone_branch points to the leaf that * we finally want to delete. In this case, tmp_alone_branch moves * to the prev element but it will point to rq->leaf_cfs_rq_list * at the end of the enqueue.
*/ if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
/* Iterate through all leaf cfs_rq's on a runqueue (deletion-safe). */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
				 leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */ staticinlinestruct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{ if (se->cfs_rq == pse->cfs_rq) return se->cfs_rq;
/* * preemption test can be made between sibling entities who are in the * same cfs_rq i.e who have a common parent. Walk up the hierarchy of * both tasks until we find their ancestors who are siblings of common * parent.
*/
/* First walk up until both entities are at same depth */
se_depth = (*se)->depth;
pse_depth = (*pse)->depth;
while (se_depth > pse_depth) {
se_depth--;
*se = parent_entity(*se);
}
while (pse_depth > se_depth) {
pse_depth--;
*pse = parent_entity(*pse);
}
/* * Compute virtual time from the per-task service numbers: * * Fair schedulers conserve lag: * * \Sum lag_i = 0 * * Where lag_i is given by: * * lag_i = S - s_i = w_i * (V - v_i) * * Where S is the ideal service time and V is it's virtual time counterpart. * Therefore: * * \Sum lag_i = 0 * \Sum w_i * (V - v_i) = 0 * \Sum w_i * V - w_i * v_i = 0 * * From which we can solve an expression for V in v_i (which we have in * se->vruntime): * * \Sum v_i * w_i \Sum v_i * w_i * V = -------------- = -------------- * \Sum w_i W * * Specifically, this is the weighted average of all entity virtual runtimes. * * [[ NOTE: this is only equal to the ideal scheduler under the condition * that join/leave operations happen at lag_i = 0, otherwise the * virtual time has non-contiguous motion equivalent to: * * V +-= lag_i / W * * Also see the comment in place_entity() that deals with this. ]] * * However, since v_i is u64, and the multiplication could easily overflow * transform it into a relative form that uses smaller quantities: * * Substitute: v_i == (v_i - v0) + v0 * * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i * V = ---------------------------- = --------------------- + v0 * W W * * Which we track using: * * v0 := cfs_rq->min_vruntime * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime * \Sum w_i := cfs_rq->avg_load * * Since min_vruntime is a monotonic increasing variable that closely tracks * the per-task service, these deltas: (v_i - v), will be in the order of the * maximal (virtual) lag induced in the system due to quantisation. * * Also, we use scale_load_down() to reduce the size. * * As measured, the max (key * weight) value was ~44 bits for a kernel build.
*/ staticvoid
avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ unsignedlong weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
/* * Specifically: avg_runtime() + 0 must result in entity_eligible() := true * For this to be so, the result of this function must have a left bias.
*/
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{ struct sched_entity *curr = cfs_rq->curr;
s64 avg = cfs_rq->avg_vruntime; long load = cfs_rq->avg_load;
if (curr && curr->on_rq) { unsignedlong weight = scale_load_down(curr->load.weight);
/* * lag_i = S - s_i = w_i * (V - v_i) * * However, since V is approximated by the weighted average of all entities it * is possible -- by addition/removal/reweight to the tree -- to move V around * and end up with a larger lag than we started with. * * Limit this to either double the slice length with a minimum of TICK_NSEC * since that is the timing granularity. * * EEVDF gives the following limit for a steady state system: * * -r_max < lag < max(r_max, q) * * XXX could add max_slice to the augmented data to track this.
*/ staticvoid update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
s64 vlag, limit;
/* * Entity is eligible once it received less service than it ought to have, * eg. lag >= 0. * * lag_i = S - s_i = w_i*(V - v_i) * * lag_i >= 0 -> V >= v_i * * \Sum (v_i - v)*w_i * V = ------------------ + v * \Sum w_i * * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) * * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due * to the loss in precision caused by the division.
*/ staticint vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{ struct sched_entity *curr = cfs_rq->curr;
s64 avg = cfs_rq->avg_vruntime; long load = cfs_rq->avg_load;
if (curr && curr->on_rq) { unsignedlong weight = scale_load_down(curr->load.weight);
/* * Set the vruntime up to which an entity can run before looking * for another entity to pick. * In case of run to parity, we use the shortest slice of the enqueued * entities to set the protected period. * When run to parity is disabled, we give a minimum quantum to the running * entity to ensure progress.
*/ staticinlinevoid set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u64 slice = normalized_sysctl_sched_base_slice;
u64 vprot = se->deadline;
if (sched_feat(RUN_TO_PARITY))
slice = cfs_rq_min_slice(cfs_rq);
/* * Earliest Eligible Virtual Deadline First * * In order to provide latency guarantees for different request sizes * EEVDF selects the best runnable task from two criteria: * * 1) the task must be eligible (must be owed service) * * 2) from those tasks that meet 1), we select the one * with the earliest virtual deadline. * * We can do this in O(log n) time due to an augmented RB-tree. The * tree keeps the entries sorted on deadline, but also functions as a * heap based on the vruntime by keeping: * * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime) * * Which allows tree pruning through eligibility.
*/ staticstruct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
{ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; struct sched_entity *best = NULL;
/* * We can safely skip eligibility check if there is only one entity * in this cfs_rq, saving some cycles.
*/ if (cfs_rq->nr_queued == 1) return curr && curr->on_rq ? curr : se;
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
if (curr && protect && protect_slice(curr)) return curr;
/* Pick the leftmost entity if it's eligible */ if (se && entity_eligible(cfs_rq, se)) {
best = se; goto found;
}
/* Heap search for the EEVD entity */ while (node) { struct rb_node *left = node->rb_left;
/* * Eligible entities in left subtree are always better * choices, since they have earlier deadlines.
*/ if (left && vruntime_eligible(cfs_rq,
__node_2_se(left)->min_vruntime)) {
node = left; continue;
}
se = __node_2_se(node);
/* * The left subtree either is empty or has no eligible * entity, so check the current node since it is the one * with earliest deadline that might be eligible.
*/ if (entity_eligible(cfs_rq, se)) {
best = se; break;
}
node = node->rb_right;
}
found: if (!best || (curr && entity_before(curr, best)))
best = curr;
/* * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i * this is probably good enough.
*/ staticbool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{ if ((s64)(se->vruntime - se->deadline) < 0) returnfalse;
/* * For EEVDF the virtual time slope is determined by w_i (iow. * nice) while the request time r_i is determined by * sysctl_sched_base_slice.
*/ if (!se->custom_slice)
se->slice = sysctl_sched_base_slice;
/* * The task has consumed its request, reschedule.
*/ returntrue;
}
#include"pelt.h"
staticint select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); staticunsignedlong task_h_load(struct task_struct *p); staticunsignedlong capacity_of(int cpu);
/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
{
	struct sched_avg *sa = &se->avg;

	memset(sa, 0, sizeof(*sa));

	/*
	 * Tasks are initialized with full load to be seen as heavy tasks until
	 * they get a chance to stabilize to their real load level.
	 * Group entities are initialized with zero load to reflect the fact that
	 * nothing has been attached to the task group yet.
	 */
	if (entity_is_task(se))
		sa->load_avg = scale_load_down(se->load.weight);

	/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
/* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: * * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1) * * se_weight(se) * * However, in many cases, the above util_avg does not give a desired * value. Moreover, the sum of the util_avgs may be divergent, such * as when the series is a harmonic series. * * To solve this problem, we also cap the util_avg of successive tasks to * only 1/2 of the left utilization budget: * * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n * * where n denotes the nth task and cpu_scale the CPU capacity. * * For example, for a CPU with 1024 of capacity, a simplest series from * the beginning would be like: * * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... * * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) * if util_avg > util_avg_cap.
*/ void post_init_entity_util_avg(struct task_struct *p)
{ struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
if (p->sched_class != &fair_sched_class) { /* * For !fair tasks do: * update_cfs_rq_load_avg(now, cfs_rq); attach_entity_load_avg(cfs_rq, se); switched_from_fair(rq, p); * * such that the next switched_to_fair() has the * expected state.
*/
se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); return;
}
if (cap > 0) { if (cfs_rq->avg.util_avg != 0) {
sa->util_avg = cfs_rq->avg.util_avg * se_weight(se);
sa->util_avg /= (cfs_rq->avg.load_avg + 1);
delta_exec = now - se->exec_start; if (unlikely(delta_exec <= 0)) return delta_exec;
se->exec_start = now; if (entity_is_task(se)) { struct task_struct *donor = task_of(se); struct task_struct *running = rq->curr; /* * If se is a task, we account the time against the running * task, as w/ proxy-exec they may not be the same.
*/
running->se.exec_start = now;
running->se.sum_exec_runtime += delta_exec;
/* cgroup time is always accounted against the donor */
cgroup_account_cputime(donor, delta_exec);
} else { /* If not task, account the time against donor se */
se->sum_exec_runtime += delta_exec;
}
if (schedstat_enabled()) { struct sched_statistics *stats;
/*
 * Used by other classes to account runtime.
 *
 * Returns the execution-time delta accounted to the donor entity;
 * values <= 0 mean no time has elapsed.
 */
s64 update_curr_common(struct rq *rq)
{ return update_se(rq, &rq->donor->se);
}
/* * Update the current task's runtime statistics.
*/ staticvoid update_curr(struct cfs_rq *cfs_rq)
{ /* * Note: cfs_rq->curr corresponds to the task picked to * run (ie: rq->donor.se) which due to proxy-exec may * not necessarily be the actual task running * (rq->curr.se). This is easy to confuse!
*/ struct sched_entity *curr = cfs_rq->curr; struct rq *rq = rq_of(cfs_rq);
s64 delta_exec; bool resched;
if (unlikely(!curr)) return;
delta_exec = update_se(rq, curr); if (unlikely(delta_exec <= 0)) return;
if (entity_is_task(curr)) { /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on * behalf of fair_server or not: * - If the task is running on behalf of fair_server, we need * to limit its time based on the assigned runtime. * - Fair task that runs outside of fair_server should account * against fair_server such that it can account for this time * and possibly avoid running this period.
*/ if (dl_server_active(&rq->fair_server))
dl_server_update(&rq->fair_server, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
if (cfs_rq->nr_queued == 1) return;
if (resched || !protect_slice(curr)) {
resched_curr_lazy(rq);
clear_buddies(cfs_rq, curr);
}
}
/* * When the sched_schedstat changes from 0 to 1, some sched se * maybe already in the runqueue, the se->statistics.wait_start * will be 0.So it will let the delta wrong. We need to avoid this * scenario.
*/ if (unlikely(!schedstat_val(stats->wait_start))) return;
/*
 * Task is being enqueued - update stats:
 */
static inline void
update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	if (!schedstat_enabled())
		return;

	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start_fair(cfs_rq, se);

	if (flags & ENQUEUE_WAKEUP)
		update_stats_enqueue_sleeper_fair(cfs_rq, se);
}
staticinlinevoid
update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
if (!schedstat_enabled()) return;
/* * Mark the end of the wait period if dequeueing a * waiting task:
*/ if (se != cfs_rq->curr)
update_stats_wait_end_fair(cfs_rq, se);
/* XXX racy against TTWU */
state = READ_ONCE(tsk->__state); if (state & TASK_INTERRUPTIBLE)
__schedstat_set(tsk->stats.sleep_start,
rq_clock(rq_of(cfs_rq))); if (state & TASK_UNINTERRUPTIBLE)
__schedstat_set(tsk->stats.block_start,
rq_clock(rq_of(cfs_rq)));
}
}
/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_clock_task(rq_of(cfs_rq));
}
/************************************************** * Scheduling class queueing methods:
*/
staticinlinebool is_core_idle(int cpu)
{ #ifdef CONFIG_SCHED_SMT int sibling;
for_each_cpu(sibling, cpu_smt_mask(cpu)) { if (cpu == sibling) continue;
if (!idle_cpu(sibling)) returnfalse;
} #endif
returntrue;
}
#ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2

static inline long
adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
{
	/*
	 * Allow a NUMA imbalance if busy CPUs is less than the maximum
	 * threshold. Above this threshold, individual tasks may be contending
	 * for both memory bandwidth and any shared HT resources. This is an
	 * approximation as the number of running tasks may not be related to
	 * the number of busy CPUs due to sched_setaffinity.
	 */
	if (dst_running > imb_numa_nr)
		return imbalance;

	/*
	 * Allow a small imbalance based on a simple pair of communicating
	 * tasks that remain local when the destination is lightly loaded.
	 */
	if (imbalance <= NUMA_IMBALANCE_MIN)
		return 0;

	return imbalance;
}
#endif /* CONFIG_NUMA */
#ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the tasks virtual memory size and
 * numa_balancing_scan_size.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

/* The page with hint page fault latency < threshold in ms is considered hot */
unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
/* Shared fault statistics for a group of tasks accessing the same memory. */
struct numa_group {
	refcount_t refcount;

	spinlock_t lock; /* nr_tasks, tasks */
	int nr_tasks;
	pid_t gid;
	int active_nodes;

	struct rcu_head rcu;
	unsigned long total_faults;
	unsigned long max_faults_cpu;
	/*
	 * faults[] array is split into two regions: faults_mem and faults_cpu.
	 *
	 * Faults_cpu is used to decide whether memory should move
	 * towards the CPU. As a consequence, these stats are weighted
	 * more by CPU use than by memory faults.
	 */
	unsigned long faults[];
};
/* * For functions that can be called in multiple contexts that permit reading * ->numa_group (see struct task_struct for locking rules).
*/ staticstruct numa_group *deref_task_numa_group(struct task_struct *p)
{ return rcu_dereference_check(p->numa_group, p == current ||
(lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
}
/* * Calculations based on RSS as non-present and empty pages are skipped * by the PTE scanner and NUMA hinting faults should be trapped based * on resident pages
*/
nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
rss = get_mm_rss(p->mm); if (!rss)
rss = nr_scan_pages;
/* Scale the maximum scan period with the amount of shared memory. */
rcu_read_lock();
ng = rcu_dereference(p->numa_group); if (ng) { unsignedlong shared = group_faults_shared(ng); unsignedlongprivate = group_faults_priv(ng);
period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
}
rcu_read_unlock();
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
/* Scale the maximum scan period with the amount of shared memory. */
ng = deref_curr_numa_group(p); if (ng) { unsignedlong shared = group_faults_shared(ng); unsignedlongprivate = group_faults_priv(ng); unsignedlong period = smax;
period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
rcu_read_lock();
ng = rcu_dereference(p->numa_group); if (ng)
gid = ng->gid;
rcu_read_unlock();
return gid;
}
/*
 * The averaged statistics, shared & private, memory & CPU,
 * occupy the first half of the array. The second half of the
 * array is for current counters, which are averaged into the
 * first set by task_numa_placement.
 */
static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}
staticinlineunsignedlong task_faults(struct task_struct *p, int nid)
{ if (!p->numa_faults) return 0;
/*
 * A node triggering more than 1/3 as many NUMA faults as the maximum is
 * considered part of a numa group's pseudo-interleaving set. Migrations
 * between these nodes are slowed down, to allow things to settle down.
 */
#define ACTIVE_NODE_FRACTION 3
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
					int lim_dist, bool task)
{
	unsigned long score = 0;
	int node, max_dist;

	/*
	 * All nodes are directly connected, and the same distance
	 * from each other. No need for fancy placement algorithms.
	 */
	if (sched_numa_topology_type == NUMA_DIRECT)
		return 0;

	/* sched_max_numa_distance may be changed in parallel. */
	max_dist = READ_ONCE(sched_max_numa_distance);
	/*
	 * This code is called for each node, introducing N^2 complexity,
	 * which should be OK given the number of nodes rarely exceeds 8.
	 */
	for_each_online_node(node) {
		unsigned long faults;
		int dist = node_distance(nid, node);

		/*
		 * The furthest away nodes in the system are not interesting
		 * for placement; nid was already counted.
		 */
		if (dist >= max_dist || node == nid)
			continue;

		/*
		 * On systems with a backplane NUMA topology, compare groups
		 * of nodes, and move tasks towards the group with the most
		 * memory accesses. When comparing two nodes at distance
		 * "hoplimit", only nodes closer by than "hoplimit" are part
		 * of each group. Skip other nodes.
		 */
		if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
			continue;

		/* Add up the faults from nearby nodes. */
		if (task)
			faults = task_faults(p, node);
		else
			faults = group_faults(p, node);

		/*
		 * On systems with a glueless mesh NUMA topology, there are
		 * no fixed "groups of nodes". Instead, nodes that are not
		 * directly connected bounce traffic through intermediate
		 * nodes; a numa_group can occupy any set of nodes.
		 * The further away a node is, the less the faults count.
		 * This seems to result in good task placement.
		 */
		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
			faults *= (max_dist - dist);
			faults /= (max_dist - LOCAL_DISTANCE);
		}

		score += faults;
	}

	return score;
}
/* * These return the fraction of accesses done by a particular task, or * task group, on a particular numa node. The group weight is given a * larger multiplier, in order to group tasks together that are almost * evenly spread out between numa nodes.
*/ staticinlineunsignedlong task_weight(struct task_struct *p, int nid, int dist)
{ unsignedlong faults, total_faults;
/* * If memory tiering mode is enabled, cpupid of slow memory page is * used to record scan time instead of CPU and PID. When tiering mode * is disabled at run time, the scan time (in cpupid) will be * interpreted as CPU and PID. So CPU needs to be checked to avoid to * access out of array bound.
*/ staticinlinebool cpupid_valid(int cpupid)
{ return cpupid_to_cpu(cpupid) < nr_cpu_ids;
}
/* * For memory tiering mode, if there are enough free pages (more than * enough watermark defined here) in fast memory node, to take full * advantage of fast memory capacity, all recently accessed slow * memory pages will be migrated to fast memory node without * considering hot threshold.
*/ staticbool pgdat_free_space_enough(struct pglist_data *pgdat)
{ int z; unsignedlong enough_wmark;
enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
pgdat->node_present_pages >> 4); for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z;
/* * For memory tiering mode, when page tables are scanned, the scan * time will be recorded in struct page in addition to make page * PROT_NONE for slow memory page. So when the page is accessed, in * hint page fault handler, the hint page fault latency is calculated * via, * * hint page fault latency = hint page fault time - scan time * * The smaller the hint page fault latency, the higher the possibility * for the page to be hot.
*/ staticint numa_hint_fault_latency(struct folio *folio)
{ int last_time, time;
time = jiffies_to_msecs(jiffies);
last_time = folio_xchg_access_time(folio, time);
/* * For memory tiering mode, too high promotion/demotion throughput may * hurt application latency. So we provide a mechanism to rate limit * the number of pages that are tried to be promoted.
*/ staticbool numa_promotion_rate_limit(struct pglist_data *pgdat, unsignedlong rate_limit, int nr)
{ unsignedlong nr_cand; unsignedint now, start;
bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, int src_nid, int dst_cpu)
{ struct numa_group *ng = deref_curr_numa_group(p); int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid;
/* * Cannot migrate to memoryless nodes.
*/ if (!node_state(dst_nid, N_MEMORY)) returnfalse;
/* * The pages in slow memory node should be migrated according * to hot/cold instead of private/shared.
*/ if (folio_use_access_time(folio)) { struct pglist_data *pgdat; unsignedlong rate_limit; unsignedint latency, th, def_th;
pgdat = NODE_DATA(dst_nid); if (pgdat_free_space_enough(pgdat)) { /* workload changed, reset hot threshold */
pgdat->nbp_threshold = 0; returntrue;
}
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
!node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) returnfalse;
/* * Allow first faults or private faults to migrate immediately early in * the lifetime of a task. The magic number 4 is based on waiting for * two full passes of the "multi-stage node selection" test that is * executed below.
*/ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) returntrue;
/* * Multi-stage node selection is used in conjunction with a periodic * migration fault to build a temporal task<->page relation. By using * a two-stage filter we remove short/unlikely relations. * * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate * a task's usage of a particular page (n_p) per total usage of this * page (n_t) (in a given time-span) to a probability. * * Our periodic faults will sample this probability and getting the * same result twice in a row, given these samples are fully * independent, is then given by P(n)^2, provided our sample period * is sufficiently short compared to the usage pattern. * * This quadric squishes small probabilities, making it less likely we * act on an unlikely task<->page relation.
*/ if (!cpupid_pid_unset(last_cpupid) &&
cpupid_to_nid(last_cpupid) != dst_nid) returnfalse;
/* Always allow migrate on private faults */ if (cpupid_match_pid(p, last_cpupid)) returntrue;
/* A shared fault, but p->numa_group has not been set up yet. */ if (!ng) returntrue;
/* * Destination node is much more heavily used than the source * node? Allow migration.
*/ if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
ACTIVE_NODE_FRACTION) returntrue;
/* * Distribute memory according to CPU & memory use on each node, * with 3/4 hysteresis to avoid unnecessary memory migrations: * * faults_cpu(dst) 3 faults_cpu(src) * --------------- * - > --------------- * faults_mem(dst) 4 faults_mem(src)
*/ return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
/*
 * 'numa_type' describes the node at the moment of load balancing.
 */
enum numa_type {
	/* The node has spare capacity that can be used to run more tasks. */
	node_has_spare = 0,
	/*
	 * The node is fully used and the tasks don't compete for more CPU
	 * cycles. Nevertheless, some tasks might wait before running.
	 */
	node_fully_busy,
	/*
	 * The node is overloaded and can't provide expected CPU cycles to all
	 * tasks.
	 */
	node_overloaded
};
/* Cached statistics for all CPUs within a node */
struct numa_stats {
	unsigned long load;
	unsigned long runnable;
	unsigned long util;
	/* Total compute capacity of CPUs on a node */
	unsigned long compute_capacity;
	unsigned int nr_running;
	unsigned int weight;
	enum numa_type node_type;
	int idle_cpu;
};
/* Working state for one NUMA-balancing placement decision. */
struct task_numa_env {
	struct task_struct *p;		/* task being placed */

	int src_cpu, src_nid;		/* source of the move */
	int dst_cpu, dst_nid;		/* candidate destination */
	int imb_numa_nr;		/* allowed NUMA imbalance */

	struct numa_stats src_stats, dst_stats;

	int imbalance_pct;
	int dist;			/* node distance src<->dst */

	struct task_struct *best_task;	/* best swap candidate so far */
	long best_imp;			/* improvement of best candidate */
	int best_cpu;
};
/* * Gather all necessary information to make NUMA balancing placement * decisions that are compatible with standard load balancer. This * borrows code and logic from update_sg_lb_stats but sharing a * common implementation is impractical.
*/ staticvoid update_numa_stats(struct task_numa_env *env, struct numa_stats *ns, int nid, bool find_idle)
{ int cpu, idle_core = -1;
/* Check if run-queue part of active NUMA balance. */ if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) { int cpu; int start = env->dst_cpu;
/* Failed to find an alternative idle CPU */ return;
}
assign: /* * Clear previous best_cpu/rq numa-migrate flag, since task now * found a better CPU to move/swap.
*/ if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
rq = cpu_rq(env->best_cpu);
WRITE_ONCE(rq->numa_migrate_on, 0);
}
if (env->best_task)
put_task_struct(env->best_task); if (p)
get_task_struct(p);
staticbool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env)
{ long imb, old_imb; long orig_src_load, orig_dst_load; long src_capacity, dst_capacity;
/* * The load is corrected for the CPU capacity available on each node. * * src_load dst_load * ------------ vs --------- * src_capacity dst_capacity
*/
src_capacity = env->src_stats.compute_capacity;
dst_capacity = env->dst_stats.compute_capacity;
/* Would this change make things worse? */ return (imb > old_imb);
}
/*
 * NUMA importance scores run up to 1998 (2 * 999). SMALLIMP is roughly
 * 1998/64; importance differentials below it are treated as noise so that
 * tasks are not migrated for negligible gain.
 */
#define SMALLIMP	30
/*
 * This checks if the overall compute and NUMA accesses of the system would
 * be improved if the source task was migrated to the target dst_cpu, taking
 * into account that it might be best if the task running on the dst_cpu
 * should be exchanged with the source task.
 *
 * Runs under RCU to safely inspect dst_rq->curr. On a successful candidate,
 * records it via task_numa_assign(). Returns true when the caller should
 * stop scanning further candidate CPUs.
 *
 * NOTE(review): the extracted text called load_too_imbalanced(src_load,
 * dst_load, ...) without ever assigning src_load/dst_load (use of
 * uninitialized locals). The post-move load projection has been restored
 * just before that call.
 */
static bool task_numa_compare(struct task_numa_env *env,
			      long taskimp, long groupimp, bool maymove)
{
	struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
	struct rq *dst_rq = cpu_rq(env->dst_cpu);
	long imp = p_ng ? groupimp : taskimp;
	struct task_struct *cur;
	long src_load, dst_load;
	int dist = env->dist;
	long moveimp = imp;
	long load;
	bool stopsearch = false;

	/* The destination run-queue is already part of an active migration. */
	if (READ_ONCE(dst_rq->numa_migrate_on))
		return false;

	rcu_read_lock();
	cur = rcu_dereference(dst_rq->curr);
	/* Exiting tasks and kernel threads are not swap candidates. */
	if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) || !cur->mm))
		cur = NULL;

	/*
	 * Because we have preemption enabled we can get migrated around and
	 * end try selecting ourselves (current == env->p) as a swap candidate.
	 */
	if (cur == env->p) {
		stopsearch = true;
		goto unlock;
	}

	/* Destination CPU is idle: a plain move, no swap partner needed. */
	if (!cur) {
		if (maymove && moveimp >= env->best_imp)
			goto assign;
		else
			goto unlock;
	}

	/* Skip this swap candidate if cannot move to the source cpu. */
	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
		goto unlock;

	/*
	 * Skip this swap candidate if it is not moving to its preferred
	 * node and the best task is.
	 */
	if (env->best_task &&
	    env->best_task->numa_preferred_nid == env->src_nid &&
	    cur->numa_preferred_nid != env->src_nid) {
		goto unlock;
	}

	/*
	 * "imp" is the fault differential for the source task between the
	 * source and destination node. Calculate the total differential for
	 * the source task and potential destination task. The more negative
	 * the value is, the more remote accesses that would be expected to
	 * be incurred if the tasks were swapped.
	 *
	 * If dst and source tasks are in the same NUMA group, or not
	 * in any group then look only at task weights.
	 */
	cur_ng = rcu_dereference(cur->numa_group);
	if (cur_ng == p_ng) {
		/*
		 * Do not swap within a group or between tasks that have
		 * no group if there is spare capacity. Swapping does
		 * not address the load imbalance and helps one task at
		 * the cost of punishing another.
		 */
		if (env->dst_stats.node_type == node_has_spare)
			goto unlock;

		imp = taskimp + task_weight(cur, env->src_nid, dist) -
		      task_weight(cur, env->dst_nid, dist);
		/*
		 * Add some hysteresis to prevent swapping the
		 * tasks within a group over tiny differences.
		 */
		if (cur_ng)
			imp -= imp / 16;
	} else {
		/*
		 * Compare the group weights. If a task is all by itself
		 * (not part of a group), use the task weight instead.
		 */
		if (cur_ng && p_ng)
			imp += group_weight(cur, env->src_nid, dist) -
			       group_weight(cur, env->dst_nid, dist);
		else
			imp += task_weight(cur, env->src_nid, dist) -
			       task_weight(cur, env->dst_nid, dist);
	}

	/* Discourage picking a task already on its preferred node */
	if (cur->numa_preferred_nid == env->dst_nid)
		imp -= imp / 16;

	/*
	 * Encourage picking a task that moves to its preferred node.
	 * This potentially makes imp larger than it's maximum of
	 * 1998 (see SMALLIMP and task_weight for why) but in this
	 * case, it does not matter.
	 */
	if (cur->numa_preferred_nid == env->src_nid)
		imp += imp / 8;

	/* A plain move beats this swap candidate: record it as a move. */
	if (maymove && moveimp > imp && moveimp > env->best_imp) {
		imp = moveimp;
		cur = NULL;
		goto assign;
	}

	/*
	 * Prefer swapping with a task moving to its preferred node over a
	 * task that is not.
	 */
	if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
	    env->best_task->numa_preferred_nid != env->src_nid) {
		goto assign;
	}

	/*
	 * If the NUMA importance is less than SMALLIMP,
	 * task migration might only result in ping pong
	 * of tasks and also hurt performance due to cache
	 * misses.
	 */
	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
		goto unlock;

	/*
	 * In the overloaded case, try and keep the load balanced.
	 */
	load = task_h_load(env->p) - task_h_load(cur);
	if (!load)
		goto assign;

	/* Project the per-node loads as they would be after the swap. */
	dst_load = env->dst_stats.load + load;
	src_load = env->src_stats.load - load;

	if (load_too_imbalanced(src_load, dst_load, env))
		goto unlock;

assign:
	/* Evaluate an idle CPU for a task numa move. */
	if (!cur) {
		int cpu = env->dst_stats.idle_cpu;

		/* Nothing cached so current CPU went idle since the search. */
		if (cpu < 0)
			cpu = env->dst_cpu;

		/*
		 * If the CPU is no longer truly idle and the previous best CPU
		 * is, keep using it.
		 */
		if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
		    idle_cpu(env->best_cpu)) {
			cpu = env->best_cpu;
		}

		env->dst_cpu = cpu;
	}

	task_numa_assign(env, cur, imp);

	/*
	 * If a move to idle is allowed because there is capacity or load
	 * balance improves then stop the search. While a better swap
	 * candidate may exist, a search is not free.
	 */
	if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
		stopsearch = true;

	/*
	 * If a swap candidate must be identified and the current best task
	 * moves its preferred node then stop the search.
	 */
	if (!maymove && env->best_task &&
	    env->best_task->numa_preferred_nid == env->src_nid) {
		stopsearch = true;
	}

unlock:
	rcu_read_unlock();

	return stopsearch;
}
staticvoid task_numa_find_cpu(struct task_numa_env *env, long taskimp, long groupimp)
{ bool maymove = false; int cpu;
/* * If dst node has spare capacity, then check if there is an * imbalance that would be overruled by the load balancer.
*/ if (env->dst_stats.node_type == node_has_spare) { unsignedint imbalance; int src_running, dst_running;
/* * Would movement cause an imbalance? Note that if src has * more running tasks that the imbalance is ignored as the * move improves the imbalance from the perspective of the * CPU load balancer.
* */
src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
imbalance = adjust_numa_imbalance(imbalance, dst_running,
env->imb_numa_nr);
/* Use idle CPU if there is no imbalance */ if (!imbalance) {
maymove = true; if (env->dst_stats.idle_cpu >= 0) {
env->dst_cpu = env->dst_stats.idle_cpu;
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Diese beiden folgenden Angebotsgruppen bietet das Unternehmen0.34Angebot
Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.