Quelle memcontrol.c Sprache: C

// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
*
* Copyright IBM Corporation, 2007
* Author Balbir Singh <balbir@linux.vnet.ibm.com>
*
* Copyright 2007 OpenVZ SWsoft Inc
* Author: Pavel Emelianov <xemul@openvz.org>
*
* Memory thresholds
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
* Kernel Memory Controller
* Copyright (C) 2012 Parallels Inc. and Google Inc.
* Authors: Glauber Costa and Suleiman Souhlal
*
* Native page reclaim
* Charge lifetime sanitation
* Lockless page tracking & accounting
* Unified hierarchy configuration model
* Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
*
* Per memcg lru locking
* Copyright (C) 2020 Alibaba, Inc, Alex Shi
*/

#include <linux/cgroup-defs.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/memremap.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include <linux/sched/isolation.h>
#include <linux/kmemleak.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
#include "memcontrol-v1.h"

#include <linux/uaccess.h>

#define CREATE_TRACE_POINTS
#include <trace/events/memcg.h>
#undef CREATE_TRACE_POINTS

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket __ro_after_init;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;

/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;

static struct kmem_cache *memcg_cachep;
static struct kmem_cache *memcg_pn_cachep;

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

static inline bool task_is_dying(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
  (current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
if (!memcg)
  memcg = root_mem_cgroup;
return &memcg->vmpressure;
}

struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
return container_of(vmpr, struct mem_cgroup, vmpressure);
}

#define SEQ_BUF_SIZE SZ_4K
#define CURRENT_OBJCG_UPDATE_BIT 0
#define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT)

static DEFINE_SPINLOCK(objcg_lock);

bool mem_cgroup_kmem_disabled(void)
{
return cgroup_memory_nokmem;
}

static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);

static void obj_cgroup_release(struct percpu_ref *ref)
{
struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
unsigned int nr_bytes;
unsigned int nr_pages;
unsigned long flags;

/*
* At this point all allocated objects are freed, and
* objcg->nr_charged_bytes can't have an arbitrary byte value.
* However, it can be PAGE_SIZE or (x * PAGE_SIZE).
*
* The following sequence can lead to it:
* 1) CPU0: objcg == stock->cached_objcg
* 2) CPU1: we do a small allocation (e.g. 92 bytes),
*          PAGE_SIZE bytes are charged
* 3) CPU1: a process from another memcg is allocating something,
*          the stock if flushed,
*          objcg->nr_charged_bytes = PAGE_SIZE - 92
* 5) CPU0: we do release this object,
*          92 bytes are added to stock->nr_bytes
* 6) CPU0: stock is flushed,
*          92 bytes are added to objcg->nr_charged_bytes
*
* In the result, nr_charged_bytes == PAGE_SIZE.
* This page will be uncharged in obj_cgroup_release().
*/
nr_bytes = atomic_read(&objcg->nr_charged_bytes);
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
nr_pages = nr_bytes >> PAGE_SHIFT;

if (nr_pages) {
  struct mem_cgroup *memcg;

  memcg = get_mem_cgroup_from_objcg(objcg);
  mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
  memcg1_account_kmem(memcg, -nr_pages);
  if (!mem_cgroup_is_root(memcg))
   memcg_uncharge(memcg, nr_pages);
  mem_cgroup_put(memcg);
}

spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
spin_unlock_irqrestore(&objcg_lock, flags);

percpu_ref_exit(ref);
kfree_rcu(objcg, rcu);
}

static struct obj_cgroup *obj_cgroup_alloc(void)
{
struct obj_cgroup *objcg;
int ret;

objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
if (!objcg)
  return NULL;

ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
         GFP_KERNEL);
if (ret) {
  kfree(objcg);
  return NULL;
}
INIT_LIST_HEAD(&objcg->list);
return objcg;
}

static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
      struct mem_cgroup *parent)
{
struct obj_cgroup *objcg, *iter;

objcg = rcu_replace_pointer(memcg->objcg, NULL, true);

spin_lock_irq(&objcg_lock);

/* 1) Ready to reparent active objcg. */
list_add(&objcg->list, &memcg->objcg_list);
/* 2) Reparent active objcg and already reparented objcgs to parent. */
list_for_each_entry(iter, &memcg->objcg_list, list)
  WRITE_ONCE(iter->memcg, parent);
/* 3) Move already reparented objcgs to the parent's list */
list_splice(&memcg->objcg_list, &parent->objcg_list);

spin_unlock_irq(&objcg_lock);

percpu_ref_kill(&objcg->refcnt);
}

/*
* A lot of the calls to the cache allocation functions are expected to be
* inlined by the compiler. Since the calls to memcg_slab_post_alloc_hook() are
* conditional to this static branch, we'll have to allow modules that does
* kmem_cache_alloc and the such to see this symbol as well
*/
DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key);
EXPORT_SYMBOL(memcg_kmem_online_key);

DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
EXPORT_SYMBOL(memcg_bpf_enabled_key);

/**
* mem_cgroup_css_from_folio - css of the memcg associated with a folio
* @folio: folio of interest
*
* If memcg is bound to the default hierarchy, css of the memcg associated
* with @folio is returned.  The returned css remains associated with @folio
* until it is released.
*
* If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
* is returned.
*/
struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
{
struct mem_cgroup *memcg = folio_memcg(folio);

if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
  memcg = root_mem_cgroup;

return &memcg->css;
}

/**
* page_cgroup_ino - return inode number of the memcg a page is charged to
* @page: the page
*
* Look up the closest online ancestor of the memory cgroup @page is charged to
* and return its inode number or 0 if @page is not charged to any cgroup. It
* is safe to call this function without holding a reference to @page.
*
* Note, this function is inherently racy, because there is nothing to prevent
* the cgroup inode from getting torn down and potentially reallocated a moment
* after page_cgroup_ino() returns, so it only should be used by callers that
* do not care (such as procfs interfaces).
*/
ino_t page_cgroup_ino(struct page *page)
{
struct mem_cgroup *memcg;
unsigned long ino = 0;

rcu_read_lock();
/* page_folio() is racy here, but the entire function is racy anyway */
memcg = folio_memcg_check(page_folio(page));

while (memcg && !(memcg->css.flags & CSS_ONLINE))
  memcg = parent_mem_cgroup(memcg);
if (memcg)
  ino = cgroup_ino(memcg->css.cgroup);
rcu_read_unlock();
return ino;
}

/* Subset of node_stat_item for memcg stats */
static const unsigned int memcg_node_stat_items[] = {
NR_INACTIVE_ANON,
NR_ACTIVE_ANON,
NR_INACTIVE_FILE,
NR_ACTIVE_FILE,
NR_UNEVICTABLE,
NR_SLAB_RECLAIMABLE_B,
NR_SLAB_UNRECLAIMABLE_B,
WORKINGSET_REFAULT_ANON,
WORKINGSET_REFAULT_FILE,
WORKINGSET_ACTIVATE_ANON,
WORKINGSET_ACTIVATE_FILE,
WORKINGSET_RESTORE_ANON,
WORKINGSET_RESTORE_FILE,
WORKINGSET_NODERECLAIM,
NR_ANON_MAPPED,
NR_FILE_MAPPED,
NR_FILE_PAGES,
NR_FILE_DIRTY,
NR_WRITEBACK,
NR_SHMEM,
NR_SHMEM_THPS,
NR_FILE_THPS,
NR_ANON_THPS,
NR_KERNEL_STACK_KB,
NR_PAGETABLE,
NR_SECONDARY_PAGETABLE,
#ifdef CONFIG_SWAP
NR_SWAPCACHE,
#endif
#ifdef CONFIG_NUMA_BALANCING
PGPROMOTE_SUCCESS,
#endif
PGDEMOTE_KSWAPD,
PGDEMOTE_DIRECT,
PGDEMOTE_KHUGEPAGED,
PGDEMOTE_PROACTIVE,
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
};

static const unsigned int memcg_stat_items[] = {
MEMCG_SWAP,
MEMCG_SOCK,
MEMCG_PERCPU_B,
MEMCG_VMALLOC,
MEMCG_KMEM,
MEMCG_ZSWAP_B,
MEMCG_ZSWAPPED,
};

#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
#define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \
      ARRAY_SIZE(memcg_stat_items))
#define BAD_STAT_IDX(index) ((u32)(index) >= U8_MAX)
static u8 mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;

static void init_memcg_stats(void)
{
u8 i, j = 0;

BUILD_BUG_ON(MEMCG_NR_STAT >= U8_MAX);

memset(mem_cgroup_stats_index, U8_MAX, sizeof(mem_cgroup_stats_index));

for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i, ++j)
  mem_cgroup_stats_index[memcg_node_stat_items[i]] = j;

for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i, ++j)
  mem_cgroup_stats_index[memcg_stat_items[i]] = j;
}

static inline int memcg_stats_index(int idx)
{
return mem_cgroup_stats_index[idx];
}

struct lruvec_stats_percpu {
/* Local (CPU and cgroup) state */
long state[NR_MEMCG_NODE_STAT_ITEMS];

/* Delta calculation for lockless upward propagation */
long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
};

struct lruvec_stats {
/* Aggregated (CPU and subtree) state */
long state[NR_MEMCG_NODE_STAT_ITEMS];

/* Non-hierarchical (CPU aggregated) state */
long state_local[NR_MEMCG_NODE_STAT_ITEMS];

/* Pending child counts during tree propagation */
long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
};

unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
{
struct mem_cgroup_per_node *pn;
long x;
int i;

if (mem_cgroup_disabled())
  return node_page_state(lruvec_pgdat(lruvec), idx);

i = memcg_stats_index(idx);
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
  return 0;

pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
x = READ_ONCE(pn->lruvec_stats->state[i]);
#ifdef CONFIG_SMP
if (x < 0)
  x = 0;
#endif
return x;
}

unsigned long lruvec_page_state_local(struct lruvec *lruvec,
          enum node_stat_item idx)
{
struct mem_cgroup_per_node *pn;
long x;
int i;

if (mem_cgroup_disabled())
  return node_page_state(lruvec_pgdat(lruvec), idx);

i = memcg_stats_index(idx);
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
  return 0;

pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
x = READ_ONCE(pn->lruvec_stats->state_local[i]);
#ifdef CONFIG_SMP
if (x < 0)
  x = 0;
#endif
return x;
}

/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
#ifdef CONFIG_MEMCG_V1
PGPGIN,
PGPGOUT,
#endif
PSWPIN,
PSWPOUT,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_KHUGEPAGED,
PGSCAN_PROACTIVE,
PGSTEAL_KSWAPD,
PGSTEAL_DIRECT,
PGSTEAL_KHUGEPAGED,
PGSTEAL_PROACTIVE,
PGFAULT,
PGMAJFAULT,
PGREFILL,
PGACTIVATE,
PGDEACTIVATE,
PGLAZYFREE,
PGLAZYFREED,
#ifdef CONFIG_SWAP
SWPIN_ZERO,
SWPOUT_ZERO,
#endif
#ifdef CONFIG_ZSWAP
ZSWPIN,
ZSWPOUT,
ZSWPWB,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
THP_FAULT_ALLOC,
THP_COLLAPSE_ALLOC,
THP_SWPOUT,
THP_SWPOUT_FALLBACK,
#endif
#ifdef CONFIG_NUMA_BALANCING
NUMA_PAGE_MIGRATE,
NUMA_PTE_UPDATES,
NUMA_HINT_FAULTS,
#endif
};

#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
static u8 mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;

static void init_memcg_events(void)
{
u8 i;

BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= U8_MAX);

memset(mem_cgroup_events_index, U8_MAX,
        sizeof(mem_cgroup_events_index));

for (i = 0; i < NR_MEMCG_EVENTS; ++i)
  mem_cgroup_events_index[memcg_vm_event_stat[i]] = i;
}

static inline int memcg_events_index(enum vm_event_item idx)
{
return mem_cgroup_events_index[idx];
}

struct memcg_vmstats_percpu {
/* Stats updates since the last flush */
unsigned int   stats_updates;

/* Cached pointers for fast iteration in memcg_rstat_updated() */
struct memcg_vmstats_percpu __percpu *parent_pcpu;
struct memcg_vmstats   *vmstats;

/* The above should fit a single cacheline for memcg_rstat_updated() */

/* Local (CPU and cgroup) page state & events */
long   state[MEMCG_VMSTAT_SIZE];
unsigned long  events[NR_MEMCG_EVENTS];

/* Delta calculation for lockless upward propagation */
long   state_prev[MEMCG_VMSTAT_SIZE];
unsigned long  events_prev[NR_MEMCG_EVENTS];
} ____cacheline_aligned;

struct memcg_vmstats {
/* Aggregated (CPU and subtree) page state & events */
long   state[MEMCG_VMSTAT_SIZE];
unsigned long  events[NR_MEMCG_EVENTS];

/* Non-hierarchical (CPU aggregated) page state & events */
long   state_local[MEMCG_VMSTAT_SIZE];
unsigned long  events_local[NR_MEMCG_EVENTS];

/* Pending child counts during tree propagation */
long   state_pending[MEMCG_VMSTAT_SIZE];
unsigned long  events_pending[NR_MEMCG_EVENTS];

/* Stats updates since the last flush */
atomic_t  stats_updates;
};

/*
* memcg and lruvec stats flushing
*
* Many codepaths leading to stats update or read are performance sensitive and
* adding stats flushing in such codepaths is not desirable. So, to optimize the
* flushing the kernel does:
*
* 1) Periodically and asynchronously flush the stats every 2 seconds to not let
*    rstat update tree grow unbounded.
*
* 2) Flush the stats synchronously on reader side only when there are more than
*    (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization
*    will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but
*    only for 2 seconds due to (1).
*/
static void flush_memcg_stats_dwork(struct work_struct *w);
static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
static u64 flush_last_time;

#define FLUSH_TIME (2UL*HZ)

static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
return atomic_read(&vmstats->stats_updates) >
  MEMCG_CHARGE_BATCH * num_online_cpus();
}

static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
           int cpu)
{
struct memcg_vmstats_percpu __percpu *statc_pcpu;
struct memcg_vmstats_percpu *statc;
unsigned int stats_updates;

if (!val)
  return;

css_rstat_updated(&memcg->css, cpu);
statc_pcpu = memcg->vmstats_percpu;
for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) {
  statc = this_cpu_ptr(statc_pcpu);
  /*
* If @memcg is already flushable then all its ancestors are
* flushable as well and also there is no need to increase
* stats_updates.
*/
  if (memcg_vmstats_needs_flush(statc->vmstats))
   break;

  stats_updates = this_cpu_add_return(statc_pcpu->stats_updates,
          abs(val));
  if (stats_updates < MEMCG_CHARGE_BATCH)
   continue;

  stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0);
  atomic_add(stats_updates, &statc->vmstats->stats_updates);
}
}

static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);

trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates),
  force, needs_flush);

if (!force && !needs_flush)
  return;

if (mem_cgroup_is_root(memcg))
  WRITE_ONCE(flush_last_time, jiffies_64);

css_rstat_flush(&memcg->css);
}

/*
* mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree
* @memcg: root of the subtree to flush
*
* Flushing is serialized by the underlying global rstat lock. There is also a
* minimum amount of work to be done even if there are no stat updates to flush.
* Hence, we only flush the stats if the updates delta exceeds a threshold. This
* avoids unnecessary work and contention on the underlying lock.
*/
void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled())
  return;

if (!memcg)
  memcg = root_mem_cgroup;

__mem_cgroup_flush_stats(memcg, false);
}

void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
/* Only flush if the periodic flusher is one full cycle late */
if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME))
  mem_cgroup_flush_stats(memcg);
}

static void flush_memcg_stats_dwork(struct work_struct *w)
{
/*
* Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
* in latency-sensitive paths is as cheap as possible.
*/
__mem_cgroup_flush_stats(root_mem_cgroup, true);
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}

unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
long x;
int i = memcg_stats_index(idx);

if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
  return 0;

x = READ_ONCE(memcg->vmstats->state[i]);
#ifdef CONFIG_SMP
if (x < 0)
  x = 0;
#endif
return x;
}

static int memcg_page_state_unit(int item);

/*
* Normalize the value passed into memcg_rstat_updated() to be in pages. Round
* up non-zero sub-page updates to 1 page as zero page updates are ignored.
*/
static int memcg_state_val_in_pages(int idx, int val)
{
int unit = memcg_page_state_unit(idx);

if (!val || unit == PAGE_SIZE)
  return val;
else
  return max(val * unit / PAGE_SIZE, 1UL);
}

/**
* mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
* @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
* @val: delta to add to the counter, can be negative
*/
void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
         int val)
{
int i = memcg_stats_index(idx);
int cpu;

if (mem_cgroup_disabled())
  return;

if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
  return;

cpu = get_cpu();

this_cpu_add(memcg->vmstats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_state(memcg, idx, val);

put_cpu();
}

#ifdef CONFIG_MEMCG_V1
/* idx can be of type enum memcg_stat_item or node_stat_item. */
unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
long x;
int i = memcg_stats_index(idx);

if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
  return 0;

x = READ_ONCE(memcg->vmstats->state_local[i]);
#ifdef CONFIG_SMP
if (x < 0)
  x = 0;
#endif
return x;
}
#endif

static void mod_memcg_lruvec_state(struct lruvec *lruvec,
         enum node_stat_item idx,
         int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
int i = memcg_stats_index(idx);
int cpu;

if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
  return;

pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;

cpu = get_cpu();

/* Update memcg */
this_cpu_add(memcg->vmstats_percpu->state[i], val);

/* Update lruvec */
this_cpu_add(pn->lruvec_stats_percpu->state[i], val);

val = memcg_state_val_in_pages(idx, val);
memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_lruvec_state(memcg, idx, val);

put_cpu();
}

/**
* __mod_lruvec_state - update lruvec memory statistics
* @lruvec: the lruvec
* @idx: the stat item
* @val: delta to add to the counter, can be negative
*
* The lruvec is the intersection of the NUMA node and a cgroup. This
* function updates the all three counters that are affected by a
* change of state at this level: per-node, per-cgroup, per-lruvec.
*/
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
   int val)
{
/* Update node */
__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

/* Update memcg and lruvec */
if (!mem_cgroup_disabled())
  mod_memcg_lruvec_state(lruvec, idx, val);
}

void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
        int val)
{
struct mem_cgroup *memcg;
pg_data_t *pgdat = folio_pgdat(folio);
struct lruvec *lruvec;

rcu_read_lock();
memcg = folio_memcg(folio);
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!memcg) {
  rcu_read_unlock();
  __mod_node_page_state(pgdat, idx, val);
  return;
}

lruvec = mem_cgroup_lruvec(memcg, pgdat);
__mod_lruvec_state(lruvec, idx, val);
rcu_read_unlock();
}
EXPORT_SYMBOL(__lruvec_stat_mod_folio);

void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
pg_data_t *pgdat = page_pgdat(virt_to_page(p));
struct mem_cgroup *memcg;
struct lruvec *lruvec;

rcu_read_lock();
memcg = mem_cgroup_from_slab_obj(p);

/*
* Untracked pages have no memcg, no lruvec. Update only the
* node. If we reparent the slab objects to the root memcg,
* when we free the slab object, we need to update the per-memcg
* vmstats to keep it correct for the root memcg.
*/
if (!memcg) {
  __mod_node_page_state(pgdat, idx, val);
} else {
  lruvec = mem_cgroup_lruvec(memcg, pgdat);
  __mod_lruvec_state(lruvec, idx, val);
}
rcu_read_unlock();
}

/**
* count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
* @idx: the event item
* @count: the number of events that occurred
*/
void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
     unsigned long count)
{
int i = memcg_events_index(idx);
int cpu;

if (mem_cgroup_disabled())
  return;

if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
  return;

cpu = get_cpu();

this_cpu_add(memcg->vmstats_percpu->events[i], count);
memcg_rstat_updated(memcg, count, cpu);
trace_count_memcg_events(memcg, idx, count);

put_cpu();
}

unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
int i = memcg_events_index(event);

if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
  return 0;

return READ_ONCE(memcg->vmstats->events[i]);
}

#ifdef CONFIG_MEMCG_V1
unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
int i = memcg_events_index(event);

if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
  return 0;

return READ_ONCE(memcg->vmstats->events_local[i]);
}
#endif

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
/*
* mm_update_next_owner() may clear mm->owner to NULL
* if it races with swapoff, page migration, etc.
* So this can be called with p == NULL.
*/
if (unlikely(!p))
  return NULL;

return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

static __always_inline struct mem_cgroup *active_memcg(void)
{
if (!in_task())
  return this_cpu_read(int_active_memcg);
else
  return current->active_memcg;
}

/**
* get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
* @mm: mm from which memcg should be extracted. It can be NULL.
*
* Obtain a reference on mm->memcg and returns it if successful. If mm
* is NULL, then the memcg is chosen as follows:
* 1) The active memcg, if set.
* 2) current->mm->memcg, if available
* 3) root memcg
* If mem_cgroup is disabled, NULL is returned.
*/
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
struct mem_cgroup *memcg;

if (mem_cgroup_disabled())
  return NULL;

/*
* Page cache insertions can happen without an
* actual mm context, e.g. during disk probing
* on boot, loopback IO, acct() writes etc.
*
* No need to css_get on root memcg as the reference
* counting is disabled on the root level in the
* cgroup core. See CSS_NO_REF.
*/
if (unlikely(!mm)) {
  memcg = active_memcg();
  if (unlikely(memcg)) {
   /* remote memcg must hold a ref */
   css_get(&memcg->css);
   return memcg;
  }
  mm = current->mm;
  if (unlikely(!mm))
   return root_mem_cgroup;
}

rcu_read_lock();
do {
  memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
  if (unlikely(!memcg))
   memcg = root_mem_cgroup;
} while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

/**
* get_mem_cgroup_from_current - Obtain a reference on current task's memcg.
*/
struct mem_cgroup *get_mem_cgroup_from_current(void)
{
struct mem_cgroup *memcg;

if (mem_cgroup_disabled())
  return NULL;

again:
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
if (!css_tryget(&memcg->css)) {
  rcu_read_unlock();
  goto again;
}
rcu_read_unlock();
return memcg;
}

/**
* get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
* @folio: folio from which memcg should be extracted.
*/
struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
{
struct mem_cgroup *memcg = folio_memcg(folio);

if (mem_cgroup_disabled())
  return NULL;

rcu_read_lock();
if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
  memcg = root_mem_cgroup;
rcu_read_unlock();
return memcg;
}

/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
* @prev: previously returned memcg, NULL on first invocation
* @reclaim: cookie for shared reclaim walks, NULL for full walks
*
* Returns references to children of the hierarchy below @root, or
* @root itself, or %NULL after a full round-trip.
*
* Caller must pass the return value in @prev on subsequent
* invocations for reference counting, or use mem_cgroup_iter_break()
* to cancel a hierarchy walk before the round-trip is complete.
*
* Reclaimers can specify a node in @reclaim to divide up the memcgs
* in the hierarchy among all concurrent reclaimers operating on the
* same node.
*/
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
       struct mem_cgroup *prev,
       struct mem_cgroup_reclaim_cookie *reclaim)
{
struct mem_cgroup_reclaim_iter *iter;
struct cgroup_subsys_state *css;
struct mem_cgroup *pos;
struct mem_cgroup *next;

if (mem_cgroup_disabled())
  return NULL;

if (!root)
  root = root_mem_cgroup;

rcu_read_lock();
restart:
next = NULL;

if (reclaim) {
  int gen;
  int nid = reclaim->pgdat->node_id;

  iter = &root->nodeinfo[nid]->iter;
  gen = atomic_read(&iter->generation);

  /*
* On start, join the current reclaim iteration cycle.
* Exit when a concurrent walker completes it.
*/
  if (!prev)
   reclaim->generation = gen;
  else if (reclaim->generation != gen)
   goto out_unlock;

  pos = READ_ONCE(iter->position);
} else
  pos = prev;

css = pos ? &pos->css : NULL;

while ((css = css_next_descendant_pre(css, &root->css))) {
  /*
* Verify the css and acquire a reference.  The root
* is provided by the caller, so we know it's alive
* and kicking, and don't take an extra reference.
*/
  if (css == &root->css || css_tryget(css))
   break;
}

next = mem_cgroup_from_css(css);

if (reclaim) {
  /*
* The position could have already been updated by a competing
* thread, so check that the value hasn't changed since we read
* it to avoid reclaiming from the same cgroup twice.
*/
  if (cmpxchg(&iter->position, pos, next) != pos) {
   if (css && css != &root->css)
    css_put(css);
   goto restart;
  }

  if (!next) {
   atomic_inc(&iter->generation);

   /*
* Reclaimers share the hierarchy walk, and a
* new one might jump in right at the end of
* the hierarchy - make sure they see at least
* one group and restart from the beginning.
*/
   if (!prev)
    goto restart;
  }
}

out_unlock:
rcu_read_unlock();
if (prev && prev != root)
  css_put(&prev->css);

return next;
}

/**
* mem_cgroup_iter_break - abort a hierarchy walk prematurely
* @root: hierarchy root
* @prev: last visited hierarchy member as returned by mem_cgroup_iter()
*/
void mem_cgroup_iter_break(struct mem_cgroup *root,
      struct mem_cgroup *prev)
{
if (!root)
  root = root_mem_cgroup;
if (prev && prev != root)
  css_put(&prev->css);
}

static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
     struct mem_cgroup *dead_memcg)
{
struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_node *mz;
int nid;

for_each_node(nid) {
  mz = from->nodeinfo[nid];
  iter = &mz->iter;
  cmpxchg(&iter->position, dead_memcg, NULL);
}
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
struct mem_cgroup *memcg = dead_memcg;
struct mem_cgroup *last;

do {
  __invalidate_reclaim_iterators(memcg, dead_memcg);
  last = memcg;
} while ((memcg = parent_mem_cgroup(memcg)));

/*
* When cgroup1 non-hierarchy mode is used,
* parent_mem_cgroup() does not walk all the way up to the
* cgroup root (root_mem_cgroup). So we have to handle
* dead_memcg from cgroup root separately.
*/
if (!mem_cgroup_is_root(last))
  __invalidate_reclaim_iterators(root_mem_cgroup,
      dead_memcg);
}

/**
* mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
* @memcg: hierarchy root
* @fn: function to call for each task
* @arg: argument passed to @fn
*
* This function iterates over tasks attached to @memcg or to any of its
* descendants and calls @fn for each task. If @fn returns a non-zero
* value, the function breaks the iteration loop. Otherwise, it will iterate
* over all tasks and return 0.
*
* This function must not be called for the root memory cgroup.
*/
void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
      int (*fn)(struct task_struct *, void *), void *arg)
{
struct mem_cgroup *iter;
int ret = 0;

BUG_ON(mem_cgroup_is_root(memcg));

for_each_mem_cgroup_tree(iter, memcg) {
  struct css_task_iter it;
  struct task_struct *task;

  css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
  while (!ret && (task = css_task_iter_next(&it))) {
   ret = fn(task, arg);
   /* Avoid potential softlockup warning */
   cond_resched();
  }
  css_task_iter_end(&it);
  if (ret) {
   mem_cgroup_iter_break(memcg, iter);
   break;
  }
}
}

#ifdef CONFIG_DEBUG_VM
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
struct mem_cgroup *memcg;

if (mem_cgroup_disabled())
  return;

memcg = folio_memcg(folio);

if (!memcg)
  VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
else
  VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
}
#endif

/**
* folio_lruvec_lock - Lock the lruvec for a folio.
* @folio: Pointer to the folio.
*
* These functions are safe to use under any of the following conditions:
* - folio locked
* - folio_test_lru false
* - folio frozen (refcount of 0)
*
* Return: The lruvec this folio is on with its lock held.
*/
struct lruvec *folio_lruvec_lock(struct folio *folio)
{
struct lruvec *lruvec = folio_lruvec(folio);

spin_lock(&lruvec->lru_lock);
lruvec_memcg_debug(lruvec, folio);

return lruvec;
}

/**
* folio_lruvec_lock_irq - Lock the lruvec for a folio.
* @folio: Pointer to the folio.
*
* These functions are safe to use under any of the following conditions:
* - folio locked
* - folio_test_lru false
* - folio frozen (refcount of 0)
*
* Return: The lruvec this folio is on with its lock held and interrupts
* disabled.
*/
struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
struct lruvec *lruvec = folio_lruvec(folio);

spin_lock_irq(&lruvec->lru_lock);
lruvec_memcg_debug(lruvec, folio);

return lruvec;
}

/**
* folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
* @folio: Pointer to the folio.
* @flags: Pointer to irqsave flags.
*
* These functions are safe to use under any of the following conditions:
* - folio locked
* - folio_test_lru false
* - folio frozen (refcount of 0)
*
* Return: The lruvec this folio is on with its lock held and interrupts
* disabled.
*/
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
  unsigned long *flags)
{
struct lruvec *lruvec = folio_lruvec(folio);

spin_lock_irqsave(&lruvec->lru_lock, *flags);
lruvec_memcg_debug(lruvec, folio);

return lruvec;
}

/**
* mem_cgroup_update_lru_size - account for adding or removing an lru page
* @lruvec: mem_cgroup per zone lru vector
* @lru: index of lru list the page is sitting on
* @zid: zone id of the accounted pages
* @nr_pages: positive when adding or negative when removing
*
* This function must be called under lru_lock, just before a page is added
* to or just after a page is removed from an lru list.
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
    int zid, int nr_pages)
{
struct mem_cgroup_per_node *mz;
unsigned long *lru_size;
long size;

if (mem_cgroup_disabled())
  return;

mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
lru_size = &mz->lru_zone_size[zid][lru];

if (nr_pages < 0)
  *lru_size += nr_pages;

size = *lru_size;
if (WARN_ONCE(size < 0,
  "%s(%p, %d, %d): lru_size %ld\n",
  __func__, lruvec, lru, nr_pages, size)) {
  VM_BUG_ON(1);
  *lru_size = 0;
}

if (nr_pages > 0)
  *lru_size += nr_pages;
}

/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @memcg: the memory cgroup
*
* Returns the maximum amount of memory @mem can be charged with, in
* pages.
*/
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
unsigned long margin = 0;
unsigned long count;
unsigned long limit;

count = page_counter_read(&memcg->memory);
limit = READ_ONCE(memcg->memory.max);
if (count < limit)
  margin = limit - count;

if (do_memsw_account()) {
  count = page_counter_read(&memcg->memsw);
  limit = READ_ONCE(memcg->memsw.max);
  if (count < limit)
   margin = min(margin, limit - count);
  else
   margin = 0;
}

return margin;
}

struct memory_stat {
const char *name;
unsigned int idx;
};

static const struct memory_stat memory_stats[] = {
{ "anon",   NR_ANON_MAPPED   },
{ "file",   NR_FILE_PAGES   },
{ "kernel",   MEMCG_KMEM   },
{ "kernel_stack",  NR_KERNEL_STACK_KB  },
{ "pagetables",   NR_PAGETABLE   },
{ "sec_pagetables",  NR_SECONDARY_PAGETABLE  },
{ "percpu",   MEMCG_PERCPU_B   },
{ "sock",   MEMCG_SOCK   },
{ "vmalloc",   MEMCG_VMALLOC   },
{ "shmem",   NR_SHMEM   },
#ifdef CONFIG_ZSWAP
{ "zswap",   MEMCG_ZSWAP_B   },
{ "zswapped",   MEMCG_ZSWAPPED   },
#endif
{ "file_mapped",  NR_FILE_MAPPED   },
{ "file_dirty",   NR_FILE_DIRTY   },
{ "file_writeback",  NR_WRITEBACK   },
#ifdef CONFIG_SWAP
{ "swapcached",   NR_SWAPCACHE   },
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
{ "anon_thp",   NR_ANON_THPS   },
{ "file_thp",   NR_FILE_THPS   },
{ "shmem_thp",   NR_SHMEM_THPS   },
#endif
{ "inactive_anon",  NR_INACTIVE_ANON  },
{ "active_anon",  NR_ACTIVE_ANON   },
{ "inactive_file",  NR_INACTIVE_FILE  },
{ "active_file",  NR_ACTIVE_FILE   },
{ "unevictable",  NR_UNEVICTABLE   },
{ "slab_reclaimable",  NR_SLAB_RECLAIMABLE_B  },
{ "slab_unreclaimable",  NR_SLAB_UNRECLAIMABLE_B  },
#ifdef CONFIG_HUGETLB_PAGE
{ "hugetlb",   NR_HUGETLB   },
#endif

/* The memory events */
{ "workingset_refault_anon", WORKINGSET_REFAULT_ANON  },
{ "workingset_refault_file", WORKINGSET_REFAULT_FILE  },
{ "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
{ "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
{ "workingset_restore_anon", WORKINGSET_RESTORE_ANON  },
{ "workingset_restore_file", WORKINGSET_RESTORE_FILE  },
{ "workingset_nodereclaim", WORKINGSET_NODERECLAIM  },

{ "pgdemote_kswapd",  PGDEMOTE_KSWAPD  },
{ "pgdemote_direct",  PGDEMOTE_DIRECT  },
{ "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED },
{ "pgdemote_proactive",  PGDEMOTE_PROACTIVE },
#ifdef CONFIG_NUMA_BALANCING
{ "pgpromote_success",  PGPROMOTE_SUCCESS },
#endif
};

/* The actual unit of the state item, not the same as the output unit */
static int memcg_page_state_unit(int item)
{
switch (item) {
case MEMCG_PERCPU_B:
case MEMCG_ZSWAP_B:
case NR_SLAB_RECLAIMABLE_B:
case NR_SLAB_UNRECLAIMABLE_B:
  return 1;
case NR_KERNEL_STACK_KB:
  return SZ_1K;
default:
  return PAGE_SIZE;
}
}

/* Translate stat items to the correct unit for memory.stat output */
static int memcg_page_state_output_unit(int item)
{
/*
* Workingset state is actually in pages, but we export it to userspace
* as a scalar count of events, so special case it here.
*
* Demotion and promotion activities are exported in pages, consistent
* with their global counterparts.
*/
switch (item) {
case WORKINGSET_REFAULT_ANON:
case WORKINGSET_REFAULT_FILE:
case WORKINGSET_ACTIVATE_ANON:
case WORKINGSET_ACTIVATE_FILE:
case WORKINGSET_RESTORE_ANON:
case WORKINGSET_RESTORE_FILE:
case WORKINGSET_NODERECLAIM:
case PGDEMOTE_KSWAPD:
case PGDEMOTE_DIRECT:
case PGDEMOTE_KHUGEPAGED:
case PGDEMOTE_PROACTIVE:
#ifdef CONFIG_NUMA_BALANCING
case PGPROMOTE_SUCCESS:
#endif
  return 1;
default:
  return memcg_page_state_unit(item);
}
}

unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item)
{
return memcg_page_state(memcg, item) *
  memcg_page_state_output_unit(item);
}

#ifdef CONFIG_MEMCG_V1
unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item)
{
return memcg_page_state_local(memcg, item) *
  memcg_page_state_output_unit(item);
}
#endif

#ifdef CONFIG_HUGETLB_PAGE
static bool memcg_accounts_hugetlb(void)
{
return cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
}
#else /* CONFIG_HUGETLB_PAGE */
static bool memcg_accounts_hugetlb(void)
{
return false;
}
#endif /* CONFIG_HUGETLB_PAGE */

static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
int i;

/*
* Provide statistics on the state of the memory subsystem as
* well as cumulative event counters that show past behavior.
*
* This list is ordered following a combination of these gradients:
* 1) generic big picture -> specifics and details
* 2) reflecting userspace activity -> reflecting kernel heuristics
*
* Current memory state:
*/
mem_cgroup_flush_stats(memcg);

for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
  u64 size;

#ifdef CONFIG_HUGETLB_PAGE
  if (unlikely(memory_stats[i].idx == NR_HUGETLB) &&
   !memcg_accounts_hugetlb())
   continue;
#endif
  size = memcg_page_state_output(memcg, memory_stats[i].idx);
  seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);

  if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
   size += memcg_page_state_output(memcg,
       NR_SLAB_RECLAIMABLE_B);
   seq_buf_printf(s, "slab %llu\n", size);
  }
}

/* Accumulated memory events */
seq_buf_printf(s, "pgscan %lu\n",
         memcg_events(memcg, PGSCAN_KSWAPD) +
         memcg_events(memcg, PGSCAN_DIRECT) +
         memcg_events(memcg, PGSCAN_PROACTIVE) +
         memcg_events(memcg, PGSCAN_KHUGEPAGED));
seq_buf_printf(s, "pgsteal %lu\n",
         memcg_events(memcg, PGSTEAL_KSWAPD) +
         memcg_events(memcg, PGSTEAL_DIRECT) +
         memcg_events(memcg, PGSTEAL_PROACTIVE) +
         memcg_events(memcg, PGSTEAL_KHUGEPAGED));

for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
#ifdef CONFIG_MEMCG_V1
  if (memcg_vm_event_stat[i] == PGPGIN ||
      memcg_vm_event_stat[i] == PGPGOUT)
   continue;
#endif
  seq_buf_printf(s, "%s %lu\n",
          vm_event_name(memcg_vm_event_stat[i]),
          memcg_events(memcg, memcg_vm_event_stat[i]));
}
}

static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
  memcg_stat_format(memcg, s);
else
  memcg1_stat_format(memcg, s);
if (seq_buf_has_overflowed(s))
  pr_warn("%s: Warning, stat buffer overflow, please report\n", __func__);
}

/**
* mem_cgroup_print_oom_context: Print OOM information relevant to
* memory controller.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
rcu_read_lock();

if (memcg) {
  pr_cont(",oom_memcg=");
  pr_cont_cgroup_path(memcg->css.cgroup);
} else
  pr_cont(",global_oom");
if (p) {
  pr_cont(",task_memcg=");
  pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
}
rcu_read_unlock();
}

/**
* mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
* memory controller.
* @memcg: The memory cgroup that went over limit
*/
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
/* Use static buffer, for the caller is holding oom_lock. */
static char buf[SEQ_BUF_SIZE];
struct seq_buf s;
unsigned long memory_failcnt;

lockdep_assert_held(&oom_lock);

if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
  memory_failcnt = atomic_long_read(&memcg->memory_events[MEMCG_MAX]);
else
  memory_failcnt = memcg->memory.failcnt;

pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
  K((u64)page_counter_read(&memcg->memory)),
  K((u64)READ_ONCE(memcg->memory.max)), memory_failcnt);
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
  pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
   K((u64)page_counter_read(&memcg->swap)),
   K((u64)READ_ONCE(memcg->swap.max)),
   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
#ifdef CONFIG_MEMCG_V1
else {
  pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
   K((u64)page_counter_read(&memcg->memsw)),
   K((u64)memcg->memsw.max), memcg->memsw.failcnt);
  pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
   K((u64)page_counter_read(&memcg->kmem)),
   K((u64)memcg->kmem.max), memcg->kmem.failcnt);
}
#endif

pr_info("Memory cgroup stats for ");
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont(":");
seq_buf_init(&s, buf, SEQ_BUF_SIZE);
memory_stat_format(memcg, &s);
seq_buf_do_printk(&s, KERN_INFO);
}

/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
unsigned long max = READ_ONCE(memcg->memory.max);

if (do_memsw_account()) {
  if (mem_cgroup_swappiness(memcg)) {
   /* Calculate swap excess capacity from memsw limit */
   unsigned long swap = READ_ONCE(memcg->memsw.max) - max;

   max += min(swap, (unsigned long)total_swap_pages);
  }
} else {
  if (mem_cgroup_swappiness(memcg))
   max += min(READ_ONCE(memcg->swap.max),
       (unsigned long)total_swap_pages);
}
return max;
}

unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
return page_counter_read(&memcg->memory);
}

static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
         int order)
{
struct oom_control oc = {
  .zonelist = NULL,
  .nodemask = NULL,
  .memcg = memcg,
  .gfp_mask = gfp_mask,
  .order = order,
};
bool ret = true;

if (mutex_lock_killable(&oom_lock))
  return true;

if (mem_cgroup_margin(memcg) >= (1 << order))
  goto unlock;

/*
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
ret = out_of_memory(&oc);

unlock:
mutex_unlock(&oom_lock);
return ret;
}

/*
* Returns true if successfully killed one or more processes. Though in some
* corner cases it can return true even without killing any process.
*/
static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
bool locked, ret;

if (order > PAGE_ALLOC_COSTLY_ORDER)
  return false;

memcg_memory_event(memcg, MEMCG_OOM);

if (!memcg1_oom_prepare(memcg, &locked))
  return false;

ret = mem_cgroup_out_of_memory(memcg, mask, order);

memcg1_oom_finish(memcg, locked);

return ret;
}

/**
* mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
* @victim: task to be killed by the OOM killer
* @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
*
* Returns a pointer to a memory cgroup, which has to be cleaned up
* by killing all belonging OOM-killable tasks.
*
* Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
*/
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
         struct mem_cgroup *oom_domain)
{
struct mem_cgroup *oom_group = NULL;
struct mem_cgroup *memcg;

if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
  return NULL;

if (!oom_domain)
  oom_domain = root_mem_cgroup;

rcu_read_lock();

memcg = mem_cgroup_from_task(victim);
if (mem_cgroup_is_root(memcg))
  goto out;

/*
* If the victim task has been asynchronously moved to a different
* memory cgroup, we might end up killing tasks outside oom_domain.
* In this case it's better to ignore memory.group.oom.
*/
if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
  goto out;

/*
* Traverse the memory cgroup hierarchy from the victim task's
* cgroup up to the OOMing cgroup (or root) to find the
* highest-level memory cgroup with oom.group set.
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
  if (READ_ONCE(memcg->oom_group))
   oom_group = memcg;

  if (memcg == oom_domain)
   break;
}

if (oom_group)
  css_get(&oom_group->css);
out:
rcu_read_unlock();

return oom_group;
}

void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
pr_info("Tasks in ");
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont(" are going to be killed due to memory.oom.group set\n");
}

/*
* The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their
* nr_pages in a single cacheline. This may change in future.
*/
#define NR_MEMCG_STOCK 7
#define FLUSHING_CACHED_CHARGE 0
struct memcg_stock_pcp {
local_trylock_t lock;
uint8_t nr_pages[NR_MEMCG_STOCK];
struct mem_cgroup *cached[NR_MEMCG_STOCK];

struct work_struct work;
unsigned long flags;
};

static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
.lock = INIT_LOCAL_TRYLOCK(lock),
};

struct obj_stock_pcp {
local_trylock_t lock;
unsigned int nr_bytes;
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;

struct work_struct work;
unsigned long flags;
};

static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = {
.lock = INIT_LOCAL_TRYLOCK(lock),
};

static DEFINE_MUTEX(percpu_charge_mutex);

static void drain_obj_stock(struct obj_stock_pcp *stock);
static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
         struct mem_cgroup *root_memcg);

/**
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
*
* Consume the cached charge if enough nr_pages are present otherwise return
* failure. Also return failure for charge request larger than
* MEMCG_CHARGE_BATCH or if the local lock is already taken.
*
* returns true if successful, false otherwise.
*/
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
uint8_t stock_pages;
bool ret = false;
int i;

if (nr_pages > MEMCG_CHARGE_BATCH ||
     !local_trylock(&memcg_stock.lock))
  return ret;

stock = this_cpu_ptr(&memcg_stock);

for (i = 0; i < NR_MEMCG_STOCK; ++i) {
  if (memcg != READ_ONCE(stock->cached[i]))
   continue;

  stock_pages = READ_ONCE(stock->nr_pages[i]);
  if (stock_pages >= nr_pages) {
   WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages);
   ret = true;
  }
  break;
}

local_unlock(&memcg_stock.lock);

return ret;
}

static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
  page_counter_uncharge(&memcg->memsw, nr_pages);
}

/*
* Returns stocks cached in percpu and reset cached information.
*/
static void drain_stock(struct memcg_stock_pcp *stock, int i)
{
struct mem_cgroup *old = READ_ONCE(stock->cached[i]);
uint8_t stock_pages;

if (!old)
  return;

stock_pages = READ_ONCE(stock->nr_pages[i]);
if (stock_pages) {
  memcg_uncharge(old, stock_pages);
  WRITE_ONCE(stock->nr_pages[i], 0);
}

css_put(&old->css);
WRITE_ONCE(stock->cached[i], NULL);
}

static void drain_stock_fully(struct memcg_stock_pcp *stock)
{
int i;

for (i = 0; i < NR_MEMCG_STOCK; ++i)
  drain_stock(stock, i);
}

static void drain_local_memcg_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock;

if (WARN_ONCE(!in_task(), "drain in non-task context"))
  return;

local_lock(&memcg_stock.lock);

stock = this_cpu_ptr(&memcg_stock);
drain_stock_fully(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

local_unlock(&memcg_stock.lock);
}

static void drain_local_obj_stock(struct work_struct *dummy)
{
struct obj_stock_pcp *stock;

if (WARN_ONCE(!in_task(), "drain in non-task context"))
  return;

local_lock(&obj_stock.lock);

stock = this_cpu_ptr(&obj_stock);
drain_obj_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

local_unlock(&obj_stock.lock);
}

static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
struct mem_cgroup *cached;
uint8_t stock_pages;
bool success = false;
int empty_slot = -1;
int i;

/*
* For now limit MEMCG_CHARGE_BATCH to 127 and less. In future if we
* decide to increase it more than 127 then we will need more careful
* handling of nr_pages[] in struct memcg_stock_pcp.
*/
BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX);

VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));

if (nr_pages > MEMCG_CHARGE_BATCH ||
     !local_trylock(&memcg_stock.lock)) {
  /*
* In case of larger than batch refill or unlikely failure to
* lock the percpu memcg_stock.lock, uncharge memcg directly.
*/
  memcg_uncharge(memcg, nr_pages);
  return;
}

stock = this_cpu_ptr(&memcg_stock);
for (i = 0; i < NR_MEMCG_STOCK; ++i) {
  cached = READ_ONCE(stock->cached[i]);
  if (!cached && empty_slot == -1)
   empty_slot = i;
  if (memcg == READ_ONCE(stock->cached[i])) {
   stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
   WRITE_ONCE(stock->nr_pages[i], stock_pages);
   if (stock_pages > MEMCG_CHARGE_BATCH)
    drain_stock(stock, i);
   success = true;
   break;
  }
}

if (!success) {
  i = empty_slot;
  if (i == -1) {
   i = get_random_u32_below(NR_MEMCG_STOCK);
   drain_stock(stock, i);
  }
  css_get(&memcg->css);
  WRITE_ONCE(stock->cached[i], memcg);
  WRITE_ONCE(stock->nr_pages[i], nr_pages);
}

local_unlock(&memcg_stock.lock);
}

static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock,
      struct mem_cgroup *root_memcg)
{
struct mem_cgroup *memcg;
bool flush = false;
int i;

rcu_read_lock();
for (i = 0; i < NR_MEMCG_STOCK; ++i) {
  memcg = READ_ONCE(stock->cached[i]);
  if (!memcg)
   continue;

  if (READ_ONCE(stock->nr_pages[i]) &&
      mem_cgroup_is_descendant(memcg, root_memcg)) {
   flush = true;
   break;
  }
}
rcu_read_unlock();
return flush;
}

/*
* Drains all per-CPU charge caches for given root_memcg resp. subtree
* of the hierarchy under it.
*/
void drain_all_stock(struct mem_cgroup *root_memcg)
{
int cpu, curcpu;

/* If someone's already draining, avoid adding running more workers. */
if (!mutex_trylock(&percpu_charge_mutex))
  return;
/*
* Notify other cpus that system-wide "drain" is running
* We do not care about races with the cpu hotplug because cpu down
* as well as workers from this path always operate on the local
* per-cpu data. CPU up doesn't touch memcg_stock at all.
*/
migrate_disable();
curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
  struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu);
  struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu);

  if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) &&
      is_memcg_drain_needed(memcg_st, root_memcg) &&
      !test_and_set_bit(FLUSHING_CACHED_CHARGE,
          &memcg_st->flags)) {
   if (cpu == curcpu)
    drain_local_memcg_stock(&memcg_st->work);
   else if (!cpu_is_isolated(cpu))
    schedule_work_on(cpu, &memcg_st->work);
  }

  if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
      obj_stock_flush_required(obj_st, root_memcg) &&
      !test_and_set_bit(FLUSHING_CACHED_CHARGE,
          &obj_st->flags)) {
   if (cpu == curcpu)
    drain_local_obj_stock(&obj_st->work);
   else if (!cpu_is_isolated(cpu))
    schedule_work_on(cpu, &obj_st->work);
  }
}
migrate_enable();
mutex_unlock(&percpu_charge_mutex);
}

static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
/* no need for the local lock */
drain_obj_stock(&per_cpu(obj_stock, cpu));
drain_stock_fully(&per_cpu(memcg_stock, cpu));

return 0;
}

static unsigned long reclaim_high(struct mem_cgroup *memcg,
      unsigned int nr_pages,
      gfp_t gfp_mask)
{
unsigned long nr_reclaimed = 0;

do {
  unsigned long pflags;

  if (page_counter_read(&memcg->memory) <=
      READ_ONCE(memcg->memory.high))
   continue;

  memcg_memory_event(memcg, MEMCG_HIGH);

  psi_memstall_enter(&pflags);
  nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
       gfp_mask,
       MEMCG_RECLAIM_MAY_SWAP,
       NULL);
  psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
   !mem_cgroup_is_root(memcg));

return nr_reclaimed;
}

static void high_work_func(struct work_struct *work)
{
struct mem_cgroup *memcg;

memcg = container_of(work, struct mem_cgroup, high_work);
reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
}

/*
* Clamp the maximum sleep time per allocation batch to 2 seconds. This is
* enough to still cause a significant slowdown in most cases, while still
* allowing diagnostics and tracing to proceed without becoming stuck.
*/
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)

/*
* When calculating the delay, we use these either side of the exponentiation to
* maintain precision and scale to a reasonable number of jiffies (see the table
* below.
*
* - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
*   overage ratio to a delay.
* - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
*   proposed penalty in order to reduce to a reasonable number of jiffies, and
*   to produce a reasonable delay curve.
*
* MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
* reasonable delay curve compared to precision-adjusted overage, not
* penalising heavily at first, but still making sure that growth beyond the
* limit penalises misbehaviour cgroups by slowing them down exponentially. For
* example, with a high of 100 megabytes:
*
*  +-------+------------------------+
*  | usage | time to allocate in ms |
*  +-------+------------------------+
*  | 100M  |                      0 |
*  | 101M  |                      6 |
*  | 102M  |                     25 |
*  | 103M  |                     57 |
*  | 104M  |                    102 |
*  | 105M  |                    159 |
*  | 106M  |                    230 |
*  | 107M  |                    313 |
*  | 108M  |                    409 |
*  | 109M  |                    518 |
*  | 110M  |                    639 |
*  | 111M  |                    774 |
*  | 112M  |                    921 |
*  | 113M  |                   1081 |
*  | 114M  |                   1254 |
*  | 115M  |                   1439 |
*  | 116M  |                   1638 |
*  | 117M  |                   1849 |
*  | 118M  |                   2000 |
*  | 119M  |                   2000 |
*  | 120M  |                   2000 |
*  +-------+------------------------+
*/
#define MEMCG_DELAY_PRECISION_SHIFT 20
#define MEMCG_DELAY_SCALING_SHIFT 14

static u64 calculate_overage(unsigned long usage, unsigned long high)
{
u64 overage;

if (usage <= high)
  return 0;

/*
* Prevent division by 0 in overage calculation by acting as if
* it was a threshold of 1 page
*/
high = max(high, 1UL);

overage = usage - high;
overage <<= MEMCG_DELAY_PRECISION_SHIFT;
return div64_u64(overage, high);
}

static u64 mem_find_max_overage(struct mem_cgroup *memcg)
{
u64 overage, max_overage = 0;

do {
  overage = calculate_overage(page_counter_read(&memcg->memory),
         READ_ONCE(memcg->memory.high));
  max_overage = max(overage, max_overage);
} while ((memcg = parent_mem_cgroup(memcg)) &&
   !mem_cgroup_is_root(memcg));

return max_overage;
}

static u64 swap_find_max_overage(struct mem_cgroup *memcg)
{
u64 overage, max_overage = 0;

do {
  overage = calculate_overage(page_counter_read(&memcg->swap),
         READ_ONCE(memcg->swap.high));
  if (overage)
   memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
  max_overage = max(overage, max_overage);
} while ((memcg = parent_mem_cgroup(memcg)) &&
   !mem_cgroup_is_root(memcg));

return max_overage;
}

/*
* Get the number of jiffies that we should penalise a mischievous cgroup which
* is exceeding its memory.high by checking both it and its ancestors.
*/
static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
       unsigned int nr_pages,
       u64 max_overage)
{
unsigned long penalty_jiffies;

if (!max_overage)
  return 0;

/*
* We use overage compared to memory.high to calculate the number of
* jiffies to sleep (penalty_jiffies). Ideally this value should be
* fairly lenient on small overages, and increasingly harsh when the
* memcg in question makes it clear that it has no intention of stopping
* its crazy behaviour, so we exponentially increase the delay based on
* overage amount.
*/
penalty_jiffies = max_overage * max_overage * HZ;
penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;

/*
* Factor in the task's own contribution to the overage, such that four
* N-sized allocations are throttled approximately the same as one
* 4N-sized allocation.
*
* MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
* larger the current charge patch is than that.
*/
return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
}

/*
* Reclaims memory over the high limit. Called directly from
* try_charge() (context permitting), as well as from the userland
* return path where reclaim is always able to block.
*/
void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
unsigned long penalty_jiffies;
unsigned long pflags;
unsigned long nr_reclaimed;
unsigned int nr_pages = current->memcg_nr_pages_over_high;
int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *memcg;
bool in_retry = false;

if (likely(!nr_pages))
  return;

memcg = get_mem_cgroup_from_mm(current->mm);
current->memcg_nr_pages_over_high = 0;

retry_reclaim:
/*
* Bail if the task is already exiting. Unlike memory.max,
* memory.high enforcement isn't as strict, and there is no
* OOM killer involved, which means the excess could already
* be much bigger (and still growing) than it could for
* memory.max; the dying task could get stuck in fruitless
* reclaim for a long time, which isn't desirable.
*/
if (task_is_dying())
  goto out;

/*
* The allocating task should reclaim at least the batch size, but for
* subsequent retries we only want to do what's necessary to prevent oom
* or breaching resource isolation.
*
* This is distinct from memory.max or page allocator behaviour because
* memory.high is currently batched, whereas memory.max and the page
* allocator run every time an allocation is made.
*/
nr_reclaimed = reclaim_high(memcg,
        in_retry ? SWAP_CLUSTER_MAX : nr_pages,
        gfp_mask);

/*
* memory.high is breached and reclaim is unable to keep up. Throttle
* allocators proactively to slow down excessive growth.
*/
penalty_jiffies = calculate_high_delay(memcg, nr_pages,
            mem_find_max_overage(memcg));

penalty_jiffies += calculate_high_delay(memcg, nr_pages,
      swap_find_max_overage(memcg));

/*
* Clamp the max delay per usermode return so as to still keep the
* application moving forwards and also permit diagnostics, albeit
* extremely slowly.
*/
penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);

/*
* Don't sleep if the amount of jiffies this memcg owes us is so low
* that it's not even worth doing, in an attempt to be nice to those who
* go only a small amount over their memory.high value and maybe haven't
* been aggressively reclaimed enough yet.
*/
if (penalty_jiffies <= HZ / 100)
  goto out;

/*
* If reclaim is making forward progress but we're still over
* memory.high, we want to encourage that rather than doing allocator
* throttling.
*/
if (nr_reclaimed || nr_retries--) {
  in_retry = true;
  goto retry_reclaim;
}

/*
* Reclaim didn't manage to push usage below the limit, slow
* this allocating task down.
*
* If we exit early, we're guaranteed to die (since
* schedule_timeout_killable sets TASK_KILLABLE). This means we don't
* need to account for any ill-begotten jiffies to pay them off later.
*/
psi_memstall_enter(&pflags);
schedule_timeout_killable(penalty_jiffies);
psi_memstall_leave(&pflags);

out:
css_put(&memcg->css);
}

static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
       unsigned int nr_pages)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
unsigned long nr_reclaimed;
bool passed_oom = false;
unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
bool drained = false;
bool raised_max_event = false;
unsigned long pflags;
bool allow_spinning = gfpflags_allow_spinning(gfp_mask);

retry:
if (consume_stock(memcg, nr_pages))
  return 0;

if (!allow_spinning)
  /* Avoid the refill and flush of the older stock */
  batch = nr_pages;

if (!do_memsw_account() ||
     page_counter_try_charge(&memcg->memsw, batch, &counter)) {
  if (page_counter_try_charge(&memcg->memory, batch, &counter))
   goto done_restock;
  if (do_memsw_account())
   page_counter_uncharge(&memcg->memsw, batch);
  mem_over_limit = mem_cgroup_from_counter(counter, memory);
} else {
  mem_over_limit = mem_cgroup_from_counter(counter, memsw);
  reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
}

if (batch > nr_pages) {
  batch = nr_pages;
  goto retry;
}

/*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
* but we prefer facilitating memory reclaim and getting back
* under the limit over triggering OOM kills in these cases.
*/
if (unlikely(current->flags & PF_MEMALLOC))
  goto force;

if (unlikely(task_in_memcg_oom(current)))
  goto nomem;

if (!gfpflags_allow_blocking(gfp_mask))
  goto nomem;

__memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
raised_max_event = true;

psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
          gfp_mask, reclaim_options, NULL);
psi_memstall_leave(&pflags);

if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
  goto retry;

if (!drained) {
  drain_all_stock(mem_over_limit);
  drained = true;
  goto retry;
}

if (gfp_mask & __GFP_NORETRY)
  goto nomem;
/*
* Even though the limit is exceeded at this point, reclaim
* may have been able to free some pages.  Retry the charge
* before killing the task.
*
* Only for regular pages, though: huge pages are rather
* unlikely to succeed so close to the limit, and we fall back
* to regular pages anyway in case of failure.
*/
if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
  goto retry;

if (nr_retries--)
  goto retry;

if (gfp_mask & __GFP_RETRY_MAYFAIL)
  goto nomem;

/* Avoid endless loop for tasks bypassed by the oom killer */
if (passed_oom && task_is_dying())
  goto nomem;

/*
* keep retrying as long as the memcg oom killer is able to make
* a forward progress or bypass the charge if the oom killer
* couldn't make any progress.
*/
if (mem_cgroup_oom(mem_over_limit, gfp_mask,
      get_order(nr_pages * PAGE_SIZE))) {
  passed_oom = true;
  nr_retries = MAX_RECLAIM_RETRIES;
  goto retry;
}
nomem:
/*
* Memcg doesn't have a dedicated reserve for atomic
* allocations. But like the global atomic pool, we need to
* put the burden of reclaim on regular allocation requests
* and let these go through as privileged allocations.
*/
if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
  return -ENOMEM;
force:
/*
* If the allocation has to be enforced, don't forget to raise
* a MEMCG_MAX event.
*/
if (!raised_max_event)
  __memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);

/*
* The allocation either can't fail or will lead to more memory
* being freed very soon.  Allow memory usage go over the limit
* temporarily by force charging it.
*/
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
  page_counter_charge(&memcg->memsw, nr_pages);

return 0;

done_restock:
if (batch > nr_pages)
  refill_stock(memcg, batch - nr_pages);

/*
* If the hierarchy is above the normal consumption range, schedule
* reclaim on returning to userland.  We can perform reclaim here
* if __GFP_RECLAIM but let's always punt for simplicity and so that
* GFP_KERNEL can consistently be used during reclaim.  @memcg is
* not recorded as it most likely matches current's and won't
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5

¤ Dauer der Verarbeitung: 0.50 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.