// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;

/* No special request */
#define FPI_NONE		((__force fpi_t)0)

/*
 * Skip free page reporting notification for the (possibly merged) page.
 * This does not hinder free page reporting from grabbing the page,
 * reporting it and marking it "reported" - it only skips notifying
 * the free page reporting infrastructure about a newly freed page. For
 * example, used when temporarily pulling a page from a freelist and
 * putting it back unmodified.
 */
#define FPI_SKIP_REPORT_NOTIFY	((__force fpi_t)BIT(0))

/*
 * Place the (possibly merged) page to the tail of the freelist. Will ignore
 * page shuffling (relevant code - e.g., memory onlining - is expected to
 * shuffle the whole zone).
 *
 * Note: No code should rely on this flag for correctness - it's purely
 *       to allow for optimizations when handing back either fresh pages
 *       (memory onlining) or untouched pages (page isolation, free page
 *       reporting).
 */
#define FPI_TO_TAIL		((__force fpi_t)BIT(1))

/* Free the page without taking locks. Rely on trylock only. */
#define FPI_TRYLOCK		((__force fpi_t)BIT(2))
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)

#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/*
 * On SMP, spin_trylock is sufficient protection.
 * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
 */
#define pcp_trylock_prepare(flags)	do { } while (0)
#define pcp_trylock_finish(flag)	do { } while (0)
#else

/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
#define pcp_trylock_prepare(flags)	local_irq_save(flags)
#define pcp_trylock_finish(flags)	local_irq_restore(flags)
#endif

/*
 * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
 * a migration causing the wrong PCP to be locked and remote memory being
 * potentially allocated, pin the task to the CPU for the lookup+lock.
 * preempt_disable is used on !RT because it is faster than migrate_disable.
 * migrate_disable is used on RT because otherwise RT spinlock usage is
 * interfered with and a high priority task cannot preempt the allocator.
 */
#ifndef CONFIG_PREEMPT_RT
#define pcpu_task_pin()		preempt_disable()
#define pcpu_task_unpin()	preempt_enable()
#else
#define pcpu_task_pin()		migrate_disable()
#define pcpu_task_unpin()	migrate_enable()
#endif
/*
 * Generic helper: pin the task, look up the per-cpu variable behind @ptr
 * and take the spinlock embedded at @member.  Evaluates to the locked
 * per-cpu pointer; pair with the matching unlock helper.
 */
#define pcpu_spin_lock(type, member, ptr)				\
({									\
	type *_ret;							\
	pcpu_task_pin();						\
	_ret = this_cpu_ptr(ptr);					\
	spin_lock(&_ret->member);					\
	_ret;								\
})
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);	/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif
/* * results with 256, 32 in the lowmem_reserve sysctl: * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) * 1G machine -> (16M dma, 784M normal, 224M high) * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA * * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation
*/ staticint sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA
[ZONE_DMA] = 256, #endif #ifdef CONFIG_ZONE_DMA32
[ZONE_DMA32] = 256, #endif
[ZONE_NORMAL] = 32, #ifdef CONFIG_HIGHMEM
[ZONE_HIGHMEM] = 0, #endif
[ZONE_MOVABLE] = 0,
};
staticbool page_contains_unaccepted(struct page *page, unsignedint order); staticbool cond_accept_memory(struct zone *zone, unsignedint order, int alloc_flags); staticbool __free_unaccepted(struct page *page);
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * During boot we initialize deferred pages on-demand, as needed, but once * page_alloc_init_late() has finished, the deferred pages are all initialized, * and we can permanently disable that path.
*/
DEFINE_STATIC_KEY_TRUE(deferred_pages);
/* * deferred_grow_zone() is __init, but it is called from * get_page_from_freelist() during early boot until deferred_pages permanently * disables this call. This is why we have refdata wrapper to avoid warning, * and to ensure that the function body gets unloaded.
*/ staticbool __ref
_deferred_grow_zone(struct zone *zone, unsignedint order)
{ return deferred_grow_zone(zone, order);
} #else staticinlinebool deferred_pages_enabled(void)
{ returnfalse;
}
/**
 * __get_pfnblock_flags_mask - Return the requested group of flags for
 * a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
static unsigned long __get_pfnblock_flags_mask(const struct page *page,
					       unsigned long pfn,
					       unsigned long mask)
{
	unsigned long *bitmap_word;
	unsigned long bitidx;
	unsigned long word;

	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
	/*
	 * This races, without locks, with set_pfnblock_migratetype(). Ensure
	 * a consistent read of the memory array, so that results, even though
	 * racy, are not corrupted.
	 */
	word = READ_ONCE(*bitmap_word);
	return (word >> bitidx) & mask;
}
/*
 * NOTE(review): the two definitions below appear truncated by a bad merge —
 * get_pfnblock_bit() is missing its bitmap lookup, bit test and closing
 * brace, and get_pfnblock_migratetype() has no body at all.  Restore from
 * the canonical source before building; left byte-identical here.
 */
/** * get_pfnblock_bit - Check if a standalone bit of a pageblock is set * @page: The page within the block of interest * @pfn: The target page frame number * @pb_bit: pageblock bit to check * * Return: true if the bit is set, otherwise false
*/ bool get_pfnblock_bit(conststruct page *page, unsignedlong pfn, enum pageblock_bits pb_bit)
{ unsignedlong *bitmap_word; unsignedlong bitidx;
if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) returnfalse;
/* NOTE(review): body truncated here — lookup, test_bit and '}' missing. */
/** * get_pfnblock_migratetype - Return the migratetype of a pageblock * @page: The page within the block of interest * @pfn: The target page frame number * * Return: The migratetype of the pageblock * * Use get_pfnblock_migratetype() if caller already has both @page and @pfn * to save a call to page_to_pfn().
*/
__always_inline enum migratetype
get_pfnblock_migratetype(conststruct page *page, unsignedlong pfn)
{ unsignedlong mask = MIGRATETYPE_AND_ISO_MASK; unsignedlong flags;
/* NOTE(review): body truncated — the flags read and return are missing. */
/**
 * __set_pfnblock_flags_mask - Set the requested group of flags for
 * a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @flags: The flags to set
 * @mask: mask of bits that the caller is interested in
 */
static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
				      unsigned long flags, unsigned long mask)
{
	unsigned long *bitmap_word;
	unsigned long bitidx;
	unsigned long word;

	/*
	 * The bitmap lookup was missing in the copied text, leaving
	 * bitmap_word/bitidx uninitialized (undefined behaviour); restored
	 * to mirror __get_pfnblock_flags_mask() — confirm against upstream.
	 */
	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);

	/* Position the bits within the word, mirroring the read side. */
	mask <<= bitidx;
	flags <<= bitidx;

	/* Lockless RMW: retry until the updated word is swapped in intact. */
	word = READ_ONCE(*bitmap_word);
	do {
	} while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags));
}
/*
 * NOTE(review): both definitions below are truncated — each ends after the
 * WARN_ON_ONCE guard; the bitmap lookup, the set/clear operation and the
 * closing brace are missing.  Restore from the canonical source; text left
 * byte-identical here.
 */
/** * set_pfnblock_bit - Set a standalone bit of a pageblock * @page: The page within the block of interest * @pfn: The target page frame number * @pb_bit: pageblock bit to set
*/ void set_pfnblock_bit(conststruct page *page, unsignedlong pfn, enum pageblock_bits pb_bit)
{ unsignedlong *bitmap_word; unsignedlong bitidx;
if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) return;
/* NOTE(review): truncated — bitmap lookup, __set_bit() and '}' missing. */
/** * clear_pfnblock_bit - Clear a standalone bit of a pageblock * @page: The page within the block of interest * @pfn: The target page frame number * @pb_bit: pageblock bit to clear
*/ void clear_pfnblock_bit(conststruct page *page, unsignedlong pfn, enum pageblock_bits pb_bit)
{ unsignedlong *bitmap_word; unsignedlong bitidx;
if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) return;
/* NOTE(review): truncated — bitmap lookup, __clear_bit() and '}' missing. */
/** * set_pageblock_migratetype - Set the migratetype of a pageblock * @page: The page within the block of interest * @migratetype: migratetype to set
*/ staticvoid set_pageblock_migratetype(struct page *page, enum migratetype migratetype)
{ if (unlikely(page_group_by_mobility_disabled &&
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
#ifdef CONFIG_MEMORY_ISOLATION if (migratetype == MIGRATE_ISOLATE) {
VM_WARN_ONCE(1, "Use set_pageblock_isolate() for pageblock isolation"); return;
}
VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page),
PB_migrate_isolate), "Use clear_pageblock_isolate() to unisolate pageblock"); /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */ #endif
__set_pfnblock_flags_mask(page, page_to_pfn(page),
(unsignedlong)migratetype,
MIGRATETYPE_AND_ISO_MASK);
}
/*
 * NOTE(review): this region is damaged — bad_range() below loses its final
 * 'return false;' and closing brace, and the body that follows (the
 * rate-limited "Bad page state" reporter) belongs to bad_page(), whose
 * signature and local declarations (nr_shown, nr_unshown, resume, reason)
 * are missing.  Restore from the canonical source; text left byte-identical.
 */
/* * Temporary debugging check for pages not lying within a given zone.
*/ staticbool __maybe_unused bad_range(struct zone *zone, struct page *page)
{ if (page_outside_zone_boundaries(zone, page)) returntrue; if (zone != page_zone(page)) returntrue;
/* NOTE(review): bad_page()'s signature should appear here. */
/* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second.
*/ if (nr_shown == 60) { if (time_before(jiffies, resume)) {
nr_unshown++; goto out;
} if (nr_unshown) {
pr_alert( "BUG: Bad page state: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
}
nr_shown = 0;
} if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
dump_page(page, reason);
print_modules();
dump_stack();
out: /* Leave bad fields for debug, except PageBuddy could make trouble */ if (PageBuddy(page))
__ClearPageBuddy(page);
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
/*
 * NOTE(review): order_to_pindex() has lost its entire body in this copy —
 * only the signature and opening brace remain.  It should map (migratetype,
 * order) to a PCP list index (the inverse of pindex_to_order() below);
 * restore from the canonical source.
 */
staticinlineunsignedint order_to_pindex(int migratetype, int order)
{
staticinlineint pindex_to_order(unsignedint pindex)
{ int order = pindex / MIGRATE_PCPTYPES;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pindex >= NR_LOWORDER_PCP_LISTS)
order = HPAGE_PMD_ORDER; #else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); #endif
return order;
}
staticinlinebool pcp_allowed_order(unsignedint order)
{ if (order <= PAGE_ALLOC_COSTLY_ORDER) returntrue; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order == HPAGE_PMD_ORDER) returntrue; #endif returnfalse;
}
/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and have PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of bits is pointer to head page.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */
void prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++)
		prep_compound_tail(page, i);

	/*
	 * NOTE(review): the head setup and closing brace were lost in this
	 * copy; restored from the canonical upstream — confirm against tree.
	 */
	prep_compound_head(page, order);
}
staticinlinebool
compaction_capture(struct capture_control *capc, struct page *page, int order, int migratetype)
{ if (!capc || order != capc->cc->order) returnfalse;
/* Do not accidentally pollute CMA or isolated regions*/ if (is_migrate_cma(migratetype) ||
is_migrate_isolate(migratetype)) returnfalse;
/* * Do not let lower order allocations pollute a movable pageblock * unless compaction is also requesting movable pages. * This might let an unmovable request use a reclaimable pageblock * and vice-versa but no more than normal fallback logic which can * have trouble finding a high-order free page.
*/ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE &&
capc->cc->migratetype != MIGRATE_MOVABLE) returnfalse;
if (migratetype != capc->cc->migratetype)
trace_mm_page_alloc_extfrag(page, capc->cc->order, order,
capc->cc->migratetype, migratetype);
capc->page = page; returntrue;
}
#else staticinlinestruct capture_control *task_capc(struct zone *zone)
{ return NULL;
}
staticinlinebool
compaction_capture(struct capture_control *capc, struct page *page, int order, int migratetype)
{ returnfalse;
} #endif/* CONFIG_COMPACTION */
/*
 * NOTE(review): account_freepages() is truncated in this copy — only the
 * lockdep assertion survives; the vmstat/CMA accounting and closing brace
 * are missing.  Restore from the canonical source.
 */
staticinlinevoid account_freepages(struct zone *zone, int nr_pages, int migratetype)
{
lockdep_assert_held(&zone->lock);
/* Used for pages not on another list */
static inline void __add_to_free_list(struct page *page, struct zone *zone,
				      unsigned int order, int migratetype,
				      bool tail)
{
	struct free_area *area = &zone->free_area[order];
	int nr_pages = 1 << order;

	VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
		     "page type is %d, passed migratetype is %d (nr=%d)\n",
		     get_pageblock_migratetype(page), migratetype, nr_pages);

	if (tail)
		list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
	else
		list_add(&page->buddy_list, &area->free_list[migratetype]);
	area->nr_free++;

	/* Whole-pageblock (or larger) free pages are tracked separately. */
	if (order >= pageblock_order && !is_migrate_isolate(migratetype))
		__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
}
/*
 * NOTE(review): move_to_free_list() is truncated in this copy — the actual
 * list move (list_move_tail to the new_mt freelist) is missing between the
 * warning and the NR_FREE_PAGES_BLOCKS adjustment.  Restore from the
 * canonical source; text left byte-identical.
 */
/* * Used for pages which are on another list. Move the pages to the tail * of the list - so the moved pages won't immediately be considered for * allocation again (e.g., optimization for memory onlining).
*/ staticinlinevoid move_to_free_list(struct page *page, struct zone *zone, unsignedint order, int old_mt, int new_mt)
{ struct free_area *area = &zone->free_area[order]; int nr_pages = 1 << order;
/* Free page moving can fail, so it happens before the type update */
VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, "page type is %d, passed migratetype is %d (nr=%d)\n",
get_pageblock_migratetype(page), old_mt, nr_pages);
/* NOTE(review): the list_move_tail() call should appear here. */
if (order >= pageblock_order &&
is_migrate_isolate(old_mt) != is_migrate_isolate(new_mt)) { if (!is_migrate_isolate(old_mt))
nr_pages = -nr_pages;
__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
}
}
/*
 * NOTE(review): __del_page_from_free_list() is truncated — the list_del,
 * buddy-flag clearing, nr_free decrement and closing brace are missing
 * after the reported-state handling.  Restore from the canonical source.
 */
staticinlinevoid __del_page_from_free_list(struct page *page, struct zone *zone, unsignedint order, int migratetype)
{ int nr_pages = 1 << order;
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, "page type is %d, passed migratetype is %d (nr=%d)\n",
get_pageblock_migratetype(page), migratetype, nr_pages);
/* clear reported state and update reported page count */ if (page_reported(page))
__ClearPageReported(page);
/* * If this is less than the 2nd largest possible page, check if the buddy * of the next-higher order is free. If it is, it's possible * that pages are being freed that will coalesce soon. In case, * that is happening, add the free page to the tail of the list * so it's less likely to be used soon and more likely to be merged * as a 2-level higher order page
*/ staticinlinebool
buddy_merge_likely(unsignedlong pfn, unsignedlong buddy_pfn, struct page *page, unsignedint order)
{ unsignedlong higher_page_pfn; struct page *higher_page;
return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
NULL) != NULL;
}
/*
 * NOTE(review): the code below is the interior of the buddy freeing
 * function (__free_one_page()) — its signature, local declarations (buddy,
 * buddy_pfn, buddy_mt, migratetype, fpi_flags, the merge loop head) and the
 * done_merging tail are missing from this copy.  Restore from the canonical
 * source; text left byte-identical.
 */
/* * Freeing function for a buddy system allocator. * * The concept of a buddy system is to maintain direct-mapped table * (containing bit values) for memory blocks of various "orders". * The bottom level table contains the map for the smallest allocatable * units of memory (here, pages), and each level above it describes * pairs of units from the levels below, hence, "buddies". * At a high level, all that happens here is marking the table entry * at the bottom level available, and propagating the changes upward * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous * free pages of length of (1 << order) and marked with PageBuddy. * Page's order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. * If a block is freed, and its buddy is also free, then this * triggers coalescing into a block of larger size. * * -- nyc
*/
buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn); if (!buddy) goto done_merging;
if (unlikely(order >= pageblock_order)) { /* * We want to prevent merge between freepages on pageblock * without fallbacks and normal pageblock. Without this, * pageblock isolation could cause incorrect freepage or CMA * accounting or HIGHATOMIC accounting.
*/
buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
/* * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order.
*/ if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order); else
__del_page_from_free_list(buddy, zone, order, buddy_mt);
if (unlikely(buddy_mt != migratetype)) { /* * Match buddy type. This ensures that an * expand() down the line puts the sub-blocks * on the right freelists.
*/
set_pageblock_migratetype(buddy, migratetype);
}
/* Notify page reporting subsystem of freed page */ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
}
/*
 * NOTE(review): page_expected_state() is truncated — only the mapcount
 * check survives; the memcg/flags checks, final return and closing brace
 * are missing.  Restore from the canonical source.
 */
/* * A bad page could be due to a number of fields. Instead of multiple branches, * try and check multiple fields with one check. The caller must do a detailed * check if necessary.
*/ staticinlinebool page_expected_state(struct page *page, unsignedlong check_flags)
{ if (unlikely(atomic_read(&page->_mapcount) != -1)) returnfalse;
staticint free_tail_page_prepare(struct page *head_page, struct page *page)
{ struct folio *folio = (struct folio *)head_page; int ret = 1;
/* * We rely page->lru.next never has bit 0 set, unless the page * is PageTail(). Let's make sure that's true even for poisoned ->lru.
*/
BUILD_BUG_ON((unsignedlong)LIST_POISON1 & 1);
if (!is_check_pages_enabled()) {
ret = 0; goto out;
} switch (page - head_page) { case 1: /* the first tail page: these may be in place of ->mapping */ if (unlikely(folio_large_mapcount(folio))) {
bad_page(page, "nonzero large_mapcount"); goto out;
} if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT) &&
unlikely(atomic_read(&folio->_nr_pages_mapped))) {
bad_page(page, "nonzero nr_pages_mapped"); goto out;
} if (IS_ENABLED(CONFIG_MM_ID)) { if (unlikely(folio->_mm_id_mapcount[0] != -1)) {
bad_page(page, "nonzero mm mapcount 0"); goto out;
} if (unlikely(folio->_mm_id_mapcount[1] != -1)) {
bad_page(page, "nonzero mm mapcount 1"); goto out;
}
} if (IS_ENABLED(CONFIG_64BIT)) { if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
bad_page(page, "nonzero entire_mapcount"); goto out;
} if (unlikely(atomic_read(&folio->_pincount))) {
bad_page(page, "nonzero pincount"); goto out;
}
} break; case 2: /* the second tail page: deferred_list overlaps ->mapping */ if (unlikely(!list_empty(&folio->_deferred_list))) {
bad_page(page, "on deferred list"); goto out;
} if (!IS_ENABLED(CONFIG_64BIT)) { if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
bad_page(page, "nonzero entire_mapcount"); goto out;
} if (unlikely(atomic_read(&folio->_pincount))) {
bad_page(page, "nonzero pincount"); goto out;
}
} break; case 3: /* the third tail page: hugetlb specifics overlap ->mappings */ if (IS_ENABLED(CONFIG_HUGETLB_PAGE)) break;
fallthrough; default: if (page->mapping != TAIL_MAPPING) {
bad_page(page, "corrupted mapping in tail page"); goto out;
} break;
} if (unlikely(!PageTail(page))) {
bad_page(page, "PageTail not set"); goto out;
} if (unlikely(compound_head(page) != head_page)) {
bad_page(page, "compound_head not consistent"); goto out;
}
ret = 0;
out:
page->mapping = NULL;
clear_compound_head(page); return ret;
}
/*
 * NOTE(review): should_skip_kasan_poison() is truncated — the tag-based
 * KASAN branch (page_kasan_tag() match-all check), final return and closing
 * brace are missing.  Restore from the canonical source.
 */
/* * Skip KASAN memory poisoning when either: * * 1. For generic KASAN: deferred memory initialization has not yet completed. * Tag-based KASAN modes skip pages freed via deferred memory initialization * using page tags instead (see below). * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating * that error detection is disabled for accesses via the page address. * * Pages will have match-all tags in the following circumstances: * * 1. Pages are being initialized for the first time, including during deferred * memory init; see the call to page_kasan_tag_reset in __init_single_page. * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the * exception of pages unpoisoned by kasan_unpoison_vmalloc. * 3. The allocation was excluded from being checked due to sampling, * see the call to kasan_unpoison_pages. * * Poisoning pages during deferred memory init will greatly lengthen the * process and cause problem in large memory systems as the deferred pages * initialization is done with interrupt disabled. * * Assuming that there will be no reference to those newly initialized * pages before they are ever allocated, this should have no effect on * KASAN memory tracking as the poison will be properly inserted at page * allocation time. The only corner case is when pages are allocated by * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen.
*/ staticinlinebool should_skip_kasan_poison(struct page *page)
{ if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return deferred_pages_enabled();
/* Zero @numpages pages starting at @page, bypassing KASAN tag checking. */
static void kernel_init_pages(struct page *page, int numpages)
{
	int i;

	/* s390's use of memset() could override KASAN redzones. */
	kasan_disable_current();
	for (i = 0; i < numpages; i++)
		clear_highpage_kasan_tagged(page + i);
	kasan_enable_current();
}
/*
 * NOTE(review): the three allocation-profiling helpers below are truncated —
 * each retains only its signature and local declarations; the tag
 * get/put/update logic and closing braces are missing.  Restore from the
 * canonical source; text left byte-identical.
 */
#ifdef CONFIG_MEM_ALLOC_PROFILING
/* Should be called only if mem_alloc_profiling_enabled() */ void __clear_page_tag_ref(struct page *page)
{ union pgtag_ref_handle handle; union codetag_ref ref;
/* Should be called only if mem_alloc_profiling_enabled() */ static noinline void __pgalloc_tag_add(struct page *page, struct task_struct *task, unsignedint nr)
{ union pgtag_ref_handle handle; union codetag_ref ref;
/* Should be called only if mem_alloc_profiling_enabled() */ static noinline void __pgalloc_tag_sub(struct page *page, unsignedint nr)
{ union pgtag_ref_handle handle; union codetag_ref ref;
/* When tag is not NULL, assuming mem_alloc_profiling_enabled */ staticinlinevoid pgalloc_tag_sub_pages(struct alloc_tag *tag, unsignedint nr)
{ if (tag)
this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
}
/*
 * NOTE(review): the code below is the interior of free_pages_prepare() —
 * its signature and the earlier checks (locals: folio, order, fpi_flags,
 * skip_kasan_poison, init) are missing from this copy.  Restore from the
 * canonical source; text left byte-identical.
 */
if (memcg_kmem_online() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
/* * In rare cases, when truncation or holepunching raced with * munlock after VM_LOCKED was cleared, Mlocked may still be * found set here. This does not indicate a problem, unless * "unevictable_pgs_cleared" appears worryingly large.
*/ if (unlikely(folio_test_mlocked(folio))) { long nr_pages = folio_nr_pages(folio);
if (unlikely(PageHWPoison(page)) && !order) { /* Do not let hwpoison pages hit pcplists/buddy */
reset_page_owner(page, order);
page_table_check_free(page, order);
pgalloc_tag_sub(page, 1 << order);
/* * The page is isolated and accounted for. * Mark the codetag as empty to avoid accounting error * when the page is freed by unpoison_memory().
*/
clear_page_tag_ref(page); returnfalse;
}
/* * As memory initialization might be integrated into KASAN, * KASAN poisoning and memory initialization code must be * kept together to avoid discrepancies in behavior. * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page.
*/ if (!skip_kasan_poison) {
kasan_poison_pages(page, order, init);
/* Memory is already initialized if KASAN did it internally. */ if (kasan_has_integrated_init())
init = false;
} if (init)
kernel_init_pages(page, 1 << order);
/* * arch_free_page() can make the page's contents inaccessible. s390 * does this. So nothing which can access the page's contents should * happen after this.
*/
arch_free_page(page, order);
debug_pagealloc_unmap_pages(page, 1 << order);
returntrue;
}
/*
 * NOTE(review): free_pcppages_bulk() is truncated — the zone-lock
 * acquisition and the inner do/while body that actually pops pages and
 * frees them (plus the function's tail) are missing after the last visible
 * line.  Restore from the canonical source; text left byte-identical.
 */
/* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone. * count is the number of pages to free.
*/ staticvoid free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp, int pindex)
{ unsignedlong flags; unsignedint order; struct page *page;
/* * Ensure proper count is passed which otherwise would stuck in the * below while (list_empty(list)) loop.
*/
count = min(pcp->count, count);
while (count > 0) { struct list_head *list; int nr_pages;
/* Remove pages from lists in a round-robin fashion. */ do { if (++pindex > NR_PCP_LISTS - 1)
pindex = 0;
list = &pcp->lists[pindex];
} while (list_empty(list));
order = pindex_to_order(pindex);
nr_pages = 1 << order; do { unsignedlong pfn; int mt;
staticvoid add_page_to_zone_llist(struct zone *zone, struct page *page, unsignedint order)
{ /* Remember the order */
page->order = order; /* Add the page to the free list */
llist_add(&page->pcp_llist, &zone->trylock_free_pages);
}
/*
 * NOTE(review): the code below is the interior of __free_pages_core() — its
 * signature and surrounding logic (locals: p, loop, nr_pages, context;
 * the non-hotplug branch) are missing from this copy.  Restore from the
 * canonical source; text left byte-identical.
 */
/* * When initializing the memmap, __init_single_page() sets the refcount * of all pages to 1 ("allocated"/"not free"). We have to set the * refcount of all involved pages to 0. * * Note that hotplugged memory pages are initialized to PageOffline(). * Pages freed from memblock might be marked as reserved.
*/ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) &&
unlikely(context == MEMINIT_HOTPLUG)) { for (loop = 0; loop < nr_pages; loop++, p++) {
VM_WARN_ON_ONCE(PageReserved(p));
__ClearPageOffline(p);
set_page_count(p, 0);
}
/* * Bypass PCP and place fresh pages right to the tail, primarily * relevant for memory onlining.
*/
__free_pages_ok(page, order, FPI_TO_TAIL);
}
/* * Check that the whole (or subset of) a pageblock given by the interval of * [start_pfn, end_pfn) is valid and within the same zone, before scanning it * with the migration of free compaction scanner. * * Return struct page pointer of start_pfn, or NULL if checks were not passed. * * It's possible on some configurations to have a setup like node0 node1 node0 * i.e. it's possible that all pages within a zones range of pages do not * belong to a single zone. We assume that a border between node0 and node1 * can occur within a single pageblock, but not a node0 node1 node0 * interleaving within a single pageblock. It is therefore sufficient to check * the first and last page of a pageblock and avoid checking each individual * page in a pageblock. * * Note: the function may return non-NULL struct page even for a page block * which contains a memory hole (i.e. there is no physical memory for a subset * of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, which * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole * even though the start pfn is online and valid. This should be safe most of * the time because struct pages are still initialized via init_unavailable_range() * and pfn walkers shouldn't touch any physical memory range for which they do * not recognize any specific metadata in struct pages.
*/ struct page *__pageblock_pfn_to_page(unsignedlong start_pfn, unsignedlong end_pfn, struct zone *zone)
{ struct page *start_page; struct page *end_page;
/* end_pfn is one past the range we are checking */
end_pfn--;
if (!pfn_valid(end_pfn)) return NULL;
start_page = pfn_to_online_page(start_pfn); if (!start_page) return NULL;
if (page_zone(start_page) != zone) return NULL;
end_page = pfn_to_page(end_pfn);
/* This gives a shorter code than deriving page_zone(end_page) */ if (page_zone_id(start_page) != page_zone_id(end_page)) return NULL;
return start_page;
}
/*
 * NOTE(review): expand() is truncated — the splitting loop that halves
 * @size, the __add_to_free_list() calls, the return of nr_added and the
 * closing brace are missing.  Restore from the canonical source.
 */
/* * The order of subdivision here is critical for the IO subsystem. * Please do not alter this order without good reasons and regression * testing. Specifically, as large blocks of memory are subdivided, * the order in which smaller blocks are delivered depends on the order * they're subdivided in this function. This is the primary factor * influencing the order in which pages are delivered to the IO * subsystem according to empirical testing, and this is also justified * by considering the behavior of a buddy system containing a single * large block of memory acted on by a series of small allocations. * This behavior is a critical factor in sglist merging's success. * * -- nyc
*/ staticinlineunsignedint expand(struct zone *zone, struct page *page, int low, int high, int migratetype)
{ unsignedint size = 1 << high; unsignedint nr_added = 0;
/* * Mark as guard pages (or page), that will allow to * merge back to allocator when buddy will be freed. * Corresponding page table entries will not be touched, * pages will stay not present in virtual address space
*/ if (set_page_guard(zone, &page[size], high)) continue;
/* * This page is about to be returned from the page allocator
*/ staticbool check_new_page(struct page *page)
{ if (likely(page_expected_state(page,
PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) returnfalse;
check_new_page_bad(page); returntrue;
}
staticinlinebool check_new_pages(struct page *page, unsignedint order)
{ if (is_check_pages_enabled()) { for (int i = 0; i < (1 << order); i++) { struct page *p = page + i;
if (check_new_page(p)) returntrue;
}
}
returnfalse;
}
staticinlinebool should_skip_kasan_unpoison(gfp_t flags)
{ /* Don't skip if a software KASAN mode is enabled. */ if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
IS_ENABLED(CONFIG_KASAN_SW_TAGS)) returnfalse;
/* Skip, if hardware tag-based KASAN is not enabled. */ if (!kasan_hw_tags_enabled()) returntrue;
/* * With hardware tag-based KASAN enabled, skip if this has been * requested via __GFP_SKIP_KASAN.
*/ return flags & __GFP_SKIP_KASAN;
}
staticinlinebool should_skip_init(gfp_t flags)
{ /* Don't skip, if hardware tag-based KASAN is not enabled. */ if (!kasan_hw_tags_enabled()) returnfalse;
/* For hardware tag-based KASAN, skip if requested. */ return (flags & __GFP_SKIP_ZERO);
}
/*
 * NOTE(review): the code below is the interior of post_alloc_hook() — its
 * signature and earlier setup (locals: init, zero_tags, i, gfp_flags,
 * alloc_flags) are missing from this copy.  Restore from the canonical
 * source; text left byte-identical.
 */
/* * Page unpoisoning must happen before memory initialization. * Otherwise, the poison pattern will be overwritten for __GFP_ZERO * allocations and the page unpoisoning code will complain.
*/
kernel_unpoison_pages(page, 1 << order);
/* * As memory initialization might be integrated into KASAN, * KASAN unpoisoning and memory initializion code must be * kept together to avoid discrepancies in behavior.
*/
/* * If memory tags should be zeroed * (which happens only when memory should be initialized as well).
*/ if (zero_tags) { /* Initialize both memory and memory tags. */ for (i = 0; i != 1 << order; ++i)
tag_clear_highpage(page + i);
/* Take note that memory was initialized by the loop above. */
init = false;
} if (!should_skip_kasan_unpoison(gfp_flags) &&
kasan_unpoison_pages(page, order, init)) { /* Take note that memory was initialized by KASAN. */ if (kasan_has_integrated_init())
init = false;
} else { /* * If memory tags have not been set by KASAN, reset the page * tags to ensure page_address() dereferencing does not fault.
*/ for (i = 0; i != 1 << order; ++i)
page_kasan_tag_reset(page + i);
} /* If memory is still not initialized, initialize it now. */ if (init)
kernel_init_pages(page, 1 << order);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
/* * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking * steps that will free more memory. The caller should avoid the page * being used for !PFMEMALLOC purposes.
*/ if (alloc_flags & ALLOC_NO_WATERMARKS)
set_page_pfmemalloc(page); else
clear_page_pfmemalloc(page);
}
/*
 * NOTE(review): __rmqueue_smallest() is truncated — the deletion of the
 * page from the freelist, the expand() call, the prep/return tail and the
 * closing brace are missing after the visible loop head.  Restore from the
 * canonical source.
 */
/* * Go through the free lists for the given migratetype and remove * the smallest available page from the freelists
*/ static __always_inline struct page *__rmqueue_smallest(struct zone *zone, unsignedint order, int migratetype)
{ unsignedint current_order; struct free_area *area; struct page *page;
/* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype); if (!page) continue;
/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 *
 * The other migratetypes do not have fallbacks.
 */
static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTYPES - 1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE   },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE   },
};
/*
 * NOTE(review): __move_freepages_block() is truncated — end_pfn is never
 * computed, and the code that actually moves each buddy (order read,
 * move_to_free_list(), pfn/pages_moved advance) plus the return and closing
 * brace are missing.  Restore from the canonical source.
 */
/* * Move all free pages of a block to new type's freelist. Caller needs to * change the block type.
*/ staticint __move_freepages_block(struct zone *zone, unsignedlong start_pfn, int old_mt, int new_mt)
{ struct page *page; unsignedlong pfn, end_pfn; unsignedint order; int pages_moved = 0;
for (pfn = start_pfn; pfn < end_pfn;) {
page = pfn_to_page(pfn); if (!PageBuddy(page)) {
pfn++; continue;
}
/* Make sure we are not inadvertently changing nodes */
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
staticbool prep_move_freepages_block(struct zone *zone, struct page *page, unsignedlong *start_pfn, int *num_free, int *num_movable)
{ unsignedlong pfn, start, end;
pfn = page_to_pfn(page);
start = pageblock_start_pfn(pfn);
end = pageblock_end_pfn(pfn);
/* * The caller only has the lock for @zone, don't touch ranges * that straddle into other zones. While we could move part of * the range that's inside the zone, this call is usually * accompanied by other operations such as migratetype updates * which also should be locked.
*/ if (!zone_spans_pfn(zone, start)) returnfalse; if (!zone_spans_pfn(zone, end - 1)) returnfalse;
*start_pfn = start;
if (num_free) {
*num_free = 0;
*num_movable = 0; for (pfn = start; pfn < end;) {
page = pfn_to_page(pfn); if (PageBuddy(page)) { int nr = 1 << buddy_order(page);
*num_free += nr;
pfn += nr; continue;
} /* * We assume that pages that could be isolated for * migration are movable. But we don't actually try * isolating, as that would be expensive.
*/ if (PageLRU(page) || page_has_movable_ops(page))
(*num_movable)++;
pfn++;
}
}
returntrue;
}
/*
 * Move every free page in the pageblock containing @page from the @old_mt
 * freelists to the @new_mt ones and update the block's migratetype.
 * Returns the number of base pages moved, or -1 if the block could not be
 * safely prepared (e.g. it straddles a zone boundary).
 */
static int move_freepages_block(struct zone *zone, struct page *page,
				int old_mt, int new_mt)
{
	unsigned long start_pfn;
	int res;

	if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
		return -1;

	res = __move_freepages_block(zone, start_pfn, old_mt, new_mt);
	set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);

	return res;
}
#ifdef CONFIG_MEMORY_ISOLATION /* Look for a buddy that straddles start_pfn */ staticunsignedlong find_large_buddy(unsignedlong start_pfn)
{ int order = 0; struct page *page; unsignedlong pfn = start_pfn;
while (!PageBuddy(page = pfn_to_page(pfn))) { /* Nothing found */ if (++order > MAX_PAGE_ORDER) return start_pfn;
pfn &= ~0UL << order;
}
/* * Found a preceding buddy, but does it straddle?
*/ if (pfn + (1 << buddy_order(page)) > start_pfn) return pfn;
/** * __move_freepages_block_isolate - move free pages in block for page isolation * @zone: the zone * @page: the pageblock page * @isolate: to isolate the given pageblock or unisolate it * * This is similar to move_freepages_block(), but handles the special * case encountered in page isolation, where the block of interest * might be part of a larger buddy spanning multiple pageblocks. * * Unlike the regular page allocator path, which moves pages while * stealing buddies off the freelist, page isolation is interested in * arbitrary pfn ranges that may have overlapping buddies on both ends. * * This function handles that. Straddling buddies are split into * individual pageblocks. Only the block of interest is moved. * * Returns %true if pages could be moved, %false otherwise.
*/ staticbool __move_freepages_block_isolate(struct zone *zone, struct page *page, bool isolate)
{ unsignedlong start_pfn, pfn; int from_mt; int to_mt;
if (isolate == get_pageblock_isolate(page)) {
VM_WARN_ONCE(1, "%s a pageblock that is already in that state",
isolate ? "Isolate" : "Unisolate"); returnfalse;
}
if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) returnfalse;
/* No splits needed if buddies can't span multiple blocks */ if (pageblock_order == MAX_PAGE_ORDER) goto move;
/* We're a tail block in a larger buddy */
pfn = find_large_buddy(start_pfn); if (pfn != start_pfn) { struct page *buddy = pfn_to_page(pfn); int order = buddy_order(buddy);
/*
 * Set @migratetype on every pageblock covered by a free page of
 * @start_order. A buddy of order > pageblock_order spans several
 * pageblocks, all of which must be retagged consistently.
 */
static void change_pageblock_range(struct page *pageblock_page,
				   int start_order, int migratetype)
{
	int nr_pageblocks = 1 << (start_order - pageblock_order);

	while (nr_pageblocks--) {
		set_pageblock_migratetype(pageblock_page, migratetype);
		pageblock_page += pageblock_nr_pages;
	}
}
staticinlinebool boost_watermark(struct zone *zone)
{ unsignedlong max_boost;
if (!watermark_boost_factor) returnfalse; /* * Don't bother in zones that are unlikely to produce results. * On small machines, including kdump capture kernels running * in a small area, boosting the watermark can cause an out of * memory situation immediately.
*/ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) returnfalse;
/* * high watermark may be uninitialised if fragmentation occurs * very early in boot so do not boost. We do not fall * through and boost by pageblock_nr_pages as failing * allocations that early means that reclaim is not going * to help and it may even be impossible to reclaim the * boosted watermark resulting in a hang.
*/ if (!max_boost) returnfalse;
/* * When we are falling back to another migratetype during allocation, should we * try to claim an entire block to satisfy further allocations, instead of * polluting multiple pageblocks?
*/ staticbool should_try_claim_block(unsignedint order, int start_mt)
{ /* * Leaving this order check is intended, although there is * relaxed order check in next check. The reason is that * we can actually claim the whole pageblock if this condition met, * but, below check doesn't guarantee it and that is just heuristic * so could be changed anytime.
*/ if (order >= pageblock_order) returntrue;
/* * Above a certain threshold, always try to claim, as it's likely there * will be more free pages in the pageblock.
*/ if (order >= pageblock_order / 2) returntrue;
/* * Unmovable/reclaimable allocations would cause permanent * fragmentations if they fell back to allocating from a movable block * (polluting it), so we try to claim the whole block regardless of the * allocation size. Later movable allocations can always steal from this * block, which is less problematic.
*/ if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE) returntrue;
if (page_group_by_mobility_disabled) returntrue;
/* * Movable pages won't cause permanent fragmentation, so when you alloc * small pages, we just need to temporarily steal unmovable or * reclaimable pages that are closest to the request size. After a * while, memory compaction may occur to form large contiguous pages, * and the next movable allocation may not need to steal.
*/ returnfalse;
}
/*
 * Check whether there is a suitable fallback freepage with requested order.
 * If claimable is true, this function returns fallback_mt only if
 * we would do this whole-block claiming. This would help to reduce
 * fragmentation due to mixed migratetype pages in one pageblock.
 *
 * Returns the fallback migratetype on success, -1 if no fallback freelist
 * has free pages, or -2 if @claimable is set but whole-block claiming is
 * not worthwhile for this order/migratetype.
 */
int find_suitable_fallback(struct free_area *area, unsigned int order,
			   int migratetype, bool claimable)
{
	int i;

	if (claimable && !should_try_claim_block(order, migratetype))
		return -2;

	if (area->nr_free == 0)
		return -1;

	/* Walk the fallback order for this migratetype, most preferred first */
	for (i = 0; i < MIGRATE_PCPTYPES - 1; i++) {
		int fallback_mt = fallbacks[migratetype][i];

		if (!free_area_empty(area, fallback_mt))
			return fallback_mt;
	}

	return -1;
}
/* * This function implements actual block claiming behaviour. If order is large * enough, we can claim the whole pageblock for the requested migratetype. If * not, we check the pageblock for constituent pages; if at least half of the * pages are free or compatible, we can still claim the whole block, so pages * freed in the future will be put on the correct free list.
*/ staticstruct page *
try_to_claim_block(struct zone *zone, struct page *page, int current_order, int order, int start_type, int block_type, unsignedint alloc_flags)
{ int free_pages, movable_pages, alike_pages; unsignedlong start_pfn;
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.27 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.