// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;

/* No special request */
#define FPI_NONE		((__force fpi_t)0)

/*
 * Skip free page reporting notification for the (possibly merged) page.
 * This does not hinder free page reporting from grabbing the page,
 * reporting it and marking it "reported" - it only skips notifying
 * the free page reporting infrastructure about a newly freed page. For
 * example, used when temporarily pulling a page from a freelist and
 * putting it back unmodified.
 */
#define FPI_SKIP_REPORT_NOTIFY	((__force fpi_t)BIT(0))

/*
 * Place the (possibly merged) page to the tail of the freelist. Will ignore
 * page shuffling (relevant code - e.g., memory onlining - is expected to
 * shuffle the whole zone).
 *
 * Note: No code should rely on this flag for correctness - it's purely
 *       to allow for optimizations when handing back either fresh pages
 *       (memory onlining) or untouched pages (page isolation, free page
 *       reporting).
 */
#define FPI_TO_TAIL		((__force fpi_t)BIT(1))

/* Free the page without taking locks. Rely on trylock only. */
#define FPI_TRYLOCK		((__force fpi_t)BIT(2))
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)

#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/*
 * On SMP, spin_trylock is sufficient protection.
 * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
 */
#define pcp_trylock_prepare(flags)	do { } while (0)
#define pcp_trylock_finish(flag)	do { } while (0)
#else

/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
#define pcp_trylock_prepare(flags)	local_irq_save(flags)
#define pcp_trylock_finish(flags)	local_irq_restore(flags)
#endif

/*
 * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
 * a migration causing the wrong PCP to be locked and remote memory being
 * potentially allocated, pin the task to the CPU for the lookup+lock.
 * preempt_disable is used on !RT because it is faster than migrate_disable.
 * migrate_disable is used on RT because otherwise RT spinlock usage is
 * interfered with and a high priority task cannot preempt the allocator.
 */
#ifndef CONFIG_PREEMPT_RT
#define pcpu_task_pin()		preempt_disable()
#define pcpu_task_unpin()	preempt_enable()
#else
#define pcpu_task_pin()		migrate_disable()
#define pcpu_task_unpin()	migrate_enable()
#endif
/*
 * Generic helper: pin the task, look up the per-cpu variable behind @ptr
 * and take the spinlock embedded at @member.  Evaluates to the locked
 * per-cpu pointer; pair with the matching unlock helper.
 */
#define pcpu_spin_lock(type, member, ptr)				\
({									\
	type *_ret;							\
	pcpu_task_pin();						\
	_ret = this_cpu_ptr(ptr);					\
	spin_lock(&_ret->member);					\
	_ret;								\
})
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);	/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif
/* * results with 256, 32 in the lowmem_reserve sysctl: * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) * 1G machine -> (16M dma, 784M normal, 224M high) * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA * * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation
*/ staticint sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA
[ZONE_DMA] = 256, #endif #ifdef CONFIG_ZONE_DMA32
[ZONE_DMA32] = 256, #endif
[ZONE_NORMAL] = 32, #ifdef CONFIG_HIGHMEM
[ZONE_HIGHMEM] = 0, #endif
[ZONE_MOVABLE] = 0,
};
staticbool page_contains_unaccepted(struct page *page, unsignedint order); staticbool cond_accept_memory(struct zone *zone, unsignedint order, int alloc_flags); staticbool __free_unaccepted(struct page *page);
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * During boot we initialize deferred pages on-demand, as needed, but once * page_alloc_init_late() has finished, the deferred pages are all initialized, * and we can permanently disable that path.
*/
DEFINE_STATIC_KEY_TRUE(deferred_pages);
/* * deferred_grow_zone() is __init, but it is called from * get_page_from_freelist() during early boot until deferred_pages permanently * disables this call. This is why we have refdata wrapper to avoid warning, * and to ensure that the function body gets unloaded.
*/ staticbool __ref
_deferred_grow_zone(struct zone *zone, unsignedint order)
{ return deferred_grow_zone(zone, order);
} #else staticinlinebool deferred_pages_enabled(void)
{ returnfalse;
}
/**
 * __get_pfnblock_flags_mask - Return the requested group of flags for
 * a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
static unsigned long __get_pfnblock_flags_mask(const struct page *page,
					       unsigned long pfn,
					       unsigned long mask)
{
	unsigned long *bitmap_word;
	unsigned long bitidx;
	unsigned long word;

	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
	/*
	 * This races, without locks, with set_pfnblock_migratetype(). Ensure
	 * a consistent read of the memory array, so that results, even though
	 * racy, are not corrupted.
	 */
	word = READ_ONCE(*bitmap_word);
	return (word >> bitidx) & mask;
}
/*
 * NOTE(review): the two definitions below appear truncated by a bad merge —
 * get_pfnblock_bit() is missing its bitmap lookup, bit test and closing
 * brace, and get_pfnblock_migratetype() has no body at all.  Restore from
 * the canonical source before building; left byte-identical here.
 */
/** * get_pfnblock_bit - Check if a standalone bit of a pageblock is set * @page: The page within the block of interest * @pfn: The target page frame number * @pb_bit: pageblock bit to check * * Return: true if the bit is set, otherwise false
*/ bool get_pfnblock_bit(conststruct page *page, unsignedlong pfn, enum pageblock_bits pb_bit)
{ unsignedlong *bitmap_word; unsignedlong bitidx;
if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) returnfalse;
/* NOTE(review): body truncated here — lookup, test_bit and '}' missing. */
/** * get_pfnblock_migratetype - Return the migratetype of a pageblock * @page: The page within the block of interest * @pfn: The target page frame number * * Return: The migratetype of the pageblock * * Use get_pfnblock_migratetype() if caller already has both @page and @pfn * to save a call to page_to_pfn().
*/
__always_inline enum migratetype
get_pfnblock_migratetype(conststruct page *page, unsignedlong pfn)
{ unsignedlong mask = MIGRATETYPE_AND_ISO_MASK; unsignedlong flags;
/* NOTE(review): body truncated — the flags read and return are missing. */
/**
 * __set_pfnblock_flags_mask - Set the requested group of flags for
 * a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @flags: The flags to set
 * @mask: mask of bits that the caller is interested in
 */
static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
				      unsigned long flags, unsigned long mask)
{
	unsigned long *bitmap_word;
	unsigned long bitidx;
	unsigned long word;

	/*
	 * The bitmap lookup was missing in the copied text, leaving
	 * bitmap_word/bitidx uninitialized (undefined behaviour); restored
	 * to mirror __get_pfnblock_flags_mask() — confirm against upstream.
	 */
	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);

	/* Position the bits within the word, mirroring the read side. */
	mask <<= bitidx;
	flags <<= bitidx;

	/* Lockless RMW: retry until the updated word is swapped in intact. */
	word = READ_ONCE(*bitmap_word);
	do {
	} while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags));
}
/*
 * NOTE(review): both definitions below are truncated — each ends after the
 * WARN_ON_ONCE guard; the bitmap lookup, the set/clear operation and the
 * closing brace are missing.  Restore from the canonical source; text left
 * byte-identical here.
 */
/** * set_pfnblock_bit - Set a standalone bit of a pageblock * @page: The page within the block of interest * @pfn: The target page frame number * @pb_bit: pageblock bit to set
*/ void set_pfnblock_bit(conststruct page *page, unsignedlong pfn, enum pageblock_bits pb_bit)
{ unsignedlong *bitmap_word; unsignedlong bitidx;
if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) return;
/* NOTE(review): truncated — bitmap lookup, __set_bit() and '}' missing. */
/** * clear_pfnblock_bit - Clear a standalone bit of a pageblock * @page: The page within the block of interest * @pfn: The target page frame number * @pb_bit: pageblock bit to clear
*/ void clear_pfnblock_bit(conststruct page *page, unsignedlong pfn, enum pageblock_bits pb_bit)
{ unsignedlong *bitmap_word; unsignedlong bitidx;
if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit))) return;
/* NOTE(review): truncated — bitmap lookup, __clear_bit() and '}' missing. */
/** * set_pageblock_migratetype - Set the migratetype of a pageblock * @page: The page within the block of interest * @migratetype: migratetype to set
*/ staticvoid set_pageblock_migratetype(struct page *page, enum migratetype migratetype)
{ if (unlikely(page_group_by_mobility_disabled &&
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
#ifdef CONFIG_MEMORY_ISOLATION if (migratetype == MIGRATE_ISOLATE) {
VM_WARN_ONCE(1, "Use set_pageblock_isolate() for pageblock isolation"); return;
}
VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page),
PB_migrate_isolate), "Use clear_pageblock_isolate() to unisolate pageblock"); /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */ #endif
__set_pfnblock_flags_mask(page, page_to_pfn(page),
(unsignedlong)migratetype,
MIGRATETYPE_AND_ISO_MASK);
}
/*
 * NOTE(review): this region is damaged — bad_range() below loses its final
 * 'return false;' and closing brace, and the body that follows (the
 * rate-limited "Bad page state" reporter) belongs to bad_page(), whose
 * signature and local declarations (nr_shown, nr_unshown, resume, reason)
 * are missing.  Restore from the canonical source; text left byte-identical.
 */
/* * Temporary debugging check for pages not lying within a given zone.
*/ staticbool __maybe_unused bad_range(struct zone *zone, struct page *page)
{ if (page_outside_zone_boundaries(zone, page)) returntrue; if (zone != page_zone(page)) returntrue;
/* NOTE(review): bad_page()'s signature should appear here. */
/* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second.
*/ if (nr_shown == 60) { if (time_before(jiffies, resume)) {
nr_unshown++; goto out;
} if (nr_unshown) {
pr_alert( "BUG: Bad page state: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
}
nr_shown = 0;
} if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
dump_page(page, reason);
print_modules();
dump_stack();
out: /* Leave bad fields for debug, except PageBuddy could make trouble */ if (PageBuddy(page))
__ClearPageBuddy(page);
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
/*
 * NOTE(review): order_to_pindex() has lost its entire body in this copy —
 * only the signature and opening brace remain.  It should map (migratetype,
 * order) to a PCP list index (the inverse of pindex_to_order() below);
 * restore from the canonical source.
 */
staticinlineunsignedint order_to_pindex(int migratetype, int order)
{
staticinlineint pindex_to_order(unsignedint pindex)
{ int order = pindex / MIGRATE_PCPTYPES;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pindex >= NR_LOWORDER_PCP_LISTS)
order = HPAGE_PMD_ORDER; #else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); #endif
return order;
}
staticinlinebool pcp_allowed_order(unsignedint order)
{ if (order <= PAGE_ALLOC_COSTLY_ORDER) returntrue; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order == HPAGE_PMD_ORDER) returntrue; #endif returnfalse;
}
/*
 * Higher-order pages are called "compound pages".  They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and have PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of bits is pointer to head page.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */
void prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++)
		prep_compound_tail(page, i);

	/*
	 * NOTE(review): the head setup and closing brace were lost in this
	 * copy; restored from the canonical upstream — confirm against tree.
	 */
	prep_compound_head(page, order);
}
staticinlinebool
compaction_capture(struct capture_control *capc, struct page *page, int order, int migratetype)
{ if (!capc || order != capc->cc->order) returnfalse;
/* Do not accidentally pollute CMA or isolated regions*/ if (is_migrate_cma(migratetype) ||
is_migrate_isolate(migratetype)) returnfalse;
/* * Do not let lower order allocations pollute a movable pageblock * unless compaction is also requesting movable pages. * This might let an unmovable request use a reclaimable pageblock * and vice-versa but no more than normal fallback logic which can * have trouble finding a high-order free page.
*/ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE &&
capc->cc->migratetype != MIGRATE_MOVABLE) returnfalse;
if (migratetype != capc->cc->migratetype)
trace_mm_page_alloc_extfrag(page, capc->cc->order, order,
capc->cc->migratetype, migratetype);
capc->page = page; returntrue;
}
#else staticinlinestruct capture_control *task_capc(struct zone *zone)
{ return NULL;
}
staticinlinebool
compaction_capture(struct capture_control *capc, struct page *page, int order, int migratetype)
{ returnfalse;
} #endif/* CONFIG_COMPACTION */
/*
 * NOTE(review): account_freepages() is truncated in this copy — only the
 * lockdep assertion survives; the vmstat/CMA accounting and closing brace
 * are missing.  Restore from the canonical source.
 */
staticinlinevoid account_freepages(struct zone *zone, int nr_pages, int migratetype)
{
lockdep_assert_held(&zone->lock);
/* Used for pages not on another list */
static inline void __add_to_free_list(struct page *page, struct zone *zone,
				      unsigned int order, int migratetype,
				      bool tail)
{
	struct free_area *area = &zone->free_area[order];
	int nr_pages = 1 << order;

	VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
		     "page type is %d, passed migratetype is %d (nr=%d)\n",
		     get_pageblock_migratetype(page), migratetype, nr_pages);

	if (tail)
		list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
	else
		list_add(&page->buddy_list, &area->free_list[migratetype]);
	area->nr_free++;

	/* Whole-pageblock (or larger) free pages are tracked separately. */
	if (order >= pageblock_order && !is_migrate_isolate(migratetype))
		__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
}
/*
 * NOTE(review): move_to_free_list() is truncated in this copy — the actual
 * list move (list_move_tail to the new_mt freelist) is missing between the
 * warning and the NR_FREE_PAGES_BLOCKS adjustment.  Restore from the
 * canonical source; text left byte-identical.
 */
/* * Used for pages which are on another list. Move the pages to the tail * of the list - so the moved pages won't immediately be considered for * allocation again (e.g., optimization for memory onlining).
*/ staticinlinevoid move_to_free_list(struct page *page, struct zone *zone, unsignedint order, int old_mt, int new_mt)
{ struct free_area *area = &zone->free_area[order]; int nr_pages = 1 << order;
/* Free page moving can fail, so it happens before the type update */
VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, "page type is %d, passed migratetype is %d (nr=%d)\n",
get_pageblock_migratetype(page), old_mt, nr_pages);
/* NOTE(review): the list_move_tail() call should appear here. */
if (order >= pageblock_order &&
is_migrate_isolate(old_mt) != is_migrate_isolate(new_mt)) { if (!is_migrate_isolate(old_mt))
nr_pages = -nr_pages;
__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
}
}
/*
 * NOTE(review): __del_page_from_free_list() is truncated — the list_del,
 * buddy-flag clearing, nr_free decrement and closing brace are missing
 * after the reported-state handling.  Restore from the canonical source.
 */
staticinlinevoid __del_page_from_free_list(struct page *page, struct zone *zone, unsignedint order, int migratetype)
{ int nr_pages = 1 << order;
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, "page type is %d, passed migratetype is %d (nr=%d)\n",
get_pageblock_migratetype(page), migratetype, nr_pages);
/* clear reported state and update reported page count */ if (page_reported(page))
__ClearPageReported(page);
/* * If this is less than the 2nd largest possible page, check if the buddy * of the next-higher order is free. If it is, it's possible * that pages are being freed that will coalesce soon. In case, * that is happening, add the free page to the tail of the list * so it's less likely to be used soon and more likely to be merged * as a 2-level higher order page
*/ staticinlinebool
buddy_merge_likely(unsignedlong pfn, unsignedlong buddy_pfn, struct page *page, unsignedint order)
{ unsignedlong higher_page_pfn; struct page *higher_page;
return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
NULL) != NULL;
}
/*
 * NOTE(review): the code below is the interior of the buddy freeing
 * function (__free_one_page()) — its signature, local declarations (buddy,
 * buddy_pfn, buddy_mt, migratetype, fpi_flags, the merge loop head) and the
 * done_merging tail are missing from this copy.  Restore from the canonical
 * source; text left byte-identical.
 */
/* * Freeing function for a buddy system allocator. * * The concept of a buddy system is to maintain direct-mapped table * (containing bit values) for memory blocks of various "orders". * The bottom level table contains the map for the smallest allocatable * units of memory (here, pages), and each level above it describes * pairs of units from the levels below, hence, "buddies". * At a high level, all that happens here is marking the table entry * at the bottom level available, and propagating the changes upward * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous * free pages of length of (1 << order) and marked with PageBuddy. * Page's order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. * If a block is freed, and its buddy is also free, then this * triggers coalescing into a block of larger size. * * -- nyc
*/
buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn); if (!buddy) goto done_merging;
if (unlikely(order >= pageblock_order)) { /* * We want to prevent merge between freepages on pageblock * without fallbacks and normal pageblock. Without this, * pageblock isolation could cause incorrect freepage or CMA * accounting or HIGHATOMIC accounting.
*/
buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
/* * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order.
*/ if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order); else
__del_page_from_free_list(buddy, zone, order, buddy_mt);
if (unlikely(buddy_mt != migratetype)) { /* * Match buddy type. This ensures that an * expand() down the line puts the sub-blocks * on the right freelists.
*/
set_pageblock_migratetype(buddy, migratetype);
}
/* Notify page reporting subsystem of freed page */ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
}
/*
 * NOTE(review): page_expected_state() is truncated — only the mapcount
 * check survives; the memcg/flags checks, final return and closing brace
 * are missing.  Restore from the canonical source.
 */
/* * A bad page could be due to a number of fields. Instead of multiple branches, * try and check multiple fields with one check. The caller must do a detailed * check if necessary.
*/ staticinlinebool page_expected_state(struct page *page, unsignedlong check_flags)
{ if (unlikely(atomic_read(&page->_mapcount) != -1)) returnfalse;
staticint free_tail_page_prepare(struct page *head_page, struct page *page)
{ struct folio *folio = (struct folio *)head_page; int ret = 1;
/* * We rely page->lru.next never has bit 0 set, unless the page * is PageTail(). Let's make sure that's true even for poisoned ->lru.
*/
BUILD_BUG_ON((unsignedlong)LIST_POISON1 & 1);
if (!is_check_pages_enabled()) {
ret = 0; goto out;
} switch (page - head_page) { case 1: /* the first tail page: these may be in place of ->mapping */ if (unlikely(folio_large_mapcount(folio))) {
bad_page(page, "nonzero large_mapcount"); goto out;
} if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT) &&
unlikely(atomic_read(&folio->_nr_pages_mapped))) {
bad_page(page, "nonzero nr_pages_mapped"); goto out;
} if (IS_ENABLED(CONFIG_MM_ID)) { if (unlikely(folio->_mm_id_mapcount[0] != -1)) {
bad_page(page, "nonzero mm mapcount 0"); goto out;
} if (unlikely(folio->_mm_id_mapcount[1] != -1)) {
bad_page(page, "nonzero mm mapcount 1"); goto out;
}
} if (IS_ENABLED(CONFIG_64BIT)) { if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
bad_page(page, "nonzero entire_mapcount"); goto out;
} if (unlikely(atomic_read(&folio->_pincount))) {
bad_page(page, "nonzero pincount"); goto out;
}
} break; case 2: /* the second tail page: deferred_list overlaps ->mapping */ if (unlikely(!list_empty(&folio->_deferred_list))) {
bad_page(page, "on deferred list"); goto out;
} if (!IS_ENABLED(CONFIG_64BIT)) { if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
bad_page(page, "nonzero entire_mapcount"); goto out;
} if (unlikely(atomic_read(&folio->_pincount))) {
bad_page(page, "nonzero pincount"); goto out;
}
} break; case 3: /* the third tail page: hugetlb specifics overlap ->mappings */ if (IS_ENABLED(CONFIG_HUGETLB_PAGE)) break;
fallthrough; default: if (page->mapping != TAIL_MAPPING) {
bad_page(page, "corrupted mapping in tail page"); goto out;
} break;
} if (unlikely(!PageTail(page))) {
bad_page(page, "PageTail not set"); goto out;
} if (unlikely(compound_head(page) != head_page)) {
bad_page(page, "compound_head not consistent"); goto out;
}
ret = 0;
out:
page->mapping = NULL;
clear_compound_head(page); return ret;
}
/*
 * NOTE(review): should_skip_kasan_poison() is truncated — the tag-based
 * KASAN branch (page_kasan_tag() match-all check), final return and closing
 * brace are missing.  Restore from the canonical source.
 */
/* * Skip KASAN memory poisoning when either: * * 1. For generic KASAN: deferred memory initialization has not yet completed. * Tag-based KASAN modes skip pages freed via deferred memory initialization * using page tags instead (see below). * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating * that error detection is disabled for accesses via the page address. * * Pages will have match-all tags in the following circumstances: * * 1. Pages are being initialized for the first time, including during deferred * memory init; see the call to page_kasan_tag_reset in __init_single_page. * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the * exception of pages unpoisoned by kasan_unpoison_vmalloc. * 3. The allocation was excluded from being checked due to sampling, * see the call to kasan_unpoison_pages. * * Poisoning pages during deferred memory init will greatly lengthen the * process and cause problem in large memory systems as the deferred pages * initialization is done with interrupt disabled. * * Assuming that there will be no reference to those newly initialized * pages before they are ever allocated, this should have no effect on * KASAN memory tracking as the poison will be properly inserted at page * allocation time. The only corner case is when pages are allocated by * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen.
*/ staticinlinebool should_skip_kasan_poison(struct page *page)
{ if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return deferred_pages_enabled();
/* Zero @numpages pages starting at @page, bypassing KASAN tag checking. */
static void kernel_init_pages(struct page *page, int numpages)
{
	int i;

	/* s390's use of memset() could override KASAN redzones. */
	kasan_disable_current();
	for (i = 0; i < numpages; i++)
		clear_highpage_kasan_tagged(page + i);
	kasan_enable_current();
}
/*
 * NOTE(review): the three allocation-profiling helpers below are truncated —
 * each retains only its signature and local declarations; the tag
 * get/put/update logic and closing braces are missing.  Restore from the
 * canonical source; text left byte-identical.
 */
#ifdef CONFIG_MEM_ALLOC_PROFILING
/* Should be called only if mem_alloc_profiling_enabled() */ void __clear_page_tag_ref(struct page *page)
{ union pgtag_ref_handle handle; union codetag_ref ref;
/* Should be called only if mem_alloc_profiling_enabled() */ static noinline void __pgalloc_tag_add(struct page *page, struct task_struct *task, unsignedint nr)
{ union pgtag_ref_handle handle; union codetag_ref ref;
/* Should be called only if mem_alloc_profiling_enabled() */ static noinline void __pgalloc_tag_sub(struct page *page, unsignedint nr)
{ union pgtag_ref_handle handle; union codetag_ref ref;
/* When tag is not NULL, assuming mem_alloc_profiling_enabled */ staticinlinevoid pgalloc_tag_sub_pages(struct alloc_tag *tag, unsignedint nr)
{ if (tag)
this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
}
/*
 * NOTE(review): the code below is the interior of free_pages_prepare() —
 * its signature and the earlier checks (locals: folio, order, fpi_flags,
 * skip_kasan_poison, init) are missing from this copy.  Restore from the
 * canonical source; text left byte-identical.
 */
if (memcg_kmem_online() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
/* * In rare cases, when truncation or holepunching raced with * munlock after VM_LOCKED was cleared, Mlocked may still be * found set here. This does not indicate a problem, unless * "unevictable_pgs_cleared" appears worryingly large.
*/ if (unlikely(folio_test_mlocked(folio))) { long nr_pages = folio_nr_pages(folio);
if (unlikely(PageHWPoison(page)) && !order) { /* Do not let hwpoison pages hit pcplists/buddy */
reset_page_owner(page, order);
page_table_check_free(page, order);
pgalloc_tag_sub(page, 1 << order);
/* * The page is isolated and accounted for. * Mark the codetag as empty to avoid accounting error * when the page is freed by unpoison_memory().
*/
clear_page_tag_ref(page); returnfalse;
}
/* * As memory initialization might be integrated into KASAN, * KASAN poisoning and memory initialization code must be * kept together to avoid discrepancies in behavior. * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page.
*/ if (!skip_kasan_poison) {
kasan_poison_pages(page, order, init);
/* Memory is already initialized if KASAN did it internally. */ if (kasan_has_integrated_init())
init = false;
} if (init)
kernel_init_pages(page, 1 << order);
/* * arch_free_page() can make the page's contents inaccessible. s390 * does this. So nothing which can access the page's contents should * happen after this.
*/
arch_free_page(page, order);
debug_pagealloc_unmap_pages(page, 1 << order);
returntrue;
}
/*
 * NOTE(review): free_pcppages_bulk() is truncated — the zone-lock
 * acquisition and the inner do/while body that actually pops pages and
 * frees them (plus the function's tail) are missing after the last visible
 * line.  Restore from the canonical source; text left byte-identical.
 */
/* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone. * count is the number of pages to free.
*/ staticvoid free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp, int pindex)
{ unsignedlong flags; unsignedint order; struct page *page;
/* * Ensure proper count is passed which otherwise would stuck in the * below while (list_empty(list)) loop.
*/
count = min(pcp->count, count);
while (count > 0) { struct list_head *list; int nr_pages;
/* Remove pages from lists in a round-robin fashion. */ do { if (++pindex > NR_PCP_LISTS - 1)
pindex = 0;
list = &pcp->lists[pindex];
} while (list_empty(list));
order = pindex_to_order(pindex);
nr_pages = 1 << order; do { unsignedlong pfn; int mt;
staticvoid add_page_to_zone_llist(struct zone *zone, struct page *page, unsignedint order)
{ /* Remember the order */
page->order = order; /* Add the page to the free list */
llist_add(&page->pcp_llist, &zone->trylock_free_pages);
}
/*
 * NOTE(review): the code below is the interior of __free_pages_core() — its
 * signature and surrounding logic (locals: p, loop, nr_pages, context;
 * the non-hotplug branch) are missing from this copy.  Restore from the
 * canonical source; text left byte-identical.
 */
/* * When initializing the memmap, __init_single_page() sets the refcount * of all pages to 1 ("allocated"/"not free"). We have to set the * refcount of all involved pages to 0. * * Note that hotplugged memory pages are initialized to PageOffline(). * Pages freed from memblock might be marked as reserved.
*/ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) &&
unlikely(context == MEMINIT_HOTPLUG)) { for (loop = 0; loop < nr_pages; loop++, p++) {
VM_WARN_ON_ONCE(PageReserved(p));
__ClearPageOffline(p);
set_page_count(p, 0);
}
/* * Bypass PCP and place fresh pages right to the tail, primarily * relevant for memory onlining.
*/
__free_pages_ok(page, order, FPI_TO_TAIL);
}
/* * Check that the whole (or subset of) a pageblock given by the interval of * [start_pfn, end_pfn) is valid and within the same zone, before scanning it * with the migration of free compaction scanner. * * Return struct page pointer of start_pfn, or NULL if checks were not passed. * * It's possible on some configurations to have a setup like node0 node1 node0 * i.e. it's possible that all pages within a zones range of pages do not * belong to a single zone. We assume that a border between node0 and node1 * can occur within a single pageblock, but not a node0 node1 node0 * interleaving within a single pageblock. It is therefore sufficient to check * the first and last page of a pageblock and avoid checking each individual * page in a pageblock. * * Note: the function may return non-NULL struct page even for a page block * which contains a memory hole (i.e. there is no physical memory for a subset * of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, which * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole * even though the start pfn is online and valid. This should be safe most of * the time because struct pages are still initialized via init_unavailable_range() * and pfn walkers shouldn't touch any physical memory range for which they do * not recognize any specific metadata in struct pages.
*/ struct page *__pageblock_pfn_to_page(unsignedlong start_pfn, unsignedlong end_pfn, struct zone *zone)
{ struct page *start_page; struct page *end_page;
/* end_pfn is one past the range we are checking */
end_pfn--;
if (!pfn_valid(end_pfn)) return NULL;
start_page = pfn_to_online_page(start_pfn); if (!start_page) return NULL;
if (page_zone(start_page) != zone) return NULL;
end_page = pfn_to_page(end_pfn);
/* This gives a shorter code than deriving page_zone(end_page) */ if (page_zone_id(start_page) != page_zone_id(end_page)) return NULL;
return start_page;
}
/*
 * NOTE(review): expand() is truncated — the splitting loop that halves
 * @size, the __add_to_free_list() calls, the return of nr_added and the
 * closing brace are missing.  Restore from the canonical source.
 */
/* * The order of subdivision here is critical for the IO subsystem. * Please do not alter this order without good reasons and regression * testing. Specifically, as large blocks of memory are subdivided, * the order in which smaller blocks are delivered depends on the order * they're subdivided in this function. This is the primary factor * influencing the order in which pages are delivered to the IO * subsystem according to empirical testing, and this is also justified * by considering the behavior of a buddy system containing a single * large block of memory acted on by a series of small allocations. * This behavior is a critical factor in sglist merging's success. * * -- nyc
*/ staticinlineunsignedint expand(struct zone *zone, struct page *page, int low, int high, int migratetype)
{ unsignedint size = 1 << high; unsignedint nr_added = 0;
/* * Mark as guard pages (or page), that will allow to * merge back to allocator when buddy will be freed. * Corresponding page table entries will not be touched, * pages will stay not present in virtual address space
*/ if (set_page_guard(zone, &page[size], high)) continue;
/* * This page is about to be returned from the page allocator
*/ staticbool check_new_page(struct page *page)
{ if (likely(page_expected_state(page,
PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) returnfalse;
check_new_page_bad(page); returntrue;
}
staticinlinebool check_new_pages(struct page *page, unsignedint order)
{ if (is_check_pages_enabled()) { for (int i = 0; i < (1 << order); i++) { struct page *p = page + i;
if (check_new_page(p)) returntrue;
}
}
returnfalse;
}
staticinlinebool should_skip_kasan_unpoison(gfp_t flags)
{ /* Don't skip if a software KASAN mode is enabled. */ if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
IS_ENABLED(CONFIG_KASAN_SW_TAGS)) returnfalse;
/* Skip, if hardware tag-based KASAN is not enabled. */ if (!kasan_hw_tags_enabled()) returntrue;
/* * With hardware tag-based KASAN enabled, skip if this has been * requested via __GFP_SKIP_KASAN.
*/ return flags & __GFP_SKIP_KASAN;
}
staticinlinebool should_skip_init(gfp_t flags)
{ /* Don't skip, if hardware tag-based KASAN is not enabled. */ if (!kasan_hw_tags_enabled()) returnfalse;
/* For hardware tag-based KASAN, skip if requested. */ return (flags & __GFP_SKIP_ZERO);
}
/*
 * NOTE(review): the code below is the interior of post_alloc_hook() — its
 * signature and earlier setup (locals: init, zero_tags, i, gfp_flags,
 * alloc_flags) are missing from this copy.  Restore from the canonical
 * source; text left byte-identical.
 */
/* * Page unpoisoning must happen before memory initialization. * Otherwise, the poison pattern will be overwritten for __GFP_ZERO * allocations and the page unpoisoning code will complain.
*/
kernel_unpoison_pages(page, 1 << order);
/* * As memory initialization might be integrated into KASAN, * KASAN unpoisoning and memory initializion code must be * kept together to avoid discrepancies in behavior.
*/
/* * If memory tags should be zeroed * (which happens only when memory should be initialized as well).
*/ if (zero_tags) { /* Initialize both memory and memory tags. */ for (i = 0; i != 1 << order; ++i)
tag_clear_highpage(page + i);
/* Take note that memory was initialized by the loop above. */
init = false;
} if (!should_skip_kasan_unpoison(gfp_flags) &&
kasan_unpoison_pages(page, order, init)) { /* Take note that memory was initialized by KASAN. */ if (kasan_has_integrated_init())
init = false;
} else { /* * If memory tags have not been set by KASAN, reset the page * tags to ensure page_address() dereferencing does not fault.
*/ for (i = 0; i != 1 << order; ++i)
page_kasan_tag_reset(page + i);
} /* If memory is still not initialized, initialize it now. */ if (init)
kernel_init_pages(page, 1 << order);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
/* * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking * steps that will free more memory. The caller should avoid the page * being used for !PFMEMALLOC purposes.
*/ if (alloc_flags & ALLOC_NO_WATERMARKS)
set_page_pfmemalloc(page); else
clear_page_pfmemalloc(page);
}
/*
 * NOTE(review): __rmqueue_smallest() is truncated — the deletion of the
 * page from the freelist, the expand() call, the prep/return tail and the
 * closing brace are missing after the visible loop head.  Restore from the
 * canonical source.
 */
/* * Go through the free lists for the given migratetype and remove * the smallest available page from the freelists
*/ static __always_inline struct page *__rmqueue_smallest(struct zone *zone, unsignedint order, int migratetype)
{ unsignedint current_order; struct free_area *area; struct page *page;
/* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype); if (!page) continue;
/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 *
 * The other migratetypes do not have fallbacks.
 */
static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTYPES - 1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE   },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE   },
};
/*
 * NOTE(review): __move_freepages_block() is truncated — end_pfn is never
 * computed, and the code that actually moves each buddy (order read,
 * move_to_free_list(), pfn/pages_moved advance) plus the return and closing
 * brace are missing.  Restore from the canonical source.
 */
/* * Move all free pages of a block to new type's freelist. Caller needs to * change the block type.
*/ staticint __move_freepages_block(struct zone *zone, unsignedlong start_pfn, int old_mt, int new_mt)
{ struct page *page; unsignedlong pfn, end_pfn; unsignedint order; int pages_moved = 0;
for (pfn = start_pfn; pfn < end_pfn;) {
page = pfn_to_page(pfn); if (!PageBuddy(page)) {
pfn++; continue;
}
/* Make sure we are not inadvertently changing nodes */
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
staticbool prep_move_freepages_block(struct zone *zone, struct page *page, unsignedlong *start_pfn, int *num_free, int *num_movable)
{ unsignedlong pfn, start, end;
pfn = page_to_pfn(page);
start = pageblock_start_pfn(pfn);
end = pageblock_end_pfn(pfn);
/* * The caller only has the lock for @zone, don't touch ranges * that straddle into other zones. While we could move part of * the range that's inside the zone, this call is usually * accompanied by other operations such as migratetype updates * which also should be locked.
*/ if (!zone_spans_pfn(zone, start)) returnfalse; if (!zone_spans_pfn(zone, end - 1)) returnfalse;
*start_pfn = start;
if (num_free) {
*num_free = 0;
*num_movable = 0; for (pfn = start; pfn < end;) {
page = pfn_to_page(pfn); if (PageBuddy(page)) { int nr = 1 << buddy_order(page);
*num_free += nr;
pfn += nr; continue;
} /* * We assume that pages that could be isolated for * migration are movable. But we don't actually try * isolating, as that would be expensive.
*/ if (PageLRU(page) || page_has_movable_ops(page))
(*num_movable)++;
pfn++;
}
}
returntrue;
}
/*
 * Move every free page in the pageblock containing @page from the @old_mt
 * freelists to the @new_mt ones and update the block's migratetype.
 * Returns the number of base pages moved, or -1 if the block could not be
 * safely prepared (e.g. it straddles a zone boundary).
 */
static int move_freepages_block(struct zone *zone, struct page *page,
				int old_mt, int new_mt)
{
	unsigned long start_pfn;
	int res;

	if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
		return -1;

	res = __move_freepages_block(zone, start_pfn, old_mt, new_mt);
	set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);

	return res;
}
#ifdef CONFIG_MEMORY_ISOLATION /* Look for a buddy that straddles start_pfn */ staticunsignedlong find_large_buddy(unsignedlong start_pfn)
{ int order = 0; struct page *page; unsignedlong pfn = start_pfn;
while (!PageBuddy(page = pfn_to_page(pfn))) { /* Nothing found */ if (++order > MAX_PAGE_ORDER) return start_pfn;
pfn &= ~0UL << order;
}
/* * Found a preceding buddy, but does it straddle?
*/ if (pfn + (1 << buddy_order(page)) > start_pfn) return pfn;
/** * __move_freepages_block_isolate - move free pages in block for page isolation * @zone: the zone * @page: the pageblock page * @isolate: to isolate the given pageblock or unisolate it * * This is similar to move_freepages_block(), but handles the special * case encountered in page isolation, where the block of interest * might be part of a larger buddy spanning multiple pageblocks. * * Unlike the regular page allocator path, which moves pages while * stealing buddies off the freelist, page isolation is interested in * arbitrary pfn ranges that may have overlapping buddies on both ends. * * This function handles that. Straddling buddies are split into * individual pageblocks. Only the block of interest is moved. * * Returns %true if pages could be moved, %false otherwise.
*/ staticbool __move_freepages_block_isolate(struct zone *zone, struct page *page, bool isolate)
{ unsignedlong start_pfn, pfn; int from_mt; int to_mt;
if (isolate == get_pageblock_isolate(page)) {
VM_WARN_ONCE(1, "%s a pageblock that is already in that state",
isolate ? "Isolate" : "Unisolate"); returnfalse;
}
if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) returnfalse;
/* No splits needed if buddies can't span multiple blocks */ if (pageblock_order == MAX_PAGE_ORDER) goto move;
/* We're a tail block in a larger buddy */
pfn = find_large_buddy(start_pfn); if (pfn != start_pfn) { struct page *buddy = pfn_to_page(pfn); int order = buddy_order(buddy);
/*
 * Set @migratetype on every pageblock covered by a free page of
 * @start_order. A buddy of order > pageblock_order spans several
 * pageblocks, all of which must be retagged consistently.
 */
static void change_pageblock_range(struct page *pageblock_page,
				   int start_order, int migratetype)
{
	int nr_pageblocks = 1 << (start_order - pageblock_order);

	while (nr_pageblocks--) {
		set_pageblock_migratetype(pageblock_page, migratetype);
		pageblock_page += pageblock_nr_pages;
	}
}
staticinlinebool boost_watermark(struct zone *zone)
{ unsignedlong max_boost;
if (!watermark_boost_factor) returnfalse; /* * Don't bother in zones that are unlikely to produce results. * On small machines, including kdump capture kernels running * in a small area, boosting the watermark can cause an out of * memory situation immediately.
*/ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) returnfalse;
/* * high watermark may be uninitialised if fragmentation occurs * very early in boot so do not boost. We do not fall * through and boost by pageblock_nr_pages as failing * allocations that early means that reclaim is not going * to help and it may even be impossible to reclaim the * boosted watermark resulting in a hang.
*/ if (!max_boost) returnfalse;
/* * When we are falling back to another migratetype during allocation, should we * try to claim an entire block to satisfy further allocations, instead of * polluting multiple pageblocks?
*/ staticbool should_try_claim_block(unsignedint order, int start_mt)
{ /* * Leaving this order check is intended, although there is * relaxed order check in next check. The reason is that * we can actually claim the whole pageblock if this condition met, * but, below check doesn't guarantee it and that is just heuristic * so could be changed anytime.
*/ if (order >= pageblock_order) returntrue;
/* * Above a certain threshold, always try to claim, as it's likely there * will be more free pages in the pageblock.
*/ if (order >= pageblock_order / 2) returntrue;
/* * Unmovable/reclaimable allocations would cause permanent * fragmentations if they fell back to allocating from a movable block * (polluting it), so we try to claim the whole block regardless of the * allocation size. Later movable allocations can always steal from this * block, which is less problematic.
*/ if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE) returntrue;
if (page_group_by_mobility_disabled) returntrue;
/* * Movable pages won't cause permanent fragmentation, so when you alloc * small pages, we just need to temporarily steal unmovable or * reclaimable pages that are closest to the request size. After a * while, memory compaction may occur to form large contiguous pages, * and the next movable allocation may not need to steal.
*/ returnfalse;
}
/*
 * Check whether there is a suitable fallback freepage with requested order.
 * If claimable is true, this function returns fallback_mt only if
 * we would do this whole-block claiming. This would help to reduce
 * fragmentation due to mixed migratetype pages in one pageblock.
 *
 * Returns the fallback migratetype on success, -1 if no fallback freelist
 * has free pages, or -2 if @claimable is set but whole-block claiming is
 * not worthwhile for this order/migratetype.
 */
int find_suitable_fallback(struct free_area *area, unsigned int order,
			   int migratetype, bool claimable)
{
	int i;

	if (claimable && !should_try_claim_block(order, migratetype))
		return -2;

	if (area->nr_free == 0)
		return -1;

	/* Walk the fallback order for this migratetype, most preferred first */
	for (i = 0; i < MIGRATE_PCPTYPES - 1; i++) {
		int fallback_mt = fallbacks[migratetype][i];

		if (!free_area_empty(area, fallback_mt))
			return fallback_mt;
	}

	return -1;
}
/* * This function implements actual block claiming behaviour. If order is large * enough, we can claim the whole pageblock for the requested migratetype. If * not, we check the pageblock for constituent pages; if at least half of the * pages are free or compatible, we can still claim the whole block, so pages * freed in the future will be put on the correct free list.
*/ staticstruct page *
try_to_claim_block(struct zone *zone, struct page *page, int current_order, int order, int start_type, int block_type, unsignedint alloc_flags)
{ int free_pages, movable_pages, alike_pages; unsignedlong start_pfn;
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.27 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.