/* Defines the order for the number of pages that have a migrate type. */
#ifndef CONFIG_PAGE_BLOCK_MAX_ORDER
#define PAGE_BLOCK_MAX_ORDER MAX_PAGE_ORDER
#else
#define PAGE_BLOCK_MAX_ORDER CONFIG_PAGE_BLOCK_MAX_ORDER
#endif /* CONFIG_PAGE_BLOCK_MAX_ORDER */
/*
 * MAX_PAGE_ORDER, which defines the max order of pages to be allocated by the
 * buddy allocator, must be greater than or equal to PAGE_BLOCK_MAX_ORDER,
 * which defines the order for the number of pages that can have a migrate type.
 */
#if (PAGE_BLOCK_MAX_ORDER > MAX_PAGE_ORDER)
#error MAX_PAGE_ORDER must be >= PAGE_BLOCK_MAX_ORDER
#endif
/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service. That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3
enum migratetype {
        MIGRATE_UNMOVABLE,
        MIGRATE_MOVABLE,
        MIGRATE_RECLAIMABLE,
        MIGRATE_PCPTYPES,       /* the number of types on the pcp lists */
        MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
        /*
         * MIGRATE_CMA migration type is designed to mimic the way
         * ZONE_MOVABLE works. Only movable pages can be allocated
         * from MIGRATE_CMA pageblocks, and the page allocator never
         * implicitly changes the migration type of a MIGRATE_CMA pageblock.
         *
         * The way to use it is to change the migratetype of a range of
         * pageblocks to MIGRATE_CMA, which can be done with the
         * __free_pageblock_cma() function.
         */
        MIGRATE_CMA,
        __MIGRATE_TYPE_END = MIGRATE_CMA,
#else
        __MIGRATE_TYPE_END = MIGRATE_HIGHATOMIC,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
        MIGRATE_ISOLATE,        /* can't allocate from here */
#endif
        MIGRATE_TYPES
};
/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
extern const char * const migratetype_names[MIGRATE_TYPES];
#ifdef CONFIG_CMA
# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
/*
 * __dump_folio() in mm/debug.c passes a folio pointer to an on-stack struct
 * folio, so folio_pfn() cannot be used and the pfn is needed.
 */
# define is_migrate_cma_folio(folio, pfn) \
        (get_pfnblock_migratetype(&folio->page, pfn) == MIGRATE_CMA)
#else
# define is_migrate_cma(migratetype) false
# define is_migrate_cma_page(_page) false
# define is_migrate_cma_folio(folio, pfn) false
#endif
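/*
 * Illustrative sketch, not part of this header: how a caller might combine
 * the helpers above. The function name and the pr_debug() messages are made
 * up for the example; get_pageblock_migratetype() and migratetype_names[]
 * are the real interfaces referenced in this file.
 */
static inline void example_report_pageblock_type(struct page *page)
{
        int mt = get_pageblock_migratetype(page);

        if (is_migrate_cma(mt))
                pr_debug("CMA pageblock: only movable allocations allowed\n");
        else
                pr_debug("pageblock type: %s\n", migratetype_names[mt]);
}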
/*
 * Check whether a migratetype can be merged with another migratetype.
 *
 * It is only mergeable when it can fall back to other migratetypes for
 * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c.
 */
static inline bool migratetype_is_mergeable(int mt)
{
        return mt < MIGRATE_PCPTYPES;
}
#define for_each_migratetype_order(order, type) \
        for (order = 0; order < NR_PAGE_ORDERS; order++) \
                for (type = 0; type < MIGRATE_TYPES; type++)
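/*
 * Illustrative sketch, not part of this header: for_each_migratetype_order()
 * expands to two nested loops, so a single statement body visits every
 * (order, migratetype) pair. The helper below is hypothetical.
 */
static inline unsigned int example_count_order_type_pairs(void)
{
        unsigned int order, type, n = 0;

        for_each_migratetype_order(order, type)
                n++;

        return n;       /* NR_PAGE_ORDERS * MIGRATE_TYPES */
}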
/*
 * Returns true if the item should be printed in THPs (/proc/vmstat
 * currently prints the number of anon, file and shmem THPs. But the item
 * is charged in pages).
 */
static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
{
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return false;

        return item == NR_ANON_THPS ||
               item == NR_FILE_THPS ||
               item == NR_SHMEM_THPS ||
               item == NR_SHMEM_PMDMAPPED ||
               item == NR_FILE_PMDMAPPED;
}
/*
 * Returns true if the value is measured in bytes (most vmstat values are
 * measured in pages). This defines the API part, the internal representation
 * might be different.
 */
static __always_inline bool vmstat_item_in_bytes(int idx)
{
        /*
         * Global and per-node slab counters track slab pages.
         * It's expected that changes are multiples of PAGE_SIZE.
         * Internally values are stored in pages.
         *
         * Per-memcg and per-lruvec counters track memory, consumed
         * by individual slab objects. These counters are actually
         * byte-precise.
         */
        return (idx == NR_SLAB_RECLAIMABLE_B ||
                idx == NR_SLAB_UNRECLAIMABLE_B);
}
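/*
 * Illustrative sketch, not part of this header: a reader that wants pages
 * regardless of the item's unit could normalise byte-based items. The
 * helper name is hypothetical.
 */
static inline unsigned long example_node_stat_in_pages(enum node_stat_item item,
                                                       unsigned long value)
{
        if (vmstat_item_in_bytes(item))
                return value >> PAGE_SHIFT;
        return value;
}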
/*
 * We do arithmetic on the LRU lists in various places in the code,
 * so it is important to keep the active lists LRU_ACTIVE higher in
 * the array than the corresponding inactive lists, and to keep
 * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
 *
 * This has to be kept in sync with the statistics in zone_stat_item
 * above and the descriptions in vmstat_text in mm/vmstat.c.
 */
#define LRU_BASE 0
#define LRU_ACTIVE 1
#define LRU_FILE 2
enum lruvec_flags {
        /*
         * An lruvec has many dirty pages backed by a congested BDI:
         * 1. LRUVEC_CGROUP_CONGESTED is set by cgroup-level reclaim.
         *    It can be cleared by cgroup reclaim or kswapd.
         * 2. LRUVEC_NODE_CONGESTED is set by kswapd node-level reclaim.
         *    It can only be cleared by kswapd.
         *
         * Essentially, kswapd can unthrottle an lruvec throttled by cgroup
         * reclaim, but not vice versa. This only applies to the root cgroup.
         * The goal is to prevent cgroup reclaim on the root cgroup (e.g.
         * memory.reclaim) from unthrottling an unbalanced node (that was
         * throttled by kswapd).
         */
        LRUVEC_CGROUP_CONGESTED,
        LRUVEC_NODE_CONGESTED,
};
#endif /* !__GENERATING_BOUNDS_H */
/*
 * Evictable folios are divided into multiple generations. The youngest and the
 * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
 * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
 * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
 * corresponding generation. The gen counter in folio->flags stores gen+1 while
 * a folio is on one of lrugen->folios[]. Otherwise it stores 0.
 *
 * After a folio is faulted in, the aging needs to check the accessed bit at
 * least twice before handing this folio over to the eviction. The first check
 * clears the accessed bit from the initial fault; the second check makes sure
 * this folio hasn't been used since then. This process, AKA second chance,
 * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI
 * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two
 * generations are considered active; the rest of generations, if they exist,
 * are considered inactive. See lru_gen_is_active().
 *
 * PG_active is always cleared while a folio is on one of lrugen->folios[] so
 * that the sliding window needs not to worry about it. And it's set again when
 * a folio considered active is isolated for non-reclaiming purposes, e.g.,
 * migration. See lru_gen_add_folio() and lru_gen_del_folio().
 *
 * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
 * number of categories of the active/inactive LRU when keeping track of
 * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
 * in folio->flags, masked by LRU_GEN_MASK.
 */
#define MIN_NR_GENS 2U
#define MAX_NR_GENS 4U
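/*
 * Illustrative sketch, not part of this header: a generation number is mapped
 * to an index into lrugen->folios[] by taking it modulo MAX_NR_GENS. This
 * mirrors the lru_gen_from_seq() helper in <linux/mm_inline.h>; the function
 * below exists only for illustration.
 */
static inline int example_gen_from_seq(unsigned long seq)
{
        return seq % MAX_NR_GENS;
}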
/*
 * Each generation is divided into multiple tiers. A folio accessed N times
 * through file descriptors is in tier order_base_2(N). A folio in the first
 * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page
 * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by
 * PG_workingset. A folio in any other tier (1<N<5) between the first and last
 * is marked by additional bits of LRU_REFS_WIDTH in folio->flags.
 *
 * In contrast to moving across generations which requires the LRU lock, moving
 * across tiers only involves atomic operations on folio->flags and therefore
 * has a negligible cost in the buffered access path. In the eviction path,
 * comparisons of refaulted/(evicted+protected) from the first tier and the rest
 * infer whether folios accessed multiple times through file descriptors are
 * statistically hot and thus worth protecting.
 *
 * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
 * number of categories of the active/inactive LRU when keeping track of
 * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
 * folio->flags, masked by LRU_REFS_MASK.
 */
#define MAX_NR_TIERS 4U
/*
 * For folios accessed multiple times through file descriptors,
 * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags
 * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its
 * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily
 * promoted into the second oldest generation in the eviction path. And when
 * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that
 * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is
 * only valid when PG_referenced is set.
 *
 * For folios accessed multiple times through page tables, folio_update_gen()
 * from a page table walk or lru_gen_set_refs() from a rmap walk sets
 * PG_referenced after the accessed bit is cleared for the first time.
 * Thereafter, those two paths set PG_workingset and promote folios to the
 * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears
 * PG_referenced. Note that for this case, LRU_REFS_MASK is not used.
 *
 * For both cases above, after PG_workingset is set on a folio, it remains until
 * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It
 * can be set again if lru_gen_test_recent() returns true upon a refault.
 */
#define LRU_REFS_FLAGS (LRU_REFS_MASK | BIT(PG_referenced))
/* whether to keep historical stats from evicted generations */
#ifdef CONFIG_LRU_GEN_STATS
#define NR_HIST_GENS MAX_NR_GENS
#else
#define NR_HIST_GENS 1U
#endif
/*
 * The youngest generation number is stored in max_seq for both anon and file
 * types as they are aged on an equal footing. The oldest generation numbers are
 * stored in min_seq[] separately for anon and file types so that they can be
 * incremented independently. Ideally min_seq[] are kept in sync when both anon
 * and file types are evictable. However, to adapt to situations like extreme
 * swappiness, they are allowed to be out of sync by at most
 * MAX_NR_GENS-MIN_NR_GENS-1.
 *
 * The number of pages in each generation is eventually consistent and therefore
 * can be transiently negative when reset_batch_size() is pending.
 */
struct lru_gen_folio {
        /* the aging increments the youngest generation number */
        unsigned long max_seq;
        /* the eviction increments the oldest generation numbers */
        unsigned long min_seq[ANON_AND_FILE];
        /* the birth time of each generation in jiffies */
        unsigned long timestamps[MAX_NR_GENS];
        /* the multi-gen LRU lists, lazily sorted on eviction */
        struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* the multi-gen LRU sizes, eventually consistent */
        long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* the exponential moving average of refaulted */
        unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
        /* the exponential moving average of evicted+protected */
        unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
        /* can only be modified under the LRU lock */
        unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
        /* can be modified without holding the LRU lock */
        atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
        atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
        /* whether the multi-gen LRU is enabled */
        bool enabled;
        /* the memcg generation this lru_gen_folio belongs to */
        u8 gen;
        /* the list segment this lru_gen_folio belongs to */
        u8 seg;
        /* per-node lru_gen_folio list for global reclaim */
        struct hlist_nulls_node list;
};
enum {
        MM_LEAF_TOTAL,          /* total leaf entries */
        MM_LEAF_YOUNG,          /* young leaf entries */
        MM_NONLEAF_FOUND,       /* non-leaf entries found in Bloom filters */
        MM_NONLEAF_ADDED,       /* non-leaf entries added to Bloom filters */
        NR_MM_STATS
};
struct lru_gen_mm_state {
        /* synced with max_seq after each iteration */
        unsigned long seq;
        /* where the current iteration continues after */
        struct list_head *head;
        /* where the last iteration ended before */
        struct list_head *tail;
        /* Bloom filters flip after each iteration */
        unsigned long *filters[NR_BLOOM_FILTERS];
        /* the mm stats for debugging */
        unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
};
struct lru_gen_mm_walk {
        /* the lruvec under reclaim */
        struct lruvec *lruvec;
        /* max_seq from lru_gen_folio: can be out of date */
        unsigned long seq;
        /* the next address within an mm to scan */
        unsigned long next_addr;
        /* to batch promoted pages */
        int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* to batch the mm stats */
        int mm_stats[NR_MM_STATS];
        /* total batched items */
        int batched;
        int swappiness;
        bool force_scan;
};
/*
 * For each node, memcgs are divided into two generations: the old and the
 * young. For each generation, memcgs are randomly sharded into multiple bins
 * to improve scalability. For each bin, the hlist_nulls is virtually divided
 * into three segments: the head, the tail and the default.
 *
 * An onlining memcg is added to the tail of a random bin in the old generation.
 * The eviction starts at the head of a random bin in the old generation. The
 * per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS)
 * indexes the old generation, is incremented when all its bins become empty.
 *
 * There are four operations:
 * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its
 *    current generation (old or young) and updates its "seg" to "head";
 * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its
 *    current generation (old or young) and updates its "seg" to "tail";
 * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old
 *    generation, updates its "gen" to "old" and resets its "seg" to "default";
 * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the
 *    young generation, updates its "gen" to "young" and resets its "seg" to
 *    "default".
 *
 * The events that trigger the above operations are:
 * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
 * 2. The first attempt to reclaim a memcg below low, which triggers
 *    MEMCG_LRU_TAIL;
 * 3. The first attempt to reclaim a memcg offlined or below reclaimable size
 *    threshold, which triggers MEMCG_LRU_TAIL;
 * 4. The second attempt to reclaim a memcg offlined or below reclaimable size
 *    threshold, which triggers MEMCG_LRU_YOUNG;
 * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG;
 * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
 * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
 *
 * Notes:
 * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing
 *    of their max_seq counters ensures the eventual fairness to all eligible
 *    memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
 * 2. There are only two valid generations: old (seq) and young (seq+1).
 *    MEMCG_NR_GENS is set to three so that when reading the generation counter
 *    locklessly, a stale value (seq-1) does not wrap around to young.
 */
#define MEMCG_NR_GENS 3
#define MEMCG_NR_BINS 8
struct lru_gen_memcg {
        /* the per-node memcg generation counter */
        unsigned long seq;
        /* each memcg has one lru_gen_folio per node */
        unsigned long nr_memcgs[MEMCG_NR_GENS];
        /* per-node lru_gen_folio list for global reclaim */
        struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
        /* protects the above */
        spinlock_t lock;
};
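/*
 * Illustrative sketch, not part of this header: with MEMCG_NR_GENS == 3, the
 * old generation is the remainder of the per-node counter and the young
 * generation is the remainder of counter + 1. The helper below is
 * hypothetical; the real computation lives in mm/vmscan.c.
 */
static inline int example_memcg_old_gen(struct lru_gen_memcg *memcg_lru)
{
        return READ_ONCE(memcg_lru->seq) % MEMCG_NR_GENS;
}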
static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
{
}
#endif /* CONFIG_LRU_GEN */
struct lruvec {
        struct list_head lists[NR_LRU_LISTS];
        /* per lruvec lru_lock for memcg */
        spinlock_t lru_lock;
        /*
         * These track the cost of reclaiming one LRU - file or anon -
         * over the other. As the observed cost of reclaiming one LRU
         * increases, the reclaim scan balance tips toward the other.
         */
        unsigned long anon_cost;
        unsigned long file_cost;
        /* Non-resident age, driven by LRU movement */
        atomic_long_t nonresident_age;
        /* Refaults at the time of last reclaim cycle */
        unsigned long refaults[ANON_AND_FILE];
        /* Various lruvec state flags (enum lruvec_flags) */
        unsigned long flags;
#ifdef CONFIG_LRU_GEN
        /* evictable pages divided into generations */
        struct lru_gen_folio lrugen;
#ifdef CONFIG_LRU_GEN_WALKS_MMU
        /* to concurrently iterate lru_gen_mm_list */
        struct lru_gen_mm_state mm_state;
#endif
#endif /* CONFIG_LRU_GEN */
#ifdef CONFIG_MEMCG
        struct pglist_data *pgdat;
#endif
        struct zswap_lruvec_state zswap_lruvec_state;
};
/*
 * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists
 * are added for THP: one PCP list is used by movable allocations
 * (__GFP_MOVABLE), and the other PCP list is used by unmovable and
 * reclaimable allocations.
 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define NR_PCP_THP 2
#else
#define NR_PCP_THP 0
#endif
#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
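/*
 * Illustrative arithmetic, not part of this header: with MIGRATE_PCPTYPES == 3
 * and PAGE_ALLOC_COSTLY_ORDER == 3, NR_LOWORDER_PCP_LISTS == 3 * (3 + 1) == 12,
 * so NR_PCP_LISTS is 14 with CONFIG_TRANSPARENT_HUGEPAGE and 12 without.
 */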
/*
 * Flags used in the pcp->flags field.
 *
 * PCPF_PREV_FREE_HIGH_ORDER: a high-order page was freed in the previous
 * page freeing; used to avoid draining the PCP for a one-off high-order
 * page freeing.
 *
 * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in the PCP before
 * draining it for consecutive high-order page freeing without allocation,
 * if the CPU's data cache slice is large enough; reduces zone lock
 * contention and keeps cache-hot pages reusable.
 */
#define PCPF_PREV_FREE_HIGH_ORDER BIT(0)
#define PCPF_FREE_HIGH_BATCH BIT(1)
struct per_cpu_pages {
        spinlock_t lock;        /* Protects lists field */
        int count;              /* number of pages in the list */
        int high;               /* high watermark, emptying needed */
        int high_min;           /* min high watermark */
        int high_max;           /* max high watermark */
        int batch;              /* chunk size for buddy add/remove */
        u8 flags;               /* protected by pcp->lock */
        u8 alloc_factor;        /* batch scaling factor during allocate */
#ifdef CONFIG_NUMA
        u8 expire;              /* When 0, remote pagesets are drained */
#endif
        short free_count;       /* consecutive free count */

        /* Lists of pages, one per migrate type stored on the pcp-lists */
        struct list_head lists[NR_PCP_LISTS];
} ____cacheline_aligned_in_smp;
struct per_cpu_zonestat {
#ifdef CONFIG_SMP
        s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
        s8 stat_threshold;
#endif
#ifdef CONFIG_NUMA
        /*
         * Low priority inaccurate counters that are only folded
         * on demand. Use a large type to avoid the overhead of
         * folding during refresh_cpu_vm_stats.
         */
        unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
#endif
};
enum zone_type {
        /*
         * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
         * to DMA to all of the addressable memory (ZONE_NORMAL).
         * On architectures where this area covers the whole 32 bit address
         * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
         * DMA addressing constraints. This distinction is important as a 32bit
         * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
         * platforms may need both zones as they support peripherals with
         * different DMA addressing limitations.
         */
#ifdef CONFIG_ZONE_DMA
        ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
        ZONE_DMA32,
#endif
        /*
         * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
         * performed on pages in ZONE_NORMAL if the DMA devices support
         * transfers to all addressable memory.
         */
        ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
        /*
         * A memory area that is only addressable by the kernel through
         * mapping portions into its own address space. This is for example
         * used by i386 to allow the kernel to address the memory beyond
         * 900MB. The kernel will set up special mappings (page
         * table entries on i386) for each page that the kernel needs to
         * access.
         */
        ZONE_HIGHMEM,
#endif
        /*
         * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
         * movable pages with few exceptional cases described below. Main use
         * cases for ZONE_MOVABLE are to make memory offlining/unplug more
         * likely to succeed, and to locally limit unmovable allocations - e.g.,
         * to increase the number of THP/huge pages. Notable special cases are:
         *
         * 1. Pinned pages: (long-term) pinning of movable pages might
         *    essentially turn such pages unmovable. Therefore, we do not allow
         *    pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
         *    faulted, they come from the right zone right away. However, it is
         *    still possible that the address space already has pages in
         *    ZONE_MOVABLE at the time when pages are pinned (i.e. the user has
         *    touched that memory before pinning). In such case we migrate them
         *    to a different zone. When migration fails - pinning fails.
         * 2. memblock allocations: kernelcore/movablecore setups might create
         *    situations where ZONE_MOVABLE contains unmovable allocations
         *    after boot. Memory offlining and allocations fail early.
         * 3. Memory holes: kernelcore/movablecore setups might create very rare
         *    situations where ZONE_MOVABLE contains memory holes after boot,
         *    for example, if we have sections that are only partially
         *    populated. Memory offlining and allocations fail early.
         * 4. PG_hwpoison pages: while poisoned pages can be skipped during
         *    memory offlining, such pages cannot be allocated.
         * 5. Unmovable PG_offline pages: in paravirtualized environments,
         *    hotplugged memory blocks might only partially be managed by the
         *    buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
         *    parts not managed by the buddy are unmovable PG_offline pages. In
         *    some cases (virtio-mem), such pages can be skipped during
         *    memory offlining, however, they cannot be moved/allocated. These
         *    techniques might use alloc_contig_range() to hide previously
         *    exposed pages from the buddy again (e.g., to implement some sort
         *    of memory unplug in virtio-mem).
         * 6. ZERO_PAGE(0): kernelcore/movablecore setups might create
         *    situations where ZERO_PAGE(0), which is allocated differently
         *    on different platforms, may end up in a movable zone. ZERO_PAGE(0)
         *    cannot be migrated.
         * 7. Memory-hotplug: when using memmap_on_memory and onlining the
         *    memory to the MOVABLE zone, the vmemmap pages are also placed in
         *    such zone. Such pages cannot really be moved around as they are
         *    self-stored in the range, but they are treated as movable when
         *    the range they describe is about to be offlined.
         *
         * In general, no unmovable allocations that degrade memory offlining
         * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
         * have to expect that migrating pages in ZONE_MOVABLE can fail (even
         * if has_unmovable_pages() states that there are no unmovable pages,
         * there can be false negatives).
         */
        ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
        ZONE_DEVICE,
#endif
        __MAX_NR_ZONES
};
#ifndef __GENERATING_BOUNDS_H
#define ASYNC_AND_SYNC 2
struct zone {
        /* Read-mostly fields */

        /* zone watermarks, access with *_wmark_pages(zone) macros */
        unsigned long _watermark[NR_WMARK];
        unsigned long watermark_boost;

        /*
         * We don't know if the memory that we're going to allocate will be
         * freeable or/and it will be released eventually, so to avoid totally
         * wasting several GB of ram we must reserve some of the lower zone
         * memory (otherwise we risk to run OOM on the lower zones despite
         * there being tons of freeable ram on the higher zones). This array is
         * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
         * changes.
         */
        long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
        int node;
#endif
        struct pglist_data *zone_pgdat;
        struct per_cpu_pages __percpu *per_cpu_pageset;
        struct per_cpu_zonestat __percpu *per_cpu_zonestats;
        /*
         * the high and batch values are copied to individual pagesets for
         * faster access
         */
        int pageset_high_min;
        int pageset_high_max;
        int pageset_batch;

#ifndef CONFIG_SPARSEMEM
        /*
         * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
         * In SPARSEMEM, this map is stored in struct mem_section.
         */
        unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
        /*
         * spanned_pages is the total pages spanned by the zone, including
         * holes, which is calculated as:
         *      spanned_pages = zone_end_pfn - zone_start_pfn;
         *
         * present_pages is physical pages existing within the zone, which
         * is calculated as:
         *      present_pages = spanned_pages - absent_pages(pages in holes);
         *
         * present_early_pages is present pages existing within the zone
         * located on memory available since early boot, excluding hotplugged
         * memory.
         *
         * managed_pages is present pages managed by the buddy system, which
         * is calculated as (reserved_pages includes pages allocated by the
         * bootmem allocator):
         *      managed_pages = present_pages - reserved_pages;
         *
         * cma pages is present pages that are assigned for CMA use
         * (MIGRATE_CMA).
         *
         * So present_pages may be used by memory hotplug or memory power
         * management logic to figure out unmanaged pages by checking
         * (present_pages - managed_pages). And managed_pages should be used
         * by the page allocator and vm scanner to calculate all kinds of
         * watermarks and thresholds.
         *
         * Locking rules:
         *
         * zone_start_pfn and spanned_pages are protected by span_seqlock.
         * It is a seqlock because it has to be read outside of zone->lock,
         * and it is done in the main allocator path. But, it is written
         * quite infrequently.
         *
         * The span_seq lock is declared along with zone->lock because it is
         * frequently read in proximity to zone->lock. It's good to
         * give them a chance of being in the same cacheline.
         *
         * Write access to present_pages at runtime should be protected by
         * mem_hotplug_begin/done(). Any reader who can't tolerate drift of
         * present_pages should use get_online_mems() to get a stable value.
         */
        atomic_long_t managed_pages;
        unsigned long spanned_pages;
        unsigned long present_pages;
#if defined(CONFIG_MEMORY_HOTPLUG)
        unsigned long present_early_pages;
#endif
#ifdef CONFIG_CMA
        unsigned long cma_pages;
#endif
        const char *name;
#ifdef CONFIG_MEMORY_ISOLATION
        /*
         * Number of isolated pageblocks. It is used to solve an incorrect
         * freepage counting problem due to racy retrieving of the migratetype
         * of a pageblock. Protected by zone->lock.
         */
        unsigned long nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
        /* see spanned/present_pages for more description */
        seqlock_t span_seqlock;
#endif
        int initialized;

        /* Write-intensive fields used from the page allocator */
        CACHELINE_PADDING(_pad1_);

        /* free areas of different sizes */
        struct free_area free_area[NR_PAGE_ORDERS];

#ifdef CONFIG_UNACCEPTED_MEMORY
        /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */
        struct list_head unaccepted_pages;

        /* To be called once the last page in the zone is accepted */
        struct work_struct unaccepted_cleanup;
#endif

        /* Pages to be freed when next trylock succeeds */
        struct llist_head trylock_free_pages;
        /* Write-intensive fields used by compaction and vmstats. */
        CACHELINE_PADDING(_pad2_);

        /*
         * When free pages are below this point, additional steps are taken
         * when reading the number of free pages to avoid per-cpu counter
         * drift allowing watermarks to be breached
         */
        unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* pfn where compaction free scanner should start */
        unsigned long compact_cached_free_pfn;
        /* pfn where compaction migration scanner should start */
        unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC];
        unsigned long compact_init_migrate_pfn;
        unsigned long compact_init_free_pfn;
#endif
#ifdef CONFIG_COMPACTION
        /*
         * On compaction failure, 1<<compact_defer_shift compactions
         * are skipped before trying again. The number attempted since
         * last failure is tracked with compact_considered.
         * compact_order_failed is the minimum compaction failed order.
         */
        unsigned int compact_considered;
        unsigned int compact_defer_shift;
        int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* Set to true when the PG_migrate_skip bits should be cleared */
        bool compact_blockskip_flush;
#endif

        bool contiguous;

        CACHELINE_PADDING(_pad3_);
        /* Zone statistics */
        atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
        atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
} ____cacheline_aligned_in_smp;
enum pgdat_flags {
        PGDAT_DIRTY,            /* reclaim scanning has recently found
                                 * many dirty file pages at the tail
                                 * of the LRU.
                                 */
        PGDAT_WRITEBACK,        /* reclaim scanning has recently found
                                 * many pages under writeback
                                 */
        PGDAT_RECLAIM_LOCKED,   /* prevents concurrent reclaim */
};
enum zone_flags {
        ZONE_BOOSTED_WATERMARK,         /* zone recently boosted watermarks.
                                         * Cleared when kswapd is woken.
                                         */
        ZONE_RECLAIM_ACTIVE,            /* kswapd may be scanning the zone. */
        ZONE_BELOW_HIGH,                /* zone is below high watermark. */
};
static inline bool zone_is_initialized(struct zone *zone)
{
        return zone->initialized;
}
static inline bool zone_is_empty(struct zone *zone)
{
        return zone->spanned_pages == 0;
}
#ifndef BUILD_VDSO32_64
/*
 * The zone field is never updated after free_area_init_core()
 * sets it, so none of the operations on it need to be atomic.
 */
/*
 * Consecutive zone device pages should not be merged into the same sgl
 * or bvec segment with other types of pages or if they belong to different
 * pgmaps. Otherwise getting the pgmap of a given segment is not possible
 * without scanning the entire segment. This helper returns true either if
 * both pages are not zone device pages or both pages are zone device pages
 * with the same pgmap.
 */
static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
                                                     const struct page *b)
{
        if (is_zone_device_page(a) != is_zone_device_page(b))
                return false;
        if (!is_zone_device_page(a))
                return true;
        return page_pgmap(a) == page_pgmap(b);
}
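/*
 * Illustrative sketch, not part of this header: a segment-merging check might
 * combine physical contiguity with the pgmap test above. The function name
 * and the policy are hypothetical.
 */
static inline bool example_pages_mergeable(const struct page *a, const struct page *b)
{
        return page_to_pfn(a) + 1 == page_to_pfn(b) &&
               zone_device_pages_have_same_pgmap(a, b);
}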
/*
 * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
 * intersection with the given zone
 */
static inline bool zone_intersects(struct zone *zone,
                                   unsigned long start_pfn, unsigned long nr_pages)
{
        if (zone_is_empty(zone))
                return false;
        if (start_pfn >= zone_end_pfn(zone) ||
            start_pfn + nr_pages <= zone->zone_start_pfn)
                return false;

        return true;
}
/* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round.
*/ #define DEF_PRIORITY 12
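/*
 * Illustrative arithmetic, not part of this header: at DEF_PRIORITY == 12 a
 * queue of 4,194,304 pages yields a scan target of 4194304 >> 12 == 1024
 * pages per pass; each step down in priority doubles that target.
 */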
/* Maximum number of zones on a zonelist */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
enum {
        ZONELIST_FALLBACK,      /* zonelist with fallback */
#ifdef CONFIG_NUMA
        /*
         * The NUMA zonelists are doubled because we need zonelists that
         * restrict the allocations to a single node for __GFP_THISNODE.
         */
        ZONELIST_NOFALLBACK,    /* zonelist without fallback (__GFP_THISNODE) */
#endif
        MAX_ZONELISTS
};
/*
 * This struct contains information about a zone in a zonelist. It is stored
 * here to avoid dereferences into large structures and lookups of tables
 */
struct zoneref {
        struct zone *zone;      /* Pointer to actual zone */
        int zone_idx;           /* zone_idx(zoneref->zone) */
};
/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()      - Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()  - Return the index of the zone for an entry
 * zonelist_node_idx()  - Return the index of the node for an entry
 */
struct zonelist {
        struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};
/*
 * The array of struct pages for flatmem.
 * It must be declared for SPARSEMEM as well because there are configurations
 * that rely on that.
 */
extern struct page *mem_map;
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Per NUMA node memory failure handling statistics.
 */
struct memory_failure_stats {
        /*
         * Number of raw pages poisoned.
         * Cases not accounted: memory outside kernel control, offline page,
         * arch-specific memory_failure (SGX), hwpoison_filter() filtered
         * error events, and unpoison actions from hwpoison_unpoison.
         */
        unsigned long total;
        /*
         * Recovery results of poisoned raw pages handled by memory_failure,
         * in sync with mf_result.
         * total = ignored + failed + delayed + recovered.
         * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted.
         */
        unsigned long ignored;
        unsigned long failed;
        unsigned long delayed;
        unsigned long recovered;
};
#endif
/*
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * its memory layout. On UMA machines there is a single pglist_data which
 * describes the whole memory.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
typedef struct pglist_data {
        /*
         * node_zones contains just the zones for THIS node. Not all of the
         * zones may be populated, but it is the full list. It is referenced by
         * this node's node_zonelists as well as other node's node_zonelists.
         */
        struct zone node_zones[MAX_NR_ZONES];

        /*
         * node_zonelists contains references to all zones in all nodes.
         * Generally the first zones will be references to this node's
         * node_zones.
         */
        struct zonelist node_zonelists[MAX_ZONELISTS];

        int nr_zones;           /* number of populated zones in this node */
#ifdef CONFIG_FLATMEM           /* means !SPARSEMEM */
        struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
        struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
        /*
         * Must be held any time you expect node_start_pfn,
         * node_present_pages, node_spanned_pages or nr_zones to stay constant.
         * Also synchronizes pgdat->first_deferred_pfn during deferred page
         * init.
         *
         * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
         * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
         * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
         *
         * Nests above zone->lock and zone->span_seqlock
         */
        spinlock_t node_size_lock;
#endif
        unsigned long node_start_pfn;
        unsigned long node_present_pages;       /* total number of physical pages */
        unsigned long node_spanned_pages;       /* total size of physical page
                                                   range, including holes */
        int node_id;
        wait_queue_head_t kswapd_wait;
        wait_queue_head_t pfmemalloc_wait;

        /* workqueues for throttling reclaim for different reasons. */
        wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE];

        atomic_t nr_writeback_throttled;        /* nr of writeback-throttled tasks */
        unsigned long nr_reclaim_start;         /* nr pages written while throttled
                                                 * when throttling started. */
#ifdef CONFIG_MEMORY_HOTPLUG
        struct mutex kswapd_lock;
#endif
        struct task_struct *kswapd;     /* Protected by kswapd_lock */
        int kswapd_order;
        enum zone_type kswapd_highest_zoneidx;

        int kswapd_failures;            /* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION
        int kcompactd_max_order;
        enum zone_type kcompactd_highest_zoneidx;
        wait_queue_head_t kcompactd_wait;
        struct task_struct *kcompactd;
        bool proactive_compact_trigger;
#endif
        /*
         * This is a per-node reserve of pages that are not available
         * to userspace allocations.
         */
        unsigned long totalreserve_pages;

#ifdef CONFIG_NUMA
        /*
         * node reclaim becomes active if more unmapped pages exist.
         */
        unsigned long min_unmapped_pages;
        unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */
        /* Write-intensive fields used by page reclaim */
        CACHELINE_PADDING(_pad1_);

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
        /*
         * If memory initialisation on large machines is deferred then this
         * is the first PFN that needs to be initialised.
         */
        unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_NUMA_BALANCING
        /* start time in ms of current promote rate limit period */
        unsigned int nbp_rl_start;
        /* number of promote candidate pages at start time of current rate limit period */
        unsigned long nbp_rl_nr_cand;
        /* promote threshold in ms */
        unsigned int nbp_threshold;
        /* start time in ms of current promote threshold adjustment period */
        unsigned int nbp_th_start;
        /*
         * number of promote candidate pages at start time of current promote
         * threshold adjustment period
         */
        unsigned long nbp_th_nr_cand;
#endif
        /* Fields commonly accessed by the page reclaim scanner */

        /*
         * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
         *
         * Use mem_cgroup_lruvec() to look up lruvecs.
         */
        struct lruvec __lruvec;

        unsigned long flags;

#ifdef CONFIG_LRU_GEN
        /* kswap mm walk data */
        struct lru_gen_mm_walk mm_walk;
        /* lru_gen_folio list */
        struct lru_gen_memcg memcg_lru;
#endif

        CACHELINE_PADDING(_pad2_);

        /* Per-node vmstats */
        struct per_cpu_nodestat __percpu *per_cpu_nodestats;
        atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
/*
 * Returns true if a zone has pages managed by the buddy allocator.
 * All the reclaim decisions have to use this function rather than
 * populated_zone(). If the whole zone is reserved then we can easily
 * end up with populated_zone() && !managed_zone().
 */
static inline bool managed_zone(struct zone *zone)
{
        return zone_managed_pages(zone);
}
/* Returns true if a zone has memory */
static inline bool populated_zone(struct zone *zone)
{
        return zone->present_pages;
}
#ifdef CONFIG_NUMA
static inline int zone_to_nid(struct zone *zone)
{
        return zone->node;
}

static inline void zone_set_nid(struct zone *zone, int nid)
{
        zone->node = nid;
}
#else
static inline int zone_to_nid(struct zone *zone)
{
        return 0;
}

static inline void zone_set_nid(struct zone *zone, int nid) {}
#endif
/**
 * is_highmem - helper function to quickly check if a struct zone is a
 *              highmem zone or not. This is an attempt to keep references
 *              to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
 * @zone: pointer to struct zone variable
 * Return: 1 for a highmem zone, 0 otherwise
 */
static inline int is_highmem(struct zone *zone)
{
        return is_highmem_idx(zone_idx(zone));
}
extern struct pglist_data *first_online_pgdat(void);
extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
extern struct zone *next_zone(struct zone *zone);
/**
 * for_each_online_pgdat - helper macro to iterate over all online nodes
 * @pgdat: pointer to a pg_data_t variable
 */
#define for_each_online_pgdat(pgdat)                    \
        for (pgdat = first_online_pgdat();              \
             pgdat;                                     \
             pgdat = next_online_pgdat(pgdat))

/**
 * for_each_zone - helper macro to iterate over all memory zones
 * @zone: pointer to struct zone variable
 *
 * The user only needs to declare the zone variable, for_each_zone
 * fills it in.
 */
#define for_each_zone(zone)                             \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                      \
             zone = next_zone(zone))

#define for_each_populated_zone(zone)                   \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                      \
             zone = next_zone(zone))                    \
                if (!populated_zone(zone))              \
                        ; /* do nothing */              \
                else
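/*
 * Illustrative sketch, not part of this header: summing managed pages over
 * all populated zones. The helper name is hypothetical; zone_managed_pages()
 * is declared elsewhere in this header.
 */
static inline unsigned long example_total_managed_pages(void)
{
        struct zone *zone;
        unsigned long pages = 0;

        for_each_populated_zone(zone)
                pages += zone_managed_pages(zone);

        return pages;
}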
static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
        return zoneref->zone;
}
/**
 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
 * @z: The cursor used as a starting point for the search
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * This function returns the next zone at or below a given zone index that is
 * within the allowed nodemask using a cursor as the starting point for the
 * search. The zoneref returned is a cursor that represents the current zone
 * being examined. It should be advanced by one before calling
 * next_zones_zonelist again.
 *
 * Return: the next zone at or below highest_zoneidx within the allowed
 * nodemask using a cursor within a zonelist as a starting point
 */
static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
                return z;
        return __next_zones_zonelist(z, highest_zoneidx, nodes);
}
/**
 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
 * @zonelist: The zonelist to search for a suitable zone
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * This function returns the first zone at or below a given zone index that is
 * within the allowed nodemask. The zoneref returned is a cursor that can be
 * used to iterate the zonelist with next_zones_zonelist by advancing it by
 * one before calling.
 *
 * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
 * never NULL). This may happen either genuinely, or due to concurrent nodemask
 * update due to cpuset modification.
 *
 * Return: Zoneref pointer for the first suitable zone found
 */
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        return next_zones_zonelist(zonelist->_zonerefs,
                                   highest_zoneidx, nodes);
}
/**
 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 * @nodemask: Nodemask allowed by the allocator
 *
 * This iterator iterates through all zones at or below a given zone index and
 * within a given nodemask
 */
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
        for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \
             zone;                                                      \
             z = next_zones_zonelist(++z, highidx, nodemask),           \
                zone = zonelist_zone(z))
#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
        for (zone = zonelist_zone(z);                           \
             zone;                                              \
             z = next_zones_zonelist(++z, highidx, nodemask),   \
                zone = zonelist_zone(z))
/**
 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->zones being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 *
 * This iterator iterates through all zones at or below a given zone index.
 */
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
        for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
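/*
 * Illustrative sketch, not part of this header: walking the zones an
 * allocation on node @nid could use, highest eligible zone first. The
 * function name is hypothetical; node_zonelist() comes from <linux/gfp.h>.
 */
static inline struct zone *example_first_usable_zone(int nid, enum zone_type highidx)
{
        struct zonelist *zonelist = node_zonelist(nid, GFP_KERNEL);
        struct zoneref *z;
        struct zone *zone;

        for_each_zone_zonelist(zone, z, zonelist, highidx)
                if (managed_zone(zone))
                        return zone;

        return NULL;
}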
/* Whether the 'nodes' are all movable nodes */
static inline bool movable_only_nodes(nodemask_t *nodes)
{
        struct zonelist *zonelist;
        struct zoneref *z;
        int nid;

        if (nodes_empty(*nodes))
                return false;

        /*
         * We can choose an arbitrary node from the nodemask to get a
         * zonelist as they are interlinked. We just need to find
         * at least one zone that can satisfy kernel allocations.
         */
        nid = first_node(*nodes);
        zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
        z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes);
        return (!zonelist_zone(z)) ? true : false;
}
struct mem_section_usage {
        struct rcu_head rcu;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
        DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
#endif
        /* See declaration of similar field in struct zone */
        unsigned long pageblock_flags[0];
};
struct page;
struct page_ext;
struct mem_section {
        /*
         * This is, logically, a pointer to an array of struct
         * pages. However, it is stored with some other magic.
         * (see sparse.c::sparse_init_one_section())
         *
         * Additionally during early boot we encode node id of
         * the location of the section here to guide allocation.
         * (see sparse.c::memory_present())
         *
         * Making it a UL at least makes someone do a cast
         * before using it wrong.
         */
        unsigned long section_mem_map;

        struct mem_section_usage *usage;
#ifdef CONFIG_PAGE_EXTENSION
        /*
         * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
         * section. (see page_ext.h about this.)
         */
        struct page_ext *page_ext;
        unsigned long pad;
#endif
        /*
         * WARNING: mem_section must be a power-of-2 in size for the
         * calculation and use of SECTION_ROOT_MASK to make sense.
         */
};
/*
 * We use the lower bits of the mem_map pointer to store
 * a little bit of information. The pointer is calculated
 * as mem_map - section_nr_to_pfn(pnum). The result is
 * aligned to the minimum alignment of the two values:
 *   1. All mem_map arrays are page-aligned.
 *   2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
 *      lowest bits. PFN_SECTION_SHIFT is arch-specific
 *      (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
 *      worst combination is powerpc with 256k pages,
 *      which results in PFN_SECTION_SHIFT equal 6.
 * To sum it up, at least 6 bits are available on all architectures.
 * However, we can exceed 6 bits on some other architectures except
 * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
 * with the worst case of 64K pages on arm64) if we make sure the
 * exceeded bit is not applicable to powerpc.
 */
enum {
        SECTION_MARKED_PRESENT_BIT,
        SECTION_HAS_MEM_MAP_BIT,
        SECTION_IS_ONLINE_BIT,
        SECTION_IS_EARLY_BIT,
#ifdef CONFIG_ZONE_DEVICE
        SECTION_TAINT_ZONE_DEVICE_BIT,
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
        SECTION_IS_VMEMMAP_PREINIT_BIT,
#endif
        SECTION_MAP_LAST_BIT,
};
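/*
 * Illustrative sketch, not part of this excerpt: the header goes on to derive
 * flag masks from these bit numbers (e.g. BIT(SECTION_MARKED_PRESENT_BIT)) and
 * tests them against the low bits of mem_section::section_mem_map. The helper
 * below only demonstrates the pattern; the real accessor is present_section().
 */
static inline int example_section_is_present(const struct mem_section *section)
{
        return section && (section->section_mem_map & BIT(SECTION_MARKED_PRESENT_BIT));
}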