/* * high_memory defines the upper bound on direct map memory, then end * of ZONE_NORMAL.
*/ void *high_memory;
EXPORT_SYMBOL(high_memory);
#ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel;
/* The zonelists are simply reported, validation is manual. */ void __init mminit_verify_zonelist(void)
{ int nid;
if (mminit_loglevel < MMINIT_VERIFY) return;
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid); struct zone *zone; struct zoneref *z; struct zonelist *zonelist; int i, listid, zoneid;
for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
/* Identify the zone and nodelist */
zoneid = i % MAX_NR_ZONES;
listid = i / MAX_NR_ZONES;
zonelist = &pgdat->node_zonelists[listid];
zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) continue;
/* Print information about the zonelist */
printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
listid > 0 ? "thisnode" : "general", nid,
zone->name);
/* * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of * (total memory/#cpus), and lift it to 25% for other policies * to easy the possible lock contention for percpu_counter * vm_committed_as, while the max limit is INT_MAX
*/ if (overcommit_policy == OVERCOMMIT_NEVER)
memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX); else
memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);
/* Value may be a percentage of total memory, otherwise bytes */
coremem = simple_strtoull(p, &endptr, 0); if (*endptr == '%') { /* Paranoid check for percent values greater than 100 */
WARN_ON(coremem > 100);
*percent = coremem;
} else {
coremem = memparse(p, &p); /* Paranoid check that UL is enough for the coremem value */
WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
/* * kernelcore=size sets the amount of memory for use for allocations that * cannot be reclaimed or migrated.
*/ staticint __init cmdline_parse_kernelcore(char *p)
{ /* parse kernelcore=mirror */ if (parse_option_str(p, "mirror")) {
mirrored_kernelcore = true; return 0;
}
/*
 * movablecore=size sets the amount of memory for use for allocations that
 * can be reclaimed or migrated.
 */
static int __init cmdline_parse_movablecore(char *p)
{
	return cmdline_parse_core(p, &required_movablecore,
				  &required_movablecore_percent);
}
early_param("movablecore", cmdline_parse_movablecore);
/* * early_calculate_totalpages() * Sum pages in active regions for movable zone. * Populate N_MEMORY for calculating usable_nodes.
*/ staticunsignedlong __init early_calculate_totalpages(void)
{ unsignedlong totalpages = 0; unsignedlong start_pfn, end_pfn; int i, nid;
/*
 * This finds a zone that can be used for ZONE_MOVABLE pages. The
 * assumption is made that zones within a node are ordered in monotonic
 * increasing memory addresses so that the "highest" populated zone is used
 */
static void __init find_usable_zone_for_movable(void)
{
	int zone_index;

	/* Scan from the highest zone down, skipping ZONE_MOVABLE itself. */
	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
		if (zone_index == ZONE_MOVABLE)
			continue;
		/* A zone with a non-empty possible PFN range is usable. */
		if (arch_zone_highest_possible_pfn[zone_index] >
				arch_zone_lowest_possible_pfn[zone_index])
			break;
	}

	/*
	 * NOTE(review): tail of this function was truncated in the source;
	 * restored below — the computed index must be published in
	 * movable_zone or the loop above has no effect.
	 */
	VM_BUG_ON(zone_index == -1);
	movable_zone = zone_index;
}
/* * Find the PFN the Movable zone begins in each node. Kernel memory * is spread evenly between nodes as long as the nodes have enough * memory. When they don't, some nodes will have more kernelcore than * others
*/ staticvoid __init find_zone_movable_pfns_for_nodes(void)
{ int i, nid; unsignedlong usable_startpfn; unsignedlong kernelcore_node, kernelcore_remaining; /* save the state before borrow the nodemask */
nodemask_t saved_node_state = node_states[N_MEMORY]; unsignedlong totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_MEMORY]); struct memblock_region *r;
/* Need to find movable_zone earlier when movable_node is specified. */
find_usable_zone_for_movable();
/* * If movable_node is specified, ignore kernelcore and movablecore * options.
*/ if (movable_node_is_enabled()) {
for_each_mem_region(r) { if (!memblock_is_hotpluggable(r)) continue;
if (mem_below_4gb_not_mirrored)
pr_warn("This configuration results in unmirrored kernel memory.\n");
goto out2;
}
/* * If kernelcore=nn% or movablecore=nn% was specified, calculate the * amount of necessary memory.
*/ if (required_kernelcore_percent)
required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
10000UL; if (required_movablecore_percent)
required_movablecore = (totalpages * 100 * required_movablecore_percent) /
10000UL;
/* * If movablecore= was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore * and movablecore are specified, then the value of kernelcore * will be used for required_kernelcore if it's greater than * what movablecore would have allowed.
*/ if (required_movablecore) { unsignedlong corepages;
/* * Round-up so that ZONE_MOVABLE is at least as large as what * was requested by the user
*/
required_movablecore =
round_up(required_movablecore, MAX_ORDER_NR_PAGES);
required_movablecore = min(totalpages, required_movablecore);
corepages = totalpages - required_movablecore;
/* * If kernelcore was not specified or kernelcore size is larger * than totalpages, there is no ZONE_MOVABLE.
*/ if (!required_kernelcore || required_kernelcore >= totalpages) goto out;
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
restart: /* Spread kernelcore memory as evenly as possible throughout nodes */
kernelcore_node = required_kernelcore / usable_nodes;
for_each_node_state(nid, N_MEMORY) { unsignedlong start_pfn, end_pfn;
/* * Recalculate kernelcore_node if the division per node * now exceeds what is necessary to satisfy the requested * amount of memory for the kernel
*/ if (required_kernelcore < kernelcore_node)
kernelcore_node = required_kernelcore / usable_nodes;
/* * As the map is walked, we track how much memory is usable * by the kernel using kernelcore_remaining. When it is * 0, the rest of the node is usable by ZONE_MOVABLE
*/
kernelcore_remaining = kernelcore_node;
/* Go through each range of PFNs within this node */
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { unsignedlong size_pages;
start_pfn = max(start_pfn, zone_movable_pfn[nid]); if (start_pfn >= end_pfn) continue;
/* Account for what is only usable for kernelcore */ if (start_pfn < usable_startpfn) { unsignedlong kernel_pages;
kernel_pages = min(end_pfn, usable_startpfn)
- start_pfn;
/* Continue if range is now fully accounted */ if (end_pfn <= usable_startpfn) {
/* * Push zone_movable_pfn to the end so * that if we have to rebalance * kernelcore across nodes, we will * not double account here
*/
zone_movable_pfn[nid] = end_pfn; continue;
}
start_pfn = usable_startpfn;
}
/* * The usable PFN range for ZONE_MOVABLE is from * start_pfn->end_pfn. Calculate size_pages as the * number of pages used as kernelcore
*/
size_pages = end_pfn - start_pfn; if (size_pages > kernelcore_remaining)
size_pages = kernelcore_remaining;
zone_movable_pfn[nid] = start_pfn + size_pages;
/* * Some kernelcore has been met, update counts and * break if the kernelcore for this node has been * satisfied
*/
required_kernelcore -= min(required_kernelcore,
size_pages);
kernelcore_remaining -= size_pages; if (!kernelcore_remaining) break;
}
}
/* * If there is still required_kernelcore, we do another pass with one * less node in the count. This will push zone_movable_pfn[nid] further * along on the nodes that still have memory until kernelcore is * satisfied
*/
usable_nodes--; if (usable_nodes && required_kernelcore > usable_nodes) goto restart;
out2: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
for_each_node_state(nid, N_MEMORY) { unsignedlong start_pfn, end_pfn;
INIT_LIST_HEAD(&page->lru); #ifdef WANT_PAGE_VIRTUAL /* The shift won't overflow because ZONE_NORMAL is below 4G. */ if (!is_highmem_idx(zone))
set_page_address(page, __va(pfn << PAGE_SHIFT)); #endif
}
#ifdef CONFIG_NUMA
/*
 * During memory init memblocks map pfns to nids. The search is expensive and
 * this caches recent lookups. The implementation of __early_pfn_to_nid
 * treats start/end as pfns.
 */
struct mminit_pfnnid_cache {
	unsigned long last_start;	/* first pfn of the cached range */
	unsigned long last_end;		/* one past the last pfn of the range */
	int last_nid;			/* node id the cached range maps to */
};
/*
 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
 */
static int __meminit __early_pfn_to_nid(unsigned long pfn,
					struct mminit_pfnnid_cache *state)
{
	unsigned long start_pfn, end_pfn;
	int nid;

	/* Fast path: pfn falls inside the range found by the last lookup. */
	if (state->last_start <= pfn && pfn < state->last_end)
		return state->last_nid;

	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
	if (nid != NUMA_NO_NODE) {
		/* Cache the successful lookup for subsequent calls. */
		state->last_start = start_pfn;
		state->last_end = end_pfn;
		state->last_nid = nid;
	}

	return nid;
}
int __meminit early_pfn_to_nid(unsigned long pfn)
{
	static DEFINE_SPINLOCK(early_pfn_lock);
	int nid;

	/* Serialize access to the shared early_pfnnid_cache. */
	spin_lock(&early_pfn_lock);
	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
	if (nid < 0)
		nid = first_online_node;
	spin_unlock(&early_pfn_lock);

	/*
	 * NOTE(review): the return statement and closing brace were missing
	 * in the source (non-void function fell off the end) — restored.
	 */
	return nid;
}
/*
 * Initialize a reserved page unconditionally, finding its zone first.
 */
void __meminit __init_page_from_nid(unsigned long pfn, int nid)
{
	pg_data_t *pgdat;
	int zid;

	pgdat = NODE_DATA(nid);

	/* Find the first zone on this node whose span covers the pfn. */
	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (zone_spans_pfn(zone, pfn))
			break;
	}
	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);

	if (pageblock_aligned(pfn))
		init_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE,
					   false);
}
/* Returns true if the struct page for the pfn is initialised */ staticinlinebool __meminit early_page_initialised(unsignedlong pfn, int nid)
{ if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) returnfalse;
returntrue;
}
/* * Returns true when the remaining initialisation should be deferred until * later in the boot cycle when it can be parallelised.
*/ staticbool __meminit
defer_init(int nid, unsignedlong pfn, unsignedlong end_pfn)
{ staticunsignedlong prev_end_pfn, nr_initialised;
if (early_page_ext_enabled()) returnfalse;
/* Always populate low zones for address-constrained allocations */ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) returnfalse;
if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) returntrue;
/* * prev_end_pfn static that contains the end of previous zone * No need to protect because called very early in boot before smp_init.
*/ if (prev_end_pfn != end_pfn) {
prev_end_pfn = end_pfn;
nr_initialised = 0;
}
/* * We start only with one section of pages, more pages are added as * needed until the rest of deferred pages are initialized.
*/
nr_initialised++; if ((nr_initialised > PAGES_PER_SECTION) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
NODE_DATA(nid)->first_deferred_pfn = pfn; returntrue;
} returnfalse;
}
staticvoid __meminit __init_deferred_page(unsignedlong pfn, int nid)
{ if (early_page_initialised(pfn, nid)) return;
staticinlinevoid __init_deferred_page(unsignedlong pfn, int nid)
{
} #endif/* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/* Wrapper around the config-dependent __init_deferred_page() variant. */
void __meminit init_deferred_page(unsigned long pfn, int nid)
{
	__init_deferred_page(pfn, nid);
}
/* * Initialised pages do not have PageReserved set. This function is * called for each range allocated by the bootmem allocator and * marks the pages PageReserved. The remaining valid pages are later * sent to the buddy page allocator.
*/ void __meminit reserve_bootmem_region(phys_addr_t start,
phys_addr_t end, int nid)
{ unsignedlong pfn;
/* * no need for atomic set_bit because the struct * page is not visible yet so nobody should * access it yet.
*/
__SetPageReserved(page);
}
}
/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ staticbool __meminit
overlap_memmap_init(unsignedlong zone, unsignedlong *pfn)
{ staticstruct memblock_region *r;
if (mirrored_kernelcore && zone == ZONE_MOVABLE) { if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
for_each_mem_region(r) { if (*pfn < memblock_region_memory_end_pfn(r)) break;
}
} if (*pfn >= memblock_region_memory_base_pfn(r) &&
memblock_is_mirror(r)) {
*pfn = memblock_region_memory_end_pfn(r); returntrue;
}
} returnfalse;
}
/* * Only struct pages that correspond to ranges defined by memblock.memory * are zeroed and initialized by going through __init_single_page() during * memmap_init_zone_range(). * * But, there could be struct pages that correspond to holes in * memblock.memory. This can happen because of the following reasons: * - physical memory bank size is not necessarily the exact multiple of the * arbitrary section size * - early reserved memory may not be listed in memblock.memory * - non-memory regions covered by the contiguous flatmem mapping * - memory layouts defined with memmap= kernel parameter may not align * nicely with memmap sections * * Explicitly initialize those struct pages so that: * - PG_Reserved is set * - zone and node links point to zone and node that span the page if the * hole is in the middle of a zone * - zone and node links point to adjacent zone/node if the hole falls on * the zone boundary; the pages in such holes will be prepended to the * zone/node above the hole except for the trailing pages in the last * section that will be appended to the zone/node below.
*/ staticvoid __init init_unavailable_range(unsignedlong spfn, unsignedlong epfn, int zone, int node)
{ unsignedlong pfn;
u64 pgcnt = 0;
if (pgcnt)
pr_info("On node %d, zone %s: %lld pages in unavailable ranges\n",
node, zone_names[zone], pgcnt);
}
/* * Initially all pages are reserved - free ones are freed * up by memblock_free_all() once the early boot process is * done. Non-atomic initialization, single-pass. * * All aligned pageblocks are initialized to the specified migratetype * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related * zone stats (e.g., nr_isolate_pageblock) are touched.
*/ void __meminit memmap_init_range(unsignedlong size, int nid, unsignedlong zone, unsignedlong start_pfn, unsignedlong zone_end_pfn, enum meminit_context context, struct vmem_altmap *altmap, int migratetype, bool isolate_pageblock)
{ unsignedlong pfn, end_pfn = start_pfn + size; struct page *page;
#ifdef CONFIG_ZONE_DEVICE /* * Honor reservation requested by the driver for this ZONE_DEVICE * memory. We limit the total number of pages to initialize to just * those that might contain the memory mapping. We will defer the * ZONE_DEVICE page initialization until after we have released * the hotplug lock.
*/ if (zone == ZONE_DEVICE) { if (!altmap) return;
for (pfn = start_pfn; pfn < end_pfn; ) { /* * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory.
*/ if (context == MEMINIT_EARLY) { if (overlap_memmap_init(zone, &pfn)) continue; if (defer_init(nid, pfn, zone_end_pfn)) {
deferred_struct_pages = true; break;
}
}
/* * Usually, we want to mark the pageblock MIGRATE_MOVABLE, * such that unmovable allocations won't be scattered all * over the place during system boot.
*/ if (pageblock_aligned(pfn)) {
init_pageblock_migratetype(page, migratetype,
isolate_pageblock);
cond_resched();
}
pfn++;
}
}
staticvoid __init memmap_init_zone_range(struct zone *zone, unsignedlong start_pfn, unsignedlong end_pfn, unsignedlong *hole_pfn)
{ unsignedlong zone_start_pfn = zone->zone_start_pfn; unsignedlong zone_end_pfn = zone_start_pfn + zone->spanned_pages; int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
/* * Initialize the memory map for hole in the range [memory_end, * section_end] for SPARSEMEM and in the range [memory_end, memmap_end] * for FLATMEM. * Append the pages in this hole to the highest zone in the last * node.
*/ #ifdef CONFIG_SPARSEMEM
end_pfn = round_up(end_pfn, PAGES_PER_SECTION); #else
end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES); #endif if (hole_pfn < end_pfn)
init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
/* * Mark page reserved as it will need to wait for onlining * phase for it to be fully associated with a zone. * * We can use the non-atomic __set_bit operation for setting * the flag as we are still initializing the pages.
*/
__SetPageReserved(page);
/* * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer * and zone_device_data. It is a bug if a ZONE_DEVICE page is * ever freed or placed on a driver-private list.
*/
page_folio(page)->pgmap = pgmap;
page->zone_device_data = NULL;
/* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations * to reserve their blocks rather than leaking throughout * the address space during boot when many long-lived * kernel allocations are made. * * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate()
*/ if (pageblock_aligned(pfn)) {
init_pageblock_migratetype(page, MIGRATE_MOVABLE, false);
cond_resched();
}
/* * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released * directly to the driver page allocator which will set the page count * to 1 when allocating the page. * * MEMORY_TYPE_GENERIC and MEMORY_TYPE_FS_DAX pages automatically have * their refcount reset to one whenever they are freed (ie. after * their refcount drops to 0).
*/ switch (pgmap->type) { case MEMORY_DEVICE_FS_DAX: case MEMORY_DEVICE_PRIVATE: case MEMORY_DEVICE_COHERENT: case MEMORY_DEVICE_PCI_P2PDMA:
set_page_count(page, 0); break;
case MEMORY_DEVICE_GENERIC: break;
}
}
/* * With compound page geometry and when struct pages are stored in ram most * tail pages are reused. Consequently, the amount of unique struct pages to * initialize is a lot smaller that the total amount of struct pages being * mapped. This is a paired / mild layering violation with explicit knowledge * of how the sparse_vmemmap internals handle compound pages in the lack * of an altmap. See vmemmap_populate_compound_pages().
*/ staticinlineunsignedlong compound_nr_pages(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{ if (!vmemmap_can_optimize(altmap, pgmap)) return pgmap_vmemmap_nr(pgmap);
/* * The first tail page stores important compound page info. * Call prep_compound_head() after the first tail page has * been initialized, to not have the data overwritten.
*/ if (pfn == head_pfn + 1)
prep_compound_head(head, order);
}
}
if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE)) return;
/* * The call to memmap_init should have already taken care * of the pages reserved for the memmap, so we can just jump to * the end of that region and start processing the device pages.
*/ if (altmap) {
start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
nr_pages = end_pfn - start_pfn;
}
/*
 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
 * because it is sized independent of architecture. Unlike the other zones,
 * the starting point for ZONE_MOVABLE is not fixed. It may be different
 * in each node depending on the size of each node and how evenly kernelcore
 * is distributed. This helper function adjusts the zone ranges
 * provided by the architecture for a given node by using the end of the
 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
 * zones within a node are in order of monotonic increases memory addresses
 */
static void __init adjust_zone_range_for_zone_movable(int nid,
					unsigned long zone_type,
					unsigned long node_end_pfn,
					unsigned long *zone_start_pfn,
					unsigned long *zone_end_pfn)
{
	/* Only adjust if ZONE_MOVABLE is on this node */
	if (zone_movable_pfn[nid]) {
		/* Size ZONE_MOVABLE */
		if (zone_type == ZONE_MOVABLE) {
			*zone_start_pfn = zone_movable_pfn[nid];
			*zone_end_pfn = min(node_end_pfn,
				arch_zone_highest_possible_pfn[movable_zone]);

		/* Adjust for ZONE_MOVABLE starting within this range */
		} else if (!mirrored_kernelcore &&
			   *zone_start_pfn < zone_movable_pfn[nid] &&
			   *zone_end_pfn > zone_movable_pfn[nid]) {
			*zone_end_pfn = zone_movable_pfn[nid];

		/* Check if this whole range is within ZONE_MOVABLE */
		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
			*zone_start_pfn = *zone_end_pfn;
	}
}
/* * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for.
*/ staticunsignedlong __init __absent_pages_in_range(int nid, unsignedlong range_start_pfn, unsignedlong range_end_pfn)
{ unsignedlong nr_absent = range_end_pfn - range_start_pfn; unsignedlong start_pfn, end_pfn; int i;
/**
 * absent_pages_in_range - Return number of page frames in holes within a range
 * @start_pfn: The start PFN to start searching for holes
 * @end_pfn: The end PFN to stop searching for holes
 *
 * Return: the number of pages frames in memory holes within a range.
 */
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
					   unsigned long end_pfn)
{
	/* MAX_NUMNODES means account holes across every node. */
	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
}
/* Return the number of page frames in holes in a zone on a node */ staticunsignedlong __init zone_absent_pages_in_node(int nid, unsignedlong zone_type, unsignedlong zone_start_pfn, unsignedlong zone_end_pfn)
{ unsignedlong nr_absent;
/* zone is empty, we don't have any absent pages */ if (zone_start_pfn == zone_end_pfn) return 0;
/* * ZONE_MOVABLE handling. * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages * and vice versa.
*/ if (mirrored_kernelcore && zone_movable_pfn[nid]) { unsignedlong start_pfn, end_pfn; struct memblock_region *r;
/*
 * Return the number of pages a zone spans in a node, including holes
 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
 */
static unsigned long __init zone_spanned_pages_in_node(int nid,
					unsigned long zone_type,
					unsigned long node_start_pfn,
					unsigned long node_end_pfn,
					unsigned long *zone_start_pfn,
					unsigned long *zone_end_pfn)
{
	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];

	/* Get the start and end of the zone */
	*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
	*zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
	adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn,
					   zone_start_pfn, zone_end_pfn);

	/* Check that this node has pages within the zone's required range */
	if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
		return 0;

	/* Move the zone boundaries inside the node if necessary */
	*zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
	*zone_start_pfn = max(*zone_start_pfn, node_start_pfn);

	/*
	 * NOTE(review): final return was truncated in the source — restored;
	 * a non-void function must not fall off the end.
	 */
	return *zone_end_pfn - *zone_start_pfn;
}
#ifndef CONFIG_SPARSEMEM /* * Calculate the size of the zone->pageblock_flags rounded to an unsigned long * Start by making sure zonesize is a multiple of pageblock_order by rounding * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally * round what is now in bits to nearest long in bits, then return it in * bytes.
*/ staticunsignedlong __init usemap_size(unsignedlong zone_start_pfn, unsignedlong zonesize)
{ unsignedlong usemapsize;
staticvoid __ref setup_usemap(struct zone *zone)
{ unsignedlong usemapsize = usemap_size(zone->zone_start_pfn,
zone->spanned_pages);
zone->pageblock_flags = NULL; if (usemapsize) {
zone->pageblock_flags =
memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
zone_to_nid(zone)); if (!zone->pageblock_flags)
panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
usemapsize, zone->name, zone_to_nid(zone));
}
} #else staticinlinevoid setup_usemap(struct zone *zone) {} #endif/* CONFIG_SPARSEMEM */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void)
{ unsignedint order = PAGE_BLOCK_MAX_ORDER;
/* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) return;
/* Don't let pageblocks exceed the maximum allocation granularity. */ if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
order = HUGETLB_PAGE_ORDER;
/* * Assume the largest contiguous order of interest is a huge page. * This value may be variable depending on boot parameters on powerpc.
*/
pageblock_order = order;
} #else/* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
/* * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() * is unused as pageblock_order is set at compile-time. See * include/linux/pageblock-flags.h for the values of pageblock_order based on * the kernel config
*/ void __init set_pageblock_order(void)
{
}
#endif/* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
/* * Set up the zone data structures * - init pgdat internals * - init all zones belonging to this node * * NOTE: this function is only called during memory hotplug
*/ #ifdef CONFIG_MEMORY_HOTPLUG void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
{ int nid = pgdat->node_id; enum zone_type z; int cpu;
pgdat_init_internals(pgdat);
if (pgdat->per_cpu_nodestats == &boot_nodestats)
pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
/* * Reset the nr_zones, order and highest_zoneidx before reuse. * Note that kswapd will init kswapd_highest_zoneidx properly * when it starts in the near future.
*/
pgdat->nr_zones = 0;
pgdat->kswapd_order = 0;
pgdat->kswapd_highest_zoneidx = 0;
pgdat->node_start_pfn = 0;
pgdat->node_present_pages = 0;
p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
memset(p, 0, sizeof(*p));
}
/* * When memory is hot-added, all the memory is in offline state. So * clear all zones' present_pages and managed_pages because they will * be updated in online_pages() and offline_pages().
*/ for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = pgdat->node_zones + z;
for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsignedlong size = zone->spanned_pages;
/* * Initialize zone->managed_pages as 0 , it will be reset * when memblock allocator frees pages into buddy system.
*/
zone_init_internals(zone, j, nid, zone->present_pages);
/* * Kmemleak will explicitly scan mem_map by traversing all valid * `struct *page`,so memblock does not need to be added to the scan list.
*/ if (exact_nid)
ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
MEMBLOCK_ALLOC_NOLEAKTRACE,
nid); else
ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
MEMBLOCK_ALLOC_NOLEAKTRACE,
nid);
/** * get_pfn_range_for_nid - Return the start and end page frames for a node * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. * @start_pfn: Passed by reference. On return, it will have the node start_pfn. * @end_pfn: Passed by reference. On return, it will have the node end_pfn. * * It returns the start and end page frame of a node based on information * provided by memblock_set_node(). If called for a node * with no available memory, the start and end PFNs will be 0.
*/ void __init get_pfn_range_for_nid(unsignedint nid, unsignedlong *start_pfn, unsignedlong *end_pfn)
{ unsignedlong this_start_pfn, this_end_pfn; int i;
/* * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For * such cases we allow max_zone_pfn sorted in the descending order
*/ staticbool arch_has_descending_max_zone_pfns(void)
{ return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
}
/* * Some architectures (e.g. ARM) set high_memory very early and * use it in arch setup code. * If an architecture already set high_memory don't overwrite it
*/ if (high_memory) return;
/** * free_area_init - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. * Using the page ranges provided by memblock_set_node(), the size of each * zone in each node and their holes is calculated. If the maximum PFN * between two adjacent zones match, it is assumed that the zone is empty. * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed * that arch_max_dma32_pfn has no pages. It is also assumed that a zone * starts where the previous one ended. For example, ZONE_DMA32 starts * at arch_max_dma_pfn.
*/ void __init free_area_init(unsignedlong *max_zone_pfn)
{ unsignedlong start_pfn, end_pfn; int i, nid, zone; bool descending;
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0, sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn));
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
find_zone_movable_pfns_for_nodes();
/* Print out the zone ranges */
pr_info("Zone ranges:\n"); for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue;
pr_info(" %-8s ", zone_names[i]); if (arch_zone_lowest_possible_pfn[i] ==
arch_zone_highest_possible_pfn[i])
pr_cont("empty\n"); else
pr_cont("[mem %#018Lx-%#018Lx]\n",
(u64)arch_zone_lowest_possible_pfn[i]
<< PAGE_SHIFT,
((u64)arch_zone_highest_possible_pfn[i]
<< PAGE_SHIFT) - 1);
}
/* Print out the PFNs ZONE_MOVABLE begins at in each node */
pr_info("Movable zone start for each node\n"); for (i = 0; i < MAX_NUMNODES; i++) { if (zone_movable_pfn[i])
pr_info(" Node %d: %#018Lx\n", i,
(u64)zone_movable_pfn[i] << PAGE_SHIFT);
}
/* * Print out the early node map, and initialize the * subsection-map relative to active online memory ranges to * enable future "sub-section" extensions of the memory map.
*/
pr_info("Early memory node ranges\n");
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
((u64)end_pfn << PAGE_SHIFT) - 1);
subsection_map_init(start_pfn, end_pfn - start_pfn);
}
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
set_pageblock_order();
for_each_node(nid) {
pg_data_t *pgdat;
if (!node_online(nid))
alloc_offline_node_data(nid);
pgdat = NODE_DATA(nid);
free_area_init_node(nid);
/* * No sysfs hierarchy will be created via register_one_node() *for memory-less node because here it's not marked as N_MEMORY *and won't be set online later. The benefit is userspace *program won't be confused by sysfs files/directories of *memory-less node. The pgdat will get fully initialized by *hotadd_init_pgdat() when memory is hotplugged into this node.
*/ if (pgdat->node_present_pages) {
node_set_state(nid, N_MEMORY);
check_for_memory(pgdat);
}
}
/* disable hash distribution for systems with a single node */
fixup_hashdist();
set_high_memory();
}
/** * node_map_pfn_alignment - determine the maximum internode alignment * * This function should be called after node map is populated and sorted. * It calculates the maximum power of two alignment which can distinguish * all the nodes. * * For example, if all nodes are 1GiB and aligned to 1GiB, the return value * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is * shifted, 1GiB is enough and this function will indicate so. * * This is used to test whether pfn -> nid mapping of the chosen memory * model has fine enough granularity to avoid incorrect mapping for the * populated node map. * * Return: the determined alignment in pfn's. 0 if there is no alignment * requirement (single node).
*/ unsignedlong __init node_map_pfn_alignment(void)
{ unsignedlong accl_mask = 0, last_end = 0; unsignedlong start, end, mask; int last_nid = NUMA_NO_NODE; int i, nid;
/* * Start with a mask granular enough to pin-point to the * start pfn and tick off bits one-by-one until it becomes * too coarse to separate the current node from the last.
*/
mask = ~((1 << __ffs(start)) - 1); while (mask && last_end <= (start & (mask << 1)))
mask <<= 1;
/* accumulate all internode masks */
accl_mask |= mask;
}
/* convert mask to number of pages */ return ~accl_mask + 1;
}
/* Free a large naturally-aligned chunk if possible */ if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) { for (i = 0; i < nr_pages; i += pageblock_nr_pages)
init_pageblock_migratetype(page + i, MIGRATE_MOVABLE, false);
__free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY); return;
}
/*
 * Decrement the count of nodes still doing deferred pgdat init and
 * signal overall completion once it reaches zero.
 */
static inline void __init pgdat_init_report_one_done(void)
{
	if (atomic_dec_and_test(&pgdat_init_n_undone))
		complete(&pgdat_init_all_done_comp);
}
/*
 * Initialize struct pages.  We minimize pfn page lookups and scheduler checks
 * by performing it only once every MAX_ORDER_NR_PAGES.
 * Return number of pages initialized.
 */
static unsigned long __init deferred_init_pages(struct zone *zone,
						unsigned long pfn,
						unsigned long end_pfn)
{
	int nid = zone_to_nid(zone);
	unsigned long nr_pages = end_pfn - pfn;
	int zid = zone_idx(zone);
	struct page *page = pfn_to_page(pfn);

	/*
	 * NOTE(review): the loop body and return were missing from this copy
	 * of the file; restored from the upstream implementation -- confirm
	 * against the kernel tree this file was taken from.
	 */
	for (; pfn < end_pfn; pfn++, page++)
		__init_single_page(page, pfn, zid, nid);
	return nr_pages;
}
/* * This function is meant to pre-load the iterator for the zone init from * a given point. * Specifically it walks through the ranges starting with initial index * passed to it until we are caught up to the first_init_pfn value and * exits there. If we never encounter the value we return false indicating * there are no valid ranges left.
*/ staticbool __init
deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, unsignedlong *spfn, unsignedlong *epfn, unsignedlong first_init_pfn)
{
u64 j = *i;
if (j == 0)
__next_mem_pfn_range_in_zone(&j, zone, spfn, epfn);
/* * Start out by walking through the ranges in this zone that have * already been initialized. We don't need to do anything with them * so we just need to flush them out of the system.
*/
for_each_free_mem_pfn_range_in_zone_from(j, zone, spfn, epfn) { if (*epfn <= first_init_pfn) continue; if (*spfn < first_init_pfn)
*spfn = first_init_pfn;
*i = j; returntrue;
}
returnfalse;
}
/*
 * Initialize and free pages. We do it in two loops: first we initialize
 * struct page, then free to buddy allocator, because while we are
 * freeing pages we can access pages that are ahead (computing buddy
 * page in __free_one_page()).
 *
 * In order to try and keep some memory in the cache we have the loop
 * broken along max page order boundaries. This way we will not cause
 * any issues with the buddy page computation.
 */
static unsigned long __init
deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
		       unsigned long *end_pfn)
{
	unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
	unsigned long spfn = *start_pfn, epfn = *end_pfn;
	unsigned long nr_pages = 0;
	u64 j = *i;

	/* First we loop through and initialize the page values */
	for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
		unsigned long t;

		if (mo_pfn <= *start_pfn)
			break;

		t = min(mo_pfn, *end_pfn);
		nr_pages += deferred_init_pages(zone, *start_pfn, t);

		if (mo_pfn < *end_pfn) {
			*start_pfn = mo_pfn;
			break;
		}
	}

	/* Reset values and now loop through freeing pages as needed */
	swap(j, *i);

	/*
	 * NOTE(review): the freeing loop and return below were missing from
	 * this copy of the file (text from deferred_init_memmap() had been
	 * spliced in their place); restored from the upstream implementation
	 * -- confirm against the kernel tree this file was taken from.
	 */
	for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
		unsigned long t;

		if (mo_pfn <= spfn)
			break;

		t = min(mo_pfn, epfn);
		deferred_free_pages(spfn, t - spfn);

		if (mo_pfn <= epfn)
			break;
	}

	return nr_pages;
}
/* * Once we unlock here, the zone cannot be grown anymore, thus if an * interrupt thread must allocate this early in boot, zone must be * pre-grown prior to start of deferred page initialization.
*/
pgdat_resize_unlock(pgdat, &flags);
/* Only the highest zone is deferred */
zone = pgdat->node_zones + pgdat->nr_zones - 1;
/*
 * If this zone has deferred pages, try to grow it by initializing enough
 * deferred pages to satisfy the allocation specified by order, rounded up to
 * the nearest PAGES_PER_SECTION boundary.  So we're adding memory in increments
 * of SECTION_SIZE bytes by initializing struct pages in increments of
 * PAGES_PER_SECTION * sizeof(struct page) bytes.
 *
 * Return true when zone was grown, otherwise return false. We return true even
 * when we grow less than requested, to let the caller decide if there are
 * enough pages to satisfy the allocation.
 */
bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
{
	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
	pg_data_t *pgdat = zone->zone_pgdat;
	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
	unsigned long spfn, epfn, flags;
	unsigned long nr_pages = 0;
	u64 i = 0;

	/* Only the last zone may have deferred pages */
	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
		return false;

	pgdat_resize_lock(pgdat, &flags);

	/*
	 * If someone grew this zone while we were waiting for spinlock, return
	 * true, as there might be enough pages already.
	 */
	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
		pgdat_resize_unlock(pgdat, &flags);
		return true;
	}

	/* If the zone is empty somebody else may have cleared out the zone */
	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
						 first_deferred_pfn)) {
		pgdat->first_deferred_pfn = ULONG_MAX;
		pgdat_resize_unlock(pgdat, &flags);
		/* Retry only once. */
		return first_deferred_pfn != ULONG_MAX;
	}

	/*
	 * Initialize and free pages in MAX_PAGE_ORDER sized increments so
	 * that we can avoid introducing any issues with the buddy
	 * allocator.
	 *
	 * NOTE(review): the body of this loop and the function epilogue were
	 * missing from this copy of the file; restored from the upstream
	 * implementation -- confirm against the kernel tree this file was
	 * taken from.
	 */
	while (spfn < epfn) {
		/* update our first deferred PFN for this section */
		first_deferred_pfn = spfn;

		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
		touch_nmi_watchdog();

		/* We should only stop along section boundaries */
		if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
			continue;

		/* If our quota has been met we can stop here */
		if (nr_pages >= nr_pages_needed)
			break;
	}

	pgdat->first_deferred_pfn = spfn;
	pgdat_resize_unlock(pgdat, &flags);

	return nr_pages > 0;
}
/*
 * NOTE(review): the following text is website-scraper residue (a German
 * disclaimer footer), not part of the kernel source.  Translated to English
 * and preserved as a comment:
 * "The information on this website was compiled carefully and to the best of
 *  our knowledge.  However, no guarantee is given as to the completeness,
 *  correctness, or quality of the information provided.
 *  Note: the colored syntax rendering and the measurement are still
 *  experimental."
 */