/* * In "forced" memmap_on_memory mode, we add extra pages to align the * vmemmap size to cover full pageblocks. That way, we can add memory * even if the vmemmap size is not properly aligned, however, we might waste * memory.
*/ if (memmap_mode == MEMMAP_ON_MEMORY_FORCE) return pageblock_align(nr_pages); return nr_pages;
}
staticconststruct kernel_param_ops memmap_mode_ops = {
.set = set_memmap_mode,
.get = get_memmap_mode,
};
module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n" "With value \"force\" it could result in memory wastage due " "to memmap size limitations (Y/N/force)");
/* * memory_hotplug.online_policy: configure online behavior when onlining without * specifying a zone (MMOP_ONLINE) * * "contig-zones": keep zone contiguous * "auto-movable": online memory to ZONE_MOVABLE if the configuration * (auto_movable_ratio, auto_movable_numa_aware) allows for it
*/ staticint online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES; staticconststruct kernel_param_ops online_policy_ops = {
.set = set_online_policy,
.get = get_online_policy,
};
module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644);
MODULE_PARM_DESC(online_policy, "Set the online policy (\"contig-zones\", \"auto-movable\") " "Default: \"contig-zones\"");
/*
 * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio
 *
 * The ratio represents an upper limit and the kernel might decide to not
 * online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory
 * doesn't allow for more MOVABLE memory.
 */
static unsigned int auto_movable_ratio __read_mostly = 301;
module_param(auto_movable_ratio, uint, 0644);
MODULE_PARM_DESC(auto_movable_ratio,
		 "Set the maximum ratio of MOVABLE:KERNEL memory in the system "
		 "in percent for \"auto-movable\" online policy. Default: 301");
/*
 * memory_hotplug.auto_movable_numa_aware: consider numa node stats
 *
 * The parameter only exists when the kernel is built with CONFIG_NUMA.
 */
#ifdef CONFIG_NUMA
static bool auto_movable_numa_aware __read_mostly = true;
module_param(auto_movable_numa_aware, bool, 0644);
MODULE_PARM_DESC(auto_movable_numa_aware,
		 "Consider numa node stats in addition to global stats in "
		 "\"auto-movable\" online policy. Default: true");
#endif /* CONFIG_NUMA */
/* * online_page_callback contains pointer to current page onlining function. * Initially it is generic_online_page(). If it is required it could be * changed by calling set_online_page_callback() for callback registration * and restore_online_page_callback() for generic callback restore.
*/
if (strcmp(resource_name, "System RAM"))
flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
if (!mhp_range_allowed(start, size, true)) return ERR_PTR(-E2BIG);
/* * Make sure value parsed from 'mem=' only restricts memory adding * while booting, so that memory hotplug won't be impacted. Please * refer to document of 'mem=' in kernel-parameters.txt for more * details.
*/ if (start + size > max_mem_size && system_state < SYSTEM_RUNNING) return ERR_PTR(-E2BIG);
/* * Request ownership of the new memory range. This might be * a child of an existing resource that was present but * not marked as busy.
*/
res = __request_region(&iomem_resource, start, size,
resource_name, flags);
if (!res) {
pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
start, start + size); return ERR_PTR(-EEXIST);
} return res;
}
staticint check_pfn_span(unsignedlong pfn, unsignedlong nr_pages)
{ /* * Disallow all operations smaller than a sub-section and only * allow operations smaller than a section for * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range() * enforces a larger memory_block_size_bytes() granularity for * memory that will be marked online, so this check should only * fire for direct arch_{add,remove}_memory() users outside of * add_memory_resource().
*/ unsignedlong min_align;
if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
min_align = PAGES_PER_SUBSECTION; else
min_align = PAGES_PER_SECTION; if (!IS_ALIGNED(pfn | nr_pages, min_align)) return -EINVAL; return 0;
}
/* * Return page for the valid pfn only if the page is online. All pfn * walkers which rely on the fully initialized page->flags and others * should use this rather than pfn_valid && pfn_to_page
*/ struct page *pfn_to_online_page(unsignedlong pfn)
{ unsignedlong nr = pfn_to_section_nr(pfn); struct dev_pagemap *pgmap; struct mem_section *ms;
if (nr >= NR_MEM_SECTIONS) return NULL;
ms = __nr_to_section(nr); if (!online_section(ms)) return NULL;
/* * Save some code text when online_section() + * pfn_section_valid() are sufficient.
*/ if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn)) return NULL;
if (!pfn_section_valid(ms, pfn)) return NULL;
if (!online_device_section(ms)) return pfn_to_page(pfn);
/* * Slowpath: when ZONE_DEVICE collides with * ZONE_{NORMAL,MOVABLE} within the same section some pfns in * the section may be 'offline' but 'valid'. Only * get_dev_pagemap() can determine sub-section online status.
*/
pgmap = get_dev_pagemap(pfn, NULL);
put_dev_pagemap(pgmap);
/* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ if (pgmap) return NULL;
for (; pfn < end_pfn; pfn += cur_nr_pages) { /* Select all remaining pages up to the next section boundary */
cur_nr_pages = min(end_pfn - pfn,
SECTION_ALIGN_UP(pfn + 1) - pfn);
err = sparse_add_section(nid, pfn, cur_nr_pages, altmap,
params->pgmap); if (err) break;
cond_resched();
}
vmemmap_populate_print_last(); return err;
}
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */ staticunsignedlong find_smallest_section_pfn(int nid, struct zone *zone, unsignedlong start_pfn, unsignedlong end_pfn)
{ for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) { if (unlikely(!pfn_to_online_page(start_pfn))) continue;
if (unlikely(pfn_to_nid(start_pfn) != nid)) continue;
if (zone != page_zone(pfn_to_page(start_pfn))) continue;
return start_pfn;
}
return 0;
}
/* find the biggest valid pfn in the range [start_pfn, end_pfn). */ staticunsignedlong find_biggest_section_pfn(int nid, struct zone *zone, unsignedlong start_pfn, unsignedlong end_pfn)
{ unsignedlong pfn;
/* pfn is the end pfn of a memory section. */
pfn = end_pfn - 1; for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) { if (unlikely(!pfn_to_online_page(pfn))) continue;
if (unlikely(pfn_to_nid(pfn) != nid)) continue;
if (zone != page_zone(pfn_to_page(pfn))) continue;
return pfn;
}
return 0;
}
staticvoid shrink_zone_span(struct zone *zone, unsignedlong start_pfn, unsignedlong end_pfn)
{ unsignedlong pfn; int nid = zone_to_nid(zone);
if (zone->zone_start_pfn == start_pfn) { /* * If the section is smallest section in the zone, it need * shrink zone->zone_start_pfn and zone->zone_spanned_pages. * In this case, we find second smallest valid mem_section * for shrinking zone.
*/
pfn = find_smallest_section_pfn(nid, zone, end_pfn,
zone_end_pfn(zone)); if (pfn) {
zone->spanned_pages = zone_end_pfn(zone) - pfn;
zone->zone_start_pfn = pfn;
} else {
zone->zone_start_pfn = 0;
zone->spanned_pages = 0;
}
} elseif (zone_end_pfn(zone) == end_pfn) { /* * If the section is biggest section in the zone, it need * shrink zone->spanned_pages. * In this case, we find second biggest valid mem_section for * shrinking zone.
*/
pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
start_pfn); if (pfn)
zone->spanned_pages = pfn - zone->zone_start_pfn + 1; else {
zone->zone_start_pfn = 0;
zone->spanned_pages = 0;
}
}
}
for (zone = pgdat->node_zones;
zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { unsignedlong end_pfn = zone_end_pfn(zone);
/* No need to lock the zones, they can't change. */ if (!zone->spanned_pages) continue; if (!node_end_pfn) {
node_start_pfn = zone->zone_start_pfn;
node_end_pfn = end_pfn; continue;
}
if (end_pfn > node_end_pfn)
node_end_pfn = end_pfn; if (zone->zone_start_pfn < node_start_pfn)
node_start_pfn = zone->zone_start_pfn;
}
/* Poison struct pages because they are now uninitialized again. */ for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
cond_resched();
/* Select all remaining pages up to the next section boundary */
cur_nr_pages =
min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
page_init_poison(pfn_to_page(pfn), sizeof(struct page) * cur_nr_pages);
}
/* * Zone shrinking code cannot properly deal with ZONE_DEVICE. So * we will not try to shrink the zones - which is okay as * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
*/ if (zone_is_zone_device(zone)) return;
/** * __remove_pages() - remove sections of pages * @pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used * * Generic helper function to remove section mappings and sysfs entries * for the section of the memory we are removing. Caller needs to make * sure that pages are marked reserved and zones are adjusted properly by * calling offline_pages().
*/ void __remove_pages(unsignedlong pfn, unsignedlong nr_pages, struct vmem_altmap *altmap)
{ constunsignedlong end_pfn = pfn + nr_pages; unsignedlong cur_nr_pages;
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ void generic_online_page(struct page *page, unsignedint order)
{
__free_pages_core(page, order, MEMINIT_HOTPLUG);
}
EXPORT_SYMBOL_GPL(generic_online_page);
/* * Online the pages in MAX_PAGE_ORDER aligned chunks. The callback might * decide to not expose all pages to the buddy (e.g., expose them * later). We account all pages as being online and belonging to this * zone ("present"). * When using memmap_on_memory, the range might not be aligned to * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect * this and the first chunk to online will be pageblock_nr_pages.
*/ for (pfn = start_pfn; pfn < end_pfn;) { struct page *page = pfn_to_page(pfn); int order;
/* * Free to online pages in the largest chunks alignment allows. * * __ffs() behaviour is undefined for 0. start == 0 is * MAX_PAGE_ORDER-aligned, Set order to MAX_PAGE_ORDER for * the case.
*/ if (pfn)
order = min_t(int, MAX_PAGE_ORDER, __ffs(pfn)); else
order = MAX_PAGE_ORDER;
/* * Exposing the page to the buddy by freeing can cause * issues with debug_pagealloc enabled: some archs don't * like double-unmappings. So treat them like any pages that * were allocated from the buddy.
*/
debug_pagealloc_map_pages(page, 1 << order);
(*online_page_callback)(page, order);
pfn += (1UL << order);
}
/* mark all involved sections as online */
online_mem_sections(start_pfn, end_pfn);
}
/* * Associate the pfn range with the given zone, initializing the memmaps * and resizing the pgdat/zone data to span the added pages. After this * call, all affected pages are PageOffline(). * * All aligned pageblocks are initialized to the specified migratetype * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related * zone stats (e.g., nr_isolate_pageblock) are touched.
*/ void move_pfn_range_to_zone(struct zone *zone, unsignedlong start_pfn, unsignedlong nr_pages, struct vmem_altmap *altmap, int migratetype, bool isolate_pageblock)
{ struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id;
/* * Subsection population requires care in pfn_to_online_page(). * Set the taint to enable the slow path detection of * ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE} * section.
*/ if (zone_is_zone_device(zone)) { if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
section_taint_zone_device(start_pfn); if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))
section_taint_zone_device(start_pfn + nr_pages);
}
/* * TODO now we have a visible range of pages which are not associated * with their zone properly. Not nice but set_pfnblock_migratetype() * expects the zone spans the pfn range. All the pages in the range * are reserved so nobody should be touching them so we should be safe
*/
memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
MEMINIT_HOTPLUG, altmap, migratetype,
isolate_pageblock);
staticint auto_movable_stats_account_group(struct memory_group *group, void *arg)
{ constint ratio = READ_ONCE(auto_movable_ratio); struct auto_movable_group_stats *stats = arg; long pages;
/* * We don't support modifying the config while the auto-movable online * policy is already enabled. Just avoid the division by zero below.
*/ if (!ratio) return 0;
/* * Calculate how many early kernel pages this group requires to * satisfy the configured zone ratio.
*/
pages = group->present_movable_pages * 100 / ratio;
pages -= group->present_kernel_pages;
/* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */ if (nid == NUMA_NO_NODE) { /* TODO: cache values */
for_each_populated_zone(zone)
auto_movable_stats_account_zone(&stats, zone);
} else { for (i = 0; i < MAX_NR_ZONES; i++) {
pg_data_t *pgdat = NODE_DATA(nid);
zone = pgdat->node_zones + i; if (populated_zone(zone))
auto_movable_stats_account_zone(&stats, zone);
}
}
/* * Kernel memory inside dynamic memory group allows for more MOVABLE * memory within the same group. Remove the effect of all but the * current group from the stats.
*/
walk_dynamic_memory_groups(nid, auto_movable_stats_account_group,
group, &group_stats); if (kernel_early_pages <= group_stats.req_kernel_early_pages) returnfalse;
kernel_early_pages -= group_stats.req_kernel_early_pages;
movable_pages -= group_stats.movable_pages;
if (group && group->is_dynamic)
kernel_early_pages += group->present_kernel_pages;
/* * Test if we could online the given number of pages to ZONE_MOVABLE * and still stay in the configured ratio.
*/
movable_pages += nr_pages; return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100;
}
/* * Returns a default kernel memory zone for the given pfn range. * If no kernel zone covers this pfn range it will automatically go * to the ZONE_NORMAL.
*/ staticstruct zone *default_kernel_zone_for_pfn(int nid, unsignedlong start_pfn, unsignedlong nr_pages)
{ struct pglist_data *pgdat = NODE_DATA(nid); int zid;
for (zid = 0; zid < ZONE_NORMAL; zid++) { struct zone *zone = &pgdat->node_zones[zid];
if (zone_intersects(zone, start_pfn, nr_pages)) return zone;
}
return &pgdat->node_zones[ZONE_NORMAL];
}
/* * Determine to which zone to online memory dynamically based on user * configuration and system stats. We care about the following ratio: * * MOVABLE : KERNEL * * Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in * one of the kernel zones. CMA pages inside one of the kernel zones really * behave like ZONE_MOVABLE, so we treat them accordingly. * * We don't allow for hotplugged memory in a KERNEL zone to increase the * amount of MOVABLE memory we can have, so we end up with: * * MOVABLE : KERNEL_EARLY * * Whereby KERNEL_EARLY is memory in one of the kernel zones, available since * boot. We base our calculation on KERNEL_EARLY internally, because: * * a) Hotplugged memory in one of the kernel zones can sometimes still get * hotunplugged, especially when hot(un)plugging individual memory blocks. * There is no coordination across memory devices, therefore "automatic" * hotunplugging, as implemented in hypervisors, could result in zone * imbalances. * b) Early/boot memory in one of the kernel zones can usually not get * hotunplugged again (e.g., no firmware interface to unplug, fragmented * with unmovable allocations). While there are corner cases where it might * still work, it is barely relevant in practice. * * Exceptions are dynamic memory groups, which allow for more MOVABLE * memory within the same memory group -- because in that case, there is * coordination within the single memory device managed by a single driver. * * We rely on "present pages" instead of "managed pages", as the latter is * highly unreliable and dynamic in virtualized environments, and does not * consider boot time allocations. For example, memory ballooning adjusts the * managed pages when inflating/deflating the balloon, and balloon compaction * can even migrate inflated pages between zones. 
* * Using "present pages" is better but some things to keep in mind are: * * a) Some memblock allocations, such as for the crashkernel area, are * effectively unused by the kernel, yet they account to "present pages". * Fortunately, these allocations are comparatively small in relevant setups * (e.g., fraction of system memory). * b) Some hotplugged memory blocks in virtualized environments, especially * hotplugged by virtio-mem, look like they are completely present, however, * only parts of the memory block are actually currently usable. * "present pages" is an upper limit that can get reached at runtime. As * we base our calculations on KERNEL_EARLY, this is not an issue.
*/ staticstruct zone *auto_movable_zone_for_pfn(int nid, struct memory_group *group, unsignedlong pfn, unsignedlong nr_pages)
{ unsignedlong online_pages = 0, max_pages, end_pfn; struct page *page;
/* If anything is !MOVABLE online the rest !MOVABLE. */ if (group->present_kernel_pages) goto kernel_zone;
} elseif (!group || group->d.unit_pages == nr_pages) {
max_pages = nr_pages;
} else {
max_pages = group->d.unit_pages; /* * Take a look at all online sections in the current unit. * We can safely assume that all pages within a section belong * to the same zone, because dynamic memory groups only deal * with hotplugged memory.
*/
pfn = ALIGN_DOWN(pfn, group->d.unit_pages);
end_pfn = pfn + group->d.unit_pages; for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
page = pfn_to_online_page(pfn); if (!page) continue; /* If anything is !MOVABLE online the rest !MOVABLE. */ if (!is_zone_movable_page(page)) goto kernel_zone;
online_pages += PAGES_PER_SECTION;
}
}
/* * Online MOVABLE if we could *currently* online all remaining parts * MOVABLE. We expect to (add+) online them immediately next, so if * nobody interferes, all will be MOVABLE if possible.
*/
nr_pages = max_pages - online_pages; if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages)) goto kernel_zone;
/* * We inherit the existing zone in a simple case where zones do not * overlap in the given range
*/ if (in_kernel ^ in_movable) return (in_kernel) ? kernel_zone : movable_zone;
/* * If the range doesn't belong to any zone or two zones overlap in the * given range then we use movable zone only if movable_node is * enabled because we always online to a kernel zone by default.
*/ return movable_node_enabled ? movable_zone : kernel_zone;
}
struct zone *zone_for_pfn_range(int online_type, int nid, struct memory_group *group, unsignedlong start_pfn, unsignedlong nr_pages)
{ if (online_type == MMOP_ONLINE_KERNEL) return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
if (online_type == MMOP_ONLINE_MOVABLE) return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
if (online_policy == ONLINE_POLICY_AUTO_MOVABLE) return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages);
/* * This function should only be called by memory_block_{online,offline}, * and {online,offline}_pages.
*/ void adjust_present_page_count(struct page *page, struct memory_group *group, long nr_pages)
{ struct zone *zone = page_zone(page); constbool movable = zone_idx(zone) == ZONE_MOVABLE;
/* * We only support onlining/offlining/adding/removing of complete * memory blocks; therefore, either all is either early or hotplugged.
*/ if (early_section(__pfn_to_section(page_to_pfn(page))))
zone->present_early_pages += nr_pages;
zone->present_pages += nr_pages;
zone->zone_pgdat->node_present_pages += nr_pages;
int mhp_init_memmap_on_memory(unsignedlong pfn, unsignedlong nr_pages, struct zone *zone, bool mhp_off_inaccessible)
{ unsignedlong end_pfn = pfn + nr_pages; int ret, i;
ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); if (ret) return ret;
/* * Memory block is accessible at this stage and hence poison the struct * pages now. If the memory block is accessible during memory hotplug * addition phase, then page poisoning is already performed in * sparse_add_section().
*/ if (mhp_off_inaccessible)
page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
/* * It might be that the vmemmap_pages fully span sections. If that is * the case, mark those sections online here as otherwise they will be * left offline.
*/ if (nr_pages >= PAGES_PER_SECTION)
online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
/* * It might be that the vmemmap_pages fully span sections. If that is * the case, mark those sections offline here as otherwise they will be * left online.
*/ if (nr_pages >= PAGES_PER_SECTION)
offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
/* * The pages associated with this vmemmap have been offlined, so * we can reset its state here.
*/
remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
}
/* * Must be called with mem_hotplug_lock in write mode.
*/ int online_pages(unsignedlong pfn, unsignedlong nr_pages, struct zone *zone, struct memory_group *group)
{ struct memory_notify mem_arg = {
.start_pfn = pfn,
.nr_pages = nr_pages,
}; struct node_notify node_arg = {
.nid = NUMA_NO_NODE,
}; constint nid = zone_to_nid(zone); int need_zonelists_rebuild = 0; unsignedlong flags; int ret;
/* * {on,off}lining is constrained to full memory sections (or more * precisely to memory blocks from the user space POV). * memmap_on_memory is an exception because it reserves initial part * of the physical memory space for vmemmaps. That space is pageblock * aligned.
*/ if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(pfn) ||
!IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL;
/* associate pfn range with the zone */
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE, true);
if (!node_state(nid, N_MEMORY)) { /* Adding memory to the node for the first time */
node_arg.nid = nid;
ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg);
ret = notifier_to_errno(ret); if (ret) goto failed_addition;
}
ret = memory_notify(MEM_GOING_ONLINE, &mem_arg);
ret = notifier_to_errno(ret); if (ret) goto failed_addition;
/* * Fixup the number of isolated pageblocks before marking the sections * onlining, such that undo_isolate_page_range() works correctly.
*/
spin_lock_irqsave(&zone->lock, flags);
zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
/* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. * So, zonelist must be updated after online.
*/ if (!populated_zone(zone)) {
need_zonelists_rebuild = 1;
setup_zone_pageset(zone);
}
if (node_arg.nid >= 0)
node_set_state(nid, N_MEMORY); if (need_zonelists_rebuild)
build_all_zonelists(NULL);
/* Basic onlining is complete, allow allocation of onlined pages. */
undo_isolate_page_range(pfn, pfn + nr_pages);
/* * Freshly onlined pages aren't shuffled (e.g., all pages are placed to * the tail of the freelist when undoing isolation). Shuffle the whole * zone to make sure the just onlined pages are properly distributed * across the whole freelist - to create an initial shuffle.
*/
shuffle_zone(zone);
/* reinitialise watermarks and update pcp limits */
init_per_zone_wmark_min();
kswapd_run(nid);
kcompactd_run(nid);
if (node_arg.nid >= 0) /* First memory added successfully. Notify consumers. */
node_notify(NODE_ADDED_FIRST_MEMORY, &node_arg);
/*
 * Prepare the pgdat of node @nid for memory hotplug and return it.
 *
 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
 */
static pg_data_t *hotadd_init_pgdat(int nid)
{
	/*
	 * NODE_DATA is preallocated (free_area_init) but its internal
	 * state is not allocated completely. Add missing pieces.
	 * Completely offline nodes stay around and they just need
	 * reinitialization.
	 */
	struct pglist_data *node_data = NODE_DATA(nid);

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_core_hotplug(node_data);

	/*
	 * The node we allocated has no zone fallback lists. For avoiding
	 * to access not-initialized zonelist, build here.
	 */
	build_all_zonelists(node_data);

	return node_data;
}
/*
 * __try_online_node - online a node if offlined
 * @nid: the node ID
 * @set_node_online: Whether we want to online the node
 * called by cpu_up() to online a node without onlined memory.
 *
 * Returns:
 * 1 -> a new node has been allocated (only when @set_node_online is false)
 * 0 -> the node is already online, or it was just onlined and registered
 *      (@set_node_online is true: the return value is then the result of
 *      register_one_node(), which is 0 once BUG_ON() has passed)
 * -ENOMEM -> the node could not be allocated
 */
static int __try_online_node(int nid, bool set_node_online)
{
	pg_data_t *pgdat;
	int ret = 1;

	if (node_online(nid))
		return 0;

	pgdat = hotadd_init_pgdat(nid);
	if (!pgdat) {
		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
		ret = -ENOMEM;
		goto out;
	}

	if (set_node_online) {
		node_set_online(nid);
		/* Overwrites the initial 1; a registration failure is fatal. */
		ret = register_one_node(nid);
		BUG_ON(ret);
	}
out:
	return ret;
}
/*
 * Users of this function always want to online/register the node.
 *
 * Wraps __try_online_node(nid, true) in a mem_hotplug_begin() /
 * mem_hotplug_done() pair and passes its return value through.
 */
int try_online_node(int nid)
{
	int rc;

	mem_hotplug_begin();
	rc = __try_online_node(nid, true);
	mem_hotplug_done();

	return rc;
}
staticint check_hotplug_memory_range(u64 start, u64 size)
{ /* memory range must be block size aligned */ if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
!IS_ALIGNED(size, memory_block_size_bytes())) {
pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
memory_block_size_bytes(), start, size); return -EINVAL;
}
#ifndef arch_supports_memmap_on_memory staticinlinebool arch_supports_memmap_on_memory(unsignedlong vmemmap_size)
{ /* * As default, we want the vmemmap to span a complete PMD such that we * can map the vmemmap using a single PMD if supported by the * architecture.
*/ return IS_ALIGNED(vmemmap_size, PMD_SIZE);
} #endif
/* * Besides having arch support and the feature enabled at runtime, we * need a few more assumptions to hold true: * * a) The vmemmap pages span complete PMDs: We don't want vmemmap code * to populate memory from the altmap for unrelated parts (i.e., * other memory blocks) * * b) The vmemmap pages (and thereby the pages that will be exposed to * the buddy) have to cover full pageblocks: memory onlining/offlining * code requires applicable ranges to be page-aligned, for example, to * set the migratetypes properly. * * TODO: Although we have a check here to make sure that vmemmap pages * fully populate a PMD, it is not the right place to check for * this. A much better solution involves improving vmemmap code * to fallback to base pages when trying to populate vmemmap using * altmap as an alternative source of memory, and we do not exactly * populate a single PMD.
*/ if (!mhp_memmap_on_memory()) returnfalse;
/* * Make sure the vmemmap allocation is fully contained * so that we always allocate vmemmap memory from altmap area.
*/ if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE)) returnfalse;
/* * start pfn should be pageblock_nr_pages aligned for correctly * setting migrate types
*/ if (!pageblock_aligned(memmap_pages)) returnfalse;
if (memmap_pages == PHYS_PFN(memory_block_size_bytes())) /* No effective hotplugged memory doesn't make sense. */ returnfalse;
/* * For memmap_on_memory, the altmaps were added on a per-memblock * basis; we have to process each individual memory block.
*/ for (cur_start = start; cur_start < start + size;
cur_start += memblock_size) { struct vmem_altmap *altmap = NULL; struct memory_block *mem;
mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(cur_start))); if (WARN_ON_ONCE(!mem)) continue;
/* * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations (triggered e.g. by sysfs). * * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
*/ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{ struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; enum memblock_flags memblock_flags = MEMBLOCK_NONE; struct memory_group *group = NULL;
u64 start, size; bool new_node = false; int ret;
start = res->start;
size = resource_size(res);
ret = check_hotplug_memory_range(start, size); if (ret) return ret;
if (mhp_flags & MHP_NID_IS_MGID) {
group = memory_group_find_by_id(nid); if (!group) return -EINVAL;
nid = group->nid;
}
if (!node_possible(nid)) {
WARN(1, "node %d was absent from the node_possible_map\n", nid); return -EINVAL;
}
mem_hotplug_begin();
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
memblock_flags = MEMBLOCK_DRIVER_MANAGED;
ret = memblock_add_node(start, size, nid, memblock_flags); if (ret) goto error_mem_hotplug_end;
}
ret = __try_online_node(nid, false); if (ret < 0) goto error;
new_node = ret;
/*
 * Self hosted memmap array
 */
if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
    mhp_supports_memmap_on_memory()) {
	ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags);
	if (ret)
		goto error;
} else {
	/* Pass the address of the local mhp_params. */
	ret = arch_add_memory(nid, start, size, &params);
	if (ret < 0)
		goto error;

	/* create memory block devices after memory was added */
	ret = create_memory_block_devices(start, size, NULL, group);
	if (ret) {
		/* Undo arch_add_memory() before taking the error path. */
		arch_remove_memory(start, size, params.altmap);
		goto error;
	}
}
if (new_node) { /* If sysfs file of new node can't be created, cpu on the node * can't be hot-added. There is no rollback way now. * So, check by BUG_ON() to catch it reluctantly.. * We online node here. We can't roll back from here.
*/
node_set_online(nid);
ret = register_one_node(nid);
BUG_ON(ret);
}
/* create new memmap entry */ if (!strcmp(res->name, "System RAM"))
firmware_map_add_hotplug(start, start + size, "System RAM");
/* device_online() will take the lock when calling online_pages() */
mem_hotplug_done();
/* * In case we're allowed to merge the resource, flag it and trigger * merging now that adding succeeded.
*/ if (mhp_flags & MHP_MERGE_RESOURCE)
merge_system_ram_resource(res);
/* online pages if requested */ if (mhp_get_default_online_type() != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
/* * Add special, driver-managed memory to the system as system RAM. Such * memory is not exposed via the raw firmware-provided memmap as system * RAM, instead, it is detected and added by a driver - during cold boot, * after a reboot, and after kexec. * * Reasons why this memory should not be used for the initial memmap of a * kexec kernel or for placing kexec images: * - The booting kernel is in charge of determining how this memory will be * used (e.g., use persistent memory as system RAM) * - Coordination with a hypervisor is required before this memory * can be used (e.g., inaccessible parts). * * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided * memory map") are created. Also, the created memory resource is flagged * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case * this memory as well (esp., not place kexec images onto it). * * The resource_name (visible via /proc/iomem) has to have the format * "System RAM ($DRIVER)".
*/ int add_memory_driver_managed(int nid, u64 start, u64 size, constchar *resource_name, mhp_t mhp_flags)
{ struct resource *res; int rc;
/*
 * Platforms should define arch_get_mappable_range() that provides
 * maximum possible addressable physical memory range for which the
 * linear mapping could be created. The platform returned address
 * range must adhere to these following semantics.
 *
 * - range.start <= range.end
 * - Range includes both end points [range.start..range.end]
 *
 * There is also a fallback definition provided here, allowing the
 * entire possible physical address range in case any platform does
 * not define arch_get_mappable_range().
 */
struct range __weak arch_get_mappable_range(void)
{
	/* Weak fallback: 0 up to the all-ones value, both ends inclusive. */
	return (struct range) {
		.start = 0UL,
		.end = -1ULL,
	};
}
struct range mhp_get_pluggable_range(bool need_mapping)
{ const u64 max_phys = DIRECT_MAP_PHYSMEM_END; struct range mhp_range;
/*
 * Check whether [start, start + size) fits into the range reported by
 * mhp_get_pluggable_range(@need_mapping).
 *
 * Returns true if the range is non-empty and fully contained; otherwise a
 * warning is logged and false is returned.
 */
bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
{
	struct range mhp_range = mhp_get_pluggable_range(need_mapping);
	u64 end = start + size;

	/*
	 * "start < end" rejects a zero size as well as a u64 wrap-around of
	 * start + size. mhp_range.end is compared against the last byte
	 * (end - 1).
	 */
	if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end)
		return true;

	pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n",
		start, end, mhp_range.start, mhp_range.end);
	return false;
}
#ifdef CONFIG_MEMORY_HOTREMOVE /* * Scan pfn range [start,end) to find movable/migratable pages (LRU and * hugetlb folio, movable_ops pages). Will skip over most unmovable * pages (esp., pages that can be skipped when offlining), but bail out on * definitely unmovable pages. * * Returns: * 0 in case a movable page is found and movable_pfn was updated. * -ENOENT in case no movable page was found. * -EBUSY in case a definitely unmovable page was found.
*/ staticint scan_movable_pages(unsignedlong start, unsignedlong end, unsignedlong *movable_pfn)
{ unsignedlong pfn;
page = pfn_to_page(pfn); if (PageLRU(page) || page_has_movable_ops(page)) goto found;
/* * PageOffline() pages that do not have movable_ops and * have a reference count > 0 (after MEM_GOING_OFFLINE) are * definitely unmovable. If their reference count would be 0, * they could at least be skipped when offlining memory.
*/ if (PageOffline(page) && page_count(page)) return -EBUSY;
if (!PageHuge(page)) continue;
folio = page_folio(page); /* * This test is racy as we hold no reference or lock. The * hugetlb page could have been free'ed and head is no longer * a hugetlb page before the following check. In such unlikely * cases false positives and negatives are possible. Calling * code must deal with these scenarios.
*/ if (folio_test_hugetlb_migratable(folio)) goto found;
pfn |= folio_nr_pages(folio) - 1;
} return -ENOENT;
found:
*movable_pfn = pfn; return 0;
}
if (unlikely(page_folio(page) != folio)) goto put_folio;
if (folio_test_large(folio))
pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
if (folio_contain_hwpoisoned_page(folio)) { /* * unmap_poisoned_folio() cannot handle large folios * in all cases yet.
*/ if (folio_test_large(folio) && !folio_test_hugetlb(folio)) goto put_folio; if (folio_test_lru(folio) && !folio_isolate_lru(folio)) goto put_folio; if (folio_mapped(folio)) {
folio_lock(folio);
unmap_poisoned_folio(folio, pfn, false);
folio_unlock(folio);
}
goto put_folio;
}
if (!isolate_folio_to_list(folio, &source)) { if (__ratelimit(&migrate_rs)) {
pr_warn("failed to isolate pfn %lx\n",
page_to_pfn(page));
dump_page(page, "isolation failed");
}
}
put_folio:
folio_put(folio);
} if (!list_empty(&source)) {
nodemask_t nmask = node_states[N_MEMORY]; struct migration_target_control mtc = {
.nmask = &nmask,
.gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
.reason = MR_MEMORY_HOTPLUG,
}; int ret;
/* * We have checked that migration range is on a single zone so * we can use the nid of the first page for all the others.
*/
mtc.nid = folio_nid(list_first_entry(&source, struct folio, lru));
/* * try to allocate from a different node but reuse this node * if there are no other online nodes to be used (e.g. we are * offlining a part of the only existing node)
*/
node_clear(mtc.nid, nmask); if (nodes_empty(nmask))
node_set(mtc.nid, nmask);
ret = migrate_pages(&source, alloc_migration_target, NULL,
(unsignedlong)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); if (ret) {
list_for_each_entry(folio, &source, lru) { if (__ratelimit(&migrate_rs)) {
pr_warn("migrating pfn %lx failed ret:%d\n",
folio_pfn(folio), ret);
dump_page(&folio->page, "migration failure");
}
}
putback_movable_pages(&source);
}
}
}
/* * Must be called with mem_hotplug_lock in write mode.
*/ int offline_pages(unsignedlong start_pfn, unsignedlong nr_pages, struct zone *zone, struct memory_group *group)
{ unsignedlong pfn, managed_pages, system_ram_pages = 0; constunsignedlong end_pfn = start_pfn + nr_pages; struct pglist_data *pgdat = zone->zone_pgdat; constint node = zone_to_nid(zone); struct memory_notify mem_arg = {
.start_pfn = start_pfn,
.nr_pages = nr_pages,
}; struct node_notify node_arg = {
.nid = NUMA_NO_NODE,
}; unsignedlong flags; char *reason; int ret;
/* * {on,off}lining is constrained to full memory sections (or more * precisely to memory blocks from the user space POV). * memmap_on_memory is an exception because it reserves initial part * of the physical memory space for vmemmaps. That space is pageblock * aligned.
*/ if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(start_pfn) ||
!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL;
/* * Don't allow to offline memory blocks that contain holes. * Consequently, memory blocks with holes can never get onlined * via the hotplug path - online_pages() - as hotplugged memory has * no holes. This way, we don't have to worry about memory holes, * don't need pfn_valid() checks, and can avoid using * walk_system_ram_range() later.
*/
walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
count_system_ram_pages_cb); if (system_ram_pages != nr_pages) {
ret = -EINVAL;
reason = "memory holes"; goto failed_removal;
}
/* * We only support offlining of memory blocks managed by a single zone, * checked by calling code. This is just a sanity check that we might * want to remove in the future.
*/ if (WARN_ON_ONCE(page_zone(pfn_to_page(start_pfn)) != zone ||
page_zone(pfn_to_page(end_pfn - 1)) != zone)) {
ret = -EINVAL;
reason = "multizone range"; goto failed_removal;
}
/* * Disable pcplists so that page isolation cannot race with freeing * in a way that pages from isolated pageblock are left on pcplists.
*/
zone_pcp_disable(zone);
lru_cache_disable();
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
PB_ISOLATE_MODE_MEM_OFFLINE); if (ret) {
reason = "failure to isolate range"; goto failed_removal_pcplists_disabled;
}
/* * Check whether the node will have no present pages after we offline * 'nr_pages' more. If so, we know that the node will become empty, and * so we will clear N_MEMORY for it.
*/ if (nr_pages >= pgdat->node_present_pages) {
node_arg.nid = node;
ret = node_notify(NODE_REMOVING_LAST_MEMORY, &node_arg);
ret = notifier_to_errno(ret); if (ret) {
reason = "node notifier failure"; goto failed_removal_isolated;
}
}
ret = memory_notify(MEM_GOING_OFFLINE, &mem_arg);
ret = notifier_to_errno(ret); if (ret) {
reason = "notifier failure"; goto failed_removal_isolated;
}
do {
pfn = start_pfn; do { /* * Historically we always checked for any signal and * can't limit it to fatal signals without eventually * breaking user space.
*/ if (signal_pending(current)) {
ret = -EINTR;
reason = "signal backoff"; goto failed_removal_isolated;
}
cond_resched();
ret = scan_movable_pages(pfn, end_pfn, &pfn); if (!ret) { /* * TODO: fatal migration failures should bail * out
*/
do_migrate_range(pfn, end_pfn);
}
} while (!ret);
/* * Dissolve free hugetlb folios in the memory block before doing * offlining actually in order to make hugetlbfs's object * counting consistent.
*/
ret = dissolve_free_hugetlb_folios(start_pfn, end_pfn); if (ret) {
reason = "failure to dissolve huge pages"; goto failed_removal_isolated;
}
ret = test_pages_isolated(start_pfn, end_pfn,
PB_ISOLATE_MODE_MEM_OFFLINE);
} while (ret);
/* Mark all sections offline and remove free pages from the buddy. */
managed_pages = __offline_isolated_pages(start_pfn, end_pfn);
pr_debug("Offlined Pages %ld\n", nr_pages);
/* * The memory sections are marked offline, and the pageblock flags * effectively stale; nobody should be touching them. Fixup the number * of isolated pageblocks, memory onlining will properly revert this.
*/
spin_lock_irqsave(&zone->lock, flags);
zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
/* reinitialise watermarks and update pcp limits */
init_per_zone_wmark_min();
/* * Make sure to mark the node as memory-less before rebuilding the zone * list. Otherwise this node would still appear in the fallback lists.
*/ if (node_arg.nid >= 0)
node_clear_state(node, N_MEMORY); if (!populated_zone(zone)) {
zone_pcp_reset(zone);
build_all_zonelists(NULL);
}
if (node_arg.nid >= 0) {
kcompactd_stop(node);
kswapd_stop(node); /* Node went memoryless. Notify consumers */
node_notify(NODE_REMOVED_LAST_MEMORY, &node_arg);
}
for_each_present_cpu(cpu) { if (cpu_to_node(cpu) == nid) /* * the cpu on this node isn't removed, and we can't * offline this node.
*/ return -EBUSY;
}
return 0;
}
/*
 * Memory block iterator callback used by try_offline_node().
 *
 * @mem: memory block currently being visited
 * @arg: pointer to the int node ID being checked
 *
 * Returns -EEXIST when the nid stored in @mem equals the requested node ID,
 * 0 otherwise.
 *
 * If a memory block belongs to multiple nodes, the stored nid is not
 * reliable. However, such blocks are always online (e.g., cannot get
 * offlined) and, therefore, are still spanned by the node.
 */
static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
{
	const int *wanted_nid = arg;

	if (mem->nid != *wanted_nid)
		return 0;

	return -EEXIST;
}
/**
 * try_offline_node
 * @nid: the node ID
 *
 * Offline a node if all memory sections and cpus of the node are removed.
 * The node is left untouched as soon as any of the checks below finds that
 * it is still in use.
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call.
 */
void try_offline_node(int nid)
{
	/*
	 * A node that still spans pages (especially ZONE_DEVICE) must stay
	 * online. A node spans memory after move_pfn_range_to_zone(), e.g.,
	 * after the memory block was onlined.
	 */
	if (node_spanned_pages(nid))
		return;

	/*
	 * Offline memory blocks in particular might not be spanned by the
	 * node; they only become spanned once onlined. They still link to
	 * the node in sysfs and can get onlined later, so any such block
	 * keeps the node alive.
	 */
	if (for_each_memory_block(&nid, check_no_memblock_for_node_cb))
		return;

	/* CPUs that still belong to this node keep it alive as well. */
	if (check_cpu_on_node(nid))
		return;

	/* No memory and no cpu left on this node: take it offline. */
	node_set_offline(nid);
	unregister_one_node(nid);
}
EXPORT_SYMBOL(try_offline_node);
if (WARN_ON_ONCE(num_memblocks != num_altmaps)) return -EINVAL;
return 1;
}
/*
 * Remove the memory range [start, start + size) if all of its memory blocks
 * are offline.
 *
 * The range is validated with check_hotplug_memory_range() (BUG on failure).
 * After the blocks have been confirmed offline, the memory block devices and
 * the arch mapping are torn down (with or without altmaps), the range is
 * dropped from memblock where CONFIG_ARCH_KEEP_MEMBLOCK is enabled, the
 * resource is released and, if a node ID was determined, the node is given a
 * chance to go offline.
 *
 * Returns 0 on success, the non-zero result of walk_memory_blocks() if the
 * offline check fails, or the negative result of
 * memory_blocks_have_altmaps().
 */
static int try_remove_memory(u64 start, u64 size)
{
	int rc, nid = NUMA_NO_NODE;

	BUG_ON(check_hotplug_memory_range(start, size));

	/*
	 * All memory blocks must be offlined before removing memory. Check
	 * whether all memory blocks in question are offline and return error
	 * if this is not the case.
	 *
	 * While at it, determine the nid. Note that if we'd have mixed nodes,
	 * we'd only try to offline the last determined one -- which is good
	 * enough for the cases we care about.
	 */
	rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb);
	if (rc)
		return rc;

	/*
	 * NOTE(review): no removal of a matching /sys/firmware/memmap entry is
	 * visible in this function -- confirm whether that step is expected
	 * here before taking the hotplug lock.
	 */

	/*
	 * Every exit path from here on calls mem_hotplug_done(), so the
	 * hotplug lock has to be taken first to keep begin/done balanced.
	 */
	mem_hotplug_begin();

	rc = memory_blocks_have_altmaps(start, size);
	if (rc < 0) {
		mem_hotplug_done();
		return rc;
	} else if (!rc) {
		/*
		 * Memory block device removal under the device_hotplug_lock is
		 * a barrier against racing online attempts.
		 * No altmaps present, do the removal directly
		 */
		remove_memory_block_devices(start, size);
		arch_remove_memory(start, size, NULL);
	} else {
		/* all memblocks in the range have altmaps */
		remove_memory_blocks_and_altmaps(start, size);
	}

	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
		memblock_remove(start, size);

	release_mem_region_adjustable(start, size);

	if (nid != NUMA_NO_NODE)
		try_offline_node(nid);

	mem_hotplug_done();
	return 0;
}
/**
 * __remove_memory - Remove memory if every memory block is offline
 * @start: physical address of the region to remove
 * @size: size of the region to remove
 *
 * Thin wrapper around try_remove_memory() for callers that cannot handle a
 * failure: any non-zero result is treated as fatal.
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call, as required by
 * try_offline_node().
 */
void __remove_memory(u64 start, u64 size)
{
	const int rc = try_remove_memory(start, size);

	/*
	 * Removal is only expected to fail when some memory was not offlined
	 * before calling this function; that is a caller bug.
	 */
	if (rc)
		BUG();
}
/* * Remove memory if every memory block is offline, otherwise return -EBUSY if * some memory is not offline
*/ int remove_memory(u64 start, u64 size)
{ int rc;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.