// SPDX-License-Identifier: GPL-2.0-only /* * mm/percpu.c - percpu memory allocator * * Copyright (C) 2009 SUSE Linux Products GmbH * Copyright (C) 2009 Tejun Heo <tj@kernel.org> * * Copyright (C) 2017 Facebook Inc. * Copyright (C) 2017 Dennis Zhou <dennis@kernel.org> * * The percpu allocator handles both static and dynamic areas. Percpu * areas are allocated in chunks which are divided into units. There is * a 1-to-1 mapping for units to possible cpus. These units are grouped * based on NUMA properties of the machine. * * c0 c1 c2 * ------------------- ------------------- ------------ * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u * ------------------- ...... ------------------- .... ------------ * * Allocation is done by offsets into a unit's address space. Ie., an * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0, * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear * and even sparse. Access is handled by configuring percpu base * registers according to the cpu to unit mappings and offsetting the * base address using pcpu_unit_size. * * There is special consideration for the first chunk which must handle * the static percpu variables in the kernel image as allocation services * are not online yet. In short, the first chunk is structured like so: * * <Static | [Reserved] | Dynamic> * * The static data is copied from the original section managed by the * linker. The reserved section, if non-zero, primarily manages static * percpu variables from kernel modules. Finally, the dynamic section * takes care of normal allocations. * * The allocator organizes chunks into lists according to free size and * memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT * flag should be passed. All memcg-aware allocations are sharing one set * of chunks and all unaccounted allocations and allocations performed * by processes belonging to the root memory cgroup are using the second set. * * The allocator tries to allocate from the fullest chunk first. Each chunk * is managed by a bitmap with metadata blocks. The allocation map is updated * on every allocation and free to reflect the current state while the boundary * map is only updated on allocation. Each metadata block contains * information to help mitigate the need to iterate over large portions * of the bitmap. The reverse mapping from page to chunk is stored in * the page's index. Lastly, units are lazily backed and grow in unison. * * There is a unique conversion that goes on here between bytes and bits. * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk * tracks the number of pages it is responsible for in nr_pages. Helper * functions are used to convert from between the bytes, bits, and blocks. * All hints are managed in bits unless explicitly stated. * * To use this allocator, arch code should do the following: * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be * different from the default * * - use pcpu_setup_first_chunk() during percpu area initialization to * setup the first chunk containing the kernel static percpu area
*/
/* * The slots are sorted by the size of the biggest continuous free area. * 1-31 bytes share the same slot.
*/ #define PCPU_SLOT_BASE_SHIFT 5 /* chunks in slots below this are subject to being sidelined on failed alloc */ #define PCPU_SLOT_FAIL_THRESHOLD 3
staticint pcpu_unit_pages __ro_after_init; staticint pcpu_unit_size __ro_after_init; staticint pcpu_nr_units __ro_after_init; staticint pcpu_atom_size __ro_after_init; int pcpu_nr_slots __ro_after_init; staticint pcpu_free_slot __ro_after_init; int pcpu_sidelined_slot __ro_after_init; int pcpu_to_depopulate_slot __ro_after_init; static size_t pcpu_chunk_struct_size __ro_after_init;
/* cpus with the lowest and highest unit addresses */ staticunsignedint pcpu_low_unit_cpu __ro_after_init; staticunsignedint pcpu_high_unit_cpu __ro_after_init;
/* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __ro_after_init;
staticconstint *pcpu_unit_map __ro_after_init; /* cpu -> unit */ constunsignedlong *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */
/* group information, used for vm allocation */ staticint pcpu_nr_groups __ro_after_init; staticconstunsignedlong *pcpu_group_offsets __ro_after_init; staticconst size_t *pcpu_group_sizes __ro_after_init;
/* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different * ways and thus often doesn't live in the vmalloc area.
*/ struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
/* * Optional reserved chunk. This chunk reserves part of the first * chunk and serves it for reserved allocations. When the reserved * region doesn't exist, the following variable is NULL.
*/ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
/* * The number of empty populated pages, protected by pcpu_lock. * The reserved chunk doesn't contribute to the count.
*/ int pcpu_nr_empty_pop_pages;
/* * The number of populated pages in use by the allocator, protected by * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets * allocated/deallocated, it is allocated/deallocated in all units of a chunk * and increments/decrements this count by 1).
*/ staticunsignedlong pcpu_nr_populated;
/* * Balance work is used to populate or destroy chunks asynchronously. We * try to keep the number of populated free pages between * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one * empty chunk.
*/ staticvoid pcpu_balance_workfn(struct work_struct *work); static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); staticbool pcpu_async_enabled __read_mostly; staticbool pcpu_atomic_alloc_failed;
staticvoid pcpu_schedule_balance_work(void)
{ if (pcpu_async_enabled)
schedule_work(&pcpu_balance_work);
}
/** * pcpu_addr_in_chunk - check if the address is served from this chunk * @chunk: chunk of interest * @addr: percpu address * * RETURNS: * True if the address is served from this chunk.
*/ staticbool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{ void *start_addr, *end_addr;
/* set the pointer to a chunk in a page struct */ staticvoid pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
page->private = (unsignedlong)pcpu;
}
/* obtain pointer to a chunk from a page struct */ staticstruct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{ return (struct pcpu_chunk *)page->private;
}
/* * The following are helper functions to help access bitmaps and convert * between bitmap offsets to address offsets.
*/ staticunsignedlong *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{ return chunk->alloc_map +
(index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
}
staticunsignedlong pcpu_off_to_block_index(int off)
{ return off / PCPU_BITMAP_BLOCK_BITS;
}
staticunsignedlong pcpu_block_off_to_off(int index, int off)
{ return index * PCPU_BITMAP_BLOCK_BITS + off;
}
/** * pcpu_check_block_hint - check against the contig hint * @block: block of interest * @bits: size of allocation * @align: alignment of area (max PAGE_SIZE) * * Check to see if the allocation can fit in the block's contig hint. * Note, a chunk uses the same hints as a block so this can also check against * the chunk's contig hint.
*/ staticbool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
size_t align)
{ int bit_off = ALIGN(block->contig_hint_start, align) -
block->contig_hint_start;
return bit_off + bits <= block->contig_hint;
}
/* * pcpu_next_hint - determine which hint to use * @block: block of interest * @alloc_bits: size of allocation * * This determines if we should scan based on the scan_hint or first_free. * In general, we want to scan from first_free to fulfill allocations by * first fit. However, if we know a scan_hint at position scan_hint_start * cannot fulfill an allocation, we can begin scanning from there knowing * the contig_hint will be our fallback.
*/ staticint pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
{ /* * The three conditions below determine if we can skip past the * scan_hint. First, does the scan hint exist. Second, is the * contig_hint after the scan_hint (possibly not true iff * contig_hint == scan_hint). Third, is the allocation request * larger than the scan_hint.
*/ if (block->scan_hint &&
block->contig_hint_start > block->scan_hint_start &&
alloc_bits > block->scan_hint) return block->scan_hint_start + block->scan_hint;
return block->first_free;
}
/** * pcpu_next_md_free_region - finds the next hint free area * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of free area * * Helper function for pcpu_for_each_md_free_region. It checks * block->contig_hint and performs aggregation across blocks to find the * next hint. It modifies bit_off and bits in-place to be consumed in the * loop.
*/ staticvoid pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, int *bits)
{ int i = pcpu_off_to_block_index(*bit_off); int block_off = pcpu_off_to_block_off(*bit_off); struct pcpu_block_md *block;
*bits = 0; for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
block++, i++) { /* handles contig area across blocks */ if (*bits) {
*bits += block->left_free; if (block->left_free == PCPU_BITMAP_BLOCK_BITS) continue; return;
}
/* * This checks three things. First is there a contig_hint to * check. Second, have we checked this hint before by * comparing the block_off. Third, is this the same as the * right contig hint. In the last case, it spills over into * the next block and should be handled by the contig area * across blocks code.
*/
*bits = block->contig_hint; if (*bits && block->contig_hint_start >= block_off &&
*bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
*bit_off = pcpu_block_off_to_off(i,
block->contig_hint_start); return;
} /* reset to satisfy the second predicate above */
block_off = 0;
/** * pcpu_next_fit_region - finds fit areas for a given allocation request * @chunk: chunk of interest * @alloc_bits: size of allocation * @align: alignment of area (max PAGE_SIZE) * @bit_off: chunk offset * @bits: size of free area * * Finds the next free region that is viable for use with a given size and * alignment. This only returns if there is a valid area to be used for this * allocation. block->first_free is returned if the allocation request fits * within the block to see if the request can be fulfilled prior to the contig * hint.
*/ staticvoid pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, int align, int *bit_off, int *bits)
{ int i = pcpu_off_to_block_index(*bit_off); int block_off = pcpu_off_to_block_off(*bit_off); struct pcpu_block_md *block;
*bits = 0; for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
block++, i++) { /* handles contig area across blocks */ if (*bits) {
*bits += block->left_free; if (*bits >= alloc_bits) return; if (block->left_free == PCPU_BITMAP_BLOCK_BITS) continue;
}
/* check block->contig_hint */
*bits = ALIGN(block->contig_hint_start, align) -
block->contig_hint_start; /* * This uses the block offset to determine if this has been * checked in the prior iteration.
*/ if (block->contig_hint &&
block->contig_hint_start >= block_off &&
block->contig_hint >= *bits + alloc_bits) { int start = pcpu_next_hint(block, alloc_bits);
*bits += alloc_bits + block->contig_hint_start -
start;
*bit_off = pcpu_block_off_to_off(i, start); return;
} /* reset to satisfy the second predicate above */
block_off = 0;
/* no valid offsets were found - fail condition */
*bit_off = pcpu_chunk_map_bits(chunk);
}
/* * Metadata free area iterators. These perform aggregation of free areas * based on the metadata blocks and return the offset @bit_off and size in * bits of the free area @bits. pcpu_for_each_fit_region only returns when * a fit is found for the allocation request.
*/ #define pcpu_for_each_md_free_region(chunk, bit_off, bits) \ for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \
(bit_off) < pcpu_chunk_map_bits((chunk)); \
(bit_off) += (bits) + 1, \
pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
/** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate * @gfp: allocation flags * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, * kzalloc() is used; otherwise, the equivalent of vzalloc() is used. * This is to facilitate passing through whitelisted flags. The * returned memory is always zeroed. * * RETURNS: * Pointer to the allocated area on success, NULL on failure.
*/ staticvoid *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{ if (WARN_ON_ONCE(!slab_is_available())) return NULL;
/** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on * * This function is called after an allocation or free changed @chunk. * New slot according to the changed state is determined and @chunk is * moved to the slot. Note that the reserved chunk is never put on * chunk slots. * * CONTEXT: * pcpu_lock.
*/ staticvoid pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{ int nslot = pcpu_chunk_slot(chunk);
/* leave isolated chunks in-place */ if (chunk->isolated) return;
if (oslot != nslot)
__pcpu_chunk_move(chunk, nslot, oslot < nslot);
}
/* * pcpu_update_empty_pages - update empty page counters * @chunk: chunk of interest * @nr: nr of empty pages * * This is used to keep track of the empty pages now based on the premise * a md_block covers a page. The hint update functions recognize if a block * is made full or broken to calculate deltas for keeping track of free pages.
*/ staticinlinevoid pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
chunk->nr_empty_pop_pages += nr; if (chunk != pcpu_reserved_chunk && !chunk->isolated)
pcpu_nr_empty_pop_pages += nr;
}
/* * pcpu_region_overlap - determines if two regions overlap * @a: start of first region, inclusive * @b: end of first region, exclusive * @x: start of second region, inclusive * @y: end of second region, exclusive * * This is used to determine if the hint region [a, b) overlaps with the * allocated region [x, y).
*/ staticinlinebool pcpu_region_overlap(int a, int b, int x, int y)
{ return (a < y) && (x < b);
}
/** * pcpu_block_update - updates a block given a free area * @block: block of interest * @start: start offset in block * @end: end offset in block * * Updates a block given a known free area. The region [start, end) is * expected to be the entirety of the free area within a block. Chooses * the best starting offset if the contig hints are equal.
*/ staticvoid pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{ int contig = end - start;
if (end == block->nr_bits)
block->right_free = contig;
if (contig > block->contig_hint) { /* promote the old contig_hint to be the new scan_hint */ if (start > block->contig_hint_start) { if (block->contig_hint > block->scan_hint) {
block->scan_hint_start =
block->contig_hint_start;
block->scan_hint = block->contig_hint;
} elseif (start < block->scan_hint_start) { /* * The old contig_hint == scan_hint. But, the * new contig is larger so hold the invariant * scan_hint_start < contig_hint_start.
*/
block->scan_hint = 0;
}
} else {
block->scan_hint = 0;
}
block->contig_hint_start = start;
block->contig_hint = contig;
} elseif (contig == block->contig_hint) { if (block->contig_hint_start &&
(!start ||
__ffs(start) > __ffs(block->contig_hint_start))) { /* start has a better alignment so use it */
block->contig_hint_start = start; if (start < block->scan_hint_start &&
block->contig_hint > block->scan_hint)
block->scan_hint = 0;
} elseif (start > block->scan_hint_start ||
block->contig_hint > block->scan_hint) { /* * Knowing contig == contig_hint, update the scan_hint * if it is farther than or larger than the current * scan_hint.
*/
block->scan_hint_start = start;
block->scan_hint = contig;
}
} else { /* * The region is smaller than the contig_hint. So only update * the scan_hint if it is larger than or equal and farther than * the current scan_hint.
*/ if ((start < block->contig_hint_start &&
(contig > block->scan_hint ||
(contig == block->scan_hint &&
start > block->scan_hint_start)))) {
block->scan_hint_start = start;
block->scan_hint = contig;
}
}
}
/* * pcpu_block_update_scan - update a block given a free area from a scan * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of free area * * Finding the final allocation spot first goes through pcpu_find_block_fit() * to find a block that can hold the allocation and then pcpu_alloc_area() * where a scan is used. When allocations require specific alignments, * we can inadvertently create holes which will not be seen in the alloc * or free paths. * * This takes a given free area hole and updates a block as it may change the * scan_hint. We need to scan backwards to ensure we don't miss free bits * from alignment.
*/ staticvoid pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off, int bits)
{ int s_off = pcpu_off_to_block_off(bit_off); int e_off = s_off + bits; int s_index, l_bit; struct pcpu_block_md *block;
/* scan backwards in case of alignment skipping free bits */
l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
s_off = (s_off == l_bit) ? 0 : l_bit + 1;
pcpu_block_update(block, s_off, e_off);
}
/** * pcpu_chunk_refresh_hint - updates metadata about a chunk * @chunk: chunk of interest * @full_scan: if we should scan from the beginning * * Iterates over the metadata blocks to find the largest contig area. * A full scan can be avoided on the allocation path as this is triggered * if we broke the contig_hint. In doing so, the scan_hint will be before * the contig_hint or after if the scan_hint == contig_hint. This cannot * be prevented on freeing as we want to find the largest area possibly * spanning blocks.
*/ staticvoid pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
{ struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits;
/** * pcpu_block_refresh_hint * @chunk: chunk of interest * @index: index of the metadata block * * Scans over the block beginning at first_free and updates the block * metadata accordingly.
*/ staticvoid pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{ struct pcpu_block_md *block = chunk->md_blocks + index; unsignedlong *alloc_map = pcpu_index_alloc_map(chunk, index); unsignedint start, end; /* region start, region end */
/* iterate over free areas and update the contig hints */
for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
pcpu_block_update(block, start, end);
}
/** * pcpu_block_update_hint_alloc - update hint on allocation path * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request * * Updates metadata for the allocation path. The metadata only has to be * refreshed by a full scan iff the chunk's contig hint is broken. Block level * scans are required if the block's contig hint is broken.
*/ staticvoid pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, int bits)
{ struct pcpu_block_md *chunk_md = &chunk->chunk_md; int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */
/* * Calculate per block offsets. * The calculation uses an inclusive range, but the resulting offsets * are [start, end). e_index always points to the last block in the * range.
*/
s_index = pcpu_off_to_block_index(bit_off);
e_index = pcpu_off_to_block_index(bit_off + bits - 1);
s_off = pcpu_off_to_block_off(bit_off);
e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
/* * Update s_block.
*/ if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
/* * block->first_free must be updated if the allocation takes its place. * If the allocation breaks the contig_hint, a scan is required to * restore this hint.
*/ if (s_off == s_block->first_free)
s_block->first_free = find_next_zero_bit(
pcpu_index_alloc_map(chunk, s_index),
PCPU_BITMAP_BLOCK_BITS,
s_off + bits);
if (pcpu_region_overlap(s_block->contig_hint_start,
s_block->contig_hint_start +
s_block->contig_hint,
s_off,
s_off + bits)) { /* block contig hint is broken - scan to fix it */ if (!s_off)
s_block->left_free = 0;
pcpu_block_refresh_hint(chunk, s_index);
} else { /* update left and right contig manually */
s_block->left_free = min(s_block->left_free, s_off); if (s_index == e_index)
s_block->right_free = min_t(int, s_block->right_free,
PCPU_BITMAP_BLOCK_BITS - e_off); else
s_block->right_free = 0;
}
/* * Update e_block.
*/ if (s_index != e_index) { if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
/* * When the allocation is across blocks, the end is along * the left part of the e_block.
*/
e_block->first_free = find_next_zero_bit(
pcpu_index_alloc_map(chunk, e_index),
PCPU_BITMAP_BLOCK_BITS, e_off);
if (e_off == PCPU_BITMAP_BLOCK_BITS) { /* reset the block */
e_block++;
} else { if (e_off > e_block->scan_hint_start)
e_block->scan_hint = 0;
e_block->left_free = 0; if (e_off > e_block->contig_hint_start) { /* contig hint is broken - scan to fix it */
pcpu_block_refresh_hint(chunk, e_index);
} else {
e_block->right_free =
min_t(int, e_block->right_free,
PCPU_BITMAP_BLOCK_BITS - e_off);
}
}
/* * If the allocation is not atomic, some blocks may not be * populated with pages, while we account it here. The number * of pages will be added back with pcpu_chunk_populated() * when populating pages.
*/ if (nr_empty_pages)
pcpu_update_empty_pages(chunk, -nr_empty_pages);
/* * The only time a full chunk scan is required is if the chunk * contig hint is broken. Otherwise, it means a smaller space * was used and therefore the chunk contig hint is still correct.
*/ if (pcpu_region_overlap(chunk_md->contig_hint_start,
chunk_md->contig_hint_start +
chunk_md->contig_hint,
bit_off,
bit_off + bits))
pcpu_chunk_refresh_hint(chunk, false);
}
/** * pcpu_block_update_hint_free - updates the block hints on the free path * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request * * Updates metadata for the allocation path. This avoids a blind block * refresh by making use of the block contig hints. If this fails, it scans * forward and backward to determine the extent of the free area. This is * capped at the boundary of blocks. * * A chunk update is triggered if a page becomes free, a block becomes free, * or the free spans across blocks. This tradeoff is to minimize iterating * over the block metadata to update chunk_md->contig_hint. * chunk_md->contig_hint may be off by up to a page, but it will never be more * than the available space. If the contig hint is contained in one block, it * will be accurate.
*/ staticvoid pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, int bits)
{ int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ int start, end; /* start and end of the whole free area */
/* * Calculate per block offsets. * The calculation uses an inclusive range, but the resulting offsets * are [start, end). e_index always points to the last block in the * range.
*/
s_index = pcpu_off_to_block_index(bit_off);
e_index = pcpu_off_to_block_index(bit_off + bits - 1);
s_off = pcpu_off_to_block_off(bit_off);
e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
/* * Check if the freed area aligns with the block->contig_hint. * If it does, then the scan to find the beginning/end of the * larger free area can be avoided. * * start and end refer to beginning and end of the free area * within each their respective blocks. This is not necessarily * the entire free area as it may span blocks past the beginning * or end of the block.
*/
start = s_off; if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
start = s_block->contig_hint_start;
} else { /* * Scan backwards to find the extent of the free area. * find_last_bit returns the starting bit, so if the start bit * is returned, that means there was no last bit and the * remainder of the chunk is free.
*/ int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
start);
start = (start == l_bit) ? 0 : l_bit + 1;
}
end = e_off; if (e_off == e_block->contig_hint_start)
end = e_block->contig_hint_start + e_block->contig_hint; else
end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
PCPU_BITMAP_BLOCK_BITS, end);
/* freeing in the same block */ if (s_index != e_index) { /* update e_block */ if (end == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
pcpu_block_update(e_block, 0, end);
if (nr_empty_pages)
pcpu_update_empty_pages(chunk, nr_empty_pages);
/* * Refresh chunk metadata when the free makes a block free or spans * across blocks. The contig_hint may be off by up to a page, but if * the contig_hint is contained in a block, it will be accurate with * the else condition below.
*/ if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
pcpu_chunk_refresh_hint(chunk, true); else
pcpu_block_update(&chunk->chunk_md,
pcpu_block_off_to_off(s_index, start),
end);
}
/** * pcpu_is_populated - determines if the region is populated * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of area * @next_off: return value for the next offset to start searching * * For atomic allocations, check if the backing pages are populated. * * RETURNS: * Bool if the backing pages are populated. * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
*/ staticbool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off)
{ unsignedint start, end;
start = find_next_zero_bit(chunk->populated, end, start); if (start >= end) returntrue;
end = find_next_bit(chunk->populated, end, start + 1);
*next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; returnfalse;
}
/** * pcpu_find_block_fit - finds the block index to start searching * @chunk: chunk of interest * @alloc_bits: size of request in allocation units * @align: alignment of area (max PAGE_SIZE bytes) * @pop_only: use populated regions only * * Given a chunk and an allocation spec, find the offset to begin searching * for a free region. This iterates over the bitmap metadata blocks to * find an offset that will be guaranteed to fit the requirements. It is * not quite first fit as if the allocation does not fit in the contig hint * of a block or chunk, it is skipped. This errs on the side of caution * to prevent excess iteration. Poor alignment can cause the allocator to * skip over blocks and chunks that have valid free areas. * * RETURNS: * The offset in the bitmap to begin searching. * -1 if no offset is found.
*/ staticint pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
size_t align, bool pop_only)
{ struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, next_off;
/* * This is an optimization to prevent scanning by assuming if the * allocation cannot fit in the global hint, there is memory pressure * and creating a new chunk would happen soon.
*/ if (!pcpu_check_block_hint(chunk_md, alloc_bits, align)) return -1;
if (bit_off == pcpu_chunk_map_bits(chunk)) return -1;
return bit_off;
}
/* * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off() * @map: the address to base the search on * @size: the bitmap size in bits * @start: the bitnumber to start searching at * @nr: the number of zeroed bits we're looking for * @align_mask: alignment mask for zero area * @largest_off: offset of the largest area skipped * @largest_bits: size of the largest area skipped * * The @align_mask should be one less than a power of 2. * * This is a modified version of bitmap_find_next_zero_area_off() to remember * the largest area that was skipped. This is imperfect, but in general is * good enough. The largest remembered region is the largest failed region * seen. This does not include anything we possibly skipped due to alignment. * pcpu_block_update_scan() does scan backwards to try and recover what was * lost to alignment. While this can cause scanning to miss earlier possible * free areas, smaller allocations will eventually fill those holes.
*/ staticunsignedlong pcpu_find_zero_area(unsignedlong *map, unsignedlong size, unsignedlong start, unsignedlong nr, unsignedlong align_mask, unsignedlong *largest_off, unsignedlong *largest_bits)
{ unsignedlong index, end, i, area_off, area_bits;
again:
index = find_next_zero_bit(map, size, start);
end = index + nr; if (end > size) return end;
i = find_next_bit(map, end, index); if (i < end) {
area_bits = i - area_off; /* remember largest unused area with best alignment */ if (area_bits > *largest_bits ||
(area_bits == *largest_bits && *largest_off &&
(!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
*largest_off = area_off;
*largest_bits = area_bits;
}
start = i + 1; goto again;
} return index;
}
/** * pcpu_alloc_area - allocates an area from a pcpu_chunk * @chunk: chunk of interest * @alloc_bits: size of request in allocation units * @align: alignment of area (max PAGE_SIZE) * @start: bit_off to start searching * * This function takes in a @start offset to begin searching to fit an * allocation of @alloc_bits with alignment @align. It needs to scan * the allocation map because if it fits within the block's contig hint, * @start will be block->first_free. This is an attempt to fill the * allocation prior to breaking the contig hint. The allocation and * boundary maps are updated accordingly if it confirms a valid * free area. * * RETURNS: * Allocated addr offset in @chunk on success. * -1 if no matching area is found.
*/ staticint pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
size_t align, int start)
{ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
size_t align_mask = (align) ? (align - 1) : 0; unsignedlong area_off = 0, area_bits = 0; int bit_off, end, oslot;
lockdep_assert_held(&pcpu_lock);
oslot = pcpu_chunk_slot(chunk);
/* * Search to find a fit.
*/
end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
pcpu_chunk_map_bits(chunk));
bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
align_mask, &area_off, &area_bits); if (bit_off >= end) return -1;
if (area_bits)
pcpu_block_update_scan(chunk, area_off, area_bits);
/** * pcpu_free_area - frees the corresponding offset * @chunk: chunk of interest * @off: addr offset into chunk * * This function determines the size of an allocation to free using * the boundary bitmap and clears the allocation map. * * RETURNS: * Number of freed bytes.
*/ staticint pcpu_free_area(struct pcpu_chunk *chunk, int off)
{ struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, end, oslot, freed;
/* find end index */
end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
bit_off + 1);
bits = end - bit_off;
bitmap_clear(chunk->alloc_map, bit_off, bits);
freed = bits * PCPU_MIN_ALLOC_SIZE;
/* update metadata */
chunk->free_bytes += freed;
/* update first free bit */
chunk_md->first_free = min(chunk_md->first_free, bit_off);
/** * pcpu_alloc_first_chunk - creates chunks that serve the first chunk * @tmp_addr: the start of the region served * @map_size: size of the region served * * This is responsible for creating the chunks that serve the first chunk. The * base_addr is page aligned down of @tmp_addr while the region end is page * aligned up. Offsets are kept track of to determine the region served. All * this is done to appease the bitmap allocator in avoiding partial blocks. * * RETURNS: * Chunk serving the region at @tmp_addr of @map_size.
*/ staticstruct pcpu_chunk * __init pcpu_alloc_first_chunk(unsignedlong tmp_addr, int map_size)
{ struct pcpu_chunk *chunk; unsignedlong aligned_addr; int start_offset, offset_bits, region_size, region_bits;
size_t alloc_size;
/* region calculations */
aligned_addr = tmp_addr & PAGE_MASK;
/** * pcpu_chunk_populated - post-population bookkeeping * @chunk: pcpu_chunk which got populated * @page_start: the start page * @page_end: the end page * * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each * successful population.
*/ staticvoid pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, int page_end)
{ int nr = page_end - page_start;
/** * pcpu_chunk_depopulated - post-depopulation bookkeeping * @chunk: pcpu_chunk which got depopulated * @page_start: the start page * @page_end: the end page * * Pages in [@page_start,@page_end) have been depopulated from @chunk. * Update the bookkeeping information accordingly. Must be called after * each successful depopulation.
*/ staticvoid pcpu_chunk_depopulated(struct pcpu_chunk *chunk, int page_start, int page_end)
{ int nr = page_end - page_start;
/* * Chunk management implementation. * * To allow different implementations, chunk alloc/free and * [de]population are implemented in a separate file which is pulled * into this file and compiled together. The following functions * should be implemented. * * pcpu_populate_chunk - populate the specified range of a chunk * pcpu_depopulate_chunk - depopulate the specified range of a chunk * pcpu_post_unmap_tlb_flush - flush tlb for the specified range of a chunk * pcpu_create_chunk - create a new chunk * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop * pcpu_addr_to_page - translate address to physical address * pcpu_verify_alloc_info - check alloc_info is acceptable during init
*/ staticint pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end, gfp_t gfp); staticvoid pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end); staticvoid pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end); staticstruct pcpu_chunk *pcpu_create_chunk(gfp_t gfp); staticvoid pcpu_destroy_chunk(struct pcpu_chunk *chunk); staticstruct page *pcpu_addr_to_page(void *addr); staticint __init pcpu_verify_alloc_info(conststruct pcpu_alloc_info *ai);
/** * pcpu_chunk_addr_search - determine chunk containing specified address * @addr: address for which the chunk needs to be determined. * * This is an internal function that handles all but static allocations. * Static percpu address values should never be passed into the allocator. * * RETURNS: * The address of the found chunk.
*/ staticstruct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{ /* is it in the dynamic region (first chunk)? */ if (pcpu_addr_in_chunk(pcpu_first_chunk, addr)) return pcpu_first_chunk;
/* is it in the reserved region? */ if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr)) return pcpu_reserved_chunk;
/* * The address is relative to unit0 which might be unused and * thus unmapped. Offset the address to the unit space of the * current processor before looking it up in the vmalloc * space. Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug.
*/
addr += pcpu_unit_offsets[raw_smp_processor_id()]; return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}
/** * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available * @gfp: allocation flags * * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN * then no warning will be triggered on invalid or failed allocation * requests. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure.
*/ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
gfp_t gfp)
{
gfp_t pcpu_gfp; bool is_atomic; bool do_warn; struct obj_cgroup *objcg = NULL; static atomic_t warn_limit = ATOMIC_INIT(10); struct pcpu_chunk *chunk, *next; constchar *err; int slot, off, cpu, ret; unsignedlong flags; void __percpu *ptr;
size_t bits, bit_align;
gfp = current_gfp_context(gfp); /* whitelisted flags that can be passed to the backing allocators */
pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
is_atomic = !gfpflags_allow_blocking(gfp);
do_warn = !(gfp & __GFP_NOWARN);
/* * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, * therefore alignment must be a minimum of that many bytes. * An allocation may have internal fragmentation from rounding up * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
*/ if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
align = PCPU_MIN_ALLOC_SIZE;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
!is_power_of_2(align))) {
WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
size, align); return NULL;
}
if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg))) return NULL;
if (!is_atomic) { /* * pcpu_balance_workfn() allocates memory under this mutex, * and it may wait for memory reclaim. Allow current task * to become OOM victim, in case of memory pressure.
*/ if (gfp & __GFP_NOFAIL) {
mutex_lock(&pcpu_alloc_mutex);
} elseif (mutex_lock_killable(&pcpu_alloc_mutex)) {
pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); return NULL;
}
}
spin_lock_irqsave(&pcpu_lock, flags);
/* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk;
off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); if (off < 0) {
err = "alloc from reserved chunk failed"; goto fail_unlock;
}
off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found;
err = "alloc from reserved chunk failed"; goto fail_unlock;
}
restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
list) {
off = pcpu_find_block_fit(chunk, bits, bit_align,
is_atomic); if (off < 0) { if (slot < PCPU_SLOT_FAIL_THRESHOLD)
pcpu_chunk_move(chunk, 0); continue;
}
off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) {
pcpu_reintegrate_chunk(chunk); goto area_found;
}
}
}
spin_unlock_irqrestore(&pcpu_lock, flags);
if (is_atomic) {
err = "atomic alloc failed, no space left"; goto fail;
}
/* No space left. Create a new chunk. */ if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
chunk = pcpu_create_chunk(pcpu_gfp); if (!chunk) {
err = "failed to allocate new chunk"; goto fail;
}
/* clear the areas and return address relative to base address */
for_each_possible_cpu(cpu)
memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
if (do_warn) { int remaining = atomic_dec_if_positive(&warn_limit);
if (remaining >= 0) {
pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
size, align, is_atomic, err); if (!is_atomic)
dump_stack(); if (remaining == 0)
pr_info("limit reached, disable warning\n");
}
}
if (is_atomic) { /* see the flag handling in pcpu_balance_workfn() */
pcpu_atomic_alloc_failed = true;
pcpu_schedule_balance_work();
} else {
mutex_unlock(&pcpu_alloc_mutex);
}
/** * pcpu_balance_free - manage the amount of free chunks * @empty_only: free chunks only if there are no populated pages * * If empty_only is %false, reclaim all fully free chunks regardless of the * number of populated pages. Otherwise, only reclaim chunks that have no * populated pages. * * CONTEXT: * pcpu_lock (can be dropped temporarily)
*/ staticvoid pcpu_balance_free(bool empty_only)
{
LIST_HEAD(to_free); struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot]; struct pcpu_chunk *chunk, *next;
lockdep_assert_held(&pcpu_lock);
/* * There's no reason to keep around multiple unused chunks and VM * areas can be scarce. Destroy all free chunks except for one.
*/
list_for_each_entry_safe(chunk, next, free_head, list) {
WARN_ON(chunk->immutable);
/* spare the first one */ if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue;
if (!empty_only || chunk->nr_empty_pop_pages == 0)
list_move(&chunk->list, &to_free);
}
/** * pcpu_balance_populated - manage the amount of populated pages * * Maintain a certain amount of populated pages to satisfy atomic allocations. * It is possible that this is called when physical memory is scarce causing * OOM killer to be triggered. We should avoid doing so until an actual * allocation causes the failure as it is possible that requests can be * serviced from already backed regions. * * CONTEXT: * pcpu_lock (can be dropped temporarily)
*/ staticvoid pcpu_balance_populated(void)
{ /* gfp flags passed to underlying allocators */ const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; struct pcpu_chunk *chunk; int slot, nr_to_pop, ret;
lockdep_assert_held(&pcpu_lock);
/* * Ensure there are certain number of free populated pages for * atomic allocs. Fill up from the most packed so that atomic * allocs don't increase fragmentation. If atomic allocation * failed previously, always populate the maximum amount. This * should prevent atomic allocs larger than PAGE_SIZE from keeping * failing indefinitely; however, large atomic allocs are not * something we support properly and can be highly unreliable and * inefficient.
*/
retry_pop: if (pcpu_atomic_alloc_failed) {
nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; /* best effort anyway, don't worry about synchronization */
pcpu_atomic_alloc_failed = false;
} else {
nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
pcpu_nr_empty_pop_pages,
0, PCPU_EMPTY_POP_PAGES_HIGH);
}
/* @chunk can't go away while pcpu_alloc_mutex is held */
for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) { int nr = min_t(int, re - rs, nr_to_pop);
if (nr_to_pop) { /* ran out of chunks to populate, create a new one and retry */
spin_unlock_irq(&pcpu_lock);
chunk = pcpu_create_chunk(gfp);
cond_resched();
spin_lock_irq(&pcpu_lock); if (chunk) {
pcpu_chunk_relocate(chunk, -1); goto retry_pop;
}
}
}
/** * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages * * Scan over chunks in the depopulate list and try to release unused populated * pages back to the system. Depopulated chunks are sidelined to prevent * repopulating these pages unless required. Fully free chunks are reintegrated * and freed accordingly (1 is kept around). If we drop below the empty * populated pages threshold, reintegrate the chunk if it has empty free pages. * Each chunk is scanned in the reverse order to keep populated pages close to * the beginning of the chunk. * * CONTEXT: * pcpu_lock (can be dropped temporarily) *
*/ staticvoid pcpu_reclaim_populated(void)
{ struct pcpu_chunk *chunk; struct pcpu_block_md *block; int freed_page_start, freed_page_end; int i, end; bool reintegrate;
lockdep_assert_held(&pcpu_lock);
/* * Once a chunk is isolated to the to_depopulate list, the chunk is no * longer discoverable to allocations whom may populate pages. The only * other accessor is the free path which only returns area back to the * allocator not touching the populated bitmap.
*/ while ((chunk = list_first_entry_or_null(
&pcpu_chunk_lists[pcpu_to_depopulate_slot], struct pcpu_chunk, list))) {
WARN_ON(chunk->immutable);
/* * Scan chunk's pages in the reverse order to keep populated * pages close to the beginning of the chunk.
*/
freed_page_start = chunk->nr_pages;
freed_page_end = 0;
reintegrate = false; for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) { /* no more work to do */ if (chunk->nr_empty_pop_pages == 0) break;
/* reintegrate chunk to prevent atomic alloc failures */ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
reintegrate = true; break;
}
/* * If the page is empty and populated, start or * extend the (i, end) range. If i == 0, decrease * i and perform the depopulation to cover the last * (first) page in the chunk.
*/
block = chunk->md_blocks + i; if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
test_bit(i, chunk->populated)) { if (end == -1)
end = i; if (i > 0) continue;
i--;
}
/* depopulate if there is an active range */ if (end == -1) continue;
spin_unlock_irq(&pcpu_lock);
pcpu_depopulate_chunk(chunk, i + 1, end + 1);
cond_resched();
spin_lock_irq(&pcpu_lock);
pcpu_chunk_depopulated(chunk, i + 1, end + 1);
freed_page_start = min(freed_page_start, i + 1);
freed_page_end = max(freed_page_end, end + 1);
/* reset the range and continue */
end = -1;
}
/* batch tlb flush per chunk to amortize cost */ if (freed_page_start < freed_page_end) {
spin_unlock_irq(&pcpu_lock);
pcpu_post_unmap_tlb_flush(chunk,
freed_page_start,
freed_page_end);
cond_resched();
spin_lock_irq(&pcpu_lock);
}
/** * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * For each chunk type, manage the number of fully free chunks and the number of * populated pages. An important thing to consider is when pages are freed and * how they contribute to the global counts.
*/ staticvoid pcpu_balance_workfn(struct work_struct *work)
{ /* * pcpu_balance_free() is called twice because the first time we may * trim pages in the active pcpu_nr_empty_pop_pages which may cause us * to grow other chunks. This then gives pcpu_reclaim_populated() time * to move fully free chunks to the active list to be freed if * appropriate. * * Enforce GFP_NOIO allocations because we have pcpu_alloc users * constrained to GFP_NOIO/NOFS contexts and they could form lock * dependency through pcpu_alloc_mutex
*/ unsignedint flags = memalloc_noio_save();
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
/** * free_percpu - free percpu area * @ptr: pointer to area to free * * Free percpu area @ptr. * * CONTEXT: * Can be called from atomic context.
*/ void free_percpu(void __percpu *ptr)
{ void *addr; struct pcpu_chunk *chunk; unsignedlong flags; int size, off; bool need_balance = false;
/* * If there are more than one fully free chunks, wake up grim reaper. * If the chunk is isolated, it may be in the process of being * reclaimed. Let reclaim manage cleaning up of that chunk.
*/ if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) { struct pcpu_chunk *pos;
if (va >= start && va < start + static_size) { if (can_addr) {
*can_addr = (unsignedlong) (va - start);
*can_addr += (unsignedlong)
per_cpu_ptr(base, get_boot_cpu_id());
} returntrue;
}
} #endif /* on UP, can't distinguish from other static vars, always false */ returnfalse;
}
/** * is_kernel_percpu_address - test whether address is from static percpu area * @addr: address to test * * Test whether @addr belongs to in-kernel static percpu area. Module * static percpu areas are not considered. For those, use * is_module_percpu_address(). * * RETURNS: * %true if @addr is from in-kernel static percpu area, %false otherwise.
*/ bool is_kernel_percpu_address(unsignedlong addr)
{ return __is_kernel_percpu_address(addr, NULL);
}
/** * per_cpu_ptr_to_phys - convert translated percpu address to physical address * @addr: the address to be converted to physical address * * Given @addr which is dereferenceable address obtained via one of * percpu access macros, this function translates it into its physical * address. The caller is responsible for ensuring @addr stays valid * until this function finishes. * * percpu allocator has special setup for the first chunk, which currently * supports either embedding in linear address space or vmalloc mapping, * and, from the second one, the backing allocator (currently either vm or * km) provides translation. * * The addr can be translated simply without checking if it falls into the * first chunk. But the current code reflects better how percpu allocator * actually works, and the verification can discover both bugs in percpu * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current * code. * * RETURNS: * The physical address for @addr.
*/
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{ void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); bool in_first_chunk = false; unsignedlong first_low, first_high; unsignedint cpu;
/* * The following test on unit_low/high isn't strictly * necessary but will speed up lookups of addresses which * aren't in the first chunk. * * The address check is against full chunk sizes. pcpu_base_addr * points to the beginning of the first chunk including the * static region. Assumes good intent as the first chunk may * not be full (ie. < pcpu_unit_pages in size).
*/
first_low = (unsignedlong)pcpu_base_addr +
pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
first_high = (unsignedlong)pcpu_base_addr +
pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages); if ((unsignedlong)addr >= first_low &&
(unsignedlong)addr < first_high) {
for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu);
if (in_first_chunk) { if (!is_vmalloc_addr(addr)) return __pa(addr); else return page_to_phys(vmalloc_to_page(addr)) +
offset_in_page(addr);
} else return page_to_phys(pcpu_addr_to_page(addr)) +
offset_in_page(addr);
}
/** * pcpu_alloc_alloc_info - allocate percpu allocation info * @nr_groups: the number of groups * @nr_units: the number of units * * Allocate ai which is large enough for @nr_groups groups containing * @nr_units units. The returned ai's groups[0].cpu_map points to the * cpu_map array which is long enough for @nr_units and filled with * NR_CPUS. It's the caller's responsibility to initialize cpu_map * pointer of other groups. * * RETURNS: * Pointer to the allocated pcpu_alloc_info on success, NULL on * failure.
*/ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, int nr_units)
{ struct pcpu_alloc_info *ai;
size_t base_size, ai_size; void *ptr; int unit;
/** * pcpu_free_alloc_info - free percpu allocation info * @ai: pcpu_alloc_info to free * * Free @ai which was allocated by pcpu_alloc_alloc_info().
*/ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
memblock_free(ai, ai->__ai_size);
}
/** * pcpu_dump_alloc_info - print out information about pcpu_alloc_info * @lvl: loglevel * @ai: allocation info to dump * * Print out information about @ai using loglevel @lvl.
*/ staticvoid pcpu_dump_alloc_info(constchar *lvl, conststruct pcpu_alloc_info *ai)
{ int group_width = 1, cpu_width = 1, width; char empty_str[] = "--------"; int alloc = 0, alloc_end = 0; int group, v; int upa, apl; /* units per alloc, allocs per line */
v = ai->nr_groups; while (v /= 10)
group_width++;
v = num_possible_cpus(); while (v /= 10)
cpu_width++;
empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
for (unit_end += upa; unit < unit_end; unit++) if (gi->cpu_map[unit] != NR_CPUS)
pr_cont("%0*d ",
cpu_width, gi->cpu_map[unit]); else
pr_cont("%s ", empty_str);
}
}
pr_cont("\n");
}
/** * pcpu_setup_first_chunk - initialize the first percpu chunk * @ai: pcpu_alloc_info describing how to percpu area is shaped * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * percpu area. This function is to be called from arch percpu area * setup path. * * @ai contains all information necessary to initialize the first * chunk and prime the dynamic percpu allocator. * * @ai->static_size is the size of static percpu area. * * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu * static areas on architectures where the addressing model has * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * * @ai->dyn_size determines the number of bytes available for dynamic * allocation in the first chunk. The area between @ai->static_size + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. * * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE * and equal to or larger than @ai->static_size + @ai->reserved_size + * @ai->dyn_size. * * @ai->atom_size is the allocation atom size and used as alignment * for vm areas. * * @ai->alloc_size is the allocation size and always multiple of * @ai->atom_size. This is larger than @ai->atom_size if * @ai->unit_size is larger than @ai->atom_size. * * @ai->nr_groups and @ai->groups describe virtual memory layout of * percpu areas. Units which should be colocated are put into the * same group. Dynamic VM areas will be allocated according to these * groupings. If @ai->nr_groups is zero, a single group containing * all units is assumed. * * The caller should have mapped the first chunk at @base_addr and * copied static data to each unit. * * The first chunk will always contain a static and a dynamic region. * However, the static region is not managed by any chunk. If the first * chunk also contains a reserved region, it is served by two chunks - * one for the reserved region and one for the dynamic region. They * share the same vm, but use offset regions in the area allocation map. * The chunk serving the dynamic region is circulated in the chunk slots * and available for dynamic allocation like any other chunk.
*/ void __init pcpu_setup_first_chunk(conststruct pcpu_alloc_info *ai, void *base_addr)
{
size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
size_t static_size, dyn_size; unsignedlong *group_offsets;
size_t *group_sizes; unsignedlong *unit_off; unsignedint cpu; int *unit_map; int group, unit, i; unsignedlong tmp_addr;
size_t alloc_size;
#define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \
pr_emerg("failed to initialize, %s\n", #cond); \
pr_emerg("cpu_possible_mask=%*pb\n", \
cpumask_pr_args(cpu_possible_mask)); \
pcpu_dump_alloc_info(KERN_EMERG, ai); \
BUG(); \
} \
} while (0)
/* process group information and build config tables accordingly */
alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_chunk_lists[i]);
/* * The end of the static region needs to be aligned with the * minimum allocation size as this offsets the reserved and * dynamic region. The first chunk ends page aligned by * expanding the dynamic region, therefore the dynamic region * can be shrunk to compensate while still staying above the * configured sizes.
*/
static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
dyn_size = ai->dyn_size - (static_size - ai->static_size);
/* * Initialize first chunk: * This chunk is broken up into 3 parts: * < static | [reserved] | dynamic > * - static - there is no backing chunk because these allocations can * never be freed. * - reserved (pcpu_reserved_chunk) - exists primarily to serve * allocations from module load. * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first * chunk.
*/
tmp_addr = (unsignedlong)base_addr + static_size; if (ai->reserved_size)
pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
ai->reserved_size);
tmp_addr = (unsignedlong)base_addr + static_size + ai->reserved_size;
pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);
/* * pcpu_embed_first_chunk() is used by the generic percpu setup. * Build it if needed by the arch config or the generic setup is going * to be used.
*/ #ifdefined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) #define BUILD_EMBED_FIRST_CHUNK #endif
/* build pcpu_page_first_chunk() iff needed by the arch config */ #ifdefined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) #define BUILD_PAGE_FIRST_CHUNK #endif
/* pcpu_build_alloc_info() is used by both embed and page first chunk */ #ifdefined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK) /** * pcpu_build_alloc_info - build alloc_info considering distances between CPUs * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * * This function determines grouping of units, their mappings to cpus * and other parameters considering needed percpu size, allocation * atom size and distances between CPUs. * * Groups are always multiples of atom size and CPUs which are of * LOCAL_DISTANCE both ways are grouped together and share space for * units in the same group. The returned configuration is guaranteed * to have CPUs on different nodes on different groups and >=75% usage * of allocated virtual address space. * * RETURNS: * On success, pointer to the new allocation_info is returned. On * failure, ERR_PTR value is returned.
*/ staticstruct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{ staticint group_map[NR_CPUS] __initdata; staticint group_cnt[NR_CPUS] __initdata; staticstruct cpumask mask __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; int nr_groups = 1, nr_units = 0;
size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, best_upa; /* units_per_alloc */ int last_allocs, group, unit; unsignedint cpu, tcpu; struct pcpu_alloc_info *ai; unsignedint *cpu_map;
/* this function may be called multiple times */
memset(group_map, 0, sizeof(group_map));
memset(group_cnt, 0, sizeof(group_cnt));
cpumask_clear(&mask);
/* calculate size_sum and ensure dyn_size is enough for early alloc */
size_sum = PFN_ALIGN(static_size + reserved_size +
max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
dyn_size = size_sum - static_size - reserved_size;
/* * Determine min_unit_size, alloc_size and max_upa such that * alloc_size is multiple of atom_size and is the smallest * which can accommodate 4k aligned segments which are equal to * or larger than min_unit_size.
*/
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
/* determine the maximum # of units that can fit in an allocation */
alloc_size = roundup(min_unit_size, atom_size);
upa = alloc_size / min_unit_size; while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
upa--;
max_upa = upa;
cpumask_copy(&mask, cpu_possible_mask);
/* group cpus according to their proximity */ for (group = 0; !cpumask_empty(&mask); group++) { /* pop the group's first cpu */
cpu = cpumask_first(&mask);
group_map[cpu] = group;
group_cnt[group]++;
cpumask_clear_cpu(cpu, &mask);
/* * Wasted space is caused by a ratio imbalance of upa to group_cnt. * Expand the unit_size until we use >= 75% of the units allocated. * Related to atom_size, which could be much larger than the unit_size.
*/
last_allocs = INT_MAX;
best_upa = 0; for (upa = max_upa; upa; upa--) { int allocs = 0, wasted = 0;
if (alloc_size % upa || (offset_in_page(alloc_size / upa))) continue;
for (group = 0; group < nr_groups; group++) { int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
allocs += this_allocs;
wasted += this_allocs * upa - group_cnt[group];
}
/* * Don't accept if wastage is over 1/3. The * greater-than comparison ensures upa==1 always * passes the following check.
*/ if (wasted > num_possible_cpus() / 3) continue;
/* and then don't consume more memory */ if (allocs > last_allocs) break;
last_allocs = allocs;
best_upa = upa;
}
BUG_ON(!best_upa);
upa = best_upa;
/* allocate and fill alloc_info */ for (group = 0; group < nr_groups; group++)
nr_units += roundup(group_cnt[group], upa);
ai = pcpu_alloc_alloc_info(nr_groups, nr_units); if (!ai) return ERR_PTR(-ENOMEM);
cpu_map = ai->groups[0].cpu_map;
for (group = 0; group < nr_groups; group++) {
ai->groups[group].cpu_map = cpu_map;
cpu_map += roundup(group_cnt[group], upa);
}
for (group = 0, unit = 0; group < nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group];
/* * Initialize base_offset as if all groups are located * back-to-back. The caller should update this to * reflect actual allocation.
*/
gi->base_offset = unit * ai->unit_size;
for_each_possible_cpu(cpu) if (group_map[cpu] == group)
gi->cpu_map[gi->nr_units++] = cpu;
gi->nr_units = roundup(gi->nr_units, upa);
unit += gi->nr_units;
}
BUG_ON(unit != nr_units);
if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) {
ptr = memblock_alloc_from(size, align, goal);
pr_info("cpu %d has no node %d or node-local memory\n",
cpu, node);
pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n",
cpu, size, (u64)__pa(ptr));
} else {
ptr = memblock_alloc_try_nid(size, align, goal,
MEMBLOCK_ALLOC_ACCESSIBLE,
node);
pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n",
cpu, size, node, (u64)__pa(ptr));
} return ptr; #else return memblock_alloc_from(size, align, goal); #endif
}
#ifdefined(BUILD_EMBED_FIRST_CHUNK) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated * by calling pcpu_fc_alloc and used as-is without being mapped into * vmalloc area. Allocations are always whole multiples of @atom_size * aligned to @atom_size. * * This enables the first chunk to piggy back on the linear physical * mapping which often uses larger page size. Please note that this * can result in very sparse cpu->unit mapping on NUMA machines thus * requiring large vmalloc address space. Don't use this allocator if * vmalloc space is not orders of magnitude larger than distances * between node memory addresses (ie. 32bit NUMA machines). * * @dyn_size specifies the minimum dynamic area size. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned using pcpu_fc_free. * * RETURNS: * 0 on success, -errno on failure.
*/ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{ void *base = (void *)ULONG_MAX; void **areas = NULL; struct pcpu_alloc_info *ai;
size_t size_sum, areas_size; unsignedlong max_distance; int group, i, highest_group, rc = 0;
ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
cpu_distance_fn); if (IS_ERR(ai)) return PTR_ERR(ai);
/* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > VMALLOC_TOTAL * 3 / 4) {
pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
max_distance, VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */
rc = -EINVAL; goto out_free_areas; #endif
}
/* * Copy data and free unused parts. This should happen after all * allocations are complete; otherwise, we may end up with * overlapping groups.
*/ for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; void *ptr = areas[group];
for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { if (gi->cpu_map[i] == NR_CPUS) { /* unused unit, free whole */
pcpu_fc_free(ptr, ai->unit_size); continue;
} /* copy and return the unused part */
memcpy(ptr, __per_cpu_start, ai->static_size);
pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
}
}
/* base address is now known, determine group base offsets */ for (group = 0; group < ai->nr_groups; group++) {
ai->groups[group].base_offset = areas[group] - base;
}
pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd)) {
pte_t *new;
new = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
pmd_populate_kernel(&init_mm, pmd, new);
}
return;
}
/** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * * This is a helper to ease setting up page-remapped first percpu * chunk and can be called where pcpu_setup_first_chunk() is expected. * * This is the basic allocator. Static percpu area is allocated * page-by-page into vmalloc area. * * RETURNS: * 0 on success, -errno on failure.
*/ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{ staticstruct vm_struct vm; struct pcpu_alloc_info *ai; char psize_str[16]; int unit_pages;
size_t pages_size; struct page **pages; int unit, i, j, rc = 0; int upa; int nr_g0_units;
#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA /* * Generic SMP percpu area setup. * * The embedding helper is used because its behavior closely resembles * the original non-dynamic generic percpu area setup. This is * important because many archs have addressing restrictions and might * fail if the percpu area is located far away from the previous * location. As an added bonus, in non-NUMA cases, embedding is * generally a good idea TLB-wise because percpu area can piggy back * on the physical linear memory mapping which uses large page * mappings on applicable archs.
*/ unsignedlong __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
void __init setup_per_cpu_areas(void)
{ unsignedlong delta; unsignedint cpu; int rc;
/* * Always reserve area for module percpu variables. That's * what the legacy allocator did.
*/
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE,
PAGE_SIZE, NULL, NULL); if (rc < 0)
panic("Failed to initialize percpu areas.");
/* * UP percpu area setup. * * UP always uses km-based percpu allocator with identity mapping. * Static percpu variables are indistinguishable from the usual static * variables and don't require any special preparation.
*/ void __init setup_per_cpu_areas(void)
{ const size_t unit_size =
roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
PERCPU_DYNAMIC_RESERVE)); struct pcpu_alloc_info *ai; void *fc;
ai = pcpu_alloc_alloc_info(1, 1);
fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc)
panic("Failed to allocate memory for percpu areas."); /* kmemleak tracks the percpu allocations separately */
kmemleak_ignore_phys(__pa(fc));
/* * pcpu_nr_pages - calculate total number of populated backing pages * * This reflects the number of pages populated to back chunks. Metadata is * excluded in the number exposed in meminfo as the number of backing pages * scales with the number of cpus and can quickly outweigh the memory used for * metadata. It also keeps this calculation nice and simple. * * RETURNS: * Total number of populated backing pages in use by the allocator.
*/ unsignedlong pcpu_nr_pages(void)
{ return data_race(READ_ONCE(pcpu_nr_populated)) * pcpu_nr_units;
}
/* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up * and running.
*/ staticint __init percpu_enable_async(void)
{
pcpu_async_enabled = true; return 0;
}
subsys_initcall(percpu_enable_async);
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.47 Sekunden
(vorverarbeitet am 2026-04-25)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.