// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
 *
 * Copyright (C) 2017		Facebook Inc.
 * Copyright (C) 2017		Dennis Zhou <dennis@kernel.org>
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  I.e., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * The allocator organizes chunks into lists according to free size and
 * memcg-awareness.  To make a percpu allocation memcg-aware, the
 * __GFP_ACCOUNT flag should be passed.  All memcg-aware allocations share
 * one set of chunks, while all unaccounted allocations and allocations
 * performed by processes belonging to the root memory cgroup use the
 * second set.
 *
 * The allocator tries to allocate from the fullest chunk first.  Each chunk
 * is managed by a bitmap with metadata blocks.  The allocation map is updated
 * on every allocation and free to reflect the current state while the boundary
 * map is only updated on allocation.  Each metadata block contains
 * information to help mitigate the need to iterate over large portions
 * of the bitmap.  The reverse mapping from page to chunk is stored in
 * the page's index.  Lastly, units are lazily backed and grow in unison.
 *
 * There is a unique conversion that goes on here between bytes and bits.
 * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
 * tracks the number of pages it is responsible for in nr_pages.  Helper
 * functions are used to convert between the bytes, bits, and blocks.
 * All hints are managed in bits unless explicitly stated.  (An
 * illustrative, compiled-out sketch of this conversion follows this
 * header.)
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */
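/*
 * Illustrative sketch (not part of the allocator): the byte <-> bit <-> block
 * conversion described above.  One allocation-map bit covers
 * PCPU_MIN_ALLOC_SIZE bytes and PCPU_BITMAP_BLOCK_BITS bits form one
 * metadata block.  The helpers below are hypothetical, compiled out, and
 * only mirror what the real conversion helpers further down in this file do.
 */
#ifdef PCPU_CONVERSION_EXAMPLE
static inline int pcpu_example_size_to_bits(size_t size)
{
	/* round the request up to whole PCPU_MIN_ALLOC_SIZE fragments */
	return ALIGN(size, PCPU_MIN_ALLOC_SIZE) >> PCPU_MIN_ALLOC_SHIFT;
}

static inline int pcpu_example_bits_to_block(int bit_off)
{
	/* metadata block that a chunk-relative bit offset falls into */
	return bit_off / PCPU_BITMAP_BLOCK_BITS;
}
#endif /* PCPU_CONVERSION_EXAMPLE */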
/*
 * The slots are sorted by the size of the biggest continuous free area.
 * 1-31 bytes share the same slot.
 */
#define PCPU_SLOT_BASE_SHIFT		5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD	3
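/*
 * Example only: a plausible size-to-slot mapping along the lines described
 * above.  With PCPU_SLOT_BASE_SHIFT == 5, every size below 1 << 5 (1-31
 * bytes) collapses into the same slot.  pcpu_example_size_to_slot() is
 * hypothetical and compiled out; it is not the allocator's actual helper.
 */
#ifdef PCPU_SLOT_EXAMPLE
static int pcpu_example_size_to_slot(int size)
{
	int highbit = fls(size);	/* position of the highest set bit */

	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}
#endif /* PCPU_SLOT_EXAMPLE */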
static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static int pcpu_free_slot __ro_after_init;
int pcpu_sidelined_slot __ro_after_init;
int pcpu_to_depopulate_slot __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;
/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;
static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */
/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;
/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
/*
 * The number of empty populated pages, protected by pcpu_lock.
 * The reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages;
/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
 * allocated/deallocated, it is allocated/deallocated in all units of a chunk
 * and increments/decrements this count by 1).
 */
static unsigned long pcpu_nr_populated;
/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;
static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
schedule_work(&pcpu_balance_work);
}
/**
 * pcpu_addr_in_chunk - check if the address is served from this chunk
 * @chunk: chunk of interest
 * @addr: percpu address
 *
 * RETURNS:
 * True if the address is served from this chunk.
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
	void *start_addr, *end_addr;
/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	page->private = (unsigned long)pcpu;
}
/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->private;
}
/*
 * The following are helper functions to help access bitmaps and convert
 * between bitmap offsets and address offsets.
 */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
	return chunk->alloc_map +
	       (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
}
static unsigned long pcpu_off_to_block_index(int off)
{
	return off / PCPU_BITMAP_BLOCK_BITS;
}
static unsigned long pcpu_block_off_to_off(int index, int off)
{
	return index * PCPU_BITMAP_BLOCK_BITS + off;
}
/**
 * pcpu_check_block_hint - check against the contig hint
 * @block: block of interest
 * @bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Check to see if the allocation can fit in the block's contig hint.
 * Note, a chunk uses the same hints as a block so this can also check against
 * the chunk's contig hint.
 */
static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
				  size_t align)
{
	int bit_off = ALIGN(block->contig_hint_start, align) -
		      block->contig_hint_start;

	return bit_off + bits <= block->contig_hint;
}
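/*
 * Worked example for the check above (illustrative values, in allocation-map
 * bits): with contig_hint_start == 5 and align == 4, ALIGN(5, 4) == 8, so
 * bit_off == 3 bits are lost to alignment and a request of @bits fits only
 * if 3 + bits <= contig_hint.
 */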
/*
 * pcpu_next_hint - determine which hint to use
 * @block: block of interest
 * @alloc_bits: size of allocation
 *
 * This determines if we should scan based on the scan_hint or first_free.
 * In general, we want to scan from first_free to fulfill allocations by
 * first fit.  However, if we know a scan_hint at position scan_hint_start
 * cannot fulfill an allocation, we can begin scanning from there knowing
 * the contig_hint will be our fallback.
 */
static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
{
	/*
	 * The three conditions below determine if we can skip past the
	 * scan_hint.  First, does the scan hint exist.  Second, is the
	 * contig_hint after the scan_hint (possibly not true iff
	 * contig_hint == scan_hint).  Third, is the allocation request
	 * larger than the scan_hint.
	 */
	if (block->scan_hint &&
	    block->contig_hint_start > block->scan_hint_start &&
	    alloc_bits > block->scan_hint)
		return block->scan_hint_start + block->scan_hint;

	return block->first_free;
}
/**
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper function for pcpu_for_each_md_free_region.  It checks
 * block->contig_hint and performs aggregation across blocks to find the
 * next hint.  It modifies bit_off and bits in-place to be consumed in the
 * loop.
 */
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
				     int *bits)
{
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
			return;
		}
		/*
		 * This checks three things.  First is there a contig_hint to
		 * check.  Second, have we checked this hint before by
		 * comparing the block_off.  Third, is this the same as the
		 * right contig hint.  In the last case, it spills over into
		 * the next block and should be handled by the contig area
		 * across blocks code.
		 */
		*bits = block->contig_hint;
		if (*bits && block->contig_hint_start >= block_off &&
		    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
			*bit_off = pcpu_block_off_to_off(i,
					block->contig_hint_start);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;
/**
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for this
 * allocation.  block->first_free is returned if the allocation request fits
 * within the block to see if the request can be fulfilled prior to the contig
 * hint.
 */
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
				 int align, int *bit_off, int *bits)
{
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			if (*bits >= alloc_bits)
				return;
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
		}
		/* check block->contig_hint */
		*bits = ALIGN(block->contig_hint_start, align) -
			block->contig_hint_start;
		/*
		 * This uses the block offset to determine if this has been
		 * checked in the prior iteration.
		 */
		if (block->contig_hint &&
		    block->contig_hint_start >= block_off &&
		    block->contig_hint >= *bits + alloc_bits) {
			int start = pcpu_next_hint(block, alloc_bits);

			*bits += alloc_bits + block->contig_hint_start -
				 start;

			*bit_off = pcpu_block_off_to_off(i, start);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;
	}

	/* no valid offsets were found - fail condition */
	*bit_off = pcpu_chunk_map_bits(chunk);
}
/*
 * Metadata free area iterators.  These perform aggregation of free areas
 * based on the metadata blocks and return the offset @bit_off and size in
 * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
 * a fit is found for the allocation request.  A compiled-out usage sketch
 * follows the definition below.
 */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)		\
	for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));	\
(bit_off) < pcpu_chunk_map_bits((chunk)); \
(bit_off) += (bits) + 1, \
pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
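/*
 * Usage sketch (assumption, not code taken from this file): aggregating the
 * per-block free areas into the chunk-level hint with the iterator above,
 * roughly what a chunk hint refresh has to do.  pcpu_example_refresh() is
 * hypothetical and compiled out; pcpu_block_update() is defined further down.
 */
#ifdef PCPU_ITER_EXAMPLE
static void pcpu_example_refresh(struct pcpu_chunk *chunk)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits;

	/* feed every aggregated free region into the chunk-level metadata */
	pcpu_for_each_md_free_region(chunk, bit_off, bits)
		pcpu_block_update(chunk_md, bit_off, bit_off + bits);
}
#endif /* PCPU_ITER_EXAMPLE */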
/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
 * This is to facilitate passing through whitelisted flags.  The
 * returned memory is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
	if (WARN_ON_ONCE(!slab_is_available()))
		return NULL;
/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
	int nslot = pcpu_chunk_slot(chunk);

	/* leave isolated chunks in-place */
	if (chunk->isolated)
		return;
if (oslot != nslot)
__pcpu_chunk_move(chunk, nslot, oslot < nslot);
}
/*
 * pcpu_update_empty_pages - update empty page counters
 * @chunk: chunk of interest
 * @nr: nr of empty pages
 *
 * This is used to keep track of the empty pages now based on the premise
 * a md_block covers a page.  The hint update functions recognize if a block
 * is made full or broken to calculate deltas for keeping track of free pages.
 */
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
	chunk->nr_empty_pop_pages += nr;
	if (chunk != pcpu_reserved_chunk && !chunk->isolated)
pcpu_nr_empty_pop_pages += nr;
}
/*
 * pcpu_region_overlap - determines if two regions overlap
 * @a: start of first region, inclusive
 * @b: end of first region, exclusive
 * @x: start of second region, inclusive
 * @y: end of second region, exclusive
 *
 * This is used to determine if the hint region [a, b) overlaps with the
 * allocated region [x, y).
 */
static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
	return (a < y) && (x < b);
}
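/*
 * Example (illustrative values): the hint region [0, 4) overlaps the
 * allocated region [3, 5) because 0 < 5 && 3 < 4, while [0, 4) and [4, 8)
 * do not overlap since 4 < 4 is false.
 */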
/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.  A worked
 * example follows the function.
 */
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
	int contig = end - start;
if (end == block->nr_bits)
block->right_free = contig;
	if (contig > block->contig_hint) {
		/* promote the old contig_hint to be the new scan_hint */
		if (start > block->contig_hint_start) {
			if (block->contig_hint > block->scan_hint) {
block->scan_hint_start =
block->contig_hint_start;
block->scan_hint = block->contig_hint;
			} else if (start < block->scan_hint_start) {
				/*
				 * The old contig_hint == scan_hint.  But, the
				 * new contig is larger so hold the invariant
				 * scan_hint_start < contig_hint_start.
				 */
block->scan_hint = 0;
}
} else {
block->scan_hint = 0;
}
block->contig_hint_start = start;
block->contig_hint = contig;
	} else if (contig == block->contig_hint) {
		if (block->contig_hint_start &&
		    (!start ||
		     __ffs(start) > __ffs(block->contig_hint_start))) {
			/* start has a better alignment so use it */
			block->contig_hint_start = start;
			if (start < block->scan_hint_start &&
block->contig_hint > block->scan_hint)
block->scan_hint = 0;
		} else if (start > block->scan_hint_start ||
			   block->contig_hint > block->scan_hint) {
			/*
			 * Knowing contig == contig_hint, update the scan_hint
			 * if it is farther than or larger than the current
			 * scan_hint.
			 */
block->scan_hint_start = start;
block->scan_hint = contig;
}
	} else {
		/*
		 * The region is smaller than the contig_hint.  So only update
		 * the scan_hint if it is larger than or equal and farther than
		 * the current scan_hint.
		 */
		if ((start < block->contig_hint_start &&
(contig > block->scan_hint ||
(contig == block->scan_hint &&
start > block->scan_hint_start)))) {
block->scan_hint_start = start;
block->scan_hint = contig;
}
}
}
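/*
 * Worked example for the promotion above (illustrative values): if a block's
 * contig_hint covers [10, 14) and a larger free area [20, 26) is reported,
 * the old hint is demoted to the scan_hint (start 10, size 4) and [20, 26)
 * becomes the new contig_hint (start 20, size 6).
 */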
/*
 * pcpu_block_update_scan - update a block given a free area from a scan
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finding the final allocation spot first goes through pcpu_find_block_fit()
 * to find a block that can hold the allocation and then pcpu_alloc_area()
 * where a scan is used.  When allocations require specific alignments,
 * we can inadvertently create holes which will not be seen in the alloc
 * or free paths.
 *
 * This takes a given free area hole and updates a block as it may change the
 * scan_hint.  We need to scan backwards to ensure we don't miss free bits
 * from alignment.
 */
static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
				   int bits)
{
	int s_off = pcpu_off_to_block_off(bit_off);
	int e_off = s_off + bits;
	int s_index, l_bit;
	struct pcpu_block_md *block;
/* scan backwards in case of alignment skipping free bits */
l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
s_off = (s_off == l_bit) ? 0 : l_bit + 1;
pcpu_block_update(block, s_off, e_off);
}
/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 * @full_scan: if we should scan from the beginning
 *
 * Iterates over the metadata blocks to find the largest contig area.
 * A full scan can be avoided on the allocation path as this is triggered
 * if we broke the contig_hint.  In doing so, the scan_hint will be before
 * the contig_hint or after if the scan_hint == contig_hint.  This cannot
 * be prevented on freeing as we want to find the largest area possibly
 * spanning blocks.
 */
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits;
/**
 * pcpu_block_refresh_hint
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * Scans over the block beginning at first_free and updates the block
 * metadata accordingly.
 */
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
	struct pcpu_block_md *block = chunk->md_blocks + index;
	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
	unsigned int start, end;	/* region start, region end */
/* iterate over free areas and update the contig hints */
for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
pcpu_block_update(block, start, end);
}
/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
 * scans are required if the block's contig hint is broken.
 */
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
					 int bits)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int nr_empty_pages = 0;
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the freed allocation */
	int s_off, e_off;	/* block offsets of the freed allocation */
	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
s_index = pcpu_off_to_block_index(bit_off);
e_index = pcpu_off_to_block_index(bit_off + bits - 1);
s_off = pcpu_off_to_block_off(bit_off);
e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
	/*
	 * Update s_block.
	 */
	if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
	/*
	 * block->first_free must be updated if the allocation takes its place.
	 * If the allocation breaks the contig_hint, a scan is required to
	 * restore this hint.
	 */
	if (s_off == s_block->first_free)
s_block->first_free = find_next_zero_bit(
pcpu_index_alloc_map(chunk, s_index),
PCPU_BITMAP_BLOCK_BITS,
s_off + bits);
if (pcpu_region_overlap(s_block->contig_hint_start,
s_block->contig_hint_start +
s_block->contig_hint,
s_off,
				s_off + bits)) {
		/* block contig hint is broken - scan to fix it */
		if (!s_off)
			s_block->left_free = 0;
		pcpu_block_refresh_hint(chunk, s_index);
	} else {
		/* update left and right contig manually */
		s_block->left_free = min(s_block->left_free, s_off);
		if (s_index == e_index)
s_block->right_free = min_t(int, s_block->right_free,
					      PCPU_BITMAP_BLOCK_BITS - e_off);
		else
s_block->right_free = 0;
}
	/*
	 * Update e_block.
	 */
	if (s_index != e_index) {
		if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
		/*
		 * When the allocation is across blocks, the end is along
		 * the left part of the e_block.
		 */
e_block->first_free = find_next_zero_bit(
pcpu_index_alloc_map(chunk, e_index),
PCPU_BITMAP_BLOCK_BITS, e_off);
		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
			/* reset the block */
			e_block++;
		} else {
			if (e_off > e_block->scan_hint_start)
				e_block->scan_hint = 0;

			e_block->left_free = 0;
			if (e_off > e_block->contig_hint_start) {
				/* contig hint is broken - scan to fix it */
pcpu_block_refresh_hint(chunk, e_index);
} else {
e_block->right_free =
min_t(int, e_block->right_free,
PCPU_BITMAP_BLOCK_BITS - e_off);
}
}
	/*
	 * If the allocation is not atomic, some blocks may not be
	 * populated with pages, while we account it here.  The number
	 * of pages will be added back with pcpu_chunk_populated()
	 * when populating pages.
	 */
	if (nr_empty_pages)
pcpu_update_empty_pages(chunk, -nr_empty_pages);
	/*
	 * The only time a full chunk scan is required is if the chunk
	 * contig hint is broken.  Otherwise, it means a smaller space
	 * was used and therefore the chunk contig hint is still correct.
	 */
	if (pcpu_region_overlap(chunk_md->contig_hint_start,
chunk_md->contig_hint_start +
chunk_md->contig_hint,
bit_off,
bit_off + bits))
pcpu_chunk_refresh_hint(chunk, false);
}
/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block
 * refresh by making use of the block contig hints.  If this fails, it scans
 * forward and backward to determine the extent of the free area.  This is
 * capped at the boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk_md->contig_hint.
 * chunk_md->contig_hint may be off by up to a page, but it will never be more
 * than the available space.  If the contig hint is contained in one block, it
 * will be accurate.
 */
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
					int bits)
{
	int nr_empty_pages = 0;
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the freed allocation */
	int s_off, e_off;	/* block offsets of the freed allocation */
	int start, end;		/* start and end of the whole free area */
	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
s_index = pcpu_off_to_block_index(bit_off);
e_index = pcpu_off_to_block_index(bit_off + bits - 1);
s_off = pcpu_off_to_block_off(bit_off);
e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
	/*
	 * Check if the freed area aligns with the block->contig_hint.
	 * If it does, then the scan to find the beginning/end of the
	 * larger free area can be avoided.
	 *
	 * start and end refer to the beginning and end of the free area
	 * within their respective blocks.  This is not necessarily
	 * the entire free area as it may span blocks past the beginning
	 * or end of the block.
	 */
	start = s_off;
	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
start = s_block->contig_hint_start;
	} else {
		/*
		 * Scan backwards to find the extent of the free area.
		 * find_last_bit returns the starting bit, so if the start bit
		 * is returned, that means there was no last bit and the
		 * remainder of the chunk is free.
		 */
		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
					  start);
start = (start == l_bit) ? 0 : l_bit + 1;
}
	end = e_off;
	if (e_off == e_block->contig_hint_start)
		end = e_block->contig_hint_start + e_block->contig_hint;
	else
end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
PCPU_BITMAP_BLOCK_BITS, end);
	/* freeing in the same block */
	if (s_index != e_index) {
		/* update e_block */
		if (end == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
pcpu_block_update(e_block, 0, end);
if (nr_empty_pages)
pcpu_update_empty_pages(chunk, nr_empty_pages);
	/*
	 * Refresh chunk metadata when the free makes a block free or spans
	 * across blocks.  The contig_hint may be off by up to a page, but if
	 * the contig_hint is contained in a block, it will be accurate with
	 * the else condition below.
	 */
	if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
		pcpu_chunk_refresh_hint(chunk, true);
	else
pcpu_block_update(&chunk->chunk_md,
pcpu_block_off_to_off(s_index, start),
end);
}
/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * Bool if the backing pages are populated.
 * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
 */
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
			      int *next_off)
{
	unsigned int start, end;

	/* convert the bit range to the page range it needs backed by */
	start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
	end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);

	start = find_next_zero_bit(chunk->populated, end, start);
	if (start >= end)
		return true;

	end = find_next_bit(chunk->populated, end, start + 1);

	*next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
	return false;
}
/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Given a chunk and an allocation spec, find the offset to begin searching
 * for a free region.  This iterates over the bitmap metadata blocks to
 * find an offset that will be guaranteed to fit the requirements.  It is
 * not quite first fit as if the allocation does not fit in the contig hint
 * of a block or chunk, it is skipped.  This errs on the side of caution
 * to prevent excess iteration.  Poor alignment can cause the allocator to
 * skip over blocks and chunks that have valid free areas.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
			       size_t align, bool pop_only)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits, next_off;
	/*
	 * This is an optimization to prevent scanning by assuming if the
	 * allocation cannot fit in the global hint, there is memory pressure
	 * and creating a new chunk would happen soon.
	 */
	if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
		return -1;

	if (bit_off == pcpu_chunk_map_bits(chunk))
		return -1;
return bit_off;
}
/*
 * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bitnumber to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for zero area
 * @largest_off: offset of the largest area skipped
 * @largest_bits: size of the largest area skipped
 *
 * The @align_mask should be one less than a power of 2.
 *
 * This is a modified version of bitmap_find_next_zero_area_off() to remember
 * the largest area that was skipped.  This is imperfect, but in general is
 * good enough.  The largest remembered region is the largest failed region
 * seen.  This does not include anything we possibly skipped due to alignment.
 * pcpu_block_update_scan() does scan backwards to try and recover what was
 * lost to alignment.  While this can cause scanning to miss earlier possible
 * free areas, smaller allocations will eventually fill those holes.
 */
static unsigned long pcpu_find_zero_area(unsigned long *map,
					 unsigned long size,
					 unsigned long start,
					 unsigned long nr,
					 unsigned long align_mask,
					 unsigned long *largest_off,
					 unsigned long *largest_bits)
{
	unsigned long index, end, i, area_off, area_bits;
again:
	index = find_next_zero_bit(map, size, start);

	/* align the allocation and remember where the candidate area starts */
	index = __ALIGN_MASK(index, align_mask);
	area_off = index;

	end = index + nr;
	if (end > size)
		return end;

	i = find_next_bit(map, end, index);
	if (i < end) {
		area_bits = i - area_off;
		/* remember largest unused area with best alignment */
		if (area_bits > *largest_bits ||
(area_bits == *largest_bits && *largest_off &&
(!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
*largest_off = area_off;
*largest_bits = area_bits;
}
		start = i + 1;
		goto again;
	}

	return index;
}
/**
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 *
 * This function takes in a @start offset to begin searching to fit an
 * allocation of @alloc_bits with alignment @align.  It needs to scan
 * the allocation map because if it fits within the block's contig hint,
 * @start will be block->first_free.  This is an attempt to fill the
 * allocation prior to breaking the contig hint.  The allocation and
 * boundary maps are updated accordingly if it confirms a valid
 * free area.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
			   size_t align, int start)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	size_t align_mask = (align) ? (align - 1) : 0;
	unsigned long area_off = 0, area_bits = 0;
	int bit_off, end, oslot;
lockdep_assert_held(&pcpu_lock);
oslot = pcpu_chunk_slot(chunk);
	/*
	 * Search to find a fit.
	 */
end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
pcpu_chunk_map_bits(chunk));
bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
				      align_mask, &area_off, &area_bits);
	if (bit_off >= end)
		return -1;
if (area_bits)
pcpu_block_update_scan(chunk, area_off, area_bits);
/**
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 *
 * This function determines the size of an allocation to free using
 * the boundary bitmap and clears the allocation map.
 *
 * RETURNS:
 * Number of freed bytes.
 */
static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits, end, oslot, freed;
/* find end index */
end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
bit_off + 1);
bits = end - bit_off;
bitmap_clear(chunk->alloc_map, bit_off, bits);
freed = bits * PCPU_MIN_ALLOC_SIZE;
/* update metadata */
chunk->free_bytes += freed;
/* update first free bit */
chunk_md->first_free = min(chunk_md->first_free, bit_off);
/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.  The
 * base_addr is page aligned down of @tmp_addr while the region end is page
 * aligned up.  Offsets are kept track of to determine the region served. All
 * this is done to appease the bitmap allocator in avoiding partial blocks.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
							  int map_size)
{
	struct pcpu_chunk *chunk;
	unsigned long aligned_addr;
	int start_offset, offset_bits, region_size, region_bits;
size_t alloc_size;
/* region calculations */
aligned_addr = tmp_addr & PAGE_MASK;
/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population.
 */
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
				 int page_end)
{
	int nr = page_end - page_start;
/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
 * Update the bookkeeping information accordingly.  Must be called after
 * each successful depopulation.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
				   int page_start, int page_end)
{
	int nr = page_end - page_start;
/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk		- populate the specified range of a chunk
 * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
 * pcpu_post_unmap_tlb_flush	- flush tlb for the specified range of a chunk
 * pcpu_create_chunk		- create a new chunk
 * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page		- translate address to physical address
 * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
			       int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
				  int page_start, int page_end);
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
				      int page_start, int page_end);
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * This is an internal function that handles all but static allocations.
 * Static percpu address values should never be passed into the allocator.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
	/* is it in the dynamic region (first chunk)? */
	if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
		return pcpu_first_chunk;
	/* is it in the reserved region? */
	if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
		return pcpu_reserved_chunk;
	/*
	 * The address is relative to unit0 which might be unused and
	 * thus unmapped.  Offset the address to the unit space of the
	 * current processor before looking it up in the vmalloc
	 * space.  Note that any possible cpu id can be used here, so
	 * there's no need to worry about preemption or cpu hotplug.
	 */
	addr += pcpu_unit_offsets[raw_smp_processor_id()];
	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}
/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic.  If @gfp has __GFP_NOWARN
 * then no warning will be triggered on invalid or failed allocation
 * requests.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
				 gfp_t gfp)
{
	gfp_t pcpu_gfp;
	bool is_atomic;
	bool do_warn;
	struct obj_cgroup *objcg = NULL;
	static atomic_t warn_limit = ATOMIC_INIT(10);
	struct pcpu_chunk *chunk, *next;
	const char *err;
	int slot, off, cpu, ret;
	unsigned long flags;
	void __percpu *ptr;
size_t bits, bit_align;
	gfp = current_gfp_context(gfp);
	/* whitelisted flags that can be passed to the backing allocators */
pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
is_atomic = !gfpflags_allow_blocking(gfp);
do_warn = !(gfp & __GFP_NOWARN);
	/*
	 * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
	 * therefore alignment must be a minimum of that many bytes.
	 * An allocation may have internal fragmentation from rounding up
	 * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
	 */
	if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
		align = PCPU_MIN_ALLOC_SIZE;

	/* convert the byte request into allocation-map bits */
	size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
	bits = size >> PCPU_MIN_ALLOC_SHIFT;
	bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
!is_power_of_2(align))) {
WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
		     size, align);
		return NULL;
}
	if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
		return NULL;
	if (!is_atomic) {
		/*
		 * pcpu_balance_workfn() allocates memory under this mutex,
		 * and it may wait for memory reclaim. Allow current task
		 * to become OOM victim, in case of memory pressure.
		 */
		if (gfp & __GFP_NOFAIL) {
			mutex_lock(&pcpu_alloc_mutex);
		} else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
			pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
			return NULL;
}
}
spin_lock_irqsave(&pcpu_lock, flags);
	/* serve reserved allocations from the reserved chunk if available */
	if (reserved && pcpu_reserved_chunk) {
		chunk = pcpu_reserved_chunk;

		off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
		if (off < 0) {
			err = "alloc from reserved chunk failed";
			goto fail_unlock;
		}

		off = pcpu_alloc_area(chunk, bits, bit_align, off);
		if (off >= 0)
			goto area_found;

		err = "alloc from reserved chunk failed";
		goto fail_unlock;
}
restart:
	/* search through normal chunks */
	for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
		list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
					 list) {
			off = pcpu_find_block_fit(chunk, bits, bit_align,
						  is_atomic);
			if (off < 0) {
				if (slot < PCPU_SLOT_FAIL_THRESHOLD)
					pcpu_chunk_move(chunk, 0);
				continue;
			}

			off = pcpu_alloc_area(chunk, bits, bit_align, off);
			if (off >= 0) {
				pcpu_reintegrate_chunk(chunk);
				goto area_found;
}
}
}
spin_unlock_irqrestore(&pcpu_lock, flags);
	if (is_atomic) {
		err = "atomic alloc failed, no space left";
		goto fail;
	}

	/* No space left.  Create a new chunk. */
	if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
		chunk = pcpu_create_chunk(pcpu_gfp);
		if (!chunk) {
			err = "failed to allocate new chunk";
			goto fail;
}
/* clear the areas and return address relative to base address */
for_each_possible_cpu(cpu)
memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
	if (do_warn) {
		int remaining = atomic_dec_if_positive(&warn_limit);

		if (remaining >= 0) {
			pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
				size, align, is_atomic, err);
			if (!is_atomic)
				dump_stack();
			if (remaining == 0)
pr_info("limit reached, disable warning\n");
}
}
	if (is_atomic) {
		/* see the flag handling in pcpu_balance_workfn() */
pcpu_atomic_alloc_failed = true;
pcpu_schedule_balance_work();
} else {
mutex_unlock(&pcpu_alloc_mutex);
}
/**
 * pcpu_balance_free - manage the amount of free chunks
 * @empty_only: free chunks only if there are no populated pages
 *
 * If empty_only is %false, reclaim all fully free chunks regardless of the
 * number of populated pages.  Otherwise, only reclaim chunks that have no
 * populated pages.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_free(bool empty_only)
{
	LIST_HEAD(to_free);
	struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
	struct pcpu_chunk *chunk, *next;
lockdep_assert_held(&pcpu_lock);
	/*
	 * There's no reason to keep around multiple unused chunks and VM
	 * areas can be scarce.  Destroy all free chunks except for one.
	 */
list_for_each_entry_safe(chunk, next, free_head, list) {
WARN_ON(chunk->immutable);
		/* spare the first one */
		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
			continue;
if (!empty_only || chunk->nr_empty_pop_pages == 0)
list_move(&chunk->list, &to_free);
}
/**
 * pcpu_balance_populated - manage the amount of populated pages
 *
 * Maintain a certain amount of populated pages to satisfy atomic allocations.
 * It is possible that this is called when physical memory is scarce causing
 * OOM killer to be triggered.  We should avoid doing so until an actual
 * allocation causes the failure as it is possible that requests can be
 * serviced from already backed regions.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_populated(void)
{
	/* gfp flags passed to underlying allocators */
	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
	struct pcpu_chunk *chunk;
	int slot, nr_to_pop, ret;
lockdep_assert_held(&pcpu_lock);
	/*
	 * Ensure there are a certain number of free populated pages for
	 * atomic allocs.  Fill up from the most packed so that atomic
	 * allocs don't increase fragmentation.  If atomic allocation
	 * failed previously, always populate the maximum amount.  This
	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
	 * failing indefinitely; however, large atomic allocs are not
	 * something we support properly and can be highly unreliable and
	 * inefficient.
	 */
retry_pop:
	if (pcpu_atomic_alloc_failed) {
		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
		/* best effort anyway, don't worry about synchronization */
pcpu_atomic_alloc_failed = false;
} else {
nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
pcpu_nr_empty_pop_pages,
0, PCPU_EMPTY_POP_PAGES_HIGH);
}
/* @chunk can't go away while pcpu_alloc_mutex is held */
	for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
		int nr = min_t(int, re - rs, nr_to_pop);
	if (nr_to_pop) {
		/* ran out of chunks to populate, create a new one and retry */
		spin_unlock_irq(&pcpu_lock);
		chunk = pcpu_create_chunk(gfp);
		cond_resched();
		spin_lock_irq(&pcpu_lock);
		if (chunk) {
			pcpu_chunk_relocate(chunk, -1);
			goto retry_pop;
}
}
}
/**
 * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
 *
 * Scan over chunks in the depopulate list and try to release unused populated
 * pages back to the system.  Depopulated chunks are sidelined to prevent
 * repopulating these pages unless required.  Fully free chunks are reintegrated
 * and freed accordingly (1 is kept around).  If we drop below the empty
 * populated pages threshold, reintegrate the chunk if it has empty free pages.
 * Each chunk is scanned in the reverse order to keep populated pages close to
 * the beginning of the chunk.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_reclaim_populated(void)
{
	struct pcpu_chunk *chunk;
	struct pcpu_block_md *block;
	int freed_page_start, freed_page_end;
	int i, end;
	bool reintegrate;
lockdep_assert_held(&pcpu_lock);
	/*
	 * Once a chunk is isolated to the to_depopulate list, the chunk is no
	 * longer discoverable to allocations which may populate pages.  The
	 * only other accessor is the free path which only returns area back
	 * to the allocator not touching the populated bitmap.
	 */
	while ((chunk = list_first_entry_or_null(
			&pcpu_chunk_lists[pcpu_to_depopulate_slot],
			struct pcpu_chunk, list))) {
WARN_ON(chunk->immutable);
		/*
		 * Scan chunk's pages in the reverse order to keep populated
		 * pages close to the beginning of the chunk.
		 */
freed_page_start = chunk->nr_pages;
freed_page_end = 0;
		reintegrate = false;
		for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
			/* no more work to do */
			if (chunk->nr_empty_pop_pages == 0)
				break;

			/* reintegrate chunk to prevent atomic alloc failures */
			if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
				reintegrate = true;
				break;
}
			/*
			 * If the page is empty and populated, start or
			 * extend the (i, end) range.  If i == 0, decrease
			 * i and perform the depopulation to cover the last
			 * (first) page in the chunk.
			 */
			block = chunk->md_blocks + i;
			if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
			    test_bit(i, chunk->populated)) {
				if (end == -1)
					end = i;
				if (i > 0)
					continue;
i--;
}
			/* depopulate if there is an active range */
			if (end == -1)
				continue;
spin_unlock_irq(&pcpu_lock);
pcpu_depopulate_chunk(chunk, i + 1, end + 1);
cond_resched();
spin_lock_irq(&pcpu_lock);
pcpu_chunk_depopulated(chunk, i + 1, end + 1);
freed_page_start = min(freed_page_start, i + 1);
freed_page_end = max(freed_page_end, end + 1);