// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1993 Linus Torvalds * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 * Numa awareness, Christoph Lameter, SGI, June 2005 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
*/
p4d = p4d_offset(pgd, addr); do {
next = p4d_addr_end(addr, end);
p4d_clear_huge(p4d); if (p4d_bad(*p4d))
*mask |= PGTBL_P4D_MODIFIED;
if (p4d_none_or_clear_bad(p4d)) continue;
vunmap_pud_range(p4d, addr, next, mask);
} while (p4d++, addr = next, addr != end);
}
/* * vunmap_range_noflush is similar to vunmap_range, but does not * flush caches or TLBs. * * The caller is responsible for calling flush_cache_vmap() before calling * this function, and flush_tlb_kernel_range after it has returned * successfully (and before the addresses are expected to cause a page fault * or be re-mapped for something else, if TLB flushes are being delayed or * coalesced). * * This is an internal function only. Do not use outside mm/.
*/ void __vunmap_range_noflush(unsignedlong start, unsignedlong end)
{ unsignedlong next;
pgd_t *pgd; unsignedlong addr = start;
pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr); do {
next = pgd_addr_end(addr, end); if (pgd_bad(*pgd))
mask |= PGTBL_PGD_MODIFIED; if (pgd_none_or_clear_bad(pgd)) continue;
vunmap_p4d_range(pgd, addr, next, &mask);
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
}
/** * vunmap_range - unmap kernel virtual addresses * @addr: start of the VM area to unmap * @end: end of the VM area to unmap (non-inclusive) * * Clears any present PTEs in the virtual address range, flushes TLBs and * caches. Any subsequent access to the address before it has been re-mapped * is a kernel bug.
*/ void vunmap_range(unsignedlong addr, unsignedlong end)
{
flush_cache_vunmap(addr, end);
vunmap_range_noflush(addr, end);
flush_tlb_kernel_range(addr, end);
}
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr); do {
next = pgd_addr_end(addr, end); if (pgd_bad(*pgd))
mask |= PGTBL_PGD_MODIFIED;
err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) break;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
return err;
}
/* * vmap_pages_range_noflush is similar to vmap_pages_range, but does not * flush caches. * * The caller is responsible for calling flush_cache_vmap() after this * function returns successfully and before the addresses are accessed. * * This is an internal function only. Do not use outside mm/.
*/ int __vmap_pages_range_noflush(unsignedlong addr, unsignedlong end,
pgprot_t prot, struct page **pages, unsignedint page_shift)
{ unsignedint i, nr = (end - addr) >> PAGE_SHIFT;
/** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map * @end: end of the VM area to map (non-inclusive) * @prot: page protection flags to use * @pages: pages to map (always PAGE_SIZE pages) * @page_shift: maximum shift that the pages may be mapped with, @pages must * be aligned and contiguous up to at least this shift. * * RETURNS: * 0 on success, -errno on failure.
*/ int vmap_pages_range(unsignedlong addr, unsignedlong end,
pgprot_t prot, struct page **pages, unsignedint page_shift)
{ int err;
int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)kasan_reset_tag(x);

	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}
EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
/* * Walk a vmap address to the struct page it maps. Huge vmap mappings will * return the tail page that corresponds to the base page address, which * matches small vmap mappings.
*/ struct page *vmalloc_to_page(constvoid *vmalloc_addr)
{ unsignedlong addr = (unsignedlong) vmalloc_addr; struct page *page = NULL;
pgd_t *pgd = pgd_offset_k(addr);
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
/* * XXX we might need to change this if we add VIRTUAL_BUG_ON for * architectures that do not vmalloc module space
*/
VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
if (pgd_none(*pgd)) return NULL; if (WARN_ON_ONCE(pgd_leaf(*pgd))) return NULL; /* XXX: no allowance for huge pgd */ if (WARN_ON_ONCE(pgd_bad(*pgd))) return NULL;
p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) return NULL; if (p4d_leaf(*p4d)) return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(p4d_bad(*p4d))) return NULL;
pud = pud_offset(p4d, addr); if (pud_none(*pud)) return NULL; if (pud_leaf(*pud)) return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(pud_bad(*pud))) return NULL;
pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return NULL; if (pmd_leaf(*pmd)) return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); if (WARN_ON_ONCE(pmd_bad(*pmd))) return NULL;
/* * This kmem_cache is used for vmap_area objects. Instead of * allocating from slab we reuse an object from this cache to * make things faster. Especially in "no edge" splitting of * free block.
*/ staticstruct kmem_cache *vmap_area_cachep;
/* * This linked list is used in pair with free_vmap_area_root. * It gives O(1) access to prev/next to perform fast coalescing.
*/ static LIST_HEAD(free_vmap_area_list);
/* * This augment red-black tree represents the free vmap space. * All vmap_area objects in this tree are sorted by va->va_start * address. It is used for allocation and merging when a vmap * object is released. * * Each vmap_area node contains a maximum available free block * of its sub-tree, right or left. Therefore it is possible to * find a lowest match of free area.
*/ staticstruct rb_root free_vmap_area_root = RB_ROOT;
/* * Preload a CPU with one object for "no edge" split case. The * aim is to get rid of allocations from the atomic context, thus * to use more permissive allocation masks.
*/ static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
/* * This structure defines a single, solid model where a list and * rb-tree are part of one entity protected by the lock. Nodes are * sorted in ascending order, thus for O(1) access to left/right * neighbors a list is used as well as for sequential traversal.
*/ struct rb_list { struct rb_root root; struct list_head head;
spinlock_t lock;
};
/* * A fast size storage contains VAs up to 1M size. A pool consists * of linked between each other ready to go VAs of certain sizes. * An index in the pool-array corresponds to number of pages + 1.
*/ #define MAX_VA_SIZE_PAGES 256
/* * An effective vmap-node logic. Users make use of nodes instead * of a global heap. It allows to balance an access and mitigate * contention.
*/ staticstruct vmap_node { /* Simple size segregated storage. */ struct vmap_pool pool[MAX_VA_SIZE_PAGES];
spinlock_t pool_lock; bool skip_populate;
/* Bookkeeping data of this node. */ struct rb_list busy; struct rb_list lazy;
/* * Initial setup consists of one single node, i.e. a balancing * is fully disabled. Later on, after vmap is initialized these * parameters are updated based on a system capacity.
*/ staticstruct vmap_node *vmap_nodes = &single; static __read_mostly unsignedint nr_vmap_nodes = 1; static __read_mostly unsignedint vmap_zone_size = 1;
/* A simple iterator over all vmap-nodes. */ #define for_each_vmap_node(vn) \ for ((vn) = &vmap_nodes[0]; \
(vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)
WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node); return 0;
}
/* * We use the value 0 to represent "no node", that is why * an encoded value will be the node-id incremented by 1. * It is always greater then 0. A valid node_id which can * be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id * is not valid 0 is returned.
*/ staticunsignedint
encode_vn_id(unsignedint node_id)
{ /* Can store U8_MAX [0:254] nodes. */ if (node_id < nr_vmap_nodes) return (node_id + 1) << BITS_PER_BYTE;
/* Warn and no node encoded. */
WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id); return 0;
}
/* * Returns an encoded node-id, the valid range is within * [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is * returned if extracted data is wrong.
*/ staticunsignedint
decode_vn_id(unsignedint val)
{ unsignedint node_id = (val >> BITS_PER_BYTE) - 1;
/* Can store U8_MAX [0:254] nodes. */ if (node_id < nr_vmap_nodes) return node_id;
/* If it was _not_ zero, warn. */
WARN_ONCE(node_id != UINT_MAX, "Decode wrong node id (%d)\n", node_id);
return nr_vmap_nodes;
}
staticbool
is_vn_id_valid(unsignedint node_id)
{ if (node_id < nr_vmap_nodes) returntrue;
tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_end > addr) {
va = tmp; if (tmp->va_start <= addr) break;
n = n->rb_left;
} else
n = n->rb_right;
}
return va;
}
/* * Returns a node where a first VA, that satisfies addr < va_end, resides. * If success, a node is locked. A user is responsible to unlock it when a * VA is no longer needed to be accessed. * * Returns NULL if nothing found.
*/ staticstruct vmap_node *
find_vmap_area_exceed_addr_lock(unsignedlong addr, struct vmap_area **va)
{ unsignedlong va_start_lowest; struct vmap_node *vn;
if (*va) if (!va_start_lowest || (*va)->va_start < va_start_lowest)
va_start_lowest = (*va)->va_start;
spin_unlock(&vn->busy.lock);
}
/* * Check if found VA exists, it might have gone away. In this case we * repeat the search because a VA has been removed concurrently and we * need to proceed to the next one, which is a rare case.
*/ if (va_start_lowest) {
vn = addr_to_node(va_start_lowest);
/* * This function returns back addresses of parent node * and its left or right link for further processing. * * Otherwise NULL is returned. In that case all further * steps regarding inserting of conflicting overlap range * have to be declined and actually considered as a bug.
*/ static __always_inline struct rb_node **
find_va_links(struct vmap_area *va, struct rb_root *root, struct rb_node *from, struct rb_node **parent)
{ struct vmap_area *tmp_va; struct rb_node **link;
if (root) {
link = &root->rb_node; if (unlikely(!*link)) {
*parent = NULL; return link;
}
} else {
link = &from;
}
/* * Go to the bottom of the tree. When we hit the last point * we end up with parent rb_node and correct direction, i name * it link, where the new va->rb_node will be attached to.
*/ do {
tmp_va = rb_entry(*link, struct vmap_area, rb_node);
/* * During the traversal we also do some sanity check. * Trigger the BUG() if there are sides(left/right) * or full overlaps.
*/ if (va->va_end <= tmp_va->va_start)
link = &(*link)->rb_left; elseif (va->va_start >= tmp_va->va_end)
link = &(*link)->rb_right; else {
WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
if (unlikely(!parent)) /* * The red-black tree where we try to find VA neighbors * before merging or inserting is empty, i.e. it means * there is no free vmap space. Normally it does not * happen but we handle this case anyway.
*/ return NULL;
list = &rb_entry(parent, struct vmap_area, rb_node)->list; return (&parent->rb_right == link ? list->next : list);
}
static __always_inline void
__link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link,
	struct list_head *head, bool augment)
{
	/*
	 * VA is not yet on the list; derive the list_head it will
	 * follow from the rb-tree parent and insertion direction.
	 */
	if (likely(parent)) {
		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
		if (&parent->rb_right != link)
			head = head->prev;
	}

	/* Hook the node into the rb-tree first. */
	rb_link_node(&va->rb_node, parent, link);
	if (augment) {
		/*
		 * Just perform simple insertion to the tree. We do not
		 * set va->subtree_max_size to its current size before
		 * calling rb_insert_augmented(). It is because we
		 * populate the tree from the bottom to parent levels
		 * when the node _is_ in the tree.
		 *
		 * Therefore we set subtree_max_size to zero after
		 * insertion, to let __augment_tree_propagate_from()
		 * put everything into the correct order later on.
		 */
		rb_insert_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
		va->subtree_max_size = 0;
	} else {
		rb_insert_color(&va->rb_node, root);
	}

	/* Keep the list address-sorted. */
	list_add(&va->list, head);
}
/* * This function populates subtree_max_size from bottom to upper * levels starting from VA point. The propagation must be done * when VA size is modified by changing its va_start/va_end. Or * in case of newly inserting of VA to the tree. * * It means that __augment_tree_propagate_from() must be called: * - After VA has been inserted to the tree(free path); * - After VA has been shrunk(allocation path); * - After VA has been increased(merging path). * * Please note that, it does not mean that upper parent nodes * and their subtree_max_size are recalculated all the time up * to the root node. * * 4--8 * /\ * / \ * / \ * 2--2 8--8 * * For example if we modify the node 4, shrinking it to 2, then * no any modification is required. If we shrink the node 2 to 1 * its subtree_max_size is updated only, and set to 1. If we shrink * the node 8 to 6, then its subtree_max_size is set to 6 and parent * node becomes 4--6.
*/ static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{ /* * Populate the tree from bottom towards the root until * the calculated maximum available size of checked node * is equal to its current one.
*/
free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
/* * Merge de-allocated chunk of VA memory with previous * and next free blocks. If coalesce is not done a new * free area is inserted. If VA has been merged, it is * freed. * * Please note, it can return NULL in case of overlap * ranges, followed by WARN() report. Despite it is a * buggy behaviour, a system can be alive and keep * ongoing.
*/ static __always_inline struct vmap_area *
__merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head, bool augment)
{ struct vmap_area *sibling; struct list_head *next; struct rb_node **link; struct rb_node *parent; bool merged = false;
/* * Find a place in the tree where VA potentially will be * inserted, unless it is merged with its sibling/siblings.
*/
link = find_va_links(va, root, NULL, &parent); if (!link) return NULL;
/* * Get next node of VA to check if merging can be done.
*/
next = get_va_next_sibling(parent, link); if (unlikely(next == NULL)) goto insert;
/* * start end * | | * |<------VA------>|<-----Next----->| * | | * start end
*/ if (next != head) {
sibling = list_entry(next, struct vmap_area, list); if (sibling->va_start == va->va_end) {
sibling->va_start = va->va_start;
/* Point to the new merged area. */
va = sibling;
merged = true;
}
}
/* * start end * | | * |<-----Prev----->|<------VA------>| * | | * start end
*/ if (next->prev != head) {
sibling = list_entry(next->prev, struct vmap_area, list); if (sibling->va_end == va->va_start) { /* * If both neighbors are coalesced, it is important * to unlink the "next" node first, followed by merging * with "previous" one. Otherwise the tree might not be * fully populated if a sibling's augmented value is * "normalized" because of rotation operations.
*/ if (merged)
__unlink_va(va, root, augment);
/* Can be overflowed due to big size or alignment. */ if (nva_start_addr + size < nva_start_addr ||
nva_start_addr < vstart) returnfalse;
return (nva_start_addr + size <= va->va_end);
}
/* * Find the first free block(lowest start address) in the tree, * that will accomplish the request corresponding to passing * parameters. Please note, with an alignment bigger than PAGE_SIZE, * a search length is adjusted to account for worst case alignment * overhead.
*/ static __always_inline struct vmap_area *
find_vmap_lowest_match(struct rb_root *root, unsignedlong size, unsignedlong align, unsignedlong vstart, bool adjust_search_size)
{ struct vmap_area *va; struct rb_node *node; unsignedlong length;
/* Start from the root. */
node = root->rb_node;
/* Adjust the search size for alignment overhead. */
length = adjust_search_size ? size + align - 1 : size;
while (node) {
va = rb_entry(node, struct vmap_area, rb_node);
/* * Does not make sense to go deeper towards the right * sub-tree if it does not have a free block that is * equal or bigger to the requested search length.
*/ if (get_subtree_max_size(node->rb_right) >= length) {
node = node->rb_right; continue;
}
/* * OK. We roll back and find the first right sub-tree, * that will satisfy the search criteria. It can happen * due to "vstart" restriction or an alignment overhead * that is bigger then PAGE_SIZE.
*/ while ((node = rb_parent(node))) {
va = rb_entry(node, struct vmap_area, rb_node); if (is_within_this_va(va, size, align, vstart)) return va;
if (get_subtree_max_size(node->rb_right) >= length &&
vstart <= va->va_start) { /* * Shift the vstart forward. Please note, we update it with * parent's start address adding "1" because we do not want * to enter same sub-tree after it has already been checked * and no suitable free block found there.
*/
vstart = va->va_start + 1;
node = node->rb_right; break;
}
}
}
}
/*
 * Classifies how a requested allocation fits inside an existing
 * free vmap_area, which determines how (or whether) that free
 * block must be split.
 */
enum fit_type {
NOTHING_FIT = 0,
FL_FIT_TYPE = 1, /* full fit */
LE_FIT_TYPE = 2, /* left edge fit */
RE_FIT_TYPE = 3, /* right edge fit */
NE_FIT_TYPE = 4 /* no edge fit */
};
if (type == FL_FIT_TYPE) { /* * No need to split VA, it fully fits. * * | | * V NVA V * |---------------|
*/
unlink_va_augment(va, root);
kmem_cache_free(vmap_area_cachep, va);
} elseif (type == LE_FIT_TYPE) { /* * Split left edge of fit VA. * * | | * V NVA V R * |-------|-------|
*/
va->va_start += size;
} elseif (type == RE_FIT_TYPE) { /* * Split right edge of fit VA. * * | | * L V NVA V * |-------|-------|
*/
va->va_end = nva_start_addr;
} elseif (type == NE_FIT_TYPE) { /* * Split no edge of fit VA. * * | | * L V NVA V R * |---|-------|---|
*/
lva = __this_cpu_xchg(ne_fit_preload_node, NULL); if (unlikely(!lva)) { /* * For percpu allocator we do not do any pre-allocation * and leave it as it is. The reason is it most likely * never ends up with NE_FIT_TYPE splitting. In case of * percpu allocations offsets and sizes are aligned to * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE * are its main fitting cases. * * There are a few exceptions though, as an example it is * a first allocation (early boot up) when we have "one" * big free space that has to be split. * * Also we can hit this path in case of regular "vmap" * allocations, if "this" current CPU was not preloaded. * See the comment in alloc_vmap_area() why. If so, then * GFP_NOWAIT is used instead to get an extra object for * split purpose. That is rare and most time does not * occur. * * What happens if an allocation gets failed. Basically, * an "overflow" path is triggered to purge lazily freed * areas to free some memory, then, the "retry" path is * triggered to repeat one more time. See more details * in alloc_vmap_area() function.
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); if (!lva) return -ENOMEM;
}
/* Check the "vend" restriction. */ if (nva_start_addr + size > vend) return -ERANGE;
/* Update the free vmap_area. */
ret = va_clip(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) return ret;
return nva_start_addr;
}
/* * Returns a start address of the newly allocated area, if success. * Otherwise an error value is returned that indicates failure.
*/ static __always_inline unsignedlong
__alloc_vmap_area(struct rb_root *root, struct list_head *head, unsignedlong size, unsignedlong align, unsignedlong vstart, unsignedlong vend)
{ bool adjust_search_size = true; unsignedlong nva_start_addr; struct vmap_area *va;
/* * Do not adjust when: * a) align <= PAGE_SIZE, because it does not make any sense. * All blocks(their start addresses) are at least PAGE_SIZE * aligned anyway; * b) a short range where a requested size corresponds to exactly * specified [vstart:vend] interval and an alignment > PAGE_SIZE. * With adjusted search length an allocation would not succeed.
*/ if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
adjust_search_size = false;
va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size); if (unlikely(!va)) return -ENOENT;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK if (!IS_ERR_VALUE(nva_start_addr))
find_vmap_lowest_match_check(root, head, size, align); #endif
return nva_start_addr;
}
/* * Free a region of KVA allocated by alloc_vmap_area
*/ staticvoid free_vmap_area(struct vmap_area *va)
{ struct vmap_node *vn = addr_to_node(va->va_start);
/* * Remove from the busy tree/list.
*/
spin_lock(&vn->busy.lock);
unlink_va(va, &vn->busy.root);
spin_unlock(&vn->busy.lock);
/* * Insert/Merge it back to the free tree/list.
*/
spin_lock(&free_vmap_area_lock);
merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
spin_unlock(&free_vmap_area_lock);
}
/* * Preload this CPU with one extra vmap_area object. It is used * when fit type of free area is NE_FIT_TYPE. It guarantees that * a CPU that does an allocation is preloaded. * * We do it in non-atomic context, thus it allows us to use more * permissive allocation masks to be more stable under low memory * condition and high memory pressure.
*/ if (!this_cpu_read(ne_fit_preload_node))
va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
vp = size_to_va_pool(vn, size); if (!vp || list_empty(&vp->head)) return NULL;
spin_lock(&vn->pool_lock); if (!list_empty(&vp->head)) {
va = list_first_entry(&vp->head, struct vmap_area, list);
if (IS_ALIGNED(va->va_start, align)) { /* * Do some sanity check and emit a warning * if one of below checks detects an error.
*/
err |= (va_size(va) != size);
err |= (va->va_start < vstart);
err |= (va->va_end > vend);
if (!WARN_ON_ONCE(err)) {
list_del_init(&va->list);
WRITE_ONCE(vp->len, vp->len - 1);
} else {
va = NULL;
}
} else {
list_move_tail(&va->list, &vp->head);
va = NULL;
}
}
spin_unlock(&vn->pool_lock);
/* * Fallback to a global heap if not vmalloc or there * is only one node.
*/ if (vstart != VMALLOC_START || vend != VMALLOC_END ||
nr_vmap_nodes == 1) return NULL;
/* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. If vm is passed in, the two will also be bound.
*/ staticstruct vmap_area *alloc_vmap_area(unsignedlong size, unsignedlong align, unsignedlong vstart, unsignedlong vend, int node, gfp_t gfp_mask, unsignedlong va_flags, struct vm_struct *vm)
{ struct vmap_node *vn; struct vmap_area *va; unsignedlong freed; unsignedlong addr; unsignedint vn_id; int purged = 0; int ret;
if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align))) return ERR_PTR(-EINVAL);
if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY);
/* Only reclaim behaviour flags are relevant. */
gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
might_sleep();
/* * If a VA is obtained from a global heap(if it fails here) * it is anyway marked with this "vn_id" so it is returned * to this pool's node later. Such way gives a possibility * to populate pools based on users demand. * * On success a ready to go VA is returned.
*/
va = node_alloc(size, align, vstart, vend, &addr, &vn_id); if (!va) {
va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM);
/* * Only scan the relevant parts containing pointers to other objects * to avoid false negatives.
*/
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
}
/* Add @nb to the chain notified when vmap areas are purged. */
int register_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
/* Remove @nb from the vmap purge notifier chain. */
int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
/* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. * * There is a tradeoff here: a larger number will cover more kernel page tables * and take slightly longer to purge, but it will linearly reduce the number of * global TLB flushes that must be performed. It would seem natural to scale * this number up linearly with the number of CPUs (because vmapping activity * could also scale linearly with the number of CPUs), however it is likely * that in practice, workloads might be constrained in other ways that mean * vmap activity will not scale linearly with CPUs. Also, I want to be * conservative and not introduce a big latency on huge systems, so go with * a less aggressive log scale. It will still be an improvement over the old * code, and it will be simple to change the scale factor if we find that it * becomes a problem on bigger systems.
*/ staticunsignedlong lazy_max_pages(void)
{ unsignedint log;
log = fls(num_online_cpus());
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
/* * Serialize vmap purging. There is no actual critical section protected * by this lock, but we want to avoid concurrent calls for performance * reasons and to make the pcpu_get_vm_areas more deterministic.
*/ static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */ staticvoid purge_fragmented_blocks_allcpus(void);
for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
LIST_HEAD(tmp_list);
if (list_empty(&vn->pool[i].head)) continue;
/* Detach the pool, so no-one can access it. */
spin_lock(&vn->pool_lock);
list_replace_init(&vn->pool[i].head, &tmp_list);
spin_unlock(&vn->pool_lock);
/* * Attach the pool back if it has been partly decayed. * Please note, it is supposed that nobody(other contexts) * can populate the pool therefore a simple list replace * operation takes place here.
*/ if (!list_empty(&tmp_list)) {
spin_lock(&vn->pool_lock);
list_replace_init(&tmp_list, &vn->pool[i].head);
WRITE_ONCE(vn->pool[i].len, pool_len);
spin_unlock(&vn->pool_lock);
}
}
end = max(end, list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end);
cpumask_set_cpu(node_to_id(vn), &purge_nodes);
}
nr_purge_nodes = cpumask_weight(&purge_nodes); if (nr_purge_nodes > 0) {
flush_tlb_kernel_range(start, end);
/* One extra worker is per a lazy_max_pages() full set minus one. */
nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages();
nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1;
/* * Free a vmap area, caller ensuring that the area has been unmapped, * unlinked and flush_cache_vunmap had been called for the correct * range previously.
*/ staticvoid free_vmap_area_noflush(struct vmap_area *va)
{ unsignedlong nr_lazy_max = lazy_max_pages(); unsignedlong va_start = va->va_start; unsignedint vn_id = decode_vn_id(va->flags); struct vmap_node *vn; unsignedlong nr_lazy;
/* * If it was request by a certain node we would like to * return it to that node, i.e. its pool for later reuse.
*/
vn = is_vn_id_valid(vn_id) ?
id_to_node(vn_id):addr_to_node(va->va_start);
/* After this point, we may free va at any time */ if (unlikely(nr_lazy > nr_lazy_max))
schedule_work(&drain_vmap_work);
}
/* * Free and unmap a vmap area
*/ staticvoid free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
vunmap_range_noflush(va->va_start, va->va_end); if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
free_vmap_area_noflush(va);
}
struct vmap_area *find_vmap_area(unsignedlong addr)
{ struct vmap_node *vn; struct vmap_area *va; int i, j;
if (unlikely(!vmap_initialized)) return NULL;
/* * An addr_to_node_id(addr) converts an address to a node index * where a VA is located. If VA spans several zones and passed * addr is not the same as va->va_start, what is not common, we * may need to scan extra nodes. See an example: * * <----va----> * -|-----|-----|-----|-----|- * 1 2 0 1 * * VA resides in node 1 whereas it spans 1, 2 an 0. If passed * addr is within 2 or 0 nodes we should do extra work.
*/
i = j = addr_to_node_id(addr); do {
vn = &vmap_nodes[i];
spin_lock(&vn->busy.lock);
va = __find_vmap_area(addr, &vn->busy.root);
spin_unlock(&vn->busy.lock);
if (va) return va;
} while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
staticstruct vmap_area *find_unlink_vmap_area(unsignedlong addr)
{ struct vmap_node *vn; struct vmap_area *va; int i, j;
/* * Check the comment in the find_vmap_area() about the loop.
*/
i = j = addr_to_node_id(addr); do {
vn = &vmap_nodes[i];
spin_lock(&vn->busy.lock);
va = __find_vmap_area(addr, &vn->busy.root); if (va)
unlink_va(va, &vn->busy.root);
spin_unlock(&vn->busy.lock);
if (va) return va;
} while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j);
return NULL;
}
/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

/*
 * Purge threshold to prevent overeager purging of fragmented blocks for
 * regular operations: Purge if vb->free is less than 1/4 of the capacity.
 */
#define VMAP_PURGE_THRESHOLD	(VMAP_BBMAP_BITS / 4)

#define VMAP_RAM		0x1 /* indicates vm_map_ram area*/
#define VMAP_BLOCK		0x2 /* mark out the vmap_block sub-type*/
#define VMAP_FLAGS_MASK		0x3
/* * An xarray requires an extra memory dynamically to * be allocated. If it is an issue, we can use rb-tree * instead.
*/ struct xarray vmap_blocks;
};
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.