/*
 * Due to ordering constraints across the init code for various
 * architectures, hugetlb hstate cmdline parameters can't simply
 * be early_param. early_param might call the setup function
 * before valid hugetlb page sizes are determined, leading to
 * incorrect rejection of valid hugepagesz= options.
 *
 * So, record the parameters early and consume them whenever the
 * init code is ready for them, by calling hugetlb_parse_params().
 */
/* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */
#define HUGE_MAX_CMDLINE_ARGS	(2 * HUGE_MAX_HSTATE + 1)

struct hugetlb_cmdline {
	char *val;
	int (*setup)(char *val);
};
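/*
 * Illustrative example (not part of the original source): a command line
 * such as
 *
 *	default_hugepagesz=1G hugepagesz=1G hugepages=2 hugepagesz=2M hugepages=512
 *
 * is recorded as one hugetlb_cmdline entry per option, each holding the
 * option's value string and its setup callback, and is replayed later by
 * hugetlb_parse_params() once valid huge page sizes are known.
 */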
/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock);
/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes __ro_after_init;
struct mutex *hugetlb_fault_mutex_table __ro_after_init;
	/*
	 * If no pages are used, and no other handles to the subpool
	 * remain, give up any reservations based on minimum size and
	 * free the subpool.
	 */
	if (subpool_is_free(spool)) {
		if (spool->min_hpages != -1)
			hugetlb_acct_memory(spool->hstate,
						-spool->min_hpages);
		kfree(spool);
	}
}
struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
						long min_hpages)
{
	struct hugepage_subpool *spool;

	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
	if (!spool)
		return NULL;
/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;

	if (!spool)
		return ret;

	spin_lock_irq(&spool->lock);

	if (spool->max_hpages != -1) {		/* maximum size accounting */
		if ((spool->used_hpages + delta) <= spool->max_hpages)
			spool->used_hpages += delta;
		else {
			ret = -ENOMEM;
			goto unlock_ret;
		}
	}

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->rsv_hpages) {
		if (delta > spool->rsv_hpages) {
			/*
			 * Asking for more reserves than those already taken on
			 * behalf of subpool.  Return difference.
			 */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;	/* reserves already accounted for */
			spool->rsv_hpages -= delta;
		}
	}
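	/*
	 * Worked example (illustrative, not from the original source): with
	 * min_hpages == 4, rsv_hpages == 4 and a request of delta == 6, the
	 * first 4 pages are covered by the subpool reserve, so rsv_hpages
	 * drops to 0 and ret becomes 6 - 4 == 2: only 2 pages need to be
	 * charged against the global pool by the caller.
	 */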
/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;
	unsigned long flags;

	if (!spool)
		return delta;

	spin_lock_irqsave(&spool->lock, flags);

	if (spool->max_hpages != -1)		/* maximum size accounting */
		spool->used_hpages -= delta;

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
		if (spool->rsv_hpages + delta <= spool->min_hpages)
			ret = 0;
		else
			ret = spool->rsv_hpages + delta - spool->min_hpages;
	/*
	 * vma_lock structure may or may not be released as a result of put,
	 * it certainly will no longer be attached to vma so clear pointer.
	 * Semaphore synchronizes access to vma_lock->vma field.
	 */
	vma_lock->vma = NULL;
	vma->vm_private_data = NULL;
	up_write(&vma_lock->rw_sema);
	kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
}
	/* Only establish in (flags) sharable vmas */
	if (!vma || !(vma->vm_flags & VM_MAYSHARE))
		return;

	/* Should never get here with non-NULL vm_private_data */
	if (vma->vm_private_data)
		return;

	vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
	if (!vma_lock) {
		/*
		 * If we can not allocate structure, then vma can not
		 * participate in pmd sharing.  This is only a possible
		 * performance enhancement and memory saving issue.
		 * However, the lock is also used to synchronize page
		 * faults with truncation.  If the lock is not present,
		 * unlikely races could leave pages in a file past i_size
		 * until the file is removed.  Warn in the unlikely case of
		 * allocation failure.
		 */
		pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
		return;
	}
/*
 * Helper that removes a struct file_region from the resv_map cache and returns
 * it for use.
 */
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
	struct file_region *nrg;
/* Helper that records hugetlb_cgroup uncharge info. */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
						struct hstate *h,
						struct resv_map *resv,
						struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (h_cg) {
		nrg->reservation_counter =
			&h_cg->rsvd_hugepage[hstate_index(h)];
		nrg->css = &h_cg->css;
		/*
		 * The caller will hold exactly one h_cg->css reference for the
		 * whole contiguous reservation region. But this area might be
		 * scattered when there are already some file_regions reside in
		 * it. As a result, many file_regions may share only one css
		 * reference. In order to ensure that one file_region must hold
		 * exactly one h_cg->css reference, we should do css_get for
		 * each file_region and leave the reference held by caller
		 * untouched.
		 */
		css_get(&h_cg->css);
		if (!resv->pages_per_hpage)
			resv->pages_per_hpage = pages_per_huge_page(h);
		/* pages_per_hpage should be the same for all entries in
		 * a resv_map.
		 */
		VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
	} else {
		nrg->reservation_counter = NULL;
		nrg->css = NULL;
	}
#endif
}
/*
 * Must be called with resv->lock held.
 *
 * Calling this with regions_needed != NULL will count the number of pages
 * to be added but will not modify the linked list. In that case, regions_needed
 * will indicate the number of file_regions needed in the cache to add the
 * regions for this range.
 */
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
				     struct hugetlb_cgroup *h_cg,
				     struct hstate *h, long *regions_needed)
{
	long add = 0;
	struct list_head *head = &resv->regions;
	long last_accounted_offset = f;
	struct file_region *iter, *trg = NULL;
	struct list_head *rg = NULL;

	if (regions_needed)
		*regions_needed = 0;

	/* In this loop, we essentially handle an entry for the range
	 * [last_accounted_offset, iter->from), at every iteration, with some
	 * bounds checking.
	 */
	list_for_each_entry_safe(iter, trg, head, link) {
		/* Skip irrelevant regions that start before our range. */
		if (iter->from < f) {
			/* If this region ends after the last accounted offset,
			 * then we need to update last_accounted_offset.
			 */
			if (iter->to > last_accounted_offset)
				last_accounted_offset = iter->to;
			continue;
		}

		/* When we find a region that starts beyond our range, we've
		 * finished.
		 */
		if (iter->from >= t) {
			rg = iter->link.prev;
			break;
		}

		/* Add an entry for last_accounted_offset -> iter->from, and
		 * update last_accounted_offset.
		 */
		if (iter->from > last_accounted_offset)
			add += hugetlb_resv_map_add(resv, iter->link.prev,
						    last_accounted_offset,
						    iter->from, h, h_cg,
						    regions_needed);

		last_accounted_offset = iter->to;
	}

	/* Handle the case where our range extends beyond
	 * last_accounted_offset.
	 */
	if (!rg)
		rg = head->prev;
	if (last_accounted_offset < t)
		add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
					    t, h, h_cg, regions_needed);

	return add;
}
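/*
 * Worked example (illustrative, not from the original source): if the reserve
 * map already holds [0, 3) and [5, 8), then add_reservation_in_range(resv, 2,
 * 6, ...) only covers the gap [3, 5): it returns 2 and, in counting mode,
 * reports that a single file_region entry is needed.
 */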
/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
 */
static int allocate_file_region_entries(struct resv_map *resv,
					int regions_needed)
	__must_hold(&resv->lock)
{
	LIST_HEAD(allocated_regions);
	int to_allocate = 0, i = 0;
	struct file_region *trg = NULL, *rg = NULL;

	VM_BUG_ON(regions_needed < 0);

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * the number of in progress add operations plus regions_needed.
	 *
	 * This is a while loop because when we drop the lock, some other call
	 * to region_add or region_del may have consumed some region_entries,
	 * so we keep looping here until we finally have enough entries for
	 * (adds_in_progress + regions_needed).
	 */
	while (resv->region_cache_count <
	       (resv->adds_in_progress + regions_needed)) {
		to_allocate = resv->adds_in_progress + regions_needed -
			      resv->region_cache_count;

		/* At this point, we should have enough entries in the cache
		 * for all the existing adds_in_progress. We should only be
		 * needing to allocate for regions_needed.
		 */
		VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);

		spin_unlock(&resv->lock);
		for (i = 0; i < to_allocate; i++) {
			trg = kmalloc(sizeof(*trg), GFP_KERNEL);
			if (!trg)
				goto out_of_memory;
			list_add(&trg->link, &allocated_regions);
		}
/*
 * Add the huge page range represented by [f, t) to the reserve
 * map.  Regions will be taken from the cache to fill in this range.
 * Sufficient regions should exist in the cache due to the previous
 * call to region_chg with the same range, but in some cases the cache will not
 * have sufficient entries due to races with other code doing region_add or
 * region_del.  The extra needed entries will be allocated.
 *
 * regions_needed is the out value provided by a previous call to region_chg.
 *
 * Return the number of new huge pages added to the map.  This number is greater
 * than or equal to zero.  If file_region entries needed to be allocated for
 * this operation and we were not able to allocate, it returns -ENOMEM.
 * region_add of regions of length 1 never allocate file_regions and cannot
 * fail; region_chg will always allocate at least 1 entry and a region_add for
 * 1 page will only require at most 1 entry.
 */
static long region_add(struct resv_map *resv, long f, long t,
		       long in_regions_needed, struct hstate *h,
		       struct hugetlb_cgroup *h_cg)
{
	long add = 0, actual_regions_needed = 0;

	spin_lock(&resv->lock);
retry:

	/* Count how many regions are actually needed to execute this add. */
	add_reservation_in_range(resv, f, t, NULL, NULL,
				 &actual_regions_needed);

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * this add operation. Note that actual_regions_needed may be greater
	 * than in_regions_needed, as the resv_map may have been modified since
	 * the region_chg call. In this case, we need to make sure that we
	 * allocate extra entries, such that we have enough for all the
	 * existing adds_in_progress, plus the excess needed for this
	 * operation.
	 */
	if (actual_regions_needed > in_regions_needed &&
	    resv->region_cache_count <
		resv->adds_in_progress +
			(actual_regions_needed - in_regions_needed)) {
		/* region_add operation of range 1 should never need to
		 * allocate file_region entries.
		 */
		VM_BUG_ON(t - f <= 1);

		if (allocate_file_region_entries(
			    resv, actual_regions_needed - in_regions_needed)) {
			return -ENOMEM;
		}
/*
 * Examine the existing reserve map and determine how many
 * huge pages in the specified range [f, t) are NOT currently
 * represented.  This routine is called before a subsequent
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t).  region_chg does
 * not change the number of huge pages represented by the
 * map.  A number of new file_region structures is added to the cache as a
 * placeholder, for the subsequent region_add call to use. At least 1
 * file_region structure is added.
 *
 * out_regions_needed is the number of regions added to the
 * resv->adds_in_progress.  This value needs to be provided to a follow up call
 * to region_add or region_abort for proper accounting.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t).  This number is greater than or equal
 * to zero.  -ENOMEM is returned if a new file_region structure or cache entry
 * is needed and can not be allocated.
 */
static long region_chg(struct resv_map *resv, long f, long t,
		       long *out_regions_needed)
{
	long chg = 0;

	spin_lock(&resv->lock);

	/* Count how many hugepages in this range are NOT represented. */
	chg = add_reservation_in_range(resv, f, t, NULL, NULL,
				       out_regions_needed);

	if (*out_regions_needed == 0)
		*out_regions_needed = 1;

	if (allocate_file_region_entries(resv, *out_regions_needed))
		return -ENOMEM;

	resv->adds_in_progress += *out_regions_needed;

	spin_unlock(&resv->lock);
	return chg;
}
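/*
 * Typical calling pattern (illustrative sketch, not part of the original
 * source): a reservation is first sized with region_chg(), and the pending
 * add is later either committed with region_add() or cancelled with
 * region_abort(), passing back the same regions_needed value for accounting:
 *
 *	chg = region_chg(resv, f, t, &regions_needed);
 *	if (chg < 0)
 *		return chg;
 *	...
 *	if (charge_succeeded)		// hypothetical condition
 *		region_add(resv, f, t, regions_needed, h, h_cg);
 *	else
 *		region_abort(resv, f, t, regions_needed);
 */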
/*
 * Abort the in progress add operation.  The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add.  Operations are sometimes
 * aborted after the call to region_chg.  In such cases, region_abort
 * is called to decrement the adds_in_progress counter. regions_needed
 * is the value returned by the region_chg call, it is used to decrement
 * the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine.  They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
static void region_abort(struct resv_map *resv, long f, long t,
			 long regions_needed)
{
	spin_lock(&resv->lock);
	VM_BUG_ON(!resv->region_cache_count);
	resv->adds_in_progress -= regions_needed;
	spin_unlock(&resv->lock);
}
/*
 * Delete the specified range [f, t) from the reserve map.  If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted.  Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more.  In the
 * case where a region must be split, a new region descriptor must
 * be allocated.  If the allocation fails, -ENOMEM will be returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and possibly return -ENOMEM.  Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
 */
static long region_del(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *trg;
	struct file_region *nrg = NULL;
	long del = 0;

retry:
	spin_lock(&resv->lock);
	list_for_each_entry_safe(rg, trg, head, link) {
		/*
		 * Skip regions before the range to be deleted.  file_region
		 * ranges are normally of the form [from, to).  However, there
		 * may be a "placeholder" entry in the map which is of the form
		 * (from, to) with from == to.  Check for placeholder entries
		 * at the beginning of the range to be deleted.
		 */
		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
			continue;

		if (rg->from >= t)
			break;

		if (f > rg->from && t < rg->to) { /* Must split region */
			/*
			 * Check for an entry in the cache before dropping
			 * lock and attempting allocation.
			 */
			if (!nrg &&
			    resv->region_cache_count > resv->adds_in_progress) {
				nrg = list_first_entry(&resv->region_cache,
						       struct file_region,
						       link);
				list_del(&nrg->link);
				resv->region_cache_count--;
			}

			if (!nrg) {
				spin_unlock(&resv->lock);
				nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
				if (!nrg)
					return -ENOMEM;
				goto retry;
			}

			del += t - f;
			hugetlb_cgroup_uncharge_file_region(
				resv, rg, t - f, false);

			/* New entry for end of split region */
			nrg->from = t;
			nrg->to = rg->to;
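			/*
			 * Worked example (illustrative, not from the original
			 * source): deleting [2, 4) from an existing region
			 * [0, 8) takes this split path: del grows by
			 * t - f == 2 and nrg is set up to cover the tail
			 * [4, 8); the original entry is then trimmed (in code
			 * not shown here) to [0, 2).
			 */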
/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was free'ed
 * and removed from the page cache.  This routine will adjust the subpool
 * usage count, and the global reserve count if needed.  By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 */
void hugetlb_fix_reserve_counts(struct inode *inode)
{
	struct hugepage_subpool *spool = subpool_inode(inode);
	long rsv_adjust;
	bool reserved = false;

	if (!reserved)
		pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}
/*
 * Count and return the number of huge pages in the reserve map
 * that intersect with the range [f, t).
 */
static long region_count(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg;
	long chg = 0;

	spin_lock(&resv->lock);
	/* Locate each segment we overlap with, and count that overlap. */
	list_for_each_entry(rg, head, link) {
		long seg_from;
		long seg_to;

		if (rg->to <= f)
			continue;
		if (rg->from >= t)
			break;
/*
 * Convert the address within this vma to the page offset within
 * the mapping, in huge page units here.
 */
static pgoff_t vma_hugecache_offset(struct hstate *h,
			struct vm_area_struct *vma, unsigned long address)
{
	return ((address - vma->vm_start) >> huge_page_shift(h)) +
			(vma->vm_pgoff >> huge_page_order(h));
}
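/*
 * Worked example (illustrative, assuming 2 MB huge pages and a 4 KB base page
 * size): huge_page_shift() is 21 and huge_page_order() is 9, so a fault
 * address 4 MB past vm_start in a vma with vm_pgoff == 0 maps to offset
 * (4 MB >> 21) + 0 == 2, i.e. the third huge page of the mapping.
 */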
/**
 * vma_kernel_pagesize - Page size granularity for this VMA.
 * @vma: The user mapping.
 *
 * Folios in this VMA will be aligned to, and at least the size of the
 * number of bytes returned by this function.
 *
 * Return: The default size of the folios allocated when backing a VMA.
 */
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
	if (vma->vm_ops && vma->vm_ops->pagesize)
		return vma->vm_ops->pagesize(vma);

	return PAGE_SIZE;
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
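/*
 * Illustrative example (not part of the original source): a hugetlbfs vma
 * backed by 1 GB pages reports 1 GB through its ->pagesize() op, while an
 * ordinary anonymous vma has no ->pagesize() op and falls back to PAGE_SIZE.
 */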
/*
 * Return the page size being used by the MMU to back a VMA. In the majority
 * of cases, the page size used by the kernel matches the MMU size. On
 * architectures where it differs, an architecture-specific 'strong'
 * version of this symbol is required.
 */
__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	return vma_kernel_pagesize(vma);
}
/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have their future faults succeed.
 *
 * With the exception of hugetlb_dup_vma_private() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file, this region map represents the backing file
 * pages which have ever had a reservation assigned, and this persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it, this region map represents those offsets which have consumed
 * reservation, i.e. where pages have been instantiated.
 */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	return (unsigned long)vma->vm_private_data;
}
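/*
 * Illustrative sketch (not part of the original source): because the
 * reservation map is at least pointer-aligned, its two low bits are always
 * clear, so vm_private_data can carry both the pointer and the flags, e.g.:
 *
 *	vma->vm_private_data = (void *)((unsigned long)resv_map |
 *					HPAGE_RESV_OWNER);
 *	map   = (struct resv_map *)(get_vma_private_data(vma) &
 *					~HPAGE_RESV_MASK);
 *	owner = get_vma_private_data(vma) & HPAGE_RESV_OWNER;
 */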
	resv_map->adds_in_progress = 0;
	/*
	 * Initialize these to 0. On shared mappings, 0's here indicate these
	 * fields don't do cgroup accounting. On private mappings, these will be
	 * re-initialized to the proper values, to indicate that hugetlb cgroup
	 * reservations are to be un-charged from here.
	 */
	resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);

	/* Clear out any active regions before we release the map. */
	region_del(resv_map, 0, LONG_MAX);

	/* ... and any entries left in the cache */
	list_for_each_entry_safe(rg, trg, head, link) {
		list_del(&rg->link);
		kfree(rg);
	}

	VM_BUG_ON(resv_map->adds_in_progress);

	kfree(resv_map);
}
static inline struct resv_map *inode_resv_map(struct inode *inode)
{
	/*
	 * At inode evict time, i_mapping may not point to the original
	 * address space within the inode.  This original address space
	 * contains the pointer to the resv_map.  So, always use the
	 * address space embedded within the inode.
	 * The VERY common case is inode->mapping == &inode->i_data but,
	 * this may not be true for device special inodes.
	 */
	return (struct resv_map *)(&inode->i_data)->i_private_data;
}
void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	/*
	 * Clear vm_private_data
	 * - For shared mappings this is a per-vma semaphore that may be
	 *   allocated in a subsequent call to hugetlb_vm_op_open.
	 *   Before clearing, make sure pointer is not associated with vma
	 *   as this will leak the structure.  This is the case when called
	 *   via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already
	 *   been called to allocate a new structure.
	 * - For MAP_PRIVATE mappings, this is the reserve map which does
	 *   not apply to children.  Faults generated by the children are
	 *   not guaranteed to succeed, even if read-only.
	 */
	if (vma->vm_flags & VM_MAYSHARE) {
		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
/*
 * Reset and decrement one ref on hugepage private reservation.
 * Called with mm->mmap_lock writer semaphore held.
 * This function should be only used by mremap and operate on
 * same sized vma. It should never come here with last ref on the
 * reservation.
 */
void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
{
	/*
	 * Clear the old hugetlb private page reservation.
	 * It has already been transferred to new_vma.
	 *
	 * During a mremap() operation of a hugetlb vma we call move_vma()
	 * which copies vma into new_vma and unmaps vma. After the copy
	 * operation both new_vma and vma share a reference to the resv_map
	 * struct, and at that point vma is about to be unmapped. We don't
	 * want to return the reservation to the pool at unmap of vma because
	 * the reservation still lives on in new_vma, so simply decrement the
	 * ref here and remove the resv_map reference from this vma.
	 */
	struct resv_map *reservations = vma_resv_map(vma);

	if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
		kref_put(&reservations->refs, resv_map_release);
	}

	hugetlb_dup_vma_private(vma);
}
static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
{
	int nid = folio_nid(folio);
		if (!cpuset_zone_allowed(zone, gfp_mask))
			continue;
		/*
		 * no need to ask again on the same node. Pool is node rather than
		 * zone aware
		 */
		if (zone_to_nid(zone) == node)
			continue;
		node = zone_to_nid(zone);

		folio = dequeue_hugetlb_folio_node_exact(h, node);
		if (folio)
			return folio;
	}
	if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;
	/*
	 * gbl_chg==1 means the allocation requires a new page that was not
	 * reserved before.  Make sure there is at least one free page.
	 */
	if (gbl_chg && !available_huge_pages(h))
		goto err;

	gfp_mask = htlb_alloc_mask(h);
	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);

	if (mpol_is_preferred_many(mpol)) {
		folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
							nid, nodemask);

		/* Fallback to all nodes if page==NULL */
		nodemask = NULL;
	}

	if (!folio)
		folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
							nid, nodemask);

	mpol_cond_put(mpol);
	return folio;

err:
	return NULL;
}
/*
 * common helper functions for hstate_next_node_to_{alloc|free}.
 * We may have allocated or freed a huge page based on a different
 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
 * be outside of *nodes_allowed.  Ensure that we use an allowed
 * node for alloc or free.
 */
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	nid = next_node_in(nid, *nodes_allowed);
	VM_BUG_ON(nid >= MAX_NUMNODES);

	return nid;
}

static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	if (!node_isset(nid, *nodes_allowed))
		nid = next_node_allowed(nid, nodes_allowed);
	return nid;
}
/*
 * returns the previously saved node ["this node"] from which to
 * allocate a persistent huge page for the pool and advance the
 * next node from which to allocate, handling wrap at end of node
 * mask.
 */
static int hstate_next_node_to_alloc(int *next_node,
					nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(*next_node, nodes_allowed);
	*next_node = next_node_allowed(nid, nodes_allowed);

	return nid;
}
/*
 * helper for remove_pool_hugetlb_folio() - return the previously saved
 * node ["this node"] from which to free a huge page.  Advance the
 * next node id whether or not we find a free huge page to free so
 * that the next attempt to free addresses the next node.
 */
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
/*
 * Remove hugetlb folio from lists.
 * If vmemmap exists for the folio, clear the hugetlb flag so that the
 * folio appears as just a compound page.  Otherwise, wait until after
 * allocating vmemmap to clear the flag.
 *
 * Must be called with hugetlb lock held.
 */
static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
							bool adjust_surplus)
{
	int nid = folio_nid(folio);

	lockdep_assert_held(&hugetlb_lock);
	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
		return;

	list_del(&folio->lru);

	if (folio_test_hugetlb_freed(folio)) {
		folio_clear_hugetlb_freed(folio);
		h->free_huge_pages--;
		h->free_huge_pages_node[nid]--;
	}
	if (adjust_surplus) {
		h->surplus_huge_pages--;
		h->surplus_huge_pages_node[nid]--;
	}

	/*
	 * We can only clear the hugetlb flag after allocating vmemmap
	 * pages.  Otherwise, someone (memory error handling) may try to write
	 * to tail struct pages.
	 */
	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		__folio_clear_hugetlb(folio);
	if (adjust_surplus) {
		h->surplus_huge_pages++;
		h->surplus_huge_pages_node[nid]++;
	}

	__folio_set_hugetlb(folio);
	folio_change_private(folio, NULL);
	/*
	 * We have to set hugetlb_vmemmap_optimized again as above
	 * folio_change_private(folio, NULL) cleared it.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);
	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
		return;

	/*
	 * If we don't know which subpages are hwpoisoned, we can't free
	 * the hugepage, so it's leaked intentionally.
	 */
	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
		return;

	/*
	 * If folio is not vmemmap optimized (!clear_flag), then the folio
	 * is no longer identified as a hugetlb page.  hugetlb_vmemmap_restore_folio
	 * can only be passed hugetlb pages and will BUG otherwise.
	 */
	if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) {
		spin_lock_irq(&hugetlb_lock);
		/*
		 * If we cannot allocate vmemmap pages, just refuse to free the
		 * page and put the page back on the hugetlb free list and treat
		 * as a surplus page.
		 */
		add_hugetlb_folio(h, folio, true);
		spin_unlock_irq(&hugetlb_lock);
		return;
	}

	/*
	 * If vmemmap pages were allocated above, then we need to clear the
	 * hugetlb flag under the hugetlb lock.
	 */
	if (folio_test_hugetlb(folio)) {
		spin_lock_irq(&hugetlb_lock);
		__folio_clear_hugetlb(folio);
		spin_unlock_irq(&hugetlb_lock);
	}

	/*
	 * Move PageHWPoison flag from head page to the raw error pages,
	 * which makes any healthy subpages reusable.
	 */
	if (unlikely(folio_test_hwpoison(folio)))
		folio_clear_hugetlb_hwpoison(folio);

	folio_ref_unfreeze(folio, 1);

	hugetlb_free_folio(folio);
}
/*
 * Because update_and_free_hugetlb_folio() can be called from any context,
 * we cannot use GFP_KERNEL to allocate vmemmap pages. However, we can defer
 * the actual freeing in a workqueue to prevent from using GFP_ATOMIC to
 * allocate the vmemmap pages.
 *
 * free_hpage_workfn() locklessly retrieves the linked list of pages to be
 * freed and frees them one-by-one. As the page->mapping pointer is going
 * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
 * structure of a lockless linked list of huge pages to be freed.
 */
static LLIST_HEAD(hpage_freelist);

	while (node) {
		struct folio *folio;
		struct hstate *h;

		folio = container_of((struct address_space **)node,
				     struct folio, mapping);
		node = node->next;
		folio->mapping = NULL;
		/*
		 * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
		 * folio_hstate() is going to trigger because a previous call to
		 * remove_hugetlb_folio() will clear the hugetlb bit, so do
		 * not use folio_hstate() directly.
		 */
		h = size_to_hstate(folio_size(folio));

	/*
	 * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
	 *
	 * Only call schedule_work() if hpage_freelist is previously
	 * empty. Otherwise, schedule_work() had been called but the workfn
	 * hasn't retrieved the list yet.
	 */
	if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist))
		schedule_work(&free_hpage_work);
}
	if (!list_empty(non_hvo_folios)) {
		/*
		 * Free any restored hugetlb pages so that restore of the
		 * entire list can be retried.
		 * The idea is that in the common case of ENOMEM errors freeing
		 * hugetlb pages with vmemmap we will free up memory so that we
		 * can allocate vmemmap for more hugetlb pages.
		 */
		list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) {
			list_del(&folio->lru);
			spin_lock_irq(&hugetlb_lock);
			__folio_clear_hugetlb(folio);
			spin_unlock_irq(&hugetlb_lock);
			update_and_free_hugetlb_folio(h, folio, false);
			cond_resched();
		}
	} else {
		/*
		 * In the case where there are no folios which can be
		 * immediately freed, we loop through the list trying to restore
		 * vmemmap individually in the hope that someone elsewhere may
		 * have done something to cause success (such as freeing some
		 * memory).  If unable to restore a hugetlb page, the hugetlb
		 * page is made a surplus page and removed from the list.
		 * If we are able to restore vmemmap and free one hugetlb page,
		 * we quit processing the list to retry the bulk operation.
		 */
		list_for_each_entry_safe(folio, t_folio, folio_list, lru)
			if (hugetlb_vmemmap_restore_folio(h, folio)) {
				list_del(&folio->lru);
				spin_lock_irq(&hugetlb_lock);
				add_hugetlb_folio(h, folio, true);
				spin_unlock_irq(&hugetlb_lock);
			} else {
				list_del(&folio->lru);
				spin_lock_irq(&hugetlb_lock);
				__folio_clear_hugetlb(folio);
				spin_unlock_irq(&hugetlb_lock);
				update_and_free_hugetlb_folio(h, folio, false);
				cond_resched();
				break;
			}
	}
}
	/*
	 * First allocate required vmemmap (if necessary) for all folios.
	 * Carefully handle errors and free up any available hugetlb pages
	 * in an effort to make forward progress.
	 */
retry:
	ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios);
	if (ret < 0) {
		bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios);
		goto retry;
	}

	/*
	 * At this point, list should be empty, ret should be >= 0 and there
	 * should only be pages on the non_hvo_folios list.
	 * Do note that the non_hvo_folios list could be empty.
	 * Without HVO enabled, ret will be 0 and there is no need to call
	 * __folio_clear_hugetlb as this was done previously.
	 */
	VM_WARN_ON(!list_empty(folio_list));
	VM_WARN_ON(ret < 0);
	if (!list_empty(&non_hvo_folios) && ret) {
		spin_lock_irq(&hugetlb_lock);
		list_for_each_entry(folio, &non_hvo_folios, lru)
			__folio_clear_hugetlb(folio);
		spin_unlock_irq(&hugetlb_lock);
	}
void free_huge_folio(struct folio *folio)
{
	/*
	 * Can't pass hstate in here because it is called from the
	 * generic mm code.
	 */
	struct hstate *h = folio_hstate(folio);
	int nid = folio_nid(folio);
	struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
	bool restore_reserve;
	unsigned long flags;

	/*
	 * If HPageRestoreReserve was set on page, page allocation consumed a
	 * reservation.  If the page was associated with a subpool, there
	 * would have been a page reserved in the subpool before allocation
	 * via hugepage_subpool_get_pages().  Since we are 'restoring' the
	 * reservation, do not call hugepage_subpool_put_pages() as this will
	 * remove the reserved page from the subpool.
	 */
	if (!restore_reserve) {
		/*
		 * A return code of zero implies that the subpool will be
		 * under its minimum size if the reservation is not restored
		 * after page is free.  Therefore, force restore_reserve
		 * operation.
		 */
		if (hugepage_subpool_put_pages(spool, 1) == 0)
			restore_reserve = true;
	}

	if (folio_test_hugetlb_temporary(folio)) {
		remove_hugetlb_folio(h, folio, false);
		spin_unlock_irqrestore(&hugetlb_lock, flags);
		update_and_free_hugetlb_folio(h, folio, true);
	} else if (h->surplus_huge_pages_node[nid]) {
		/* remove the page from active list */
		remove_hugetlb_folio(h, folio, true);
		spin_unlock_irqrestore(&hugetlb_lock, flags);
		update_and_free_hugetlb_folio(h, folio, true);
	} else {
		arch_clear_hugetlb_flags(folio);
		enqueue_hugetlb_folio(h, folio);
		spin_unlock_irqrestore(&hugetlb_lock, flags);
	}
}
/*
 * Must be called with the hugetlb lock held
 */
static void __prep_account_new_huge_page(struct hstate *h, int nid)
{
	lockdep_assert_held(&hugetlb_lock);
	h->nr_huge_pages++;
	h->nr_huge_pages_node[nid]++;
}
/*
 * Find and lock address space (mapping) in write mode.
 *
 * Upon entry, the folio is locked which means that folio_mapping() is
 * stable.  Due to locking order, we can only trylock_write.  If we can
 * not get the lock, simply return NULL to caller.
 */
struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
{
	struct address_space *mapping = folio_mapping(folio);

	if (!mapping)
		return mapping;

	if (i_mmap_trylock_write(mapping))
		return mapping;

	return NULL;
}
static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
		gfp_t gfp_mask, int nid, nodemask_t *nmask,
		nodemask_t *node_alloc_noretry)
{
	int order = huge_page_order(h);
	struct folio *folio;
	bool alloc_try_hard = true;

	/*
	 * By default we always try hard to allocate the folio with
	 * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating folios in
	 * a loop (to adjust global huge page counts) and previous allocation
	 * failed, do not continue to try hard on the same node.  Use the
	 * node_alloc_noretry bitmap to manage this state information.
	 */
	if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
		alloc_try_hard = false;
	if (alloc_try_hard)
		gfp_mask |= __GFP_RETRY_MAYFAIL;
	if (nid == NUMA_NO_NODE)
		nid = numa_mem_id();

	/*
	 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a
	 * folio this indicates an overall state change.  Clear bit so
	 * that we resume normal 'try hard' allocations.
	 */
	if (node_alloc_noretry && folio && !alloc_try_hard)
		node_clear(nid, *node_alloc_noretry);

	/*
	 * If we tried hard to get a folio but failed, set bit so that
	 * subsequent attempts will not try as hard until there is an
	 * overall state change.
	 */
	if (node_alloc_noretry && !folio && alloc_try_hard)
		node_set(nid, *node_alloc_noretry);

	if (!folio) {
		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
		return NULL;
	}
/*
 * Common helper to allocate a fresh hugetlb page. All specific allocators
 * should use this function to get new hugetlb pages
 *
 * Note that returned page is 'frozen':  ref count of head page and all tail
 * pages is zero.
 */
static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
		gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
	struct folio *folio;

	if (hstate_is_gigantic(h))
		folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
	else
		folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
	if (!folio)
		return NULL;
	/* Send list for bulk vmemmap optimization processing */
	hugetlb_vmemmap_optimize_folios(h, folio_list);

	/* Add all new pool pages to free lists in one lock cycle */
	spin_lock_irqsave(&hugetlb_lock, flags);
	list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
		__prep_account_new_huge_page(h, folio_nid(folio));
		enqueue_hugetlb_folio(h, folio);
	}
	spin_unlock_irqrestore(&hugetlb_lock, flags);
}
/*
 * Allocates a fresh hugetlb page in a node interleaved manner.  The page
 * will later be added to the appropriate hugetlb pool.
 */
static struct folio *alloc_pool_huge_folio(struct hstate *h,
					nodemask_t *nodes_allowed,
					nodemask_t *node_alloc_noretry,
					int *next_node)
{
	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
	int nr_nodes, node;
/*
 * Remove huge page from pool from next node to free.  Attempt to keep
 * persistent huge pages more or less balanced over allowed nodes.
 * This routine only 'removes' the hugetlb page.  The caller must make
 * an additional call to free the page to low level allocators.
 * Called with hugetlb_lock locked.
 */
static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
		nodemask_t *nodes_allowed, bool acct_surplus)
{
	int nr_nodes, node;
	struct folio *folio = NULL;
/*
 * Dissolve a given free hugetlb folio into free buddy pages. This function
 * does nothing for in-use hugetlb folios and non-hugetlb folios.
 * This function returns values like below:
 *
 *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
 *           when the system is under memory pressure and the feature of
 *           freeing unused vmemmap pages associated with each hugetlb page
 *           is enabled.
 *  -EBUSY:  failed to dissolve free hugepages or the hugepage is in-use
 *           (allocated or reserved.)
 *       0:  successfully dissolved free hugepages or the page is not a
 *           hugepage (considered as already dissolved)
 */
int dissolve_free_hugetlb_folio(struct folio *folio)
{
	int rc = -EBUSY;

retry:
	/* Not to disrupt normal path by vainly holding hugetlb_lock */
	if (!folio_test_hugetlb(folio))
		return 0;

	spin_lock_irq(&hugetlb_lock);
	if (!folio_test_hugetlb(folio)) {
		rc = 0;
		goto out;
	}

	/*
	 * We should make sure that the page is already on the free list
	 * when it is dissolved.
	 */
	if (unlikely(!folio_test_hugetlb_freed(folio))) {
		spin_unlock_irq(&hugetlb_lock);
		cond_resched();

		/*
		 * Theoretically, we should return -EBUSY when we
		 * encounter this race. In fact, we have a chance
		 * to successfully dissolve the page if we do a
		 * retry. Because the race window is quite small.
		 * If we seize this opportunity, it is an optimization
		 * for increasing the success rate of dissolving page.
		 */
		goto retry;
	}

	if (h->surplus_huge_pages_node[folio_nid(folio)])
		adjust_surplus = true;
	remove_hugetlb_folio(h, folio, adjust_surplus);
	h->max_huge_pages--;
	spin_unlock_irq(&hugetlb_lock);

	/*
	 * Normally update_and_free_hugetlb_folio will allocate required vmemmap
	 * before freeing the page.  update_and_free_hugetlb_folio will fail to
	 * free the page if it can not allocate required vmemmap.  We
	 * need to adjust max_huge_pages if the page is not freed.
	 * Attempt to allocate vmemmap here so that we can take
	 * appropriate action on failure.
	 *
	 * The folio_test_hugetlb check here is because
	 * remove_hugetlb_folio will clear hugetlb folio flag for
	 * non-vmemmap optimized hugetlb folios.
	 */
	if (folio_test_hugetlb(folio)) {
		rc = hugetlb_vmemmap_restore_folio(h, folio);
		if (rc) {
			spin_lock_irq(&hugetlb_lock);
			add_hugetlb_folio(h, folio, adjust_surplus);
			h->max_huge_pages++;
			goto out;
		}
	} else
		rc = 0;
/*
 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
 * make specified memory blocks removable from the system.
 * Note that this will dissolve a free gigantic hugepage completely, if any
 * part of it lies within the given range.
 * Also note that if dissolve_free_hugetlb_folio() returns with an error, all
 * free hugetlb folios that were dissolved before that error are lost.
 */
int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct folio *folio;
	int rc = 0;
	unsigned int order;
	struct hstate *h;

	if (!hugepages_supported())
		return rc;

	order = huge_page_order(&default_hstate);
	for_each_hstate(h)
		order = min(order, huge_page_order(h));
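	/*
	 * Illustrative note (not part of the original source, assuming a 4 KB
	 * base page size): on x86-64 with both 2 MB and 1 GB hstates
	 * configured, the minimum order computed above is 9, so the pfn scan
	 * that follows (not shown here) can step through the range in
	 * 2 MB-sized increments.
	 */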
	spin_lock_irq(&hugetlb_lock);
	/*
	 * nr_huge_pages needs to be adjusted within the same lock cycle
	 * as surplus_pages, otherwise it might confuse
	 * persistent_huge_pages() momentarily.
	 */
	__prep_account_new_huge_page(h, folio_nid(folio));

	/*
	 * We could have raced with the pool size change.
	 * Double check that and simply deallocate the new page
	 * if we would end up overcommitting the surpluses. Abuse
	 * temporary page to workaround the nasty free_huge_folio
	 * codeflow.
	 */
	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
		folio_set_hugetlb_temporary(folio);
		spin_unlock_irq(&hugetlb_lock);
		free_huge_folio(folio);
		return NULL;
	}