/*
 * Due to ordering constraints across the init code for various
 * architectures, hugetlb hstate cmdline parameters can't simply
 * be early_param. early_param might call the setup function
 * before valid hugetlb page sizes are determined, leading to
 * incorrect rejection of valid hugepagesz= options.
 *
 * So, record the parameters early and consume them whenever the
 * init code is ready for them, by calling hugetlb_parse_params().
 */
/* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */
#define HUGE_MAX_CMDLINE_ARGS	(2 * HUGE_MAX_HSTATE + 1)

struct hugetlb_cmdline {
	char *val;
	int (*setup)(char *val);
};
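/*
 * Illustrative example (not part of the original source): a command line
 * such as
 *
 *	default_hugepagesz=1G hugepagesz=1G hugepages=2 hugepagesz=2M hugepages=512
 *
 * is recorded as one hugetlb_cmdline entry per option, each holding the
 * option's value string and its setup callback, and is replayed later by
 * hugetlb_parse_params() once valid huge page sizes are known.
 */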
/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock);
/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes __ro_after_init;
struct mutex *hugetlb_fault_mutex_table __ro_after_init;
	/*
	 * If no pages are used, and no other handles to the subpool
	 * remain, give up any reservations based on minimum size and
	 * free the subpool.
	 */
	if (subpool_is_free(spool)) {
		if (spool->min_hpages != -1)
			hugetlb_acct_memory(spool->hstate,
						-spool->min_hpages);
		kfree(spool);
	}
}
struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
						long min_hpages)
{
	struct hugepage_subpool *spool;

	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
	if (!spool)
		return NULL;
/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;

	if (!spool)
		return ret;

	spin_lock_irq(&spool->lock);

	if (spool->max_hpages != -1) {		/* maximum size accounting */
		if ((spool->used_hpages + delta) <= spool->max_hpages)
			spool->used_hpages += delta;
		else {
			ret = -ENOMEM;
			goto unlock_ret;
		}
	}

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->rsv_hpages) {
		if (delta > spool->rsv_hpages) {
			/*
			 * Asking for more reserves than those already taken on
			 * behalf of subpool.  Return difference.
			 */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;	/* reserves already accounted for */
			spool->rsv_hpages -= delta;
		}
	}
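	/*
	 * Worked example (illustrative, not from the original source): with
	 * min_hpages == 4, rsv_hpages == 4 and a request of delta == 6, the
	 * first 4 pages are covered by the subpool reserve, so rsv_hpages
	 * drops to 0 and ret becomes 6 - 4 == 2: only 2 pages need to be
	 * charged against the global pool by the caller.
	 */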
/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;
	unsigned long flags;

	if (!spool)
		return delta;

	spin_lock_irqsave(&spool->lock, flags);

	if (spool->max_hpages != -1)		/* maximum size accounting */
		spool->used_hpages -= delta;

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
		if (spool->rsv_hpages + delta <= spool->min_hpages)
			ret = 0;
		else
			ret = spool->rsv_hpages + delta - spool->min_hpages;
	/*
	 * vma_lock structure may or may not be released as a result of put,
	 * it certainly will no longer be attached to vma so clear pointer.
	 * Semaphore synchronizes access to vma_lock->vma field.
	 */
	vma_lock->vma = NULL;
	vma->vm_private_data = NULL;
	up_write(&vma_lock->rw_sema);
	kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
}
	/* Only establish in (flags) sharable vmas */
	if (!vma || !(vma->vm_flags & VM_MAYSHARE))
		return;

	/* Should never get here with non-NULL vm_private_data */
	if (vma->vm_private_data)
		return;

	vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
	if (!vma_lock) {
		/*
		 * If we can not allocate structure, then vma can not
		 * participate in pmd sharing.  This is only a possible
		 * performance enhancement and memory saving issue.
		 * However, the lock is also used to synchronize page
		 * faults with truncation.  If the lock is not present,
		 * unlikely races could leave pages in a file past i_size
		 * until the file is removed.  Warn in the unlikely case of
		 * allocation failure.
		 */
		pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
		return;
	}
/*
 * Helper that removes a struct file_region from the resv_map cache and returns
 * it for use.
 */
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
	struct file_region *nrg;
/* Helper that records hugetlb_cgroup uncharge info. */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
						struct hstate *h,
						struct resv_map *resv,
						struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (h_cg) {
		nrg->reservation_counter =
			&h_cg->rsvd_hugepage[hstate_index(h)];
		nrg->css = &h_cg->css;
		/*
		 * The caller will hold exactly one h_cg->css reference for the
		 * whole contiguous reservation region. But this area might be
		 * scattered when there are already some file_regions reside in
		 * it. As a result, many file_regions may share only one css
		 * reference. In order to ensure that one file_region must hold
		 * exactly one h_cg->css reference, we should do css_get for
		 * each file_region and leave the reference held by caller
		 * untouched.
		 */
		css_get(&h_cg->css);
		if (!resv->pages_per_hpage)
			resv->pages_per_hpage = pages_per_huge_page(h);
		/* pages_per_hpage should be the same for all entries in
		 * a resv_map.
		 */
		VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
	} else {
		nrg->reservation_counter = NULL;
		nrg->css = NULL;
	}
#endif
}
/*
 * Must be called with resv->lock held.
 *
 * Calling this with regions_needed != NULL will count the number of pages
 * to be added but will not modify the linked list. In that case, regions_needed
 * will indicate the number of file_regions needed in the cache to add the
 * regions for this range.
 */
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
				     struct hugetlb_cgroup *h_cg,
				     struct hstate *h, long *regions_needed)
{
	long add = 0;
	struct list_head *head = &resv->regions;
	long last_accounted_offset = f;
	struct file_region *iter, *trg = NULL;
	struct list_head *rg = NULL;

	if (regions_needed)
		*regions_needed = 0;

	/* In this loop, we essentially handle an entry for the range
	 * [last_accounted_offset, iter->from), at every iteration, with some
	 * bounds checking.
	 */
	list_for_each_entry_safe(iter, trg, head, link) {
		/* Skip irrelevant regions that start before our range. */
		if (iter->from < f) {
			/* If this region ends after the last accounted offset,
			 * then we need to update last_accounted_offset.
			 */
			if (iter->to > last_accounted_offset)
				last_accounted_offset = iter->to;
			continue;
		}

		/* When we find a region that starts beyond our range, we've
		 * finished.
		 */
		if (iter->from >= t) {
			rg = iter->link.prev;
			break;
		}

		/* Add an entry for last_accounted_offset -> iter->from, and
		 * update last_accounted_offset.
		 */
		if (iter->from > last_accounted_offset)
			add += hugetlb_resv_map_add(resv, iter->link.prev,
						    last_accounted_offset,
						    iter->from, h, h_cg,
						    regions_needed);

		last_accounted_offset = iter->to;
	}

	/* Handle the case where our range extends beyond
	 * last_accounted_offset.
	 */
	if (!rg)
		rg = head->prev;
	if (last_accounted_offset < t)
		add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
					    t, h, h_cg, regions_needed);

	return add;
}
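/*
 * Worked example (illustrative, not from the original source): if the reserve
 * map already holds [0, 3) and [5, 8), then add_reservation_in_range(resv, 2,
 * 6, ...) only covers the gap [3, 5): it returns 2 and, in counting mode,
 * reports that a single file_region entry is needed.
 */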
/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
 */
static int allocate_file_region_entries(struct resv_map *resv,
					int regions_needed)
	__must_hold(&resv->lock)
{
	LIST_HEAD(allocated_regions);
	int to_allocate = 0, i = 0;
	struct file_region *trg = NULL, *rg = NULL;

	VM_BUG_ON(regions_needed < 0);

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * the number of in progress add operations plus regions_needed.
	 *
	 * This is a while loop because when we drop the lock, some other call
	 * to region_add or region_del may have consumed some region_entries,
	 * so we keep looping here until we finally have enough entries for
	 * (adds_in_progress + regions_needed).
	 */
	while (resv->region_cache_count <
	       (resv->adds_in_progress + regions_needed)) {
		to_allocate = resv->adds_in_progress + regions_needed -
			      resv->region_cache_count;

		/* At this point, we should have enough entries in the cache
		 * for all the existing adds_in_progress. We should only be
		 * needing to allocate for regions_needed.
		 */
		VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);

		spin_unlock(&resv->lock);
		for (i = 0; i < to_allocate; i++) {
			trg = kmalloc(sizeof(*trg), GFP_KERNEL);
			if (!trg)
				goto out_of_memory;
			list_add(&trg->link, &allocated_regions);
		}
/*
 * Add the huge page range represented by [f, t) to the reserve
 * map.  Regions will be taken from the cache to fill in this range.
 * Sufficient regions should exist in the cache due to the previous
 * call to region_chg with the same range, but in some cases the cache will not
 * have sufficient entries due to races with other code doing region_add or
 * region_del.  The extra needed entries will be allocated.
 *
 * regions_needed is the out value provided by a previous call to region_chg.
 *
 * Return the number of new huge pages added to the map.  This number is greater
 * than or equal to zero.  If file_region entries needed to be allocated for
 * this operation and we were not able to allocate, it returns -ENOMEM.
 * region_add of regions of length 1 never allocate file_regions and cannot
 * fail; region_chg will always allocate at least 1 entry and a region_add for
 * 1 page will only require at most 1 entry.
 */
static long region_add(struct resv_map *resv, long f, long t,
		       long in_regions_needed, struct hstate *h,
		       struct hugetlb_cgroup *h_cg)
{
	long add = 0, actual_regions_needed = 0;

	spin_lock(&resv->lock);
retry:

	/* Count how many regions are actually needed to execute this add. */
	add_reservation_in_range(resv, f, t, NULL, NULL,
				 &actual_regions_needed);

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * this add operation. Note that actual_regions_needed may be greater
	 * than in_regions_needed, as the resv_map may have been modified since
	 * the region_chg call. In this case, we need to make sure that we
	 * allocate extra entries, such that we have enough for all the
	 * existing adds_in_progress, plus the excess needed for this
	 * operation.
	 */
	if (actual_regions_needed > in_regions_needed &&
	    resv->region_cache_count <
		resv->adds_in_progress +
			(actual_regions_needed - in_regions_needed)) {
		/* region_add operation of range 1 should never need to
		 * allocate file_region entries.
		 */
		VM_BUG_ON(t - f <= 1);

		if (allocate_file_region_entries(
			    resv, actual_regions_needed - in_regions_needed)) {
			return -ENOMEM;
		}
/*
 * Examine the existing reserve map and determine how many
 * huge pages in the specified range [f, t) are NOT currently
 * represented.  This routine is called before a subsequent
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t).  region_chg does
 * not change the number of huge pages represented by the
 * map.  A number of new file_region structures is added to the cache as a
 * placeholder, for the subsequent region_add call to use. At least 1
 * file_region structure is added.
 *
 * out_regions_needed is the number of regions added to the
 * resv->adds_in_progress.  This value needs to be provided to a follow up call
 * to region_add or region_abort for proper accounting.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t).  This number is greater than or equal
 * to zero.  -ENOMEM is returned if a new file_region structure or cache entry
 * is needed and can not be allocated.
 */
static long region_chg(struct resv_map *resv, long f, long t,
		       long *out_regions_needed)
{
	long chg = 0;

	spin_lock(&resv->lock);

	/* Count how many hugepages in this range are NOT represented. */
	chg = add_reservation_in_range(resv, f, t, NULL, NULL,
				       out_regions_needed);

	if (*out_regions_needed == 0)
		*out_regions_needed = 1;

	if (allocate_file_region_entries(resv, *out_regions_needed))
		return -ENOMEM;

	resv->adds_in_progress += *out_regions_needed;

	spin_unlock(&resv->lock);
	return chg;
}
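/*
 * Typical calling pattern (illustrative sketch, not part of the original
 * source): a reservation is first sized with region_chg(), and the pending
 * add is later either committed with region_add() or cancelled with
 * region_abort(), passing back the same regions_needed value for accounting:
 *
 *	chg = region_chg(resv, f, t, &regions_needed);
 *	if (chg < 0)
 *		return chg;
 *	...
 *	if (charge_succeeded)		// hypothetical condition
 *		region_add(resv, f, t, regions_needed, h, h_cg);
 *	else
 *		region_abort(resv, f, t, regions_needed);
 */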
/*
 * Abort the in progress add operation.  The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add.  Operations are sometimes
 * aborted after the call to region_chg.  In such cases, region_abort
 * is called to decrement the adds_in_progress counter. regions_needed
 * is the value returned by the region_chg call, it is used to decrement
 * the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine.  They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
static void region_abort(struct resv_map *resv, long f, long t,
			 long regions_needed)
{
	spin_lock(&resv->lock);
	VM_BUG_ON(!resv->region_cache_count);
	resv->adds_in_progress -= regions_needed;
	spin_unlock(&resv->lock);
}
/*
 * Delete the specified range [f, t) from the reserve map.  If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted.  Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more.  In the
 * case where a region must be split, a new region descriptor must
 * be allocated.  If the allocation fails, -ENOMEM will be returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and possibly return -ENOMEM.  Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
 */
static long region_del(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *trg;
	struct file_region *nrg = NULL;
	long del = 0;

retry:
	spin_lock(&resv->lock);
	list_for_each_entry_safe(rg, trg, head, link) {
		/*
		 * Skip regions before the range to be deleted.  file_region
		 * ranges are normally of the form [from, to).  However, there
		 * may be a "placeholder" entry in the map which is of the form
		 * (from, to) with from == to.  Check for placeholder entries
		 * at the beginning of the range to be deleted.
		 */
		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
			continue;

		if (rg->from >= t)
			break;

		if (f > rg->from && t < rg->to) { /* Must split region */
			/*
			 * Check for an entry in the cache before dropping
			 * lock and attempting allocation.
			 */
			if (!nrg &&
			    resv->region_cache_count > resv->adds_in_progress) {
				nrg = list_first_entry(&resv->region_cache,
						       struct file_region,
						       link);
				list_del(&nrg->link);
				resv->region_cache_count--;
			}

			if (!nrg) {
				spin_unlock(&resv->lock);
				nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
				if (!nrg)
					return -ENOMEM;
				goto retry;
			}

			del += t - f;
			hugetlb_cgroup_uncharge_file_region(
				resv, rg, t - f, false);

			/* New entry for end of split region */
			nrg->from = t;
			nrg->to = rg->to;
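			/*
			 * Worked example (illustrative, not from the original
			 * source): deleting [2, 4) from an existing region
			 * [0, 8) takes this split path: del grows by
			 * t - f == 2 and nrg is set up to cover the tail
			 * [4, 8); the original entry is then trimmed (in code
			 * not shown here) to [0, 2).
			 */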
/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was free'ed
 * and removed from the page cache.  This routine will adjust the subpool
 * usage count, and the global reserve count if needed.  By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 */
void hugetlb_fix_reserve_counts(struct inode *inode)
{
	struct hugepage_subpool *spool = subpool_inode(inode);
	long rsv_adjust;
	bool reserved = false;

	if (!reserved)
		pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}
/*
 * Count and return the number of huge pages in the reserve map
 * that intersect with the range [f, t).
 */
static long region_count(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg;
	long chg = 0;

	spin_lock(&resv->lock);
	/* Locate each segment we overlap with, and count that overlap. */
	list_for_each_entry(rg, head, link) {
		long seg_from;
		long seg_to;

		if (rg->to <= f)
			continue;
		if (rg->from >= t)
			break;
/*
 * Convert the address within this vma to the page offset within
 * the mapping, in huge page units here.
 */
static pgoff_t vma_hugecache_offset(struct hstate *h,
			struct vm_area_struct *vma, unsigned long address)
{
	return ((address - vma->vm_start) >> huge_page_shift(h)) +
			(vma->vm_pgoff >> huge_page_order(h));
}
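/*
 * Worked example (illustrative, assuming 2 MB huge pages and a 4 KB base page
 * size): huge_page_shift() is 21 and huge_page_order() is 9, so a fault
 * address 4 MB past vm_start in a vma with vm_pgoff == 0 maps to offset
 * (4 MB >> 21) + 0 == 2, i.e. the third huge page of the mapping.
 */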
/**
 * vma_kernel_pagesize - Page size granularity for this VMA.
 * @vma: The user mapping.
 *
 * Folios in this VMA will be aligned to, and at least the size of the
 * number of bytes returned by this function.
 *
 * Return: The default size of the folios allocated when backing a VMA.
 */
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
	if (vma->vm_ops && vma->vm_ops->pagesize)
		return vma->vm_ops->pagesize(vma);

	return PAGE_SIZE;
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
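/*
 * Illustrative example (not part of the original source): a hugetlbfs vma
 * backed by 1 GB pages reports 1 GB through its ->pagesize() op, while an
 * ordinary anonymous vma has no ->pagesize() op and falls back to PAGE_SIZE.
 */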
/*
 * Return the page size being used by the MMU to back a VMA. In the majority
 * of cases, the page size used by the kernel matches the MMU size. On
 * architectures where it differs, an architecture-specific 'strong'
 * version of this symbol is required.
 */
__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	return vma_kernel_pagesize(vma);
}
/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have their future faults succeed.
 *
 * With the exception of hugetlb_dup_vma_private() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file, this region map represents the backing file
 * pages which have ever had a reservation assigned, and this persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it, this region map represents those offsets which have consumed
 * reservation, i.e. where pages have been instantiated.
 */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	return (unsigned long)vma->vm_private_data;
}
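/*
 * Illustrative sketch (not part of the original source): because the
 * reservation map is at least pointer-aligned, its two low bits are always
 * clear, so vm_private_data can carry both the pointer and the flags, e.g.:
 *
 *	vma->vm_private_data = (void *)((unsigned long)resv_map |
 *					HPAGE_RESV_OWNER);
 *	map   = (struct resv_map *)(get_vma_private_data(vma) &
 *					~HPAGE_RESV_MASK);
 *	owner = get_vma_private_data(vma) & HPAGE_RESV_OWNER;
 */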
	resv_map->adds_in_progress = 0;
	/*
	 * Initialize these to 0. On shared mappings, 0's here indicate these
	 * fields don't do cgroup accounting. On private mappings, these will be
	 * re-initialized to the proper values, to indicate that hugetlb cgroup
	 * reservations are to be un-charged from here.
	 */
	resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);

	/* Clear out any active regions before we release the map. */
	region_del(resv_map, 0, LONG_MAX);

	/* ... and any entries left in the cache */
	list_for_each_entry_safe(rg, trg, head, link) {
		list_del(&rg->link);
		kfree(rg);
	}

	VM_BUG_ON(resv_map->adds_in_progress);

	kfree(resv_map);
}
static inline struct resv_map *inode_resv_map(struct inode *inode)
{
	/*
	 * At inode evict time, i_mapping may not point to the original
	 * address space within the inode.  This original address space
	 * contains the pointer to the resv_map.  So, always use the
	 * address space embedded within the inode.
	 * The VERY common case is inode->mapping == &inode->i_data but,
	 * this may not be true for device special inodes.
	 */
	return (struct resv_map *)(&inode->i_data)->i_private_data;
}
void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	/*
	 * Clear vm_private_data
	 * - For shared mappings this is a per-vma semaphore that may be
	 *   allocated in a subsequent call to hugetlb_vm_op_open.
	 *   Before clearing, make sure pointer is not associated with vma
	 *   as this will leak the structure.  This is the case when called
	 *   via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already
	 *   been called to allocate a new structure.
	 * - For MAP_PRIVATE mappings, this is the reserve map which does
	 *   not apply to children.  Faults generated by the children are
	 *   not guaranteed to succeed, even if read-only.
	 */
	if (vma->vm_flags & VM_MAYSHARE) {
		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
/*
 * Reset and decrement one ref on hugepage private reservation.
 * Called with mm->mmap_lock writer semaphore held.
 * This function should be only used by mremap and operate on
 * same sized vma. It should never come here with last ref on the
 * reservation.
 */
void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
{
	/*
	 * Clear the old hugetlb private page reservation.
	 * It has already been transferred to new_vma.
	 *
	 * During a mremap() operation of a hugetlb vma we call move_vma()
	 * which copies vma into new_vma and unmaps vma. After the copy
	 * operation both new_vma and vma share a reference to the resv_map
	 * struct, and at that point vma is about to be unmapped. We don't
	 * want to return the reservation to the pool at unmap of vma because
	 * the reservation still lives on in new_vma, so simply decrement the
	 * ref here and remove the resv_map reference from this vma.
	 */
	struct resv_map *reservations = vma_resv_map(vma);

	if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
		kref_put(&reservations->refs, resv_map_release);
	}

	hugetlb_dup_vma_private(vma);
}
static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
{
	int nid = folio_nid(folio);
		if (!cpuset_zone_allowed(zone, gfp_mask))
			continue;
		/*
		 * no need to ask again on the same node. Pool is node rather than
		 * zone aware
		 */
		if (zone_to_nid(zone) == node)
			continue;
		node = zone_to_nid(zone);

		folio = dequeue_hugetlb_folio_node_exact(h, node);
		if (folio)
			return folio;
	}
	if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;
	/*
	 * gbl_chg==1 means the allocation requires a new page that was not
	 * reserved before.  Make sure there is at least one free page.
	 */
	if (gbl_chg && !available_huge_pages(h))
		goto err;

	gfp_mask = htlb_alloc_mask(h);
	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);

	if (mpol_is_preferred_many(mpol)) {
		folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
							nid, nodemask);

		/* Fallback to all nodes if page==NULL */
		nodemask = NULL;
	}

	if (!folio)
		folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
							nid, nodemask);

	mpol_cond_put(mpol);
	return folio;

err:
	return NULL;
}
/*
 * common helper functions for hstate_next_node_to_{alloc|free}.
 * We may have allocated or freed a huge page based on a different
 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
 * be outside of *nodes_allowed.  Ensure that we use an allowed
 * node for alloc or free.
 */
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	nid = next_node_in(nid, *nodes_allowed);
	VM_BUG_ON(nid >= MAX_NUMNODES);

	return nid;
}

static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	if (!node_isset(nid, *nodes_allowed))
		nid = next_node_allowed(nid, nodes_allowed);
	return nid;
}
/*
 * returns the previously saved node ["this node"] from which to
 * allocate a persistent huge page for the pool and advance the
 * next node from which to allocate, handling wrap at end of node
 * mask.
 */
static int hstate_next_node_to_alloc(int *next_node,
					nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(*next_node, nodes_allowed);
	*next_node = next_node_allowed(nid, nodes_allowed);

	return nid;
}
/*
 * helper for remove_pool_hugetlb_folio() - return the previously saved
 * node ["this node"] from which to free a huge page.  Advance the
 * next node id whether or not we find a free huge page to free so
 * that the next attempt to free addresses the next node.
 */
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
/*
 * Remove hugetlb folio from lists.
 * If vmemmap exists for the folio, clear the hugetlb flag so that the
 * folio appears as just a compound page.  Otherwise, wait until after
 * allocating vmemmap to clear the flag.
 *
 * Must be called with hugetlb lock held.
 */
static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
							bool adjust_surplus)
{
	int nid = folio_nid(folio);

	lockdep_assert_held(&hugetlb_lock);
	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
		return;

	list_del(&folio->lru);

	if (folio_test_hugetlb_freed(folio)) {
		folio_clear_hugetlb_freed(folio);
		h->free_huge_pages--;
		h->free_huge_pages_node[nid]--;
	}
	if (adjust_surplus) {
		h->surplus_huge_pages--;
		h->surplus_huge_pages_node[nid]--;
	}

	/*
	 * We can only clear the hugetlb flag after allocating vmemmap
	 * pages.  Otherwise, someone (memory error handling) may try to write
	 * to tail struct pages.
	 */
	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		__folio_clear_hugetlb(folio);
	if (adjust_surplus) {
		h->surplus_huge_pages++;
		h->surplus_huge_pages_node[nid]++;
	}

	__folio_set_hugetlb(folio);
	folio_change_private(folio, NULL);
	/*
	 * We have to set hugetlb_vmemmap_optimized again as above
	 * folio_change_private(folio, NULL) cleared it.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);
	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
		return;

	/*
	 * If we don't know which subpages are hwpoisoned, we can't free
	 * the hugepage, so it's leaked intentionally.
	 */
	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
		return;

	/*
	 * If folio is not vmemmap optimized (!clear_flag), then the folio
	 * is no longer identified as a hugetlb page.  hugetlb_vmemmap_restore_folio
	 * can only be passed hugetlb pages and will BUG otherwise.
	 */
	if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) {
		spin_lock_irq(&hugetlb_lock);
		/*
		 * If we cannot allocate vmemmap pages, just refuse to free the
		 * page and put the page back on the hugetlb free list and treat
		 * as a surplus page.
		 */
		add_hugetlb_folio(h, folio, true);
		spin_unlock_irq(&hugetlb_lock);
		return;
	}

	/*
	 * If vmemmap pages were allocated above, then we need to clear the
	 * hugetlb flag under the hugetlb lock.
	 */
	if (folio_test_hugetlb(folio)) {
		spin_lock_irq(&hugetlb_lock);
		__folio_clear_hugetlb(folio);
		spin_unlock_irq(&hugetlb_lock);
	}

	/*
	 * Move PageHWPoison flag from head page to the raw error pages,
	 * which makes any healthy subpages reusable.
	 */
	if (unlikely(folio_test_hwpoison(folio)))
		folio_clear_hugetlb_hwpoison(folio);

	folio_ref_unfreeze(folio, 1);

	hugetlb_free_folio(folio);
}
/*
 * Because update_and_free_hugetlb_folio() can be called from any context,
 * we cannot use GFP_KERNEL to allocate vmemmap pages. However, we can defer
 * the actual freeing in a workqueue to prevent from using GFP_ATOMIC to
 * allocate the vmemmap pages.
 *
 * free_hpage_workfn() locklessly retrieves the linked list of pages to be
 * freed and frees them one-by-one. As the page->mapping pointer is going
 * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
 * structure of a lockless linked list of huge pages to be freed.
 */
static LLIST_HEAD(hpage_freelist);

	while (node) {
		struct folio *folio;
		struct hstate *h;

		folio = container_of((struct address_space **)node,
				     struct folio, mapping);
		node = node->next;
		folio->mapping = NULL;
		/*
		 * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
		 * folio_hstate() is going to trigger because a previous call to
		 * remove_hugetlb_folio() will clear the hugetlb bit, so do
		 * not use folio_hstate() directly.
		 */
		h = size_to_hstate(folio_size(folio));

	/*
	 * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
	 *
	 * Only call schedule_work() if hpage_freelist is previously
	 * empty. Otherwise, schedule_work() had been called but the workfn
	 * hasn't retrieved the list yet.
	 */
	if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist))
		schedule_work(&free_hpage_work);
}
	if (!list_empty(non_hvo_folios)) {
		/*
		 * Free any restored hugetlb pages so that restore of the
		 * entire list can be retried.
		 * The idea is that in the common case of ENOMEM errors freeing
		 * hugetlb pages with vmemmap we will free up memory so that we
		 * can allocate vmemmap for more hugetlb pages.
		 */
		list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) {
			list_del(&folio->lru);
			spin_lock_irq(&hugetlb_lock);
			__folio_clear_hugetlb(folio);
			spin_unlock_irq(&hugetlb_lock);
			update_and_free_hugetlb_folio(h, folio, false);
			cond_resched();
		}
	} else {
		/*
		 * In the case where there are no folios which can be
		 * immediately freed, we loop through the list trying to restore
		 * vmemmap individually in the hope that someone elsewhere may
		 * have done something to cause success (such as freeing some
		 * memory).  If unable to restore a hugetlb page, the hugetlb
		 * page is made a surplus page and removed from the list.
		 * If we are able to restore vmemmap and free one hugetlb page,
		 * we quit processing the list to retry the bulk operation.
		 */
		list_for_each_entry_safe(folio, t_folio, folio_list, lru)
			if (hugetlb_vmemmap_restore_folio(h, folio)) {
				list_del(&folio->lru);
				spin_lock_irq(&hugetlb_lock);
				add_hugetlb_folio(h, folio, true);
				spin_unlock_irq(&hugetlb_lock);
			} else {
				list_del(&folio->lru);
				spin_lock_irq(&hugetlb_lock);
				__folio_clear_hugetlb(folio);
				spin_unlock_irq(&hugetlb_lock);
				update_and_free_hugetlb_folio(h, folio, false);
				cond_resched();
				break;
			}
	}
}
	/*
	 * First allocate required vmemmap (if necessary) for all folios.
	 * Carefully handle errors and free up any available hugetlb pages
	 * in an effort to make forward progress.
	 */
retry:
	ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios);
	if (ret < 0) {
		bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios);
		goto retry;
	}

	/*
	 * At this point, list should be empty, ret should be >= 0 and there
	 * should only be pages on the non_hvo_folios list.
	 * Do note that the non_hvo_folios list could be empty.
	 * Without HVO enabled, ret will be 0 and there is no need to call
	 * __folio_clear_hugetlb as this was done previously.
	 */
	VM_WARN_ON(!list_empty(folio_list));
	VM_WARN_ON(ret < 0);
	if (!list_empty(&non_hvo_folios) && ret) {
		spin_lock_irq(&hugetlb_lock);
		list_for_each_entry(folio, &non_hvo_folios, lru)
			__folio_clear_hugetlb(folio);
		spin_unlock_irq(&hugetlb_lock);
	}
void free_huge_folio(struct folio *folio)
{
	/*
	 * Can't pass hstate in here because it is called from the
	 * generic mm code.
	 */
	struct hstate *h = folio_hstate(folio);
	int nid = folio_nid(folio);
	struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
	bool restore_reserve;
	unsigned long flags;

	/*
	 * If HPageRestoreReserve was set on page, page allocation consumed a
	 * reservation.  If the page was associated with a subpool, there
	 * would have been a page reserved in the subpool before allocation
	 * via hugepage_subpool_get_pages().  Since we are 'restoring' the
	 * reservation, do not call hugepage_subpool_put_pages() as this will
	 * remove the reserved page from the subpool.
	 */
	if (!restore_reserve) {
		/*
		 * A return code of zero implies that the subpool will be
		 * under its minimum size if the reservation is not restored
		 * after page is free.  Therefore, force restore_reserve
		 * operation.
		 */
		if (hugepage_subpool_put_pages(spool, 1) == 0)
			restore_reserve = true;
	}

	if (folio_test_hugetlb_temporary(folio)) {
		remove_hugetlb_folio(h, folio, false);
		spin_unlock_irqrestore(&hugetlb_lock, flags);
		update_and_free_hugetlb_folio(h, folio, true);
	} else if (h->surplus_huge_pages_node[nid]) {
		/* remove the page from active list */
		remove_hugetlb_folio(h, folio, true);
		spin_unlock_irqrestore(&hugetlb_lock, flags);
		update_and_free_hugetlb_folio(h, folio, true);
	} else {
		arch_clear_hugetlb_flags(folio);
		enqueue_hugetlb_folio(h, folio);
		spin_unlock_irqrestore(&hugetlb_lock, flags);
	}
}
/*
 * Must be called with the hugetlb lock held
 */
static void __prep_account_new_huge_page(struct hstate *h, int nid)
{
	lockdep_assert_held(&hugetlb_lock);
	h->nr_huge_pages++;
	h->nr_huge_pages_node[nid]++;
}
/*
 * Find and lock address space (mapping) in write mode.
 *
 * Upon entry, the folio is locked which means that folio_mapping() is
 * stable.  Due to locking order, we can only trylock_write.  If we can
 * not get the lock, simply return NULL to caller.
 */
struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
{
	struct address_space *mapping = folio_mapping(folio);

	if (!mapping)
		return mapping;

	if (i_mmap_trylock_write(mapping))
		return mapping;

	return NULL;
}
static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
		gfp_t gfp_mask, int nid, nodemask_t *nmask,
		nodemask_t *node_alloc_noretry)
{
	int order = huge_page_order(h);
	struct folio *folio;
	bool alloc_try_hard = true;

	/*
	 * By default we always try hard to allocate the folio with
	 * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating folios in
	 * a loop (to adjust global huge page counts) and previous allocation
	 * failed, do not continue to try hard on the same node.  Use the
	 * node_alloc_noretry bitmap to manage this state information.
	 */
	if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
		alloc_try_hard = false;
	if (alloc_try_hard)
		gfp_mask |= __GFP_RETRY_MAYFAIL;
	if (nid == NUMA_NO_NODE)
		nid = numa_mem_id();

	/*
	 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a
	 * folio this indicates an overall state change.  Clear bit so
	 * that we resume normal 'try hard' allocations.
	 */
	if (node_alloc_noretry && folio && !alloc_try_hard)
		node_clear(nid, *node_alloc_noretry);

	/*
	 * If we tried hard to get a folio but failed, set bit so that
	 * subsequent attempts will not try as hard until there is an
	 * overall state change.
	 */
	if (node_alloc_noretry && !folio && alloc_try_hard)
		node_set(nid, *node_alloc_noretry);

	if (!folio) {
		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
		return NULL;
	}
/*
 * Common helper to allocate a fresh hugetlb page. All specific allocators
 * should use this function to get new hugetlb pages
 *
 * Note that returned page is 'frozen':  ref count of head page and all tail
 * pages is zero.
 */
static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
		gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
	struct folio *folio;

	if (hstate_is_gigantic(h))
		folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
	else
		folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
	if (!folio)
		return NULL;
	/* Send list for bulk vmemmap optimization processing */
	hugetlb_vmemmap_optimize_folios(h, folio_list);

	/* Add all new pool pages to free lists in one lock cycle */
	spin_lock_irqsave(&hugetlb_lock, flags);
	list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
		__prep_account_new_huge_page(h, folio_nid(folio));
		enqueue_hugetlb_folio(h, folio);
	}
	spin_unlock_irqrestore(&hugetlb_lock, flags);
}
/*
 * Allocates a fresh hugetlb page in a node interleaved manner.  The page
 * will later be added to the appropriate hugetlb pool.
 */
static struct folio *alloc_pool_huge_folio(struct hstate *h,
					nodemask_t *nodes_allowed,
					nodemask_t *node_alloc_noretry,
					int *next_node)
{
	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
	int nr_nodes, node;
/*
 * Remove huge page from pool from next node to free.  Attempt to keep
 * persistent huge pages more or less balanced over allowed nodes.
 * This routine only 'removes' the hugetlb page.  The caller must make
 * an additional call to free the page to low level allocators.
 * Called with hugetlb_lock locked.
 */
static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
		nodemask_t *nodes_allowed, bool acct_surplus)
{
	int nr_nodes, node;
	struct folio *folio = NULL;
/*
 * Dissolve a given free hugetlb folio into free buddy pages. This function
 * does nothing for in-use hugetlb folios and non-hugetlb folios.
 * This function returns values like below:
 *
 *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
 *           when the system is under memory pressure and the feature of
 *           freeing unused vmemmap pages associated with each hugetlb page
 *           is enabled.
 *  -EBUSY:  failed to dissolve free hugepages or the hugepage is in-use
 *           (allocated or reserved.)
 *       0:  successfully dissolved free hugepages or the page is not a
 *           hugepage (considered as already dissolved)
 */
int dissolve_free_hugetlb_folio(struct folio *folio)
{
	int rc = -EBUSY;

retry:
	/* Not to disrupt normal path by vainly holding hugetlb_lock */
	if (!folio_test_hugetlb(folio))
		return 0;

	spin_lock_irq(&hugetlb_lock);
	if (!folio_test_hugetlb(folio)) {
		rc = 0;
		goto out;
	}

	/*
	 * We should make sure that the page is already on the free list
	 * when it is dissolved.
	 */
	if (unlikely(!folio_test_hugetlb_freed(folio))) {
		spin_unlock_irq(&hugetlb_lock);
		cond_resched();

		/*
		 * Theoretically, we should return -EBUSY when we
		 * encounter this race. In fact, we have a chance
		 * to successfully dissolve the page if we do a
		 * retry. Because the race window is quite small.
		 * If we seize this opportunity, it is an optimization
		 * for increasing the success rate of dissolving page.
		 */
		goto retry;
	}

	if (h->surplus_huge_pages_node[folio_nid(folio)])
		adjust_surplus = true;
	remove_hugetlb_folio(h, folio, adjust_surplus);
	h->max_huge_pages--;
	spin_unlock_irq(&hugetlb_lock);

	/*
	 * Normally update_and_free_hugetlb_folio will allocate required vmemmap
	 * before freeing the page.  update_and_free_hugetlb_folio will fail to
	 * free the page if it can not allocate required vmemmap.  We
	 * need to adjust max_huge_pages if the page is not freed.
	 * Attempt to allocate vmemmap here so that we can take
	 * appropriate action on failure.
	 *
	 * The folio_test_hugetlb check here is because
	 * remove_hugetlb_folio will clear hugetlb folio flag for
	 * non-vmemmap optimized hugetlb folios.
	 */
	if (folio_test_hugetlb(folio)) {
		rc = hugetlb_vmemmap_restore_folio(h, folio);
		if (rc) {
			spin_lock_irq(&hugetlb_lock);
			add_hugetlb_folio(h, folio, adjust_surplus);
			h->max_huge_pages++;
			goto out;
		}
	} else
		rc = 0;
/*
 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
 * make specified memory blocks removable from the system.
 * Note that this will dissolve a free gigantic hugepage completely, if any
 * part of it lies within the given range.
 * Also note that if dissolve_free_hugetlb_folio() returns with an error, all
 * free hugetlb folios that were dissolved before that error are lost.
 */
int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct folio *folio;
	int rc = 0;
	unsigned int order;
	struct hstate *h;

	if (!hugepages_supported())
		return rc;

	order = huge_page_order(&default_hstate);
	for_each_hstate(h)
		order = min(order, huge_page_order(h));
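	/*
	 * Illustrative note (not part of the original source, assuming a 4 KB
	 * base page size): on x86-64 with both 2 MB and 1 GB hstates
	 * configured, the minimum order computed above is 9, so the pfn scan
	 * that follows (not shown here) can step through the range in
	 * 2 MB-sized increments.
	 */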
	spin_lock_irq(&hugetlb_lock);
	/*
	 * nr_huge_pages needs to be adjusted within the same lock cycle
	 * as surplus_pages, otherwise it might confuse
	 * persistent_huge_pages() momentarily.
	 */
	__prep_account_new_huge_page(h, folio_nid(folio));

	/*
	 * We could have raced with the pool size change.
	 * Double check that and simply deallocate the new page
	 * if we would end up overcommitting the surpluses. Abuse
	 * temporary page to workaround the nasty free_huge_folio
	 * codeflow.
	 */
	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
		folio_set_hugetlb_temporary(folio);
		spin_unlock_irq(&hugetlb_lock);
		free_huge_folio(folio);
		return NULL;
	}