// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
 *
 * Copyright (C) 2017		Facebook Inc.
 * Copyright (C) 2017		Dennis Zhou <dennis@kernel.org>
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  I.e., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * The allocator organizes chunks into lists according to free size and
 * memcg-awareness.  To make a percpu allocation memcg-aware, the
 * __GFP_ACCOUNT flag should be passed.  All memcg-aware allocations share
 * one set of chunks, while all unaccounted allocations and allocations
 * performed by processes belonging to the root memory cgroup use the
 * second set.
 *
 * The allocator tries to allocate from the fullest chunk first.  Each chunk
 * is managed by a bitmap with metadata blocks.  The allocation map is updated
 * on every allocation and free to reflect the current state while the boundary
 * map is only updated on allocation.  Each metadata block contains
 * information to help mitigate the need to iterate over large portions
 * of the bitmap.  The reverse mapping from page to chunk is stored in
 * the page's index.  Lastly, units are lazily backed and grow in unison.
 *
 * There is a unique conversion that goes on here between bytes and bits.
 * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
 * tracks the number of pages it is responsible for in nr_pages.  Helper
 * functions are used to convert between the bytes, bits, and blocks.
 * All hints are managed in bits unless explicitly stated.  (An
 * illustrative, compiled-out sketch of this conversion follows this
 * header.)
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */
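/*
 * Illustrative sketch (not part of the allocator): the byte <-> bit <-> block
 * conversion described above.  One allocation-map bit covers
 * PCPU_MIN_ALLOC_SIZE bytes and PCPU_BITMAP_BLOCK_BITS bits form one
 * metadata block.  The helpers below are hypothetical, compiled out, and
 * only mirror what the real conversion helpers further down in this file do.
 */
#ifdef PCPU_CONVERSION_EXAMPLE
static inline int pcpu_example_size_to_bits(size_t size)
{
	/* round the request up to whole PCPU_MIN_ALLOC_SIZE fragments */
	return ALIGN(size, PCPU_MIN_ALLOC_SIZE) >> PCPU_MIN_ALLOC_SHIFT;
}

static inline int pcpu_example_bits_to_block(int bit_off)
{
	/* metadata block that a chunk-relative bit offset falls into */
	return bit_off / PCPU_BITMAP_BLOCK_BITS;
}
#endif /* PCPU_CONVERSION_EXAMPLE */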
/*
 * The slots are sorted by the size of the biggest continuous free area.
 * 1-31 bytes share the same slot.
 */
#define PCPU_SLOT_BASE_SHIFT		5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD	3
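/*
 * Example only: a plausible size-to-slot mapping along the lines described
 * above.  With PCPU_SLOT_BASE_SHIFT == 5, every size below 1 << 5 (1-31
 * bytes) collapses into the same slot.  pcpu_example_size_to_slot() is
 * hypothetical and compiled out; it is not the allocator's actual helper.
 */
#ifdef PCPU_SLOT_EXAMPLE
static int pcpu_example_size_to_slot(int size)
{
	int highbit = fls(size);	/* position of the highest set bit */

	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}
#endif /* PCPU_SLOT_EXAMPLE */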
static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static int pcpu_free_slot __ro_after_init;
int pcpu_sidelined_slot __ro_after_init;
int pcpu_to_depopulate_slot __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;
/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;
static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */
/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;
/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
/*
 * The number of empty populated pages, protected by pcpu_lock.
 * The reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages;
/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
 * allocated/deallocated, it is allocated/deallocated in all units of a chunk
 * and increments/decrements this count by 1).
 */
static unsigned long pcpu_nr_populated;
/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;
static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
schedule_work(&pcpu_balance_work);
}
/**
 * pcpu_addr_in_chunk - check if the address is served from this chunk
 * @chunk: chunk of interest
 * @addr: percpu address
 *
 * RETURNS:
 * True if the address is served from this chunk.
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
	void *start_addr, *end_addr;
/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	page->private = (unsigned long)pcpu;
}
/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->private;
}
/*
 * The following are helper functions to help access bitmaps and convert
 * between bitmap offsets and address offsets.
 */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
	return chunk->alloc_map +
	       (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
}
static unsigned long pcpu_off_to_block_index(int off)
{
	return off / PCPU_BITMAP_BLOCK_BITS;
}
static unsigned long pcpu_block_off_to_off(int index, int off)
{
	return index * PCPU_BITMAP_BLOCK_BITS + off;
}
/**
 * pcpu_check_block_hint - check against the contig hint
 * @block: block of interest
 * @bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Check to see if the allocation can fit in the block's contig hint.
 * Note, a chunk uses the same hints as a block so this can also check against
 * the chunk's contig hint.
 */
static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
				  size_t align)
{
	int bit_off = ALIGN(block->contig_hint_start, align) -
		      block->contig_hint_start;

	return bit_off + bits <= block->contig_hint;
}
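/*
 * Worked example for the check above (illustrative values, in allocation-map
 * bits): with contig_hint_start == 5 and align == 4, ALIGN(5, 4) == 8, so
 * bit_off == 3 bits are lost to alignment and a request of @bits fits only
 * if 3 + bits <= contig_hint.
 */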
/*
 * pcpu_next_hint - determine which hint to use
 * @block: block of interest
 * @alloc_bits: size of allocation
 *
 * This determines if we should scan based on the scan_hint or first_free.
 * In general, we want to scan from first_free to fulfill allocations by
 * first fit.  However, if we know a scan_hint at position scan_hint_start
 * cannot fulfill an allocation, we can begin scanning from there knowing
 * the contig_hint will be our fallback.
 */
static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
{
	/*
	 * The three conditions below determine if we can skip past the
	 * scan_hint.  First, does the scan hint exist.  Second, is the
	 * contig_hint after the scan_hint (possibly not true iff
	 * contig_hint == scan_hint).  Third, is the allocation request
	 * larger than the scan_hint.
	 */
	if (block->scan_hint &&
	    block->contig_hint_start > block->scan_hint_start &&
	    alloc_bits > block->scan_hint)
		return block->scan_hint_start + block->scan_hint;

	return block->first_free;
}
/**
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper function for pcpu_for_each_md_free_region.  It checks
 * block->contig_hint and performs aggregation across blocks to find the
 * next hint.  It modifies bit_off and bits in-place to be consumed in the
 * loop.
 */
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
				     int *bits)
{
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
			return;
		}
		/*
		 * This checks three things.  First is there a contig_hint to
		 * check.  Second, have we checked this hint before by
		 * comparing the block_off.  Third, is this the same as the
		 * right contig hint.  In the last case, it spills over into
		 * the next block and should be handled by the contig area
		 * across blocks code.
		 */
		*bits = block->contig_hint;
		if (*bits && block->contig_hint_start >= block_off &&
		    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
			*bit_off = pcpu_block_off_to_off(i,
					block->contig_hint_start);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;
/**
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for this
 * allocation.  block->first_free is returned if the allocation request fits
 * within the block to see if the request can be fulfilled prior to the contig
 * hint.
 */
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
				 int align, int *bit_off, int *bits)
{
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			if (*bits >= alloc_bits)
				return;
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
		}
		/* check block->contig_hint */
		*bits = ALIGN(block->contig_hint_start, align) -
			block->contig_hint_start;
		/*
		 * This uses the block offset to determine if this has been
		 * checked in the prior iteration.
		 */
		if (block->contig_hint &&
		    block->contig_hint_start >= block_off &&
		    block->contig_hint >= *bits + alloc_bits) {
			int start = pcpu_next_hint(block, alloc_bits);

			*bits += alloc_bits + block->contig_hint_start -
				 start;

			*bit_off = pcpu_block_off_to_off(i, start);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;
	}

	/* no valid offsets were found - fail condition */
	*bit_off = pcpu_chunk_map_bits(chunk);
}
/*
 * Metadata free area iterators.  These perform aggregation of free areas
 * based on the metadata blocks and return the offset @bit_off and size in
 * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
 * a fit is found for the allocation request.  A compiled-out usage sketch
 * follows the definition below.
 */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)		\
	for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));	\
(bit_off) < pcpu_chunk_map_bits((chunk)); \
(bit_off) += (bits) + 1, \
pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
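/*
 * Usage sketch (assumption, not code taken from this file): aggregating the
 * per-block free areas into the chunk-level hint with the iterator above,
 * roughly what a chunk hint refresh has to do.  pcpu_example_refresh() is
 * hypothetical and compiled out; pcpu_block_update() is defined further down.
 */
#ifdef PCPU_ITER_EXAMPLE
static void pcpu_example_refresh(struct pcpu_chunk *chunk)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits;

	/* feed every aggregated free region into the chunk-level metadata */
	pcpu_for_each_md_free_region(chunk, bit_off, bits)
		pcpu_block_update(chunk_md, bit_off, bit_off + bits);
}
#endif /* PCPU_ITER_EXAMPLE */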
/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
 * This is to facilitate passing through whitelisted flags.  The
 * returned memory is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
	if (WARN_ON_ONCE(!slab_is_available()))
		return NULL;
/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
	int nslot = pcpu_chunk_slot(chunk);

	/* leave isolated chunks in-place */
	if (chunk->isolated)
		return;
if (oslot != nslot)
__pcpu_chunk_move(chunk, nslot, oslot < nslot);
}
/*
 * pcpu_update_empty_pages - update empty page counters
 * @chunk: chunk of interest
 * @nr: nr of empty pages
 *
 * This is used to keep track of the empty pages now based on the premise
 * a md_block covers a page.  The hint update functions recognize if a block
 * is made full or broken to calculate deltas for keeping track of free pages.
 */
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
	chunk->nr_empty_pop_pages += nr;
	if (chunk != pcpu_reserved_chunk && !chunk->isolated)
pcpu_nr_empty_pop_pages += nr;
}
/*
 * pcpu_region_overlap - determines if two regions overlap
 * @a: start of first region, inclusive
 * @b: end of first region, exclusive
 * @x: start of second region, inclusive
 * @y: end of second region, exclusive
 *
 * This is used to determine if the hint region [a, b) overlaps with the
 * allocated region [x, y).
 */
static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
	return (a < y) && (x < b);
}
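/*
 * Example (illustrative values): the hint region [0, 4) overlaps the
 * allocated region [3, 5) because 0 < 5 && 3 < 4, while [0, 4) and [4, 8)
 * do not overlap since 4 < 4 is false.
 */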
/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.  A worked
 * example follows the function.
 */
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
	int contig = end - start;
if (end == block->nr_bits)
block->right_free = contig;
	if (contig > block->contig_hint) {
		/* promote the old contig_hint to be the new scan_hint */
		if (start > block->contig_hint_start) {
			if (block->contig_hint > block->scan_hint) {
block->scan_hint_start =
block->contig_hint_start;
block->scan_hint = block->contig_hint;
			} else if (start < block->scan_hint_start) {
				/*
				 * The old contig_hint == scan_hint.  But, the
				 * new contig is larger so hold the invariant
				 * scan_hint_start < contig_hint_start.
				 */
block->scan_hint = 0;
}
} else {
block->scan_hint = 0;
}
block->contig_hint_start = start;
block->contig_hint = contig;
	} else if (contig == block->contig_hint) {
		if (block->contig_hint_start &&
		    (!start ||
		     __ffs(start) > __ffs(block->contig_hint_start))) {
			/* start has a better alignment so use it */
			block->contig_hint_start = start;
			if (start < block->scan_hint_start &&
block->contig_hint > block->scan_hint)
block->scan_hint = 0;
		} else if (start > block->scan_hint_start ||
			   block->contig_hint > block->scan_hint) {
			/*
			 * Knowing contig == contig_hint, update the scan_hint
			 * if it is farther than or larger than the current
			 * scan_hint.
			 */
block->scan_hint_start = start;
block->scan_hint = contig;
}
	} else {
		/*
		 * The region is smaller than the contig_hint.  So only update
		 * the scan_hint if it is larger than or equal and farther than
		 * the current scan_hint.
		 */
		if ((start < block->contig_hint_start &&
(contig > block->scan_hint ||
(contig == block->scan_hint &&
start > block->scan_hint_start)))) {
block->scan_hint_start = start;
block->scan_hint = contig;
}
}
}
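/*
 * Worked example for the promotion above (illustrative values): if a block's
 * contig_hint covers [10, 14) and a larger free area [20, 26) is reported,
 * the old hint is demoted to the scan_hint (start 10, size 4) and [20, 26)
 * becomes the new contig_hint (start 20, size 6).
 */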
/*
 * pcpu_block_update_scan - update a block given a free area from a scan
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finding the final allocation spot first goes through pcpu_find_block_fit()
 * to find a block that can hold the allocation and then pcpu_alloc_area()
 * where a scan is used.  When allocations require specific alignments,
 * we can inadvertently create holes which will not be seen in the alloc
 * or free paths.
 *
 * This takes a given free area hole and updates a block as it may change the
 * scan_hint.  We need to scan backwards to ensure we don't miss free bits
 * from alignment.
 */
static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
				   int bits)
{
	int s_off = pcpu_off_to_block_off(bit_off);
	int e_off = s_off + bits;
	int s_index, l_bit;
	struct pcpu_block_md *block;
/* scan backwards in case of alignment skipping free bits */
l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
s_off = (s_off == l_bit) ? 0 : l_bit + 1;
pcpu_block_update(block, s_off, e_off);
}
/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 * @full_scan: if we should scan from the beginning
 *
 * Iterates over the metadata blocks to find the largest contig area.
 * A full scan can be avoided on the allocation path as this is triggered
 * if we broke the contig_hint.  In doing so, the scan_hint will be before
 * the contig_hint or after if the scan_hint == contig_hint.  This cannot
 * be prevented on freeing as we want to find the largest area possibly
 * spanning blocks.
 */
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits;
/**
 * pcpu_block_refresh_hint
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * Scans over the block beginning at first_free and updates the block
 * metadata accordingly.
 */
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
	struct pcpu_block_md *block = chunk->md_blocks + index;
	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
	unsigned int start, end;	/* region start, region end */
/* iterate over free areas and update the contig hints */
for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
pcpu_block_update(block, start, end);
}
/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
 * scans are required if the block's contig hint is broken.
 */
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
					 int bits)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int nr_empty_pages = 0;
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the freed allocation */
	int s_off, e_off;	/* block offsets of the freed allocation */
	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
s_index = pcpu_off_to_block_index(bit_off);
e_index = pcpu_off_to_block_index(bit_off + bits - 1);
s_off = pcpu_off_to_block_off(bit_off);
e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
	/*
	 * Update s_block.
	 */
	if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
	/*
	 * block->first_free must be updated if the allocation takes its place.
	 * If the allocation breaks the contig_hint, a scan is required to
	 * restore this hint.
	 */
	if (s_off == s_block->first_free)
s_block->first_free = find_next_zero_bit(
pcpu_index_alloc_map(chunk, s_index),
PCPU_BITMAP_BLOCK_BITS,
s_off + bits);
if (pcpu_region_overlap(s_block->contig_hint_start,
s_block->contig_hint_start +
s_block->contig_hint,
s_off,
				s_off + bits)) {
		/* block contig hint is broken - scan to fix it */
		if (!s_off)
			s_block->left_free = 0;
		pcpu_block_refresh_hint(chunk, s_index);
	} else {
		/* update left and right contig manually */
		s_block->left_free = min(s_block->left_free, s_off);
		if (s_index == e_index)
s_block->right_free = min_t(int, s_block->right_free,
					      PCPU_BITMAP_BLOCK_BITS - e_off);
		else
s_block->right_free = 0;
}
	/*
	 * Update e_block.
	 */
	if (s_index != e_index) {
		if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
		/*
		 * When the allocation is across blocks, the end is along
		 * the left part of the e_block.
		 */
e_block->first_free = find_next_zero_bit(
pcpu_index_alloc_map(chunk, e_index),
PCPU_BITMAP_BLOCK_BITS, e_off);
		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
			/* reset the block */
			e_block++;
		} else {
			if (e_off > e_block->scan_hint_start)
				e_block->scan_hint = 0;

			e_block->left_free = 0;
			if (e_off > e_block->contig_hint_start) {
				/* contig hint is broken - scan to fix it */
pcpu_block_refresh_hint(chunk, e_index);
} else {
e_block->right_free =
min_t(int, e_block->right_free,
PCPU_BITMAP_BLOCK_BITS - e_off);
}
}
	/*
	 * If the allocation is not atomic, some blocks may not be
	 * populated with pages, while we account it here.  The number
	 * of pages will be added back with pcpu_chunk_populated()
	 * when populating pages.
	 */
	if (nr_empty_pages)
pcpu_update_empty_pages(chunk, -nr_empty_pages);
	/*
	 * The only time a full chunk scan is required is if the chunk
	 * contig hint is broken.  Otherwise, it means a smaller space
	 * was used and therefore the chunk contig hint is still correct.
	 */
	if (pcpu_region_overlap(chunk_md->contig_hint_start,
chunk_md->contig_hint_start +
chunk_md->contig_hint,
bit_off,
bit_off + bits))
pcpu_chunk_refresh_hint(chunk, false);
}
/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block
 * refresh by making use of the block contig hints.  If this fails, it scans
 * forward and backward to determine the extent of the free area.  This is
 * capped at the boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk_md->contig_hint.
 * chunk_md->contig_hint may be off by up to a page, but it will never be more
 * than the available space.  If the contig hint is contained in one block, it
 * will be accurate.
 */
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
					int bits)
{
	int nr_empty_pages = 0;
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the freed allocation */
	int s_off, e_off;	/* block offsets of the freed allocation */
	int start, end;		/* start and end of the whole free area */
	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
s_index = pcpu_off_to_block_index(bit_off);
e_index = pcpu_off_to_block_index(bit_off + bits - 1);
s_off = pcpu_off_to_block_off(bit_off);
e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
	/*
	 * Check if the freed area aligns with the block->contig_hint.
	 * If it does, then the scan to find the beginning/end of the
	 * larger free area can be avoided.
	 *
	 * start and end refer to the beginning and end of the free area
	 * within their respective blocks.  This is not necessarily
	 * the entire free area as it may span blocks past the beginning
	 * or end of the block.
	 */
	start = s_off;
	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
start = s_block->contig_hint_start;
	} else {
		/*
		 * Scan backwards to find the extent of the free area.
		 * find_last_bit returns the starting bit, so if the start bit
		 * is returned, that means there was no last bit and the
		 * remainder of the chunk is free.
		 */
		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
					  start);
start = (start == l_bit) ? 0 : l_bit + 1;
}
	end = e_off;
	if (e_off == e_block->contig_hint_start)
		end = e_block->contig_hint_start + e_block->contig_hint;
	else
end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
PCPU_BITMAP_BLOCK_BITS, end);
	/* freeing in the same block */
	if (s_index != e_index) {
		/* update e_block */
		if (end == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
pcpu_block_update(e_block, 0, end);
if (nr_empty_pages)
pcpu_update_empty_pages(chunk, nr_empty_pages);
	/*
	 * Refresh chunk metadata when the free makes a block free or spans
	 * across blocks.  The contig_hint may be off by up to a page, but if
	 * the contig_hint is contained in a block, it will be accurate with
	 * the else condition below.
	 */
	if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
		pcpu_chunk_refresh_hint(chunk, true);
	else
pcpu_block_update(&chunk->chunk_md,
pcpu_block_off_to_off(s_index, start),
end);
}
/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * Bool if the backing pages are populated.
 * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
 */
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
			      int *next_off)
{
	unsigned int start, end;

	/* convert the bit range to the page range it needs backed by */
	start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
	end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);

	start = find_next_zero_bit(chunk->populated, end, start);
	if (start >= end)
		return true;

	end = find_next_bit(chunk->populated, end, start + 1);

	*next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
	return false;
}
/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Given a chunk and an allocation spec, find the offset to begin searching
 * for a free region.  This iterates over the bitmap metadata blocks to
 * find an offset that will be guaranteed to fit the requirements.  It is
 * not quite first fit as if the allocation does not fit in the contig hint
 * of a block or chunk, it is skipped.  This errs on the side of caution
 * to prevent excess iteration.  Poor alignment can cause the allocator to
 * skip over blocks and chunks that have valid free areas.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
			       size_t align, bool pop_only)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits, next_off;
	/*
	 * This is an optimization to prevent scanning by assuming if the
	 * allocation cannot fit in the global hint, there is memory pressure
	 * and creating a new chunk would happen soon.
	 */
	if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
		return -1;

	if (bit_off == pcpu_chunk_map_bits(chunk))
		return -1;
return bit_off;
}
/*
 * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bitnumber to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for zero area
 * @largest_off: offset of the largest area skipped
 * @largest_bits: size of the largest area skipped
 *
 * The @align_mask should be one less than a power of 2.
 *
 * This is a modified version of bitmap_find_next_zero_area_off() to remember
 * the largest area that was skipped.  This is imperfect, but in general is
 * good enough.  The largest remembered region is the largest failed region
 * seen.  This does not include anything we possibly skipped due to alignment.
 * pcpu_block_update_scan() does scan backwards to try and recover what was
 * lost to alignment.  While this can cause scanning to miss earlier possible
 * free areas, smaller allocations will eventually fill those holes.
 */
static unsigned long pcpu_find_zero_area(unsigned long *map,
					 unsigned long size,
					 unsigned long start,
					 unsigned long nr,
					 unsigned long align_mask,
					 unsigned long *largest_off,
					 unsigned long *largest_bits)
{
	unsigned long index, end, i, area_off, area_bits;
again:
	index = find_next_zero_bit(map, size, start);

	/* align the allocation and remember where the candidate area starts */
	index = __ALIGN_MASK(index, align_mask);
	area_off = index;

	end = index + nr;
	if (end > size)
		return end;

	i = find_next_bit(map, end, index);
	if (i < end) {
		area_bits = i - area_off;
		/* remember largest unused area with best alignment */
		if (area_bits > *largest_bits ||
(area_bits == *largest_bits && *largest_off &&
(!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
*largest_off = area_off;
*largest_bits = area_bits;
}
		start = i + 1;
		goto again;
	}

	return index;
}
/**
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 *
 * This function takes in a @start offset to begin searching to fit an
 * allocation of @alloc_bits with alignment @align.  It needs to scan
 * the allocation map because if it fits within the block's contig hint,
 * @start will be block->first_free.  This is an attempt to fill the
 * allocation prior to breaking the contig hint.  The allocation and
 * boundary maps are updated accordingly if it confirms a valid
 * free area.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
			   size_t align, int start)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	size_t align_mask = (align) ? (align - 1) : 0;
	unsigned long area_off = 0, area_bits = 0;
	int bit_off, end, oslot;
lockdep_assert_held(&pcpu_lock);
oslot = pcpu_chunk_slot(chunk);
	/*
	 * Search to find a fit.
	 */
end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
pcpu_chunk_map_bits(chunk));
bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
				      align_mask, &area_off, &area_bits);
	if (bit_off >= end)
		return -1;
if (area_bits)
pcpu_block_update_scan(chunk, area_off, area_bits);
/**
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 *
 * This function determines the size of an allocation to free using
 * the boundary bitmap and clears the allocation map.
 *
 * RETURNS:
 * Number of freed bytes.
 */
static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits, end, oslot, freed;
/* find end index */
end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
bit_off + 1);
bits = end - bit_off;
bitmap_clear(chunk->alloc_map, bit_off, bits);
freed = bits * PCPU_MIN_ALLOC_SIZE;
/* update metadata */
chunk->free_bytes += freed;
/* update first free bit */
chunk_md->first_free = min(chunk_md->first_free, bit_off);
/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.  The
 * base_addr is page aligned down of @tmp_addr while the region end is page
 * aligned up.  Offsets are kept track of to determine the region served. All
 * this is done to appease the bitmap allocator in avoiding partial blocks.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
							  int map_size)
{
	struct pcpu_chunk *chunk;
	unsigned long aligned_addr;
	int start_offset, offset_bits, region_size, region_bits;
size_t alloc_size;
/* region calculations */
aligned_addr = tmp_addr & PAGE_MASK;
/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population.
 */
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
				 int page_end)
{
	int nr = page_end - page_start;
/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
 * Update the bookkeeping information accordingly.  Must be called after
 * each successful depopulation.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
				   int page_start, int page_end)
{
	int nr = page_end - page_start;
/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk		- populate the specified range of a chunk
 * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
 * pcpu_post_unmap_tlb_flush	- flush tlb for the specified range of a chunk
 * pcpu_create_chunk		- create a new chunk
 * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page		- translate address to physical address
 * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
			       int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
				  int page_start, int page_end);
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
				      int page_start, int page_end);
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * This is an internal function that handles all but static allocations.
 * Static percpu address values should never be passed into the allocator.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
	/* is it in the dynamic region (first chunk)? */
	if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
		return pcpu_first_chunk;
	/* is it in the reserved region? */
	if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
		return pcpu_reserved_chunk;
	/*
	 * The address is relative to unit0 which might be unused and
	 * thus unmapped.  Offset the address to the unit space of the
	 * current processor before looking it up in the vmalloc
	 * space.  Note that any possible cpu id can be used here, so
	 * there's no need to worry about preemption or cpu hotplug.
	 */
	addr += pcpu_unit_offsets[raw_smp_processor_id()];
	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}
/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic.  If @gfp has __GFP_NOWARN
 * then no warning will be triggered on invalid or failed allocation
 * requests.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
				 gfp_t gfp)
{
	gfp_t pcpu_gfp;
	bool is_atomic;
	bool do_warn;
	struct obj_cgroup *objcg = NULL;
	static atomic_t warn_limit = ATOMIC_INIT(10);
	struct pcpu_chunk *chunk, *next;
	const char *err;
	int slot, off, cpu, ret;
	unsigned long flags;
	void __percpu *ptr;
size_t bits, bit_align;
	gfp = current_gfp_context(gfp);
	/* whitelisted flags that can be passed to the backing allocators */
pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
is_atomic = !gfpflags_allow_blocking(gfp);
do_warn = !(gfp & __GFP_NOWARN);
	/*
	 * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
	 * therefore alignment must be a minimum of that many bytes.
	 * An allocation may have internal fragmentation from rounding up
	 * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
	 */
	if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
		align = PCPU_MIN_ALLOC_SIZE;

	/* convert the byte request into allocation-map bits */
	size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
	bits = size >> PCPU_MIN_ALLOC_SHIFT;
	bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
!is_power_of_2(align))) {
WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
		     size, align);
		return NULL;
}
	if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
		return NULL;
	if (!is_atomic) {
		/*
		 * pcpu_balance_workfn() allocates memory under this mutex,
		 * and it may wait for memory reclaim. Allow current task
		 * to become OOM victim, in case of memory pressure.
		 */
		if (gfp & __GFP_NOFAIL) {
			mutex_lock(&pcpu_alloc_mutex);
		} else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
			pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
			return NULL;
}
}
spin_lock_irqsave(&pcpu_lock, flags);
	/* serve reserved allocations from the reserved chunk if available */
	if (reserved && pcpu_reserved_chunk) {
		chunk = pcpu_reserved_chunk;

		off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
		if (off < 0) {
			err = "alloc from reserved chunk failed";
			goto fail_unlock;
		}

		off = pcpu_alloc_area(chunk, bits, bit_align, off);
		if (off >= 0)
			goto area_found;

		err = "alloc from reserved chunk failed";
		goto fail_unlock;
}
restart:
	/* search through normal chunks */
	for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
		list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
					 list) {
			off = pcpu_find_block_fit(chunk, bits, bit_align,
						  is_atomic);
			if (off < 0) {
				if (slot < PCPU_SLOT_FAIL_THRESHOLD)
					pcpu_chunk_move(chunk, 0);
				continue;
			}

			off = pcpu_alloc_area(chunk, bits, bit_align, off);
			if (off >= 0) {
				pcpu_reintegrate_chunk(chunk);
				goto area_found;
}
}
}
spin_unlock_irqrestore(&pcpu_lock, flags);
	if (is_atomic) {
		err = "atomic alloc failed, no space left";
		goto fail;
	}

	/* No space left.  Create a new chunk. */
	if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
		chunk = pcpu_create_chunk(pcpu_gfp);
		if (!chunk) {
			err = "failed to allocate new chunk";
			goto fail;
}
/* clear the areas and return address relative to base address */
for_each_possible_cpu(cpu)
memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
	if (do_warn) {
		int remaining = atomic_dec_if_positive(&warn_limit);

		if (remaining >= 0) {
			pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
				size, align, is_atomic, err);
			if (!is_atomic)
				dump_stack();
			if (remaining == 0)
pr_info("limit reached, disable warning\n");
}
}
	if (is_atomic) {
		/* see the flag handling in pcpu_balance_workfn() */
pcpu_atomic_alloc_failed = true;
pcpu_schedule_balance_work();
} else {
mutex_unlock(&pcpu_alloc_mutex);
}
/**
 * pcpu_balance_free - manage the amount of free chunks
 * @empty_only: free chunks only if there are no populated pages
 *
 * If empty_only is %false, reclaim all fully free chunks regardless of the
 * number of populated pages.  Otherwise, only reclaim chunks that have no
 * populated pages.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_free(bool empty_only)
{
	LIST_HEAD(to_free);
	struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
	struct pcpu_chunk *chunk, *next;
lockdep_assert_held(&pcpu_lock);
	/*
	 * There's no reason to keep around multiple unused chunks and VM
	 * areas can be scarce.  Destroy all free chunks except for one.
	 */
list_for_each_entry_safe(chunk, next, free_head, list) {
WARN_ON(chunk->immutable);
		/* spare the first one */
		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
			continue;
if (!empty_only || chunk->nr_empty_pop_pages == 0)
list_move(&chunk->list, &to_free);
}
/**
 * pcpu_balance_populated - manage the amount of populated pages
 *
 * Maintain a certain amount of populated pages to satisfy atomic allocations.
 * It is possible that this is called when physical memory is scarce causing
 * OOM killer to be triggered.  We should avoid doing so until an actual
 * allocation causes the failure as it is possible that requests can be
 * serviced from already backed regions.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_populated(void)
{
	/* gfp flags passed to underlying allocators */
	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
	struct pcpu_chunk *chunk;
	int slot, nr_to_pop, ret;
lockdep_assert_held(&pcpu_lock);
	/*
	 * Ensure there are a certain number of free populated pages for
	 * atomic allocs.  Fill up from the most packed so that atomic
	 * allocs don't increase fragmentation.  If atomic allocation
	 * failed previously, always populate the maximum amount.  This
	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
	 * failing indefinitely; however, large atomic allocs are not
	 * something we support properly and can be highly unreliable and
	 * inefficient.
	 */
retry_pop:
	if (pcpu_atomic_alloc_failed) {
		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
		/* best effort anyway, don't worry about synchronization */
pcpu_atomic_alloc_failed = false;
} else {
nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
pcpu_nr_empty_pop_pages,
0, PCPU_EMPTY_POP_PAGES_HIGH);
}
/* @chunk can't go away while pcpu_alloc_mutex is held */
	for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
		int nr = min_t(int, re - rs, nr_to_pop);
	if (nr_to_pop) {
		/* ran out of chunks to populate, create a new one and retry */
		spin_unlock_irq(&pcpu_lock);
		chunk = pcpu_create_chunk(gfp);
		cond_resched();
		spin_lock_irq(&pcpu_lock);
		if (chunk) {
			pcpu_chunk_relocate(chunk, -1);
			goto retry_pop;
}
}
}
/**
 * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
 *
 * Scan over chunks in the depopulate list and try to release unused populated
 * pages back to the system.  Depopulated chunks are sidelined to prevent
 * repopulating these pages unless required.  Fully free chunks are reintegrated
 * and freed accordingly (1 is kept around).  If we drop below the empty
 * populated pages threshold, reintegrate the chunk if it has empty free pages.
 * Each chunk is scanned in the reverse order to keep populated pages close to
 * the beginning of the chunk.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_reclaim_populated(void)
{
	struct pcpu_chunk *chunk;
	struct pcpu_block_md *block;
	int freed_page_start, freed_page_end;
	int i, end;
	bool reintegrate;
lockdep_assert_held(&pcpu_lock);
	/*
	 * Once a chunk is isolated to the to_depopulate list, the chunk is no
	 * longer discoverable to allocations which may populate pages.  The
	 * only other accessor is the free path which only returns area back
	 * to the allocator not touching the populated bitmap.
	 */
	while ((chunk = list_first_entry_or_null(
			&pcpu_chunk_lists[pcpu_to_depopulate_slot],
			struct pcpu_chunk, list))) {
WARN_ON(chunk->immutable);
		/*
		 * Scan chunk's pages in the reverse order to keep populated
		 * pages close to the beginning of the chunk.
		 */
freed_page_start = chunk->nr_pages;
freed_page_end = 0;
		reintegrate = false;
		for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
			/* no more work to do */
			if (chunk->nr_empty_pop_pages == 0)
				break;

			/* reintegrate chunk to prevent atomic alloc failures */
			if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
				reintegrate = true;
				break;
}
			/*
			 * If the page is empty and populated, start or
			 * extend the (i, end) range.  If i == 0, decrease
			 * i and perform the depopulation to cover the last
			 * (first) page in the chunk.
			 */
			block = chunk->md_blocks + i;
			if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
			    test_bit(i, chunk->populated)) {
				if (end == -1)
					end = i;
				if (i > 0)
					continue;
i--;
}
			/* depopulate if there is an active range */
			if (end == -1)
				continue;
spin_unlock_irq(&pcpu_lock);
pcpu_depopulate_chunk(chunk, i + 1, end + 1);
cond_resched();
spin_lock_irq(&pcpu_lock);
pcpu_chunk_depopulated(chunk, i + 1, end + 1);
freed_page_start = min(freed_page_start, i + 1);
freed_page_end = max(freed_page_end, end + 1);