// SPDX-License-Identifier: GPL-2.0 /* * Slab allocator functions that are independent of the allocator strategy * * (C) 2012 Christoph Lameter <cl@gentwo.org>
*/ #include <linux/slab.h>
/* * Figure out what the alignment of the objects will be given a set of * flags, a user specified alignment and the size of the objects.
*/ staticunsignedint calculate_alignment(slab_flags_t flags, unsignedint align, unsignedint size)
{ /* * If the user wants hardware cache aligned objects then follow that * suggestion if the object is sufficiently large. * * The hardware cache alignment cannot override the specified * alignment though. If that is greater then use it.
*/ if (flags & SLAB_HWCACHE_ALIGN) { unsignedint ralign;
/** * __kmem_cache_create_args - Create a kmem cache. * @name: A string which is used in /proc/slabinfo to identify this cache. * @object_size: The size of objects to be created in this cache. * @args: Additional arguments for the cache creation (see * &struct kmem_cache_args). * @flags: See the desriptions of individual flags. The common ones are listed * in the description below. * * Not to be called directly, use the kmem_cache_create() wrapper with the same * parameters. * * Commonly used @flags: * * &SLAB_ACCOUNT - Account allocations to memcg. * * &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries. * * &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable. * * &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed * by a grace period - see the full description before using. * * Context: Cannot be called within a interrupt, but can be interrupted. * * Return: a pointer to the cache on success, NULL on failure.
*/ struct kmem_cache *__kmem_cache_create_args(constchar *name, unsignedint object_size, struct kmem_cache_args *args,
slab_flags_t flags)
{ struct kmem_cache *s = NULL; constchar *cache_name; int err;
#ifdef CONFIG_SLUB_DEBUG /* * If no slab_debug was enabled globally, the static key is not yet * enabled by setup_slub_debug(). Enable it if the cache is being * created with any of the debugging flags passed explicitly. * It's also possible that this is the first cache created with * SLAB_STORE_USER and we should init stack_depot for it.
*/ if (flags & SLAB_DEBUG_FLAGS)
static_branch_enable(&slub_debug_enabled); if (flags & SLAB_STORE_USER)
stack_depot_init(); #else
flags &= ~SLAB_DEBUG_FLAGS; #endif
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, object_size); if (err) { goto out_unlock;
}
/** * kmem_buckets_create - Create a set of caches that handle dynamic sized * allocations via kmem_buckets_alloc() * @name: A prefix string which is used in /proc/slabinfo to identify this * cache. The individual caches with have their sizes as the suffix. * @flags: SLAB flags (see kmem_cache_create() for details). * @useroffset: Starting offset within an allocation that may be copied * to/from userspace. * @usersize: How many bytes, starting at @useroffset, may be copied * to/from userspace. * @ctor: A constructor for the objects, run when new allocations are made. * * Cannot be called within an interrupt, but can be interrupted. * * Return: a pointer to the cache on success, NULL on failure. When * CONFIG_SLAB_BUCKETS is not enabled, ZERO_SIZE_PTR is returned, and * subsequent calls to kmem_buckets_alloc() will fall back to kmalloc(). * (i.e. callers only need to check for NULL on failure.)
*/
kmem_buckets *kmem_buckets_create(constchar *name, slab_flags_t flags, unsignedint useroffset, unsignedint usersize, void (*ctor)(void *))
{ unsignedlong mask = 0; unsignedint idx;
kmem_buckets *b;
/* * When the separate buckets API is not built in, just return * a non-NULL value for the kmem_buckets pointer, which will be * unused when performing allocations.
*/ if (!IS_ENABLED(CONFIG_SLAB_BUCKETS)) return ZERO_SIZE_PTR;
if (WARN_ON(!kmem_buckets_cache)) return NULL;
b = kmem_cache_alloc(kmem_buckets_cache, GFP_KERNEL|__GFP_ZERO); if (WARN_ON(!b)) return NULL;
/*
 * For a given kmem_cache, kmem_cache_destroy() should only be called
 * once or there will be a use-after-free problem. The actual deletion
 * and release of the kobject does not need slab_mutex or cpu_hotplug_lock
 * protection. So they are now done without holding those locks.
 */
static void kmem_cache_release(struct kmem_cache *s)
{
	kfence_shutdown_cache(s);
	/*
	 * Once sysfs is fully up, the kobject owns the cache and sysfs
	 * release tears it down; before that, free the cache directly.
	 */
	if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL)
		sysfs_slab_release(s);
	else
		slab_kmem_cache_release(s);
}
/*
 * kmem_cache_destroy - Destroy a cache created with kmem_cache_create().
 * @s: the cache to destroy; NULL is silently ignored.
 *
 * Drops one reference; only the last reference actually shuts the cache
 * down. Waits for in-flight kfree_rcu()/call_rcu() deferred frees that
 * may still hold objects from this cache before tearing it down.
 */
void kmem_cache_destroy(struct kmem_cache *s)
{
	int err;

	/* kasan_check_byte() rejects already-freed/invalid cache pointers. */
	if (unlikely(!s) || !kasan_check_byte(s))
		return;

	/* in-flight kfree_rcu()'s may include objects from our cache */
	kvfree_rcu_barrier();

	if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
	    (s->flags & SLAB_TYPESAFE_BY_RCU)) {
		/*
		 * Under CONFIG_SLUB_RCU_DEBUG, when objects in a
		 * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally
		 * defer their freeing with call_rcu().
		 * Wait for such call_rcu() invocations here before actually
		 * destroying the cache.
		 *
		 * It doesn't matter that we haven't looked at the slab refcount
		 * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so
		 * the refcount should be 1 here.
		 */
		rcu_barrier();
	}

	cpus_read_lock();
	mutex_lock(&slab_mutex);

	/* Merged caches share a kmem_cache; only the last user destroys it. */
	s->refcount--;
	if (s->refcount) {
		mutex_unlock(&slab_mutex);
		cpus_read_unlock();
		return;
	}

	/* free asan quarantined objects */
	kasan_cache_shutdown(s);

	err = __kmem_cache_shutdown(s);
	if (!slab_in_kunit_test())
		WARN(err, "%s %s: Slab cache still has objects when called from %pS",
		     __func__, s->name, (void *)_RET_IP_);

	list_del(&s->list);

	mutex_unlock(&slab_mutex);
	cpus_read_unlock();

	if (slab_state >= FULL)
		sysfs_slab_unlink(s);
	debugfs_slab_release(s);

	/* Objects still in use: leak the cache rather than free live slabs. */
	if (err)
		return;

	if (s->flags & SLAB_TYPESAFE_BY_RCU)
		rcu_barrier();

	/*
	 * NOTE(review): the extracted source was truncated here; upstream
	 * releases the cache object at this point. Restored below — confirm
	 * against the canonical tree.
	 */
	kmem_cache_release(s);
}
/** * kmem_cache_shrink - Shrink a cache. * @cachep: The cache to shrink. * * Releases as many slabs as possible for a cache. * To help debugging, a zero exit status indicates all slabs were released. * * Return: %0 if all slabs were released, non-zero otherwise
*/ int kmem_cache_shrink(struct kmem_cache *cachep)
{
kasan_cache_shrink(cachep);
/** * kmem_dump_obj - Print available slab provenance information * @object: slab object for which to find provenance information. * * This function uses pr_cont(), so that the caller is expected to have * printed out whatever preamble is appropriate. The provenance information * depends on the type of object and on how much debugging is enabled. * For a slab-cache object, the fact that it is a slab object is printed, * and, if available, the slab name, return address, and stack trace from * the allocation and last free path of that object. * * Return: %true if the pointer is to a not-yet-freed object from * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer * is to an already-freed object, and %false otherwise.
*/ bool kmem_dump_obj(void *object)
{ char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc"; int i; struct slab *slab; unsignedlong ptroffset; struct kmem_obj_info kp = { };
/* Some arches consider ZERO_SIZE_PTR to be a valid address. */ if (object < (void *)PAGE_SIZE || !virt_addr_valid(object)) returnfalse;
slab = virt_to_slab(object); if (!slab) returnfalse;
kmem_obj_info(&kp, object, slab); if (kp.kp_slab_cache)
pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name); else
pr_cont(" slab%s", cp); if (is_kfence_address(object))
pr_cont(" (kfence)"); if (kp.kp_objp)
pr_cont(" start %px", kp.kp_objp); if (kp.kp_data_offset)
pr_cont(" data offset %lu", kp.kp_data_offset); if (kp.kp_objp) {
ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
pr_cont(" pointer offset %lu", ptroffset);
} if (kp.kp_slab_cache && kp.kp_slab_cache->object_size)
pr_cont(" size %u", kp.kp_slab_cache->object_size); if (kp.kp_ret)
pr_cont(" allocated at %pS\n", kp.kp_ret); else
pr_cont("\n"); for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) { if (!kp.kp_stack[i]) break;
pr_info(" %pS\n", kp.kp_stack[i]);
}
if (kp.kp_free_stack[0])
pr_cont(" Free path:\n");
for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) { if (!kp.kp_free_stack[i]) break;
pr_info(" %pS\n", kp.kp_free_stack[i]);
}
/* Create a cache during boot when no slab services are available yet */ void __init create_boot_cache(struct kmem_cache *s, constchar *name, unsignedint size, slab_flags_t flags, unsignedint useroffset, unsignedint usersize)
{ int err; unsignedint align = ARCH_KMALLOC_MINALIGN; struct kmem_cache_args kmem_args = {};
/* * kmalloc caches guarantee alignment of at least the largest * power-of-two divisor of the size. For power-of-two sizes, * it is the size itself.
*/ if (flags & SLAB_KMALLOC)
align = max(align, 1U << (ffs(size) - 1));
kmem_args.align = calculate_alignment(flags, align, size);
/* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. This is necessary for slabs < 192 since we have non power * of two cache sizes there. The size of larger slabs can be determined using * fls.
*/
/* Indexed by (size / 8) - 1 — presumably via size_index_elem(); see its use in setup_kmalloc_cache_index_table(). */
u8 kmalloc_size_index[24] __ro_after_init = {
	3, /* 8 */
	4, /* 16 */
	5, /* 24 */
	5, /* 32 */
	6, /* 40 */
	6, /* 48 */
	6, /* 56 */
	6, /* 64 */
	1, /* 72 */
	1, /* 80 */
	1, /* 88 */
	1, /* 96 */
	7, /* 104 */
	7, /* 112 */
	7, /* 120 */
	7, /* 128 */
	2, /* 136 */
	2, /* 144 */
	2, /* 152 */
	2, /* 160 */
	2, /* 168 */
	2, /* 176 */
	2, /* 184 */
	2 /* 192 */
};
size_t kmalloc_size_roundup(size_t size)
{ if (size && size <= KMALLOC_MAX_CACHE_SIZE) { /* * The flags don't matter since size_index is common to all. * Neither does the caller for just getting ->object_size.
*/ return kmalloc_slab(size, NULL, GFP_KERNEL, 0)->object_size;
}
/* Above the smaller buckets, size is a multiple of page size. */ if (size && size <= KMALLOC_MAX_SIZE) return PAGE_SIZE << get_order(size);
/* * Return 'size' for 0 - kmalloc() returns ZERO_SIZE_PTR * and very large size - kmalloc() may fail.
*/ return size;
/* * kmalloc_info[] is to make slab_debug=,kmalloc-xx option work at boot time. * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is * kmalloc-2M.
*/ conststruct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(0, 0),
INIT_KMALLOC_INFO(96, 96),
INIT_KMALLOC_INFO(192, 192),
INIT_KMALLOC_INFO(8, 8),
INIT_KMALLOC_INFO(16, 16),
INIT_KMALLOC_INFO(32, 32),
INIT_KMALLOC_INFO(64, 64),
INIT_KMALLOC_INFO(128, 128),
INIT_KMALLOC_INFO(256, 256),
INIT_KMALLOC_INFO(512, 512),
INIT_KMALLOC_INFO(1024, 1k),
INIT_KMALLOC_INFO(2048, 2k),
INIT_KMALLOC_INFO(4096, 4k),
INIT_KMALLOC_INFO(8192, 8k),
INIT_KMALLOC_INFO(16384, 16k),
INIT_KMALLOC_INFO(32768, 32k),
INIT_KMALLOC_INFO(65536, 64k),
INIT_KMALLOC_INFO(131072, 128k),
INIT_KMALLOC_INFO(262144, 256k),
INIT_KMALLOC_INFO(524288, 512k),
INIT_KMALLOC_INFO(1048576, 1M),
INIT_KMALLOC_INFO(2097152, 2M)
};
/*
 * Patch up the size_index table if we have strange large alignment
 * requirements for the kmalloc array. This is only the case for
 * MIPS it seems. The standard arches will not generate any code here.
 *
 * Largest permitted alignment is 256 bytes due to the way we
 * handle the index determination for the smaller caches.
 *
 * Make sure that nothing crazy happens if someone starts tinkering
 * around with ARCH_KMALLOC_MINALIGN
 */
void __init setup_kmalloc_cache_index_table(void)
{
	unsigned int i;

	/* Redirect every size below the minimum to the smallest real cache. */
	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
		unsigned int elem = size_index_elem(i);

		if (elem >= ARRAY_SIZE(kmalloc_size_index))
			break;
		kmalloc_size_index[elem] = KMALLOC_SHIFT_LOW;
	}

	if (KMALLOC_MIN_SIZE >= 64) {
		/*
		 * The 96 byte sized cache is not used if the alignment
		 * is 64 byte.
		 */
		for (i = 64 + 8; i <= 96; i += 8)
			kmalloc_size_index[size_index_elem(i)] = 7;
	}

	if (KMALLOC_MIN_SIZE >= 128) {
		/*
		 * The 192 byte sized cache is not used if the alignment
		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
		 * instead.
		 */
		for (i = 128 + 8; i <= 192; i += 8)
			kmalloc_size_index[size_index_elem(i)] = 8;
	}
}
if (!kmalloc_caches[type][aligned_idx])
kmalloc_caches[type][aligned_idx] = create_kmalloc_cache(
kmalloc_info[aligned_idx].name[type],
aligned_size, flags); if (idx != aligned_idx)
kmalloc_caches[type][idx] = kmalloc_caches[type][aligned_idx];
}
/*
 * Create the kmalloc array. Some of the regular kmalloc arrays
 * may already have been created because they were needed to
 * enable allocations for slab creation.
 */
void __init create_kmalloc_caches(void)
{
	int i;
	enum kmalloc_cache_type type;

	/*
	 * Including KMALLOC_CGROUP if CONFIG_MEMCG defined
	 */
	for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
		/* Caches that are NOT of the two-to-the-power-of size. */
		if (KMALLOC_MIN_SIZE <= 32)
			new_kmalloc_cache(1, type);
		if (KMALLOC_MIN_SIZE <= 64)
			new_kmalloc_cache(2, type);

		/* Caches that are of the two-to-the-power-of size. */
		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
			new_kmalloc_cache(i, type);
	}
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
	random_kmalloc_seed = get_random_u64();
#endif

	/* Kmalloc array is now usable */
	slab_state = UP;

	if (IS_ENABLED(CONFIG_SLAB_BUCKETS))
		kmem_buckets_cache = kmem_cache_create("kmalloc_buckets",
						       sizeof(kmem_buckets),
						       0, SLAB_NO_MERGE, NULL);
}
/** * __ksize -- Report full size of underlying allocation * @object: pointer to the object * * This should only be used internally to query the true size of allocations. * It is not meant to be a way to discover the usable size of an allocation * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS, * and/or FORTIFY_SOURCE. * * Return: size of the actual memory used by @object in bytes
*/
size_t __ksize(constvoid *object)
{ struct folio *folio;
if (unlikely(object == ZERO_SIZE_PTR)) return 0;
folio = virt_to_folio(object);
if (unlikely(!folio_test_slab(folio))) { if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE)) return 0; if (WARN_ON(object != folio_address(folio))) return 0; return folio_size(folio);
}
/*
 * Emit the /proc/slabinfo header line describing the column layout.
 *
 * NOTE(review): the extracted source had the angle-bracketed column names
 * stripped from the string literals (likely eaten as HTML tags). Restored
 * to the upstream header strings — confirm against the canonical tree.
 */
static void print_slabinfo_header(struct seq_file *m)
{
	/*
	 * Output format version, so at least we can change it
	 * without _too_ many complaints.
	 */
	seq_puts(m, "slabinfo - version: 2.1\n");
	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
	seq_putc(m, '\n');
}
/* * Here acquiring slab_mutex is risky since we don't prefer to get * sleep in oom path. But, without mutex hold, it may introduce a * risk of crash. * Use mutex_trylock to protect the list traverse, dump nothing * without acquiring the mutex.
*/ if (!mutex_trylock(&slab_mutex)) {
pr_warn("excessive unreclaimable slab but cannot dump stats\n"); return;
}
pr_info("Unreclaimable slab info:\n");
pr_info("Name Used Total\n");
list_for_each_entry(s, &slab_caches, list) { if (s->flags & SLAB_RECLAIM_ACCOUNT) continue;
/** * kfree_sensitive - Clear sensitive information in memory before freeing * @p: object to free memory of * * The memory of the object @p points to is zeroed before freed. * If @p is %NULL, kfree_sensitive() does nothing. * * Note: this function zeroes the whole allocated buffer which can be a good * deal bigger than the requested buffer size passed to kmalloc(). So be * careful when using this function in performance sensitive code.
*/ void kfree_sensitive(constvoid *p)
{
size_t ks; void *mem = (void *)p;
size_t ksize(constvoid *objp)
{ /* * We need to first check that the pointer to the object is valid. * The KASAN report printed from ksize() is more useful, then when * it's printed later when the behaviour could be undefined due to * a potential use-after-free or double-free. * * We use kasan_check_byte(), which is supported for the hardware * tag-based KASAN mode, unlike kasan_check_read/write(). * * If the pointed to memory is invalid, we return 0 to avoid users of * ksize() writing to and potentially corrupting the memory region. * * We want to perform the check before __ksize(), to avoid potentially * crashing in __ksize() due to accessing invalid metadata.
*/ if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) return 0;
/*
 * This rcu parameter is runtime-read-only. It reflects
 * a minimum allowed number of objects which can be cached
 * per-CPU. Object size is equal to one page. This value
 * can be changed at boot time.
 */
static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);
// A page shrinker can ask for pages to be freed to make them
// available for other parts of the system. This usually happens
// under low memory conditions, and in that case we should also
// defer page-cache filling for a short time period.
//
// The default value is 5 seconds, which is long enough to reduce
// interference with the shrinker while it asks other systems to
// drain their caches.
static int rcu_delay_page_cache_fill_msec = 5000;
module_param(rcu_delay_page_cache_fill_msec, int, 0444);

/* Workqueue on which deferred kvfree_rcu() reclaim work is queued. */
static struct workqueue_struct *rcu_reclaim_wq;
/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (5 * HZ)
#define KFREE_N_BATCHES 2
#define FREE_N_CHANNELS 2
/**
 * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
 * @list: List node. All blocks are linked between each other
 * @gp_snap: Snapshot of RCU state for objects placed to this bulk
 * @nr_records: Number of active pointers in the array
 * @records: Array of the kvfree_rcu() pointers
 */
struct kvfree_rcu_bulk_data {
	struct list_head list;
	struct rcu_gp_oldstate gp_snap;
	unsigned long nr_records;
	void *records[] __counted_by(nr_records);
};
/*
 * This macro defines how many entries the "records" array
 * will contain. It is based on the fact that the size of
 * kvfree_rcu_bulk_data structure becomes exactly one page.
 */
#define KVFREE_BULK_MAX_ENTR \
	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
/** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure
*/
/** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period * @head_gp_snap: Snapshot of RCU state for objects placed to "@head" * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @initialized: The @rcu_work fields have been initialized * @head_count: Number of objects in rcu_head singular list * @bulk_count: Number of objects in bulk-list * @bkvcache: * A simple cache list that contains objects for reuse purpose. * In order to save some per-cpu space the list is singular. * Even though it is lockless an access has to be protected by the * per-cpu lock. * @page_cache_work: A work to refill the cache when it is empty * @backoff_page_cache_fill: Delay cache refills * @work_in_progress: Indicates that page_cache_work is running * @hrtimer: A hrtimer for scheduling a page_cache_work * @nr_bkv_objs: number of allocated objects at @bkvcache. * * This is a per-CPU structure. The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from * the RCU files. Such extraction could allow further optimization of * the interactions with the slab allocators.
*/ struct kfree_rcu_cpu { // Objects queued on a linked list // through their rcu_head structures. struct rcu_head *head; unsignedlong head_gp_snap;
atomic_t head_count;
// Objects queued on a bulk-list. struct list_head bulk_head[FREE_N_CHANNELS];
atomic_t bulk_count[FREE_N_CHANNELS];
/* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bulk_head_free or ->head_free.
*/ staticvoid kfree_rcu_work(struct work_struct *work)
{ unsignedlong flags; struct kvfree_rcu_bulk_data *bnode, *n; struct list_head bulk_head[FREE_N_CHANNELS]; struct rcu_head *head; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; struct rcu_gp_oldstate head_gp_snap; int i;
raw_spin_lock_irqsave(&krcp->lock, flags); // Channels 1 and 2. for (i = 0; i < FREE_N_CHANNELS; i++)
list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
// Handle the first two channels. for (i = 0; i < FREE_N_CHANNELS; i++) { // Start from the tail page, so a GP is likely passed for it.
list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
kvfree_rcu_bulk(krcp, bnode, i);
}
/* * This is used when the "bulk" path can not be used for the * double-argument of kvfree_rcu(). This happens when the * page-cache is empty, which means that objects are instead * queued on a linked list through their rcu_head structures. * This list is named "Channel 3".
*/ if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
kvfree_rcu_list(head);
}
staticbool
need_offload_krc(struct kfree_rcu_cpu *krcp)
{ int i;
for (i = 0; i < FREE_N_CHANNELS; i++) if (!list_empty(&krcp->bulk_head[i])) returntrue;
return !!READ_ONCE(krcp->head);
}
staticbool
need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
{ int i;
for (i = 0; i < FREE_N_CHANNELS; i++) if (!list_empty(&krwp->bulk_head_free[i])) returntrue;
return !!krwp->head_free;
}
/*
 * Total number of objects currently queued on @krcp: the rcu_head list
 * count plus every per-channel bulk count.
 */
static int krc_count(struct kfree_rcu_cpu *krcp)
{
	int sum = atomic_read(&krcp->head_count);
	int i;

	for (i = 0; i < FREE_N_CHANNELS; i++)
		sum += atomic_read(&krcp->bulk_count[i]);

	return sum;
}
staticvoid
__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
{ long delay, delay_left;
for (i = 0; i < FREE_N_CHANNELS; i++) {
list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
kvfree_rcu_bulk(krcp, bnode, i);
}
if (head_ready)
kvfree_rcu_list(head_ready);
}
/* * Return: %true if a work is queued, %false otherwise.
*/ staticbool
kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
{ unsignedlong flags; bool queued = false; int i, j;
raw_spin_lock_irqsave(&krcp->lock, flags);
// Attempt to start a new batch. for (i = 0; i < KFREE_N_BATCHES; i++) { struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
// Try to detach bulk_head or head and attach it, only when // all channels are free. Any channel is not free means at krwp // there is on-going rcu work to handle krwp's free business. if (need_wait_for_krwp_work(krwp)) continue;
// kvfree_rcu_drain_ready() might handle this krcp, if so give up. if (need_offload_krc(krcp)) { // Channel 1 corresponds to the SLAB-pointer bulk path. // Channel 2 corresponds to vmalloc-pointer bulk path. for (j = 0; j < FREE_N_CHANNELS; j++) { if (list_empty(&krwp->bulk_head_free[j])) {
atomic_set(&krcp->bulk_count[j], 0);
list_replace_init(&krcp->bulk_head[j],
&krwp->bulk_head_free[j]);
}
}
// Channel 3 corresponds to both SLAB and vmalloc // objects queued on the linked list. if (!krwp->head_free) {
krwp->head_free = krcp->head;
get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
atomic_set(&krcp->head_count, 0);
WRITE_ONCE(krcp->head, NULL);
}
// One work is per one batch, so there are three // "free channels", the batch can handle. Break // the loop since it is done with this CPU thus // queuing an RCU work is _always_ success here.
queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
WARN_ON_ONCE(!queued); break;
}
}
/*
 * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
 */
static void kfree_rcu_monitor(struct work_struct *work)
{
	struct kfree_rcu_cpu *krcp = container_of(work,
		struct kfree_rcu_cpu, monitor_work.work);

	// Drain ready for reclaim.
	kvfree_rcu_drain_ready(krcp);

	// Queue a batch for a rest.
	kvfree_rcu_queue_batch(krcp);

	// If there is nothing to detach, it means that our job is
	// successfully done here. In case of having at least one
	// of the channels that is still busy we should rearm the
	// work to repeat an attempt. Because previous batches are
	// still in progress.
	if (need_offload_krc(krcp))
		schedule_delayed_monitor_work(krcp);
}
// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() // state specified by flags. If can_alloc is true, the caller must // be schedulable and not be holding any locks or mutexes that might be // acquired by the memory allocator or anything that it might invoke. // Returns true if ptr was successfully recorded, else the caller must // use a fallback. staticinlinebool
add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, unsignedlong *flags, void *ptr, bool can_alloc)
{ struct kvfree_rcu_bulk_data *bnode; int idx;
*krcp = krc_this_cpu_lock(flags); if (unlikely(!(*krcp)->initialized)) returnfalse;
/* Check if a new block is required. */ if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
bnode = get_cached_bnode(*krcp); if (!bnode && can_alloc) {
krc_this_cpu_unlock(*krcp, *flags);
// __GFP_NORETRY - allows a light-weight direct reclaim // what is OK from minimizing of fallback hitting point of // view. Apart of that it forbids any OOM invoking what is // also beneficial since we are about to release memory soon. // // __GFP_NOMEMALLOC - prevents from consuming of all the // memory reserves. Please note we have a fallback path. // // __GFP_NOWARN - it is supposed that an allocation can // be failed under low memory or high memory pressure // scenarios.
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
}
if (!bnode) returnfalse;
// Initialize the new block and attach it.
bnode->nr_records = 0;
list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
}
// Finally insert and update the GP for this page.
bnode->nr_records++;
bnode->records[bnode->nr_records - 1] = ptr;
get_state_synchronize_rcu_full(&bnode->gp_snap);
atomic_inc(&(*krcp)->bulk_count[idx]);
if (need_offload_krc(krcp))
schedule_delayed_monitor_work(krcp);
}
}
/* * Queue a request for lazy invocation of the appropriate free routine * after a grace period. Please note that three paths are maintained, * two for the common case using arrays of pointers and a third one that * is used only when the main paths cannot be used, for example, due to * memory pressure. * * Each kvfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will * be free'd in workqueue context. This allows us to: batch requests together to * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
*/ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{ unsignedlong flags; struct kfree_rcu_cpu *krcp; bool success;
/* * Please note there is a limitation for the head-less * variant, that is why there is a clear rule for such * objects: it can be used from might_sleep() context * only. For other places please embed an rcu_head to * your data.
*/ if (!head)
might_sleep();
// Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak.
WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
__func__, head);
// Take a snapshot for this krcp.
krcp->head_gp_snap = get_state_synchronize_rcu();
success = true;
}
/* * The kvfree_rcu() caller considers the pointer freed at this point * and likely removes any references to it. Since the actual slab * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore * this object (no scanning or false positives reporting).
*/
kmemleak_ignore(ptr);
// Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
__schedule_delayed_monitor_work(krcp);
unlock_return:
krc_this_cpu_unlock(krcp, flags);
/* * Inline kvfree() after synchronize_rcu(). We can do * it from might_sleep() context only, so the current * CPU can pass the QS state.
*/ if (!success) {
debug_rcu_head_unqueue((struct rcu_head *) ptr);
synchronize_rcu();
kvfree(ptr);
}
}
EXPORT_SYMBOL_GPL(kvfree_call_rcu);
/**
 * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
 *
 * Note that a single argument of kvfree_rcu() call has a slow path that
 * triggers synchronize_rcu() following by freeing a pointer. It is done
 * before the return from the function. Therefore for any single-argument
 * call that will result in a kfree() to a cache that is to be destroyed
 * during module exit, it is developer's responsibility to ensure that all
 * such calls have returned before the call to kmem_cache_destroy().
 */
void kvfree_rcu_barrier(void)
{
	struct kfree_rcu_cpu_work *krwp;
	struct kfree_rcu_cpu *krcp;
	bool queued;
	int i, cpu;

	/*
	 * Firstly we detach objects and queue them over an RCU-batch
	 * for all CPUs. Finally queued works are flushed for each CPU.
	 *
	 * Please note. If there are outstanding batches for a particular
	 * CPU, those have to be finished first following by queuing a new.
	 */
	for_each_possible_cpu(cpu) {
		krcp = per_cpu_ptr(&krc, cpu);

		/*
		 * Check if this CPU has any objects which have been queued
		 * for a new GP completion. If not (means nothing to detach),
		 * we are done with it. If any batch is pending/running for
		 * this "krcp", below per-cpu flush_rcu_work() waits its
		 * completion (see last step).
		 */
		if (!need_offload_krc(krcp))
			continue;

		while (1) {
			/*
			 * If we are not able to queue a new RCU work it means:
			 * - batches for this CPU are still in flight which
			 *   should be flushed first and then repeat;
			 * - no objects to detach, because of concurrency.
			 */
			queued = kvfree_rcu_queue_batch(krcp);

			/*
			 * Bail out, if there is no need to offload this
			 * "krcp" anymore. As noted earlier it can run
			 * concurrently.
			 */
			if (queued || !need_offload_krc(krcp))
				break;

			/* There are ongoing batches. */
			for (i = 0; i < KFREE_N_BATCHES; i++) {
				krwp = &(krcp->krw_arr[i]);
				flush_rcu_work(&krwp->rcu_work);
			}
		}
	}

	/*
	 * Now we guarantee that all objects are flushed.
	 */
	for_each_possible_cpu(cpu) {
		krcp = per_cpu_ptr(&krc, cpu);

		/*
		 * A monitor work can drain ready to reclaim objects
		 * directly. Wait its completion if running or pending.
		 */
		cancel_delayed_work_sync(&krcp->monitor_work);

		for (i = 0; i < KFREE_N_BATCHES; i++) {
			krwp = &(krcp->krw_arr[i]);
			flush_rcu_work(&krwp->rcu_work);
		}
	}
}
EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.