// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <linux/mm.h>
#include <linux/llist.h>
#include <linux/bpf.h>
#include <linux/irq_work.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <asm/local.h>
/* Any context (including NMI) BPF specific memory allocator.
 *
 * Tracing BPF programs can attach to kprobe and fentry. Hence they
 * run in unknown context where calling plain kmalloc() might not be safe.
 *
 * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
 * Refill this cache asynchronously from irq_work.
 *
 * CPU_0 buckets
 * 16 32 64 96 128 196 256 512 1024 2048 4096
 * ...
 * CPU_N buckets
 * 16 32 64 96 128 196 256 512 1024 2048 4096
 *
 * The buckets are prefilled at the start.
 * BPF programs always run with migration disabled.
 * It's safe to allocate from the cache of the current cpu with irqs disabled.
 * Freeing is always done into the bucket of the current cpu as well.
 * irq_work trims extra free elements from buckets with kfree
 * and refills them with kmalloc, so the global kmalloc logic takes care
 * of freeing objects allocated by one cpu and freed on another.
 *
 * Every allocated object is padded with extra 8 bytes that contain
 * struct llist_node.
 */
#define LLIST_NODE_SZ sizeof(struct llist_node)
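
/* Illustrative usage sketch for the fixed-size path (not part of the original
 * file): a consumer like a hash map sizes one bucket per cpu at init time.
 * bpf_mem_cache_alloc()/bpf_mem_cache_free() are the fixed-size entry points
 * declared in <linux/bpf_mem_alloc.h> but not shown in this excerpt;
 * 'struct my_elem' is a hypothetical element type.
 *
 *	struct bpf_mem_alloc ma;
 *	void *elem;
 *
 *	bpf_mem_alloc_init(&ma, sizeof(struct my_elem), false);
 *	elem = bpf_mem_cache_alloc(&ma);	// safe in any context, including NMI
 *	...
 *	bpf_mem_cache_free(&ma, elem);		// back into this cpu's bucket
 *	bpf_mem_alloc_destroy(&ma);
 */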
struct bpf_mem_cache {
	/* per-cpu list of free objects of size 'unit_size'.
	 * All accesses are done with interrupts disabled and 'active' counter
	 * protection with __llist_add() and __llist_del_first().
	 */
	struct llist_head free_llist;
	local_t active;

	/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
	 * are sequenced by per-cpu 'active' counter. But unit_free() cannot
	 * fail. When 'active' is busy the unit_free() will add an object to
	 * free_llist_extra.
	 */
	struct llist_head free_llist_extra;

	struct irq_work refill_work;
	struct obj_cgroup *objcg;
	int unit_size;
	/* count of objects in free_llist */
	int free_cnt;
	int low_watermark, high_watermark, batch;
	int percpu_size;
	bool draining;
	struct bpf_mem_cache *tgt;

	/* list of objects to be freed after RCU GP */
	struct llist_head free_by_rcu;
	struct llist_node *free_by_rcu_tail;
	struct llist_head waiting_for_gp;
	struct llist_node *waiting_for_gp_tail;
	struct rcu_head rcu;
	atomic_t call_rcu_in_progress;
	struct llist_head free_llist_extra_rcu;

	/* list of objects to be freed after RCU tasks trace GP */
	struct llist_head free_by_rcu_ttrace;
	struct llist_head waiting_for_gp_ttrace;
	struct rcu_head rcu_ttrace;
atomic_t call_rcu_ttrace_in_progress;
};
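
/* Illustrative layout of one element (derived from the code below, not part of
 * the original file):
 *
 *	+----------------------------------+------------------------------+
 *	| 8 byte header (LLIST_NODE_SZ):   | user visible object,         |
 *	| struct llist_node while free,    | returned LLIST_NODE_SZ past  |
 *	| bpf_mem_cache pointer while used | the header by bpf_mem_alloc()|
 *	+----------------------------------+------------------------------+
 *
 * unit_alloc() stores the owning bpf_mem_cache pointer into the header so that
 * unit_free() can later recover it as the c->tgt hint.
 */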
static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		/* In RT irq_work runs in per-cpu kthread, so disable
		 * interrupts to avoid preemption and interrupts and
		 * reduce the chance of bpf prog executing on this cpu
		 * when active counter is busy.
		 */
		local_irq_save(*flags);

	/* alloc_bulk runs from irq_work which will not preempt a bpf
	 * program that does unit_alloc/unit_free since IRQs are
	 * disabled there. There is no race to increment 'active'
	 * counter. It protects free_llist from corruption in case NMI
	 * bpf prog preempted this loop.
	 */
WARN_ON_ONCE(local_inc_return(&c->active) != 1);
}
	for (i = 0; i < cnt; i++) {
		/*
		 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
		 * done only by one CPU == current CPU. Other CPUs might
		 * llist_add() and llist_del_all() in parallel.
		 */
		obj = llist_del_first(&c->free_by_rcu_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	for (; i < cnt; i++) {
		obj = llist_del_first(&c->waiting_for_gp_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	memcg = get_memcg(c);
	old_memcg = set_active_memcg(memcg);
	for (; i < cnt; i++) {
		/* Allocate, but don't deplete atomic reserves that typical
		 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
		 * will allocate from the current numa node which is what we
		 * want here.
		 */
		obj = __alloc(c, node, gfp);
		if (!obj)
			break;
add_obj_to_free_list(c, obj);
}
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
}
static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	/* If RCU Tasks Trace grace period implies RCU grace period,
	 * there is no need to invoke call_rcu().
	 */
	if (rcu_trace_implies_rcu_gp())
		__free_rcu(head);
	else
call_rcu(head, __free_rcu);
}
	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
	 * Nothing races to add to free_by_rcu_ttrace list.
*/
llist_add(llnode, &c->free_by_rcu_ttrace);
}
if (unlikely(READ_ONCE(c->draining))) {
		__free_rcu(&c->rcu_ttrace);
		return;
}
	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * If RCU Tasks Trace grace period implies RCU grace period, free
	 * these elements directly, else use call_rcu() to wait for normal
	 * progs to finish and finally do free_one() on each element.
*/
call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
}
	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
		/*
		 * Instead of kmalloc-ing new rcu_head and triggering 10k
		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
		 * the overload just ask RCU to hurry up. There could be many
		 * objects in free_by_rcu list.
		 * This hint reduces memory consumption for an artificial
		 * benchmark from 2 Gbyte to 150 Mbyte.
*/
		rcu_request_urgent_qs_task(current);
		return;
}
/* Racy access to free_cnt. It doesn't need to be 100% accurate */
	cnt = c->free_cnt;
	if (cnt < c->low_watermark)
		/* irq_work runs on this cpu and kmalloc will allocate
		 * from the current numa node which is what we want here.
*/
		alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
	else if (cnt > c->high_watermark)
free_bulk(c);
/* For the typical bpf map case that uses bpf_mem_cache_alloc and a single bucket
 * the freelist cache will be elem_size * 64 (or less) on each cpu.
 *
 * For bpf programs that don't have statically known allocation sizes and
 * assuming (low_mark + high_mark) / 2 as an average number of elements per
 * bucket and all buckets are used the total amount of memory in freelists
 * on each cpu will be:
 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
 * == ~ 116 Kbyte using the heuristic below.
 * An initialized but unused bpf allocator (not a bpf map specific one) will
 * consume ~ 11 Kbyte per cpu.
 * The typical case will be between 11K and 116K, closer to 11K.
 * bpf progs can and should share bpf_mem_cache when possible.
 *
 * Percpu allocation is typically rare. To avoid unnecessarily large memory
 * consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
 */
static void init_refill_work(struct bpf_mem_cache *c)
{
	init_irq_work(&c->refill_work, bpf_mem_refill);
	if (c->percpu_size) {
c->low_watermark = 1;
c->high_watermark = 3;
	} else if (c->unit_size <= 256) {
c->low_watermark = 32;
c->high_watermark = 96;
	} else {
		/* When page_size == 4k, order-0 cache will have low_mark == 2
		 * and high_mark == 6 with batch alloc of 3 individual pages at
		 * a time.
		 * 8k allocs and above low == 1, high == 3, batch == 1.
*/
c->low_watermark = max(32 * 256 / c->unit_size, 1);
c->high_watermark = max(96 * 256 / c->unit_size, 3);
}
c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
}
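
/* Worked example of the heuristic above (illustrative, not in the original
 * source): a non-percpu bucket with unit_size == 1024 gets
 * low_watermark  = max(32 * 256 / 1024, 1) = 8,
 * high_watermark = max(96 * 256 / 1024, 3) = 24,
 * batch          = max((24 - 8) / 4 * 3, 1) = 12,
 * i.e. on average roughly (8 + 24) / 2 * 1024 bytes ~= 16 Kbyte of free
 * elements kept per cpu for that bucket.
 */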
static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
	int cnt = 1;

	/* To avoid consuming memory, for non-percpu allocation, assume that
	 * 1st run of bpf prog won't be doing more than 4 map_update_elem from
	 * irq disabled region if unit size is less than or equal to 256.
	 * For all other cases, let us just do one allocation.
	 */
	if (!c->percpu_size && c->unit_size <= 256)
cnt = 4;
alloc_bulk(c, cnt, cpu_to_node(cpu), false);
}
/* When size != 0 allocate a bpf_mem_cache for each cpu.
 * This is the typical bpf hash map use case when all elements have equal size.
 *
 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
 * kmalloc/kfree. Max allocation size is 4096 in this case.
 * This is the bpf_dynptr and bpf_kptr use case.
 */
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_caches __percpu *pcc;
	struct bpf_mem_cache *c;
	struct bpf_mem_cache __percpu *pc;
	struct obj_cgroup *objcg = NULL;
	int cpu, i, unit_size, percpu_size = 0;

	if (percpu && size == 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	if (percpu)
percpu_size = LLIST_NODE_SZ + sizeof(void *);
ma->percpu = percpu;
if (size) {
		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
		if (!pc)
			return -ENOMEM;

if (!percpu)
size += LLIST_NODE_SZ; /* room for llist_node */
unit_size = size;
	/* No progs are using this bpf_mem_cache, but htab_map_free() called
	 * bpf_mem_cache_free() for all remaining elements and they can be in
	 * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
	 *
	 * Except for waiting_for_gp_ttrace list, there are no concurrent operations
	 * on these lists, so it is safe to use __llist_del_all().
*/
free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
free_all(__llist_del_all(&c->free_llist), percpu);
free_all(__llist_del_all(&c->free_llist_extra), percpu);
free_all(__llist_del_all(&c->free_by_rcu), percpu);
free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
free_all(llist_del_all(&c->waiting_for_gp), percpu);
}
if (ma->cache) {
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(ma->cache, cpu);
check_mem_cache(c);
}
	}
	if (ma->caches) {
for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
c = &cc->cache[i];
check_mem_cache(c);
}
}
}
}
static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
	 * might still execute. Wait for them.
	 *
	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
	 * so if call_rcu(head, __free_rcu) is skipped due to
	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
	 * using rcu_trace_implies_rcu_gp() as well.
*/
rcu_barrier(); /* wait for __free_by_rcu */
	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
	if (!rcu_trace_implies_rcu_gp())
rcu_barrier();
free_mem_alloc_no_barrier(ma);
}
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i, rcu_in_progress;

if (ma->cache) {
rcu_in_progress = 0;
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(ma->cache, cpu);
WRITE_ONCE(c->draining, true);
irq_work_sync(&c->refill_work);
drain_mem_cache(c);
rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
	}
	if (ma->caches) {
rcu_in_progress = 0;
for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
c = &cc->cache[i];
WRITE_ONCE(c->draining, true);
irq_work_sync(&c->refill_work);
drain_mem_cache(c);
rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
}
obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
}
/* notrace is necessary here and in other functions to make sure
 * bpf programs cannot attach to them and cause llist corruptions.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
	struct llist_node *llnode = NULL;
	unsigned long flags;
	int cnt = 0;

	/* Disable irqs to prevent the following race for majority of prog types:
	 * prog_A
	 *   bpf_mem_alloc
	 *      preemption or irq -> prog_B
	 *        bpf_mem_alloc
	 *
	 * but prog_B could be a perf_event NMI prog.
	 * Use per-cpu 'active' counter to order free_list access between
	 * unit_alloc/unit_free/bpf_mem_refill.
*/
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		llnode = __llist_del_first(&c->free_llist);
		if (llnode) {
cnt = --c->free_cnt;
*(struct bpf_mem_cache **)llnode = c;
}
}
local_dec(&c->active);
WARN_ON(cnt < 0);
if (cnt < c->low_watermark)
		irq_work_raise(c);
	/* Enable IRQ after the enqueue of the irq work completes, so the irq
	 * work will run after IRQ is enabled and free_llist may be refilled by
	 * the irq work before another task preempts the current task.
*/
local_irq_restore(flags);
return llnode;
}
/* Though the 'ptr' object could have been allocated on a different cpu,
 * add it to the free_llist of the current cpu.
 * Let kfree() logic deal with it when it's later called from irq_work.
 */
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;
	int cnt = 0;

BUILD_BUG_ON(LLIST_NODE_SZ > 8);
	/*
	 * Remember bpf_mem_cache that allocated this object.
	 * The hint is not accurate.
*/
c->tgt = *(struct bpf_mem_cache **)llnode;
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
__llist_add(llnode, &c->free_llist);
cnt = ++c->free_cnt;
	} else {
		/* unit_free() cannot fail. Therefore add an object to atomic
		 * llist. free_bulk() will drain it. Though free_llist_extra is
		 * a per-cpu list we have to use atomic llist_add here, since
		 * it also can be interrupted by bpf nmi prog that does another
		 * unit_free() into the same free_llist_extra.
*/
llist_add(llnode, &c->free_llist_extra);
}
local_dec(&c->active);
	if (cnt > c->high_watermark)
		/* free few objects from current cpu into global kmalloc pool */
		irq_work_raise(c);
	/* Enable IRQ after irq_work_raise() completes, otherwise when the
	 * current task is preempted by a task which does unit_alloc(),
	 * unit_alloc() may return NULL unexpectedly because the irq work is
	 * already pending but cannot be triggered and free_llist cannot be
	 * refilled in time.
*/
local_irq_restore(flags);
}
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		if (__llist_add(llnode, &c->free_by_rcu))
c->free_by_rcu_tail = llnode;
} else {
llist_add(llnode, &c->free_llist_extra_rcu);
}
local_dec(&c->active);
if (!atomic_read(&c->call_rcu_in_progress))
irq_work_raise(c);
local_irq_restore(flags);
}
/* Called from BPF program or from sys_bpf syscall.
 * In both cases migration is disabled.
 */
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
	int idx;
	void *ret;

	if (!size)
		return NULL;

if (!ma->percpu)
size += LLIST_NODE_SZ;
	idx = bpf_mem_cache_idx(size);
	if (idx < 0)
		return NULL;

	ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}
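
/* Illustrative caller sketch for the dynamic-size path (not part of the
 * original file): bpf_mem_alloc() is assumed to be paired with bpf_mem_free()
 * from <linux/bpf_mem_alloc.h>, which is not shown in this excerpt; 'len' is
 * hypothetical.
 *
 *	void *p = bpf_mem_alloc(ma, len);	// bucket fitting len + 8 byte header
 *	if (p)
 *		bpf_mem_free(ma, p);		// back into the current cpu's bucket
 */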
/* Directly does a kfree() without putting 'ptr' back to the free_llist
 * for reuse and without waiting for a rcu_tasks_trace gp.
 * The caller must first go through the rcu_tasks_trace gp for 'ptr'
 * before calling bpf_mem_cache_raw_free().
 * It could be used when the rcu_tasks_trace callback does not have
 * a hold on the original bpf_mem_alloc object that allocated the
 * 'ptr'. This should only be used in the uncommon code path.
 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
 * and may affect performance.
 */
void bpf_mem_cache_raw_free(void *ptr)
{
	if (!ptr)
		return;

kfree(ptr - LLIST_NODE_SZ);
}
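
/* Illustrative sketch of the intended calling pattern (not part of the
 * original file): the caller first waits for a rcu_tasks_trace grace period
 * and only then hands the object to bpf_mem_cache_raw_free(). 'struct my_obj'
 * and its embedded 'rcu' head are hypothetical.
 *
 *	static void my_free_cb(struct rcu_head *head)
 *	{
 *		struct my_obj *obj = container_of(head, struct my_obj, rcu);
 *
 *		bpf_mem_cache_raw_free(obj);
 *	}
 *	...
 *	call_rcu_tasks_trace(&obj->rcu, my_free_cb);
 */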
/* When flags == GFP_KERNEL, it signals that the caller will not cause
 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
 * kmalloc if the free_llist is empty.
 */
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
	struct bpf_mem_cache *c;
	void *ret;

c = this_cpu_ptr(ma->cache);
	ret = unit_alloc(c);
	if (!ret && flags == GFP_KERNEL) {
		struct mem_cgroup *memcg, *old_memcg;