/*
 * The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits where perf, kprobes and tracing
 * can be invoked from the lock operations need to be protected against
 * deadlocks. Deadlocks can be caused by recursion and by an invocation in
 * the lock held section when functions which acquire this lock are invoked
 * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
 * variable bpf_prog_active, which prevents BPF programs attached to perf
 * events, kprobes and tracing to be invoked before the prior invocation
 * from one of these contexts completed. sys_bpf() uses the same mechanism
 * by pinning the task to the current CPU and incrementing the recursion
 * protection across the map operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure the deterministic behaviour these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
 * it is only safe to use raw spinlock for preallocated hash map on a RT kernel,
 * because there is no memory allocation within the lock held sections. However
 * after hash map was fully converted to use bpf_mem_alloc, there will be
 * non-synchronous memory allocation for non-preallocated hash map, so it is
 * safe to always use raw spinlock for bucket lock.
 *
 * struct bucket - one slot of the hash table.
 * @head:     head of this bucket's nulls-terminated element chain; the
 *            nulls marker is set to the bucket index when the table is
 *            initialised, which lets lockless walkers detect that they
 *            ended up on a different bucket's chain.
 * @raw_lock: per-bucket lock, set up with raw_res_spin_lock_init() and
 *            taken through htab_lock_bucket(), whose return value callers
 *            check because acquisition can fail.
 */
struct bucket {
	struct hlist_nulls_head head;
	rqspinlock_t raw_lock;
};
/*
 * struct bpf_htab - state of one BPF hash map instance.
 *
 * Embeds the generic struct bpf_map as @map; code in this file recovers the
 * htab from a map pointer with container_of().
 */
struct bpf_htab {
	struct bpf_map map;
	/* allocator for whole elements of non-preallocated maps (elem_size) */
	struct bpf_mem_alloc ma;
	/* allocator for per-cpu values of non-preallocated per-cpu maps */
	struct bpf_mem_alloc pcpu_ma;
	/* array of n_buckets buckets */
	struct bucket *buckets;
	/* base of the preallocated element area, indexed in elem_size strides */
	void *elems;
	/* source of free preallocated elements: plain freelist or LRU */
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	/* per-cpu pointer to one spare element, swapped in on in-place replace */
	struct htab_elem *__percpu *extra_elems;
	/* number of elements in non-preallocated hashtable are kept
	 * in either pcount or count
	 */
	struct percpu_counter pcount;
	atomic_t count;
	/* true when pcount (rather than count) holds the element count */
	bool use_percpu_counter;
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
	/* seed passed to htab_map_hash() together with the key */
	u32 hashrnd;
};
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	/*
	 * While linked into a bucket the element uses hash_node. Once it is
	 * off the bucket chain the same storage is reused either as a
	 * freelist node (fnode) or as the link of the deferred-free list
	 * built by the batch operation (batch_flink).
	 * NOTE(review): padding presumably keeps fnode/batch_flink from
	 * overlapping the first pointer of hash_node - confirm.
	 */
	union {
		struct hlist_nulls_node hash_node;
		struct {
			void *padding;
			union {
				struct pcpu_freelist_node fnode;
				struct htab_elem *batch_flink;
			};
		};
	};
	union {
		/* pointer to per-cpu pointer */
		void *ptr_to_pptr;
		/* LRU bookkeeping node for LRU maps */
		struct bpf_lru_node lru_node;
	};
	/* hash of the key, compared before the key bytes during lookup */
	u32 hash;
	/* key bytes start here, 8-byte aligned */
	char key[] __aligned(8);
};
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
raw_res_spin_lock_init(&htab->buckets[i].raw_lock);
cond_resched();
}
}
staticstruct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{ return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}
/* Both percpu and fd htab support in-place update, so no need for * extra elem. LRU itself can remove the least used element, so * there is no need for an extra elem during map_update.
*/ staticbool htab_has_extra_elems(struct bpf_htab *htab)
{ return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
}
if (IS_ERR_OR_NULL(htab->map.record)) return; if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus(); for (i = 0; i < num_entries; i++) { struct htab_elem *elem;
elem = get_htab_elem(htab, i); if (htab_is_percpu(htab)) { void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); int cpu;
/* The LRU list has a lock (lru_lock). Each htab bucket has a lock * (bucket_lock). If both locks need to be acquired together, the lock * order is always lru_lock -> bucket_lock and this only happens in * bpf_lru_list.c logic. For example, certain code path of * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), * will acquire lru_lock first followed by acquiring bucket_lock. * * In hashtab.c, to avoid deadlock, lock acquisition of * bucket_lock followed by lru_lock is not allowed. In such cases, * bucket_lock needs to be released first before acquiring lru_lock.
*/ staticstruct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
u32 hash)
{ struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); struct htab_elem *l;
if (node) {
bpf_map_inc_elem_count(&htab->map);
l = container_of(node, struct htab_elem, lru_node);
memcpy(l->key, key, htab->map.key_size); return l;
}
for_each_possible_cpu(cpu) {
l = pcpu_freelist_pop(&htab->freelist); /* pop will succeed, since prealloc_init() * preallocated extra num_possible_cpus elements
*/
l_new = container_of(l, struct htab_elem, fnode);
*per_cpu_ptr(pptr, cpu) = l_new;
}
htab->extra_elems = pptr; return 0;
}
/* Called from syscall */ staticint htab_map_alloc_check(union bpf_attr *attr)
{ bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has * nothing to do with the map's value.
*/ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); int numa_node = bpf_map_attr_numa_node(attr);
if (zero_seed && !capable(CAP_SYS_ADMIN)) /* Guard against local DoS, and discourage production use. */ return -EPERM;
if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
!bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL;
if (!lru && percpu_lru) return -EINVAL;
if (lru && !prealloc) return -ENOTSUPP;
if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) return -EINVAL;
/* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set
*/ if (attr->max_entries == 0 || attr->key_size == 0 ||
attr->value_size == 0) return -EINVAL;
if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE - sizeof(struct htab_elem)) /* if key_size + value_size is bigger, the user space won't be * able to access the elements via bpf syscall. This check * also makes sure that the elem_size doesn't overflow and it's * kmalloc-able later in htab_map_update_elem()
*/ return -E2BIG; /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE) return -E2BIG;
return 0;
}
staticstruct bpf_map *htab_map_alloc(union bpf_attr *attr)
{ bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has * nothing to do with the map's value.
*/ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; int err;
htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&htab->map, attr);
if (percpu_lru) { /* ensure each CPU's lru list has >=1 elements. * since we are at it, make each lru list has the same * number of elements.
*/
htab->map.max_entries = roundup(attr->max_entries,
num_possible_cpus()); if (htab->map.max_entries < attr->max_entries)
htab->map.max_entries = rounddown(attr->max_entries,
num_possible_cpus());
}
/* hash table size must be power of 2; roundup_pow_of_two() can overflow * into UB on 32-bit arches, so check that first
*/
err = -E2BIG; if (htab->map.max_entries > 1UL << 31) goto free_htab;
/* compute_batch_value() computes batch value as num_online_cpus() * 2 * and __percpu_counter_compare() needs * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus() * for percpu_counter to be faster than atomic_t. In practice the average bpf * hash map size is 10k, which means that a system with 64 cpus will fill * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore * define our own batch count as 32 then 10k hash map can be filled up to 80%: * 10k - 8k > 32 _batch_ * 64 _cpus_ * and __percpu_counter_compare() will still be fast. At that point hash map * collisions will dominate its performance anyway. Assume that hash map filled * to 50+% isn't going to be O(1) and use the following formula to choose * between percpu_counter and atomic_t.
*/ #define PERCPU_COUNTER_BATCH 32 if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
htab->use_percpu_counter = true;
if (htab->use_percpu_counter) {
err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL); if (err) goto free_map_locked;
}
if (prealloc) {
err = prealloc_init(htab); if (err) goto free_map_locked;
if (htab_has_extra_elems(htab)) {
err = alloc_extra_elems(htab); if (err) goto free_prealloc;
}
} else {
err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false); if (err) goto free_map_locked; if (percpu) {
err = bpf_mem_alloc_init(&htab->pcpu_ma,
round_up(htab->map.value_size, 8), true); if (err) goto free_map_locked;
}
}
/* this lookup function can only be called with bucket lock taken */ staticstruct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, void *key, u32 key_size)
{ struct hlist_nulls_node *n; struct htab_elem *l;
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l->hash == hash && !memcmp(&l->key, key, key_size)) return l;
return NULL;
}
/* can be called without bucket lock. it will repeat the loop in * the unlikely event when elements moved from one bucket into another * while link list is being walked
*/ staticstruct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
u32 hash, void *key,
u32 key_size, u32 n_buckets)
{ struct hlist_nulls_node *n; struct htab_elem *l;
again:
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l->hash == hash && !memcmp(&l->key, key, key_size)) return l;
if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) goto again;
return NULL;
}
/* Called from syscall or from eBPF program directly, so * arguments have to match bpf_map_lookup_elem() exactly. * The return value is adjusted by BPF instructions * in htab_map_gen_lookup().
*/ staticvoid *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{ struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; struct htab_elem *l;
u32 hash, key_size;
/* It is called from the bpf_lru_list when the LRU needs to delete * older elements from the htab.
*/ staticbool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{ struct bpf_htab *htab = arg; struct htab_elem *l = NULL, *tgt_l; struct hlist_nulls_head *head; struct hlist_nulls_node *n; unsignedlong flags; struct bucket *b; int ret;
tgt_l = container_of(node, struct htab_elem, lru_node);
b = __select_bucket(htab, tgt_l->hash);
head = &b->head;
ret = htab_lock_bucket(b, &flags); if (ret) returnfalse;
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
bpf_map_dec_elem_count(&htab->map); break;
}
htab_unlock_bucket(b, flags);
if (l == tgt_l)
check_and_free_fields(htab, l); return l == tgt_l;
}
/* Called from syscall */ staticint htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{ struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; struct htab_elem *l, *next_l;
u32 hash, key_size; int i = 0;
/* lookup the key */
l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
if (!l) goto find_first_elem;
/* key was found, get next key in the same bucket */
next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), struct htab_elem, hash_node);
if (next_l) { /* if next elem in this hash list is non-zero, just return it */
memcpy(next_key, next_l->key, key_size); return 0;
}
/* no more elements in this hash list, go to the next bucket */
i = hash & (htab->n_buckets - 1);
i++;
find_first_elem: /* iterate over buckets */ for (; i < htab->n_buckets; i++) {
head = select_bucket(htab, i);
/* pick first element in the bucket */
next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), struct htab_elem, hash_node); if (next_l) { /* if it's not empty, just return it */
memcpy(next_key, next_l->key, key_size); return 0;
}
}
/* iterated over all buckets and all elements */ return -ENOENT;
}
for_each_possible_cpu(cpu) {
copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
off += size;
}
}
}
/*
 * Initialise the per-cpu value area @pptr of a newly inserted element.
 *
 * With @onallcpus set, the work is delegated to pcpu_copy_value(). Without
 * it, only the current CPU's slot receives @value and every other possible
 * CPU's slot is zeroed.
 */
static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	/* When not setting the initial value on all cpus, zero-fill element
	 * values for other cpus. Otherwise, bpf program has no way to ensure
	 * known initial values for cpus other than current one
	 * (onallcpus=false always when coming from bpf prog).
	 */
	if (!onallcpus) {
		int current_cpu = raw_smp_processor_id();
		int cpu;

		for_each_possible_cpu(cpu) {
			if (cpu == current_cpu)
				copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
			else /* Since elem is preallocated, we cannot touch special fields */
				zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
		}
	} else {
		pcpu_copy_value(htab, pptr, value, onallcpus);
	}
}
if (prealloc) { if (old_elem) { /* if we're updating the existing element, * use per-cpu extra elems to avoid freelist_pop/push
*/
pl_new = this_cpu_ptr(htab->extra_elems);
l_new = *pl_new;
*pl_new = old_elem;
} else { struct pcpu_freelist_node *l;
l = __pcpu_freelist_pop(&htab->freelist); if (!l) return ERR_PTR(-E2BIG);
l_new = container_of(l, struct htab_elem, fnode);
bpf_map_inc_elem_count(&htab->map);
}
} else { if (is_map_full(htab)) if (!old_elem) /* when map is full and update() is replacing * old element, it's ok to allocate, since * old element will be freed immediately. * Otherwise return an error
*/ return ERR_PTR(-E2BIG);
inc_elem_count(htab);
l_new = bpf_mem_cache_alloc(&htab->ma); if (!l_new) {
l_new = ERR_PTR(-ENOMEM); goto dec_count;
}
}
if (unlikely(map_flags & BPF_F_LOCK)) { if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; /* find an element without taking the bucket lock */
l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
htab->n_buckets);
ret = check_flags(htab, l_old, map_flags); if (ret) return ret; if (l_old) { /* grab the element lock and update value in place */
copy_map_value_locked(map,
htab_elem_value(l_old, key_size),
value, false); return 0;
} /* fall through, grab the bucket lock and lookup again. * 99.9% chance that the element won't be found, * but second lookup under lock has to be done.
*/
}
ret = htab_lock_bucket(b, &flags); if (ret) return ret;
ret = check_flags(htab, l_old, map_flags); if (ret) goto err;
if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { /* first lookup without the bucket lock didn't find the element, * but second lookup with the bucket lock found it. * This case is highly unlikely, but has to be dealt with: * grab the element lock in addition to the bucket lock * and update element in place
*/
copy_map_value_locked(map,
htab_elem_value(l_old, key_size),
value, false);
ret = 0; goto err;
}
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
l_old); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */
ret = PTR_ERR(l_new); goto err;
}
/* add new element to the head of the list, so that * concurrent search will find it before old elem
*/
hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) {
hlist_nulls_del_rcu(&l_old->hash_node);
/* l_old has already been stashed in htab->extra_elems, free * its special fields before it is available for reuse.
*/ if (htab_is_prealloc(htab))
check_and_free_fields(htab, l_old);
}
htab_unlock_bucket(b, flags); if (l_old && !htab_is_prealloc(htab))
free_htab_elem(htab, l_old); return 0;
err:
htab_unlock_bucket(b, flags); return ret;
}
/* For LRU, we need to alloc before taking bucket's * spinlock because getting free nodes from LRU may need * to remove older elements from htab and this removal * operation will need a bucket lock.
*/
l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM;
copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value);
ret = htab_lock_bucket(b, &flags); if (ret) goto err_lock_bucket;
ret = check_flags(htab, l_old, map_flags); if (ret) goto err;
/* add new element to the head of the list, so that * concurrent search will find it before old elem
*/
hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) {
bpf_lru_node_set_ref(&l_new->lru_node);
hlist_nulls_del_rcu(&l_old->hash_node);
}
ret = 0;
err:
htab_unlock_bucket(b, flags);
err_lock_bucket: if (ret)
htab_lru_push_free(htab, l_new); elseif (l_old)
htab_lru_push_free(htab, l_old);
/* For LRU, we need to alloc before taking bucket's * spinlock because LRU's elem alloc may need * to remove older elem from htab and this removal * operation will need a bucket lock.
*/ if (map_flags != BPF_EXIST) {
l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM;
}
ret = htab_lock_bucket(b, &flags); if (ret) goto err_lock_bucket;
hash = htab_map_hash(key, key_size, htab->hashrnd);
b = __select_bucket(htab, hash);
head = &b->head;
ret = htab_lock_bucket(b, &flags); if (ret) return ret;
l = lookup_elem_raw(head, hash, key, key_size);
if (l)
hlist_nulls_del_rcu(&l->hash_node); else
ret = -ENOENT;
htab_unlock_bucket(b, flags); if (l)
htab_lru_push_free(htab, l); return ret;
}
staticvoid delete_all_elements(struct bpf_htab *htab)
{ int i;
/* It's called from a worker thread and migration has been disabled, * therefore, it is OK to invoke bpf_mem_cache_free() directly.
*/ for (i = 0; i < htab->n_buckets; i++) { struct hlist_nulls_head *head = select_bucket(htab, i); struct hlist_nulls_node *n; struct htab_elem *l;
/*
 * Release the timer and workqueue fields embedded in the value of every
 * element currently linked into the table.
 *
 * The elements themselves stay in place. The whole walk runs under
 * rcu_read_lock(), with cond_resched_rcu() called after each bucket.
 */
static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
{
	int i;

	rcu_read_lock();
	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry(l, n, head, hash_node) {
			/* We only free timer on uref dropping to zero */
			if (btf_record_has_field(htab->map.record, BPF_TIMER))
				bpf_obj_free_timer(htab->map.record,
						   htab_elem_value(l, htab->map.key_size));
			if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
				bpf_obj_free_workqueue(htab->map.record,
						       htab_elem_value(l, htab->map.key_size));
		}
		cond_resched_rcu();
	}
	rcu_read_unlock();
}
/* We only free timer and workqueue on uref dropping to zero */ if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { if (!htab_is_prealloc(htab))
htab_free_malloced_timers_and_wq(htab); else
htab_free_prealloced_timers_and_wq(htab);
}
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall
 *
 * Tears the map down: first the elements (individually for non-preallocated
 * maps, as one preallocated area otherwise), then the element counter, the
 * per-cpu spare-element pointers, the bucket array, both memory allocators,
 * the per-cpu counter if one was in use, and finally the htab itself.
 */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
	 * There is no need to synchronize_rcu() here to protect map elements.
	 */

	/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
	 * underneath and is responsible for waiting for callbacks to finish
	 * during bpf_mem_alloc_destroy().
	 */
	if (!htab_is_prealloc(htab)) {
		delete_all_elements(htab);
	} else {
		htab_free_prealloced_fields(htab);
		prealloc_destroy(htab);
	}

	bpf_map_free_elem_count(map);
	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	bpf_mem_alloc_destroy(&htab->pcpu_ma);
	bpf_mem_alloc_destroy(&htab->ma);
	if (htab->use_percpu_counter)
		percpu_counter_destroy(&htab->pcount);
	bpf_map_area_free(htab);
}
key_size = htab->map.key_size;
value_size = htab->map.value_size;
size = round_up(value_size, 8); if (is_percpu)
value_size = size * num_possible_cpus();
total = 0; /* while experimenting with hash tables with sizes ranging from 10 to * 1000, it was observed that a bucket can have up to 5 entries.
*/
bucket_size = 5;
alloc: /* We cannot do copy_from_user or copy_to_user inside * the rcu_read_lock. Allocate enough space here.
*/
keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN); if (!keys || !values) {
ret = -ENOMEM; goto after_loop;
}
again:
bpf_disable_instrumentation();
rcu_read_lock();
again_nocopy:
dst_key = keys;
dst_val = values;
b = &htab->buckets[batch];
head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). */ if (locked) {
ret = htab_lock_bucket(b, &flags); if (ret) {
rcu_read_unlock();
bpf_enable_instrumentation(); goto after_loop;
}
}
bucket_cnt = 0;
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
bucket_cnt++;
if (bucket_cnt > (max_count - total)) { if (total == 0)
ret = -ENOSPC; /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it.
*/
htab_unlock_bucket(b, flags);
rcu_read_unlock();
bpf_enable_instrumentation(); goto after_loop;
}
if (bucket_cnt > bucket_size) {
bucket_size = bucket_cnt; /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it.
*/
htab_unlock_bucket(b, flags);
rcu_read_unlock();
bpf_enable_instrumentation();
kvfree(keys);
kvfree(values); goto alloc;
}
/* Next block is only safe to run if you have grabbed the lock */ if (!locked) goto next_batch;
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
memcpy(dst_key, l->key, key_size);
if (is_percpu) { int off = 0, cpu; void __percpu *pptr;
/* Actual value is the id of the inner map */
map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
value = &map_id;
}
if (elem_map_flags & BPF_F_LOCK)
copy_map_value_locked(map, dst_val, value, true); else
copy_map_value(map, dst_val, value); /* Zeroing special fields in the temp buffer */
check_and_init_map_value(map, dst_val);
} if (do_delete) {
hlist_nulls_del_rcu(&l->hash_node);
/* bpf_lru_push_free() will acquire lru_lock, which * may cause deadlock. See comments in function * prealloc_lru_pop(). Let us do bpf_lru_push_free() * after releasing the bucket lock. * * For htab of maps, htab_put_fd_value() in * free_htab_elem() may acquire a spinlock with bucket * lock being held and it violates the lock rule, so * invoke free_htab_elem() after unlock as well.
*/
l->batch_flink = node_to_free;
node_to_free = l;
}
dst_key += key_size;
dst_val += value_size;
}
htab_unlock_bucket(b, flags);
locked = false;
while (node_to_free) {
l = node_to_free;
node_to_free = node_to_free->batch_flink; if (is_lru_map)
htab_lru_push_free(htab, l); else
free_htab_elem(htab, l);
}
next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu.
*/ if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
batch++; goto again_nocopy;
}
rcu_read_unlock();
bpf_enable_instrumentation(); if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
key_size * bucket_cnt) ||
copy_to_user(uvalues + total * value_size, values,
value_size * bucket_cnt))) {
ret = -EFAULT; goto after_loop;
}
total += bucket_cnt;
batch++; if (batch >= htab->n_buckets) {
ret = -ENOENT; goto after_loop;
} goto again;
after_loop: if (ret == -EFAULT) goto out;
/* copy # of entries and next batch */
ubatch = u64_to_user_ptr(attr->batch.out_batch); if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
put_user(total, &uattr->batch.count))
ret = -EFAULT;
/* try to find next elem in the same bucket */ if (prev_elem) { /* no update/deletion on this bucket, prev_elem should be still valid * and we won't skip elements.
*/
n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); if (elem) return elem;
/* not found, unlock and go to the next bucket */
b = &htab->buckets[bucket_id++];
rcu_read_unlock();
skip_elems = 0;
}
for (i = bucket_id; i < htab->n_buckets; i++) {
b = &htab->buckets[i];
rcu_read_lock();
count = 0;
head = &b->head;
hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { if (count >= skip_elems) {
info->bucket_id = i;
info->skip_elems = count; return elem;
}
count++;
}
/* migration has been disabled, so percpu value prepared here will be * the same as the one seen by the bpf program with * bpf_map_lookup_elem().
*/ for (i = 0; i < htab->n_buckets; i++) {
b = &htab->buckets[i];
rcu_read_lock();
head = &b->head;
hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
key = elem->key; if (is_percpu) { /* current cpu value for percpu map */
pptr = htab_elem_get_ptr(elem, map->key_size);
val = this_cpu_ptr(pptr);
} else {
val = htab_elem_value(elem, map->key_size);
}
num_elems++;
ret = callback_fn((u64)(long)map, (u64)(long)key,
(u64)(long)val, (u64)(long)callback_ctx, 0); /* return value: 0 - continue, 1 - stop and return */ if (ret) {
rcu_read_unlock(); goto out;
}
}
rcu_read_unlock();
}
out: return num_elems;
}
l = __htab_map_lookup_elem(map, key); if (l) {
bpf_lru_node_set_ref(&l->lru_node); return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
}
return NULL;
}
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{ struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; int cpu, off = 0;
u32 size;
/* per_cpu areas are zero-filled and bpf programs can only * access 'value_size' of them, so copying rounded areas * will not leak any kernel data
*/
size = round_up(map->value_size, 8);
rcu_read_lock();
l = __htab_map_lookup_elem(map, key); if (!l) goto out; /* We do not mark LRU map element here in order to not mess up * eviction heuristics when user space does a map walk.
*/
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
check_and_init_map_value(map, value + off);
off += size;
}
ret = 0;
out:
rcu_read_unlock(); return ret;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.