// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
#include <linux/bpf_verifier.h>
#include <linux/bsearch.h>
#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/cred.h>
#include <linux/timekeeping.h>
#include <linux/ctype.h>
#include <linux/nospec.h>
#include <linux/audit.h>
#include <uapi/linux/btf.h>
#include <linux/pgtable.h>
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
#include <linux/tracepoint.h>
#include <linux/overflow.h>
#include <linux/cookie.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
#include <net/tcx.h>
/* True when the map type is one of: perf event array, cgroup array or
 * array-of-maps. Note that the argument is evaluated more than once.
 */
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
/* True when the map is a program array. */
#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
/* True when the map is a hash-of-maps. */
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
/* Union of the three predicates above. bpf_map_value_size() reports a
 * u32-sized value for every map that matches.
 */
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
IS_FD_HASH(map))
/* Both access-mode flag bits (BPF_F_RDONLY and BPF_F_WRONLY) combined. */
#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
/* Per-CPU int with external linkage; its users are outside this chunk. */
DEFINE_PER_CPU(
int, bpf_prog_active);
/* Cookie generator instance named bpf_map_cookie. */
DEFINE_COOKIE(bpf_map_cookie);
/* ID registries for programs, maps and links. map_idr is accessed under
 * map_idr_lock in bpf_map_alloc_id()/bpf_map_free_id(); the prog and link
 * pairs presumably follow the same pattern (NOTE(review): confirm against
 * their users, which are not in this chunk).
 */
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);
static DEFINE_IDR(link_idr);
static DEFINE_SPINLOCK(link_idr_lock);
/* Starts at 2 when CONFIG_BPF_UNPRIV_DEFAULT_OFF is built in, else 0.
 * NOTE(review): the meaning of each value is defined by the sysctl
 * handler, which is not shown here.
 */
int sysctl_unprivileged_bpf_disabled __read_mostly =
IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
/* Table of map operations indexed by map type id.
 *
 * It is filled by including <linux/bpf_types.h> with BPF_MAP_TYPE()
 * expanding to a designated initializer, while BPF_PROG_TYPE() and
 * BPF_LINK_TYPE() expand to nothing. The helper macros are undefined again
 * right after the include. Ids without a BPF_MAP_TYPE() entry stay NULL.
 */
static const struct bpf_map_ops *
const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
[_id] = &_ops,
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};
/*
 * Verify that the part of a caller-supplied struct that this kernel does
 * not know about is all zeroes, i.e. that newer user space is not relying
 * on a feature extension we cannot honour.
 *
 * @uaddr:         start of the struct (kernel or user pointer)
 * @expected_size: size of the struct as known to this kernel
 * @actual_size:   size of the struct as passed by the caller
 *
 * Returns 0 when there is no tail or the tail is all zero, -E2BIG when
 * @actual_size exceeds PAGE_SIZE or the tail holds a non-zero byte, or the
 * negative error reported by check_zeroed_user().
 *
 * There is a ToCToU window between this check and the copy_from_user()
 * that follows it. That is acceptable because the check only exists to
 * future-proof the unknown bits.
 */
int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
			     size_t expected_size,
			     size_t actual_size)
{
	size_t tail_len;
	int zeroed;

	/* silly large */
	if (unlikely(actual_size > PAGE_SIZE))
		return -E2BIG;

	/* Nothing beyond what we understand, so nothing to check. */
	if (actual_size <= expected_size)
		return 0;

	tail_len = actual_size - expected_size;

	if (uaddr.is_kernel) {
		zeroed = memchr_inv(uaddr.kernel + expected_size, 0,
				    tail_len) == NULL;
	} else {
		zeroed = check_zeroed_user(uaddr.user + expected_size,
					   tail_len);
		if (zeroed < 0)
			return zeroed;
	}

	return zeroed ? 0 : -E2BIG;
}
/* Map operations table for offloaded maps. Only the callbacks listed here
 * are set; every other member of the ops structure is left zero. BTF is
 * refused through map_check_no_btf().
 */
const struct bpf_map_ops bpf_map_offload_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = bpf_map_offload_map_alloc,
.map_free = bpf_map_offload_map_free,
.map_check_btf = map_check_no_btf,
.map_mem_usage = bpf_map_offload_map_mem_usage,
};
/* Atomically add one to the map's writecnt counter. */
static void bpf_map_write_active_inc(struct bpf_map *map)
{
	atomic64_inc(&map->writecnt);
}
/* Atomically subtract one from the map's writecnt counter. */
static void bpf_map_write_active_dec(struct bpf_map *map)
{
	atomic64_dec(&map->writecnt);
}
/* Report whether the map's writecnt counter is currently non-zero. */
bool bpf_map_write_active(const struct bpf_map *map)
{
	return !!atomic64_read(&map->writecnt);
}
/* Size in bytes of one element value as handled by the code in this file.
 *
 * - per-CPU map types: value_size rounded up to 8, times the number of
 *   possible CPUs;
 * - maps matching IS_FD_MAP(): sizeof(u32);
 * - everything else: the map's own value_size.
 */
static u32 bpf_map_value_size(const struct bpf_map *map)
{
	switch (map->map_type) {
	case BPF_MAP_TYPE_PERCPU_HASH:
	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
	case BPF_MAP_TYPE_PERCPU_ARRAY:
	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
		return round_up(map->value_size, 8) * num_possible_cpus();
	default:
		break;
	}

	if (IS_FD_MAP(map))
		return sizeof(u32);

	return map->value_size;
}
/* For map-in-map types (hash-of-maps and array-of-maps), wait for an RCU
 * grace period; for every other map type do nothing.
 *
 * The wait lets any running non-sleepable BPF programs complete, so that
 * once we return to userspace it knows all such programs that could be
 * running use the new map value. Sleepable programs would need
 * synchronize_rcu_tasks_trace() instead, but that wait can be very long
 * and userspace may think it hangs forever, so they are deliberately not
 * handled for now.
 */
static void maybe_wait_bpf_programs(struct bpf_map *map)
{
	switch (map->map_type) {
	case BPF_MAP_TYPE_HASH_OF_MAPS:
	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
		synchronize_rcu();
		break;
	default:
		break;
	}
}
/* Unpin the page backing kernel address @kaddr; a NULL @kaddr is ignored. */
static void unpin_uptr_kaddr(void *kaddr)
{
	if (!kaddr)
		return;

	unpin_user_page(virt_to_page(kaddr));
}
/* Walk the first @cnt fields of @rec and unpin the page behind every
 * BPF_UPTR field stored in @obj. Fields of other types are skipped.
 */
static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj)
{
	int i;

	for (i = 0; i < cnt; i++) {
		const struct btf_field *field = &rec->fields[i];
		void **slot;

		if (field->type != BPF_UPTR)
			continue;

		slot = obj + field->offset;
		unpin_uptr_kaddr(*slot);
	}
}
/* Unpin every uptr field of @obj described by @rec. Does nothing when the
 * record has no BPF_UPTR field.
 */
static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj)
{
	if (btf_record_has_field(rec, BPF_UPTR))
		__bpf_obj_unpin_uptrs(rec, rec->cnt, obj);
}
/* Pin the user page behind every non-zero BPF_UPTR field of @obj and
 * replace the stored user address with the matching kernel address.
 *
 * Returns 0 on success or when @rec has no uptr field. On failure, fields
 * before the failing one are unpinned again and an error is returned:
 * -EFAULT if the end address overflows, -EOPNOTSUPP if the pointed-to
 * struct crosses a page boundary or the page is in highmem, otherwise the
 * value pin_user_pages_fast() returned.
 */
static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
{
	int idx, err;

	if (!btf_record_has_field(rec, BPF_UPTR))
		return 0;

	for (idx = 0; idx < rec->cnt; idx++) {
		const struct btf_field *field = &rec->fields[idx];
		unsigned long uaddr, last;
		const struct btf_type *t;
		struct page *page;
		void **slot;

		if (field->type != BPF_UPTR)
			continue;

		slot = obj + field->offset;
		uaddr = *(unsigned long *)slot;
		if (!uaddr)
			continue;

		t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id);
		/* t->size was checked for zero before */
		if (check_add_overflow(uaddr, t->size - 1, &last)) {
			err = -EFAULT;
			goto unpin_all;
		}

		/* The uptr's struct cannot span across two pages */
		if ((uaddr & PAGE_MASK) != (last & PAGE_MASK)) {
			err = -EOPNOTSUPP;
			goto unpin_all;
		}

		err = pin_user_pages_fast(uaddr, 1, FOLL_LONGTERM | FOLL_WRITE, &page);
		if (err != 1)
			goto unpin_all;

		if (PageHighMem(page)) {
			err = -EOPNOTSUPP;
			unpin_user_page(page);
			goto unpin_all;
		}

		*slot = page_address(page) + offset_in_page(uaddr);
	}

	return 0;

unpin_all:
	/* Roll back only the fields handled before the failing one. */
	__bpf_obj_unpin_uptrs(rec, idx, obj);
	return err;
}
/* Update one element of @map, dispatching on the map type.
 *
 * Offloaded maps, CPUMAP/ARENA/STRUCT_OPS, SOCKHASH/SOCKMAP and program
 * arrays return straight from their handler, before instrumentation is
 * disabled. Every other type runs between bpf_disable_instrumentation()
 * and bpf_enable_instrumentation(). QUEUE/STACK/BLOOM_FILTER maps ignore
 * @key and push @value. The generic fallback first pins any uptr fields
 * found in @value, calls ->map_update_elem() under rcu_read_lock(), and
 * unpins them again if the update failed.
 *
 * @map_file is only forwarded to the fd-array and fd-htab handlers.
 *
 * Returns whatever the selected handler returned.
 */
static int bpf_map_update_value(
struct bpf_map *map,
struct file *map_file,
void *key,
void *value, __u64 flags)
{
int err;
/* Need to create a kthread, thus must support schedule */
if (bpf_map_is_offloaded(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
}
else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
map->map_type == BPF_MAP_TYPE_ARENA ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
return map->ops->map_update_elem(map, key, value, flags);
}
else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
map->map_type == BPF_MAP_TYPE_SOCKMAP) {
return sock_map_update_elem_sys(map, key, value, flags);
}
else if (IS_FD_PROG_ARRAY(map)) {
return bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
}
/* Everything below runs with instrumentation disabled. */
bpf_disable_instrumentation();
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_update(map, key, value, flags);
}
else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, flags);
}
else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
err = bpf_percpu_cgroup_storage_update(map, key, value,
flags);
}
else if (IS_FD_ARRAY(map)) {
err = bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
}
else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
flags);
}
else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
/* rcu_read_lock() is not needed */
err = bpf_fd_reuseport_array_update_elem(map, key, value,
flags);
}
else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK ||
map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
/* These types have no key: the value is pushed. */
err = map->ops->map_push_elem(map, value, flags);
}
else {
/* Generic path: pin uptrs, update under RCU, undo the pin on error. */
err = bpf_obj_pin_uptrs(map->record, value);
if (!err) {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, flags);
rcu_read_unlock();
if (err)
bpf_obj_unpin_uptrs(map->record, value);
}
}
bpf_enable_instrumentation();
return err;
}
/* Look up @key in @map and copy the element's value into @value,
 * dispatching on the map type.
 *
 * Offloaded maps return directly from their handler. All other types run
 * with instrumentation disabled. QUEUE/STACK/BLOOM_FILTER maps ignore @key
 * and peek. The generic fallback performs the lookup under
 * rcu_read_lock(), preferring ->map_lookup_elem_sys_only() when the map
 * provides it, and copies with the element lock held when @flags has
 * BPF_F_LOCK set.
 *
 * Returns 0 on success, -ENOENT when the generic lookup finds nothing, the
 * error encoded in an ERR_PTR lookup result, or the type-specific
 * handler's return value.
 */
static int bpf_map_copy_value(
struct bpf_map *map,
void *key,
void *value,
__u64 flags)
{
void *ptr;
int err;
if (bpf_map_is_offloaded(map))
return bpf_map_offload_lookup_elem(map, key, value);
bpf_disable_instrumentation();
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
}
else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
}
else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
err = bpf_percpu_cgroup_storage_copy(map, key, value);
}
else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
}
else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
err = bpf_fd_array_map_lookup_elem(map, key, value);
}
else if (IS_FD_HASH(map)) {
err = bpf_fd_htab_map_lookup_elem(map, key, value);
}
else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
}
else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK ||
map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
/* These types have no key: peek at the value instead. */
err = map->ops->map_peek_elem(map, value);
}
else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
/* struct_ops map requires directly updating "value" */
err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
}
else {
/* Generic path: look the element up and copy it out under RCU. */
rcu_read_lock();
if (map->ops->map_lookup_elem_sys_only)
ptr = map->ops->map_lookup_elem_sys_only(map, key);
else
ptr = map->ops->map_lookup_elem(map, key);
if (IS_ERR(ptr)) {
err = PTR_ERR(ptr);
}
else if (!ptr) {
err = -ENOENT;
}
else {
err = 0;
if (flags & BPF_F_LOCK)
/* lock 'ptr' and copy everything but lock */
copy_map_value_locked(map, value, ptr,
true);
else
copy_map_value(map, value, ptr);
/* mask lock and timer, since value wasn't zero inited */
check_and_init_map_value(map, value);
}
rcu_read_unlock();
}
bpf_enable_instrumentation();
return err;
}
/* Allocate zeroed memory for a map area.
 *
 * @size:      number of bytes; requests of SIZE_MAX or more fail
 * @numa_node: NUMA node handed to the allocators
 * @mmapable:  the area must be mappable to user space; @size has to be
 *             page aligned in that case and kmalloc is never used
 *
 * Returns the area, or NULL on failure.
 *
 * Please do not use this function outside of the map creation path (e.g.
 * in the map update path) without taking care of setting the active memory
 * cgroup (see bpf_map_kmalloc_node() for an example).
 */
static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
{
	/* We really just want to fail instead of triggering the OOM killer
	 * under memory pressure, therefore __GFP_NORETRY is set for kmalloc,
	 * which serves the lower order requests.
	 *
	 * Higher order requests done by vmalloc with __GFP_NORETRY have been
	 * observed to fail because no attempt is made to reclaim memory from
	 * the page cache, hence __GFP_RETRY_MAYFAIL is used there instead.
	 */
	gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
	unsigned long align = 1;
	unsigned int vm_flags = 0;
	void *mem;

	if (size >= SIZE_MAX)
		return NULL;

	if (mmapable) {
		/* kmalloc()'ed memory can't be mmap()'ed */
		BUG_ON(!PAGE_ALIGNED(size));
		align = SHMLBA;
		vm_flags = VM_USERMAP;
	} else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
		mem = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
				   numa_node);
		if (mem)
			return mem;
		/* kmalloc failed: fall through to vmalloc below. */
	}

	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
			gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
			vm_flags, numa_node, __builtin_return_address(0));
}
/* Allocate a zeroed map area of @size bytes on @numa_node that does not
 * need to be mmap()able. Returns NULL on failure.
 */
void *bpf_map_area_alloc(u64 size, int numa_node)
{
	const bool mmapable = false;

	return __bpf_map_area_alloc(size, numa_node, mmapable);
}
/* Allocate a zeroed, mmap()able map area of @size bytes on @numa_node.
 * @size must be page aligned. Returns NULL on failure.
 */
void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
{
	const bool mmapable = true;

	return __bpf_map_area_alloc(size, numa_node, mmapable);
}
/* Release a map area with kvfree(), which handles both the kmalloc and
 * the vmalloc backed case.
 */
void bpf_map_area_free(void *area)
{
	kvfree(area);
}
/* Strip the per-fd access flags from a set of map creation flags.
 *
 * Some creation flags belong to the map fd rather than to the map object.
 * Several fds with different access properties can refer to one map, so
 * these bits carry no meaning when the object itself is inspected and are
 * cleared here.
 *
 * Returns @flags without BPF_F_RDONLY and BPF_F_WRONLY.
 */
static u32 bpf_map_flags_retain_permanent(u32 flags)
{
	const u32 fd_only = BPF_F_RDONLY | BPF_F_WRONLY;

	return flags & ~fd_only;
}
/* Fill in the common fields of @map from the creation attributes @attr.
 *
 * map_flags is stored without the per-fd access flags, and the NUMA node
 * is taken from bpf_map_attr_numa_node().
 */
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
{
	/* Copied verbatim. */
	map->map_type    = attr->map_type;
	map->key_size    = attr->key_size;
	map->value_size  = attr->value_size;
	map->max_entries = attr->max_entries;
	map->map_extra   = attr->map_extra;

	/* Derived. */
	map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
	map->numa_node = bpf_map_attr_numa_node(attr);
}
/* Allocate an id for @map from map_idr (cyclic, range [1, INT_MAX)) and
 * store it in map->id.
 *
 * Returns 0 on success, the negative error from idr_alloc_cyclic() on
 * failure, or -ENOSPC (with a one-time warning) should the allocator
 * unexpectedly hand out id 0.
 */
static int bpf_map_alloc_id(struct bpf_map *map)
{
	int new_id;

	idr_preload(GFP_KERNEL);
	spin_lock_bh(&map_idr_lock);
	new_id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
	if (new_id > 0)
		map->id = new_id;
	spin_unlock_bh(&map_idr_lock);
	idr_preload_end();

	if (WARN_ON_ONCE(new_id == 0))
		return -ENOSPC;
	if (new_id < 0)
		return new_id;

	return 0;
}
/* Remove @map from map_idr and reset map->id to 0. Does nothing when the
 * map has no id.
 */
void bpf_map_free_id(struct bpf_map *map)
{
	unsigned long irq_flags;

	/* Offloaded maps are removed from the IDR store when their device
	 * disappears. Even if someone still holds an fd to them they are
	 * unusable: the memory is gone and all ops will fail. They are simply
	 * waiting for refcnt to drop so they can be freed.
	 */
	if (map->id == 0)
		return;

	spin_lock_irqsave(&map_idr_lock, irq_flags);
	idr_remove(&map_idr, map->id);
	map->id = 0;
	spin_unlock_irqrestore(&map_idr_lock, irq_flags);
}
#ifdef CONFIG_MEMCG
/* Record the current task's object cgroup in map->objcg, provided memcg
 * accounting for BPF is enabled. Otherwise map->objcg is left untouched.
 */
static void bpf_map_save_memcg(struct bpf_map *map)
{
	if (!memcg_bpf_enabled())
		return;

	/* Currently, if a map is created by a process belonging to the root
	 * memory cgroup, get_obj_cgroup_from_current() will return NULL.
	 * So map->objcg has to be checked for NULL every time it is used.
	 */
	map->objcg = get_obj_cgroup_from_current();
}
/* Drop the object cgroup reference stored in map->objcg, if there is one. */
static void bpf_map_release_memcg(struct bpf_map *map)
{
	if (!map->objcg)
		return;

	obj_cgroup_put(map->objcg);
}
/* Return the memory cgroup to charge for @map: the one derived from
 * map->objcg when that is set, root_mem_cgroup otherwise. Every caller in
 * this file pairs the result with mem_cgroup_put().
 */
static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
{
	if (!map->objcg)
		return root_mem_cgroup;

	return get_mem_cgroup_from_objcg(map->objcg);
}
/* kmalloc_node() with the allocation accounted (__GFP_ACCOUNT) while the
 * map's memory cgroup is the active one. The previously active memcg is
 * restored before returning.
 *
 * Returns the allocation, or NULL on failure.
 */
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
			   int node)
{
	struct mem_cgroup *map_memcg = bpf_map_get_memcg(map);
	struct mem_cgroup *saved_memcg;
	void *mem;

	saved_memcg = set_active_memcg(map_memcg);
	mem = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
	set_active_memcg(saved_memcg);
	mem_cgroup_put(map_memcg);

	return mem;
}
/* kzalloc() with the allocation accounted (__GFP_ACCOUNT) while the map's
 * memory cgroup is the active one. The previously active memcg is restored
 * before returning.
 *
 * Returns the zeroed allocation, or NULL on failure.
 */
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
	struct mem_cgroup *map_memcg = bpf_map_get_memcg(map);
	struct mem_cgroup *saved_memcg;
	void *mem;

	saved_memcg = set_active_memcg(map_memcg);
	mem = kzalloc(size, flags | __GFP_ACCOUNT);
	set_active_memcg(saved_memcg);
	mem_cgroup_put(map_memcg);

	return mem;
}
/* kvcalloc() of @n elements of @size bytes, accounted (__GFP_ACCOUNT)
 * while the map's memory cgroup is the active one. The previously active
 * memcg is restored before returning.
 *
 * Returns the allocation, or NULL on failure.
 */
void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
		       gfp_t flags)
{
	struct mem_cgroup *map_memcg = bpf_map_get_memcg(map);
	struct mem_cgroup *saved_memcg;
	void *mem;

	saved_memcg = set_active_memcg(map_memcg);
	mem = kvcalloc(n, size, flags | __GFP_ACCOUNT);
	set_active_memcg(saved_memcg);
	mem_cgroup_put(map_memcg);

	return mem;
}
/* __alloc_percpu_gfp() with the allocation accounted (__GFP_ACCOUNT) while
 * the map's memory cgroup is the active one. The previously active memcg
 * is restored before returning.
 *
 * Returns the per-CPU allocation, or NULL on failure.
 */
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
				    size_t align, gfp_t flags)
{
	struct mem_cgroup *map_memcg = bpf_map_get_memcg(map);
	struct mem_cgroup *saved_memcg;
	void __percpu *mem;

	saved_memcg = set_active_memcg(map_memcg);
	mem = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
	set_active_memcg(saved_memcg);
	mem_cgroup_put(map_memcg);

	return mem;
}
#else
/* Without CONFIG_MEMCG there is no cgroup to remember: intentionally empty. */
static void bpf_map_save_memcg(struct bpf_map *map)
{
}
/* Without CONFIG_MEMCG there is no cgroup to release: intentionally empty. */
static void bpf_map_release_memcg(struct bpf_map *map)
{
}
#endif
static bool can_alloc_pages(
void)
{
return preempt_count() == 0 && !irqs_disabled() &&
!IS_ENABLED(CONFIG_PREEMPT_RT);
}
static struct page *__bpf_alloc_page(
int nid)
{
if (!can_alloc_pages())
return alloc_pages_nolock(nid, 0);
return alloc_pages_node(nid,
GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
| __GFP_NOWARN,
0);
}
/* Allocate @nr_pages single pages on node @nid and store them in @pages.
 *
 * With CONFIG_MEMCG the map's memory cgroup is made active for the
 * duration of the loop and restored afterwards. The operation is all or
 * nothing: if any allocation fails, the pages obtained so far are freed
 * again and -ENOMEM is returned. Returns 0 on success.
 */
int bpf_map_alloc_pages(
const struct bpf_map *map,
int nid,
unsigned long nr_pages,
struct page **pages)
{
unsigned long i, j;
struct page *pg;
int ret = 0;
#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg, *old_memcg;
memcg = bpf_map_get_memcg(map);
old_memcg = set_active_memcg(memcg);
#endif
for (i = 0; i < nr_pages; i++) {
pg = __bpf_alloc_page(nid);
if (pg) {
pages[i] = pg;
continue;
}
/* Allocation i failed: release pages [0, i) and give up. */
for (j = 0; j < i; j++)
free_pages_nolock(pages[j], 0);
ret = -ENOMEM;
break;
}
#ifdef CONFIG_MEMCG
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
#endif
return ret;
}
/* Comparison callback ordering two btf_field entries by their offset.
 * Returns -1, 0 or 1.
 */
static int btf_field_cmp(const void *a, const void *b)
{
	const struct btf_field *lhs = a;
	const struct btf_field *rhs = b;

	if (lhs->offset == rhs->offset)
		return 0;

	return lhs->offset < rhs->offset ? -1 : 1;
}
/* Find the field of @rec located at @offset whose type is in @field_mask.
 *
 * Returns NULL when @rec is NULL or an error pointer, when the record has
 * no field of the requested types, when no field sits at @offset, or when
 * the field found there is of a different type.
 *
 * The search key is the address of @offset, which btf_field_cmp() reads as
 * a btf_field. NOTE(review): this relies on 'offset' being the first
 * member of struct btf_field - confirm against the struct definition.
 */
struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
				  u32 field_mask)
{
	struct btf_field *hit;

	if (IS_ERR_OR_NULL(rec))
		return NULL;
	if (!(rec->field_mask & field_mask))
		return NULL;

	hit = bsearch(&offset, rec->fields, rec->cnt,
		      sizeof(rec->fields[0]), btf_field_cmp);
	if (hit && (hit->type & field_mask))
		return hit;

	return NULL;
}
void btf_record_free(
struct btf_record *rec)
{
int i;
if (IS_ERR_OR_NULL(rec))
return;
for (i = 0; i < rec->cnt; i++) {
switch (rec->fields[i].type) {
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
case BPF_UPTR:
if (rec->fields[i].kptr.module)
module_put(rec->fields[i].kptr.module);
if (btf_is_kernel(rec->fields[i].kptr.btf))
btf_put(rec->fields[i].kptr.btf);
break;
case BPF_LIST_HEAD:
case BPF_LIST_NODE:
case BPF_RB_ROOT:
case BPF_RB_NODE:
case BPF_SPIN_LOCK:
case BPF_RES_SPIN_LOCK:
case BPF_TIMER:
case BPF_REFCOUNT:
case BPF_WORKQUEUE:
/* Nothing to release */
break;
default:
WARN_ON_ONCE(1);
continue;
}
}
kfree(rec);
}
/* Free the btf_record attached to @map and clear map->record so that the
 * stale pointer cannot be used again.
 */
void bpf_map_free_record(struct bpf_map *map)
{
	struct btf_record *rec = map->record;

	btf_record_free(rec);
	map->record = NULL;
}
/* Create a deep copy of @rec.
 *
 * The record is duplicated with kmemdup() and a fresh module/BTF reference
 * is taken for every kptr/uptr field, mirroring what btf_record_free()
 * later drops.
 *
 * Returns the new record, NULL when @rec is NULL or an error pointer, or an
 * ERR_PTR: -ENOMEM if the copy cannot be allocated, -ENXIO if a module
 * reference cannot be taken, -EFAULT (with a one-time warning) on an
 * unknown field type.
 */
struct btf_record *btf_record_dup(const struct btf_record *rec)
{
	const struct btf_field *fields;
	struct btf_record *new_rec;
	int ret, size, i;

	if (IS_ERR_OR_NULL(rec))
		return NULL;
	size = struct_size(rec, fields, rec->cnt);
	new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
	if (!new_rec)
		return ERR_PTR(-ENOMEM);
	/* Do a deep copy of the btf_record */
	fields = rec->fields;
	/* new_rec->cnt counts the fields whose references were fully taken,
	 * so btf_record_free(new_rec) on the error path releases exactly
	 * those and nothing more.
	 */
	new_rec->cnt = 0;
	for (i = 0; i < rec->cnt; i++) {
		switch (fields[i].type) {
		case BPF_KPTR_UNREF:
		case BPF_KPTR_REF:
		case BPF_KPTR_PERCPU:
		case BPF_UPTR:
			/* Take the reference that can fail first. Doing
			 * btf_get() before a failing try_module_get() would
			 * leak the BTF reference, because this field is not
			 * yet counted in new_rec->cnt and is therefore
			 * skipped by btf_record_free() below.
			 */
			if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
				ret = -ENXIO;
				goto free;
			}
			if (btf_is_kernel(fields[i].kptr.btf))
				btf_get(fields[i].kptr.btf);
			break;
		case BPF_LIST_HEAD:
		case BPF_LIST_NODE:
		case BPF_RB_ROOT:
		case BPF_RB_NODE:
		case BPF_SPIN_LOCK:
		case BPF_RES_SPIN_LOCK:
		case BPF_TIMER:
		case BPF_REFCOUNT:
		case BPF_WORKQUEUE:
			/* Nothing to acquire */
			break;
		default:
			ret = -EFAULT;
			WARN_ON_ONCE(1);
			goto free;
		}
		new_rec->cnt++;
	}
	return new_rec;
free:
	btf_record_free(new_rec);
	return ERR_PTR(ret);
}
/* Compare two btf_records for equality.
 *
 * Two absent records (NULL or error pointer) are equal; an absent and a
 * present one are not. Present records are equal when their field counts
 * match and their memory contents are byte-for-byte identical.
 */
bool btf_record_equal(const struct btf_record *rec_a,
		      const struct btf_record *rec_b)
{
	const bool a_present = !IS_ERR_OR_NULL(rec_a);
	const bool b_present = !IS_ERR_OR_NULL(rec_b);
	int size;

	if (a_present != b_present)
		return false;
	if (!a_present)
		return true;
	if (rec_a->cnt != rec_b->cnt)
		return false;

	size = struct_size(rec_a, fields, rec_a->cnt);

	/* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
	 * members are zeroed out and memcmp can be used without worrying
	 * about padding or unused fields.
	 *
	 * spin_lock, timer and kptr have no relation to the map BTF, but
	 * list_head metadata is specific to it, the btf and value_rec members
	 * in particular: btf is the map BTF and value_rec points to a
	 * btf_record in that map BTF.
	 *
	 * So by default we do not rely on the map BTF (which the records were
	 * parsed from) matching for both records, as that would not be
	 * backwards compatible. When a list_head is part of the record,
	 * however, we implicitly do rely on it, because memcmp has to succeed
	 * for those members as well.
	 */
	return memcmp(rec_a, rec_b, size) == 0;
}
/* Cancel and free the timer embedded in @obj at the offset recorded in
 * @rec. Warns once and returns when @rec has no BPF_TIMER field.
 */
void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
{
	void *timer;

	if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
		return;

	timer = obj + rec->timer_off;
	bpf_timer_cancel_and_free(timer);
}
/* Cancel and free the workqueue embedded in @obj at the offset recorded in
 * @rec. Warns once and returns when @rec has no BPF_WORKQUEUE field.
 */
void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
{
	void *wq;

	if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE)))
		return;

	wq = obj + rec->wq_off;
	bpf_wq_cancel_and_free(wq);
}
/* Release whatever the special fields of @obj, as described by @rec, hold.
 *
 * A NULL or error-pointer @rec is ignored. Per field type:
 *  - spin locks, list/rb nodes and refcounts: nothing to do;
 *  - timers and workqueues: cancelled and freed;
 *  - unreferenced kptrs: the slot is zeroed;
 *  - referenced and per-CPU kptrs: the pointer is atomically swapped out
 *    and, if it was set, released through __bpf_obj_drop_impl() for
 *    non-kernel BTF or through the field's dtor for kernel BTF;
 *  - uptrs: the backing page is unpinned;
 *  - list heads and rb roots: freed using the record's spin lock, skipped
 *    with a one-time warning if the record has no spin lock offset.
 * An unknown field type triggers a one-time warning and is skipped.
 */
void bpf_obj_free_fields(
const struct btf_record *rec,
void *obj)
{
const struct btf_field *fields;
int i;
if (IS_ERR_OR_NULL(rec))
return;
fields = rec->fields;
for (i = 0; i < rec->cnt; i++) {
struct btf_struct_meta *pointee_struct_meta;
const struct btf_field *field = &fields[i];
void *field_ptr = obj + field->offset;
void *xchgd_field;
switch (fields[i].type) {
case BPF_SPIN_LOCK:
case BPF_RES_SPIN_LOCK:
break;
case BPF_TIMER:
bpf_timer_cancel_and_free(field_ptr);
break;
case BPF_WORKQUEUE:
bpf_wq_cancel_and_free(field_ptr);
break;
case BPF_KPTR_UNREF:
WRITE_ONCE(*(u64 *)field_ptr, 0);
break;
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
/* Take the pointer out of the slot atomically, leaving 0 behind. */
xchgd_field = (
void *)xchg((
unsigned long *)field_ptr, 0);
if (!xchgd_field)
break;
if (!btf_is_kernel(field->kptr.btf)) {
pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
field->kptr.btf_id);
__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
pointee_struct_meta->record : NULL,
fields[i].type == BPF_KPTR_PERCPU);
}
else {
field->kptr.dtor(xchgd_field);
}
break;
case BPF_UPTR:
/* The caller ensured that no one is using the uptr */
unpin_uptr_kaddr(*(
void **)field_ptr);
break;
case BPF_LIST_HEAD:
if (WARN_ON_ONCE(rec->spin_lock_off < 0))
continue;
bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
break;
case BPF_RB_ROOT:
if (WARN_ON_ONCE(rec->spin_lock_off < 0))
continue;
bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
break;
case BPF_LIST_NODE:
case BPF_RB_NODE:
case BPF_REFCOUNT:
break;
default:
WARN_ON_ONCE(1);
continue;
}
}
}
/* Tear down @map: run the type-specific ->map_free() callback with
 * migration disabled, then free the btf_record and drop the BTF reference
 * that were attached to the map.
 */
static void bpf_map_free(struct bpf_map *map)
{
	/* Grab both pointers up front; they are used after ->map_free(). */
	struct btf_record *record = map->record;
	struct btf *map_btf = map->btf;

	/* Implementation dependent freeing. Migration is disabled to
	 * simplify freeing of values or special fields allocated from the
	 * bpf memory allocator.
	 */
	migrate_disable();
	map->ops->map_free(map);
	migrate_enable();

	/* The btf_record is freed only now because the map_free callback
	 * usually needs access to it. Doing it here is better than requiring
	 * every callback to free it manually.
	 *
	 * Note that the btf_record stashed in map->inner_map_meta->record was
	 * already freed by the map_free callback in the map-in-map case,
	 * which eventually calls bpf_map_free_meta, since inner_map_meta is
	 * only a template bpf_map struct used during verification.
	 */
	btf_record_free(record);

	/* The btf is released last because the map_free callback may need
	 * struct_meta info, which is freed by btf_put().
	 */
	btf_put(map_btf);
}
/* Work item handler (called from a workqueue) that performs the final
 * teardown of the map embedding @work: security hook, memcg release, owner
 * release and finally bpf_map_free().
 */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map;

	map = container_of(work, struct bpf_map, work);

	security_bpf_map_free(map);
	bpf_map_release_memcg(map);
	bpf_map_owner_free(map);
	bpf_map_free(map);
}
/* Drop one usercnt reference on @map. When the count reaches zero, the
 * map's ->map_release_uref() callback is invoked if the map provides one.
 */
static void bpf_map_put_uref(struct bpf_map *map)
{
	if (!atomic64_dec_and_test(&map->usercnt))
		return;

	if (map->ops->map_release_uref)
		map->ops->map_release_uref(map);
}
/* Queue bpf_map_free_deferred() for @map on system_unbound_wq. */
static void bpf_map_free_in_work(struct bpf_map *map)
{
	struct work_struct *work = &map->work;

	INIT_WORK(work, bpf_map_free_deferred);
	/* Avoid spawning kworkers, since they all might contend
	 * for the same mutex like slab_mutex.
	 */
	queue_work(system_unbound_wq, work);
}
/* RCU callback: hand the map embedding @rcu over to the freeing work item. */
static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
{
	struct bpf_map *map = container_of(rcu, struct bpf_map, rcu);

	bpf_map_free_in_work(map);
}
/* RCU-tasks-trace callback for maps that must wait for both grace period
 * flavours. If the tasks-trace grace period already implies a regular RCU
 * grace period, proceed to freeing straight away; otherwise wait for a
 * regular grace period with call_rcu() first.
 */
static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
{
	if (!rcu_trace_implies_rcu_gp()) {
		call_rcu(rcu, bpf_map_free_rcu_gp);
		return;
	}

	bpf_map_free_rcu_gp(rcu);
}
/* Decrement the map refcnt. When it drops to zero, release the map id and
 * schedule the map for freeing via workqueue (the underlying map
 * implementation's ops->map_free() might sleep).
 *
 * Depending on the map's flags the freeing is deferred past a tasks-trace
 * RCU grace period (free_after_mult_rcu_gp), past a regular RCU grace
 * period (free_after_rcu_gp), or queued immediately.
 */
void bpf_map_put(struct bpf_map *map)
{
	if (!atomic64_dec_and_test(&map->refcnt))
		return;

	/* bpf_map_free_id() must be called first */
	bpf_map_free_id(map);

	WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));

	if (READ_ONCE(map->free_after_mult_rcu_gp)) {
		call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
		return;
	}

	if (READ_ONCE(map->free_after_rcu_gp)) {
		call_rcu(&map->rcu, bpf_map_free_rcu_gp);
		return;
	}

	bpf_map_free_in_work(map);
}
EXPORT_SYMBOL_GPL(bpf_map_put);
/* Drop one usercnt reference and then one refcnt reference on @map, in
 * that order.
 */
void bpf_map_put_with_uref(struct bpf_map *map)
{
	bpf_map_put_uref(map);
	bpf_map_put(map);
}
static int bpf_map_release(
struct inode *inode,
struct file *filp)
{
struct bpf_map *map = filp->private_data;
if (map->ops->map_release)
map->ops->map_release(map, filp);
bpf_map_put_with_uref(map);
return 0;
}
/* Return the f_mode of the file behind @f, with FMODE_CAN_WRITE removed
 * when @map is frozen.
 */
static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
{
	fmode_t mode = fd_file(f)->f_mode;

	/* Our file permissions may have been overridden by global
	 * map permissions facing syscall side.
	 */
	return READ_ONCE(map->frozen) ? mode & ~FMODE_CAN_WRITE : mode;
}
#ifdef CONFIG_PROC_FS
/* Show the memory usage of a bpf map */
static u64 bpf_map_memory_usage(
const struct bpf_map *map)
{
return map->ops->map_mem_usage(map);
}
/* ->show_fdinfo() handler of a map file.
 *
 * Prints the map's type, key/value sizes, max entries, flags, map_extra,
 * memory usage ("memlock"), id and frozen state to @m. The owner program
 * type and jited flag are read under map->owner_lock and are printed only
 * when an owner with a non-zero type was found.
 */
static void bpf_map_show_fdinfo(
struct seq_file *m,
struct file *filp)
{
struct bpf_map *map = filp->private_data;
u32 type = 0, jited = 0;
/* Copy the owner details out under the lock; print after unlocking. */
spin_lock(&map->owner_lock);
if (map->owner) {
type = map->owner->type;
jited = map->owner->jited;
}
spin_unlock(&map->owner_lock);
seq_printf(m,
"map_type:\t%u\n"
"key_size:\t%u\n"
"value_size:\t%u\n"
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
"map_extra:\t%#llx\n"
"memlock:\t%llu\n"
"map_id:\t%u\n"
"frozen:\t%u\n",
map->map_type,
map->key_size,
map->value_size,
map->max_entries,
map->map_flags,
(
unsigned long long)map->map_extra,
bpf_map_memory_usage(map),
map->id,
READ_ONCE(map->frozen));
if (type) {
seq_printf(m,
"owner_prog_type:\t%u\n", type);
seq_printf(m,
"owner_jited:\t%u\n", jited);
}
}
#endif
/* Placeholder ->read() handler. It never transfers data and always fails
 * with -EINVAL.
 */
static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
			      loff_t *ppos)
{
	/* This handler only needs to exist, so that alloc_file() enables
	 * f_mode with FMODE_CAN_READ.
	 */
	return -EINVAL;
}
/* Placeholder ->write() handler. It never transfers data and always fails
 * with -EINVAL.
 */
static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
			       size_t siz, loff_t *ppos)
{
	/* This handler only needs to exist, so that alloc_file() enables
	 * f_mode with FMODE_CAN_WRITE.
	 */
	return -EINVAL;
}
/* vm_ops ->open(): called for any extra memory-mapped region (i.e. not
 * the initial one). A region that may be written counts as one more active
 * writer of the map.
 */
static void bpf_map_mmap_open(struct vm_area_struct *vma)
{
	struct bpf_map *map = vma->vm_file->private_data;

	if (!(vma->vm_flags & VM_MAYWRITE))
		return;

	bpf_map_write_active_inc(map);
}
/* vm_ops ->close(): called for every unmapped memory region (including
 * the initial one). A region that may be written gives up its active
 * writer count on the map.
 */
static void bpf_map_mmap_close(struct vm_area_struct *vma)
{
	struct bpf_map *map = vma->vm_file->private_data;

	if (!(vma->vm_flags & VM_MAYWRITE))
		return;

	bpf_map_write_active_dec(map);
}
/* Default VMA callbacks installed by bpf_map_mmap(); they keep the map's
 * writecnt in step with the writable mappings that exist.
 */
static const struct vm_operations_struct bpf_map_default_vmops = {
.open = bpf_map_mmap_open,
.close = bpf_map_mmap_close,
};
/* ->mmap() handler of a map file.
 *
 * Returns -ENOTSUPP when the map type has no ->map_mmap() or the map has a
 * btf_record, -EINVAL for non-shared mappings, -EPERM for a writable
 * mapping of a frozen map, -EACCES for a writable mapping of a
 * BPF_F_RDONLY_PROG map, otherwise the result of ->map_mmap().
 *
 * A writable mapping bumps the map's writecnt while freeze_mutex is held;
 * the bump is undone if ->map_mmap() fails.
 */
static int bpf_map_mmap(
struct file *filp,
struct vm_area_struct *vma)
{
struct bpf_map *map = filp->private_data;
int err = 0;
if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
/* The frozen check and the writecnt bump happen under freeze_mutex. */
mutex_lock(&map->freeze_mutex);
if (vma->vm_flags & VM_WRITE) {
if (map->frozen) {
err = -EPERM;
goto out;
}
/* map is meant to be read-only, so do not allow mapping as
 * writable, because it's possible to leak a writable page
 * reference and allows user-space to still modify it after
 * freezing, while verifier will assume contents do not change
 */
if (map->map_flags & BPF_F_RDONLY_PROG) {
err = -EACCES;
goto out;
}
bpf_map_write_active_inc(map);
}
out:
mutex_unlock(&map->freeze_mutex);
if (err)
return err;
/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
vma->vm_private_data = map;
vm_flags_clear(vma, VM_MAYEXEC);
/* If mapping is read-only, then disallow potentially re-mapping with
 * PROT_WRITE by dropping VM_MAYWRITE flag. This VM_MAYWRITE clearing
 * means that as far as BPF map's memory-mapped VMAs are concerned,
 * VM_WRITE and VM_MAYWRITE are equivalent, if one of them is set,
 * both should be set, so we can forget about VM_MAYWRITE and always
 * check just VM_WRITE
 */
if (!(vma->vm_flags & VM_WRITE))
vm_flags_clear(vma, VM_MAYWRITE);
err = map->ops->map_mmap(map, vma);
if (err) {
/* Undo the writecnt bump taken above for a writable mapping. */
if (vma->vm_flags & VM_WRITE)
bpf_map_write_active_dec(map);
}
return err;
}
/* ->poll() handler of a map file: forward to the map's ->map_poll()
 * callback, or report EPOLLERR when the map type has none.
 */
static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
{
	struct bpf_map *map = filp->private_data;

	if (!map->ops->map_poll)
		return EPOLLERR;

	return map->ops->map_poll(map, filp, pts);
}
/* ->get_unmapped_area() handler of a map file.
 *
 * Uses the map's own ->map_get_unmapped_area() callback when it has one.
 * Otherwise falls back to mm_get_unmapped_area() for the current mm on MMU
 * kernels, or simply returns @addr when CONFIG_MMU is not set.
 */
static unsigned long bpf_get_unmapped_area(struct file *filp,
					   unsigned long addr,
					   unsigned long len,
					   unsigned long pgoff,
					   unsigned long flags)
{
	struct bpf_map *map = filp->private_data;

	if (!map->ops->map_get_unmapped_area) {
#ifdef CONFIG_MMU
		return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
#else
		return addr;
#endif
	}

	return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
}
/* File operations of a map file descriptor. read/write are placeholders
 * that always fail (see bpf_dummy_read()/bpf_dummy_write()); fdinfo output
 * is only available with CONFIG_PROC_FS.
 */
const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
#endif
.release = bpf_map_release,
.read = bpf_dummy_read,
.write = bpf_dummy_write,
.mmap = bpf_map_mmap,
.poll = bpf_map_poll,
.get_unmapped_area = bpf_get_unmapped_area,
};
/* Create a new anonymous-inode file descriptor named "bpf-map" for @map.
 *
 * The security hook is consulted first with the open mode derived from
 * @flags. O_CLOEXEC is always added to @flags.
 *
 * Returns the new fd, the negative value from the security hook if it
 * refused, or the error from anon_inode_getfd().
 */
int bpf_map_new_fd(struct bpf_map *map, int flags)
{
	int err = security_bpf_map(map, OPEN_FMODE(flags));

	if (err < 0)
		return err;

	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
				flags | O_CLOEXEC);
}
int bpf_get_file_flag(
int flags)
{
if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
return -EINVAL;
if (flags & BPF_F_RDONLY)
return O_RDONLY;
if (flags & BPF_F_WRONLY)
return O_WRONLY;
return O_RDWR;
}
/* Helper macro to check that the unused fields of 'union bpf_attr' are
 * zero.
 *
 * It expects a 'union bpf_attr *attr' to be in scope where it is expanded,
 * and a CMD##_LAST_FIELD macro naming the last member of the union that
 * command CMD uses. It evaluates to true when any byte between the end of
 * that member and the end of the union is non-zero.
 *
 * Every line of the definition except the last must end in a backslash,
 * and '##' must directly join CMD and _LAST_FIELD. Otherwise the
 * definition is cut short and the remainder ends up at file scope.
 */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
/* Copy an object name from @src to @dst, validating it on the way.
 *
 * Both @dst and @src must have at least @size bytes. @dst is zero-filled
 * first. Only alphanumeric characters, '_' and '.' are accepted.
 *
 * Returns the length of the name on success, or -EINVAL when an invalid
 * character is met or no '\0' is found within @size bytes. On failure
 * @dst may hold a partial copy.
 */
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
{
	unsigned int len;

	memset(dst, 0, size);

	/* Copy all isalnum(), '_' and '.' chars. */
	for (len = 0; len < size && src[len]; len++) {
		if (!isalnum(src[len]) &&
		    src[len] != '_' && src[len] != '.')
			return -EINVAL;
		dst[len] = src[len];
	}

	/* No '\0' found in "size" number of bytes */
	if (len == size)
		return -EINVAL;

	return len;
}
/* ->map_check_btf() implementation for map types that do not accept BTF
 * type information: every argument is ignored and the call always fails
 * with -ENOTSUPP.
 */
int map_check_no_btf(const struct bpf_map *map,
		     const struct btf *btf,
		     const struct btf_type *key_type,
		     const struct btf_type *value_type)
{
	return -ENOTSUPP;
}
/* Validate the BTF key/value type ids supplied for @map and parse the
 * special fields of the value type into map->record.
 *
 * The key/value sizes reported by BTF must equal map->key_size and
 * map->value_size. When the parsed record is non-empty, @token must be
 * capable of CAP_BPF, the map must not carry BPF_F_RDONLY_PROG or
 * BPF_F_WRONLY_PROG, and every field kind present must be allowed for
 * map->map_type.
 *
 * Returns a negative errno on failure, a non-negative value otherwise.
 * Failures that occur after map->record has been assigned release it via
 * bpf_map_free_record().
 */
static int map_check_btf(
struct bpf_map *map,
struct bpf_token *token,
const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
{
const struct btf_type *key_type, *value_type;
u32 key_size, value_size;
int ret = 0;
/* Some maps allow key to be unspecified. */
if (btf_key_id) {
key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
if (!key_type || key_size != map->key_size)
return -EINVAL;
}
else {
/* No key id given: use BTF type 0 as the key type, which is only
 * accepted when the map type provides its own ->map_check_btf.
 */
key_type = btf_type_by_id(btf, 0);
if (!map->ops->map_check_btf)
return -EINVAL;
}
value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
if (!value_type || value_size != map->value_size)
return -EINVAL;
map->record = btf_parse_fields(btf, value_type,
BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR,
map->value_size);
/* Only a valid, non-NULL record needs the permission and per-field
 * map type checks below.
 */
if (!IS_ERR_OR_NULL(map->record)) {
int i;
if (!bpf_token_capable(token, CAP_BPF)) {
ret = -EPERM;
goto free_map_tab;
}
if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
ret = -EACCES;
goto free_map_tab;
}
/* Walk field_mask one bit at a time; each set bit names a field
 * kind that must be supported by this map type.
 */
for (i = 0; i <
sizeof(map->record->field_mask) * 8; i++) {
switch (map->record->field_mask & (1 << i)) {
case 0:
continue;
case BPF_SPIN_LOCK:
case BPF_RES_SPIN_LOCK:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY &&
map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
ret = -EOPNOTSUPP;
goto free_map_tab;
}
break;
case BPF_TIMER:
case BPF_WORKQUEUE:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY) {
ret = -EOPNOTSUPP;
goto free_map_tab;
}
break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
case BPF_REFCOUNT:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY &&
map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
ret = -EOPNOTSUPP;
goto free_map_tab;
}
break;
case BPF_UPTR:
if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) {
ret = -EOPNOTSUPP;
goto free_map_tab;
}
break;
case BPF_LIST_HEAD:
case BPF_RB_ROOT:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY) {
ret = -EOPNOTSUPP;
goto free_map_tab;
}
break;
default:
/* Fail if map_type checks are missing for a field type */
ret = -EOPNOTSUPP;
goto free_map_tab;
}
}
}
ret = btf_check_and_fixup_fields(btf, map->record);
if (ret < 0)
goto free_map_tab;
/* Finally let the map type run its own key/value BTF validation. */
if (map->ops->map_check_btf) {
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
if (ret < 0)
goto free_map_tab;
}
return ret;
/* Error exit: drop the record parsed above before reporting failure. */
free_map_tab:
bpf_map_free_record(map);
return ret;
}
static bool bpf_net_capable(
void)
{
return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
/* called via syscall
 *
 * Create a new map described by @attr and return a file descriptor for it.
 *
 * Order of operations: validate @attr, resolve the map type's ops, obtain
 * the optional BPF token, run the privilege checks, allocate and
 * initialise the map, attach BTF information when type ids were given,
 * call the security hook, publish a map id and finally install the fd.
 *
 * Returns the new fd (>= 0) on success or a negative errno.
 */
static int map_create(
union bpf_attr *attr,
bool kernel)
{
const struct bpf_map_ops *ops;
struct bpf_token *token = NULL;
int numa_node = bpf_map_attr_numa_node(attr);
u32 map_type = attr->map_type;
struct bpf_map *map;
bool token_flag;
int f_flags;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;
/* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
 * to avoid per-map type checks tripping on unknown flag
 */
token_flag = attr->map_flags & BPF_F_TOKEN_FD;
attr->map_flags &= ~BPF_F_TOKEN_FD;
/* btf_vmlinux_value_type_id is only accepted for struct_ops maps and
 * excludes the regular key/value type ids; a key type id without a
 * value type id is rejected as well.
 */
if (attr->btf_vmlinux_value_type_id) {
if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
attr->btf_key_type_id || attr->btf_value_type_id)
return -EINVAL;
}
else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
return -EINVAL;
}
/* map_extra may be non-zero only for bloom filter and arena maps */
if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
attr->map_type != BPF_MAP_TYPE_ARENA &&
attr->map_extra != 0)
return -EINVAL;
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
return f_flags;
if (numa_node != NUMA_NO_NODE &&
((
unsigned int)numa_node >= nr_node_ids ||
!node_online(numa_node)))
return -EINVAL;
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map_type = attr->map_type;
if (map_type >= ARRAY_SIZE(bpf_map_types))
return -EINVAL;
map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
ops = bpf_map_types[map_type];
if (!ops)
return -EINVAL;
if (ops->map_alloc_check) {
err = ops->map_alloc_check(attr);
if (err)
return err;
}
/* A non-zero map_ifindex switches the map over to the offload ops. */
if (attr->map_ifindex)
ops = &bpf_map_offload_ops;
if (!ops->map_mem_usage)
return -EINVAL;
if (token_flag) {
token = bpf_token_get_from_fd(attr->map_token_fd);
if (IS_ERR(token))
return PTR_ERR(token);
/* if current token doesn't grant map creation permissions,
 * then we can't use this token, so ignore it and rely on
 * system-wide capabilities checks
 */
if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
!bpf_token_allow_map_type(token, attr->map_type)) {
bpf_token_put(token);
token = NULL;
}
}
/* Every "goto put_token" in the permission checks below reports -EPERM. */
err = -EPERM;
/* Intent here is for unprivileged_bpf_disabled to block BPF map
 * creation for unprivileged users; other actions depend
 * on fd availability and access to bpffs, so are dependent on
 * object creation success. Even with unprivileged BPF disabled,
 * capability checks are still carried out.
 */
if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
goto put_token;
/* check privileged map type permissions */
switch (map_type) {
case BPF_MAP_TYPE_ARRAY:
case BPF_MAP_TYPE_PERCPU_ARRAY:
case BPF_MAP_TYPE_PROG_ARRAY:
case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
case BPF_MAP_TYPE_CGROUP_ARRAY:
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH:
case BPF_MAP_TYPE_PERCPU_HASH:
case BPF_MAP_TYPE_HASH_OF_MAPS:
case BPF_MAP_TYPE_RINGBUF:
case BPF_MAP_TYPE_USER_RINGBUF:
case BPF_MAP_TYPE_CGROUP_STORAGE:
case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
/* unprivileged */
break;
case BPF_MAP_TYPE_SK_STORAGE:
case BPF_MAP_TYPE_INODE_STORAGE:
case BPF_MAP_TYPE_TASK_STORAGE:
case BPF_MAP_TYPE_CGRP_STORAGE:
case BPF_MAP_TYPE_BLOOM_FILTER:
case BPF_MAP_TYPE_LPM_TRIE:
case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
case BPF_MAP_TYPE_STACK_TRACE:
case BPF_MAP_TYPE_QUEUE:
case BPF_MAP_TYPE_STACK:
case BPF_MAP_TYPE_LRU_HASH:
case BPF_MAP_TYPE_LRU_PERCPU_HASH:
case BPF_MAP_TYPE_STRUCT_OPS:
case BPF_MAP_TYPE_CPUMAP:
case BPF_MAP_TYPE_ARENA:
/* these map types require CAP_BPF */
if (!bpf_token_capable(token, CAP_BPF))
goto put_token;
break;
case BPF_MAP_TYPE_SOCKMAP:
case BPF_MAP_TYPE_SOCKHASH:
case BPF_MAP_TYPE_DEVMAP:
case BPF_MAP_TYPE_DEVMAP_HASH:
case BPF_MAP_TYPE_XSKMAP:
/* these map types require CAP_NET_ADMIN */
if (!bpf_token_capable(token, CAP_NET_ADMIN))
goto put_token;
break;
default:
WARN(1,
"unsupported map type %d", map_type);
goto put_token;
}
map = ops->map_alloc(attr);
if (IS_ERR(map)) {
err = PTR_ERR(map);
goto put_token;
}
map->ops = ops;
map->map_type = map_type;
err = bpf_obj_name_cpy(map->name, attr->map_name,
sizeof(attr->map_name));
if (err < 0)
goto free_map;
preempt_disable();
map->cookie = gen_cookie_next(&bpf_map_cookie);
preempt_enable();
/* The new map starts with one reference and one user reference. */
atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
spin_lock_init(&map->owner_lock);
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
 * the bpf_prog.o must have BTF to begin with
 * to figure out the corresponding kernel's
 * counter part. Thus, attr->btf_fd has
 * to be valid also.
 */
attr->btf_vmlinux_value_type_id) {
struct btf *btf;
btf = btf_get_by_fd(attr->btf_fd);
if (IS_ERR(btf)) {
err = PTR_ERR(btf);
goto free_map;
}
/* kernel BTF objects are refused here */
if (btf_is_kernel(btf)) {
btf_put(btf);
err = -EACCES;
goto free_map;
}
/* From here on the BTF reference is stored in the map. */
map->btf = btf;
if (attr->btf_value_type_id) {
err = map_check_btf(map, token, btf, attr->btf_key_type_id,
attr->btf_value_type_id);
if (err)
goto free_map;
}
map->btf_key_type_id = attr->btf_key_type_id;
map->btf_value_type_id = attr->btf_value_type_id;
map->btf_vmlinux_value_type_id =
attr->btf_vmlinux_value_type_id;
}
err = security_bpf_map_create(map, attr, token, kernel);
if (err)
goto free_map_sec;
err = bpf_map_alloc_id(map);
if (err)
goto free_map_sec;
bpf_map_save_memcg(map);
/* The token is not used past this point; drop it before creating the fd. */
bpf_token_put(token);
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
/* failed to allocate fd.
 * bpf_map_put_with_uref() is needed because the above
 * bpf_map_alloc_id() has published the map
 * to the userspace and the userspace may
 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
 */
bpf_map_put_with_uref(map);
return err;
}
return err;
/* Error unwinding: each label falls through to the ones below it. */
free_map_sec:
security_bpf_map_free(map);
free_map:
bpf_map_free(map);
put_token:
bpf_token_put(token);
return err;
}
/* Take one additional reference on @map (refcnt only). */
void bpf_map_inc(struct bpf_map *map)
{
	atomic64_inc(&map->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_map_inc);
/* Take one additional reference on @map and also bump its usercnt. */
void bpf_map_inc_with_uref(struct bpf_map *map)
{
	bpf_map_inc(map);
	atomic64_inc(&map->usercnt);
}
EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
/* Resolve @ufd to its map via __bpf_map_get() and take a reference on it.
 *
 * Returns the map, or the ERR_PTR() produced by __bpf_map_get().
 */
struct bpf_map *bpf_map_get(u32 ufd)
{
	CLASS(fd, f)(ufd);
	struct bpf_map *map = __bpf_map_get(f);

	if (IS_ERR(map))
		return map;

	bpf_map_inc(map);
	return map;
}
EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL");
/* Like bpf_map_get(), but the reference taken also counts as a user
 * reference (usercnt is incremented too).
 *
 * Returns the map, or the ERR_PTR() produced by __bpf_map_get().
 */
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
	CLASS(fd, f)(ufd);
	struct bpf_map *map = __bpf_map_get(f);

	if (IS_ERR(map))
		return map;

	bpf_map_inc_with_uref(map);
	return map;
}
/* map_idr_lock should have been held or the map should have been
 * protected by rcu read lock.
 *
 * Take a reference on @map unless its refcnt is already zero. When @uref
 * is true, usercnt is incremented as well.
 *
 * Returns @map on success, ERR_PTR(-ENOENT) when refcnt was zero.
 */
struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
	s64 refold;

	/* refcnt is a 64-bit counter: keep the fetched value at full width
	 * so that a non-zero count whose low 32 bits happen to be clear is
	 * not mistaken for zero.
	 */
	refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
	if (!refold)
		return ERR_PTR(-ENOENT);

	if (uref)
		atomic64_inc(&map->usercnt);

	return map;
}
/* Take a plain reference (usercnt untouched) on @map unless its refcnt is
 * already zero. Asserts through lockdep that an RCU read lock is held.
 */
struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
{
	lockdep_assert(rcu_read_lock_held());

	return __bpf_map_inc_not_zero(map, false);
}
EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
/* Weak default for bpf_stackmap_copy(): ignores its arguments and reports
 * -ENOTSUPP.
 */
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
	return -ENOTSUPP;
}
/* Duplicate a key of @key_size bytes from the user pointer @ukey.
 *
 * With a zero @key_size no copy is made: a NULL @ukey yields NULL, any
 * other pointer yields ERR_PTR(-EINVAL). Otherwise the result of
 * vmemdup_user() is returned as is.
 */
static void *__bpf_copy_key(void __user *ukey, u64 key_size)
{
	if (!key_size)
		return ukey ? ERR_PTR(-EINVAL) : NULL;

	return vmemdup_user(ukey, key_size);
}
/* bpfptr_t flavour of __bpf_copy_key(): duplicate @key_size bytes from
 * @ukey with kvmemdup_bpfptr().
 *
 * With a zero @key_size no copy is made: a null @ukey yields NULL, a
 * non-null one yields ERR_PTR(-EINVAL).
 */
static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
{
	if (!key_size)
		return bpfptr_is_null(ukey) ? NULL : ERR_PTR(-EINVAL);

	return kvmemdup_bpfptr(ukey, key_size);
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags

/* Look up the element stored under attr->key in the map referred to by
 * attr->map_fd and copy its value to the user buffer at attr->value.
 *
 * The only flag accepted is BPF_F_LOCK, and only when the map's record
 * contains a BPF_SPIN_LOCK field. For bloom filter maps the user buffer
 * is read as input instead and nothing is copied back.
 *
 * Returns 0 on success or a negative errno.
 */
static int map_lookup_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_user_ptr(attr->key);
	void __user *uvalue = u64_to_user_ptr(attr->value);
	struct bpf_map *map;
	void *key, *value;
	u32 value_size;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
		return -EINVAL;

	if (attr->flags & ~BPF_F_LOCK)
		return -EINVAL;

	CLASS(fd, f)(attr->map_fd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
		return -EPERM;

	if ((attr->flags & BPF_F_LOCK) &&
	    !btf_record_has_field(map->record, BPF_SPIN_LOCK))
		return -EINVAL;

	key = __bpf_copy_key(ukey, map->key_size);
	if (IS_ERR(key))
		return PTR_ERR(key);

	value_size = bpf_map_value_size(map);
	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
	if (!value) {
		err = -ENOMEM;
		goto free_key;
	}

	if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
		/* The value is the lookup input here, so fetch it from
		 * user space first; there is nothing to copy back.
		 */
		if (copy_from_user(value, uvalue, value_size))
			err = -EFAULT;
		else
			err = bpf_map_copy_value(map, key, value, attr->flags);
		goto free_value;
	}

	err = bpf_map_copy_value(map, key, value, attr->flags);
	if (!err && copy_to_user(uvalue, value, value_size) != 0)
		err = -EFAULT;

free_value:
	kvfree(value);
free_key:
	kvfree(key);
	return err;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags

/* Create or update the element attr->key / attr->value in the map
 * referred to by attr->map_fd.
 *
 * The map is marked write-active for the duration of the call. The fd
 * must grant write permission, and BPF_F_LOCK is only accepted when the
 * map's record contains a BPF_SPIN_LOCK field.
 *
 * Returns 0 on success or a negative errno.
 */
static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
{
	bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
	bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
	struct bpf_map *map;
	void *key, *value;
	u32 value_size;
	int err;

	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
		return -EINVAL;

	CLASS(fd, f)(attr->map_fd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	bpf_map_write_active_inc(map);

	err = -EPERM;
	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE))
		goto err_put;

	err = -EINVAL;
	if ((attr->flags & BPF_F_LOCK) &&
	    !btf_record_has_field(map->record, BPF_SPIN_LOCK))
		goto err_put;

	key = ___bpf_copy_key(ukey, map->key_size);
	if (IS_ERR(key)) {
		err = PTR_ERR(key);
		goto err_put;
	}

	value_size = bpf_map_value_size(map);
	value = kvmemdup_bpfptr(uvalue, value_size);
	if (IS_ERR(value)) {
		err = PTR_ERR(value);
		goto free_key;
	}

	err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags);
	if (err == 0)
		maybe_wait_bpf_programs(map);

	kvfree(value);
free_key:
	kvfree(key);
err_put:
	bpf_map_write_active_dec(map);
	return err;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key

/* Delete the element stored under attr->key from the map referred to by
 * attr->map_fd.
 *
 * The map is marked write-active for the duration of the call and the fd
 * must grant write permission. Offloaded maps go through
 * bpf_map_offload_delete_elem(); prog arrays and struct_ops maps call
 * ->map_delete_elem directly; every other map type is deleted from under
 * rcu_read_lock() with instrumentation disabled.
 *
 * Returns 0 on success or a negative errno.
 */
static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
	bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
	struct bpf_map *map;
	void *key;
	int err;

	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
		return -EINVAL;

	CLASS(fd, f)(attr->map_fd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	bpf_map_write_active_inc(map);

	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
		err = -EPERM;
		goto err_put;
	}

	key = ___bpf_copy_key(ukey, map->key_size);
	if (IS_ERR(key)) {
		err = PTR_ERR(key);
		goto err_put;
	}

	if (bpf_map_is_offloaded(map)) {
		err = bpf_map_offload_delete_elem(map, key);
	} else if (IS_FD_PROG_ARRAY(map) ||
		   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
		/* These maps require sleepable context */
		err = map->ops->map_delete_elem(map, key);
	} else {
		bpf_disable_instrumentation();
		rcu_read_lock();
		err = map->ops->map_delete_elem(map, key);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		if (!err)
			maybe_wait_bpf_programs(map);
	}

	kvfree(key);
err_put:
	bpf_map_write_active_dec(map);
	return err;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key

/* Fetch the key that follows attr->key in the map referred to by
 * attr->map_fd and copy it to the user buffer at attr->next_key.
 *
 * A NULL attr->key is passed on to the map implementation as a NULL key.
 * The fd must grant read permission.
 *
 * Returns 0 on success or a negative errno.
 */
static int map_get_next_key(union bpf_attr *attr)
{
	void __user *ukey = u64_to_user_ptr(attr->key);
	void __user *unext_key = u64_to_user_ptr(attr->next_key);
	struct bpf_map *map;
	void *key, *next_key;
	int err;

	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
		return -EINVAL;

	CLASS(fd, f)(attr->map_fd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
		return -EPERM;

	key = ukey ? __bpf_copy_key(ukey, map->key_size) : NULL;
	if (IS_ERR(key))
		return PTR_ERR(key);

	next_key = kvmalloc(map->key_size, GFP_USER);
	if (!next_key) {
		err = -ENOMEM;
		goto free_key;
	}

	if (bpf_map_is_offloaded(map)) {
		err = bpf_map_offload_get_next_key(map, key, next_key);
	} else {
		rcu_read_lock();
		err = map->ops->map_get_next_key(map, key, next_key);
		rcu_read_unlock();
	}
	if (err)
		goto free_next_key;

	/* err is 0 here; only a failed copy-out changes it. */
	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
		err = -EFAULT;

free_next_key:
	kvfree(next_key);
free_key:
	kvfree(key);
	return err;
}
/* Generic batched delete: remove up to attr->batch.count elements whose
 * keys are laid out back to back in the user buffer attr->batch.keys.
 *
 * attr->batch.elem_flags may only contain BPF_F_LOCK, and only when the
 * map's record has a BPF_SPIN_LOCK field. uattr->batch.count is zeroed up
 * front and, once the loop ends, overwritten with the number of keys the
 * loop stepped past. For offloaded maps the loop stops after handing the
 * first key to bpf_map_offload_delete_elem().
 *
 * Returns 0 on success or a negative errno.
 */
int generic_map_delete_batch(struct bpf_map *map,
			     const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	void __user *keys = u64_to_user_ptr(attr->batch.keys);
	u32 cp, max_count;
	int err = 0;
	void *key;

	if (attr->batch.elem_flags & ~BPF_F_LOCK)
		return -EINVAL;

	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
		return -EINVAL;
	}

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	if (put_user(0, &uattr->batch.count))
		return -EFAULT;

	key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
	if (!key)
		return -ENOMEM;

	for (cp = 0; cp < max_count; cp++) {
		err = -EFAULT;
		/* Widen the index before multiplying: with a 32-bit cp the
		 * byte offset would otherwise be computed in 32 bits and
		 * wrap for large batches.
		 */
		if (copy_from_user(key, keys + (size_t)cp * map->key_size,
				   map->key_size))
			break;

		if (bpf_map_is_offloaded(map)) {
			err = bpf_map_offload_delete_elem(map, key);
			break;
		}

		bpf_disable_instrumentation();
		rcu_read_lock();
		err = map->ops->map_delete_elem(map, key);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		if (err)
			break;
		cond_resched();
	}

	/* Report progress even on failure; a failed report wins over err. */
	if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
		err = -EFAULT;

	kvfree(key);

	return err;
}
/* Generic batched update: write up to attr->batch.count key/value pairs
 * taken from the user arrays attr->batch.keys and attr->batch.values.
 *
 * attr->batch.elem_flags may only contain BPF_F_LOCK, and only when the
 * map's record has a BPF_SPIN_LOCK field. uattr->batch.count is zeroed up
 * front and, once the loop ends, overwritten with the number of elements
 * that were updated.
 *
 * Returns 0 on success or a negative errno.
 */
int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
			     const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	void __user *values = u64_to_user_ptr(attr->batch.values);
	void __user *keys = u64_to_user_ptr(attr->batch.keys);
	u32 value_size, cp, max_count;
	void *key, *value;
	int err = 0;

	if (attr->batch.elem_flags & ~BPF_F_LOCK)
		return -EINVAL;

	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
	    !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
		return -EINVAL;
	}

	value_size = bpf_map_value_size(map);

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	if (put_user(0, &uattr->batch.count))
		return -EFAULT;

	key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
	if (!key)
		return -ENOMEM;

	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
	if (!value) {
		kvfree(key);
		return -ENOMEM;
	}

	for (cp = 0; cp < max_count; cp++) {
		/* Widen the index before multiplying: cp and value_size are
		 * both u32, so the byte offsets would otherwise be computed
		 * in 32 bits and wrap for large batches.
		 */
		const size_t idx = cp;

		err = -EFAULT;
		if (copy_from_user(key, keys + idx * map->key_size,
				   map->key_size) ||
		    copy_from_user(value, values + idx * value_size,
				   value_size))
			break;

		err = bpf_map_update_value(map, map_file, key, value,
					   attr->batch.elem_flags);
		if (err)
			break;
		cond_resched();
	}

	/* Report progress even on failure; a failed report wins over err. */
	if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
		err = -EFAULT;

	kvfree(value);
	kvfree(key);

	return err;
}
/* Generic batched lookup: walk the map with ->map_get_next_key, starting
 * after the key in attr->batch.in_batch (or from the beginning when that
 * pointer is NULL), and copy up to attr->batch.count key/value pairs to
 * the user arrays attr->batch.keys and attr->batch.values.
 *
 * attr->batch.elem_flags may only contain BPF_F_LOCK, and only when the
 * map's record has a BPF_SPIN_LOCK field. Elements for which
 * bpf_map_copy_value() reports -ENOENT are skipped. Unless the walk
 * failed with -EFAULT or a value copy failed, the number of copied pairs
 * is stored in uattr->batch.count and, when it is non-zero, the last key
 * visited is stored in attr->batch.out_batch.
 *
 * Returns 0 or a negative errno; the error that ended the key walk is
 * passed through to the caller.
 */
int generic_map_lookup_batch(struct bpf_map *map,
			     const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
	void __user *values = u64_to_user_ptr(attr->batch.values);
	void __user *keys = u64_to_user_ptr(attr->batch.keys);
	void *buf, *buf_prevkey, *prev_key, *key, *value;
	u32 value_size, cp, max_count;
	int err;

	if (attr->batch.elem_flags & ~BPF_F_LOCK)
		return -EINVAL;

	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
	    !btf_record_has_field(map->record, BPF_SPIN_LOCK))
		return -EINVAL;

	value_size = bpf_map_value_size(map);

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	if (put_user(0, &uattr->batch.count))
		return -EFAULT;

	buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
	if (!buf_prevkey)
		return -ENOMEM;

	/* One allocation holds the current key followed by its value. */
	buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
	if (!buf) {
		kvfree(buf_prevkey);
		return -ENOMEM;
	}

	err = -EFAULT;
	prev_key = NULL;
	if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
		goto free_buf;
	key = buf;
	value = key + map->key_size;
	if (ubatch)
		prev_key = buf_prevkey;

	for (cp = 0; cp < max_count;) {
		/* Widen the index before multiplying: cp and value_size are
		 * both u32, so the byte offsets would otherwise be computed
		 * in 32 bits and wrap for large batches.
		 */
		const size_t idx = cp;

		rcu_read_lock();
		err = map->ops->map_get_next_key(map, prev_key, key);
		rcu_read_unlock();
		if (err)
			break;
		err = bpf_map_copy_value(map, key, value,
					 attr->batch.elem_flags);

		if (err == -ENOENT)
			goto next_key;

		if (err)
			goto free_buf;

		if (copy_to_user(keys + idx * map->key_size, key,
				 map->key_size)) {
			err = -EFAULT;
			goto free_buf;
		}
		if (copy_to_user(values + idx * value_size, value,
				 value_size)) {
			err = -EFAULT;
			goto free_buf;
		}

		cp++;
next_key:
		/* The key just visited becomes prev_key and the other key
		 * buffer is reused for the next lookup.
		 */
		if (!prev_key)
			prev_key = buf_prevkey;

		swap(prev_key, key);
		cond_resched();
	}

	if (err == -EFAULT)
		goto free_buf;

	if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
	     (cp && copy_to_user(uobatch, prev_key, map->key_size))))
		err = -EFAULT;

free_buf:
	kvfree(buf_prevkey);
	kvfree(buf);
	return err;
}
#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
static int map_lookup_and_delete_elem(
union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *uvalue = u64_to_user_ptr(attr->value);
struct bpf_map *map;
void *key, *value;
u32 value_size;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
return -EINVAL;
if (attr->flags & ~BPF_F_LOCK)
return -EINVAL;
CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
if (attr->flags &&
(map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK)) {
err = -EINVAL;
goto err_put;
}
if ((attr->flags & BPF_F_LOCK) &&
!btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
value_size = bpf_map_value_size(map);
err = -ENOMEM;
value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
err = -ENOTSUPP;
if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK) {
err = map->ops->map_pop_elem(map, value);
}
else if (map->map_type == BPF_MAP_TYPE_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
if (!bpf_map_is_offloaded(map)) {
bpf_disable_instrumentation();
rcu_read_lock();
err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
rcu_read_unlock();
bpf_enable_instrumentation();
}
}
if (err)
goto free_value;
if (copy_to_user(uvalue, value, value_size) != 0) {
err = -EFAULT;
goto free_value;
}
err = 0;
free_value:
kvfree(value);
free_key:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
return err;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_FREEZE_LAST_FIELD map_fd

/* Mark the map referred to by attr->map_fd as frozen.
 *
 * struct_ops maps and maps with a valid, non-NULL record are refused with
 * -ENOTSUPP; the fd must grant write permission. Under freeze_mutex the
 * request fails with -EBUSY while the map is write-active or if it is
 * already frozen.
 *
 * Returns 0 on success or a negative errno.
 */
static int map_freeze(const union bpf_attr *attr)
{
	struct bpf_map *map;
	int err;

	if (CHECK_ATTR(BPF_MAP_FREEZE))
		return -EINVAL;

	CLASS(fd, f)(attr->map_fd);
	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record))
		return -ENOTSUPP;

	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE))
		return -EPERM;

	mutex_lock(&map->freeze_mutex);
	if (bpf_map_write_active(map) || READ_ONCE(map->frozen)) {
		err = -EBUSY;
	} else {
		WRITE_ONCE(map->frozen, true);
		err = 0;
	}
	mutex_unlock(&map->freeze_mutex);

	return err;
}
/* Table of program ops indexed by program type id.
 *
 * It is filled in by including <linux/bpf_types.h> with BPF_PROG_TYPE()
 * defined to emit "[_id] = &_name##_prog_ops,"; BPF_MAP_TYPE() and
 * BPF_LINK_TYPE() are defined empty so those entries contribute nothing.
 * The helper macros are undefined again right after the include.
 */
static const struct bpf_prog_ops *
const bpf_prog_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
[_id] = & _name
## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};
/* Resolve @type in bpf_prog_types[] and record the result in @prog.
 *
 * prog->aux->ops is set to the table entry, or to bpf_offload_prog_ops
 * when the program is offloaded; prog->type is set to the (nospec
 * clamped) @type.
 *
 * Returns 0 on success, -EINVAL when @type is out of range or has no ops.
 */
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	const struct bpf_prog_ops *ops;

	if (type >= ARRAY_SIZE(bpf_prog_types))
		return -EINVAL;

	type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
	ops = bpf_prog_types[type];
	if (!ops)
		return -EINVAL;

	prog->aux->ops = bpf_prog_is_offloaded(prog->aux) ?
			 &bpf_offload_prog_ops : ops;
	prog->type = type;

	return 0;
}
/* Operations that bpf_audit_prog() can report. BPF_AUDIT_MAX is not an
 * operation: it is the bound checked by bpf_audit_prog() and the size of
 * bpf_audit_str[].
 */
enum bpf_audit {
BPF_AUDIT_LOAD,
BPF_AUDIT_UNLOAD,
BPF_AUDIT_MAX,
};
/* Text emitted as "op=" in the audit record, indexed by enum bpf_audit. */
static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
	[BPF_AUDIT_LOAD]   = "LOAD",
	[BPF_AUDIT_UNLOAD] = "UNLOAD",
};
/* Emit an AUDIT_BPF record "prog-id=<id> op=<LOAD|UNLOAD>" for @prog.
 *
 * Nothing is logged when @op is out of range (which also triggers a
 * one-time warning), when auditing is off, or when no audit buffer can be
 * obtained. The current audit context is only used when not in hard-irq
 * context and with interrupts enabled.
 */
static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
{
	struct audit_context *ctx;
	struct audit_buffer *ab;

	if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
		return;
	if (audit_enabled == AUDIT_OFF)
		return;

	ctx = (in_irq() || irqs_disabled()) ? NULL : audit_context();

	ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
	if (unlikely(!ab))
		return;

	audit_log_format(ab, "prog-id=%u op=%s",
			 prog->aux->id, bpf_audit_str[op]);
	audit_log_end(ab);
}
/* Allocate an id for @prog in prog_idr and store it in prog->aux->id.
 *
 * Returns 0 on success or a negative errno (-ENOSPC, with a one-time
 * warning, should the allocator ever hand back 0).
 */
static int bpf_prog_alloc_id(struct bpf_prog *prog)
{
	int id;

	idr_preload(GFP_KERNEL);
	spin_lock_bh(&prog_idr_lock);
	id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
	if (id > 0)
		prog->aux->id = id;
	spin_unlock_bh(&prog_idr_lock);
	idr_preload_end();

	/* id is in [1, INT_MAX) */
	if (WARN_ON_ONCE(!id))
		return -ENOSPC;

	if (id < 0)
		return id;

	return 0;
}
/* Remove @prog from prog_idr and reset prog->aux->id to 0. Programs
 * without an id are left alone.
 */
void bpf_prog_free_id(struct bpf_prog *prog)
{
	unsigned long irq_flags;

	/* cBPF to eBPF migrations are currently not in the idr store.
	 * Offloaded programs are removed from the store when their device
	 * disappears - even if someone grabs an fd to them they are unusable,
	 * simply waiting for refcnt to drop to be freed.
	 */
	if (prog->aux->id == 0)
		return;

	spin_lock_irqsave(&prog_idr_lock, irq_flags);
	idr_remove(&prog_idr, prog->aux->id);
	prog->aux->id = 0;
	spin_unlock_irqrestore(&prog_idr_lock, irq_flags);
}
/* RCU callback performing the last stage of program teardown: releases
 * func_info, func_info_aux and the user reference, runs the security
 * free hook and finally frees the program itself.
 */
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
	struct bpf_prog_aux *prog_aux;

	prog_aux = container_of(rcu, struct bpf_prog_aux, rcu);

	kvfree(prog_aux->func_info);
	kfree(prog_aux->func_info_aux);
	free_uid(prog_aux->user);
	security_bpf_prog_free(prog_aux->prog);
	bpf_prog_free(prog_aux->prog);
}
/* Release the resources attached to @prog and hand it to
 * __bpf_prog_put_rcu().
 *
 * With @deferred set, the final step is queued through
 * call_rcu_tasks_trace() for sleepable programs and call_rcu() otherwise;
 * without it, __bpf_prog_put_rcu() is invoked directly.
 */
static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
	struct bpf_prog_aux *aux = prog->aux;

	bpf_prog_kallsyms_del_all(prog);
	btf_put(aux->btf);
	module_put(aux->mod);
	kvfree(aux->jited_linfo);
	kvfree(aux->linfo);
	kfree(aux->kfunc_tab);
	kfree(aux->ctx_arg_info);
	if (aux->attach_btf)
		btf_put(aux->attach_btf);

	if (!deferred) {
		__bpf_prog_put_rcu(&aux->rcu);
		return;
	}

	if (prog->sleepable)
		call_rcu_tasks_trace(&aux->rcu, __bpf_prog_put_rcu);
	else
		call_rcu(&aux->rcu, __bpf_prog_put_rcu);
}
/* Tear down a program whose last reference is gone: emit the perf unload
 * event and audit record, drop its id and start the deferred release.
 * Also usable as a work_struct callback (see __bpf_prog_put()).
 */
static void bpf_prog_put_deferred(struct work_struct *work)
{
	struct bpf_prog_aux *aux = container_of(work, struct bpf_prog_aux, work);
	struct bpf_prog *prog = aux->prog;

	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
	bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
	bpf_prog_free_id(prog);
	__bpf_prog_put_noref(prog, true);
}
/* Drop one reference on @prog. When it was the last one, the teardown
 * runs right away, unless we are in hard-irq context or interrupts are
 * disabled, in which case it is pushed to a workqueue.
 */
static void __bpf_prog_put(struct bpf_prog *prog)
{
	struct bpf_prog_aux *aux = prog->aux;

	if (!atomic64_dec_and_test(&aux->refcnt))
		return;

	if (in_irq() || irqs_disabled()) {
		INIT_WORK(&aux->work, bpf_prog_put_deferred);
		schedule_work(&aux->work);
		return;
	}

	bpf_prog_put_deferred(&aux->work);
}
/* Exported wrapper around __bpf_prog_put(): drop one reference on @prog. */
void bpf_prog_put(struct bpf_prog *prog)
{
	__bpf_prog_put(prog);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
/* ->release handler of bpf_prog_fops: drops the reference on the program
 * stored in filp->private_data. Always returns 0.
 */
static int bpf_prog_release(struct inode *inode, struct file *filp)
{
	bpf_prog_put(filp->private_data);

	return 0;
}
/* Program statistics summed over all possible CPUs by
 * bpf_prog_get_stats(). bpf_prog_show_fdinfo() prints nsecs as
 * "run_time_ns", cnt as "run_cnt" and misses as "recursion_misses".
 */
struct bpf_prog_kstats {
u64 nsecs;
u64 cnt;
u64 misses;
};
/* Increment the 'misses' counter in this CPU's stats of @prog, inside a
 * u64_stats irqsave update section.
 */
void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
{
	struct bpf_prog_stats *cpu_stats = this_cpu_ptr(prog->stats);
	unsigned int flags;

	flags = u64_stats_update_begin_irqsave(&cpu_stats->syncp);
	u64_stats_inc(&cpu_stats->misses);
	u64_stats_update_end_irqrestore(&cpu_stats->syncp, flags);
}
static void bpf_prog_get_stats(
const struct bpf_prog *prog,
struct bpf_prog_kstats *stats)
{
u64 nsecs = 0, cnt = 0, misses = 0;
int cpu;
for_each_possible_cpu(cpu) {
const struct bpf_prog_stats *st;
unsigned int start;
u64 tnsecs, tcnt, tmisses;
st = per_cpu_ptr(prog->stats, cpu);
do {
start = u64_stats_fetch_begin(&st->syncp);
tnsecs = u64_stats_read(&st->nsecs);
tcnt = u64_stats_read(&st->cnt);
tmisses = u64_stats_read(&st->misses);
}
while (u64_stats_fetch_retry(&st->syncp, start));
nsecs += tnsecs;
cnt += tcnt;
misses += tmisses;
}
stats->nsecs = nsecs;
stats->cnt = cnt;
stats->misses = misses;
}
#ifdef CONFIG_PROC_FS
/* ->show_fdinfo handler of bpf_prog_fops: prints the program's type,
 * jited flag, hex-encoded tag, memlock size (pages << PAGE_SHIFT), id,
 * aggregated run statistics and verified instruction count to @m.
 */
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
	const struct bpf_prog *prog = filp->private_data;
	/* two hex digits per tag byte plus the terminating NUL */
	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
	struct bpf_prog_kstats stats;

	bpf_prog_get_stats(prog, &stats);
	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));

	seq_printf(m,
		   "prog_type:\t%u\n"
		   "prog_jited:\t%u\n"
		   "prog_tag:\t%s\n"
		   "memlock:\t%llu\n"
		   "prog_id:\t%u\n"
		   "run_time_ns:\t%llu\n"
		   "run_cnt:\t%llu\n"
		   "recursion_misses:\t%llu\n"
		   "verified_insns:\t%u\n",
		   prog->type, prog->jited, prog_tag,
		   prog->pages * 1ULL << PAGE_SHIFT,
		   prog->aux->id,
		   stats.nsecs, stats.cnt, stats.misses,
		   prog->aux->verified_insns);
}
#endif
/* File operations backing program fds (installed by bpf_prog_new_fd()).
 * __bpf_prog_get() compares a file's f_op against this table to check
 * that an fd refers to a BPF program. show_fdinfo is only wired up when
 * CONFIG_PROC_FS is enabled.
 */
const struct file_operations bpf_prog_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_prog_show_fdinfo,
#endif
.release = bpf_prog_release,
.read = bpf_dummy_read,
.write = bpf_dummy_write,
};
/* Create an anonymous-inode "bpf-prog" fd (O_RDWR | O_CLOEXEC) for @prog
 * after the security_bpf_prog() hook has approved it.
 *
 * Returns the new fd, or a negative errno from the hook or from
 * anon_inode_getfd().
 */
int bpf_prog_new_fd(struct bpf_prog *prog)
{
	int err = security_bpf_prog(prog);

	if (err < 0)
		return err;

	return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
				O_RDWR | O_CLOEXEC);
}
/* Add @i references to @prog in one step. */
void bpf_prog_add(struct bpf_prog *prog, int i)
{
	atomic64_add(i, &prog->aux->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_prog_add);
/* Remove @i references from @prog; warns if that drops refcnt to zero. */
void bpf_prog_sub(struct bpf_prog *prog, int i)
{
	/* Only to be used for undoing previous bpf_prog_add() in some
	 * error path. We still know that another entity in our call
	 * path holds a reference to the program, thus a plain atomic
	 * subtraction can be safely used in such cases!
	 */
	WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
}
EXPORT_SYMBOL_GPL(bpf_prog_sub);
/* Take one additional reference on @prog. */
void bpf_prog_inc(struct bpf_prog *prog)
{
	atomic64_inc(&prog->aux->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_prog_inc);
/* prog_idr_lock should have been held */
struct bpf_prog *bpf_prog_inc_not_zero(
struct bpf_prog *prog)
{
int refold;
refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
if (!refold)
return ERR_PTR(-ENOENT);
return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
/* Decide whether @prog may be handed out for the requested use.
 *
 * A NULL @attach_type means a plain refcount grab and is always allowed.
 * Otherwise the program type must equal *@attach_type, and an offloaded
 * program is only allowed when @attach_drv is set.
 */
bool bpf_prog_get_ok(struct bpf_prog *prog,
		     enum bpf_prog_type *attach_type, bool attach_drv)
{
	/* not an attachment, just a refcount inc, always allow */
	if (!attach_type)
		return true;

	return prog->type == *attach_type &&
	       (!bpf_prog_is_offloaded(prog->aux) || attach_drv);
}
/* Resolve @ufd to a BPF program and take a reference on it.
 *
 * Returns the program, ERR_PTR(-EBADF) when @ufd has no file behind it,
 * or ERR_PTR(-EINVAL) when the file is not a BPF program fd or
 * bpf_prog_get_ok() rejects the program.
 */
static struct bpf_prog *__bpf_prog_get(u32 ufd,
				       enum bpf_prog_type *attach_type,
				       bool attach_drv)
{
	CLASS(fd, f)(ufd);
	struct bpf_prog *prog;

	if (fd_empty(f))
		return ERR_PTR(-EBADF);
	if (fd_file(f)->f_op != &bpf_prog_fops)
		return ERR_PTR(-EINVAL);

	prog = fd_file(f)->private_data;
	if (bpf_prog_get_ok(prog, attach_type, attach_drv)) {
		bpf_prog_inc(prog);
		return prog;
	}

	return ERR_PTR(-EINVAL);
}
/* Get a referenced program from @ufd without any attach type check. */
struct bpf_prog *bpf_prog_get(u32 ufd)
{
	return __bpf_prog_get(ufd, NULL, false);
}
/* Get a referenced program from @ufd, requiring it to be of @type;
 * @attach_drv states whether an offloaded program is acceptable.
 */
struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
				       bool attach_drv)
{
	return __bpf_prog_get(ufd, &type, attach_drv);
}
EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
/* Initially all BPF programs could be loaded w/o specifying
 * expected_attach_type. Later for some of them specifying expected_attach_type
 * at load time became required so that program could be validated properly.
 * Programs of types that are allowed to be loaded both w/ and w/o (for
 * backward compatibility) expected_attach_type, should have the default attach
 * type assigned to expected_attach_type for the latter case, so that it can be
 * validated later at attach time.
 *
 * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if
 * prog type requires it but has some attach types that have to be backward
 * compatible.
 */
static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
{
	/* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
	 * exist so checking for non-zero is the way to go here.
	 */
	if (attr->expected_attach_type)
		return;

	switch (attr->prog_type) {
	case BPF_PROG_TYPE_CGROUP_SOCK:
		attr->expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE;
		break;
	case BPF_PROG_TYPE_SK_REUSEPORT:
		attr->expected_attach_type = BPF_SK_REUSEPORT_SELECT;
		break;
	default:
		/* every other program type keeps the zero value */
		break;
	}
}
static int
bpf_prog_load_check_attach(
enum bpf_prog_type prog_type,
enum bpf_attach_type expected_attach_type,
struct btf *attach_btf, u32 btf_id,
struct bpf_prog *dst_prog)
{
if (btf_id) {
if (btf_id > BTF_MAX_TYPE)
return -EINVAL;
if (!attach_btf && !dst_prog)
return -EINVAL;
switch (prog_type) {
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_LSM:
case BPF_PROG_TYPE_STRUCT_OPS:
case BPF_PROG_TYPE_EXT:
break;
default:
--> --------------------
--> maximum size reached
--> --------------------