/* map is generic key/value storage optionally accessible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ int (*map_alloc_check)(union bpf_attr *attr); struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_release)(struct bpf_map *map, struct file *map_file); void (*map_free)(struct bpf_map *map); int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); void (*map_release_uref)(struct bpf_map *map); void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key); int (*map_lookup_batch)(struct bpf_map *map, constunion bpf_attr *attr, union bpf_attr __user *uattr); int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_lookup_and_delete_batch)(struct bpf_map *map, constunion bpf_attr *attr, union bpf_attr __user *uattr); int (*map_update_batch)(struct bpf_map *map, struct file *map_file, constunion bpf_attr *attr, union bpf_attr __user *uattr); int (*map_delete_batch)(struct bpf_map *map, constunion bpf_attr *attr, union bpf_attr __user *uattr);
/* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); long (*map_delete_elem)(struct bpf_map *map, void *key); long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu);
/* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); /* If need_defer is true, the implementation should guarantee that * the to-be-put element is still alive before the bpf program, which * may manipulate it, exists.
*/ void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); int (*map_check_btf)(conststruct bpf_map *map, conststruct btf *btf, conststruct btf_type *key_type, conststruct btf_type *value_type);
/* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure * an inner map can be inserted to an outer map. * * Some properties of the inner map has been used during the * verification time. When inserting an inner map at the runtime, * map_meta_equal has to ensure the inserting map has the same * properties that the verifier has used earlier.
*/ bool (*map_meta_equal)(conststruct bpf_map *meta0, conststruct bpf_map *meta1);
int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); long (*map_for_each_callback)(struct bpf_map *map,
bpf_callback_t callback_fn, void *callback_ctx, u64 flags);
u64 (*map_mem_usage)(conststruct bpf_map *map);
/* BTF id of struct allocated by map_alloc */ int *map_btf_id;
/* bpf_iter info used to open a seq_file */ conststruct bpf_iter_seq_info *iter_seq_info;
};
enum { /* Support at most 11 fields in a BTF type */
BTF_FIELDS_MAX = 11,
};
struct btf_field_kptr { struct btf *btf; struct module *module; /* dtor used if btf_is_kernel(btf), otherwise the type is * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used
*/
btf_dtor_kfunc_t dtor;
u32 btf_id;
};
struct btf_record {
u32 cnt;
u32 field_mask; int spin_lock_off; int res_spin_lock_off; int timer_off; int wq_off; int refcount_off; struct btf_field fields[];
};
/* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h */ struct bpf_rb_node_kern { struct rb_node rb_node; void *owner;
} __attribute__((aligned(8)));
/* Non-opaque version of bpf_list_node in uapi/linux/bpf.h */ struct bpf_list_node_kern { struct list_head list_head; void *owner;
} __attribute__((aligned(8)));
/* 'Ownership' of program-containing map is claimed by the first program * that is going to use this map or by the first program which FD is * stored in the map to make sure that all callers and callees have the * same prog type, JITed flag and xdp_has_frags flag.
*/ struct bpf_map_owner { enum bpf_prog_type type; bool jited; bool xdp_has_frags;
u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE]; conststruct btf_type *attach_func_proto; enum bpf_attach_type expected_attach_type;
};
struct bpf_map { conststruct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY void *security; #endif enum bpf_map_type map_type;
u32 key_size;
u32 value_size;
u32 max_entries;
u64 map_extra; /* any per-map-type extra fields */
u32 map_flags;
u32 id; struct btf_record *record; int numa_node;
u32 btf_key_type_id;
u32 btf_value_type_id;
u32 btf_vmlinux_value_type_id; struct btf *btf; #ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; struct mutex freeze_mutex;
atomic64_t refcnt;
atomic64_t usercnt; /* rcu is used before freeing and work is only used during freeing */ union { struct work_struct work; struct rcu_head rcu;
};
atomic64_t writecnt;
spinlock_t owner_lock; struct bpf_map_owner *owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ bool free_after_mult_rcu_gp; bool free_after_rcu_gp;
atomic64_t sleepable_refcnt;
s64 __percpu *elem_count;
u64 cookie; /* write-once */
};
staticinlineconstchar *btf_field_type_name(enum btf_field_type type)
{ switch (type) { case BPF_SPIN_LOCK: return"bpf_spin_lock"; case BPF_RES_SPIN_LOCK: return"bpf_res_spin_lock"; case BPF_TIMER: return"bpf_timer"; case BPF_WORKQUEUE: return"bpf_wq"; case BPF_KPTR_UNREF: case BPF_KPTR_REF: return"kptr"; case BPF_KPTR_PERCPU: return"percpu_kptr"; case BPF_UPTR: return"uptr"; case BPF_LIST_HEAD: return"bpf_list_head"; case BPF_LIST_NODE: return"bpf_list_node"; case BPF_RB_ROOT: return"bpf_rb_root"; case BPF_RB_NODE: return"bpf_rb_node"; case BPF_REFCOUNT: return"bpf_refcount"; default:
WARN_ON_ONCE(1); return"unknown";
}
}
staticinline u32 btf_field_type_size(enum btf_field_type type)
{ switch (type) { case BPF_SPIN_LOCK: returnsizeof(struct bpf_spin_lock); case BPF_RES_SPIN_LOCK: returnsizeof(struct bpf_res_spin_lock); case BPF_TIMER: returnsizeof(struct bpf_timer); case BPF_WORKQUEUE: returnsizeof(struct bpf_wq); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: returnsizeof(u64); case BPF_LIST_HEAD: returnsizeof(struct bpf_list_head); case BPF_LIST_NODE: returnsizeof(struct bpf_list_node); case BPF_RB_ROOT: returnsizeof(struct bpf_rb_root); case BPF_RB_NODE: returnsizeof(struct bpf_rb_node); case BPF_REFCOUNT: returnsizeof(struct bpf_refcount); default:
WARN_ON_ONCE(1); return 0;
}
}
staticinline u32 btf_field_type_align(enum btf_field_type type)
{ switch (type) { case BPF_SPIN_LOCK: return __alignof__(struct bpf_spin_lock); case BPF_RES_SPIN_LOCK: return __alignof__(struct bpf_res_spin_lock); case BPF_TIMER: return __alignof__(struct bpf_timer); case BPF_WORKQUEUE: return __alignof__(struct bpf_wq); case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: return __alignof__(u64); case BPF_LIST_HEAD: return __alignof__(struct bpf_list_head); case BPF_LIST_NODE: return __alignof__(struct bpf_list_node); case BPF_RB_ROOT: return __alignof__(struct bpf_rb_root); case BPF_RB_NODE: return __alignof__(struct bpf_rb_node); case BPF_REFCOUNT: return __alignof__(struct bpf_refcount); default:
WARN_ON_ONCE(1); return 0;
}
}
switch (field->type) { case BPF_REFCOUNT:
refcount_set((refcount_t *)addr, 1); break; case BPF_RB_NODE:
RB_CLEAR_NODE((struct rb_node *)addr); break; case BPF_LIST_HEAD: case BPF_LIST_NODE:
INIT_LIST_HEAD((struct list_head *)addr); break; case BPF_RB_ROOT: /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_WORKQUEUE: case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: break; default:
WARN_ON_ONCE(1); return;
}
}
staticinlinevoid bpf_obj_init(conststruct btf_record *rec, void *obj)
{ int i;
if (IS_ERR_OR_NULL(rec)) return; for (i = 0; i < rec->cnt; i++)
bpf_obj_init_field(&rec->fields[i], obj + rec->fields[i].offset);
}
/* 'dst' must be a temporary buffer and should not point to memory that is being * used in parallel by a bpf program or bpf syscall, otherwise the access from * the bpf program or bpf syscall may be corrupted by the reinitialization, * leading to weird problems. Even 'dst' is newly-allocated from bpf memory * allocator, it is still possible for 'dst' to be used in parallel by a bpf * program or bpf syscall.
*/ staticinlinevoid check_and_init_map_value(struct bpf_map *map, void *dst)
{
bpf_obj_init(map->record, dst);
}
/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. * Best-effort only. No barriers here, since it _will_ race with concurrent * updates from BPF programs. Called from bpf syscall and mostly used with * size 8 or 16 bytes, so ask compiler to inline it.
*/ staticinlinevoid bpf_long_memcpy(void *dst, constvoid *src, u32 size)
{ constlong *lsrc = src; long *ldst = dst;
size /= sizeof(long); while (size--)
data_race(*ldst++ = *lsrc++);
}
/* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ staticinlinevoid bpf_obj_memcpy(struct btf_record *rec, void *dst, void *src, u32 size, bool long_memcpy)
{
u32 curr_off = 0; int i;
if (IS_ERR_OR_NULL(rec)) { if (long_memcpy)
bpf_long_memcpy(dst, src, round_up(size, 8)); else
memcpy(dst, src, size); return;
}
for (i = 0; i < rec->cnt; i++) {
u32 next_off = rec->fields[i].offset;
u32 sz = next_off - curr_off;
/* bpf_type_flag contains a set of flags that are applicable to the values of * arg_type, ret_type and reg_type. For example, a pointer value may be null, * or a memory is read-only. We classify types into two categories: base types * and extended types. Extended types are base types combined with a type flag. * * Currently there are no more than 32 base types in arg_type, ret_type and * reg_types.
*/ #define BPF_BASE_TYPE_BITS 8
enum bpf_type_flag { /* PTR may be NULL. */
PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS),
/* MEM is read-only. When applied on bpf_arg, it indicates the arg is * compatible with both mutable and immutable memory.
*/
MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS),
/* MEM points to BPF ring buffer reservation. */
MEM_RINGBUF = BIT(2 + BPF_BASE_TYPE_BITS),
/* MEM is in user address space. */
MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS),
/* MEM is a percpu memory. MEM_PERCPU tags PTR_TO_BTF_ID. When tagged * with MEM_PERCPU, PTR_TO_BTF_ID _cannot_ be directly accessed. In * order to drop this tag, it must be passed into bpf_per_cpu_ptr() * or bpf_this_cpu_ptr(), which will return the pointer corresponding * to the specified cpu.
*/
MEM_PERCPU = BIT(4 + BPF_BASE_TYPE_BITS),
/* Indicates that the argument will be released. */
OBJ_RELEASE = BIT(5 + BPF_BASE_TYPE_BITS),
/* PTR is not trusted. This is only used with PTR_TO_BTF_ID, to mark * unreferenced and referenced kptr loaded from map value using a load * instruction, so that they can only be dereferenced but not escape the * BPF program into the kernel (i.e. cannot be passed as arguments to * kfunc or bpf helpers).
*/
PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS),
/* MEM can be uninitialized. */
MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS),
/* DYNPTR points to memory local to the bpf program. */
DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS),
/* DYNPTR points to a kernel-produced ringbuf record. */
DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS),
/* Size is known at compile time. */
MEM_FIXED_SIZE = BIT(10 + BPF_BASE_TYPE_BITS),
/* MEM is of an allocated object of type in program BTF. This is used to * tag PTR_TO_BTF_ID allocated using bpf_obj_new.
*/
MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS),
/* PTR was passed from the kernel in a trusted context, and may be * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. * PTR_UNTRUSTED refers to a kptr that was read directly from a map * without invoking bpf_kptr_xchg(). What we really need to know is * whether a pointer is safe to pass to a kfunc or BPF helper function. * While PTR_UNTRUSTED pointers are unsafe to pass to kfuncs and BPF * helpers, they do not cover all possible instances of unsafe * pointers. For example, a pointer that was obtained from walking a * struct will _not_ get the PTR_UNTRUSTED type modifier, despite the * fact that it may be NULL, invalid, etc. This is due to backwards * compatibility requirements, as this was the behavior that was first * introduced when kptrs were added. The behavior is now considered * deprecated, and PTR_UNTRUSTED will eventually be removed. * * PTR_TRUSTED, on the other hand, is a pointer that the kernel * guarantees to be valid and safe to pass to kfuncs and BPF helpers. * For example, pointers passed to tracepoint arguments are considered * PTR_TRUSTED, as are pointers that are passed to struct_ops * callbacks. As alluded to above, pointers that are obtained from * walking PTR_TRUSTED pointers are _not_ trusted. For example, if a * struct task_struct *task is PTR_TRUSTED, then accessing * task->last_wakee will lose the PTR_TRUSTED modifier when it's stored * in a BPF register. Similarly, pointers passed to certain programs * types such as kretprobes are not guaranteed to be valid, as they may * for example contain an object that was recently freed.
*/
PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS),
/* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */
MEM_RCU = BIT(13 + BPF_BASE_TYPE_BITS),
/* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning. * Currently only valid for linked-list and rbtree nodes. If the nodes * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
*/
NON_OWN_REF = BIT(14 + BPF_BASE_TYPE_BITS),
/* Memory must be aligned on some architectures, used in combination with * MEM_FIXED_SIZE.
*/
MEM_ALIGNED = BIT(17 + BPF_BASE_TYPE_BITS),
/* MEM is being written to, often combined with MEM_UNINIT. Non-presence * of MEM_WRITE means that MEM is only being read. MEM_WRITE without the * MEM_UNINIT means that memory needs to be initialized since it is also * read.
*/
MEM_WRITE = BIT(18 + BPF_BASE_TYPE_BITS),
/* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS)
/* Max number of all types. */ #define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1))
/* function argument constraints */ enum bpf_arg_type {
ARG_DONTCARE = 0, /* unused argument in helper function */
/* the following constraints used to prototype * bpf_map_lookup/update/delete_elem() functions
*/
ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */
ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */
ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */
/* Used to prototype bpf_memcmp() and other functions that access data * on eBPF program stack
*/
ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */
ARG_PTR_TO_ARENA,
ARG_CONST_SIZE, /* number of bytes accessed from memory */
ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */
ARG_PTR_TO_CTX, /* pointer to context */
ARG_ANYTHING, /* any (initialized) argument is ok */
ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */
ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */
ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */
ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */
ARG_PTR_TO_RINGBUF_MEM, /* pointer to dynamically reserved ringbuf memory */
ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */
ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */
ARG_PTR_TO_FUNC, /* pointer to a bpf program function */
ARG_PTR_TO_STACK, /* pointer to stack */
ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */
ARG_PTR_TO_TIMER, /* pointer to bpf_timer */
ARG_KPTR_XCHG_DEST, /* pointer to destination that kptrs are bpf_kptr_xchg'd into */
ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */
__BPF_ARG_TYPE_MAX,
/* Extended arg_types. */
ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE,
ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM,
ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX,
ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET,
ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK,
ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, /* Pointer to memory does not need to be initialized, since helper function * fills all bytes or clears them in error case.
*/
ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | MEM_WRITE | ARG_PTR_TO_MEM, /* Pointer to valid memory of size known at compile time. */
ARG_PTR_TO_FIXED_SIZE_MEM = MEM_FIXED_SIZE | ARG_PTR_TO_MEM,
/* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag.
*/
__BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT,
};
static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
/* type of values returned from helper functions */ enum bpf_return_type {
RET_INTEGER, /* function returns integer */
RET_VOID, /* function doesn't return anything */
RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */
RET_PTR_TO_SOCKET, /* returns a pointer to a socket */
RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */
RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */
RET_PTR_TO_MEM, /* returns a pointer to memory */
RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */
RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */
__BPF_RET_TYPE_MAX,
/* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag.
*/
__BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT,
};
static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL * instructions after verifying
*/ struct bpf_func_proto {
u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; bool pkt_access; bool might_sleep; /* set to true if helper follows contract for llvm * attribute bpf_fastcall: * - void functions do not scratch r0 * - functions taking N arguments scratch only registers r1-rN
*/ bool allow_fastcall; enum bpf_return_type ret_type; union { struct { enum bpf_arg_type arg1_type; enum bpf_arg_type arg2_type; enum bpf_arg_type arg3_type; enum bpf_arg_type arg4_type; enum bpf_arg_type arg5_type;
}; enum bpf_arg_type arg_type[5];
}; union { struct {
u32 *arg1_btf_id;
u32 *arg2_btf_id;
u32 *arg3_btf_id;
u32 *arg4_btf_id;
u32 *arg5_btf_id;
};
u32 *arg_btf_id[5]; struct {
size_t arg1_size;
size_t arg2_size;
size_t arg3_size;
size_t arg4_size;
size_t arg5_size;
};
size_t arg_size[5];
}; int *ret_btf_id; /* return value btf_id */ bool (*allowed)(conststruct bpf_prog *prog);
};
/* bpf_context is intentionally undefined structure. Pointer to bpf_context is * the first argument to eBPF programs. * For socket filters: 'struct bpf_context *' == 'struct sk_buff *'
*/ struct bpf_context;
/* types of values stored in eBPF registers */ /* Pointer types represent: * pointer * pointer + imm * pointer + (u16) var * pointer + (u16) var + imm * if (range > 0) then [ptr, ptr + range - off) is safe to access * if (id > 0) means that some 'var' was added * if (off > 0) means that 'imm' was added
*/ enum bpf_reg_type {
NOT_INIT = 0, /* nothing was written into register */
SCALAR_VALUE, /* reg doesn't contain a valid pointer */
PTR_TO_CTX, /* reg points to bpf_context */
CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
PTR_TO_MAP_VALUE, /* reg points to map element value */
PTR_TO_MAP_KEY, /* reg points to a map element key */
PTR_TO_STACK, /* reg == frame_pointer + offset */
PTR_TO_PACKET_META, /* skb->data - meta_len */
PTR_TO_PACKET, /* reg points to skb->data */
PTR_TO_PACKET_END, /* skb->data + headlen */
PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */
PTR_TO_SOCKET, /* reg points to struct bpf_sock */
PTR_TO_SOCK_COMMON, /* reg points to sock_common */
PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */
PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */
PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ /* PTR_TO_BTF_ID points to a kernel struct that does not need * to be null checked by the BPF program. This does not imply the * pointer is _not_ null and in practice this can easily be a null * pointer when reading pointer chains. The assumption is program * context will handle null pointer dereference typically via fault * handling. The verifier must keep this in mind and can make no * assumptions about null or non-null when doing branch analysis. * Further, when passed into helpers the helpers can not, without * additional context, assume the value is non-null.
*/
PTR_TO_BTF_ID,
PTR_TO_MEM, /* reg points to valid memory region */
PTR_TO_ARENA,
PTR_TO_BUF, /* reg points to a read/write buffer */
PTR_TO_FUNC, /* reg points to a bpf program function */
CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */
__BPF_REG_TYPE_MAX,
/* Extended reg_types. */
PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE,
PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET,
PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON,
PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, /* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not * been checked for null. Used primarily to inform the verifier * an explicit null check is required for this struct.
*/
PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID,
/* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag.
*/
__BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT,
};
static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
/* The information passed from prog-specific *_is_valid_access * back to the verifier.
*/ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; bool is_ldsx; union { int ctx_field_size; struct { struct btf *btf;
u32 btf_id;
u32 ref_obj_id;
};
}; struct bpf_verifier_log *log; /* for verbose logs */ bool is_retval; /* is accessing function return value ? */
};
/* Given a BPF_ATOMIC instruction @atomic_insn, return true if it is an * atomic load or store, and false if it is a read-modify-write instruction.
*/ staticinlinebool
bpf_atomic_is_load_store(conststruct bpf_insn *atomic_insn)
{ switch (atomic_insn->imm) { case BPF_LOAD_ACQ: case BPF_STORE_REL: returntrue; default: returnfalse;
}
}
struct bpf_prog_ops { int (*test_run)(struct bpf_prog *prog, constunion bpf_attr *kattr, union bpf_attr __user *uattr);
};
/* Restore arguments before returning from trampoline to let original function * continue executing. This flag is used for fentry progs when there are no * fexit progs.
*/ #define BPF_TRAMP_F_RESTORE_REGS BIT(0) /* Call original function after fentry progs, but before fexit progs. * Makes sense for fentry/fexit, normal calls and indirect calls.
*/ #define BPF_TRAMP_F_CALL_ORIG BIT(1) /* Skip current frame and return to parent. Makes sense for fentry/fexit * programs only. Should not be used with normal calls and indirect calls.
*/ #define BPF_TRAMP_F_SKIP_FRAME BIT(2) /* Store IP address of the caller on the trampoline stack, * so it's available for trampoline's programs.
*/ #define BPF_TRAMP_F_IP_ARG BIT(3) /* Return the return value of fentry prog. Only used by bpf_struct_ops. */ #define BPF_TRAMP_F_RET_FENTRY_RET BIT(4)
/* Get original function from stack instead of from provided direct address. * Makes sense for trampolines with fexit or fmod_ret programs.
*/ #define BPF_TRAMP_F_ORIG_STACK BIT(5)
/* This trampoline is on a function with another ftrace_ops with IPMODIFY, * e.g., a live patch. This flag is set and cleared by ftrace call backs,
*/ #define BPF_TRAMP_F_SHARE_IPMODIFY BIT(6)
/* Indicate that current trampoline is in a tail call context. Then, it has to * cache and restore tail_call_cnt to avoid infinite tail call loop.
*/ #define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7)
/* * Indicate the trampoline should be suitable to receive indirect calls; * without this indirectly calling the generated code can result in #UD/#CP, * depending on the CFI options. * * Used by bpf_struct_ops. * * Incompatible with FENTRY usage, overloads @func_addr argument.
*/ #define BPF_TRAMP_F_INDIRECT BIT(8)
/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86.
*/ enum { #ifdefined(__s390x__)
BPF_MAX_TRAMP_LINKS = 27, #else
BPF_MAX_TRAMP_LINKS = 38, #endif
};
struct bpf_tramp_links { struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; int nr_links;
};
struct bpf_tramp_run_ctx;
/* Different use cases for BPF trampoline: * 1. replace nop at the function entry (kprobe equivalent) * flags = BPF_TRAMP_F_RESTORE_REGS * fentry = a set of programs to run before returning from trampoline * * 2. replace nop at the function entry (kprobe + kretprobe equivalent) * flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME * orig_call = fentry_ip + MCOUNT_INSN_SIZE * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function * * 3. replace direct call instruction anywhere in the function body * or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid) * With flags = 0 * fentry = a set of programs to run before returning from trampoline * With flags = BPF_TRAMP_F_CALL_ORIG * orig_call = original callback addr or direct function addr * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function
*/ struct bpf_tramp_image; int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, conststruct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr); void *arch_alloc_bpf_trampoline(unsignedint size); void arch_free_bpf_trampoline(void *image, unsignedint size); int __must_check arch_protect_bpf_trampoline(void *image, unsignedint size); int arch_bpf_trampoline_size(conststruct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr);
struct bpf_trampoline { /* hlist for trampoline_table */ struct hlist_node hlist; struct ftrace_ops *fops; /* serializes access to fields of this trampoline */ struct mutex mutex;
refcount_t refcnt;
u32 flags;
u64 key; struct { struct btf_func_model model; void *addr; bool ftrace_managed;
} func; /* if !NULL this is BPF_PROG_TYPE_EXT program that extends another BPF * program by replacing one of its functions. func.addr is the address * of the function it replaced.
*/ struct bpf_prog *extension_prog; /* list of BPF programs using this trampoline */ struct hlist_head progs_hlist[BPF_TRAMP_MAX]; /* Number of attached programs. A counter per kind. */ int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ struct bpf_tramp_image *cur_image;
};
/* the implementation of the opaque uapi struct bpf_dynptr */ struct bpf_dynptr_kern { void *data; /* Size represents the number of usable bytes of dynptr data. * If for example the offset is at 4 for a local dynptr whose data is * of type u64, the number of usable bytes is 4. * * The upper 8 bits are reserved. It is as follows: * Bits 0 - 23 = size * Bits 24 - 30 = dynptr type * Bit 31 = whether dynptr is read-only
*/
u32 size;
u32 offset;
} __aligned(8);
enum bpf_dynptr_type {
BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */
BPF_DYNPTR_TYPE_LOCAL, /* Underlying data is a ringbuf record */
BPF_DYNPTR_TYPE_RINGBUF, /* Underlying data is a sk_buff */
BPF_DYNPTR_TYPE_SKB, /* Underlying data is a xdp_buff */
BPF_DYNPTR_TYPE_XDP,
};
/* * When the architecture supports STATIC_CALL replace the bpf_dispatcher_fn * indirection with a direct call to the bpf program. If the architecture does * not have STATIC_CALL, avoid a double-indirection.
*/ #ifdef CONFIG_HAVE_STATIC_CALL
struct bpf_stream {
atomic_t capacity; struct llist_head log; /* list of in-flight stream elements in LIFO order */
struct mutex lock; /* lock protecting backlog_{head,tail} */ struct llist_node *backlog_head; /* list of in-flight stream elements in FIFO order */ struct llist_node *backlog_tail; /* tail of the list above */
};
struct bpf_stream_stage { struct llist_head log; int len;
};
struct bpf_prog_aux {
atomic64_t refcnt;
u32 used_map_cnt;
u32 used_btf_cnt;
u32 max_ctx_offset;
u32 max_pkt_offset;
u32 max_tp_access;
u32 stack_depth;
u32 id;
u32 func_cnt; /* used by non-func prog as the number of func progs */
u32 real_func_cnt; /* includes hidden progs, only used for JIT and freeing progs */
u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
u32 attach_btf_id; /* in-kernel BTF type id to attach to */
u32 attach_st_ops_member_off;
u32 ctx_arg_info_size;
u32 max_rdonly_access;
u32 max_rdwr_access; struct btf *attach_btf; struct bpf_ctx_arg_aux *ctx_arg_info; void __percpu *priv_stack_ptr; struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; struct bpf_trampoline *dst_trampoline; enum bpf_prog_type saved_dst_prog_type; enum bpf_attach_type saved_dst_attach_type; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool dev_bound; /* Program is bound to the netdev. */ bool offload_requested; /* Program is bound and offloaded to the netdev. */ bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool attach_tracing_prog; /* true if tracing another tracing program */ bool func_proto_unreliable; bool tail_call_reachable; bool xdp_has_frags; bool exception_cb; bool exception_boundary; bool is_extended; /* true if extended by freplace program */ bool jits_use_priv_stack; bool priv_stack_requested; bool changes_pkt_data; bool might_sleep;
u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; void (*recursion_detected)(struct bpf_prog *prog); /* callback if recursion is detected */ /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ conststruct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ constchar *attach_func_name; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; struct bpf_kfunc_desc_tab *kfunc_tab; struct bpf_kfunc_btf_tab *kfunc_btf_tab;
u32 size_poke_tab; #ifdef CONFIG_FINEIBT struct bpf_ksym ksym_prefix; #endif struct bpf_ksym ksym; conststruct bpf_prog_ops *ops; conststruct bpf_struct_ops *st_ops; struct bpf_map **used_maps; struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */ struct btf_mod_pair *used_btfs; struct bpf_prog *prog; struct user_struct *user;
u64 load_time; /* ns since boottime */
u32 verified_insns; int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN];
u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64); #ifdef CONFIG_SECURITY void *security; #endif struct bpf_token *token; struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; struct bpf_func_info_aux *func_info_aux; /* bpf_line_info loaded from userspace. linfo->insn_off * has the xlated insn offset. * Both the main and sub prog share the same linfo. * The subprog can access its first linfo by * using the linfo_idx.
*/ struct bpf_line_info *linfo; /* jited_linfo is the jited addr of the linfo. It has a * one to one mapping to linfo: * jited_linfo[i] is the jited addr for the linfo[i]->insn_off. * Both the main and sub prog share the same jited_linfo. * The subprog can access its first jited_linfo by * using the linfo_idx.
*/ void **jited_linfo;
u32 func_info_cnt;
u32 nr_linfo; /* subprog can use linfo_idx to access its first linfo and * jited_linfo. * main prog always has linfo_idx == 0
*/
u32 linfo_idx; struct module *mod;
u32 num_exentries; struct exception_table_entry *extable; union { struct work_struct work; struct rcu_head rcu;
}; struct bpf_stream stream[2];
};
struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1,/* archs need to JIT the prog */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */
blinding_requested:1, /* needs constant blinding */
blinded:1, /* Was blinded */
is_func:1, /* program is a bpf function */
kprobe_override:1, /* Do we override a kprobe? */
has_callchain_buf:1, /* callchain buffer allocated? */
enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
call_get_func_ip:1, /* Do we call get_func_ip() */
tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
sleepable:1; /* BPF program is sleepable */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
u32 jited_len; /* Size of jited insns in bytes */
u8 tag[BPF_TAG_SIZE]; struct bpf_prog_stats __percpu *stats; int __percpu *active; unsignedint (*bpf_func)(constvoid *ctx, conststruct bpf_insn *insn); struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ /* Instructions for interpreter */ union {
DECLARE_FLEX_ARRAY(struct sock_filter, insns);
DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi);
};
};
struct bpf_array_aux { /* Programs with direct jumps into programs part of this array. */ struct list_head poke_progs; struct bpf_map *map; struct mutex poke_mutex; struct work_struct work;
};
/* rcu is used before freeing, work can be used to schedule that * RCU-based freeing before that, so they never overlap
*/ union { struct rcu_head rcu; struct work_struct work;
}; /* whether BPF link itself has "sleepable" semantics, which can differ * from underlying BPF program having a "sleepable" semantics, as BPF * link's semantics is determined by target attach hook
*/ bool sleepable;
};
struct bpf_link_ops { void (*release)(struct bpf_link *link); /* deallocate link resources callback, called without RCU grace period * waiting
*/ void (*dealloc)(struct bpf_link *link); /* deallocate link resources callback, called after RCU grace period; * if either the underlying BPF program is sleepable or BPF link's * target hook is sleepable, we'll go through tasks trace RCU GP and * then "classic" RCU GP; this need for chaining tasks trace and * classic RCU GPs is designated by setting bpf_link->sleepable flag
*/ void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); void (*show_fdinfo)(conststruct bpf_link *link, struct seq_file *seq); int (*fill_link_info)(conststruct bpf_link *link, struct bpf_link_info *info); int (*update_map)(struct bpf_link *link, struct bpf_map *new_map, struct bpf_map *old_map);
__poll_t (*poll)(struct file *file, struct poll_table_struct *pts);
};
#define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64 /** * struct bpf_struct_ops - A structure of callbacks allowing a subsystem to * define a BPF_MAP_TYPE_STRUCT_OPS map type composed * of BPF_PROG_TYPE_STRUCT_OPS progs. * @verifier_ops: A structure of callbacks that are invoked by the verifier * when determining whether the struct_ops progs in the * struct_ops map are valid. * @init: A callback that is invoked a single time, and before any other * callback, to initialize the structure. A nonzero return value means * the subsystem could not be initialized. * @check_member: When defined, a callback invoked by the verifier to allow * the subsystem to determine if an entry in the struct_ops map * is valid. A nonzero return value means that the map is * invalid and should be rejected by the verifier. * @init_member: A callback that is invoked for each member of the struct_ops * map to allow the subsystem to initialize the member. A nonzero * value means the member could not be initialized. This callback * is exclusive with the @type, @type_id, @value_type, and * @value_id fields. * @reg: A callback that is invoked when the struct_ops map has been * initialized and is being attached to. Zero means the struct_ops map * has been successfully registered and is live. A nonzero return value * means the struct_ops map could not be registered. * @unreg: A callback that is invoked when the struct_ops map should be * unregistered. * @update: A callback that is invoked when the live struct_ops map is being * updated to contain new values. This callback is only invoked when * the struct_ops map is loaded with BPF_F_LINK. If not defined, the * it is assumed that the struct_ops map cannot be updated. * @validate: A callback that is invoked after all of the members have been * initialized. This callback should perform static checks on the * map, meaning that it should either fail or succeed * deterministically. A struct_ops map that has been validated may * not necessarily succeed in being registered if the call to @reg * fails. For example, a valid struct_ops map may be loaded, but * then fail to be registered due to there being another active * struct_ops map on the system in the subsystem already. For this * reason, if this callback is not defined, the check is skipped as * the struct_ops map will have final verification performed in * @reg. * @type: BTF type. * @value_type: Value type. * @name: The name of the struct bpf_struct_ops object. * @func_models: Func models * @type_id: BTF type id. * @value_id: BTF value id.
*/ struct bpf_struct_ops { conststruct bpf_verifier_ops *verifier_ops; int (*init)(struct btf *btf); int (*check_member)(conststruct btf_type *t, conststruct btf_member *member, conststruct bpf_prog *prog); int (*init_member)(conststruct btf_type *t, conststruct btf_member *member, void *kdata, constvoid *udata); int (*reg)(void *kdata, struct bpf_link *link); void (*unreg)(void *kdata, struct bpf_link *link); int (*update)(void *kdata, void *old_kdata, struct bpf_link *link); int (*validate)(void *kdata); void *cfi_stubs; struct module *owner; constchar *name; struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
};
/* Every member of a struct_ops type has an instance even a member is not * an operator (function pointer). The "info" field will be assigned to * prog->aux->ctx_arg_info of BPF struct_ops programs to provide the * argument information required by the verifier to verify the program. * * btf_ctx_access() will lookup prog->aux->ctx_arg_info to find the * corresponding entry for an given argument.
*/ struct bpf_struct_ops_arg_info { struct bpf_ctx_arg_aux *info;
u32 cnt;
};
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.