/* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. * * The first pass is depth-first-search to check that the program is a DAG. * It rejects the following programs: * - larger than BPF_MAXINSNS insns * - if loop is present (detected via back-edge) * - unreachable insns exist (shouldn't be a forest. program = one function) * - out of bounds or malformed jumps * The second pass is all possible path descent from the 1st insn. * Since it's analyzing all paths through the program, the length of the * analysis is limited to 64k insn, which may be hit even if total number of * insn is less than 4K, but there are too many branches that change stack/regs. * Number of 'branches to be analyzed' is limited to 1k * * On entry to each instruction, each register has a type, and the instruction * changes the types of the registers depending on instruction semantics. * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is * copied to R1. * * All registers are 64-bit. * R0 - return register * R1-R5 argument passing registers * R6-R9 callee saved registers * R10 - frame pointer read-only * * At the start of BPF program the register R1 contains a pointer to bpf_context * and has type PTR_TO_CTX. * * Verifier tracks arithmetic operations on pointers in case: * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20), * 1st insn copies R10 (which has FRAME_PTR) type into R1 * and 2nd arithmetic instruction is pattern matched to recognize * that it wants to construct a pointer to some element within stack. * So after 2nd insn, the register R1 has type PTR_TO_STACK * (and -20 constant is saved for further stack bounds checking). * Meaning that this reg is a pointer to stack plus known immediate constant. 
* * Most of the time the registers have SCALAR_VALUE type, which * means the register has some value, but it's not a valid pointer. * (like pointer plus pointer becomes SCALAR_VALUE type) * * When verifier sees load or store instructions the type of base register * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are * four pointer types recognized by check_mem_access() function. * * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' * and the range of [ptr, ptr + map's value_size) is accessible. * * registers used to pass values to function calls are checked against * function argument constraints. * * ARG_PTR_TO_MAP_KEY is one of such argument constraints. * It means that the register type passed to this function must be * PTR_TO_STACK and it will be used inside the function as * 'pointer to map element key' * * For example the argument constraints for bpf_map_lookup_elem(): * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, * .arg1_type = ARG_CONST_MAP_PTR, * .arg2_type = ARG_PTR_TO_MAP_KEY, * * ret_type says that this function returns 'pointer to map elem value or null' * function expects 1st argument to be a const pointer to 'struct bpf_map' and * 2nd argument should be a pointer to stack, which will be used inside * the helper function as a pointer to map element key. * * On the kernel side the helper function looks like: * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) * { * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; * void *key = (void *) (unsigned long) r2; * void *value; * * here kernel can access 'key' and 'map' pointers safely, knowing that * [key, key + map->key_size) bytes are valid and were initialized on * the stack of eBPF program. 
* } * * Corresponding eBPF program may look like: * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), * here verifier looks at prototype of map_lookup_elem() and sees: * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok, * Now verifier knows that this map has key of R1->map_ptr->key_size bytes * * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far, * Now verifier checks that [R2, R2 + map's key_size) are within stack limits * and were initialized prior to this call. * If it's ok, then verifier allows this BPF_CALL insn and looks at * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function * returns either pointer to map value or NULL. * * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off' * insn, the register holding that pointer in the true branch changes state to * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false * branch. See check_cond_jmp_op(). * * After the call R0 is set to return type of the function and registers R1-R5 * are set to NOT_INIT to indicate that they are no longer readable. * * The following reference types represent a potential reference to a kernel * resource which, after first being allocated, must be checked and freed by * the BPF program: * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET * * When the verifier sees a helper call return a reference type, it allocates a * pointer id for the reference and stores it in the current function state. 
* Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type * passes through a NULL-check conditional. For the branch wherein the state is * changed to CONST_IMM, the verifier releases the reference. * * For each helper function that allocates a reference, such as * bpf_sk_lookup_tcp(), there is a corresponding release function, such as * bpf_sk_release(). When a reference type passes into the release function, * the verifier also releases the reference. If any unchecked or unreleased * reference remains at the end of the program, the verifier rejects it.
*/
/* verifier_state + insn_idx are pushed to stack when branch is encountered.
 *
 * Elements form a singly linked list through 'next'; pop_stack() consumes
 * the list starting at env->head.
 */
struct bpf_verifier_stack_elem {
	/* verifier state is 'st'
	 * before processing instruction 'insn_idx'
	 * and after processing instruction 'prev_insn_idx'
	 */
	struct bpf_verifier_state st;
	/* instruction at which verification resumes when this element is popped */
	int insn_idx;
	/* instruction that was processed right before 'insn_idx' on this path */
	int prev_insn_idx;
	/* next (older) element on the stack */
	struct bpf_verifier_stack_elem *next;
	/* length of verifier log at the time this state was pushed on stack;
	 * pop_stack() may rewind the log to this position
	 */
	u32 log_pos;
};
/* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, * generally to pass info about user-defined local kptr types to later * verification logic * bpf_obj_drop/bpf_percpu_obj_drop * Record the local kptr type to be drop'd * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) * Record the local kptr type to be refcount_incr'd and use * arg_owning_ref to determine whether refcount_acquire should be * fallible
*/ struct btf *arg_btf;
u32 arg_btf_id; bool arg_owning_ref; bool arg_prog;
if (is_ptr_cast_function(func_id))
ref_obj_uses++; if (is_acquire_function(func_id, map))
ref_obj_uses++; if (is_dynptr_ref_function(func_id))
ref_obj_uses++;
/* Tell whether an object of nr_slots stack slots whose first slot is spi lies
 * entirely inside the stack currently allocated for state.
 *
 * The spi grows downwards, so the object covers [spi - nr_slots + 1, spi].
 * For example, a dynptr takes two slots: the first one at spi and the
 * second one at spi - 1. Both ends of that range must fall within
 * [0, allocated_stack / BPF_REG_SIZE).
 */
static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
{
	int total_slots = state->allocated_stack / BPF_REG_SIZE;
	int lowest_spi = spi - nr_slots + 1;

	if (lowest_spi < 0)
		return false;

	return spi < total_slots;
}
staticint stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, constchar *obj_kind, int nr_slots)
{ int off, spi;
if (!tnum_is_const(reg->var_off)) {
verbose(env, "%s has to be at a constant offset\n", obj_kind); return -EINVAL;
}
off = reg->off + reg->var_off.value; if (off % BPF_REG_SIZE) {
verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL;
}
spi = __get_spi(off); if (spi + 1 < nr_slots) {
verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL;
}
if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots)) return -ERANGE; return spi;
}
staticint destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi);
staticint mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id)
{ struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type type; int spi, i, err;
spi = dynptr_get_spi(env, reg); if (spi < 0) return spi;
/* We cannot assume both spi and spi - 1 belong to the same dynptr, * hence we need to call destroy_if_dynptr_stack_slot twice for both, * to ensure that for the following example: * [d1][d1][d2][d2] * spi 3 2 1 0 * So marking spi = 2 should lead to destruction of both d1 and d2. In * case they do belong to same dynptr, second call won't see slot_type * as STACK_DYNPTR and will simply skip destruction.
*/
err = destroy_if_dynptr_stack_slot(env, state, spi); if (err) return err;
err = destroy_if_dynptr_stack_slot(env, state, spi - 1); if (err) return err;
for (i = 0; i < BPF_REG_SIZE; i++) {
state->stack[spi].slot_type[i] = STACK_DYNPTR;
state->stack[spi - 1].slot_type[i] = STACK_DYNPTR;
}
type = arg_to_dynptr_type(arg_type); if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL;
/* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot? * * While we don't allow reading STACK_INVALID, it is still possible to * do <8 byte writes marking some but not all slots as STACK_MISC. Then, * helpers or insns can do partial read of that part without failing, * but check_stack_range_initialized, check_stack_read_var_off, and * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of * the slot conservatively. Hence we need to prevent those liveness * marking walks. * * This was not a problem before because STACK_INVALID is only set by * default (where the default reg state has its reg->parent as NULL), or * in clean_live_states after REG_LIVE_DONE (at which point * mark_reg_read won't walk reg->parent chain), but not randomly during * verifier state exploration (like we did above). Hence, for our case * parentage chain will still be live (i.e. reg->parent may be * non-NULL), while earlier reg->parent was NULL, so we need * REG_LIVE_WRITTEN to screen off read marker propagation when it is * done later on reads or by mark_dynptr_read as well to unnecessary * mark registers in verifier state.
*/
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
}
/* If the dynptr has a ref_obj_id, then we need to invalidate * two things: * * 1) Any dynptrs with a matching ref_obj_id (clones) * 2) Any slices derived from this dynptr.
*/
/* Invalidate any slices associated with this dynptr */
WARN_ON_ONCE(release_reference(env, ref_obj_id));
/* Invalidate any dynptr clones */ for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) { if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id) continue;
/* it should always be the case that if the ref obj id * matches then the stack slot also belongs to a * dynptr
*/ if (state->stack[i].slot_type[0] != STACK_DYNPTR) {
verifier_bug(env, "misconfigured ref_obj_id"); return -EFAULT;
} if (state->stack[i].spilled_ptr.dynptr.first_slot)
invalidate_dynptr(env, state, i);
}
staticint destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi)
{ struct bpf_func_state *fstate; struct bpf_reg_state *dreg; int i, dynptr_id;
/* We always ensure that STACK_DYNPTR is never set partially, * hence just checking for slot_type[0] is enough. This is * different for STACK_SPILL, where it may be only set for * 1 byte, so code has to use is_spilled_reg.
*/ if (state->stack[spi].slot_type[0] != STACK_DYNPTR) return 0;
/* Reposition spi to first slot */ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
spi = spi + 1;
/* Writing partially to one dynptr stack slot destroys both. */ for (i = 0; i < BPF_REG_SIZE; i++) {
state->stack[spi].slot_type[i] = STACK_INVALID;
state->stack[spi - 1].slot_type[i] = STACK_INVALID;
}
dynptr_id = state->stack[spi].spilled_ptr.id; /* Invalidate any slices associated with this dynptr */
bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({ /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */ if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM) continue; if (dreg->dynptr_id == dynptr_id)
mark_reg_invalid(env, dreg);
}));
/* Do not release reference state, we are destroying dynptr on stack, * not using some helper to release it. Just reset register.
*/
__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
/* Same reason as unmark_stack_slots_dynptr above */
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
return 0;
}
/* Decide whether reg may be used as the destination of a dynptr
 * initialization, i.e. whether it points at stack space where a new dynptr
 * can be created.
 *
 * Returns false for CONST_PTR_TO_DYNPTR registers and whenever
 * dynptr_get_spi() fails with anything other than -ERANGE.
 */
static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
	int spi;

	if (reg->type == CONST_PTR_TO_DYNPTR)
		return false;

	spi = dynptr_get_spi(env, reg);

	/* -ERANGE (spi outside of the allocated stack slots) is not an error:
	 * it only means the stack state has not been updated yet, and a later
	 * check_mem_access will check and update the stack bounds.
	 *
	 * Slots already marked by an earlier dynptr initialization need no
	 * check here, because overwriting existing unreferenced STACK_DYNPTR
	 * slots is allowed: mark_stack_slots_dynptr calls
	 * destroy_if_dynptr_stack_slot so that dynptr objects at the touched
	 * slots are completely destructed before being reinitialized.
	 * Referenced ones are expected to be rejected early by
	 * destroy_if_dynptr_stack_slot rather than reported at the very end
	 * as an "Unreleased reference" error.
	 */
	return spi >= 0 || spi == -ERANGE;
}
staticbool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{ struct bpf_func_state *state = func(env, reg); int i, spi;
/* This already represents first slot of initialized bpf_dynptr. * * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to * check_func_arg_reg_off's logic, so we don't need to check its * offset and alignment.
*/ if (reg->type == CONST_PTR_TO_DYNPTR) returntrue;
spi = dynptr_get_spi(env, reg); if (spi < 0) returnfalse; if (!state->stack[spi].spilled_ptr.dynptr.first_slot) returnfalse;
for (i = 0; i < BPF_REG_SIZE; i++) { if (state->stack[spi].slot_type[i] != STACK_DYNPTR ||
state->stack[spi - 1].slot_type[i] != STACK_DYNPTR) returnfalse;
}
/* Check that the nr_slots stack slots addressed by reg can host a new
 * iterator, i.e. that no byte of any of them is already marked STACK_ITER.
 *
 * Returns true as well when iter_get_spi() reports -ERANGE (spi outside of
 * the allocated stack slots): check_mem_access will check and update the
 * stack bounds later. Any other negative spi yields false.
 */
static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
				     struct bpf_reg_state *reg, int nr_slots)
{
	struct bpf_func_state *state = func(env, reg);
	int spi, slot_no, byte;

	spi = iter_get_spi(env, reg, nr_slots);
	if (spi < 0)
		return spi == -ERANGE;

	/* slots are laid out downwards: spi, spi - 1, ... */
	for (slot_no = 0; slot_no < nr_slots; slot_no++) {
		struct bpf_stack_state *slot = &state->stack[spi - slot_no];

		for (byte = 0; byte < BPF_REG_SIZE; byte++) {
			if (slot->slot_type[byte] == STACK_ITER)
				return false;
		}
	}

	return true;
}
/* Check that reg points at an initialized iterator of type (btf, btf_id)
 * spanning nr_slots stack slots.
 *
 * Returns 0 when every slot qualifies, or:
 *   -EPROTO  if a slot's spilled register state carries PTR_UNTRUSTED;
 *   -EINVAL  if iter_get_spi() fails, if ref_obj_id is not set on exactly
 *            the first slot, if a slot records a different iterator type,
 *            or if any byte of a slot is not marked STACK_ITER.
 */
static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
				  struct btf *btf, u32 btf_id, int nr_slots)
{
	struct bpf_func_state *state = func(env, reg);
	int spi, i, j;

	/* Resolve the first slot before indexing the stack with it; using
	 * spi without this lookup would read an uninitialized variable.
	 */
	spi = iter_get_spi(env, reg, nr_slots);
	if (spi < 0)
		return -EINVAL;

	for (i = 0; i < nr_slots; i++) {
		struct bpf_stack_state *slot = &state->stack[spi - i];
		struct bpf_reg_state *st = &slot->spilled_ptr;

		if (st->type & PTR_UNTRUSTED)
			return -EPROTO;
		/* only main (first) slot has ref_obj_id set */
		if (i == 0 && !st->ref_obj_id)
			return -EINVAL;
		if (i != 0 && st->ref_obj_id)
			return -EINVAL;
		if (st->iter.btf != btf || st->iter.btf_id != btf_id)
			return -EINVAL;

		/* every byte of the slot must belong to the iterator */
		for (j = 0; j < BPF_REG_SIZE; j++)
			if (slot->slot_type[j] != STACK_ITER)
				return -EINVAL;
	}

	return 0;
}
staticint acquire_irq_state(struct bpf_verifier_env *env, int insn_idx); staticint release_irq_state(struct bpf_verifier_state *state, int id);
/* Turn the stack slot addressed by reg into an IRQ flag slot.
 *
 * A new IRQ state is acquired through acquire_irq_state() and its id is
 * stored as ref_obj_id of the slot's spilled register, together with
 * kfunc_class. All bytes of the slot are marked STACK_IRQ_FLAG and the slot
 * is marked as scratched. meta is not used by this function.
 *
 * Returns 0 on success, or the negative value returned by
 * irq_flag_get_spi() or acquire_irq_state().
 */
static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
				    struct bpf_kfunc_call_arg_meta *meta,
				    struct bpf_reg_state *reg, int insn_idx,
				    int kfunc_class)
{
	struct bpf_func_state *state = func(env, reg);
	struct bpf_stack_state *slot;
	struct bpf_reg_state *st;
	int spi, byte, id;

	spi = irq_flag_get_spi(env, reg);
	if (spi < 0)
		return spi;

	id = acquire_irq_state(env, insn_idx);
	if (id < 0)
		return id;

	slot = &state->stack[spi];
	st = &slot->spilled_ptr;

	__mark_reg_known_zero(st);
	/* we don't have dedicated reg type */
	st->type = PTR_TO_STACK;
	st->ref_obj_id = id;
	st->irq.kfunc_class = kfunc_class;
	st->live |= REG_LIVE_WRITTEN;

	for (byte = 0; byte < BPF_REG_SIZE; byte++)
		slot->slot_type[byte] = STACK_IRQ_FLAG;

	mark_stack_slot_scratched(env, spi);
	return 0;
}
staticint unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int kfunc_class)
{ struct bpf_func_state *state = func(env, reg); struct bpf_stack_state *slot; struct bpf_reg_state *st; int spi, i, err;
spi = irq_flag_get_spi(env, reg); if (spi < 0) return spi;
slot = &state->stack[spi];
st = &slot->spilled_ptr;
verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n",
flag_kfunc, used_kfunc); return -EINVAL;
}
err = release_irq_state(env->cur_state, st->ref_obj_id);
WARN_ON_ONCE(err && err != -EACCES); if (err) { int insn_idx = 0;
for (int i = 0; i < env->cur_state->acquired_refs; i++) { if (env->cur_state->refs[i].id == env->cur_state->active_irq_id) {
insn_idx = env->cur_state->refs[i].insn_idx; break;
}
}
verbose(env, "cannot restore irq state out of order, expected id=%d acquired at insn_idx=%d\n",
env->cur_state->active_irq_id, insn_idx); return err;
}
__mark_reg_not_init(env, st);
/* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
st->live |= REG_LIVE_WRITTEN;
for (i = 0; i < BPF_REG_SIZE; i++)
slot->slot_type[i] = STACK_INVALID;
/* For -ERANGE (i.e. spi not falling into allocated stack slots), we * will do check_mem_access to check and update stack bounds later, so * return true for that case.
*/
spi = irq_flag_get_spi(env, reg); if (spi == -ERANGE) returntrue; if (spi < 0) returnfalse;
slot = &state->stack[spi];
for (i = 0; i < BPF_REG_SIZE; i++) if (slot->slot_type[i] == STACK_IRQ_FLAG) returnfalse; returntrue;
}
spi = irq_flag_get_spi(env, reg); if (spi < 0) return -EINVAL;
slot = &state->stack[spi];
st = &slot->spilled_ptr;
if (!st->ref_obj_id) return -EINVAL;
for (i = 0; i < BPF_REG_SIZE; i++) if (slot->slot_type[i] != STACK_IRQ_FLAG) return -EINVAL; return 0;
}
/* Check if given stack slot is "special": * - spilled register state (STACK_SPILL); * - dynptr state (STACK_DYNPTR); * - iter state (STACK_ITER). * - irq flag state (STACK_IRQ_FLAG)
*/ staticbool is_stack_slot_special(conststruct bpf_stack_state *stack)
{ enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
switch (type) { case STACK_SPILL: case STACK_DYNPTR: case STACK_ITER: case STACK_IRQ_FLAG: returntrue; case STACK_INVALID: case STACK_MISC: case STACK_ZERO: returnfalse; default:
WARN_ONCE(1, "unknown stack slot type %d\n", type); returntrue;
}
}
/* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack.
*/ staticbool is_spilled_reg(conststruct bpf_stack_state *stack)
{ return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
}
/* Downgrade a single stack byte to STACK_MISC, with two exceptions that are
 * left untouched:
 *   - STACK_ZERO, so that the more precise STACK_ZERO is preserved;
 *   - STACK_INVALID, which is never promoted to STACK_MISC regardless of
 *     the allow_ptr_leaks setting (i.e., privileged or unprivileged mode).
 *     In the privileged case promotion is unnecessary, as both are
 *     considered equivalent when loading data and pruning; in unprivileged
 *     mode it would be incorrect to allow reads of invalid slots.
 *
 * env is not consulted here.
 */
static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
{
	switch (*stype) {
	case STACK_ZERO:
	case STACK_INVALID:
		break;
	default:
		*stype = STACK_MISC;
		break;
	}
}
/* copy array src of length n * size bytes to dst. dst is reallocated if it's too * small to hold src. This is different from krealloc since we don't want to preserve * the contents of dst. * * Leaves dst untouched if src is NULL or length is zero. Returns NULL if memory could * not be allocated.
*/ staticvoid *copy_array(void *dst, constvoid *src, size_t n, size_t size, gfp_t flags)
{
size_t alloc_bytes; void *orig = dst;
size_t bytes;
if (ZERO_OR_NULL_PTR(src)) goto out;
if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL;
/* resize an array from old_n items to new_n items. the array is reallocated if it's too * small to hold new_n items. new items are zeroed out if the array grows. * * Contrary to krealloc_array, does not free arr if new_n is zero.
*/ staticvoid *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)
{
size_t alloc_size; void *new_arr;
/* Resize state->refs to hold n reference entries via realloc_array().
 *
 * state->refs is overwritten with the result of realloc_array() in every
 * case. Returns 0 and records n in state->acquired_refs on success;
 * returns -ENOMEM (leaving acquired_refs unchanged) when the result is NULL.
 */
static int resize_reference_state(struct bpf_verifier_state *state, size_t n)
{
	void *resized;

	resized = realloc_array(state->refs, state->acquired_refs, n,
				sizeof(*state->refs));
	state->refs = resized;
	if (resized == NULL)
		return -ENOMEM;

	state->acquired_refs = n;
	return 0;
}
/* Make sure state->allocated_stack covers at least size bytes, and raise the
 * stack_depth recorded in env->subprog_info for the function's subprogram
 * when the new size exceeds it.
 *
 * Nothing is done when the stack already has enough slots. Returns 0 on
 * success and -ENOMEM when realloc_array() returns NULL.
 */
static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size)
{
	size_t have_slots = state->allocated_stack / BPF_REG_SIZE;
	size_t want_slots;

	/* The stack size is always a multiple of BPF_REG_SIZE. */
	size = round_up(size, BPF_REG_SIZE);
	want_slots = size / BPF_REG_SIZE;

	if (want_slots <= have_slots)
		return 0;

	state->stack = realloc_array(state->stack, have_slots, want_slots,
				     sizeof(*state->stack));
	if (state->stack == NULL)
		return -ENOMEM;

	state->allocated_stack = size;

	/* update known max for given subprogram */
	if (size > env->subprog_info[state->subprogno].stack_depth)
		env->subprog_info[state->subprogno].stack_depth = size;

	return 0;
}
/* Acquire a pointer id from the env and update the state->refs to include * this new pointer reference. * On success, returns a valid pointer id to associate with the register * On failure, returns a negative errno.
*/ staticstruct bpf_reference_state *acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
{ struct bpf_verifier_state *state = env->cur_state; int new_ofs = state->acquired_refs; int err;
s = acquire_reference_state(env, insn_idx); if (!s) return -ENOMEM;
s->type = REF_TYPE_IRQ;
s->id = ++env->id_gen;
state->active_irq_id = s->id; return s->id;
}
/* Drop entry idx from state->refs and shrink acquired_refs by one.
 *
 * The relative order of the remaining entries is kept: IRQ state relies on
 * the refs array behaving as a stack in order to detect out-of-order IRQ
 * restore. So the tail is shifted down with memmove instead of swapping the
 * final element into the freed position. The vacated last entry is zeroed.
 */
static void release_reference_state(struct bpf_verifier_state *state, int idx)
{
	int tail = state->acquired_refs - 1;
	size_t nr_after = state->acquired_refs - idx - 1;

	if (tail != 0 && idx != tail)
		memmove(&state->refs[idx], &state->refs[idx + 1],
			sizeof(*state->refs) * nr_after);

	memset(&state->refs[tail], 0, sizeof(*state->refs));
	state->acquired_refs--;
}
/* Return true if any of the state's acquired references has id ptr_id. */
static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id)
{
	int pos = 0;

	while (pos < state->acquired_refs) {
		if (state->refs[pos].id == ptr_id)
			return true;
		pos++;
	}

	return false;
}
staticint release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr)
{ void *prev_ptr = NULL;
u32 prev_id = 0; int i;
for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type == type && state->refs[i].id == id &&
state->refs[i].ptr == ptr) {
release_reference_state(state, i);
state->active_locks--; /* Reassign active lock (id, ptr). */
state->active_lock_id = prev_id;
state->active_lock_ptr = prev_ptr; return 0;
} if (state->refs[i].type & REF_TYPE_LOCK_MASK) {
prev_id = state->refs[i].id;
prev_ptr = state->refs[i].ptr;
}
} return -EINVAL;
}
staticint release_irq_state(struct bpf_verifier_state *state, int id)
{
u32 prev_id = 0; int i;
if (id != state->active_irq_id) return -EACCES;
for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_IRQ) continue; if (state->refs[i].id == id) {
release_reference_state(state, i);
state->active_irq_id = prev_id; return 0;
} else {
prev_id = state->refs[i].id;
}
} return -EINVAL;
}
staticstruct bpf_reference_state *find_lock_state(struct bpf_verifier_state *state, enum ref_state_type type, int id, void *ptr)
{ int i;
for (i = 0; i < state->acquired_refs; i++) { struct bpf_reference_state *s = &state->refs[i];
if (!(s->type & type)) continue;
if (s->id == id && s->ptr == ptr) return s;
} return NULL;
}
/* Release what a verifier state owns: every frame from 0 up to curframe
 * (the frame pointers are reset to NULL), the refs array and the jump
 * history. The state structure itself is freed only when free_self is set.
 */
static void free_verifier_state(struct bpf_verifier_state *state, bool free_self)
{
	int frame = 0;

	while (frame <= state->curframe) {
		free_func_state(state->frame[frame]);
		state->frame[frame] = NULL;
		frame++;
	}

	kfree(state->refs);
	clear_jmp_history(state);

	if (!free_self)
		return;

	kfree(state);
}
/* struct bpf_verifier_state->parent refers to states * that are in either of env->{expored_states,free_list}. * In both cases the state is contained in struct bpf_verifier_state_list.
*/ staticstruct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st)
{ if (st->parent) return container_of(st->parent, struct bpf_verifier_state_list, state); return NULL;
}
/* Free the list element sl if its state is no longer referenced, i.e. it:
 *   - is in the env->free_list;
 *   - has no children states (branches == 0);
 *   - does not have incomplete read marks.
 * When freed, the element is unlinked and env->free_list_size is decremented.
 */
static void maybe_free_verifier_state(struct bpf_verifier_env *env,
				      struct bpf_verifier_state_list *sl)
{
	if (!sl->in_free_list)
		return;
	if (sl->state.branches != 0)
		return;
	if (incomplete_read_marks(env, &sl->state))
		return;

	list_del(&sl->node);
	/* the state is embedded in sl, so it is freed together with sl below */
	free_verifier_state(&sl->state, false);
	kfree(sl);
	env->free_list_size--;
}
/* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack
*/ staticint copy_func_state(struct bpf_func_state *dst, conststruct bpf_func_state *src)
{
memcpy(dst, src, offsetof(struct bpf_func_state, stack)); return copy_stack_state(dst, src);
}
staticint copy_verifier_state(struct bpf_verifier_state *dst_state, conststruct bpf_verifier_state *src)
{ struct bpf_func_state *dst; int i, err;
/* Two states have the same callsites when they are at the same frame depth
 * and every frame, from the current one down to frame 0, records the same
 * callsite in both.
 */
static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b)
{
	int fr;

	if (a->curframe != b->curframe)
		return false;

	fr = a->curframe;
	while (fr >= 0) {
		if (a->frame[fr]->callsite != b->frame[fr]->callsite)
			return false;
		fr--;
	}

	return true;
}
/* Return IP for a given frame in a call stack: the state's own insn_idx for
 * the current frame, otherwise the callsite recorded in the next frame up.
 */
static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame)
{
	if (frame == st->curframe)
		return st->insn_idx;

	return st->frame[frame + 1]->callsite;
}
/* For state @st look for a topmost frame with frame_insn_idx() in some SCC,
 * if such frame exists form a corresponding @callchain as an array of
 * call sites leading to this frame and SCC id.
 * E.g.:
 *
 *    void foo()  { A: loop {... SCC#1 ...}; }
 *    void bar()  { B: loop { C: foo(); ... SCC#2 ... }
 *                  D: loop { E: foo(); ... SCC#3 ... } }
 *    void main() { F: bar(); }
 *
 * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending
 * on @st frame call sites being (F,C,A) or (F,E,A).
 *
 * @callchain is zeroed first. Returns true once a frame inside an SCC is
 * found, false when the current frame is reached without finding one.
 */
static bool compute_scc_callchain(struct bpf_verifier_env *env,
				  struct bpf_verifier_state *st,
				  struct bpf_scc_callchain *callchain)
{
	u32 frame, scc, insn_idx;

	memset(callchain, 0, sizeof(*callchain));

	for (frame = 0; frame <= st->curframe; frame++) {
		insn_idx = frame_insn_idx(st, frame);
		scc = env->insn_aux_data[insn_idx].scc;

		if (scc) {
			callchain->scc = scc;
			return true;
		}

		/* innermost frame reached and it is not in any SCC */
		if (frame >= st->curframe)
			return false;

		callchain->callsites[frame] = insn_idx;
	}

	return true;
}
if (!info) return NULL; for (i = 0; i < info->num_visits; i++) if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0) return &visits[i]; return NULL;
}
/* Allocate a new bpf_scc_visit instance corresponding to @callchain. * Allocated instances are alive for a duration of the do_check_common() * call and are freed by free_states().
*/ staticstruct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain)
{ struct bpf_scc_visit *visit; struct bpf_scc_info *info;
u32 scc, num_visits;
u64 new_sz;
/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */ staticchar *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain)
{ char *buf = env->tmp_str_buf; int i, delta = 0;
/* If callchain for @st exists (@st is in some SCC), ensure that
 * bpf_scc_visit instance for this callchain exists.
 * If instance does not exist or is empty, assign visit->entry_state to @st.
 *
 * Uses env->callchain_buf as scratch space. Returns 0 on success (also when
 * @st is not in any SCC) and -ENOMEM when no visit instance can be obtained.
 */
static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
	struct bpf_scc_callchain *callchain = &env->callchain_buf;
	struct bpf_scc_visit *visit;

	if (!compute_scc_callchain(env, st, callchain))
		return 0;

	visit = scc_visit_lookup(env, callchain);
	if (!visit)
		visit = scc_visit_alloc(env, callchain);
	if (!visit)
		return -ENOMEM;

	/* somebody already entered this SCC through this callchain */
	if (visit->entry_state)
		return 0;

	visit->entry_state = st;
	if (env->log.level & BPF_LOG_LEVEL2)
		verbose(env, "SCC enter %s\n", format_callchain(env, callchain));

	return 0;
}
/* If callchain for @st exists (@st is in some SCC), make it empty:
 * - set visit->entry_state to NULL;
 * - flush accumulated backedges.
 *
 * Nothing is done unless @st is the entry state recorded for the visit.
 * Returns 0, the result of propagate_backedges(), or -EFAULT when a
 * non-speculative state has no visit info for its callchain.
 */
static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
	struct bpf_scc_callchain *callchain = &env->callchain_buf;
	struct bpf_scc_visit *visit;

	if (!compute_scc_callchain(env, st, callchain))
		return 0;

	visit = scc_visit_lookup(env, callchain);
	if (!visit) {
		/*
		 * If path traversal stops inside an SCC, corresponding bpf_scc_visit
		 * must exist for non-speculative paths. For non-speculative paths
		 * traversal stops when:
		 * a. Verification error is found, maybe_exit_scc() is not called.
		 * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member
		 *    of any SCC.
		 * c. A checkpoint is reached and matched. Checkpoints are created by
		 *    is_state_visited(), which calls maybe_enter_scc(), which allocates
		 *    bpf_scc_visit instances for checkpoints within SCCs.
		 * (c) is the only case that can reach this point.
		 */
		if (st->speculative)
			return 0;

		verifier_bug(env, "scc exit: no visit info for call chain %s",
			     format_callchain(env, callchain));
		return -EFAULT;
	}

	if (visit->entry_state != st)
		return 0;

	if (env->log.level & BPF_LOG_LEVEL2)
		verbose(env, "SCC exit %s\n", format_callchain(env, callchain));

	visit->entry_state = NULL;
	/* the visit's backedges no longer count towards the global total */
	env->num_backedges -= visit->num_backedges;
	visit->num_backedges = 0;
	update_peak_states(env);

	return propagate_backedges(env, visit);
}
/* Look up the bpf_scc_visit instance corresponding to @st callchain
 * and push @backedge onto the front of visit->backedges. @st callchain
 * must exist.
 *
 * Both the per-visit and the env-wide backedge counters are incremented.
 * Returns 0 on success, or -EFAULT (after reporting a verifier bug) when
 * @st is not in any SCC or no visit info exists for its callchain.
 */
static int add_scc_backedge(struct bpf_verifier_env *env,
			    struct bpf_verifier_state *st,
			    struct bpf_scc_backedge *backedge)
{
	struct bpf_scc_callchain *callchain = &env->callchain_buf;
	struct bpf_scc_visit *visit;
	bool in_scc;

	in_scc = compute_scc_callchain(env, st, callchain);
	if (!in_scc) {
		verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d",
			     st->insn_idx);
		return -EFAULT;
	}

	visit = scc_visit_lookup(env, callchain);
	if (visit == NULL) {
		verifier_bug(env, "add backedge: no visit info for call chain %s",
			     format_callchain(env, callchain));
		return -EFAULT;
	}

	if (env->log.level & BPF_LOG_LEVEL2)
		verbose(env, "SCC backedge %s\n", format_callchain(env, callchain));

	/* link in at the head of the visit's backedge list */
	backedge->next = visit->backedges;
	visit->backedges = backedge;

	visit->num_backedges++;
	env->num_backedges++;
	update_peak_states(env);

	return 0;
}
/* bpf_reg_state->live marks for registers in a state @st are incomplete,
 * if state @st is in some SCC and not all execution paths starting at this
 * SCC are fully explored.
 *
 * In terms of the code: true only when @st has an SCC callchain, a visit
 * instance exists for it and that visit still has backedges queued.
 * env->callchain_buf is used as scratch space.
 */
static bool incomplete_read_marks(struct bpf_verifier_env *env,
				  struct bpf_verifier_state *st)
{
	struct bpf_scc_callchain *callchain = &env->callchain_buf;
	struct bpf_scc_visit *visit;

	if (!compute_scc_callchain(env, st, callchain))
		return false;

	visit = scc_visit_lookup(env, callchain);
	if (visit == NULL)
		return false;

	return visit->backedges != NULL;
}
/* verifier_bug_if(br > 1, ...) technically makes sense here, * but see comment in push_stack(), hence:
*/
verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br); if (br) break;
err = maybe_exit_scc(env, st); if (err) return err;
parent = st->parent;
parent_sl = state_parent_as_list(st); if (sl)
maybe_free_verifier_state(env, sl);
st = parent;
sl = parent_sl;
} return 0;
}
/* Pop the top element off the verifier's pending-state stack (env->head).
 *
 * @prev_insn_idx: optional out-parameter, receives the element's prev_insn_idx
 * @insn_idx:      optional out-parameter, receives the element's insn_idx
 * @pop_log:       when true, rewind the log to the position saved in the element
 *
 * If env->cur_state is set, the popped state is copied into it first.
 *
 * Returns 0 on success, -ENOENT if the stack is empty, or the error from
 * copy_verifier_state(); in the error cases the stack is left untouched.
 */
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
		     int *insn_idx, bool pop_log)
{
	struct bpf_verifier_stack_elem *top = env->head;
	struct bpf_verifier_state *cur = env->cur_state;
	struct bpf_verifier_stack_elem *below;

	if (!top)
		return -ENOENT;

	if (cur) {
		int err = copy_verifier_state(cur, &top->st);

		if (err)
			return err;
	}

	if (pop_log)
		bpf_vlog_reset(&env->log, top->log_pos);
	if (insn_idx)
		*insn_idx = top->insn_idx;
	if (prev_insn_idx)
		*prev_insn_idx = top->prev_insn_idx;

	/* Unlink and release the element. */
	below = top->next;
	free_verifier_state(&top->st, false);
	kfree(top);
	env->head = below;
	env->stack_size--;
	return 0;
}
/* Tell whether @err is an error that speculative verification may recover
 * from by inserting a nospec.
 *
 * Should only return true for non-fatal errors that are allowed to occur
 * during speculative verification. For these we can insert a nospec and the
 * program might still be accepted. Do not include something like ENOMEM
 * because it is likely to re-occur for the next architectural path once it
 * has been recovered-from in all speculative paths.
 *
 * @err: negative errno value
 *
 * Returns true for -EPERM, -EACCES and -EINVAL, false for anything else.
 */
static bool error_recoverable_with_nospec(int err)
{
	switch (err) {
	case -EPERM:
	case -EACCES:
	case -EINVAL:
		return true;
	default:
		return false;
	}
}
staticstruct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, bool speculative)
{ struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem; int err;
elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) return NULL;
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
err = copy_verifier_state(&elem->st, cur); if (err) return NULL;
elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
verbose(env, "The sequence of %d jumps is too complex.\n",
env->stack_size); return NULL;
} if (elem->st.parent) {
++elem->st.parent->branches; /* WARN_ON(branches > 2) technically makes sense here, * but * 1. speculative states will bump 'branches' for non-branch * instructions * 2. is_state_visited() heuristics may decide not to create * a new state for a sequence of branches and all such current * and cloned states will be pointing to a single parent state * which might have large 'branches' count.
*/
} return &elem->st;
}
/* Mark the unknown part of a register (variable offset or scalar value) as
 * known to have the value @imm.
 *
 * Every byte after the 'type' member up to (not including) 'var_off' is
 * zeroed, 'id' and 'ref_obj_id' are cleared, and @imm is handed on to
 * ___mark_reg_known(). The 'type' member itself is left as is.
 */
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
	const size_t type_sz = sizeof(reg->type);
	const size_t clear_len = offsetof(struct bpf_reg_state, var_off) - type_sz;
	u8 *clear_from = (u8 *)reg + type_sz;

	/* Clear off and union(map_ptr, range) */
	memset(clear_from, 0, clear_len);

	reg->id = 0;
	reg->ref_obj_id = 0;
	___mark_reg_known(reg, imm);
}
/* Mark the 'variable offset' part of a register as zero. This should be * used only on registers holding a pointer type.
*/ staticvoid __mark_reg_known_zero(struct bpf_reg_state *reg)
{
__mark_reg_known(reg, 0);
}
staticvoid __mark_reg_const_zero(conststruct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
__mark_reg_known(reg, 0);
reg->type = SCALAR_VALUE; /* all scalars are assumed imprecise initially (unless unprivileged, * in which case everything is forced to be precise)
*/
reg->precise = !env->bpf_capable;
}
/* Bounds-checked wrapper around __mark_reg_known_zero() for regs[regno].
 *
 * If @regno is out of range (>= MAX_BPF_REG) a warning is raised, the event
 * is logged, and every register in @regs is marked as not initialized.
 */
static void mark_reg_known_zero(struct bpf_verifier_env *env,
				struct bpf_reg_state *regs, u32 regno)
{
	u32 i;

	if (!WARN_ON(regno >= MAX_BPF_REG)) {
		__mark_reg_known_zero(&regs[regno]);
		return;
	}

	verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
	/* Something bad happened, let's kill all regs */
	for (i = 0; i < MAX_BPF_REG; i++)
		__mark_reg_not_init(env, &regs[i]);
}
/* Initialise @reg as a dynptr register.
 *
 * @type:       dynptr type stored in reg->dynptr.type
 * @first_slot: stored in reg->dynptr.first_slot
 * @dynptr_id:  stored in reg->id; gives each dynptr a unique id so that
 *              slices can be uniquely associated with it
 */
static void __mark_dynptr_reg(struct bpf_reg_state *reg,
			      enum bpf_dynptr_type type,
			      bool first_slot, int dynptr_id)
{
	__mark_reg_known_zero(reg);

	/* reg->type has no meaning for STACK_DYNPTR, but it does need to be
	 * CONST_PTR_TO_DYNPTR when the register is set up for callback
	 * arguments. It is ignored for STACK_DYNPTR anyway, so assign it
	 * unconditionally.
	 */
	reg->type = CONST_PTR_TO_DYNPTR;

	reg->id = dynptr_id;
	reg->dynptr.type = type;
	reg->dynptr.first_slot = first_slot;
}
/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ staticbool reg_is_init_pkt_pointer(conststruct bpf_reg_state *reg, enum bpf_reg_type which)
{ /* The register can already have a range from prior markings. * This is fine as long as it hasn't been advanced from its * origin.
*/ return reg->type == which &&
reg->id == 0 &&
reg->off == 0 &&
tnum_equals_const(reg->var_off, 0);
}
/* Reset the min/max bounds of a register */ staticvoid __mark_reg_unbounded(struct bpf_reg_state *reg)
{
reg->smin_value = S64_MIN;
reg->smax_value = S64_MAX;
reg->umin_value = 0;
reg->umax_value = U64_MAX;
/* Uses signed min/max values to inform unsigned, and vice-versa */ staticvoid __reg32_deduce_bounds(struct bpf_reg_state *reg)
{ /* If upper 32 bits of u64/s64 range don't change, we can use lower 32 * bits to improve our u32/s32 boundaries. * * E.g., the case where we have upper 32 bits as zero ([10, 20] in * u64) is pretty trivial, it's obvious that in u32 we'll also have * [10, 20] range. But this property holds for any 64-bit range as * long as upper 32 bits in that entire range of values stay the same. * * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311] * in decimal) has the same upper 32 bits throughout all the values in * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15]) * range. * * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32, * following the rules outlined below about u64/s64 correspondence * (which equally applies to u32 vs s32 correspondence). In general it * depends on actual hexadecimal values of 32-bit range. They can form * only valid u32, or only valid s32 ranges in some cases. * * So we use all these insights to derive bounds for subregisters here.
*/ if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) { /* u64 to u32 casting preserves validity of low 32 bits as * a range, if upper 32 bits are the same
*/
reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value);
reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value);
if ((s32)reg->umin_value <= (s32)reg->umax_value) {
reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
}
} if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) { /* low 32 bits should form a proper u32 range */ if ((u32)reg->smin_value <= (u32)reg->smax_value) {
reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value);
reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value);
} /* low 32 bits should form a proper s32 range */ if ((s32)reg->smin_value <= (s32)reg->smax_value) {
reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
}
} /* Special case where upper bits form a small sequence of two * sequential numbers (in 32-bit unsigned space, so 0xffffffff to * 0x00000000 is also valid), while lower bits form a proper s32 range * going from negative numbers to positive numbers. E.g., let's say we * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]). * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff, * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits, * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]). * Note that it doesn't have to be 0xffffffff going to 0x00000000 in * upper 32 bits. As a random example, s64 range * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister.
*/ if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) &&
(s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) {
reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
} if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) &&
(s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) {
reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
} /* if u32 range forms a valid s32 range (due to matching sign bit), * try to learn from that
*/ if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) {
reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value);
reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value);
} /* If we cannot cross the sign boundary, then signed and unsigned bounds * are the same, so combine. This works even in the negative case, e.g. * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
*/ if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value);
reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value);
}
}
/* Cross-tighten the 64-bit signed and unsigned bounds of @reg.
 *
 * Lay the 64-bit values out in unsigned order:
 *
 *   0 ............ 0x7fffffffffffffff | 0x8000000000000000 ...... U64_MAX
 *   0 ............ S64_MAX            | S64_MIN ................. -1
 *
 * A u64 range is contiguous anywhere on this line. An s64 range starts in
 * the right half (negative values), wraps from -1 to 0 and continues in the
 * left half. Consequently a range is a valid contiguous range in *both*
 * domains only when it lies entirely within one half, i.e. when all of its
 * values share the same sign bit:
 *
 *   - entirely in the left half: identical non-negative range in u64 and s64;
 *   - entirely in the right half: negative in s64, values >= 2^63 in u64.
 *
 * Any range that straddles the middle is contiguous in one domain but
 * splits into two segments in the other. reg_state can't represent two
 * segments per numeric domain, so in general nothing can be derived then.
 *
 * The "same half" condition is tested below by casting the bounds of one
 * domain to the other and checking that min <= max still holds.
 */
static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
{
	/* u64 range that is also a valid s64 range: learn s64 from it. */
	if ((s64)reg->umin_value <= (s64)reg->umax_value) {
		reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value);
		reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value);
	}

	if ((u64)reg->smin_value <= (u64)reg->smax_value) {
		/* If we cannot cross the sign boundary, then signed and
		 * unsigned bounds are the same, so combine. This works even
		 * in the negative case, e.g.
		 * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
		 */
		reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value);
		reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value);
	} else if (reg->umax_value < (u64)reg->smin_value) {
		/* The s64 range crosses the sign boundary, so in u64 order it
		 * is split into a low part [0, smax] and a high part
		 * [(u64)smin, U64_MAX]. Here the u64 range ends before the
		 * high part begins, so it can only overlap the non-negative
		 * low part: both ranges shrink to that overlap.
		 */
		reg->smin_value = (s64)reg->umin_value;
		reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value);
	} else if ((u64)reg->smax_value < reg->umin_value) {
		/* Mirror case: the u64 range starts after the low part ends,
		 * so it can only overlap the negative high part of the s64
		 * range.
		 */
		reg->smax_value = (s64)reg->umax_value;
		reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value);
	}
	/* Otherwise the u64 range overlaps both parts of the s64 range and
	 * nothing can be derived.
	 */
}
staticvoid __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
{ /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit * values on both sides of 64-bit range in hope to have tighter range. * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. * We just need to make sure that derived bounds we are intersecting * with are well-formed ranges in respective s64 or u64 domain, just * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
*/
__u64 new_umin, new_umax;
__s64 new_smin, new_smax;
/* Here we would like to handle a special case after sign extending load, * when upper bits for a 64-bit range are all 1s or all 0s. * * Upper bits are all 1s when register is in a range: * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff] * Upper bits are all 0s when register is in a range: * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff] * Together this forms are continuous range: * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff] * * Now, suppose that register range is in fact tighter: * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R) * Also suppose that it's 32-bit range is positive, * meaning that lower 32-bits of the full 64-bit register * are in the range: * [0x0000_0000, 0x7fff_ffff] (W) * * If this happens, then any value in a range: * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff] * is smaller than a lowest bound of the range (R): * 0xffff_ffff_8000_0000 * which means that upper bits of the full 64-bit register * can't be all 1s, when lower bits are in range (W). * * Note that: * - 0xffff_ffff_8000_0000 == (s64)S32_MIN * - 0x0000_0000_7fff_ffff == (s64)S32_MAX * These relations are used in the conditions below.
*/ if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) {
reg->smin_value = reg->s32_min_value;
reg->smax_value = reg->s32_max_value;
reg->umin_value = reg->s32_min_value;
reg->umax_value = reg->s32_max_value;
reg->var_off = tnum_intersect(reg->var_off,
tnum_range(reg->smin_value, reg->smax_value));
}
}
/* Bring the bounds and var_off of @reg back in sync with each other:
 * refresh bounds from var_off, run three back-to-back bounds-deduction
 * passes, refresh var_off from the bounds, then refresh the bounds once more.
 */
static void reg_bounds_sync(struct bpf_reg_state *reg)
{
	int pass;

	/* We might have learned new bounds from the var_off. */
	__update_reg_bounds(reg);

	/* We might have learned something about the sign bit. */
	for (pass = 0; pass < 3; pass++)
		__reg_deduce_bounds(reg);

	/* We might have learned some bits from the bounds. */
	__reg_bound_offset(reg);

	/* Intersecting with the old var_off might have improved our bounds
	 * slightly, e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
	 * then new var_off is (0; 0x7f...fc) which improves our umax.
	 */
	__update_reg_bounds(reg);
}
/* Attempt to pull 32-bit signed bounds into 64-bit bounds but must * be positive otherwise set to worse case bounds and refine later * from tnum.
*/ if (__reg32_bound_s64(reg->s32_min_value) &&
__reg32_bound_s64(reg->s32_max_value)) {
reg->smin_value = reg->s32_min_value;
reg->smax_value = reg->s32_max_value;
} else {
reg->smin_value = 0;
reg->smax_value = U32_MAX;
}
}
/* Mark a register as having a completely unknown (scalar) value. */ staticvoid __mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
{ /* * Clear type, off, and union(map_ptr, range) and * padding between 'type' and union
*/
memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
reg->type = SCALAR_VALUE;
reg->id = 0;
reg->ref_obj_id = 0;
reg->var_off = tnum_unknown;
reg->frameno = 0;
reg->precise = false;
__mark_reg_unbounded(reg);
}
/* Mark a register as having a completely unknown (scalar) value, * initialize .precise as true when not bpf capable.
*/ staticvoid __mark_reg_unknown(conststruct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
__mark_reg_unknown_imprecise(reg);
reg->precise = !env->bpf_capable;
}
/* Bounds-checked wrapper around __mark_reg_unknown() for regs[regno].
 *
 * If @regno is out of range (>= MAX_BPF_REG) a warning is raised, the event
 * is logged, and every register below BPF_REG_FP is marked as not
 * initialized.
 */
static void mark_reg_unknown(struct bpf_verifier_env *env,
			     struct bpf_reg_state *regs, u32 regno)
{
	u32 i;

	if (!WARN_ON(regno >= MAX_BPF_REG)) {
		__mark_reg_unknown(env, &regs[regno]);
		return;
	}

	verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
	/* Something bad happened, let's kill all regs except FP */
	for (i = 0; i < BPF_REG_FP; i++)
		__mark_reg_not_init(env, &regs[i]);
}
/* Similar to push_stack(), but for async callbacks */ staticstruct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, int subprog, bool is_sleepable)
{ struct bpf_verifier_stack_elem *elem; struct bpf_func_state *frame;
elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) return NULL;
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
verbose(env, "The sequence of %d jumps is too complex for async cb.\n",
env->stack_size); return NULL;
} /* Unlike push_stack() do not copy_verifier_state(). * The caller state doesn't matter. * This is async callback. It starts in a fresh stack. * Initialize it similar to do_check_common().
*/
elem->st.branches = 1;
elem->st.in_sleepable = is_sleepable;
frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT); if (!frame) return NULL;
init_func_state(env, frame,
BPF_MAIN_FUNC /* callsite */,
0 /* frameno within this callchain */,
subprog /* subprog number within this prog */);
elem->st.frame[0] = frame; return &elem->st;
}
/* How an instruction uses a register operand. */
enum reg_arg_type {
	/* register is used as source operand */
	SRC_OP = 0,
	/* register is used as destination operand */
	DST_OP = 1,
	/* same as DST_OP, check only, don't mark */
	DST_OP_NO_MARK = 2,
};
/* Find subprogram that contains instruction at 'off' */ staticstruct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off)
{ struct bpf_subprog_info *vals = env->subprog_info; int l, r, m;
if (off >= env->prog->len || off < 0 || env->subprog_cnt == 0) return NULL;
l = 0;
r = env->subprog_cnt - 1; while (l < r) {
m = l + (r - l + 1) / 2; if (vals[m].start <= off)
l = m; else
r = m - 1;
} return &vals[l];
}
/* Find subprogram that starts exactly at 'off'.
 *
 * Returns the index of that subprogram within env->subprog_info, or -ENOENT
 * when no subprogram starts at @off.
 */
static int find_subprog(struct bpf_verifier_env *env, int off)
{
	struct bpf_subprog_info *hit = find_containing_subprog(env, off);

	if (hit && hit->start == off)
		return hit - env->subprog_info;

	return -ENOENT;
}
/* Register a subprogram starting at instruction @off.
 *
 * Returns:
 *   -EINVAL  when @off lies outside the program;
 *   -E2BIG   when BPF_MAX_SUBPROGS entries are already registered;
 *   the existing index when a subprogram already starts at @off;
 *   otherwise subprog_cnt - 1 after the new entry was appended and the
 *   table re-sorted. Note that this is the last index of the table, which
 *   is not necessarily the slot that ends up holding @off after sorting.
 */
static int add_subprog(struct bpf_verifier_env *env, int off)
{
	int nr_insns = env->prog->len;
	int idx;

	if (off < 0 || off >= nr_insns) {
		verbose(env, "call to invalid destination\n");
		return -EINVAL;
	}

	idx = find_subprog(env, off);
	if (idx >= 0)
		return idx;

	if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
		verbose(env, "too many subprograms\n");
		return -E2BIG;
	}

	/* determine subprog starts. The end is one before the next starts */
	env->subprog_info[env->subprog_cnt].start = off;
	env->subprog_cnt++;
	sort(env->subprog_info, env->subprog_cnt,
	     sizeof(env->subprog_info[0]), cmp_subprogs, NULL);

	return env->subprog_cnt - 1;
}
t = btf_type_by_id(btf, main_btf_id); if (!t) {
verbose(env, "invalid btf id for main subprog in func_info\n"); return -EINVAL;
}
name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:"); if (IS_ERR(name)) {
ret = PTR_ERR(name); /* If there is no tag present, there is no exception callback */ if (ret == -ENOENT)
ret = 0; elseif (ret == -EEXIST)
verbose(env, "multiple exception callback tags for main subprog\n"); return ret;
}
ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC); if (ret < 0) {
verbose(env, "exception callback '%s' could not be found in BTF\n", name); return ret;
}
id = ret;
t = btf_type_by_id(btf, id); if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) {
verbose(env, "exception callback '%s' must have global linkage\n", name); return -EINVAL;
}
ret = 0; for (i = 0; i < aux->func_info_cnt; i++) { if (aux->func_info[i].type_id != id) continue;
ret = aux->func_info[i].insn_off; /* Further func_info and subprog checks will also happen * later, so assume this is the right insn_off for now.
*/ if (!ret) {
verbose(env, "invalid exception callback insn_off in func_info: 0\n");
ret = -EINVAL;
}
} if (!ret) {
verbose(env, "exception callback type id not found in func_info\n");
ret = -EINVAL;
} return ret;
}
/* Fixed-capacity table of kfunc descriptors. */
struct bpf_kfunc_desc_tab {
	/* Sorted by func_id (BTF ID) and offset (fd_array offset) during
	 * verification. JITs do lookups by bpf_insn, where func_id may not be
	 * available, therefore at the end of verification do_misc_fixups()
	 * sorts this by imm and offset.
	 */
	struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
	/* Number of entries of descs[] currently in use. */
	u32 nr_descs;
};
/* sort() reorders entries by value, so b may no longer point * to the right entry after this
*/
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
kfunc_btf_cmp_by_off, NULL);
} else {
btf = b->btf;
}
return btf;
}
/* Release a kfunc BTF table.
 *
 * Drops the module and BTF references held by every used descriptor, from
 * the last one down to the first, and then frees the table itself.
 * Passing NULL is a no-op.
 */
void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
{
	if (tab) {
		while (tab->nr_descs--) {
			module_put(tab->descs[tab->nr_descs].module);
			btf_put(tab->descs[tab->nr_descs].btf);
		}
		kfree(tab);
	}
}
staticstruct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset)
{ if (offset) { if (offset < 0) { /* In the future, this can be allowed to increase limit * of fd index into fd_array, interpreted as u16.
*/
verbose(env, "negative offset disallowed for kernel module function call\n"); return ERR_PTR(-EINVAL);
}
prog_aux = env->prog->aux;
tab = prog_aux->kfunc_tab;
btf_tab = prog_aux->kfunc_btf_tab; if (!tab) { if (!btf_vmlinux) {
verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n"); return -ENOTSUPP;
}
if (!env->prog->jit_requested) {
verbose(env, "JIT is required for calling kernel function\n"); return -ENOTSUPP;
}
if (!bpf_jit_supports_kfunc_call()) {
verbose(env, "JIT does not support calling kernel function\n"); return -ENOTSUPP;
}
if (!env->prog->gpl_compatible) {
verbose(env, "cannot call kernel function from non-GPL compatible program\n"); return -EINVAL;
}
/* func_id == 0 is always invalid, but instead of returning an error, be * conservative and wait until the code elimination pass before returning * error, so that invalid calls that get pruned out can be in BPF programs * loaded from userspace. It is also required that offset be untouched * for such calls.
*/ if (!func_id && !offset) return 0;
if (!btf_tab && offset) {
btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL_ACCOUNT); if (!btf_tab) return -ENOMEM;
prog_aux->kfunc_btf_tab = btf_tab;
}
desc_btf = find_kfunc_desc_btf(env, offset); if (IS_ERR(desc_btf)) {
verbose(env, "failed to find BTF for kernel function\n"); return PTR_ERR(desc_btf);
}
if (find_kfunc_desc(env->prog, func_id, offset)) return 0;
if (tab->nr_descs == MAX_KFUNC_DESCS) {
verbose(env, "too many different kernel function calls\n"); return -E2BIG;
}
func = btf_type_by_id(desc_btf, func_id); if (!func || !btf_type_is_func(func)) {
verbose(env, "kernel btf_id %u is not a function\n",
func_id); return -EINVAL;
}
func_proto = btf_type_by_id(desc_btf, func->type); if (!func_proto || !btf_type_is_func_proto(func_proto)) {
verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
func_id); return -EINVAL;
}
func_name = btf_name_by_offset(desc_btf, func->name_off);
addr = kallsyms_lookup_name(func_name); if (!addr) {
verbose(env, "cannot find address for kernel function %s\n",
func_name); return -EINVAL;
}
specialize_kfunc(env, func_id, offset, &addr);
if (bpf_jit_supports_far_kfunc_call()) {
call_imm = func_id;
} else {
call_imm = BPF_CALL_IMM(addr); /* Check whether the relative offset overflows desc->imm */ if ((unsignedlong)(s32)call_imm != call_imm) {
verbose(env, "address of kernel function %s is out of range\n",
func_name); return -EINVAL;
}
}
if (bpf_dev_bound_kfunc_id(func_id)) {
err = bpf_dev_bound_kfunc_check(&env->log, prog_aux); if (err) return err;
}
tab = prog->aux->kfunc_tab;
res = bsearch(&desc, tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
return res ? &res->func_model : NULL;
}
/* Register every kfunc call found in the @cnt instructions starting at @insn.
 *
 * Each instruction for which bpf_pseudo_kfunc_call() is true is passed to
 * add_kfunc_call() with its imm and off fields.
 *
 * Returns 0 on success, or the first negative value returned by
 * add_kfunc_call(); scanning stops at that instruction.
 */
static int add_kfunc_in_insns(struct bpf_verifier_env *env,
			      struct bpf_insn *insn, int cnt)
{
	int idx;

	for (idx = 0; idx < cnt; idx++) {
		struct bpf_insn *cur = &insn[idx];
		int err;

		if (!bpf_pseudo_kfunc_call(cur))
			continue;

		err = add_kfunc_call(env, cur->imm, cur->off);
		if (err < 0)
			return err;
	}
	return 0;
}
/* Add entry function. */
ret = add_subprog(env, 0); if (ret) return ret;
for (i = 0; i < insn_cnt; i++, insn++) { if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) &&
!bpf_pseudo_kfunc_call(insn)) continue;
if (!env->bpf_capable) {
verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM;
}
if (bpf_pseudo_func(insn) || bpf_pseudo_call(insn))
ret = add_subprog(env, i + insn->imm + 1); else
ret = add_kfunc_call(env, insn->imm, insn->off);
if (ret < 0) return ret;
}
ret = bpf_find_exception_callback_insn_off(env); if (ret < 0) return ret;
ex_cb_insn = ret;
/* If ex_cb_insn > 0, this means that the main program has a subprog * marked using BTF decl tag to serve as the exception callback.
*/ if (ex_cb_insn) {
ret = add_subprog(env, ex_cb_insn); if (ret < 0) return ret; for (i = 1; i < env->subprog_cnt; i++) { if (env->subprog_info[i].start != ex_cb_insn) continue;
env->exception_callback_subprog = i;
mark_subprog_exc_cb(env, i); break;
}
}
/* Add a fake 'exit' subprog which could simplify subprog iteration * logic. 'subprog_cnt' should not be increased.
*/
subprog[env->subprog_cnt].start = insn_cnt;
if (env->log.level & BPF_LOG_LEVEL2) for (i = 0; i < env->subprog_cnt; i++)
verbose(env, "func#%d @%d\n", i, subprog[i].start);
staticint check_subprogs(struct bpf_verifier_env *env)
{ int i, subprog_start, subprog_end, off, cur_subprog = 0; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len;
/* now check that all jumps are within the same subprog */
subprog_start = subprog[cur_subprog].start;
subprog_end = subprog[cur_subprog + 1].start; for (i = 0; i < insn_cnt; i++) {
u8 code = insn[i].code;
if (code == (BPF_JMP | BPF_CALL) &&
insn[i].src_reg == 0 &&
insn[i].imm == BPF_FUNC_tail_call) {
subprog[cur_subprog].has_tail_call = true;
subprog[cur_subprog].tail_call_reachable = true;
} if (BPF_CLASS(code) == BPF_LD &&
(BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
subprog[cur_subprog].has_ld_abs = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next;
off = i + jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL;
}
next: if (i == subprog_end - 1) { /* to avoid fall-through from one subprog into another * the last insn of the subprog should be either exit * or unconditional jump back or bpf_throw call
*/ if (code != (BPF_JMP | BPF_EXIT) &&
code != (BPF_JMP32 | BPF_JA) &&
code != (BPF_JMP | BPF_JA)) {
verbose(env, "last insn is not an exit or jmp\n"); return -EINVAL;
}
subprog_start = subprog_end;
cur_subprog++; if (cur_subprog < env->subprog_cnt)
subprog_end = subprog[cur_subprog + 1].start;
}
} return 0;
}
/* Propagate a read mark up the parentage chain of a register or stack slot.
 *
 * Parentage chain of this register (or stack slot) should take care of all
 * issues like callee-saved registers, stack slot allocation time, etc.
 *
 * @state:  the register state being read
 * @parent: first ancestor to mark
 * @flag:   read mark to set on each ancestor (compared against
 *          REG_LIVE_READ64 below)
 *
 * The length of the longest walk is recorded in env->longest_mark_read_walk.
 *
 * Returns 0 on success, -EFAULT if an ancestor already carrying
 * REG_LIVE_DONE is reached.
 */
static int mark_reg_read(struct bpf_verifier_env *env,
			 const struct bpf_reg_state *state,
			 struct bpf_reg_state *parent, u8 flag)
{
	/* Observe write marks only when @parent really is @state's parent. */
	bool observe_writes = (parent == state->parent);
	int depth = 0;

	while (parent) {
		/* if read wasn't screened by an earlier write ... */
		if (observe_writes && (state->live & REG_LIVE_WRITTEN))
			break;

		if (verifier_bug_if(parent->live & REG_LIVE_DONE, env,
				    "type %s var_off %lld off %d",
				    reg_type_str(env, parent->type),
				    parent->var_off.value, parent->off))
			return -EFAULT;

		/* The parentage chain never changes and this parent was
		 * already marked as LIVE_READ. There is no need to keep
		 * walking the chain again and keep re-marking all parents as
		 * LIVE_READ. This case happens when the same register is read
		 * multiple times without writes into it in-between.
		 * Also, if parent has the stronger REG_LIVE_READ64 set, then
		 * no need to set the weak REG_LIVE_READ32.
		 *
		 * The first condition is more likely to be true than the
		 * second, so it is checked first.
		 */
		if ((parent->live & REG_LIVE_READ) == flag ||
		    (parent->live & REG_LIVE_READ64))
			break;

		/* ... then we depend on parent's value */
		parent->live |= flag;
		/* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
		if (flag == REG_LIVE_READ64)
			parent->live &= ~REG_LIVE_READ32;

		/* Step one level up the chain. */
		state = parent;
		parent = state->parent;
		observe_writes = true;
		depth++;
	}

	if (depth > env->longest_mark_read_walk)
		env->longest_mark_read_walk = depth;

	return 0;
}
staticint mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi, int nr_slots)
{ struct bpf_func_state *state = func(env, reg); int err, i;
for (i = 0; i < nr_slots; i++) { struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64); if (err) return err;
/* Mark the stack slots backing the dynptr referenced by @reg as read.
 *
 * Returns 0 on success, the negative value reported by dynptr_get_spi() when
 * no slot index can be resolved, or the result of
 * mark_stack_slot_obj_read().
 */
static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
	int first_slot;

	/* For CONST_PTR_TO_DYNPTR, it must have already been done by
	 * check_reg_arg in check_helper_call and mark_btf_func_reg_size in
	 * check_kfunc_call.
	 */
	if (reg->type == CONST_PTR_TO_DYNPTR)
		return 0;

	first_slot = dynptr_get_spi(env, reg);
	if (first_slot < 0)
		return first_slot;

	/* Caller ensures dynptr is valid and initialized, which means the slot
	 * index is in bounds and refers to the first dynptr slot. Simply mark
	 * the stack slots as read.
	 */
	return mark_stack_slot_obj_read(env, reg, first_slot,
					BPF_DYNPTR_NR_SLOTS);
}
/* Thin wrapper: mark @nr_slots stack slots, starting at slot index @spi, as
 * read by delegating to mark_stack_slot_obj_read(). Its result is returned
 * unchanged.
 */
static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
			  int spi, int nr_slots)
{
	int err = mark_stack_slot_obj_read(env, reg, spi, nr_slots);

	return err;
}
staticint mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{ int spi;
/* This function is supposed to be used by the following 32-bit optimization * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE.
*/ staticbool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
{
u8 code, class, op;
code = insn->code; class = BPF_CLASS(code);
op = BPF_OP(code); if (class == BPF_JMP) { /* BPF_EXIT for "main" will reach here. Return TRUE * conservatively.
*/ if (op == BPF_EXIT) returntrue; if (op == BPF_CALL) { /* BPF to BPF call will reach here because of marking * caller saved clobber with DST_OP_NO_MARK for which we * don't care the register def because they are anyway * marked as NOT_INIT already.
*/ if (insn->src_reg == BPF_PSEUDO_CALL) returnfalse; /* Helper call will reach here because of arg type * check, conservatively return TRUE.
*/ if (t == SRC_OP) returntrue;
returnfalse;
}
}
if (class == BPF_ALU64 && op == BPF_END && (insn->imm == 16 || insn->imm == 32)) returnfalse;
if (class == BPF_ALU64 || class == BPF_JMP ||
(class == BPF_ALU && op == BPF_END && insn->imm == 64)) returntrue;
if (class == BPF_ALU || class == BPF_JMP32) returnfalse;
if (class == BPF_LDX) { if (t != SRC_OP) return BPF_SIZE(code) == BPF_DW || BPF_MODE(code) == BPF_MEMSX; /* LDX source must be ptr. */ returntrue;
}
if (class == BPF_STX) { /* BPF_STX (including atomic variants) has one or more source * operands, one of which is a ptr. Check whether the caller is * asking about it.
*/ if (t == SRC_OP && reg->type != SCALAR_VALUE) returntrue; return BPF_SIZE(code) == BPF_DW;
}
if (class == BPF_LD) {
u8 mode = BPF_MODE(code);
/* LD_IMM64 */ if (mode == BPF_IMM) returntrue;
/* Both LD_IND and LD_ABS return 32-bit data. */ if (t != SRC_OP) returnfalse;
/* Implicit ctx ptr. */ if (regno == BPF_REG_6) returntrue;
/* Explicit source could be any width. */ returntrue;
}
if (class == BPF_ST) /* The only source register for BPF_ST is a ptr. */ returntrue;
/* Conservatively return true at default. */ returntrue;
}
/* Return the regno defined by the insn, or -1. */ staticint insn_def_regno(conststruct bpf_insn *insn)
{ switch (BPF_CLASS(insn->code)) { case BPF_JMP: case BPF_JMP32: case BPF_ST: return -1; case BPF_STX: if (BPF_MODE(insn->code) == BPF_ATOMIC ||
BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) { if (insn->imm == BPF_CMPXCHG) return BPF_REG_0; elseif (insn->imm == BPF_LOAD_ACQ) return insn->dst_reg; elseif (insn->imm & BPF_FETCH) return insn->src_reg;
} return -1; default: return insn->dst_reg;
}
}
/* Return TRUE if INSN has defined any 32-bit value explicitly. */ staticbool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
{ int dst_reg = insn_def_regno(insn);
env->insn_aux_data[def_idx - 1].zext_dst = true; /* The dst will be zero extended, so won't be sub-register anymore. */
reg->subreg_def = DEF_NOT_SUBREG;
}
if (regno >= MAX_BPF_REG) {
verbose(env, "R%d is invalid\n", regno); return -EINVAL;
}
mark_reg_scratched(env, regno);
reg = ®s[regno];
rw64 = is_reg64(env, insn, regno, reg, t); if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (reg->type == NOT_INIT) {
verbose(env, "R%d !read_ok\n", regno); return -EACCES;
} /* We don't need to worry about FP liveness because it's read-only */ if (regno == BPF_REG_FP) return 0;
if (rw64)
mark_insn_zext(env, reg);
return mark_reg_read(env, reg, reg->parent,
rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
} else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) {
verbose(env, "frame pointer is read only\n"); return -EACCES;
}
reg->live |= REG_LIVE_WRITTEN;
reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP)
mark_reg_unknown(env, regs, regno);
} return 0;
}
/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track * number of elements currently in stack. * Pack one history entry for linked registers as 10 bits in the following format: * - 3-bits frameno * - 6-bits spi_or_reg * - 1-bit is_reg
*/ static u64 linked_regs_pack(struct linked_regs *s)
{
u64 val = 0; int i;
for (i = 0; i < s->cnt; ++i) { struct linked_reg *e = &s->entries[i];
u64 tmp = 0;
/* for any branch, call, exit record the history of jmps in the given state */ staticint push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, int insn_flags, u64 linked_regs)
{
u32 cnt = cur->jmp_history_cnt; struct bpf_jmp_history_entry *p;
size_t alloc_size;
/* combine instruction flags if we already recorded this instruction */ if (env->cur_hist_ent) { /* atomic instructions push insn_flags twice, for READ and * WRITE sides, but they should agree on stack slot
*/
verifier_bug_if((env->cur_hist_ent->flags & insn_flags) &&
(env->cur_hist_ent->flags & insn_flags) != insn_flags,
env, "insn history: insn_idx %d cur flags %x new flags %x",
env->insn_idx, env->cur_hist_ent->flags, insn_flags);
env->cur_hist_ent->flags |= insn_flags;
verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env, "insn history: insn_idx %d linked_regs: %#llx",
env->insn_idx, env->cur_hist_ent->linked_regs);
env->cur_hist_ent->linked_regs = linked_regs; return 0;
}
cnt++;
alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT); if (!p) return -ENOMEM;
cur->jmp_history = p;
/* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. * Return -ENOENT if we exhausted all instructions within given state. * * It's legal to have a bit of a looping with the same starting and ending * insn index within the same state, e.g.: 3->4->5->3, so just because current * instruction index is the same as state's first_idx doesn't mean we are * done. If there is still some jump history left, we should keep going. We * need to take into account that we might have a jump history between given * state's parent and itself, due to checkpointing. In this case, we'll have * history entry recording a jump from last instruction of parent state and * first instruction of given state.
*/ staticint get_prev_insn_idx(struct bpf_verifier_state *st, int i,
u32 *history)
{
u32 cnt = *history;
if (i == st->first_insn_idx) { if (cnt == 0) return -ENOENT; if (cnt == 1 && st->jmp_history[0].idx == i) return -ENOENT;
}
/* Format a registers bitmask, e.g., "r0,r2,r4" for 0x15 mask.
 *
 * @buf:      destination buffer; set to an empty string when no bit is set
 * @buf_sz:   space available in @buf
 * @reg_mask: one bit per register, bit N standing for rN
 *
 * Formatting stops as soon as the remaining space goes negative.
 */
static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
{
	DECLARE_BITMAP(mask, 64);
	const char *sep = "";
	int bit, len;

	buf[0] = '\0';

	bitmap_from_u64(mask, reg_mask);
	for_each_set_bit(bit, mask, 32) {
		len = snprintf(buf, buf_sz, "%sr%d", sep, bit);
		/* every entry after the first is comma-separated */
		sep = ",";
		buf += len;
		buf_sz -= len;
		if (buf_sz < 0)
			break;
	}
}

/* Format a stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask.
 *
 * @buf:        destination buffer; set to an empty string when no bit is set
 * @buf_sz:     space available in @buf
 * @stack_mask: one bit per stack slot, bit N printed as -(N + 1) * 8
 *
 * Formatting stops as soon as the remaining space goes negative.
 */
static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
{
	DECLARE_BITMAP(mask, 64);
	const char *sep = "";
	int bit, len;

	buf[0] = '\0';

	bitmap_from_u64(mask, stack_mask);
	for_each_set_bit(bit, mask, 64) {
		len = snprintf(buf, buf_sz, "%s%d", sep, -(bit + 1) * 8);
		/* every entry after the first is comma-separated */
		sep = ",";
		buf += len;
		buf_sz -= len;
		if (buf_sz < 0)
			break;
	}
}
/* If any register R in hist->linked_regs is marked as precise in bt, * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs.
*/ staticvoid bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist)
{ struct linked_regs linked_regs; bool some_precise = false; int i;
if (!hist || hist->linked_regs == 0) return;
linked_regs_unpack(hist->linked_regs, &linked_regs); for (i = 0; i < linked_regs.cnt; ++i) { struct linked_reg *e = &linked_regs.entries[i];
staticbool calls_callback(struct bpf_verifier_env *env, int insn_idx);
/* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. * * @idx is an index of the instruction we are currently processing; * @subseq_idx is an index of the subsequent instruction that: * - *would be* executed next, if jump history is viewed in forward order; * - *was* processed previously during backtracking.
*/ staticint backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
{ struct bpf_insn *insn = env->prog->insnsi + idx;
u8 class = BPF_CLASS(insn->code);
u8 opcode = BPF_OP(insn->code);
u8 mode = BPF_MODE(insn->code);
u32 dreg = insn->dst_reg;
u32 sreg = insn->src_reg;
u32 spi, i, fr;
/* If there is a history record that some registers gained range at this insn, * propagate precision marks to those registers, so that bt_is_reg_set() * accounts for these registers.
*/
bt_sync_linked_regs(bt, hist);
if (class == BPF_ALU || class == BPF_ALU64) { if (!bt_is_reg_set(bt, dreg)) return 0; if (opcode == BPF_END || opcode == BPF_NEG) { /* sreg is reserved and unused * dreg still need precision before this insn
*/ return 0;
} elseif (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { /* dreg = sreg or dreg = (s8, s16, s32)sreg * dreg needs precision after this insn * sreg needs precision before this insn
*/
bt_clear_reg(bt, dreg); if (sreg != BPF_REG_FP)
bt_set_reg(bt, sreg);
} else { /* dreg = K * dreg needs precision after this insn. * Corresponding register is already marked * as precise=true in this verifier state. * No further markings in parent are necessary
*/
bt_clear_reg(bt, dreg);
}
} else { if (BPF_SRC(insn->code) == BPF_X) { /* dreg += sreg * both dreg and sreg need precision * before this insn
*/ if (sreg != BPF_REG_FP)
bt_set_reg(bt, sreg);
} /* else dreg += K * dreg still needs precision before this insn
*/
}
} elseif (class == BPF_LDX || is_atomic_load_insn(insn)) { if (!bt_is_reg_set(bt, dreg)) return 0;
bt_clear_reg(bt, dreg);
/* scalars can only be spilled into stack w/o losing precision. * Load from any other memory can be zero extended. * The desire to keep that precision is already indicated * by 'precise' mark in corresponding register of this state. * No further tracking necessary.
*/ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; /* dreg = *(u64 *)[fp - off] was a fill from the stack. * that [fp - off] slot contains scalar that needs to be * tracked with precision
*/
spi = insn_stack_access_spi(hist->flags);
fr = insn_stack_access_frameno(hist->flags);
bt_set_frame_slot(bt, fr, spi);
} elseif (class == BPF_STX || class == BPF_ST) { if (bt_is_reg_set(bt, dreg)) /* stx & st shouldn't be using _scalar_ dst_reg * to access memory. It means backtracking * encountered a case of pointer subtraction.
*/ return -ENOTSUPP; /* scalars can only be spilled into stack */ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0;
spi = insn_stack_access_spi(hist->flags);
fr = insn_stack_access_frameno(hist->flags); if (!bt_is_frame_slot_set(bt, fr, spi)) return 0;
bt_clear_frame_slot(bt, fr, spi); if (class == BPF_STX)
bt_set_reg(bt, sreg);
} elseif (class == BPF_JMP || class == BPF_JMP32) { if (bpf_pseudo_call(insn)) { int subprog_insn_idx, subprog;
if (subprog_is_global(env, subprog)) { /* check that jump history doesn't have any * extra instructions from subprog; the next * instruction after call to global subprog * should be literally next instruction in * caller program
*/
verifier_bug_if(idx + 1 != subseq_idx, env, "extra insn from subprog"); /* r1-r5 are invalidated after subprog call, * so for global func call it shouldn't be set * anymore
*/ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
verifier_bug(env, "global subprog unexpected regs %x",
bt_reg_mask(bt)); return -EFAULT;
} /* global subprog always sets R0 */
bt_clear_reg(bt, BPF_REG_0); return 0;
} else { /* static subprog call instruction, which * means that we are exiting current subprog, * so only r1-r5 could be still requested as * precise, r0 and r6-r10 or any stack slot in * the current frame should be zero by now
*/ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
verifier_bug(env, "static subprog unexpected regs %x",
bt_reg_mask(bt)); return -EFAULT;
} /* we are now tracking register spills correctly, * so any instance of leftover slots is a bug
*/ if (bt_stack_mask(bt) != 0) {
verifier_bug(env, "static subprog leftover stack slots %llx",
bt_stack_mask(bt)); return -EFAULT;
} /* propagate r1-r5 to the caller */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) { if (bt_is_reg_set(bt, i)) {
bt_clear_reg(bt, i);
bt_set_frame_reg(bt, bt->frame - 1, i);
}
} if (bt_subprog_exit(bt)) return -EFAULT; return 0;
}
} elseif (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) { /* exit from callback subprog to callback-calling helper or * kfunc call. Use idx/subseq_idx check to discern it from * straight line code backtracking. * Unlike the subprog call handling above, we shouldn't * propagate precision of r1-r5 (if any requested), as they are * not actually arguments passed directly to callback subprogs
*/ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
verifier_bug(env, "callback unexpected regs %x",
bt_reg_mask(bt)); return -EFAULT;
} if (bt_stack_mask(bt) != 0) {
verifier_bug(env, "callback leftover stack slots %llx",
bt_stack_mask(bt)); return -EFAULT;
} /* clear r1-r5 in callback subprog's mask */ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
bt_clear_reg(bt, i); if (bt_subprog_exit(bt)) return -EFAULT; return 0;
} elseif (opcode == BPF_CALL) { /* kfunc with imm==0 is invalid and fixup_kfunc_call will * catch this error later. Make backtracking conservative * with ENOTSUPP.
*/ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0) return -ENOTSUPP; /* regular helper call sets R0 */
bt_clear_reg(bt, BPF_REG_0); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { /* if backtracking was looking for registers R1-R5 * they should have been found already.
*/
verifier_bug(env, "backtracking call unexpected regs %x",
bt_reg_mask(bt)); return -EFAULT;
}
} elseif (opcode == BPF_EXIT) { bool r0_precise;
/* Backtracking to a nested function call, 'idx' is a part of * the inner frame 'subseq_idx' is a part of the outer frame. * In case of a regular function call, instructions giving * precision to registers R1-R5 should have been found already. * In case of a callback, it is ok to have R1-R5 marked for * backtracking, as these registers are set by the function * invoking callback.
*/ if (subseq_idx >= 0 && calls_callback(env, subseq_idx)) for (i = BPF_REG_1; i <= BPF_REG_5; i++)
bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
verifier_bug(env, "backtracking exit unexpected regs %x",
bt_reg_mask(bt)); return -EFAULT;
}
/* BPF_EXIT in subprog or callback always returns * right after the call instruction, so by checking * whether the instruction at subseq_idx-1 is subprog * call or not we can distinguish actual exit from * *subprog* from exit from *callback*. In the former * case, we need to propagate r0 precision, if * necessary. In the former we never do that.
*/
r0_precise = subseq_idx - 1 >= 0 &&
bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) &&
bt_is_reg_set(bt, BPF_REG_0);
bt_clear_reg(bt, BPF_REG_0); if (bt_subprog_enter(bt)) return -EFAULT;
if (r0_precise)
bt_set_reg(bt, BPF_REG_0); /* r6-r9 and stack slots will stay set in caller frame * bitmasks until we return back from callee(s)
*/ return 0;
} elseif (BPF_SRC(insn->code) == BPF_X) { if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg)) return 0; /* dreg <cond> sreg * Both dreg and sreg need precision before * this insn. If only sreg was marked precise * before it would be equally necessary to * propagate it to dreg.
*/ if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK))
bt_set_reg(bt, sreg); if (!hist || !(hist->flags & INSN_F_DST_REG_STACK))
bt_set_reg(bt, dreg);
} elseif (BPF_SRC(insn->code) == BPF_K) { /* dreg <cond> K * Only dreg still needs precision before * this insn, so for the K-based conditional * there is nothing new to be marked.
*/
}
} elseif (class == BPF_LD) { if (!bt_is_reg_set(bt, dreg)) return 0;
bt_clear_reg(bt, dreg); /* It's ld_imm64 or ld_abs or ld_ind. * For ld_imm64 no further tracking of precision * into parent is necessary
*/ if (mode == BPF_IND || mode == BPF_ABS) /* to be analyzed */ return -ENOTSUPP;
} /* Propagate precision marks to linked registers, to account for * registers marked as precise in this function.
*/
bt_sync_linked_regs(bt, hist); return 0;
}
/* the scalar precision tracking algorithm: * . at the start all registers have precise=false. * . scalar ranges are tracked as normal through alu and jmp insns. * . once precise value of the scalar register is used in: * . ptr + scalar alu * . if (scalar cond K|scalar) * . helper_call(.., scalar, ...) where ARG_CONST is expected * backtrack through the verifier states and mark all registers and * stack slots with spilled constants that these scalar registers * should be precise. * . during state pruning two registers (or spilled stack slots) * are equivalent if both are not precise. * * Note the verifier cannot simply walk register parentage chain, * since many different registers and stack slots could have been * used to compute single precise scalar. * * The approach of starting with precise=true for all registers and then * backtrack to mark a register as not precise when the verifier detects * that program doesn't care about specific value (e.g., when helper * takes register as ARG_ANYTHING parameter) is not safe. * * It's ok to walk single parentage chain of the verifier states. * It's possible that this backtracking will go all the way till 1st insn. * All other branches will be explored for needing precision later. * * The backtracking needs to deal with cases like: * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0) * r9 -= r8 * r5 = r9 * if r5 > 0x79f goto pc+7 * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff)) * r5 += 1 * ... * call bpf_perf_event_output#25 * where .arg5_type = ARG_CONST_SIZE_OR_ZERO * * and this case: * r6 = 1 * call foo // uses callee's r6 inside to compute r0 * r0 += r6 * if r0 == 0 goto * * to track above reg_mask/stack_mask needs to be independent for each frame. * * Also if parent's curframe > frame where backtracking started, * the verifier need to mark registers in both frames, otherwise callees * may incorrectly prune callers. 
This is similar to * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") * * For now backtracking falls back into conservative marking.
*/ staticvoid mark_all_scalars_precise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{ struct bpf_func_state *func; struct bpf_reg_state *reg; int i, j;
if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n",
st->curframe);
}
/* big hammer: mark all scalars precise in this path. * pop_stack may still get !precise scalars. * We also skip current state and go straight to first parent state, * because precision markings in current non-checkpointed state are * not needed. See why in the comment in __mark_chain_precision below.
*/ for (st = st->parent; st; st = st->parent) { for (i = 0; i <= st->curframe; i++) {
func = st->frame[i]; for (j = 0; j < BPF_REG_FP; j++) {
reg = &func->regs[j]; if (reg->type != SCALAR_VALUE || reg->precise) continue;
reg->precise = true; if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "force_precise: frame%d: forcing r%d to be precise\n",
i, j);
}
} for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { if (!is_spilled_reg(&func->stack[j])) continue;
reg = &func->stack[j].spilled_ptr; if (reg->type != SCALAR_VALUE || reg->precise) continue;
reg->precise = true; if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n",
i, -(j + 1) * 8);
}
}
}
}
}
/* Clear the precise mark of every SCALAR_VALUE register and of every spilled
 * scalar stack slot in all frames of @st. Unlike mark_all_scalars_precise(),
 * only @st itself is touched, not its parent states.
 */
static void mark_all_scalars_imprecise(struct bpf_verifier_env *env,
				       struct bpf_verifier_state *st)
{
	struct bpf_func_state *func;
	struct bpf_reg_state *reg;
	int frame, idx, nr_slots;

	for (frame = 0; frame <= st->curframe; frame++) {
		func = st->frame[frame];

		/* registers below the frame pointer */
		for (idx = 0; idx < BPF_REG_FP; idx++) {
			reg = &func->regs[idx];
			if (reg->type == SCALAR_VALUE)
				reg->precise = false;
		}

		/* spilled registers on the stack */
		nr_slots = func->allocated_stack / BPF_REG_SIZE;
		for (idx = 0; idx < nr_slots; idx++) {
			if (!is_spilled_reg(&func->stack[idx]))
				continue;
			reg = &func->stack[idx].spilled_ptr;
			if (reg->type == SCALAR_VALUE)
				reg->precise = false;
		}
	}
}
/*
 * __mark_chain_precision() backtracks BPF program instruction sequence and
 * chain of verifier states making sure that register *regno* (if regno >= 0)
 * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
 * SCALARS, as well as any other registers and slots that contribute to
 * a tracked state of given registers/stack slots, depending on specific BPF
 * assembly instructions (see backtrack_insn() for exact instruction handling
 * logic). This backtracking relies on recorded jmp_history and is able to
 * traverse entire chain of parent states. This process ends only when all the
 * necessary registers/slots and their transitive dependencies are marked as
 * precise.
 *
 * One important and subtle aspect is that precise marks *do not matter* in
 * the currently verified state (current state). It is important to understand
 * why this is the case.
 *
 * First, note that current state is the state that is not yet "checkpointed",
 * i.e., it is not yet put into env->explored_states, and it has no children
 * states as well. It's ephemeral, and can end up either a) being discarded if
 * compatible explored state is found at some point or BPF_EXIT instruction is
 * reached or b) checkpointed and put into env->explored_states, branching out
 * into one or more children states.
 *
 * In the former case, precise markings in current state are completely
 * ignored by state comparison code (see regsafe() for details). Only
 * checkpointed ("old") state precise markings are important, and if old
 * state's register/slot is precise, regsafe() assumes current state's
 * register/slot as precise and checks value ranges exactly and precisely. If
 * states turn out to be compatible, current state's necessary precise
 * markings and any required parent states' precise markings are enforced
 * after the fact with propagate_precision() logic.
But it's * important to realize that in this case, even after marking current state * registers/slots as precise, we immediately discard current state. So what * actually matters is any of the precise markings propagated into current * state's parent states, which are always checkpointed (due to b) case above). * As such, for scenario a) it doesn't matter if current state has precise * markings set or not. * * Now, for the scenario b), checkpointing and forking into child(ren) * state(s). Note that before current state gets to checkpointing step, any * processed instruction always assumes precise SCALAR register/slot * knowledge: if precise value or range is useful to prune jump branch, BPF * verifier takes this opportunity enthusiastically. Similarly, when * register's value is used to calculate offset or memory address, exact * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to * what we mentioned above about state comparison ignoring precise markings * during state comparison, BPF verifier ignores and also assumes precise * markings *at will* during instruction verification process. But as verifier * assumes precision, it also propagates any precision dependencies across * parent states, which are not yet finalized, so can be further restricted * based on new knowledge gained from restrictions enforced by their children * states. This is so that once those parent states are finalized, i.e., when * they have no more active children state, state comparison logic in * is_state_visited() would enforce strict and precise SCALAR ranges, if * required for correctness. * * To build a bit more intuition, note also that once a state is checkpointed, * the path we took to get to that state is not important. This is crucial * property for state pruning. 
 * When state is checkpointed and finalized at
 * some instruction index, it can be correctly and safely used to "short
 * circuit" any *compatible* state that reaches exactly the same instruction
 * index. I.e., if we jumped to that instruction from a completely different
 * code path than original finalized state was derived from, it doesn't
 * matter, current state can be discarded because from that instruction
 * forward having a compatible state will ensure we will safely reach the
 * exit. States describe preconditions for further exploration, but completely
 * forget the history of how we got here.
 *
 * This also means that even if we needed precise SCALAR range to get to
 * finalized state, but from that point forward *that same* SCALAR register is
 * never used in a precise context (i.e., its precise value is not needed for
 * correctness), it's correct and safe to mark such register as "imprecise"
 * (i.e., precise marking set to false). This is what we rely on when we do
 * not set precise marking in current state. If no child state requires
 * precision for any given SCALAR register, it's safe to dictate that it can
 * be imprecise. If any child state does require this register to be precise,
 * we'll mark it precise later retroactively during precise markings
 * propagation from child state to parent states.
 *
 * Skipping precise marking setting in current state is a mild version of
 * relying on the above observation. But we can utilize this property even
 * more aggressively by proactively forgetting any precise marking in the
 * current state (which we inherited from the parent state), right before we
 * checkpoint it and branch off into new child state. This is done by
 * mark_all_scalars_imprecise() to hopefully get more permissive and generic
 * finalized states which help in short circuiting more future states.
*/ staticint __mark_chain_precision(struct bpf_verifier_env *env, struct bpf_verifier_state *starting_state, int regno, bool *changed)
{ struct bpf_verifier_state *st = starting_state; struct backtrack_state *bt = &env->bt; int first_idx = st->first_insn_idx; int last_idx = starting_state->insn_idx; int subseq_idx = -1; struct bpf_func_state *func; bool tmp, skip_first = true; struct bpf_reg_state *reg; int i, fr, err;
if (!env->bpf_capable) return 0;
changed = changed ?: &tmp; /* set frame number from which we are starting to backtrack */
bt_init(bt, starting_state->curframe);
/* Do sanity checks against current state of register and/or stack * slot, but don't set precise flag in current state, as precision * tracking in the current state is unnecessary.
*/
func = st->frame[bt->frame]; if (regno >= 0) {
reg = &func->regs[regno]; if (reg->type != SCALAR_VALUE) {
verifier_bug(env, "backtracking misuse"); return -EFAULT;
}
bt_set_reg(bt, regno);
}
if (bt_empty(bt)) return 0;
for (;;) {
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt; struct bpf_jmp_history_entry *hist;
if (last_idx < 0) { /* we are at the entry into subprog, which * is expected for global funcs, but only if * requested precise registers are R1-R5 * (which are global func's input arguments)
*/ if (st->curframe == 0 &&
st->frame[0]->subprogno > 0 &&
st->frame[0]->callsite == BPF_MAIN_FUNC &&
bt_stack_mask(bt) == 0 &&
(bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) {
bitmap_from_u64(mask, bt_reg_mask(bt));
for_each_set_bit(i, mask, 32) {
reg = &st->frame[0]->regs[i];
bt_clear_reg(bt, i); if (reg->type == SCALAR_VALUE) {
reg->precise = true;
*changed = true;
}
} return 0;
}
for (i = last_idx;;) { if (skip_first) {
err = 0;
skip_first = false;
} else {
hist = get_jmp_hist_entry(st, history, i);
err = backtrack_insn(env, i, subseq_idx, hist, bt);
} if (err == -ENOTSUPP) {
mark_all_scalars_precise(env, starting_state);
bt_reset(bt); return 0;
} elseif (err) { return err;
} if (bt_empty(bt)) /* Found assignment(s) into tracked register in this state. * Since this state is already marked, just return. * Nothing to be tracked further in the parent state.
*/ return 0;
subseq_idx = i;
i = get_prev_insn_idx(st, i, &history); if (i == -ENOENT) break; if (i >= env->prog->len) { /* This can happen if backtracking reached insn 0 * and there are still reg_mask or stack_mask * to backtrack. * It means the backtracking missed the spot where * particular register was initialized with a constant.
*/
verifier_bug(env, "backtracking idx %d", i); return -EFAULT;
}
}
st = st->parent; if (!st) break;
/* if we still have requested precise regs or slots, we missed * something (e.g., stack access through non-r10 register), so * fallback to marking all precise
*/ if (!bt_empty(bt)) {
mark_all_scalars_precise(env, starting_state);
bt_reset(bt);
}
return 0;
}
/* Request precision marking for register 'regno', starting the walk from
 * the verifier's current state (env->cur_state).
 *
 * Returns whatever __mark_chain_precision() returns.
 */
int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
	struct bpf_verifier_state *cur = env->cur_state;

	return __mark_chain_precision(env, cur, regno, NULL);
}
/* mark_chain_precision_batch() expects the caller to have already filled
 * env->bt with the desired reg and stack masks across all relevant frames.
 * No single register is named, so -1 is passed as the register number.
 *
 * Returns whatever __mark_chain_precision() returns.
 */
static int mark_chain_precision_batch(struct bpf_verifier_env *env,
				      struct bpf_verifier_state *starting_state)
{
	const int no_regno = -1;

	return __mark_chain_precision(env, starting_state, no_regno, NULL);
}
staticbool is_spillable_regtype(enum bpf_reg_type type)
{ switch (base_type(type)) { case PTR_TO_MAP_VALUE: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: case PTR_TO_SOCK_COMMON: case PTR_TO_TCP_SOCK: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: case PTR_TO_BUF: case PTR_TO_MEM: case PTR_TO_FUNC: case PTR_TO_MAP_KEY: case PTR_TO_ARENA: returntrue; default: returnfalse;
}
}
/* Does this register contain a constant zero? */ staticbool register_is_null(struct bpf_reg_state *reg)
{ return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
}
/* Check whether 'reg' is a scalar holding a constant. With 'subreg32' set,
 * only the tnum_subreg() part of var_off has to be constant; otherwise the
 * whole var_off is examined.
 */
static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32)
{
	if (reg->type != SCALAR_VALUE)
		return false;

	if (subreg32)
		return tnum_is_const(tnum_subreg(reg->var_off));

	return tnum_is_const(reg->var_off);
}
/* Return the constant held by 'reg'; the caller is expected to have
 * established is_reg_const() first. With 'subreg32' set the value is taken
 * from tnum_subreg(var_off), otherwise from var_off itself.
 */
static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32)
{
	if (!subreg32)
		return reg->var_off.value;

	return tnum_subreg(reg->var_off).value;
}
staticbool __is_pointer_value(bool allow_ptr_leaks, conststruct bpf_reg_state *reg)
{ if (allow_ptr_leaks) returnfalse;
if (src_reg->id & BPF_ADD_CONST) { /* * The verifier is processing rX = rY insn and * rY->id has special linked register already. * Cleared it, since multiple rX += const are not supported.
*/
src_reg->id = 0;
src_reg->off = 0;
}
if (!src_reg->id && !tnum_is_const(src_reg->var_off)) /* Ensure that src_reg has a valid ID that will be copied to * dst_reg and then will be used by sync_linked_regs() to * propagate min/max range.
*/
src_reg->id = ++env->id_gen;
}
/* Copy src state preserving dst->parent and dst->live fields */ staticvoid copy_register_state(struct bpf_reg_state *dst, conststruct bpf_reg_state *src)
{ struct bpf_reg_state *parent = dst->parent; enum bpf_reg_liveness live = dst->live;
/* See comment for mark_fastcall_pattern_for_call().
 *
 * Invoked for a stack access at offset 'off' made by instruction 'insn_idx'
 * within the frame 'state'. Nothing happens when the offset is at or above
 * the subprogram's fastcall_stack_off, or when the instruction is itself
 * flagged as part of a fastcall pattern.
 */
static void check_fastcall_stack_contract(struct bpf_verifier_env *env,
					  struct bpf_func_state *state,
					  int insn_idx, int off)
{
	struct bpf_subprog_info *sp = &env->subprog_info[state->subprogno];
	struct bpf_insn_aux_data *insn_aux = env->insn_aux_data;
	int idx;

	if (off >= sp->fastcall_stack_off)
		return;
	if (insn_aux[insn_idx].fastcall_pattern)
		return;

	/* The region [max_stack_depth .. fastcall_stack_off) was touched by
	 * something outside the fastcall pattern: turn fastcall rewrites off
	 * for this subprogram by moving fastcall_stack_off below any possible
	 * offset.
	 */
	sp->fastcall_stack_off = S16_MIN;

	/* Clear the fastcall aux flags of every instruction in the
	 * subprogram; this happens at most once per subprogram.
	 */
	for (idx = sp->start; idx < sp[1].start; idx++) {
		insn_aux[idx].fastcall_spills_num = 0;
		insn_aux[idx].fastcall_pattern = 0;
	}
}
/* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access()
*/ staticint check_stack_write_fixed_off(struct bpf_verifier_env *env, /* stack frame we're writing to */ struct bpf_func_state *state, int off, int size, int value_regno, int insn_idx)
{ struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; struct bpf_reg_state *reg = NULL; int insn_flags = insn_stack_access_flags(state->frameno, spi);
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits
*/ if (!env->allow_ptr_leaks &&
is_spilled_reg(&state->stack[spi]) &&
!is_spilled_scalar_reg(&state->stack[spi]) &&
size != BPF_REG_SIZE) {
verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES;
}
cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0)
reg = &cur->regs[value_regno]; if (!env->bypass_spec_v4) { bool sanitize = reg && is_spillable_regtype(reg->type);
for (i = 0; i < size; i++) {
u8 type = state->stack[spi].slot_type[i];
if (type != STACK_MISC && type != STACK_ZERO) {
sanitize = true; break;
}
}
if (sanitize)
env->insn_aux_data[insn_idx].nospec_result = true;
}
err = destroy_if_dynptr_stack_slot(env, state, spi); if (err) return err;
reg_value_fits = get_reg_width(reg) <= BITS_PER_BYTE * size; /* Make sure that reg had an ID to build a relation on spill. */ if (reg_value_fits)
assign_scalar_id_before_mov(env, reg);
save_register_state(env, state, spi, reg, size); /* Break the relation on a narrowing spill. */ if (!reg_value_fits)
state->stack[spi].spilled_ptr.id = 0;
} elseif (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
env->bpf_capable) { struct bpf_reg_state *tmp_reg = &env->fake_reg[0];
memset(tmp_reg, 0, sizeof(*tmp_reg));
__mark_reg_known(tmp_reg, insn->imm);
tmp_reg->type = SCALAR_VALUE;
save_register_state(env, state, spi, tmp_reg, size);
} elseif (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "invalid size of register spill\n"); return -EACCES;
} if (state != cur && reg->type == PTR_TO_STACK) {
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL;
}
save_register_state(env, state, spi, reg, size);
} else {
u8 type = STACK_MISC;
/* regular write of data into stack destroys any spilled ptr */
state->stack[spi].spilled_ptr.type = NOT_INIT; /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */ if (is_stack_slot_special(&state->stack[spi])) for (i = 0; i < BPF_REG_SIZE; i++)
scrub_spilled_slot(&state->stack[spi].slot_type[i]);
/* only mark the slot as written if all 8 bytes were written * otherwise read propagation may incorrectly stop too soon * when stack slots are partially written. * This heuristic means that read propagation will be * conservative, since it will add reg_live_read marks * to stack slots all the way to first state when programs * writes+reads less than 8 bytes
*/ if (size == BPF_REG_SIZE)
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
/* when we zero initialize stack slots mark them as such */ if ((reg && register_is_null(reg)) ||
(!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { /* STACK_ZERO case happened because register spill * wasn't properly aligned at the stack slot boundary, * so it's not a register spill anymore; force * originating register to be precise to make * STACK_ZERO correct for subsequent states
*/
err = mark_chain_precision(env, value_regno); if (err) return err;
type = STACK_ZERO;
}
/* Mark slots affected by this stack write. */ for (i = 0; i < size; i++)
state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type;
insn_flags = 0; /* not a register spill */
}
if (insn_flags) return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0;
}
/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
 * known to contain a variable offset.
 * This function checks whether the write is permitted and conservatively
 * tracks the effects of the write, considering that each stack slot in the
 * dynamic range is potentially written to.
 *
 * 'off' includes 'ptr_regno->off'.
 * 'value_regno' can be -1, meaning that an unknown value is being written to
 * the stack.
 *
 * Spilled pointers in range are not marked as written because we don't know
 * what's going to be actually written. This means that read propagation for
 * future reads cannot be terminated by this write.
 *
 * For privileged programs, uninitialized stack slots are considered
 * initialized by this write (even though we don't know exactly what offsets
 * are going to be written to). The idea is that we don't want the verifier to
 * reject future reads that access slots written to through variable offsets.
*/ staticint check_stack_write_var_off(struct bpf_verifier_env *env, /* func where register points to */ struct bpf_func_state *state, int ptr_regno, int off, int size, int value_regno, int insn_idx)
{ struct bpf_func_state *cur; /* state of the current function */ int min_off, max_off; int i, err; struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; bool writing_zero = false; /* set if the fact that we're writing a zero is used to let any * stack slots remain STACK_ZERO
*/ bool zero_used = false;
cur = env->cur_state->frame[env->cur_state->curframe];
ptr_reg = &cur->regs[ptr_regno];
min_off = ptr_reg->smin_value + off;
max_off = ptr_reg->smax_value + off + size; if (value_regno >= 0)
value_reg = &cur->regs[value_regno]; if ((value_reg && register_is_null(value_reg)) ||
(!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
writing_zero = true;
check_fastcall_stack_contract(env, state, insn_idx, min_off); /* Variable offset writes destroy any spilled pointers in range. */ for (i = min_off; i < max_off; i++) {
u8 new_type, *stype; int slot, spi;
if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) { /* Reject the write if range we may write to has not * been initialized beforehand. If we didn't reject * here, the ptr status would be erased below (even * though not all slots are actually overwritten), * possibly opening the door to leaks. * * We do however catch STACK_INVALID case below, and * only allow reading possibly uninitialized memory * later for CAP_PERFMON, as the write may not happen to * that slot.
*/
verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
insn_idx, i); return -EINVAL;
}
/* If writing_zero and the spi slot contains a spill of value 0, * maintain the spill type.
*/ if (writing_zero && *stype == STACK_SPILL &&
is_spilled_scalar_reg(&state->stack[spi])) { struct bpf_reg_state *spill_reg = &state->stack[spi].spilled_ptr;
/* Erase all other spilled pointers. */
state->stack[spi].spilled_ptr.type = NOT_INIT;
/* Update the slot type. */
new_type = STACK_MISC; if (writing_zero && *stype == STACK_ZERO) {
new_type = STACK_ZERO;
zero_used = true;
} /* If the slot is STACK_INVALID, we check whether it's OK to * pretend that it will be initialized by this write. The slot * might not actually be written to, and so if we mark it as * initialized future reads might leak uninitialized memory. * For privileged programs, we will accept such reads to slots * that may or may not be written because, if we're reject * them, the error would be too confusing.
*/ if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
insn_idx, i); return -EINVAL;
}
*stype = new_type;
} if (zero_used) { /* backtracking doesn't work for STACK_ZERO yet. */
err = mark_chain_precision(env, value_regno); if (err) return err;
} return 0;
}
/* When register 'dst_regno' is assigned some values from stack[min_off, * max_off), we set the register's type according to the types of the * respective stack slots. If all the stack values are known to be zeros, then * so is the destination reg. Otherwise, the register is considered to be * SCALAR. This function does not deal with register filling; the caller must * ensure that all spilled registers in the stack range have been marked as * read.
*/ staticvoid mark_reg_stack_read(struct bpf_verifier_env *env, /* func where src register points to */ struct bpf_func_state *ptr_state, int min_off, int max_off, int dst_regno)
{ struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot, spi;
u8 *stype; int zeros = 0;
for (i = min_off; i < max_off; i++) {
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
mark_stack_slot_scratched(env, spi);
stype = ptr_state->stack[spi].slot_type; if (stype[slot % BPF_REG_SIZE] != STACK_ZERO) break;
zeros++;
} if (zeros == max_off - min_off) { /* Any access_size read into register is zero extended, * so the whole register == const_zero.
*/
__mark_reg_const_zero(env, &state->regs[dst_regno]);
} else { /* have read misc data from the stack */
mark_reg_unknown(env, state->regs, dst_regno);
}
state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
}
/* Read the stack at 'off' and put the results into the register indicated by * 'dst_regno'. It handles reg filling if the addressed stack slot is a * spilled reg. * * 'dst_regno' can be -1, meaning that the read value is not going to a * register. * * The access is assumed to be within the current stack bounds.
*/ staticint check_stack_read_fixed_off(struct bpf_verifier_env *env, /* func where src register points to */ struct bpf_func_state *reg_state, int off, int size, int dst_regno)
{ struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; struct bpf_reg_state *reg;
u8 *stype, type; int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
if (is_spilled_reg(®_state->stack[spi])) {
u8 spill_size = 1;
for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--)
spill_size++;
if (size != BPF_REG_SIZE || spill_size != BPF_REG_SIZE) { if (reg->type != SCALAR_VALUE) {
verbose_linfo(env, env->insn_idx, "; ");
verbose(env, "invalid size of register fill\n"); return -EACCES;
}
mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno < 0) return 0;
if (size <= spill_size &&
bpf_stack_narrow_access_ok(off, size, spill_size)) { /* The earlier check_reg_arg() has decided the * subreg_def for this insn. Save it first.
*/
s32 subreg_def = state->regs[dst_regno].subreg_def;
/* Break the relation on a narrowing fill. * coerce_reg_to_size will adjust the boundaries.
*/ if (get_reg_width(reg) > size * BITS_PER_BYTE)
state->regs[dst_regno].id = 0;
} else { int spill_cnt = 0, zero_cnt = 0;
for (i = 0; i < size; i++) {
type = stype[(slot - i) % BPF_REG_SIZE]; if (type == STACK_SPILL) {
spill_cnt++; continue;
} if (type == STACK_MISC) continue; if (type == STACK_ZERO) {
zero_cnt++; continue;
} if (type == STACK_INVALID && env->allow_uninit_stack) continue;
verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size); return -EACCES;
}
if (spill_cnt == size &&
tnum_is_const(reg->var_off) && reg->var_off.value == 0) {
__mark_reg_const_zero(env, &state->regs[dst_regno]); /* this IS register fill, so keep insn_flags */
} elseif (zero_cnt == size) { /* similarly to mark_reg_stack_read(), preserve zeroes */
__mark_reg_const_zero(env, &state->regs[dst_regno]);
insn_flags = 0; /* not restoring original register state */
} else {
mark_reg_unknown(env, state->regs, dst_regno);
insn_flags = 0; /* not restoring original register state */
}
}
state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
} elseif (dst_regno >= 0) { /* restore register state from stack */
copy_register_state(&state->regs[dst_regno], reg); /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions
*/
state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
} elseif (__is_pointer_value(env->allow_ptr_leaks, reg)) { /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a SCALAR_VALUE * (e.g. for XADD). * We must not allow unprivileged callers to do that * with spilled pointers.
*/
verbose(env, "leaking pointer from stack off %d\n",
off); return -EACCES;
}
mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
} else { for (i = 0; i < size; i++) {
type = stype[(slot - i) % BPF_REG_SIZE]; if (type == STACK_MISC) continue; if (type == STACK_ZERO) continue; if (type == STACK_INVALID && env->allow_uninit_stack) continue;
verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size); return -EACCES;
}
mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno >= 0)
mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
insn_flags = 0; /* we are not restoring spilled register */
} if (insn_flags) return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0;
}
/* Identifies who performs a memory access that is being checked. */
enum bpf_access_src {
	/* the access is performed by an instruction */
	ACCESS_DIRECT = 1,
	/* the access is performed by a helper */
	ACCESS_HELPER = 2,
};
staticint check_stack_range_initialized(struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta);
/* Read the stack at 'ptr_regno + off' and put the result into the register * 'dst_regno'. * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'), * but not its variable offset. * 'size' is assumed to be <= reg size and the access is assumed to be aligned. * * As opposed to check_stack_read_fixed_off, this function doesn't deal with * filling registers (i.e. reads of spilled register cannot be detected when * the offset is not fixed). We conservatively mark 'dst_regno' as containing * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable * offset; for a fixed offset check_stack_read_fixed_off should be used * instead.
*/ staticint check_stack_read_var_off(struct bpf_verifier_env *env, int ptr_regno, int off, int size, int dst_regno)
{ /* The state of the source register. */ struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *ptr_state = func(env, reg); int err; int min_off, max_off;
/* Note that we pass a NULL meta, so raw access will not be permitted.
*/
err = check_stack_range_initialized(env, ptr_regno, off, size, false, BPF_READ, NULL); if (err) return err;
/* Dispatch a stack read to check_stack_read_fixed_off() or
 * check_stack_read_var_off(), depending on whether the pointer register
 * 'ptr_regno' has a constant var_off.
 *
 * The caller must ensure that the offset falls within the allocated stack
 * bounds.
 *
 * 'dst_regno' is the register receiving the value read from the stack. It
 * can be -1, meaning that the read value is not going to a register.
 *
 * Returns -EACCES for a variable-offset read with no destination register,
 * otherwise the result of the selected checker.
 */
static int check_stack_read(struct bpf_verifier_env *env,
			    int ptr_regno, int off, int size,
			    int dst_regno)
{
	struct bpf_reg_state *ptr = reg_state(env, ptr_regno);
	struct bpf_func_state *frame = func(env, ptr);
	/* Some accesses are only permitted with a static offset. */
	bool fixed = tnum_is_const(ptr->var_off);

	/* When the value read does not land in a register the offset has to
	 * be static, so that pointers are not leaked (see
	 * check_stack_read_fixed_off).
	 */
	if (!fixed && dst_regno < 0) {
		char tn_buf[48];

		tnum_strn(tn_buf, sizeof(tn_buf), ptr->var_off);
		verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
			tn_buf, off, size);
		return -EACCES;
	}

	/* Variable offset is prohibited for unprivileged mode for simplicity
	 * since it requires corresponding support in Spectre masking for stack
	 * ALU. See also retrieve_ptr_limit(). The check in
	 * check_stack_access_for_ptr_arithmetic() called by
	 * adjust_ptr_min_max_vals() prevents users from creating stack pointers
	 * with variable offsets, therefore no check is required here. Further,
	 * just checking it here would be insufficient as speculative stack
	 * writes could still lead to unsafe speculative behaviour.
	 */
	if (fixed) {
		off += ptr->var_off.value;
		return check_stack_read_fixed_off(env, frame, off, size,
						  dst_regno);
	}

	/* Variable offset stack reads need more conservative handling than
	 * fixed offset ones. dst_regno >= 0 is guaranteed at this point.
	 */
	return check_stack_read_var_off(env, ptr_regno, off, size, dst_regno);
}
/* Dispatch a stack write to check_stack_write_fixed_off() or
 * check_stack_write_var_off(), depending on whether the pointer register
 * has a constant var_off.
 *
 * 'ptr_regno' is the register used as a pointer into the stack.
 * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
 * 'value_regno' is the register whose value we're writing to the stack. It
 * can be -1, meaning that we're not writing from a register.
 *
 * The caller must ensure that the offset falls within the maximum stack size.
 *
 * Returns the result of the selected checker.
 */
static int check_stack_write(struct bpf_verifier_env *env,
			     int ptr_regno, int off, int size,
			     int value_regno, int insn_idx)
{
	struct bpf_reg_state *ptr = reg_state(env, ptr_regno);
	struct bpf_func_state *frame = func(env, ptr);

	if (!tnum_is_const(ptr->var_off)) {
		/* Variable offset stack writes need more conservative
		 * handling than fixed offset ones.
		 */
		return check_stack_write_var_off(env, frame,
						 ptr_regno, off, size,
						 value_regno, insn_idx);
	}

	/* Constant offset: fold it into 'off' for the fixed-offset checker. */
	off += ptr->var_off.value;
	return check_stack_write_fixed_off(env, frame, off, size,
					   value_regno, insn_idx);
}
staticint check_map_access_type(struct bpf_verifier_env *env, u32 regno, int off, int size, enum bpf_access_type type)
{ struct bpf_reg_state *regs = cur_regs(env); struct bpf_map *map = regs[regno].map_ptr;
u32 cap = bpf_map_flags_to_cap(map);
reg = &cur_regs(env)[regno]; switch (reg->type) { case PTR_TO_MAP_KEY:
verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
mem_size, off, size); break; case PTR_TO_MAP_VALUE:
verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
mem_size, off, size); break; case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_PACKET_END:
verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, off, mem_size); break; case PTR_TO_MEM: default:
verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n",
mem_size, off, size);
}
return -EACCES;
}
/* Check a read/write of 'size' bytes at 'off' from register 'regno' into a
 * memory region of 'mem_size' bytes, where the register may carry a variable
 * offset. Both the smallest and the largest possible offset are validated
 * with __check_mem_access().
 *
 * Returns 0 when the access is acceptable, -EACCES for a negative or
 * unbounded index, or the error reported by __check_mem_access().
 */
static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
				   int off, int size, u32 mem_size,
				   bool zero_size_allowed)
{
	struct bpf_verifier_state *vstate = env->cur_state;
	struct bpf_func_state *frame = vstate->frame[vstate->curframe];
	struct bpf_reg_state *reg = &frame->regs[regno];
	int err;

	/* We may have adjusted the register pointing to memory region, so we
	 * need to try adding each of min_value and max_value to off
	 * to make sure our theoretical access will be safe.
	 *
	 * The minimum value is only important with signed comparisons where
	 * we can't assume the floor of a value is 0. With signed index
	 * variables we must make sure that whatever is used has a set floor
	 * within our range.
	 */
	if (reg->smin_value < 0) {
		/* Evaluated in this order so that S64_MIN is rejected
		 * before any addition involving smin_value is performed.
		 */
		bool bad_min = reg->smin_value == S64_MIN;

		if (!bad_min)
			bad_min = off + reg->smin_value !=
				  (s64)(s32)(off + reg->smin_value);
		if (!bad_min)
			bad_min = reg->smin_value + off < 0;

		if (bad_min) {
			verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
				regno);
			return -EACCES;
		}
	}

	err = __check_mem_access(env, regno, reg->smin_value + off, size,
				 mem_size, zero_size_allowed);
	if (err) {
		verbose(env, "R%d min value is outside of the allowed memory range\n",
			regno);
		return err;
	}

	/* If we haven't set a max value then we need to bail since we can't
	 * be sure we won't do bad things.
	 * If reg->umax_value + off could overflow, treat that as unbounded
	 * too.
	 */
	if (reg->umax_value >= BPF_MAX_VAR_OFF) {
		verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n",
			regno);
		return -EACCES;
	}

	err = __check_mem_access(env, regno, reg->umax_value + off, size,
				 mem_size, zero_size_allowed);
	if (err) {
		verbose(env, "R%d max value is outside of the allowed memory range\n",
			regno);
		return err;
	}

	return 0;
}
staticint __check_ptr_off_reg(struct bpf_verifier_env *env, conststruct bpf_reg_state *reg, int regno, bool fixed_off_ok)
{ /* Access to this pointer-typed register or passing it to a helper * is only allowed in its original, unmodified form.
*/
if (btf_is_kernel(reg->btf)) {
perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
/* Only unreferenced case accepts untrusted pointers */ if (kptr_field->type == BPF_KPTR_UNREF)
perm_flags |= PTR_UNTRUSTED;
} else {
perm_flags = PTR_MAYBE_NULL | MEM_ALLOC; if (kptr_field->type == BPF_KPTR_PERCPU)
perm_flags |= MEM_PERCPU;
}
if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) goto bad_type;
/* We need to verify reg->type and reg->btf, before accessing reg->btf */
reg_name = btf_type_name(reg->btf, reg->btf_id);
/* For ref_ptr case, release function check should ensure we get one * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the * normal store of unreferenced kptr, we must ensure var_off is zero. * Since ref_ptr cannot be accessed directly by BPF insns, checks for * reg->off and reg->ref_obj_id are not needed here.
*/ if (__check_ptr_off_reg(env, reg, regno, true)) return -EACCES;
/* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and * we also need to take into account the reg->off. * * We want to support cases like: * * struct foo { * struct bar br; * struct baz bz; * }; * * struct foo *v; * v = func(); // PTR_TO_BTF_ID * val->foo = v; // reg->off is zero, btf and btf_id match type * val->bar = &v->br; // reg->off is still zero, but we need to retry with * // first member type of struct after comparison fails * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked * // to match type * * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off * is zero. We must also ensure that btf_struct_ids_match does not walk * the struct to match type against first member of struct, i.e. reject * second case from above. Hence, when type is BPF_KPTR_REF, we set * strict mode to true for type match.
*/ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
kptr_field->kptr.btf, kptr_field->kptr.btf_id,
kptr_field->type != BPF_KPTR_UNREF)) goto bad_type; return 0;
bad_type:
verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
reg_type_str(env, reg->type), reg_name);
verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name); if (kptr_field->type == BPF_KPTR_UNREF)
verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED),
targ_name); else
verbose(env, "\n"); return -EINVAL;
}
/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() * can dereference RCU protected pointers and result is PTR_TRUSTED.
*/ staticbool in_rcu_cs(struct bpf_verifier_env *env)
{ return env->cur_state->active_rcu_lock ||
env->cur_state->active_locks ||
!in_sleepable(env);
}
/* BTF ID set named rcu_protected_types, listing struct types by name. Most
 * entries are compiled in only under the config option that guards them;
 * task_struct is listed unconditionally.
 * NOTE(review): presumably consulted when deciding whether a kptr pointee is
 * RCU protected - the consumer is not visible in this block, confirm.
 *
 * Once GCC supports btf_type_tag the following mechanism will be replaced with tag check
 */
BTF_SET_START(rcu_protected_types) #ifdef CONFIG_NET
BTF_ID(struct, prog_test_ref_kfunc) #endif #ifdef CONFIG_CGROUPS
BTF_ID(struct, cgroup) #endif #ifdef CONFIG_BPF_JIT
BTF_ID(struct, bpf_cpumask) #endif
BTF_ID(struct, task_struct) #ifdef CONFIG_CRYPTO
BTF_ID(struct, bpf_crypto_ctx) #endif
BTF_SET_END(rcu_protected_types)
ret = PTR_MAYBE_NULL; if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
ret |= MEM_RCU; if (kptr_field->type == BPF_KPTR_PERCPU)
ret |= MEM_PERCPU; elseif (!btf_is_kernel(kptr_field->kptr.btf))
ret |= MEM_ALLOC;
rec = kptr_pointee_btf_record(kptr_field); if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE))
ret |= NON_OWN_REF;
} else {
ret |= PTR_UNTRUSTED;
}
/* Validate the instruction at 'insn_idx' as a direct access to the kptr (or
 * uptr) field 'kptr_field' of a map value addressed through 'regno'.
 * 'value_regno' is the register being loaded into or stored from.
 *
 * Returns 0 when the access is allowed, -EACCES when it is rejected, or a
 * negative error propagated from the load-marking helpers.
 */
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
				 int value_regno, int insn_idx,
				 struct btf_field *kptr_field)
{
	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
	const int insn_class = BPF_CLASS(insn->code);
	struct bpf_reg_state *val_reg;
	int ret;

	/* Things we already checked for in check_map_access and caller:
	 *  - Reject cases where variable offset may touch kptr
	 *  - size of access (must be BPF_DW)
	 *  - tnum_is_const(reg->var_off)
	 *  - kptr_field->offset == off + reg->var_off.value
	 */
	/* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
	if (BPF_MODE(insn->code) != BPF_MEM) {
		verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n");
		return -EACCES;
	}

	/* Referenced and per-CPU kptrs, as well as uptrs, may only be
	 * loaded. A loaded referenced kptr will be marked as untrusted,
	 * similar to an unreferenced kptr.
	 */
	if (insn_class != BPF_LDX) {
		switch (kptr_field->type) {
		case BPF_KPTR_REF:
		case BPF_KPTR_PERCPU:
			verbose(env, "store to referenced kptr disallowed\n");
			return -EACCES;
		case BPF_UPTR:
			verbose(env, "store to uptr disallowed\n");
			return -EACCES;
		default:
			break;
		}
	}

	switch (insn_class) {
	case BPF_LDX:
		if (kptr_field->type == BPF_UPTR)
			return mark_uptr_ld_reg(env, value_regno, kptr_field);

		/* We can simply mark the value_regno receiving the pointer
		 * value from map as PTR_TO_BTF_ID, with the correct type.
		 */
		ret = mark_btf_ld_reg(env, cur_regs(env), value_regno,
				      PTR_TO_BTF_ID, kptr_field->kptr.btf,
				      kptr_field->kptr.btf_id,
				      btf_ld_kptr_type(env, kptr_field));
		if (ret < 0)
			return ret;
		return 0;
	case BPF_STX:
		/* A stored register must either be a constant zero or pass
		 * the type match against the field.
		 */
		val_reg = reg_state(env, value_regno);
		if (!register_is_null(val_reg) &&
		    map_kptr_match_type(env, kptr_field, val_reg, value_regno))
			return -EACCES;
		return 0;
	case BPF_ST:
		/* An immediate store may only write zero. */
		if (insn->imm) {
			verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n",
				kptr_field->offset);
			return -EACCES;
		}
		return 0;
	default:
		verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n");
		return -EACCES;
	}
}
/* Check a read/write of 'size' bytes at 'off' from register 'regno' into a
 * map element, where the register may carry a variable offset.
 *
 * The access is first bounds-checked against the map's value size with
 * check_mem_region_access(). Then, if the map has a btf record, the access
 * range is compared against every special field in it: kptr/uptr fields may
 * be touched only by a direct, constant-offset, exactly aligned BPF_DW
 * access; any other field may not be touched by load/store at all.
 *
 * 'src' tells whether the access comes from an instruction (ACCESS_DIRECT)
 * or from a helper (ACCESS_HELPER).
 *
 * Returns 0 when the access is allowed, -EACCES when it overlaps a special
 * field illegally, or the error from check_mem_region_access().
 */
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
			    int off, int size, bool zero_size_allowed,
			    enum bpf_access_src src)
{
	struct bpf_verifier_state *vstate = env->cur_state;
	struct bpf_func_state *state = vstate->frame[vstate->curframe];
	struct bpf_reg_state *reg = &state->regs[regno];
	struct bpf_map *map = reg->map_ptr;
	struct btf_record *rec;
	int err, i;

	/* The whole [off, off + size) range, for every value the register
	 * may take, has to lie inside the map value before the special
	 * fields are looked at.
	 */
	err = check_mem_region_access(env, regno, off, size, map->value_size,
				      zero_size_allowed);
	if (err)
		return err;

	if (IS_ERR_OR_NULL(map->record))
		return 0;

	rec = map->record;
	for (i = 0; i < rec->cnt; i++) {
		struct btf_field *field = &rec->fields[i];
		u32 p = field->offset;

		/* If any part of a field can be touched by load/store, reject
		 * this program. To check that [x1, x2) overlaps with [y1, y2),
		 * it is sufficient to check x1 < y2 && y1 < x2.
		 */
		if (reg->smin_value + off < p + field->size &&
		    p < reg->umax_value + off + size) {
			switch (field->type) {
			case BPF_KPTR_UNREF:
			case BPF_KPTR_REF:
			case BPF_KPTR_PERCPU:
			case BPF_UPTR:
				if (src != ACCESS_DIRECT) {
					verbose(env, "%s cannot be accessed indirectly by helper\n",
						btf_field_type_name(field->type));
					return -EACCES;
				}
				if (!tnum_is_const(reg->var_off)) {
					verbose(env, "%s access cannot have variable offset\n",
						btf_field_type_name(field->type));
					return -EACCES;
				}
				if (p != off + reg->var_off.value) {
					verbose(env, "%s access misaligned expected=%u off=%llu\n",
						btf_field_type_name(field->type),
						p, off + reg->var_off.value);
					return -EACCES;
				}
				if (size != bpf_size_to_bytes(BPF_DW)) {
					verbose(env, "%s access size must be BPF_DW\n",
						btf_field_type_name(field->type));
					return -EACCES;
				}
				break;
			default:
				verbose(env, "%s cannot be accessed directly by load/store\n",
					btf_field_type_name(field->type));
				return -EACCES;
			}
		}
	}

	return 0;
}
switch (prog_type) { /* Program types only with direct read access go here! */ case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_REUSEPORT: case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_CGROUP_SKB: if (t == BPF_WRITE) returnfalse;
fallthrough;
/* Program types with direct read + write access go here! */ case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: if (meta) return meta->pkt_access;
env->seen_direct_write = true; returntrue;
case BPF_PROG_TYPE_CGROUP_SOCKOPT: if (t == BPF_WRITE)
env->seen_direct_write = true;
returntrue;
default: returnfalse;
}
}
/* Check an access of 'size' bytes at fixed offset 'off' through the packet
 * pointer held in register 'regno'.
 *
 * Returns 0 when the access is allowed, -EACCES when the register's minimum
 * value is negative, -EINVAL when reg->range is negative, or the error from
 * __check_mem_access().
 */
static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
			       int size, bool zero_size_allowed)
{
	struct bpf_reg_state *reg = &cur_regs(env)[regno];
	int err;

	/* We may have added a variable offset to the packet pointer; but any
	 * reg->range we have comes after that. We are only checking the fixed
	 * offset.
	 */

	/* We don't allow negative numbers, because we aren't tracking enough
	 * detail to prove they're safe.
	 */
	if (reg->smin_value < 0) {
		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
			regno);
		return -EACCES;
	}

	if (reg->range < 0)
		err = -EINVAL;
	else
		err = __check_mem_access(env, regno, off, size, reg->range,
					 zero_size_allowed);
	if (err) {
		verbose(env, "R%d offset is outside of the packet\n", regno);
		return err;
	}

	/* __check_mem_access has made sure "off + size - 1" is within u16.
	 * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
	 * otherwise find_good_pkt_pointers would have refused to set range info
	 * that __check_mem_access would have rejected this pkt access.
	 * Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
	 */
	env->prog->aux->max_pkt_offset =
		max_t(u32, env->prog->aux->max_pkt_offset,
		      off + reg->umax_value + size - 1);

	/* err is known to be zero once the check above has passed. */
	return 0;
}
/* check access to 'struct bpf_context' fields. Supports fixed offsets only */ staticint check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, struct bpf_insn_access_aux *info)
{ if (env->ops->is_valid_access &&
env->ops->is_valid_access(off, size, t, env->prog, info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower * access than actual ctx access size. A zero info.ctx_field_size * will only allow for whole field access and rejects any other * type of narrower access.
*/ if (base_type(info->reg_type) == PTR_TO_BTF_ID) { if (info->ref_obj_id &&
!find_reference_state(env->cur_state, info->ref_obj_id)) {
verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n",
off); return -EACCES;
}
} else {
env->insn_aux_data[insn_idx].ctx_field_size = info->ctx_field_size;
} /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size; return 0;
}
staticint check_flow_keys_access(struct bpf_verifier_env *env, int off, int size)
{ if (size < 0 || off < 0 ||
(u64)off + size > sizeof(struct bpf_flow_keys)) {
verbose(env, "invalid access to flow keys off=%d size=%d\n",
off, size); return -EACCES;
} return 0;
}
staticint check_sock_access(struct bpf_verifier_env *env, int insn_idx,
u32 regno, int off, int size, enum bpf_access_type t)
{ struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; struct bpf_insn_access_aux info = {}; bool valid;
if (reg->smin_value < 0) {
verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno); return -EACCES;
}
/* Return false if @regno contains a pointer whose type isn't supported for * atomic instruction @insn.
*/ staticbool atomic_ptr_type_ok(struct bpf_verifier_env *env, int regno, struct bpf_insn *insn)
{ if (is_ctx_reg(env, regno)) returnfalse; if (is_pkt_reg(env, regno)) returnfalse; if (is_flow_key_reg(env, regno)) returnfalse; if (is_sk_reg(env, regno)) returnfalse; if (is_arena_reg(env, regno)) return bpf_jit_supports_insn(insn, true);
staticbool is_trusted_reg(conststruct bpf_reg_state *reg)
{ /* A referenced register is always trusted. */ if (reg->ref_obj_id) returntrue;
/* Types listed in the reg2btf_ids are always trusted */ if (reg2btf_ids[base_type(reg->type)] &&
!bpf_type_has_unsafe_modifiers(reg->type)) returntrue;
/* If a register is not referenced, it is trusted if it has the * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the * other type modifiers may be safe, but we elect to take an opt-in * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are * not. * * Eventually, we should make PTR_TRUSTED the single source of truth * for whether a register is trusted.
*/ return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS &&
!bpf_type_has_unsafe_modifiers(reg->type);
}
staticint check_pkt_ptr_alignment(struct bpf_verifier_env *env, conststruct bpf_reg_state *reg, int off, int size, bool strict)
{ struct tnum reg_off; int ip_align;
/* Byte size accesses are always allowed. */ if (!strict || size == 1) return 0;
/* For platforms that do not have a Kconfig enabling * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of * NET_IP_ALIGN is universally set to '2'. And on platforms * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get * to this code only in strict mode where we want to emulate * the NET_IP_ALIGN==2 checking. Therefore use an * unconditional IP align value of '2'.
*/
ip_align = 2;
switch (reg->type) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: /* Special case, because of NET_IP_ALIGN. Given metadata sits * right in front, treat it the very same way.
*/ return check_pkt_ptr_alignment(env, reg, off, size, strict); case PTR_TO_FLOW_KEYS:
pointer_desc = "flow keys "; break; case PTR_TO_MAP_KEY:
pointer_desc = "key "; break; case PTR_TO_MAP_VALUE:
pointer_desc = "value "; break; case PTR_TO_CTX:
pointer_desc = "context "; break; case PTR_TO_STACK:
pointer_desc = "stack "; /* The stack spill tracking logic in check_stack_write_fixed_off() * and check_stack_read_fixed_off() relies on stack accesses being * aligned.
*/
strict = true; break; case PTR_TO_SOCKET:
pointer_desc = "sock "; break; case PTR_TO_SOCK_COMMON:
pointer_desc = "sock_common "; break; case PTR_TO_TCP_SOCK:
pointer_desc = "tcp_sock "; break; case PTR_TO_XDP_SOCK:
pointer_desc = "xdp_sock "; break; case PTR_TO_ARENA: return 0; default: break;
} return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
strict);
}
staticenum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog)
{ if (!bpf_jit_supports_private_stack()) return NO_PRIV_STACK;
/* bpf_prog_check_recur() checks all prog types that use bpf trampoline * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked * explicitly.
*/ switch (prog->type) { case BPF_PROG_TYPE_KPROBE: case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_RAW_TRACEPOINT: return PRIV_STACK_ADAPTIVE; case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog)) return PRIV_STACK_ADAPTIVE;
fallthrough; default: break;
}
return NO_PRIV_STACK;
}
staticint round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
{ if (env->prog->jit_requested) return round_up(stack_depth, 16);
/* round up to 32-bytes, since this is granularity * of interpreter stack size
*/ return round_up(max_t(u32, stack_depth, 1), 32);
}
/* starting from main bpf function walk all instructions of the function * and recursively walk all callees that given function can call. * Ignore jump and exit insns. * Since recursion is prevented by check_cfg() this algorithm * only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/ staticint check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx, bool priv_stack_supported)
{ struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; int depth = 0, frame = 0, i, subprog_end, subprog_depth; bool tail_call_reachable = false; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; int j;
i = subprog[idx].start; if (!priv_stack_supported)
subprog[idx].priv_stack_mode = NO_PRIV_STACK;
process_func: /* protect against potential stack overflow that might happen when * bpf2bpf calls get combined with tailcalls. Limit the caller's stack * depth for such case down to 256 so that the worst case scenario * would result in 8k stack size (32 which is tailcall limit * 256 = * 8k). * * To get the idea what might happen, see an example: * func1 -> sub rsp, 128 * subfunc1 -> sub rsp, 256 * tailcall1 -> add rsp, 256 * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320) * subfunc2 -> sub rsp, 64 * subfunc22 -> sub rsp, 128 * tailcall2 -> add rsp, 128 * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416) * * tailcall will unwind the current stack frame but it will not get rid * of caller's stack as shown on the example above.
*/ if (idx && subprog[idx].has_tail_call && depth >= 256) {
verbose(env, "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
depth); return -EACCES;
}
subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth); if (priv_stack_supported) { /* Request private stack support only if the subprog stack * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to * avoid jit penalty if the stack usage is small.
*/ if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN &&
subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE;
}
if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) { if (subprog_depth > MAX_BPF_STACK) {
verbose(env, "stack size of subprog %d is %d. Too large\n",
idx, subprog_depth); return -EACCES;
}
} else {
depth += subprog_depth; if (depth > MAX_BPF_STACK) {
verbose(env, "combined stack size of %d calls is %d. Too large\n",
frame + 1, depth); return -EACCES;
}
}
continue_func:
subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { int next_insn, sidx;
if (!is_bpf_throw_kfunc(insn + i)) continue; if (subprog[idx].is_cb)
err = true; for (int c = 0; c < frame && !err; c++) { if (subprog[ret_prog[c]].is_cb) {
err = true; break;
}
} if (!err) continue;
verbose(env, "bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n",
i, idx); return -EINVAL;
}
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */
ret_insn[frame] = i + 1;
ret_prog[frame] = idx;
/* find the callee */
next_insn = i + insn[i].imm + 1;
sidx = find_subprog(env, next_insn); if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn)) return -EFAULT; if (subprog[sidx].is_async_cb) { if (subprog[sidx].has_tail_call) {
verifier_bug(env, "subprog has tail_call and async cb"); return -EFAULT;
} /* async callbacks don't increase bpf prog stack size unless called directly */ if (!bpf_pseudo_call(insn + i)) continue; if (subprog[sidx].is_exception_cb) {
verbose(env, "insn %d cannot call exception cb directly", i); return -EINVAL;
}
}
i = next_insn;
idx = sidx; if (!priv_stack_supported)
subprog[idx].priv_stack_mode = NO_PRIV_STACK;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
frame++; if (frame >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep !\n",
frame); return -E2BIG;
} goto process_func;
} /* if tail call got detected across bpf2bpf calls then mark each of the * currently present subprog frames as tail call reachable subprogs; * this info will be utilized by JIT so that we will be preserving the * tail call counter throughout bpf2bpf calls combined with tailcalls
*/ if (tail_call_reachable) for (j = 0; j < frame; j++) { if (subprog[ret_prog[j]].is_exception_cb) {
verbose(env, "cannot tail call within exception cb\n"); return -EINVAL;
}
subprog[ret_prog[j]].tail_call_reachable = true;
} if (subprog[0].tail_call_reachable)
env->prog->aux->tail_call_reachable = true;
/* end of for() loop means the last insn of the 'subprog' * was reached. Doesn't matter whether it was JA or EXIT
*/ if (frame == 0) return 0; if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE)
depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
frame--;
i = ret_insn[frame];
idx = ret_prog[frame]; goto continue_func;
}
for (int i = 0; i < env->subprog_cnt; i++) { if (si[i].has_tail_call) {
priv_stack_mode = NO_PRIV_STACK; break;
}
}
if (priv_stack_mode == PRIV_STACK_UNKNOWN)
priv_stack_mode = bpf_enable_priv_stack(env->prog);
/* All async_cb subprogs use normal kernel stack. If a particular * subprog appears in both main prog and async_cb subtree, that * subprog will use normal kernel stack to avoid potential nesting. * The reverse subprog traversal ensures when main prog subtree is * checked, the subprogs appearing in async_cb subtrees are already * marked as using normal kernel stack, so stack size checking can * be done properly.
*/ for (int i = env->subprog_cnt - 1; i >= 0; i--) { if (!i || si[i].is_async_cb) {
priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE;
ret = check_max_stack_depth_subprog(env, i, priv_stack_supported); if (ret < 0) return ret;
}
}
for (int i = 0; i < env->subprog_cnt; i++) { if (si[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
env->prog->aux->jits_use_priv_stack = true; break;
}
}
/* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE
*/ staticvoid coerce_reg_to_size(struct bpf_reg_state *reg, int size)
{
u64 mask;
/* clear high bits in bit representation */
reg->var_off = tnum_cast(reg->var_off, size);
/* If size is smaller than 32bit register the 32bit register * values are also truncated so we push 64-bit bounds into * 32-bit bounds. Above were truncated < 32-bits already.
*/ if (size < 4)
__mark_reg32_unbounded(reg);
staticbool bpf_map_is_rdonly(conststruct bpf_map *map)
{ /* A map is considered read-only if the following condition are true: * * 1) BPF program side cannot change any of the map content. The * BPF_F_RDONLY_PROG flag is throughout the lifetime of a map * and was set at map creation time. * 2) The map value(s) have been initialized from user space by a * loader and then "frozen", such that no new map update/delete * operations from syscall side are possible for the rest of * the map's lifetime from that point onwards. * 3) Any parallel/pending map update/delete operations from syscall * side have been completed. Only after that point, it's safe to * assume that map value(s) are immutable.
*/ return (map->map_flags & BPF_F_RDONLY_PROG) &&
READ_ONCE(map->frozen) &&
!bpf_map_write_active(map);
}
staticint bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, bool is_ldsx)
{ void *ptr;
u64 addr; int err;
/*
 * Allow-list a few fields as RCU trusted or fully trusted.
 * This logic doesn't allow mixed tagging and will be removed once GCC supports
 * btf_type_tag.
 */
/* NOTE(review): the BTF_TYPE_SAFE_* macros are defined outside this view;
 * presumably each invocation declares a shadow struct whose members name the
 * allow-listed fields of the given kernel type - confirm against the macro
 * definitions.
 */

/* RCU trusted: these fields are trusted in RCU CS and never NULL */
BTF_TYPE_SAFE_RCU(struct task_struct) {
	const cpumask_t *cpus_ptr;
	struct css_set __rcu *cgroups;
	struct task_struct __rcu *real_parent;
	struct task_struct *group_leader;
};

BTF_TYPE_SAFE_RCU(struct cgroup) {
	/* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */
	struct kernfs_node *kn;
};

/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
	struct file __rcu *exe_file;
};

/* skb->sk, req->sk are not RCU protected, but we mark them as such
 * because bpf prog accessible sockets are SOCK_RCU_FREE.
 */
BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) {
	struct sock *sk;
};

/* full trusted: these fields are trusted even outside of RCU CS and never NULL */
BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
	struct seq_file *seq;
};
if (reg->type & MEM_USER) {
verbose(env, "R%d is ptr_%s access user memory: off=%d\n",
regno, tname, off); return -EACCES;
}
if (reg->type & MEM_PERCPU) {
verbose(env, "R%d is ptr_%s access percpu memory: off=%d\n",
regno, tname, off); return -EACCES;
}
if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) { if (!btf_is_kernel(reg->btf)) {
verifier_bug(env, "reg->btf must be kernel btf"); return -EFAULT;
}
ret = env->ops->btf_struct_access(&env->log, reg, off, size);
} else { /* Writes are permitted with default btf_struct_access for * program allocated objects (which always have ref_obj_id > 0), * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
*/ if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) {
verbose(env, "only read is supported\n"); return -EACCES;
}
if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
!(reg->type & MEM_RCU) && !reg->ref_obj_id) {
verifier_bug(env, "ref_obj_id for allocated object must be non-zero"); return -EFAULT;
}
} elseif (type_flag(reg->type) & PTR_UNTRUSTED) { /* If this is an untrusted pointer, all pointers formed by walking it * also inherit the untrusted flag.
*/
flag = PTR_UNTRUSTED;
} elseif (is_trusted_reg(reg) || is_rcu_reg(reg)) { /* By default any pointer obtained from walking a trusted pointer is no * longer trusted, unless the field being accessed has explicitly been * marked as inheriting its parent's state of trust (either full or RCU). * For example: * 'cgroups' pointer is untrusted if task->cgroups dereference * happened in a sleepable program outside of bpf_rcu_read_lock() * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU). * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED. * * A regular RCU-protected pointer with __rcu tag can also be deemed * trusted if we are in an RCU CS. Such pointer can be NULL.
*/ if (type_is_trusted(env, reg, field_name, btf_id)) {
flag |= PTR_TRUSTED;
} elseif (type_is_trusted_or_null(env, reg, field_name, btf_id)) {
flag |= PTR_TRUSTED | PTR_MAYBE_NULL;
} elseif (in_rcu_cs(env) && !type_may_be_null(reg->type)) { if (type_is_rcu(env, reg, field_name, btf_id)) { /* ignore __rcu tag and mark it MEM_RCU */
flag |= MEM_RCU;
} elseif (flag & MEM_RCU ||
type_is_rcu_or_null(env, reg, field_name, btf_id)) { /* __rcu tagged pointers can be NULL */
flag |= MEM_RCU | PTR_MAYBE_NULL;
/* We always trust them */ if (type_is_rcu_or_null(env, reg, field_name, btf_id) &&
flag & PTR_UNTRUSTED)
flag &= ~PTR_UNTRUSTED;
} elseif (flag & (MEM_PERCPU | MEM_USER)) { /* keep as-is */
} else { /* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */
clear_trusted_flags(&flag);
}
} else { /* * If not in RCU CS or MEM_RCU pointer can be NULL then * aggressively mark as untrusted otherwise such * pointers will be plain PTR_TO_BTF_ID without flags * and will be allowed to be passed into helpers for * compat reasons.
*/
flag = PTR_UNTRUSTED;
}
} else { /* Old compat. Deprecated */
clear_trusted_flags(&flag);
}
if (atype == BPF_READ && value_regno >= 0) {
ret = mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); if (ret < 0) return ret;
}
return 0;
}
staticint check_ptr_to_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, int regno, int off, int size, enum bpf_access_type atype, int value_regno)
{ struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; struct bpf_reg_state map_reg; enum bpf_type_flag flag = 0; conststruct btf_type *t; constchar *tname;
u32 btf_id; int ret;
if (!btf_vmlinux) {
verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n"); return -ENOTSUPP;
}
if (!map->ops->map_btf_id || !*map->ops->map_btf_id) {
verbose(env, "map_ptr access not supported for map type %d\n",
map->map_type); return -ENOTSUPP;
}
t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
tname = btf_name_by_offset(btf_vmlinux, t->name_off);
if (!env->allow_ptr_leaks) {
verbose(env, "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
tname); return -EPERM;
}
if (off < 0) {
verbose(env, "R%d is %s invalid negative access: off=%d\n",
regno, tname, off); return -EACCES;
}
if (atype != BPF_READ) {
verbose(env, "only read from %s is supported\n", tname); return -EACCES;
}
/* Simulate access to a PTR_TO_BTF_ID */
memset(&map_reg, 0, sizeof(map_reg));
ret = mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID,
btf_vmlinux, *map->ops->map_btf_id, 0); if (ret < 0) return ret;
ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL); if (ret < 0) return ret;
if (value_regno >= 0) {
ret = mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); if (ret < 0) return ret;
}
return 0;
}
/* Check that the stack access at the given offset is within bounds. The
 * maximum valid offset is -1.
 *
 * The minimum valid offset is -MAX_BPF_STACK for writes, and
 * -state->allocated_stack for reads.
 *
 * Reads are also bounded by -MAX_BPF_STACK when env->allow_uninit_stack is
 * set, since such programs may read stack that was never written.
 * NOTE(review): the allow_uninit_stack case is restored from the upstream
 * verifier, not from the comment above - confirm it matches this tree.
 *
 * Returns 0 when @off is in [min valid offset, -1], -EACCES otherwise.
 */
static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
					  s64 off,
					  struct bpf_func_state *state,
					  enum bpf_access_type t)
{
	int min_valid_off;

	/* The lower bound must be chosen before it is compared against;
	 * leaving it unassigned made the check below undefined behaviour.
	 */
	if (t == BPF_WRITE || env->allow_uninit_stack)
		min_valid_off = -MAX_BPF_STACK;
	else
		min_valid_off = -state->allocated_stack;

	if (off < min_valid_off || off > -1)
		return -EACCES;
	return 0;
}
/* Check that the stack access at 'regno + off' falls within the maximum stack * bounds. * * 'off' includes `regno->offset`, but not its dynamic part (if any).
*/ staticint check_stack_access_within_bounds( struct bpf_verifier_env *env, int regno, int off, int access_size, enum bpf_access_type type)
{ struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; struct bpf_func_state *state = func(env, reg);
s64 min_off, max_off; int err; char *err_extra;
err = check_stack_slot_within_bounds(env, min_off, state, type); if (!err && max_off > 0)
err = -EINVAL; /* out of stack access into non-negative offsets */ if (!err && access_size < 0) /* access_size should not be negative (or overflow an int); others checks * along the way should have prevented such an access.
*/
err = -EFAULT; /* invalid negative access size; integer overflow? */
/* Note that there is no stack access with offset zero, so the needed stack * size is -min_off, not -min_off+1.
*/ return grow_stack_state(env, state, -min_off /* size */);
}
/* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory
*/ staticint check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno, bool strict_alignment_once, bool is_ldsx)
{ struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; int size, err = 0;
size = bpf_size_to_bytes(bpf_size); if (size < 0) return size;
/* alignment checks will add in reg->off themselves */
err = check_ptr_alignment(env, reg, off, size, strict_alignment_once); if (err) return err;
/* for access checks, reg->off is just part of off */
off += reg->off;
if (reg->type == PTR_TO_MAP_KEY) { if (t == BPF_WRITE) {
verbose(env, "write to change key R%d not allowed\n", regno); return -EACCES;
}
/* if map is read-only, track its contents as scalars */ if (tnum_is_const(reg->var_off) &&
bpf_map_is_rdonly(map) &&
map->ops->map_direct_value_addr) { int map_off = off + reg->var_off.value;
u64 val = 0;
err = check_ctx_access(env, insn_idx, off, size, t, &info); if (err)
verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero.
*/ if (info.reg_type == SCALAR_VALUE) { if (info.is_retval && get_func_retval_range(env->prog, &range)) {
err = __mark_reg_s32_range(env, regs, value_regno,
range.minval, range.maxval); if (err) return err;
} else {
mark_reg_unknown(env, regs, value_regno);
}
} else {
mark_reg_known_zero(env, regs,
value_regno); if (type_may_be_null(info.reg_type))
regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the * insn. When the dst is PTR, it is for sure not * a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG; if (base_type(info.reg_type) == PTR_TO_BTF_ID) {
regs[value_regno].btf = info.btf;
regs[value_regno].btf_id = info.btf_id;
regs[value_regno].ref_obj_id = info.ref_obj_id;
}
}
regs[value_regno].type = info.reg_type;
}
/* Check if (src_reg + off) is readable. The state of dst_reg will be * updated by this call.
*/
err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, insn->dst_reg,
strict_alignment_once, is_ldsx);
err = err ?: save_aux_ptr_type(env, src_reg_type,
allow_trust_mismatch);
err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], ctx);
if (insn->imm == BPF_CMPXCHG) { /* Check comparison of R0 with memory location */ const u32 aux_reg = BPF_REG_0;
err = check_reg_arg(env, aux_reg, SRC_OP); if (err) return err;
if (is_pointer_value(env, aux_reg)) {
verbose(env, "R%d leaks addr into mem\n", aux_reg); return -EACCES;
}
}
if (is_pointer_value(env, insn->src_reg)) {
verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES;
}
if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) {
verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
insn->dst_reg,
reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES;
}
if (insn->imm & BPF_FETCH) { if (insn->imm == BPF_CMPXCHG)
load_reg = BPF_REG_0; else
load_reg = insn->src_reg;
/* check and record load of old value */
err = check_reg_arg(env, load_reg, DST_OP); if (err) return err;
} else { /* This instruction accesses a memory location but doesn't * actually load it into a register.
*/
load_reg = -1;
}
/* Check whether we can read the memory, with second call for fetch * case to simulate the register fill.
*/
err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0)
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
insn->off, BPF_SIZE(insn->code),
BPF_READ, load_reg, true, false); if (err) return err;
if (is_arena_reg(env, insn->dst_reg)) {
err = save_aux_ptr_type(env, PTR_TO_ARENA, false); if (err) return err;
} /* Check whether we can write into the same memory. */
err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; return 0;
}
staticint check_atomic_load(struct bpf_verifier_env *env, struct bpf_insn *insn)
{ int err;
if (!atomic_ptr_type_ok(env, insn->src_reg, insn)) {
verbose(env, "BPF_ATOMIC loads from R%d %s is not allowed\n",
insn->src_reg,
reg_type_str(env, reg_state(env, insn->src_reg)->type)); return -EACCES;
}
return 0;
}
/* Verify an atomic store (store-release) instruction.
 *
 * The store is validated with check_store_reg() first; afterwards the
 * destination pointer type is checked against what atomic instructions
 * support.
 *
 * Returns 0 on success, the error from check_store_reg(), or -EACCES when
 * the destination register's pointer type is not allowed for atomics.
 */
static int check_atomic_store(struct bpf_verifier_env *env,
			      struct bpf_insn *insn)
{
	int err = check_store_reg(env, insn, true);

	if (err)
		return err;

	if (atomic_ptr_type_ok(env, insn->dst_reg, insn))
		return 0;

	verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
		insn->dst_reg,
		reg_type_str(env, reg_state(env, insn->dst_reg)->type));
	return -EACCES;
}
staticint check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn)
{ switch (insn->imm) { case BPF_ADD: case BPF_ADD | BPF_FETCH: case BPF_AND: case BPF_AND | BPF_FETCH: case BPF_OR: case BPF_OR | BPF_FETCH: case BPF_XOR: case BPF_XOR | BPF_FETCH: case BPF_XCHG: case BPF_CMPXCHG: return check_atomic_rmw(env, insn); case BPF_LOAD_ACQ: if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) {
verbose(env, "64-bit load-acquires are only supported on 64-bit arches\n"); return -EOPNOTSUPP;
} return check_atomic_load(env, insn); case BPF_STORE_REL: if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) {
verbose(env, "64-bit store-releases are only supported on 64-bit arches\n"); return -EOPNOTSUPP;
} return check_atomic_store(env, insn); default:
verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n",
insn->imm); return -EINVAL;
}
}
/* When register 'regno' is used to read the stack (either directly or through * a helper function) make sure that it's within stack boundary and, depending * on the access type and privileges, that all elements of the stack are * initialized. * * 'off' includes 'regno->off', but not its dynamic part (if any). * * All registers that have been spilled on the stack in the slots within the * read offsets are marked as read.
*/ staticint check_stack_range_initialized( struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta)
{ struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); int err, min_off, max_off, i, j, slot, spi; /* Some accesses can write anything into the stack, others are * read-only.
*/ bool clobber = false;
if (tnum_is_const(reg->var_off)) {
min_off = max_off = reg->var_off.value + off;
} else { /* Variable offset is prohibited for unprivileged mode for * simplicity since it requires corresponding support in * Spectre masking for stack ALU. * See also retrieve_ptr_limit().
*/ if (!env->bypass_spec_v1) { char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
regno, tn_buf); return -EACCES;
} /* Only initialized buffer on stack is allowed to be accessed * with variable offset. With uninitialized buffer it's hard to * guarantee that whole memory is marked as initialized on * helper return since specific bounds are unknown what may * cause uninitialized stack leaking.
*/ if (meta && meta->raw_mode)
meta = NULL;
if (meta && meta->raw_mode) { /* Ensure we won't be overwriting dynptrs when simulating byte * by byte access in check_helper_call using meta.access_size. * This would be a problem if we have a helper in the future * which takes: * * helper(uninit_mem, len, dynptr) * * Now, uninint_mem may overlap with dynptr pointer. Hence, it * may end up writing to dynptr itself when touching memory from * arg 1. This can be relaxed on a case by case basis for known * safe cases, but reject due to the possibilitiy of aliasing by * default.
*/ for (i = min_off; i < max_off + access_size; i++) { int stack_off = -i - 1;
spi = __get_spi(i); /* raw_mode may write past allocated_stack */ if (state->allocated_stack <= stack_off) continue; if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) {
verbose(env, "potential write to dynptr at off=%d disallowed\n", i); return -EACCES;
}
}
meta->access_size = access_size;
meta->regno = regno; return 0;
}
for (i = min_off; i < max_off + access_size; i++) {
u8 *stype;
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; if (*stype == STACK_MISC) goto mark; if ((*stype == STACK_ZERO) ||
(*stype == STACK_INVALID && env->allow_uninit_stack)) { if (clobber) { /* helper can write anything into the stack */
*stype = STACK_MISC;
} goto mark;
}
if (is_spilled_reg(&state->stack[spi]) &&
(state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
env->allow_ptr_leaks)) { if (clobber) {
__mark_reg_unknown(env, &state->stack[spi].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++)
scrub_spilled_slot(&state->stack[spi].slot_type[j]);
} goto mark;
}
if (tnum_is_const(reg->var_off)) {
verbose(env, "invalid read from stack R%d off %d+%d size %d\n",
regno, min_off, i - min_off, access_size);
} else { char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n",
regno, tn_buf, i - min_off, access_size);
} return -EACCES;
mark: /* reading any byte out of 8-byte 'spill_slot' will cause * the whole slot to be marked as 'read'
*/
mark_reg_read(env, &state->stack[spi].spilled_ptr,
state->stack[spi].spilled_ptr.parent,
REG_LIVE_READ64); /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not * be sure that whether stack slot is written to or not. Hence, * we must still conservatively propagate reads upwards even if * helper may write to the entire memory range.
*/
} return 0;
}
switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed); case PTR_TO_MAP_KEY: if (access_type == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n", regno,
reg_type_str(env, reg->type)); return -EACCES;
} return check_mem_region_access(env, regno, reg->off, access_size,
reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: if (check_map_access_type(env, regno, reg->off, access_size, access_type)) return -EACCES; return check_map_access(env, regno, reg->off, access_size,
zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { if (access_type == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n", regno,
reg_type_str(env, reg->type)); return -EACCES;
}
} return check_mem_region_access(env, regno, reg->off,
access_size, reg->mem_size,
zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { if (access_type == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n", regno,
reg_type_str(env, reg->type)); return -EACCES;
}
max_access = &env->prog->aux->max_rdonly_access;
} else {
max_access = &env->prog->aux->max_rdwr_access;
} return check_buffer_access(env, reg, regno, reg->off,
access_size, zero_size_allowed,
max_access); case PTR_TO_STACK: return check_stack_range_initialized(
env,
regno, reg->off, access_size,
zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: return check_ptr_to_btf_access(env, regs, regno, reg->off,
access_size, BPF_READ, -1); case PTR_TO_CTX: /* in case the function doesn't know how to access the context, * (because we are in a program of type SYSCALL for example), we * can not statically check its size. * Dynamically check it now.
*/ if (!env->ops->convert_ctx_access) { int offset = access_size - 1;
/* Allow zero-byte read from PTR_TO_CTX */ if (access_size == 0) return zero_size_allowed ? 0 : -EACCES;
/* verify arguments to helpers or kfuncs consisting of a pointer and an access * size. * * @regno is the register containing the access size. regno-1 is the register * containing the pointer.
*/ staticint check_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta)
{ int err;
/* This is used to refine r0 return value bounds for helpers * that enforce this value as an upper bound on return values. * See do_refine_retval_range() for helpers that can refine * the return value. C type of helper is u32 so we pull register * bound from umax_value however, if negative verifier errors * out. Only upper bounds can be learned because retval is an * int type and negative retvals are allowed.
*/
meta->msize_max_value = reg->umax_value;
/* The register is SCALAR_VALUE; the access check happens using * its boundaries. For unprivileged variable accesses, disable * raw mode so that the program is required to initialize all * the memory that the helper could just partially fill up.
*/ if (!tnum_is_const(reg->var_off))
meta = NULL;
if (reg->smin_value < 0) {
verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
regno); return -EACCES;
}
/* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller.
*/ if (may_be_null) {
saved_reg = *reg;
mark_ptr_not_null_reg(reg);
}
/* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL. * Two bpf_map_lookups (even with the same key) will have different reg->id. * Two separate bpf_obj_new will also have different reg->id. * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier * clears reg->id after value_or_null->value transition, since the verifier only * cares about the range of access to valid map value pointer and doesn't care * about actual address of the map element. * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps * reg->id > 0 after value_or_null->value transition. By doing so * two bpf_map_lookups will be considered two different pointers that * point to different bpf_spin_locks. Likewise for pointers to allocated objects * returned from bpf_obj_new. * The verifier allows taking only one bpf_spin_lock at a time to avoid * dead-locks. * Since only one bpf_spin_lock is allowed the checks are simpler than * reg_is_refcounted() logic. The verifier needs to remember only * one spin_lock instead of array of acquired_refs. * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock.
*/ staticint process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
{ bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; constchar *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); bool is_irq = flags & PROCESS_LOCK_IRQ;
u64 val = reg->var_off.value; struct bpf_map *map = NULL; struct btf *btf = NULL; struct btf_record *rec;
u32 spin_lock_off; int err;
if (!is_const) {
verbose(env, "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n",
regno, lock_str); return -EINVAL;
} if (reg->type == PTR_TO_MAP_VALUE) {
map = reg->map_ptr; if (!map->btf) {
verbose(env, "map '%s' has to have BTF in order to use %s_lock\n",
map->name, lock_str); return -EINVAL;
}
} else {
btf = reg->btf;
}
rec = reg_btf_record(reg); if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) {
verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local",
map ? map->name : "kptr", lock_str); return -EINVAL;
}
spin_lock_off = is_res_lock ? rec->res_spin_lock_off : rec->spin_lock_off; if (spin_lock_off != val + reg->off) {
verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n",
val + reg->off, lock_str, spin_lock_off); return -EINVAL;
} if (is_lock) { void *ptr; int type;
if (map)
ptr = map; else
ptr = btf;
if (!is_res_lock && cur->active_locks) { if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) {
verbose(env, "Locking two bpf_spin_locks are not allowed\n"); return -EINVAL;
}
} elseif (is_res_lock && cur->active_locks) { if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) {
verbose(env, "Acquiring the same lock again, AA deadlock detected\n"); return -EINVAL;
}
}
if (is_res_lock && is_irq)
type = REF_TYPE_RES_LOCK_IRQ; elseif (is_res_lock)
type = REF_TYPE_RES_LOCK; else
type = REF_TYPE_LOCK;
err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr); if (err < 0) {
verbose(env, "Failed to acquire lock state\n"); return err;
}
} else { void *ptr; int type;
if (map)
ptr = map; else
ptr = btf;
if (!cur->active_locks) {
verbose(env, "%s_unlock without taking a lock\n", lock_str); return -EINVAL;
}
if (is_res_lock && is_irq)
type = REF_TYPE_RES_LOCK_IRQ; elseif (is_res_lock)
type = REF_TYPE_RES_LOCK; else
type = REF_TYPE_LOCK; if (!find_lock_state(cur, type, reg->id, ptr)) {
verbose(env, "%s_unlock of different lock\n", lock_str); return -EINVAL;
} if (reg->id != cur->active_lock_id || ptr != cur->active_lock_ptr) {
verbose(env, "%s_unlock cannot be out of order\n", lock_str); return -EINVAL;
} if (release_lock_state(cur, type, reg->id, ptr)) {
verbose(env, "%s_unlock of different lock\n", lock_str); return -EINVAL;
}
if (!is_const) {
verbose(env, "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
regno); return -EINVAL;
} if (!map->btf) {
verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
map->name); return -EINVAL;
} if (!btf_record_has_field(map->record, BPF_TIMER)) {
verbose(env, "map '%s' has no valid bpf_timer\n", map->name); return -EINVAL;
} if (map->record->timer_off != val + reg->off) {
verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
val + reg->off, map->record->timer_off); return -EINVAL;
} if (meta->map_ptr) {
verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT;
} if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP;
}
meta->map_uid = reg->map_uid;
meta->map_ptr = map; return 0;
}
if (map->record->wq_off != val + reg->off) {
verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n",
val + reg->off, map->record->wq_off); return -EINVAL;
}
meta->map.uid = reg->map_uid;
meta->map.ptr = map; return 0;
}
if (type_is_ptr_alloc_obj(reg->type)) {
rec = reg_btf_record(reg);
} else { /* PTR_TO_MAP_VALUE */
map_ptr = reg->map_ptr; if (!map_ptr->btf) {
verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
map_ptr->name); return -EINVAL;
}
rec = map_ptr->record;
meta->map_ptr = map_ptr;
}
if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d doesn't have constant offset. kptr has to be at the constant offset\n",
regno); return -EINVAL;
}
if (!btf_record_has_field(rec, BPF_KPTR)) {
verbose(env, "R%d has no valid kptr\n", regno); return -EINVAL;
}
/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
 * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR.
 *
 * In both cases we deal with the first 8 bytes, but need to mark the next 8
 * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of
 * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object.
 *
 * Mutability of bpf_dynptr is at two levels, one is at the level of struct
 * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct
 * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can
 * mutate the view of the dynptr and also possibly destroy it. In the latter
 * case, it cannot mutate the bpf_dynptr itself but it can still mutate the
 * memory that dynptr points to.
 *
 * The verifier will keep track of both levels of mutation (bpf_dynptr's in
 * reg->type and the memory's in reg->dynptr.type), but there is no support for
 * readonly dynptr view yet, hence only the first case is tracked and checked.
 *
 * This is consistent with how C applies the const modifier to a struct object,
 * where the pointer itself inside bpf_dynptr becomes const but not what it
 * points to.
 *
 * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
 * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
*/ staticint process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id)
{ struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; int err;
if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
verbose(env, "arg#%d expected pointer to stack or const struct bpf_dynptr\n",
regno - 1); return -EINVAL;
}
/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
*/ if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
verifier_bug(env, "misconfigured dynptr helper type flags"); return -EFAULT;
}
/* MEM_UNINIT - Points to memory that is an appropriate candidate for * constructing a mutable bpf_dynptr object. * * Currently, this is only possible with PTR_TO_STACK * pointing to a region of at least 16 bytes which doesn't * contain an existing bpf_dynptr. * * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be * mutated or destroyed. However, the memory it points to * may be mutated. * * None - Points to a initialized dynptr that can be mutated and * destroyed, including mutation of the memory it points * to.
*/ if (arg_type & MEM_UNINIT) { int i;
if (!is_dynptr_reg_valid_uninit(env, reg)) {
verbose(env, "Dynptr has to be an uninitialized dynptr\n"); return -EINVAL;
}
/* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
err = check_mem_access(env, insn_idx, regno,
i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err;
}
err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id);
} else/* MEM_RDONLY and None case from above */ { /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); return -EINVAL;
}
if (!is_dynptr_reg_valid_init(env, reg)) {
verbose(env, "Expected an initialized dynptr as arg #%d\n",
regno - 1); return -EINVAL;
}
/* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
verbose(env, "Expected a dynptr of type %s as arg #%d\n",
dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1); return -EINVAL;
}
staticbool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, conststruct btf_param *arg)
{ /* btf_check_iter_kfuncs() guarantees that first argument of any iter * kfunc is iter state pointer
*/ if (is_iter_kfunc(meta)) return arg_idx == 0;
/* iter passed as an argument to a generic kfunc */ return btf_param_match_suffix(meta->btf, arg, "__iter");
}
staticint process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta)
{ struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; conststruct btf_type *t; int spi, err, i, nr_slots, btf_id;
if (reg->type != PTR_TO_STACK) {
verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1); return -EINVAL;
}
/* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs() * ensures struct convention, so we wouldn't need to do any BTF * validation here. But given iter state can be passed as a parameter * to any kfunc, if arg has "__iter" suffix, we need to be a bit more * conservative here.
*/
btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1); if (btf_id < 0) {
verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1); return -EINVAL;
}
t = btf_type_by_id(meta->btf, btf_id);
nr_slots = t->size / BPF_REG_SIZE;
if (is_iter_new_kfunc(meta)) { /* bpf_iter_<type>_new() expects pointer to uninit iter state */ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
verbose(env, "expected uninitialized iter_%s as arg #%d\n",
iter_type_str(meta->btf, btf_id), regno - 1); return -EINVAL;
}
for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
err = check_mem_access(env, insn_idx, regno,
i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err;
}
err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots); if (err) return err;
} else { /* iter_next() or iter_destroy(), as well as any kfunc * accepting iter argument, expect initialized iter state
*/
err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots); switch (err) { case 0: break; case -EINVAL:
verbose(env, "expected an initialized iter_%s as arg #%d\n",
iter_type_str(meta->btf, btf_id), regno - 1); return err; case -EPROTO:
verbose(env, "expected an RCU CS when using %s\n", meta->func_name); return err; default: return err;
}
err = mark_iter_read(env, reg, spi, nr_slots); if (err) return err;
/* remember meta->iter info for process_iter_next_call() */
meta->iter.spi = spi;
meta->iter.frameno = reg->frameno;
meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
if (is_iter_destroy_kfunc(meta)) {
err = unmark_stack_slots_iter(env, reg, nr_slots); if (err) return err;
}
}
return 0;
}
/* Look for a previous loop entry at insn_idx: nearest parent state * stopped at insn_idx with callsites matching those in cur->frame.
*/ staticstruct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, int insn_idx)
{ struct bpf_verifier_state_list *sl; struct bpf_verifier_state *st; struct list_head *pos, *head;
/* Explored states are pushed in stack order, most recent states come first */
head = explored_state(env, insn_idx);
list_for_each(pos, head) {
sl = container_of(pos, struct bpf_verifier_state_list, node); /* If st->branches != 0 state is a part of current DFS verification path, * hence cur & st for a loop.
*/
st = &sl->state; if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) &&
st->dfs_depth < cur->dfs_depth) return st;
}
/* process_iter_next_call() is called when verifier gets to iterator's next * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer * to it as just "iter_next()" in comments below. * * BPF verifier relies on a crucial contract for any iter_next() * implementation: it should *eventually* return NULL, and once that happens * it should keep returning NULL. That is, once iterator exhausts elements to * iterate, it should never reset or spuriously return new elements. * * With the assumption of such contract, process_iter_next_call() simulates * a fork in the verifier state to validate loop logic correctness and safety * without having to simulate infinite amount of iterations. * * In current state, we first assume that iter_next() returned NULL and * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such * conditions we should not form an infinite loop and should eventually reach * exit. * * Besides that, we also fork current state and enqueue it for later * verification. In a forked state we keep iterator state as ACTIVE * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We * also bump iteration depth to prevent erroneous infinite loop detection * later on (see iter_active_depths_differ() comment for details). In this * state we assume that we'll eventually loop back to another iter_next() * calls (it could be in exactly same location or in some other instruction, * it doesn't matter, we don't make any unnecessary assumptions about this, * everything revolves around iterator state in a stack slot, not which * instruction is calling iter_next()). When that happens, we either will come * to iter_next() with equivalent state and can conclude that next iteration * will proceed in exactly the same way as we just verified, so it's safe to * assume that loop converges. 
 * If not, we'll go on another iteration
 * simulation with a different input state, until all possible starting states
 * are validated or we reach maximum number of instructions limit.
 *
 * This way, we will either exhaustively discover all possible input states
 * that iterator loop can start with and eventually will converge, or we'll
 * effectively regress into bounded loop simulation logic and either reach
 * maximum number of instructions if loop is not provably convergent, or there
 * is some statically known limit on number of iterations (e.g., if there is
 * an explicit `if n > 100 then break;` statement somewhere in the loop).
 *
 * Iteration convergence logic in is_state_visited() relies on exact
 * states comparison, which ignores read and precision marks.
 * This is necessary because read and precision marks are not finalized
 * while in the loop. Exact comparison might preclude convergence for
 * simple programs like below:
 *
 *     i = 0;
 *     while(iter_next(&it))
 *       i++;
 *
 * At each iteration step i++ would produce a new distinct state and
 * eventually instruction processing limit would be reached.
 *
 * To avoid such behavior speculatively forget (widen) range for
 * imprecise scalar registers, if those registers were not precise at the
 * end of the previous iteration and do not match exactly.
 *
 * This is a conservative heuristic that allows verification of a wide range
 * of programs, however it precludes verification of programs that conjure an
 * imprecise value on the first loop iteration and use it as precise on a second.
 * For example, the following safe program would fail to verify:
 *
 *   struct bpf_num_iter it;
 *   int arr[10];
 *   int i = 0, a = 0;
 *   bpf_iter_num_new(&it, 0, 10);
 *   while (bpf_iter_num_next(&it)) {
 *     if (a == 0) {
 *       a = 1;
 *       i = 7; // Because i changed verifier would forget
 *              // its range on second loop entry.
 *     } else {
 *       arr[i] = 42; // This would fail to verify.
 *     }
 *   }
 *   bpf_iter_num_destroy(&it);
*/ staticint process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, struct bpf_kfunc_call_arg_meta *meta)
{ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st; struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr; struct bpf_reg_state *cur_iter, *queued_iter;
BTF_TYPE_EMIT(struct bpf_iter);
cur_iter = get_iter_from_state(cur_st, meta);
if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
verifier_bug(env, "unexpected iterator state %d (%s)",
cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); return -EFAULT;
}
if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) { /* Because iter_next() call is a checkpoint is_state_visitied() * should guarantee parent state with same call sites and insn_idx.
*/ if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx ||
!same_callsites(cur_st->parent, cur_st)) {
verifier_bug(env, "bad parent state for iter next call"); return -EFAULT;
} /* Note cur_st->parent in the call below, it is necessary to skip * checkpoint created for cur_st by is_state_visited() * right at this instruction.
*/
prev_st = find_prev_entry(env, cur_st->parent, insn_idx); /* branch out active iter state */
queued_st = push_stack(env, insn_idx + 1, insn_idx, false); if (!queued_st) return -ENOMEM;
/* switch to DRAINED state, but keep the depth unchanged */ /* mark current iter state as drained and assume returned NULL */
cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
__mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]);
return 0;
}
/* Report whether @type is one of the two memory-size argument types,
 * ARG_CONST_SIZE or ARG_CONST_SIZE_OR_ZERO. Any other value yields false.
 */
static bool arg_type_is_mem_size(enum bpf_arg_type type)
{
	switch (type) {
	case ARG_CONST_SIZE:
	case ARG_CONST_SIZE_OR_ZERO:
		return true;
	default:
		return false;
	}
}
compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) {
verifier_bug(env, "unsupported arg type %d", arg_type); return -EFAULT;
}
/* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY, * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY * * Same for MAYBE_NULL: * * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL * * ARG_PTR_TO_MEM is compatible with PTR_TO_MEM that is tagged with a dynptr type. * * Therefore we fold these flags depending on the arg_type before comparison.
*/ if (arg_type & MEM_RDONLY)
type &= ~MEM_RDONLY; if (arg_type & PTR_MAYBE_NULL)
type &= ~PTR_MAYBE_NULL; if (base_type(arg_type) == ARG_PTR_TO_MEM)
type &= ~DYNPTR_TYPE_FLAG_MASK;
/* Local kptr types are allowed as the source argument of bpf_kptr_xchg */ if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) {
type &= ~MEM_ALLOC;
type &= ~MEM_PERCPU;
}
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
expected = compatible->types[i]; if (expected == NOT_INIT) break;
found: if (base_type(reg->type) != PTR_TO_BTF_ID) return 0;
if (compatible == &mem_types) { if (!(arg_type & MEM_RDONLY)) {
verbose(env, "%s() may write into memory pointed by R%d type=%s\n",
func_id_name(meta->func_id),
regno, reg_type_str(env, reg->type)); return -EACCES;
} return 0;
}
switch ((int)reg->type) { case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL: case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | PTR_MAYBE_NULL: case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
{ /* For bpf_sk_release, it needs to match against first member * 'struct sock_common', hence make an exception for it. This * allows bpf_sk_release to work for multiple socket types.
*/ bool strict_type_match = arg_type_is_release(arg_type) &&
meta->func_id != BPF_FUNC_sk_release;
if (type_may_be_null(reg->type) &&
(!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) {
verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno); return -EACCES;
}
if (!arg_btf_id) { if (!compatible->btf_id) {
verifier_bug(env, "missing arg compatible BTF ID"); return -EFAULT;
}
arg_btf_id = compatible->btf_id;
}
if (meta->func_id == BPF_FUNC_kptr_xchg) { if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES;
} else { if (arg_btf_id == BPF_PTR_POISON) {
verbose(env, "verifier internal error:");
verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
regno); return -EACCES;
}
if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
btf_vmlinux, *arg_btf_id,
strict_type_match)) {
verbose(env, "R%d is of type %s but %s is expected\n",
regno, btf_type_name(reg->btf, reg->btf_id),
btf_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES;
}
} break;
} case PTR_TO_BTF_ID | MEM_ALLOC: case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
meta->func_id != BPF_FUNC_kptr_xchg) {
verifier_bug(env, "unimplemented handling of MEM_ALLOC"); return -EFAULT;
} /* Check if local kptr in src arg matches kptr in dst arg */ if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) { if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES;
} break; case PTR_TO_BTF_ID | MEM_PERCPU: case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU: case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED: /* Handled by helper specific checks */ break; default:
verifier_bug(env, "invalid PTR_TO_BTF_ID register for type match"); return -EFAULT;
} return 0;
}
field = btf_record_find(rec, off, fields); if (!field) return NULL;
return field;
}
staticint check_func_arg_reg_off(struct bpf_verifier_env *env, conststruct bpf_reg_state *reg, int regno, enum bpf_arg_type arg_type)
{
u32 type = reg->type;
/* When referenced register is passed to release function, its fixed * offset must be 0. * * We will check arg_type_is_release reg has ref_obj_id when storing * meta->release_regno.
*/ if (arg_type_is_release(arg_type)) { /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it * may not directly point to the object being released, but to * dynptr pointing to such object, which might be at some offset * on the stack. In that case, we simply to fallback to the * default handling.
*/ if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK) return 0;
/* Doing check_ptr_off_reg check for the offset will catch this * because fixed_off_ok is false, but checking here allows us * to give the user a better error message.
*/ if (reg->off) {
verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
regno); return -EINVAL;
} return __check_ptr_off_reg(env, reg, regno, false);
}
switch (type) { /* Pointer types where both fixed and variable offset is explicitly allowed: */ case PTR_TO_STACK: case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: case PTR_TO_MEM: case PTR_TO_MEM | MEM_RDONLY: case PTR_TO_MEM | MEM_RINGBUF: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case PTR_TO_ARENA: case SCALAR_VALUE: return 0; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows * fixed offset.
*/ case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | MEM_ALLOC: case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF: case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU: /* When referenced PTR_TO_BTF_ID is passed to release function, * its fixed offset must be 0. In the other cases, fixed offset * can be non-zero. This was already checked above. So pass * fixed_off_ok as true to allow fixed offset for all other * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we * still need to do checks instead of returning.
*/ return __check_ptr_off_reg(env, reg, regno, true); default: return __check_ptr_off_reg(env, reg, regno, false);
}
}
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) if (arg_type_is_dynptr(fn->arg_type[i])) { if (state) {
verbose(env, "verifier internal error: multiple dynptr args\n"); return NULL;
}
state = ®s[BPF_REG_1 + i];
}
if (!state)
verbose(env, "verifier internal error: no dynptr arg found\n");
/* First handle precisely tracked STACK_ZERO */ for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--)
zero_size++; if (zero_size >= key_size) {
*value = 0; return 0;
}
/* Check that stack contains a scalar spill of expected size */ if (!is_spilled_scalar_reg(&state->stack[spi])) return -EOPNOTSUPP; for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--)
spill_size++; if (spill_size != key_size) return -EOPNOTSUPP;
reg = &state->stack[spi].spilled_ptr; if (!tnum_is_const(reg->var_off)) /* Stack value not statically known */ return -EOPNOTSUPP;
/* We are relying on a constant value. So mark as precise * to prevent pruning on it.
*/
bt_set_frame_slot(&env->bt, key->frameno, spi);
err = mark_chain_precision_batch(env, env->cur_state); if (err < 0) return err;
err = check_reg_arg(env, regno, SRC_OP); if (err) return err;
if (arg_type == ARG_ANYTHING) { if (is_pointer_value(env, regno)) {
verbose(env, "R%d leaks addr into helper function\n",
regno); return -EACCES;
} return 0;
}
if (type_is_pkt_pointer(type) &&
!may_access_direct_pkt_data(env, meta, BPF_READ)) {
verbose(env, "helper access to the packet is not allowed\n"); return -EACCES;
}
if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) {
err = resolve_map_arg_type(env, meta, &arg_type); if (err) return err;
}
if (register_is_null(reg) && type_may_be_null(arg_type)) /* A NULL register has a SCALAR_VALUE type, so skip * type checking.
*/ goto skip_type_check;
/* arg_btf_id and arg_size are in a union. */ if (base_type(arg_type) == ARG_PTR_TO_BTF_ID ||
base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
arg_btf_id = fn->arg_btf_id[arg];
err = check_func_arg_reg_off(env, reg, regno, arg_type); if (err) return err;
skip_type_check: if (arg_type_is_release(arg_type)) { if (arg_type_is_dynptr(arg_type)) { struct bpf_func_state *state = func(env, reg); int spi;
/* Only dynptr created on stack can be released, thus * the get_spi and stack state checks for spilled_ptr * should only be done before process_dynptr_func for * PTR_TO_STACK.
*/ if (reg->type == PTR_TO_STACK) {
spi = dynptr_get_spi(env, reg); if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) {
verbose(env, "arg %d is an unacquired reference\n", regno); return -EINVAL;
}
} else {
verbose(env, "cannot release unowned const bpf_dynptr\n"); return -EINVAL;
}
} elseif (!reg->ref_obj_id && !register_is_null(reg)) {
verbose(env, "R%d must be referenced when passed to release function\n",
regno); return -EINVAL;
} if (meta->release_regno) {
verifier_bug(env, "more than one release argument"); return -EFAULT;
}
meta->release_regno = regno;
}
if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { if (meta->ref_obj_id) {
verbose(env, "more than one arg with ref_obj_id R%d %u %u",
regno, reg->ref_obj_id,
meta->ref_obj_id); return -EACCES;
}
meta->ref_obj_id = reg->ref_obj_id;
}
switch (base_type(arg_type)) { case ARG_CONST_MAP_PTR: /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ if (meta->map_ptr) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) * if (inner_map1 && inner_map2) { * timer = bpf_map_lookup_elem(inner_map1); * if (timer) * // mismatch would have been allowed * bpf_timer_init(timer, inner_map2); * } * * Comparing map_ptr is enough to distinguish normal and outer maps.
*/ if (meta->map_ptr != reg->map_ptr ||
meta->map_uid != reg->map_uid) {
verbose(env, "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
meta->map_uid, reg->map_uid); return -EINVAL;
}
}
meta->map_ptr = reg->map_ptr;
meta->map_uid = reg->map_uid; break; case ARG_PTR_TO_MAP_KEY: /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within * stack limits and initialized
*/ if (!meta->map_ptr) { /* in function declaration map_ptr must come before * map_key, so that it's verified and known before * we have to check map_key here. Otherwise it means * that kernel subsystem misconfigured verifier
*/
verifier_bug(env, "invalid map_ptr to access map->key"); return -EFAULT;
}
key_size = meta->map_ptr->key_size;
err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); if (err) return err; if (can_elide_value_nullness(meta->map_ptr->map_type)) {
err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); if (err < 0) {
meta->const_map_key = -1; if (err == -EOPNOTSUPP)
err = 0; else return err;
}
} break; case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) return 0;
/* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity
*/ if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */
verifier_bug(env, "invalid map_ptr to access map->value"); return -EFAULT;
}
meta->raw_mode = arg_type & MEM_UNINIT;
err = check_helper_mem_access(env, regno, meta->map_ptr->value_size,
arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; case ARG_PTR_TO_PERCPU_BTF_ID: if (!reg->btf_id) {
verbose(env, "Helper has invalid btf_id in R%d\n", regno); return -EACCES;
}
meta->ret_btf = reg->btf;
meta->ret_btf_id = reg->btf_id; break; case ARG_PTR_TO_SPIN_LOCK: if (in_rbtree_lock_required_cb(env)) {
verbose(env, "can't spin_{lock,unlock} in rbtree cb\n"); return -EACCES;
} if (meta->func_id == BPF_FUNC_spin_lock) {
err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); if (err) return err;
} elseif (meta->func_id == BPF_FUNC_spin_unlock) {
err = process_spin_lock(env, regno, 0); if (err) return err;
} else {
verifier_bug(env, "spin lock arg on unexpected helper"); return -EFAULT;
} break; case ARG_PTR_TO_TIMER:
err = process_timer_func(env, regno, meta); if (err) return err; break; case ARG_PTR_TO_FUNC:
meta->subprogno = reg->subprogno; break; case ARG_PTR_TO_MEM: /* The access to this pointer is only checked when we hit the * next is_mem_size argument below.
*/
meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) {
err = check_helper_mem_access(env, regno, fn->arg_size[arg],
arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); if (err) return err; if (arg_type & MEM_ALIGNED)
err = check_ptr_alignment(env, reg, 0, fn->arg_size[arg], true);
} break; case ARG_CONST_SIZE:
err = check_mem_size_reg(env, reg, regno,
fn->arg_type[arg - 1] & MEM_WRITE ?
BPF_WRITE : BPF_READ, false, meta); break; case ARG_CONST_SIZE_OR_ZERO:
err = check_mem_size_reg(env, reg, regno,
fn->arg_type[arg - 1] & MEM_WRITE ?
BPF_WRITE : BPF_READ, true, meta); break; case ARG_PTR_TO_DYNPTR:
err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); if (err) return err; break; case ARG_CONST_ALLOC_SIZE_OR_ZERO: if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d is not a known constant'\n",
regno); return -EACCES;
}
meta->mem_size = reg->var_off.value;
err = mark_chain_precision(env, regno); if (err) return err; break; case ARG_PTR_TO_CONST_STR:
{
err = check_reg_const_str(env, reg, regno); if (err) return err; break;
} case ARG_KPTR_XCHG_DEST:
err = process_kptr_func(env, regno, meta); if (err) return err; break;
}
return err;
}
staticbool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
{ enum bpf_attach_type eatype = env->prog->expected_attach_type; enum bpf_prog_type type = resolve_prog_type(env->prog);
if (func_id != BPF_FUNC_map_update_elem &&
func_id != BPF_FUNC_map_delete_elem) returnfalse;
/* It's not possible to get access to a locked struct sock in these * contexts, so updating is safe.
*/ switch (type) { case BPF_PROG_TYPE_TRACING: if (eatype == BPF_TRACE_ITER) returntrue; break; case BPF_PROG_TYPE_SOCK_OPS: /* map_update allowed only via dedicated helpers with event type checks */ if (func_id == BPF_FUNC_map_delete_elem) returntrue; break; case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_SK_REUSEPORT: case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: returntrue; default: break;
}
verbose(env, "cannot update sockmap in this context\n"); returnfalse;
}
staticint check_map_func_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, int func_id)
{ if (!map) return 0;
/* We need a two way check, first is from map perspective ... */ switch (map->map_type) { case BPF_MAP_TYPE_PROG_ARRAY: if (func_id != BPF_FUNC_tail_call) goto error; break; case BPF_MAP_TYPE_PERF_EVENT_ARRAY: if (func_id != BPF_FUNC_perf_event_read &&
func_id != BPF_FUNC_perf_event_output &&
func_id != BPF_FUNC_skb_output &&
func_id != BPF_FUNC_perf_event_read_value &&
func_id != BPF_FUNC_xdp_output) goto error; break; case BPF_MAP_TYPE_RINGBUF: if (func_id != BPF_FUNC_ringbuf_output &&
func_id != BPF_FUNC_ringbuf_reserve &&
func_id != BPF_FUNC_ringbuf_query &&
func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
func_id != BPF_FUNC_ringbuf_submit_dynptr &&
func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; case BPF_MAP_TYPE_USER_RINGBUF: if (func_id != BPF_FUNC_user_ringbuf_drain) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; break; case BPF_MAP_TYPE_CGROUP_ARRAY: if (func_id != BPF_FUNC_skb_under_cgroup &&
func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; case BPF_MAP_TYPE_CGROUP_STORAGE: case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: if (func_id != BPF_FUNC_get_local_storage) goto error; break; case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP_HASH: if (func_id != BPF_FUNC_redirect_map &&
func_id != BPF_FUNC_map_lookup_elem) goto error; break; /* Restrict bpf side of cpumap and xskmap, open when use-cases * appear.
*/ case BPF_MAP_TYPE_CPUMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; case BPF_MAP_TYPE_XSKMAP: if (func_id != BPF_FUNC_redirect_map &&
func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_SOCKMAP: if (func_id != BPF_FUNC_sk_redirect_map &&
func_id != BPF_FUNC_sock_map_update &&
func_id != BPF_FUNC_msg_redirect_map &&
func_id != BPF_FUNC_sk_select_reuseport &&
func_id != BPF_FUNC_map_lookup_elem &&
!may_update_sockmap(env, func_id)) goto error; break; case BPF_MAP_TYPE_SOCKHASH: if (func_id != BPF_FUNC_sk_redirect_hash &&
func_id != BPF_FUNC_sock_hash_update &&
func_id != BPF_FUNC_msg_redirect_hash &&
func_id != BPF_FUNC_sk_select_reuseport &&
func_id != BPF_FUNC_map_lookup_elem &&
!may_update_sockmap(env, func_id)) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: if (func_id != BPF_FUNC_sk_select_reuseport) goto error; break; case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: if (func_id != BPF_FUNC_map_peek_elem &&
func_id != BPF_FUNC_map_pop_elem &&
func_id != BPF_FUNC_map_push_elem) goto error; break; case BPF_MAP_TYPE_SK_STORAGE: if (func_id != BPF_FUNC_sk_storage_get &&
func_id != BPF_FUNC_sk_storage_delete &&
func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_INODE_STORAGE: if (func_id != BPF_FUNC_inode_storage_get &&
func_id != BPF_FUNC_inode_storage_delete &&
func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_TASK_STORAGE: if (func_id != BPF_FUNC_task_storage_get &&
func_id != BPF_FUNC_task_storage_delete &&
func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_CGRP_STORAGE: if (func_id != BPF_FUNC_cgrp_storage_get &&
func_id != BPF_FUNC_cgrp_storage_delete &&
func_id != BPF_FUNC_kptr_xchg) goto error; break; case BPF_MAP_TYPE_BLOOM_FILTER: if (func_id != BPF_FUNC_map_peek_elem &&
func_id != BPF_FUNC_map_push_elem) goto error; break; default: break;
}
/* ... and second from the function itself. */ switch (func_id) { case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) {
verbose(env, "mixing of tail_calls and bpf-to-bpf calls is not supported\n"); return -EINVAL;
} break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: case BPF_FUNC_perf_event_read_value: case BPF_FUNC_skb_output: case BPF_FUNC_xdp_output: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; case BPF_FUNC_ringbuf_output: case BPF_FUNC_ringbuf_reserve: case BPF_FUNC_ringbuf_query: case BPF_FUNC_ringbuf_reserve_dynptr: case BPF_FUNC_ringbuf_submit_dynptr: case BPF_FUNC_ringbuf_discard_dynptr: if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; case BPF_FUNC_user_ringbuf_drain: if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF) goto error; break; case BPF_FUNC_get_stackid: if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; case BPF_FUNC_current_task_under_cgroup: case BPF_FUNC_skb_under_cgroup: if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) goto error; break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
map->map_type != BPF_MAP_TYPE_CPUMAP &&
map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; break; case BPF_FUNC_sk_redirect_map: case BPF_FUNC_msg_redirect_map: case BPF_FUNC_sock_map_update: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; case BPF_FUNC_sk_redirect_hash: case BPF_FUNC_msg_redirect_hash: case BPF_FUNC_sock_hash_update: if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; case BPF_FUNC_get_local_storage: if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) goto error; break; case BPF_FUNC_sk_select_reuseport: if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY &&
map->map_type != BPF_MAP_TYPE_SOCKMAP &&
map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; case BPF_FUNC_map_pop_elem: if (map->map_type != BPF_MAP_TYPE_QUEUE &&
map->map_type != BPF_MAP_TYPE_STACK) goto error; break; case BPF_FUNC_map_peek_elem: case BPF_FUNC_map_push_elem: if (map->map_type != BPF_MAP_TYPE_QUEUE &&
map->map_type != BPF_MAP_TYPE_STACK &&
map->map_type != BPF_MAP_TYPE_BLOOM_FILTER) goto error; break; case BPF_FUNC_map_lookup_percpu_elem: if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH) goto error; break; case BPF_FUNC_sk_storage_get: case BPF_FUNC_sk_storage_delete: if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) goto error; break; case BPF_FUNC_inode_storage_get: case BPF_FUNC_inode_storage_delete: if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) goto error; break; case BPF_FUNC_task_storage_get: case BPF_FUNC_task_storage_delete: if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) goto error; break; case BPF_FUNC_cgrp_storage_get: case BPF_FUNC_cgrp_storage_delete: if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) goto error; break; default: break;
}
staticbool check_raw_mode_ok(conststruct bpf_func_proto *fn)
{ int count = 0;
if (arg_type_is_raw_mem(fn->arg1_type))
count++; if (arg_type_is_raw_mem(fn->arg2_type))
count++; if (arg_type_is_raw_mem(fn->arg3_type))
count++; if (arg_type_is_raw_mem(fn->arg4_type))
count++; if (arg_type_is_raw_mem(fn->arg5_type))
count++;
/* We only support one arg being in raw mode at the moment, * which is sufficient for the helper functions we have * right now.
*/ return count <= 1;
}
staticbool check_arg_pair_ok(conststruct bpf_func_proto *fn)
{ /* bpf_xxx(..., buf, len) call will access 'len' * bytes from memory 'buf'. Both arg types need * to be paired, so make sure there's no buggy * helper function specification.
*/ if (arg_type_is_mem_size(fn->arg1_type) ||
check_args_pair_invalid(fn, 0) ||
check_args_pair_invalid(fn, 1) ||
check_args_pair_invalid(fn, 2) ||
check_args_pair_invalid(fn, 3) ||
check_args_pair_invalid(fn, 4)) returnfalse;
returntrue;
}
staticbool check_btf_id_ok(conststruct bpf_func_proto *fn)
{ int i;
for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID) return !!fn->arg_btf_id[i]; if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK) return fn->arg_btf_id[i] == BPF_PTR_POISON; if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] && /* arg_btf_id and arg_size are in a union. */
(base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM ||
!(fn->arg_type[i] & MEM_FIXED_SIZE))) returnfalse;
}
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. * * This also applies to dynptr slices belonging to skb and xdp dynptrs, * since these slices point to packet data.
*/ staticvoid clear_all_pkt_pointers(struct bpf_verifier_env *env)
{ struct bpf_func_state *state; struct bpf_reg_state *reg;
if (reg->type != PTR_TO_PACKET) /* PTR_TO_PACKET_META is not supported yet */ return;
/* The 'reg' is pkt > pkt_end or pkt >= pkt_end. * How far beyond pkt_end it goes is unknown. * if (!range_open) it's the case of pkt >= pkt_end * if (range_open) it's the case of pkt > pkt_end * hence this pointer is at least 1 byte bigger than pkt_end
*/ if (range_open)
reg->range = BEYOND_PKT_END; else
reg->range = AT_PKT_END;
}
/*
 * release_reference_nomark() - drop one acquired pointer reference from the
 * verifier state's reference table.
 * @state:      verifier state whose refs[] array is searched
 * @ref_obj_id: id of the reference to release
 *
 * Walks the first state->acquired_refs entries of state->refs[], ignoring
 * entries whose type is not REF_TYPE_PTR, and hands the first entry whose id
 * equals @ref_obj_id to release_reference_state().
 *
 * Return: 0 when a matching entry was found and released, -EINVAL when no
 * REF_TYPE_PTR entry carries @ref_obj_id.
 */
static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id)
{
	int i;

	for (i = 0; i < state->acquired_refs; i++) {
		if (state->refs[i].type != REF_TYPE_PTR)
			continue;
		if (state->refs[i].id == ref_obj_id) {
			release_reference_state(state, i);
			return 0;
		}
	}

	return -EINVAL;
}
/* The pointer with the specified id has released its reference to kernel * resources. Identify all copies of the same pointer and clear the reference. * * This is the release function corresponding to acquire_reference(). Idempotent.
*/ staticint release_reference(struct bpf_verifier_env *env, int ref_obj_id)
{ struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state; struct bpf_reg_state *reg; int err;
err = release_reference_nomark(vstate, ref_obj_id); if (err) return err;
bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ if (type_is_non_owning_ref(reg->type))
mark_reg_invalid(env, reg);
}));
}
/*
 * clear_caller_saved_regs() - invalidate the caller-saved registers after a
 * call.
 * @env:  verifier environment
 * @regs: register array of the frame whose registers are being scratched
 *
 * For each of the CALLER_SAVED_REGS entries in caller_saved[], the register is
 * marked as not initialised and then passed through __check_reg_arg() with
 * DST_OP_NO_MARK.
 */
static void clear_caller_saved_regs(struct bpf_verifier_env *env,
				    struct bpf_reg_state *regs)
{
	int i;

	/* after the call registers r0 - r5 were scratched */
	for (i = 0; i < CALLER_SAVED_REGS; i++) {
		mark_reg_not_init(env, regs, caller_saved[i]);
		__check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK);
	}
}
/* callee cannot access r0, r6 - r9 for reading and has to write * into its own stack before reading from it. * callee can read/write into caller's stack
*/
init_func_state(env, callee, /* remember the callsite, it will be used by bpf_exit */
callsite,
state->curframe + 1 /* frameno within this callchain */,
subprog /* subprog number within this prog */);
err = set_callee_state_cb(env, caller, callee, callsite); if (err) goto err_out;
/* only increment it after check_reg_arg() finished */
state->curframe++;
ret = btf_prepare_func_args(env, subprog); if (ret) return ret;
/* check that BTF function arguments match actual types that the * verifier sees.
*/ for (i = 0; i < sub->arg_cnt; i++) {
u32 regno = i + 1; struct bpf_reg_state *reg = ®s[regno]; struct bpf_subprog_arg_info *arg = &sub->args[i];
if (arg->arg_type == ARG_ANYTHING) { if (reg->type != SCALAR_VALUE) {
bpf_log(log, "R%d is not a scalar\n", regno); return -EINVAL;
}
} elseif (arg->arg_type & PTR_UNTRUSTED) { /* * Anything is allowed for untrusted arguments, as these are * read-only and probe read instructions would protect against * invalid memory access.
*/
} elseif (arg->arg_type == ARG_PTR_TO_CTX) {
ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); if (ret < 0) return ret; /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX.
*/ if (reg->type != PTR_TO_CTX) {
bpf_log(log, "arg#%d expects pointer to ctx\n", i); return -EINVAL;
}
} elseif (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); if (ret < 0) return ret; if (check_mem_reg(env, reg, regno, arg->mem_size)) return -EINVAL; if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) {
bpf_log(log, "arg#%d is expected to be non-NULL\n", i); return -EINVAL;
}
} elseif (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { /* * Can pass any value and the kernel won't crash, but * only PTR_TO_ARENA or SCALAR make sense. Everything * else is a bug in the bpf program. Point it out to * the user at the verification time instead of * run-time debug nightmare.
*/ if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) {
bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno); return -EINVAL;
}
} elseif (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR); if (ret) return ret;
ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); if (ret) return ret;
} elseif (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { struct bpf_call_arg_meta meta; int err;
if (register_is_null(reg) && type_may_be_null(arg->arg_type)) continue;
memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta);
err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); if (err) return err;
} else {
verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type); return -EFAULT;
}
}
return 0;
}
/* Compare BTF of a function call with given bpf_reg_state.
 *
 * @env:     verifier environment; env->prog supplies BTF and func_info
 * @subprog: index of the called subprogram
 * @regs:    caller's register state holding the arguments
 *
 * The actual argument comparison is delegated to btf_check_func_arg_match().
 * If that comparison fails for any reason, the subprogram is flagged as
 * unreliable in func_info_aux, so every later call for the same subprogram
 * returns -EINVAL without re-checking.
 *
 * Returns:
 * -EFAULT - the subprogram has no BTF type id recorded in func_info.
 * -EINVAL - func_info is not available, or the subprogram was already
 *           marked unreliable.
 * 0       - BTF matches with what bpf_reg_state expects.
 * Any other value is the error propagated from btf_check_func_arg_match().
 */
static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
				  struct bpf_reg_state *regs)
{
	struct bpf_prog *prog = env->prog;
	struct btf *btf = prog->aux->btf;
	u32 btf_id;
	int err;

	if (!prog->aux->func_info)
		return -EINVAL;

	btf_id = prog->aux->func_info[subprog].type_id;
	if (!btf_id)
		return -EFAULT;

	if (prog->aux->func_info_aux[subprog].unreliable)
		return -EINVAL;

	err = btf_check_func_arg_match(env, subprog, btf, regs);
	/* Compiler optimizations can remove arguments from static functions
	 * or mismatched type can be passed into a global function.
	 * In such cases mark the function as unreliable from BTF point of view.
	 */
	if (err)
		prog->aux->func_info_aux[subprog].unreliable = true;

	return err;
}
staticint push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int insn_idx, int subprog,
set_callee_state_fn set_callee_state_cb)
{ struct bpf_verifier_state *state = env->cur_state, *callback_state; struct bpf_func_state *caller, *callee; int err;
/* set_callee_state is used for direct subprog calls, but we are * interested in validating only BPF helpers that can call subprogs as * callbacks
*/
env->subprog_info[subprog].is_cb = true; if (bpf_pseudo_kfunc_call(insn) &&
!is_callback_calling_kfunc(insn->imm)) {
verifier_bug(env, "kfunc %s#%d not marked as callback-calling",
func_id_name(insn->imm), insn->imm); return -EFAULT;
} elseif (!bpf_pseudo_kfunc_call(insn) &&
!is_callback_calling_function(insn->imm)) { /* helper */
verifier_bug(env, "helper %s#%d not marked as callback-calling",
func_id_name(insn->imm), insn->imm); return -EFAULT;
}
if (is_async_callback_calling_insn(insn)) { struct bpf_verifier_state *async_cb;
/* there is no real recursion here. timer and workqueue callbacks are async */
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
insn_idx, subprog,
is_bpf_wq_set_callback_impl_kfunc(insn->imm)); if (!async_cb) return -EFAULT;
callee = async_cb->frame[0];
callee->async_entry_cnt = caller->async_entry_cnt + 1;
/* Convert bpf_timer_set_callback() args into timer callback args */
err = set_callee_state_cb(env, caller, callee, insn_idx); if (err) return err;
return 0;
}
/* for callback functions enqueue entry to callback and * proceed with next instruction within current frame.
*/
callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false); if (!callback_state) return -ENOMEM;
target_insn = *insn_idx + insn->imm + 1;
subprog = find_subprog(env, target_insn); if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program",
target_insn)) return -EFAULT;
if (env->cur_state->active_locks) {
verbose(env, "global function calls are not allowed while holding a lock,\n" "use static function instead\n"); return -EINVAL;
}
if (env->subprog_info[subprog].might_sleep &&
(env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks ||
env->cur_state->active_irq_id || !in_sleepable(env))) {
verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" "a non-sleepable BPF program context\n"); return -EINVAL;
}
if (err) {
verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
subprog, sub_name); return err;
}
verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
subprog, sub_name); if (env->subprog_info[subprog].changes_pkt_data)
clear_all_pkt_pointers(env); /* mark global subprog for verifying after main prog */
subprog_aux(env, subprog)->called = true;
clear_caller_saved_regs(env, caller->regs);
/* All global functions return a 64-bit SCALAR_VALUE */
mark_reg_unknown(env, caller->regs, BPF_REG_0);
caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
/* continue with next insn after call */ return 0;
}
/* for regular function entry setup new frame and continue * from that frame.
*/
err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state); if (err) return err;
clear_caller_saved_regs(env, caller->regs);
/* and go analyze first insn of the callee */
*insn_idx = env->subprog_info[subprog].start - 1;
/*
 * set_callee_state() - seed a callee frame's argument registers from the
 * caller for a direct subprogram call.
 * @env:      verifier environment (not used by this implementation)
 * @caller:   frame performing the call
 * @callee:   freshly set-up frame being entered
 * @insn_idx: index of the call instruction (not used by this implementation)
 *
 * Return: always 0.
 */
static int set_callee_state(struct bpf_verifier_env *env,
			    struct bpf_func_state *caller,
			    struct bpf_func_state *callee, int insn_idx)
{
	int i;

	/* copy r1 - r5 args that callee can access.  The copy includes parent
	 * pointers, which connects us up to the liveness chain
	 */
	for (i = BPF_REG_1; i <= BPF_REG_5; i++)
		callee->regs[i] = caller->regs[i];

	return 0;
}
/* valid map_ptr and poison value does not matter */
map = insn_aux->map_ptr_state.map_ptr; if (!map->ops->map_set_for_each_callback_args ||
!map->ops->map_for_each_callback) {
verbose(env, "callback function not allowed for map\n"); return -ENOTSUPP;
}
err = map->ops->map_set_for_each_callback_args(env, caller, callee); if (err) return err;
staticint set_rbtree_add_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx)
{ /* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, * bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)); * * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd * by this point, so look at 'root'
*/ struct btf_field *field;
field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off,
BPF_RB_ROOT); if (!field || !field->graph_root.value_btf_id) return -EFAULT;
/* Are we currently verifying the callback for a rbtree helper that must * be called with lock held? If so, no need to complain about unreleased * lock
*/ staticbool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
{ struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insn = env->prog->insnsi; struct bpf_func_state *callee; int kfunc_btf_id;
callee = state->frame[state->curframe];
r0 = &callee->regs[BPF_REG_0]; if (r0->type == PTR_TO_STACK) { /* technically it's ok to return caller's stack pointer * (or caller's caller's pointer) back to the caller, * since these pointers are valid. Only current stack * pointer will be invalid as soon as function exits, * but let's be conservative
*/
verbose(env, "cannot return stack pointer to the caller\n"); return -EINVAL;
}
caller = state->frame[state->curframe - 1]; if (callee->in_callback_fn) { if (r0->type != SCALAR_VALUE) {
verbose(env, "R0 not a scalar value\n"); return -EACCES;
}
/* we are going to rely on register's precise value */
err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64);
err = err ?: mark_chain_precision(env, BPF_REG_0); if (err) return err;
/* enforce R0 return value range, and bpf_callback_t returns 64bit */ if (!retval_range_within(callee->callback_ret_range, r0, false)) {
verbose_invalid_scalar(env, r0, callee->callback_ret_range, "At callback return", "R0"); return -EINVAL;
} if (!calls_callback(env, callee->callsite)) {
verifier_bug(env, "in callback at %d, callsite %d !calls_callback",
*insn_idx, callee->callsite); return -EFAULT;
}
} else { /* return to the caller whatever r0 had in the callee */
caller->regs[BPF_REG_0] = *r0;
}
/* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite, * there function call logic would reschedule callback visit. If iteration * converges is_state_visited() would prune that visit eventually.
*/
in_callback_fn = callee->in_callback_fn; if (in_callback_fn)
*insn_idx = callee->callsite; else
*insn_idx = callee->callsite + 1;
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
print_verifier_state(env, state, callee->frameno, true);
verbose(env, "to caller at %d:\n", *insn_idx);
print_verifier_state(env, state, caller->frameno, true);
} /* clear everything in the callee. In case of exceptional exits using
* bpf_throw, this will be done by copy_verifier_state for extra frames. */
free_func_state(callee);
state->frame[state->curframe--] = NULL;
/* for callbacks widen imprecise scalars to make programs like below verify: * * struct ctx { int i; } * void cb(int idx, struct ctx *ctx) { ctx->i++; ... } * ... * struct ctx = { .i = 0; } * bpf_loop(100, cb, &ctx, 0); * * This is similar to what is done in process_iter_next_call() for open * coded iterators.
*/
prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL; if (prev_st) {
err = widen_imprecise_scalars(env, prev_st, state); if (err) return err;
} return 0;
}
staticint do_refine_retval_range(struct bpf_verifier_env *env, struct bpf_reg_state *regs, int ret_type, int func_id, struct bpf_call_arg_meta *meta)
{ struct bpf_reg_state *ret_reg = ®s[BPF_REG_0];
if (map == NULL) {
verifier_bug(env, "expected map for helper call"); return -EFAULT;
}
/* In case of read-only, some additional restrictions * need to be applied in order to prevent altering the * state of the map from program side.
*/ if ((map->map_flags & BPF_F_RDONLY_PROG) &&
(func_id == BPF_FUNC_map_delete_elem ||
func_id == BPF_FUNC_map_update_elem ||
func_id == BPF_FUNC_map_push_elem ||
func_id == BPF_FUNC_map_pop_elem)) {
verbose(env, "write into map forbidden\n"); return -EACCES;
}
if (!exception_exit && cur_func(env)->frameno) return 0;
for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_PTR) continue; /* Allow struct_ops programs to return a referenced kptr back to * kernel. Type checks are performed later in check_return_code.
*/ if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit &&
reg->ref_obj_id == state->refs[i].id) continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
refs_lingering = true;
} return refs_lingering ? -EINVAL : 0;
}
/* data must be an array of u64 */ if (data_len_reg->var_off.value % 8) return -EINVAL;
num_args = data_len_reg->var_off.value / 8;
/* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const * and map_direct_value_addr is set.
*/
fmt_map_off = fmt_reg->off + fmt_reg->var_off.value;
err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
fmt_map_off); if (err) {
verbose(env, "failed to retrieve map value address\n"); return -EFAULT;
}
fmt = (char *)(long)fmt_addr + fmt_map_off;
/* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we * can focus on validating the format specifiers.
*/
err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, num_args, &data); if (err < 0)
verbose(env, "Invalid format string\n");
return err;
}
/*
 * check_get_func_ip() - check that BPF_FUNC_get_func_ip may be called from
 * the program being verified.
 * @env: verifier environment; supplies the program and the log
 *
 * Tracing programs are accepted only when bpf_prog_has_trampoline() holds for
 * the program; kprobe programs are always accepted; every other program type
 * is rejected.
 *
 * Return: 0 when the helper is allowed, -ENOTSUPP otherwise (with a message
 * written to the verifier log).
 */
static int check_get_func_ip(struct bpf_verifier_env *env)
{
	enum bpf_prog_type type = resolve_prog_type(env->prog);
	int func_id = BPF_FUNC_get_func_ip;

	if (type == BPF_PROG_TYPE_TRACING) {
		if (!bpf_prog_has_trampoline(env->prog)) {
			verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
				func_id_name(func_id), func_id);
			return -ENOTSUPP;
		}
		return 0;
	} else if (type == BPF_PROG_TYPE_KPROBE) {
		return 0;
	}

	verbose(env, "func %s#%d not supported for program type %d\n",
		func_id_name(func_id), func_id, type);
	return -ENOTSUPP;
}
/* Returns whether or not the given map type can potentially elide
 * lookup return value nullness check. This is possible if the key
 * is statically known.
 *
 * @type: map type to classify
 *
 * Only BPF_MAP_TYPE_ARRAY and BPF_MAP_TYPE_PERCPU_ARRAY qualify; every other
 * map type yields false.
 */
static bool can_elide_value_nullness(enum bpf_map_type type)
{
	switch (type) {
	case BPF_MAP_TYPE_ARRAY:
	case BPF_MAP_TYPE_PERCPU_ARRAY:
		return true;
	default:
		return false;
	}
}
staticint get_helper_proto(struct bpf_verifier_env *env, int func_id, conststruct bpf_func_proto **ptr)
{ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) return -ERANGE;
if (err) {
verbose(env, "program of this type cannot use helper %s#%d\n",
func_id_name(func_id), func_id); return err;
}
/* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) {
verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n"); return -EINVAL;
}
if (fn->allowed && !fn->allowed(env->prog)) {
verbose(env, "helper call is not allowed in probe\n"); return -EINVAL;
}
if (!in_sleepable(env) && fn->might_sleep) {
verbose(env, "helper call might sleep in a non-sleepable prog\n"); return -EINVAL;
}
/* With LD_ABS/IND some JITs save/restore skb from r1. */
changes_data = bpf_helper_changes_pkt_data(func_id); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
verifier_bug(env, "func %s#%d: r1 != ctx", func_id_name(func_id), func_id); return -EFAULT;
}
err = check_func_proto(fn, func_id); if (err) {
verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id); return err;
}
if (env->cur_state->active_rcu_lock) { if (fn->might_sleep) {
verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
func_id_name(func_id), func_id); return -EINVAL;
}
if (in_sleepable(env) && is_storage_get_function(func_id))
env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
if (env->cur_state->active_preempt_locks) { if (fn->might_sleep) {
verbose(env, "sleepable helper %s#%d in non-preemptible region\n",
func_id_name(func_id), func_id); return -EINVAL;
}
if (in_sleepable(env) && is_storage_get_function(func_id))
env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
if (env->cur_state->active_irq_id) { if (fn->might_sleep) {
verbose(env, "sleepable helper %s#%d in IRQ-disabled region\n",
func_id_name(func_id), func_id); return -EINVAL;
}
if (in_sleepable(env) && is_storage_get_function(func_id))
env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
err = check_func_arg(env, i, &meta, fn, insn_idx); if (err) return err;
}
err = record_func_map(env, &meta, func_id, insn_idx); if (err) return err;
err = record_func_key(env, &meta, func_id, insn_idx); if (err) return err;
/* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state.
*/ for (i = 0; i < meta.access_size; i++) {
err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B,
BPF_WRITE, -1, false, false); if (err) return err;
}
regs = cur_regs(env);
if (meta.release_regno) {
err = -EINVAL; /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr * is safe to do directly.
*/ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) {
verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); return -EFAULT;
}
err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]);
} elseif (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) {
u32 ref_obj_id = meta.ref_obj_id; bool in_rcu = in_rcu_cs(env); struct bpf_func_state *state; struct bpf_reg_state *reg;
err = release_reference_nomark(env->cur_state, ref_obj_id); if (!err) {
bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ if (reg->ref_obj_id == ref_obj_id) { if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) {
reg->ref_obj_id = 0;
reg->type &= ~MEM_ALLOC;
reg->type |= MEM_RCU;
} else {
mark_reg_invalid(env, reg);
}
}
}));
}
} elseif (meta.ref_obj_id) {
err = release_reference(env, meta.ref_obj_id);
} elseif (register_is_null(®s[meta.release_regno])) { /* meta.ref_obj_id can only be 0 if register that is meant to be * released is NULL, which must be > R0.
*/
err = 0;
} if (err) {
verbose(env, "func %s#%d reference has not been acquired before\n",
func_id_name(func_id), func_id); return err;
}
}
switch (func_id) { case BPF_FUNC_tail_call:
err = check_resource_leak(env, false, true, "tail_call"); if (err) return err; break; case BPF_FUNC_get_local_storage: /* check that flags argument in get_local_storage(map, flags) is 0, * this is required because get_local_storage() can't return an error.
*/ if (!register_is_null(®s[BPF_REG_2])) {
verbose(env, "get_local_storage() doesn't support non-zero flags\n"); return -EINVAL;
} break; case BPF_FUNC_for_each_map_elem:
err = push_callback_call(env, insn, insn_idx, meta.subprogno,
set_map_elem_callback_state); break; case BPF_FUNC_timer_set_callback:
err = push_callback_call(env, insn, insn_idx, meta.subprogno,
set_timer_callback_state); break; case BPF_FUNC_find_vma:
err = push_callback_call(env, insn, insn_idx, meta.subprogno,
set_find_vma_callback_state); break; case BPF_FUNC_snprintf:
err = check_bpf_snprintf_call(env, regs); break; case BPF_FUNC_loop:
update_loop_inline_state(env, meta.subprogno); /* Verifier relies on R1 value to determine if bpf_loop() iteration * is finished, thus mark it precise.
*/
err = mark_chain_precision(env, BPF_REG_1); if (err) return err; if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
err = push_callback_call(env, insn, insn_idx, meta.subprogno,
set_loop_callback_state);
} else {
cur_func(env)->callback_depth = 0; if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "frame%d bpf_loop iteration limit reached\n",
env->cur_state->curframe);
} break; case BPF_FUNC_dynptr_from_mem: if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n",
reg_type_str(env, regs[BPF_REG_1].type)); return -EACCES;
} break; case BPF_FUNC_set_retval: if (prog_type == BPF_PROG_TYPE_LSM &&
env->prog->expected_attach_type == BPF_LSM_CGROUP) { if (!env->prog->aux->attach_func_proto->type) { /* Make sure programs that attach to void * hooks don't try to modify return value.
*/
verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL;
}
} break; case BPF_FUNC_dynptr_data:
{ struct bpf_reg_state *reg; int id, ref_obj_id;
reg = get_dynptr_arg_reg(env, fn, regs); if (!reg) return -EFAULT;
if (meta.dynptr_id) {
verifier_bug(env, "meta.dynptr_id already set"); return -EFAULT;
} if (meta.ref_obj_id) {
verifier_bug(env, "meta.ref_obj_id already set"); return -EFAULT;
}
id = dynptr_id(env, reg); if (id < 0) {
verifier_bug(env, "failed to obtain dynptr id"); return id;
}
ref_obj_id = dynptr_ref_obj_id(env, reg); if (ref_obj_id < 0) {
verifier_bug(env, "failed to obtain dynptr ref_obj_id"); return ref_obj_id;
}
reg = get_dynptr_arg_reg(env, fn, regs); if (!reg) return -EFAULT;
dynptr_type = dynptr_get_type(env, reg); if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) return -EFAULT;
if (dynptr_type == BPF_DYNPTR_TYPE_SKB) /* this will trigger clear_all_pkt_pointers(), which will * invalidate all dynptr slices associated with the skb
*/
changes_data = true;
break;
} case BPF_FUNC_per_cpu_ptr: case BPF_FUNC_this_cpu_ptr:
{ struct bpf_reg_state *reg = ®s[BPF_REG_1]; conststruct btf_type *type;
if (reg->type & MEM_RCU) {
type = btf_type_by_id(reg->btf, reg->btf_id); if (!type || !btf_type_is_struct(type)) {
verbose(env, "Helper has invalid btf/btf_id in R1\n"); return -EFAULT;
}
returns_cpu_specific_alloc_ptr = true;
env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true;
} break;
} case BPF_FUNC_user_ringbuf_drain:
err = push_callback_call(env, insn, insn_idx, meta.subprogno,
set_user_ringbuf_callback_state); break;
}
if (err) return err;
/* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
/* update return register (already marked as written above) */
ret_type = fn->ret_type;
ret_flag = type_flag(ret_type);
switch (base_type(ret_type)) { case RET_INTEGER: /* sets type to SCALAR_VALUE */
mark_reg_unknown(env, regs, BPF_REG_0); break; case RET_VOID:
regs[BPF_REG_0].type = NOT_INIT; break; case RET_PTR_TO_MAP_VALUE: /* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem()
*/ if (meta.map_ptr == NULL) {
verifier_bug(env, "unexpected null map_ptr"); return -EFAULT;
}
mark_reg_known_zero(env, regs, BPF_REG_0);
t = btf_type_skip_modifiers(meta.ret_btf, meta.ret_btf_id, NULL); if (!btf_type_is_struct(t)) {
u32 tsize; conststruct btf_type *ret; constchar *tname;
/* resolve the type size of ksym. */
ret = btf_resolve_size(meta.ret_btf, t, &tsize); if (IS_ERR(ret)) {
tname = btf_name_by_offset(meta.ret_btf, t->name_off);
verbose(env, "unable to resolve the size of type '%s': %ld\n",
tname, PTR_ERR(ret)); return -EINVAL;
}
regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = tsize;
} else { if (returns_cpu_specific_alloc_ptr) {
regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU;
} else { /* MEM_RDONLY may be carried from ret_flag, but it * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise * it will confuse the check of PTR_TO_BTF_ID in * check_mem_access().
*/
ret_flag &= ~MEM_RDONLY;
regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
}
regs[BPF_REG_0].btf = meta.ret_btf;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
} break;
} case RET_PTR_TO_BTF_ID:
{ struct btf *ret_btf; int ret_btf_id;
if (func_id == BPF_FUNC_get_func_ip) { if (check_get_func_ip(env)) return -ENOTSUPP;
env->prog->call_get_func_ip = true;
}
if (changes_data)
clear_all_pkt_pointers(env); return 0;
}
/* mark_btf_func_reg_size() is used when the reg size is determined by * the BTF func_proto's return value size and argument.
*/ staticvoid __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs,
u32 regno, size_t reg_size)
{ struct bpf_reg_state *reg = ®s[regno];
param_name = btf_name_by_offset(btf, arg->name_off); if (str_is_empty(param_name)) returnfalse;
len = strlen(param_name); if (len != target_len) returnfalse; if (strcmp(param_name, name)) returnfalse;
t = btf_type_skip_modifiers(btf, arg->type, NULL); if (!t) returnfalse; if (!btf_type_is_ptr(t)) returnfalse;
t = btf_type_skip_modifiers(btf, t->type, &res_id); if (!t) returnfalse; return btf_types_are_same(btf, res_id, btf_vmlinux, kf_arg_btf_ids[type]);
}
if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) return KF_ARG_PTR_TO_CTX;
/* In this function, we verify the kfunc's BTF as per the argument type, * leaving the rest of the verification with respect to the register * type to our caller. When a set of conditions hold in the BTF type of * arguments, we resolve it to a known kfunc_ptr_arg_type.
*/ if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) return KF_ARG_PTR_TO_CTX;
if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg)) return KF_ARG_PTR_TO_NULL;
if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) return KF_ARG_PTR_TO_ALLOC_BTF_ID;
if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno])) return KF_ARG_PTR_TO_REFCOUNTED_KPTR;
if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) return KF_ARG_PTR_TO_DYNPTR;
if (is_kfunc_arg_iter(meta, argno, &args[argno])) return KF_ARG_PTR_TO_ITER;
if (is_kfunc_arg_list_head(meta->btf, &args[argno])) return KF_ARG_PTR_TO_LIST_HEAD;
if (is_kfunc_arg_list_node(meta->btf, &args[argno])) return KF_ARG_PTR_TO_LIST_NODE;
if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno])) return KF_ARG_PTR_TO_RB_ROOT;
if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno])) return KF_ARG_PTR_TO_RB_NODE;
if (is_kfunc_arg_const_str(meta->btf, &args[argno])) return KF_ARG_PTR_TO_CONST_STR;
if (is_kfunc_arg_map(meta->btf, &args[argno])) return KF_ARG_PTR_TO_MAP;
if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE;
if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG;
if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) return KF_ARG_PTR_TO_RES_SPIN_LOCK;
if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) {
verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
meta->func_name, argno, btf_type_str(ref_t), ref_tname); return -EINVAL;
} return KF_ARG_PTR_TO_BTF_ID;
}
if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) return KF_ARG_PTR_TO_CALLBACK;
/* This is the catch all argument type of register types supported by * check_helper_mem_access. However, we only allow when argument type is * pointer to scalar, or struct composed (recursively) of scalars. When * arg_mem_size is true, the pointer can be void *.
*/ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) &&
(arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); return -EINVAL;
} return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM;
}
/* Enforce strict type matching for calls to kfuncs that are acquiring * or releasing a reference, or are no-cast aliases. We do _not_ * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default, * as we want to enable BPF programs to pass types that are bitwise * equivalent without forcing them to explicitly cast with something * like bpf_cast_to_kern_ctx(). * * For example, say we had a type like the following: * * struct bpf_cpumask { * cpumask_t cpumask; * refcount_t usage; * }; * * Note that as specified in <linux/cpumask.h>, cpumask_t is typedef'ed * to a struct cpumask, so it would be safe to pass a struct * bpf_cpumask * to a kfunc expecting a struct cpumask *. * * The philosophy here is similar to how we allow scalars of different * types to be passed to kfuncs as long as the size is the same. The * only difference here is that we're simply allowing * btf_struct_ids_match() to walk the struct at the 0th offset, and * resolve types.
*/ if ((is_kfunc_release(meta) && reg->ref_obj_id) ||
btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
strict_type_match = true;
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id);
reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match); /* If kfunc is accepting a projection type (ie. __sk_buff), it cannot * actually use it -- it must cast to the underlying type. So we allow * caller to pass in the underlying type.
*/
taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname); if (!taking_projection && !struct_same) {
verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
btf_type_str(reg_ref_t), reg_ref_tname); return -EINVAL;
} return 0;
}
if (irq_save) { if (!is_irq_flag_reg_valid_uninit(env, reg)) {
verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1); return -EINVAL;
}
if (!ref_obj_id) {
verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion"); return -EFAULT;
}
for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].id != ref_obj_id) continue;
/* Clear ref_obj_id here so release_reference doesn't clobber * the whole reg
*/
bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ if (reg->ref_obj_id == ref_obj_id) {
reg->ref_obj_id = 0;
ref_set_non_owning(env, reg);
}
})); return 0;
}
verifier_bug(env, "ref state missing for ref_obj_id"); return -EFAULT;
}
/* Implementation details:
 *
 * Each register points to some region of memory, which we define as an
 * allocation. Each allocation may embed a bpf_spin_lock which protects any
 * special BPF objects (bpf_list_head, bpf_rb_root, etc.) that are part of the
 * same allocation. The lock and the data it protects are colocated in the same
 * memory region.
 *
 * Hence, every time a register holds a pointer value pointing to such an
 * allocation, the verifier preserves a unique reg->id for it.
 *
 * The verifier remembers the lock 'ptr' and the lock 'id' whenever
 * bpf_spin_lock is called.
 *
 * To enable this, lock state in the verifier captures two values:
 *	active_lock.ptr = Register's type specific pointer
 *	active_lock.id  = A unique ID for each register pointer value
 *
 * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two
 * supported register types.
 *
 * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of
 * allocated objects is the reg->btf pointer.
 *
 * The active_lock.id is non-unique for maps supporting direct_value_addr, as we
 * can establish the provenance of the map value statically for each distinct
 * lookup into such maps. They always contain a single map value, hence unique
 * IDs for each pseudo load pessimize the algorithm and reject valid programs.
 *
 * So, in case of global variables, they use array maps with max_entries = 1,
 * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point
 * into the same map value as max_entries is 1, as described above).
 *
 * In case of inner map lookups, the inner map pointer has same map_ptr as the
 * outer map pointer (in verifier context), but each lookup into an inner map
 * assigns a fresh reg->id to the lookup, so while lookups into distinct inner
 * maps from the same outer map share the same map_ptr as active_lock.ptr, they
 * will get different reg->id assigned to each lookup, hence different
 * active_lock.id.
 *
 * In case of allocated objects, active_lock.ptr is the reg->btf, and the
 * reg->id is a unique ID preserved after the NULL pointer check on the pointer
 * returned from bpf_obj_new. Each allocation receives a new reg->id.
*/
/*
 * check_reg_allocation_locked() - check that a lock for @reg's allocation is
 * recorded as held in the current verifier state.
 *
 * @env: verifier environment; env->cur_state carries the lock bookkeeping.
 * @reg: register whose allocation is being checked.
 *
 * The allocation is identified by a (ptr, id) pair derived from @reg:
 *   - PTR_TO_MAP_VALUE:          ptr = reg->map_ptr
 *   - PTR_TO_BTF_ID | MEM_ALLOC: ptr = reg->btf
 * and id = reg->id in both cases.
 *
 * Return: 0 if find_lock_state() finds a REF_TYPE_LOCK_MASK entry matching
 * (id, ptr); -EINVAL if no lock is active at all or no entry matches (the
 * latter is also logged); -EFAULT, reported through verifier_bug(), for any
 * other register type.
 */
static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
	struct bpf_reference_state *s;
	void *ptr;
	u32 id;

	/*
	 * The (int) cast is kept from the original; presumably it lets the
	 * OR-ed case label below be used without enum-switch diagnostics.
	 */
	switch ((int)reg->type) {
	case PTR_TO_MAP_VALUE:
		ptr = reg->map_ptr;
		break;
	case PTR_TO_BTF_ID | MEM_ALLOC:
		ptr = reg->btf;
		break;
	default:
		verifier_bug(env, "unknown reg type for lock check");
		return -EFAULT;
	}
	id = reg->id;

	/* Nothing is locked, so this allocation cannot be locked either. */
	if (!env->cur_state->active_locks)
		return -EINVAL;

	s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr);
	if (!s) {
		verbose(env, "held lock and object are not in the same allocation\n");
		return -EINVAL;
	}
	return 0;
}
if (meta->btf != btf_vmlinux) {
verifier_bug(env, "unexpected btf mismatch in kfunc call"); return -EFAULT;
}
if (!check_kfunc_is_graph_root_api(env, head_field_type, meta->func_id)) return -EFAULT;
head_type_name = btf_field_type_name(head_field_type); if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d doesn't have constant offset. %s has to be at the constant offset\n",
regno, head_type_name); return -EINVAL;
}
rec = reg_btf_record(reg);
head_off = reg->off + reg->var_off.value;
field = btf_record_find(rec, head_off, head_field_type); if (!field) {
verbose(env, "%s not found at offset=%u\n", head_type_name, head_off); return -EINVAL;
}
/* All functions require bpf_list_head to be protected using a bpf_spin_lock */ if (check_reg_allocation_locked(env, reg)) {
verbose(env, "bpf_spin_lock at off=%d must be held for %s\n",
rec->spin_lock_off, head_type_name); return -EINVAL;
}
if (meta->btf != btf_vmlinux) {
verifier_bug(env, "unexpected btf mismatch in kfunc call"); return -EFAULT;
}
if (!check_kfunc_is_graph_node_api(env, node_field_type, meta->func_id)) return -EFAULT;
node_type_name = btf_field_type_name(node_field_type); if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d doesn't have constant offset. %s has to be at the constant offset\n",
regno, node_type_name); return -EINVAL;
}
node_off = reg->off + reg->var_off.value;
field = reg_find_field_offset(reg, node_off, node_field_type); if (!field) {
verbose(env, "%s not found at offset=%u\n", node_type_name, node_off); return -EINVAL;
}
field = *node_field;
et = btf_type_by_id(field->graph_root.btf, field->graph_root.value_btf_id);
t = btf_type_by_id(reg->btf, reg->btf_id); if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->graph_root.btf,
field->graph_root.value_btf_id, true)) {
verbose(env, "operation on %s expects arg#1 %s at offset=%d " "in struct %s, but arg is at offset=%d in struct %s\n",
btf_field_type_name(head_field_type),
btf_field_type_name(node_field_type),
field->graph_root.node_offset,
btf_name_by_offset(field->graph_root.btf, et->name_off),
node_off, btf_name_by_offset(reg->btf, t->name_off)); return -EINVAL;
}
meta->arg_btf = reg->btf;
meta->arg_btf_id = reg->btf_id;
if (node_off != field->graph_root.node_offset) {
verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n",
node_off, btf_field_type_name(node_field_type),
field->graph_root.node_offset,
btf_name_by_offset(field->graph_root.btf, et->name_off)); return -EINVAL;
}
/*
 * The css_task iter allowlist is needed to avoid deadlocking on css_set_lock.
 * LSM hooks and iters (both sleepable and non-sleepable) are safe.
 * Any sleepable progs are also safe, since bpf_check_attach_target() enforces
 * that they can only be attached to some specific hook points.
*/
static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
{
	/*
	 * Decide whether the program being verified may use the css_task
	 * iterator. Returns true for LSM programs and for tracing programs
	 * whose expected attach type is BPF_TRACE_ITER; every other program
	 * (including other tracing attach types) is allowed only if
	 * in_sleepable(env) says so.
	 */
	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);

	switch (prog_type) {
	case BPF_PROG_TYPE_LSM:
		return true;
	case BPF_PROG_TYPE_TRACING:
		if (env->prog->expected_attach_type == BPF_TRACE_ITER)
			return true;
		/* Non-iter tracing programs share the sleepable check below. */
		fallthrough;
	default:
		return in_sleepable(env);
	}
}
/* Check that BTF function arguments match actual types that the * verifier sees.
*/ for (i = 0; i < nargs; i++) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[i + 1]; conststruct btf_type *t, *ref_t, *resolve_ret; enum bpf_arg_type arg_type = ARG_DONTCARE;
u32 regno = i + 1, ref_id, type_size; bool is_ret_buf_sz = false; int kf_arg_type;
t = btf_type_skip_modifiers(btf, args[i].type, NULL);
if (is_kfunc_arg_ignore(btf, &args[i])) continue;
if (is_kfunc_arg_prog(btf, &args[i])) { /* Used to reject repeated use of __prog. */ if (meta->arg_prog) {
verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc"); return -EFAULT;
}
meta->arg_prog = true;
cur_aux(env)->arg_prog = regno; continue;
}
if (btf_type_is_scalar(t)) { if (reg->type != SCALAR_VALUE) {
verbose(env, "R%d is not a scalar\n", regno); return -EINVAL;
}
if (is_kfunc_arg_constant(meta->btf, &args[i])) { if (meta->arg_constant.found) {
verifier_bug(env, "only one constant argument permitted"); return -EFAULT;
} if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d must be a known constant\n", regno); return -EINVAL;
}
ret = mark_chain_precision(env, regno); if (ret < 0) return ret;
meta->arg_constant.found = true;
meta->arg_constant.value = reg->var_off.value;
} elseif (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) {
meta->r0_rdonly = true;
is_ret_buf_sz = true;
} elseif (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) {
is_ret_buf_sz = true;
}
if (is_ret_buf_sz) { if (meta->r0_size) {
verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc"); return -EINVAL;
}
if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d is not a const\n", regno); return -EINVAL;
}
meta->r0_size = reg->var_off.value;
ret = mark_chain_precision(env, regno); if (ret) return ret;
} continue;
}
if (!btf_type_is_ptr(t)) {
verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t)); return -EINVAL;
}
kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs); if (kf_arg_type < 0) return kf_arg_type;
switch (kf_arg_type) { case KF_ARG_PTR_TO_NULL: continue; case KF_ARG_PTR_TO_MAP: if (!reg->map_ptr) {
verbose(env, "pointer in R%d isn't map pointer\n", regno); return -EINVAL;
} if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) * if (inner_map1 && inner_map2) { * wq = bpf_map_lookup_elem(inner_map1); * if (wq) * // mismatch would have been allowed * bpf_wq_init(wq, inner_map2); * } * * Comparing map_ptr is enough to distinguish normal and outer maps.
*/ if (meta->map.ptr != reg->map_ptr ||
meta->map.uid != reg->map_uid) {
verbose(env, "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
meta->map.uid, reg->map_uid); return -EINVAL;
}
}
meta->map.ptr = reg->map_ptr;
meta->map.uid = reg->map_uid;
fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) break;
if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) {
verbose(env, "R%d must be referenced or trusted\n", regno); return -EINVAL;
} if (!is_rcu_reg(reg)) {
verbose(env, "R%d must be a rcu pointer\n", regno); return -EINVAL;
}
}
fallthrough; case KF_ARG_PTR_TO_CTX: case KF_ARG_PTR_TO_DYNPTR: case KF_ARG_PTR_TO_ITER: case KF_ARG_PTR_TO_LIST_HEAD: case KF_ARG_PTR_TO_LIST_NODE: case KF_ARG_PTR_TO_RB_ROOT: case KF_ARG_PTR_TO_RB_NODE: case KF_ARG_PTR_TO_MEM: case KF_ARG_PTR_TO_MEM_SIZE: case KF_ARG_PTR_TO_CALLBACK: case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; default:
verifier_bug(env, "unknown kfunc arg type %d", kf_arg_type); return -EFAULT;
}
if (is_kfunc_release(meta) && reg->ref_obj_id)
arg_type |= OBJ_RELEASE;
ret = check_func_arg_reg_off(env, reg, regno, arg_type); if (ret < 0) return ret;
switch (kf_arg_type) { case KF_ARG_PTR_TO_CTX: if (reg->type != PTR_TO_CTX) {
verbose(env, "arg#%d expected pointer to ctx, but got %s\n",
i, reg_type_str(env, reg->type)); return -EINVAL;
}
if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog)); if (ret < 0) return -EINVAL;
meta->ret_btf_id = ret;
} break; case KF_ARG_PTR_TO_ALLOC_BTF_ID: if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) { if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) {
verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i); return -EINVAL;
}
} elseif (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) { if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i); return -EINVAL;
}
} else {
verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL;
} if (!reg->ref_obj_id) {
verbose(env, "allocated object must be referenced\n"); return -EINVAL;
} if (meta->btf == btf_vmlinux) {
meta->arg_btf = reg->btf;
meta->arg_btf_id = reg->btf_id;
} break; case KF_ARG_PTR_TO_DYNPTR:
{ enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; int clone_ref_obj_id = 0;
if (reg->type == CONST_PTR_TO_DYNPTR)
dynptr_arg_type |= MEM_RDONLY;
if (is_kfunc_arg_uninit(btf, &args[i]))
dynptr_arg_type |= MEM_UNINIT;
if (parent_type == BPF_DYNPTR_TYPE_INVALID) {
verifier_bug(env, "no dynptr type for parent of clone"); return -EFAULT;
}
dynptr_arg_type |= (unsignedint)get_dynptr_type_flag(parent_type);
clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id; if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) {
verifier_bug(env, "missing ref obj id for parent of clone"); return -EFAULT;
}
}
ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id); if (ret < 0) return ret;
if (!(dynptr_arg_type & MEM_UNINIT)) { int id = dynptr_id(env, reg);
if (id < 0) {
verifier_bug(env, "failed to obtain dynptr id"); return id;
}
meta->initialized_dynptr.id = id;
meta->initialized_dynptr.type = dynptr_get_type(env, reg);
meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg);
}
break;
} case KF_ARG_PTR_TO_ITER: if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) { if (!check_css_task_iter_allowlist(env)) {
verbose(env, "css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs\n"); return -EINVAL;
}
}
ret = process_iter_arg(env, regno, insn_idx, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_LIST_HEAD: if (reg->type != PTR_TO_MAP_VALUE &&
reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); return -EINVAL;
} if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
verbose(env, "allocated object must be referenced\n"); return -EINVAL;
}
ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_RB_ROOT: if (reg->type != PTR_TO_MAP_VALUE &&
reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); return -EINVAL;
} if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
verbose(env, "allocated object must be referenced\n"); return -EINVAL;
}
ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_LIST_NODE: if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL;
} if (!reg->ref_obj_id) {
verbose(env, "allocated object must be referenced\n"); return -EINVAL;
}
ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_RB_NODE: if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL;
} if (!reg->ref_obj_id) {
verbose(env, "allocated object must be referenced\n"); return -EINVAL;
}
} else { if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) {
verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name); return -EINVAL;
} if (in_rbtree_lock_required_cb(env)) {
verbose(env, "%s not allowed in rbtree cb\n", func_name); return -EINVAL;
}
}
ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_MAP: /* If argument has '__map' suffix expect 'struct bpf_map *' */
ref_id = *reg2btf_ids[CONST_PTR_TO_MAP];
ref_t = btf_type_by_id(btf_vmlinux, ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
fallthrough; case KF_ARG_PTR_TO_BTF_ID: /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID ||
(bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
!reg2btf_ids[base_type(reg->type)]) {
verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type));
verbose(env, "expected %s or socket\n",
reg_type_str(env, base_type(reg->type) |
(type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); return -EINVAL;
}
ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_MEM:
resolve_ret = btf_resolve_size(btf, ref_t, &type_size); if (IS_ERR(resolve_ret)) {
verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret)); return -EINVAL;
}
ret = check_mem_reg(env, reg, regno, type_size); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_MEM_SIZE:
{ struct bpf_reg_state *buff_reg = ®s[regno]; conststruct btf_param *buff_arg = &args[i]; struct bpf_reg_state *size_reg = ®s[regno + 1]; conststruct btf_param *size_arg = &args[i + 1];
if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) {
ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); if (ret < 0) {
verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); return ret;
}
}
if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) { if (meta->arg_constant.found) {
verifier_bug(env, "only one constant argument permitted"); return -EFAULT;
} if (!tnum_is_const(size_reg->var_off)) {
verbose(env, "R%d must be a known constant\n", regno + 1); return -EINVAL;
}
meta->arg_constant.found = true;
meta->arg_constant.value = size_reg->var_off.value;
}
/* Skip next '__sz' or '__szk' argument */
i++; break;
} case KF_ARG_PTR_TO_CALLBACK: if (reg->type != PTR_TO_FUNC) {
verbose(env, "arg%d expected pointer to func\n", i); return -EINVAL;
}
meta->subprogno = reg->subprogno; break; case KF_ARG_PTR_TO_REFCOUNTED_KPTR: if (!type_is_ptr_alloc_obj(reg->type)) {
verbose(env, "arg#%d is neither owning or non-owning ref\n", i); return -EINVAL;
} if (!type_is_non_owning_ref(reg->type))
meta->arg_owning_ref = true;
if (rec->refcount_off < 0) {
verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i); return -EINVAL;
}
meta->arg_btf = reg->btf;
meta->arg_btf_id = reg->btf_id; break; case KF_ARG_PTR_TO_CONST_STR: if (reg->type != PTR_TO_MAP_VALUE) {
verbose(env, "arg#%d doesn't point to a const string\n", i); return -EINVAL;
}
ret = check_reg_const_str(env, reg, regno); if (ret) return ret; break; case KF_ARG_PTR_TO_WORKQUEUE: if (reg->type != PTR_TO_MAP_VALUE) {
verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL;
}
ret = process_wq_func(env, regno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_IRQ_FLAG: if (reg->type != PTR_TO_STACK) {
verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); return -EINVAL;
}
ret = process_irq_flag(env, regno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_RES_SPIN_LOCK:
{ int flags = PROCESS_RES_LOCK;
if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); return -EINVAL;
}
if (!is_bpf_res_spin_lock_kfunc(meta->func_id)) return -EFAULT; if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] ||
meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])
flags |= PROCESS_SPIN_LOCK; if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] ||
meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore])
flags |= PROCESS_LOCK_IRQ;
ret = process_spin_lock(env, regno, flags); if (ret < 0) return ret; break;
}
}
}
if (is_kfunc_release(meta) && !meta->release_regno) {
verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
func_name); return -EINVAL;
}
if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) return -ENOMEM;
if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) {
verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); return -EINVAL;
}
/* This may be NULL due to user not supplying a BTF */ if (!ret_btf) {
verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); return -EINVAL;
}
ret_t = btf_type_by_id(ret_btf, ret_btf_id); if (!ret_t || !__btf_type_is_struct(ret_t)) {
verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); return -EINVAL;
}
if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); return -EINVAL;
}
if (!bpf_global_percpu_ma_set) {
mutex_lock(&bpf_percpu_ma_lock); if (!bpf_global_percpu_ma_set) { /* Charge memory allocated with bpf_global_percpu_ma to * root memcg. The obj_cgroup for root memcg is NULL.
*/
err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); if (!err)
bpf_global_percpu_ma_set = true;
}
mutex_unlock(&bpf_percpu_ma_lock); if (err) return err;
}
struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); return -EINVAL;
}
if (struct_meta) {
verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); return -EINVAL;
}
}
/* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
regs[BPF_REG_0].type |= MEM_RDONLY;
} else { /* this will set env->seen_direct_write to true */ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
verbose(env, "the prog does not allow writes to packet data\n"); return -EINVAL;
}
}
if (!meta->initialized_dynptr.id) {
verifier_bug(env, "no dynptr id"); return -EFAULT;
}
regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id;
/* we don't need to set BPF_REG_0's ref obj id * because packet slices are not refcounted (see * dynptr_type_refcounted)
*/
} else { return 0;
}
return 1;
}
/* Forward declaration; used by the bpf_throw handling further down. */
static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
if (env->cur_state->active_preempt_locks) { if (preempt_disable) {
env->cur_state->active_preempt_locks++;
} elseif (preempt_enable) {
env->cur_state->active_preempt_locks--;
} elseif (sleepable) {
verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name); return -EACCES;
}
} elseif (preempt_disable) {
env->cur_state->active_preempt_locks++;
} elseif (preempt_enable) {
verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name); return -EINVAL;
}
if (env->cur_state->active_irq_id && sleepable) {
verbose(env, "kernel func %s is sleepable within IRQ-disabled region\n", func_name); return -EACCES;
}
/* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/ if (meta.release_regno) {
err = release_reference(env, regs[meta.release_regno].ref_obj_id); if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
func_name, meta.func_id); return err;
}
}
err = release_reference(env, release_ref_obj_id); if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
func_name, meta.func_id); return err;
}
}
if (meta.func_id == special_kfunc_list[KF_bpf_throw]) { if (!bpf_jit_supports_exceptions()) {
verbose(env, "JIT does not support calling kfunc %s#%d\n",
func_name, meta.func_id); return -ENOTSUPP;
}
env->seen_exception = true;
/* In the case of the default callback, the cookie value passed * to bpf_throw becomes the return value of the program.
*/ if (!env->exception_callback_subprog) {
err = check_return_code(env, BPF_REG_1, "R1"); if (err < 0) return err;
}
}
for (i = 0; i < CALLER_SAVED_REGS; i++)
mark_reg_not_init(env, regs, caller_saved[i]);
/* Check return type */
t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) { /* Only exception is bpf_obj_new_impl */ if (meta.btf != btf_vmlinux ||
(meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return -EINVAL;
}
}
if (is_kfunc_ret_null(&meta)) {
regs[BPF_REG_0].type |= PTR_MAYBE_NULL; /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
regs[BPF_REG_0].id = ++env->id_gen;
}
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); if (is_kfunc_acquire(&meta)) { int id = acquire_reference(env, insn_idx);
if (id < 0) return id; if (is_kfunc_ret_null(&meta))
regs[BPF_REG_0].id = id;
regs[BPF_REG_0].ref_obj_id = id;
} elseif (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) {
ref_set_non_owning(env, ®s[BPF_REG_0]);
}
if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
verbose(env, "math between %s pointer and %lld is not allowed\n",
reg_type_str(env, type), val); returnfalse;
}
if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
verbose(env, "%s pointer offset %d is not allowed\n",
reg_type_str(env, type), reg->off); returnfalse;
}
if (smin == S64_MIN) {
verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
reg_type_str(env, type)); returnfalse;
}
if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
verbose(env, "value %lld makes %s pointer be out of bounds\n",
smin, reg_type_str(env, type)); returnfalse;
}
switch (ptr_reg->type) { case PTR_TO_STACK: /* Offset 0 is out-of-bounds, but acceptable start for the * left direction, see BPF_REG_FP. Also, unknown scalar * offset where we would need to deal with min/max bounds is * currently prohibited for unprivileged.
*/
max = MAX_BPF_STACK + mask_to_left;
ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off); break; case PTR_TO_MAP_VALUE:
max = ptr_reg->map_ptr->value_size;
ptr_limit = (mask_to_left ?
ptr_reg->smin_value :
ptr_reg->umax_value) + ptr_reg->off; break; default: return REASON_TYPE;
}
staticint update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
u32 alu_state, u32 alu_limit)
{ /* If we arrived here from different branches with different * state or limits to sanitize, then this won't work.
*/ if (aux->alu_state &&
(aux->alu_state != alu_state ||
aux->alu_limit != alu_limit)) return REASON_PATHS;
/* We already marked aux for masking from non-speculative * paths, thus we got here in the first place. We only care * to explore bad access from here.
*/ if (vstate->speculative) goto do_sim;
if (!commit_window) { if (!tnum_is_const(off_reg->var_off) &&
(off_reg->smin_value < 0) != (off_reg->smax_value < 0)) return REASON_BOUNDS;
if (commit_window) { /* In commit phase we narrow the masking window based on * the observed pointer move after the simulated operation.
*/
alu_state = info->aux.alu_state;
alu_limit = abs(info->aux.alu_limit - alu_limit);
} else {
alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
alu_state |= ptr_is_dst_reg ?
BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
/* Limit pruning on unknown scalars to enable deep search for * potential masking differences from other program paths.
*/ if (!off_is_imm)
env->explore_alu_limits = true;
}
err = update_alu_sanitation_state(aux, alu_state, alu_limit); if (err < 0) return err;
do_sim: /* If we're in commit phase, we're done here given we already * pushed the truncated dst_reg into the speculative verification * stack. * * Also, when register is a known constant, we rewrite register-based * operation to immediate-based, and thus do not need masking (and as * a consequence, do not need to simulate the zero-truncation either).
*/ if (commit_window || off_is_imm) return 0;
/* Simulate and find potential out-of-bounds access under * speculative execution from truncation as a result of * masking when off was not within expected range. If off * sits in dst, then we temporarily need to move ptr there * to simulate dst (== 0) +/-= ptr. Needed, for example, * for cases where we use K-based arithmetic in one direction * and truncated reg-based in the other in order to explore * bad access.
*/ if (!ptr_is_dst_reg) {
tmp = *dst_reg;
copy_register_state(dst_reg, ptr_reg);
}
ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
env->insn_idx); if (!ptr_is_dst_reg && ret)
*dst_reg = tmp; return !ret ? REASON_STACK : 0;
}
/* If we simulate paths under speculation, we don't update the * insn as 'seen' such that when we verify unreachable paths in * the non-speculative domain, sanitize_dead_code() can still * rewrite/sanitize them.
*/ if (!vstate->speculative)
env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
}
switch (reason) { case REASON_BOUNDS:
verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n",
off_reg == dst_reg ? dst : src, err); break; case REASON_TYPE:
verbose(env, "R%d has pointer with unsupported alu operation, %s\n",
off_reg == dst_reg ? src : dst, err); break; case REASON_PATHS:
verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n",
dst, op, err); break; case REASON_LIMIT:
verbose(env, "R%d tried to %s beyond pointer bounds, %s\n",
dst, op, err); break; case REASON_STACK:
verbose(env, "R%d could not be pushed for speculative verification, %s\n",
dst, err); return -ENOMEM; default:
verifier_bug(env, "unknown reason (%d)", reason); break;
}
return -EACCES;
}
/* check that stack access falls within stack limits and that 'reg' doesn't * have a variable offset. * * Variable offset is prohibited for unprivileged mode for simplicity since it * requires corresponding support in Spectre masking for stack ALU. See also * retrieve_ptr_limit(). * * * 'off' includes 'reg->off'.
*/ staticint check_stack_access_for_ptr_arithmetic( struct bpf_verifier_env *env, int regno, conststruct bpf_reg_state *reg, int off)
{ if (!tnum_is_const(reg->var_off)) { char tn_buf[48];
/* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on.
*/ if (env->bypass_spec_v1) return 0;
switch (dst_reg->type) { case PTR_TO_STACK: if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg,
dst_reg->off + dst_reg->var_off.value)) return -EACCES; break; case PTR_TO_MAP_VALUE: if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) {
verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES;
} break; default: return -EOPNOTSUPP;
}
return 0;
}
/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a * scalar. So we only emit a diagnostic if !env->allow_ptr_leaks.
*/ staticint adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, conststruct bpf_reg_state *ptr_reg, conststruct bpf_reg_state *off_reg)
{ struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off);
s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; struct bpf_sanitize_info info = {};
u8 opcode = BPF_OP(insn->code);
u32 dst = insn->dst_reg; int ret, bounds_ret;
dst_reg = ®s[dst];
if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
smin_val > smax_val || umin_val > umax_val) { /* Taint dst register if offset had invalid bounds derived from * e.g. dead branches.
*/
__mark_reg_unknown(env, dst_reg); return 0;
}
if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ if (opcode == BPF_SUB && env->allow_ptr_leaks) {
__mark_reg_unknown(env, dst_reg); return 0;
}
if (ptr_reg->type & PTR_MAYBE_NULL) {
verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
dst, reg_type_str(env, ptr_reg->type)); return -EACCES;
}
/* * Accesses to untrusted PTR_TO_MEM are done through probe * instructions, hence no need to track offsets.
*/ if (base_type(ptr_reg->type) == PTR_TO_MEM && (ptr_reg->type & PTR_UNTRUSTED)) return 0;
switch (base_type(ptr_reg->type)) { case PTR_TO_CTX: case PTR_TO_MAP_VALUE: case PTR_TO_MAP_KEY: case PTR_TO_STACK: case PTR_TO_PACKET_META: case PTR_TO_PACKET: case PTR_TO_TP_BUFFER: case PTR_TO_BTF_ID: case PTR_TO_MEM: case PTR_TO_BUF: case PTR_TO_FUNC: case CONST_PTR_TO_DYNPTR: break; case PTR_TO_FLOW_KEYS: if (known) break;
fallthrough; case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) break;
fallthrough; default:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str(env, ptr_reg->type)); return -EACCES;
}
/* In case of 'scalar += pointer', dst_reg inherits pointer type and id. * The id may be overwritten later if we create a new variable offset.
*/
dst_reg->type = ptr_reg->type;
dst_reg->id = ptr_reg->id;
if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
!check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) return -EINVAL;
/* pointer types do not carry 32-bit bounds at the moment. */
__mark_reg32_unbounded(dst_reg);
if (sanitize_needed(opcode)) {
ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
&info, false); if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg);
}
switch (opcode) { case BPF_ADD: /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field
*/ if (known && (ptr_reg->off + smin_val ==
(s64)(s32)(ptr_reg->off + smin_val))) { /* pointer += K. Accumulate it into fixed offset */
dst_reg->smin_value = smin_ptr;
dst_reg->smax_value = smax_ptr;
dst_reg->umin_value = umin_ptr;
dst_reg->umax_value = umax_ptr;
dst_reg->var_off = ptr_reg->var_off;
dst_reg->off = ptr_reg->off + smin_val;
dst_reg->raw = ptr_reg->raw; break;
} /* A new variable offset is created. Note that off_reg->off * == 0, since it's a scalar. * dst_reg gets the pointer type and since some positive * integer value was added to the pointer, give it a new 'id' * if it's a PTR_TO_PACKET. * this creates a new 'base' pointer, off_reg (variable) gets * added into the variable offset, and we copy the fixed offset * from ptr_reg.
*/ if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) ||
check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) {
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
} if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) ||
check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) {
dst_reg->umin_value = 0;
dst_reg->umax_value = U64_MAX;
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */
memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
} break; case BPF_SUB: if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */
verbose(env, "R%d tried to subtract pointer from scalar\n",
dst); return -EACCES;
} /* We don't allow subtraction from FP, because (according to * test_verifier.c test "invalid fp arithmetic", JITs might not * be able to deal with it.
*/ if (ptr_reg->type == PTR_TO_STACK) {
verbose(env, "R%d subtraction from stack pointer prohibited\n",
dst); return -EACCES;
} if (known && (ptr_reg->off - smin_val ==
(s64)(s32)(ptr_reg->off - smin_val))) { /* pointer -= K. Subtract it from fixed offset */
dst_reg->smin_value = smin_ptr;
dst_reg->smax_value = smax_ptr;
dst_reg->umin_value = umin_ptr;
dst_reg->umax_value = umax_ptr;
dst_reg->var_off = ptr_reg->var_off;
dst_reg->id = ptr_reg->id;
dst_reg->off = ptr_reg->off - smin_val;
dst_reg->raw = ptr_reg->raw; break;
} /* A new variable offset is created. If the subtrahend is known * nonnegative, then any reg->range we had before is still good.
*/ if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) ||
check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) { /* Overflow possible, we know nothing */
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
} if (umin_ptr < umax_val) { /* Overflow possible, we know nothing */
dst_reg->umin_value = 0;
dst_reg->umax_value = U64_MAX;
} else { /* Cannot overflow (as long as bounds are consistent) */
dst_reg->umin_value = umin_ptr - umax_val;
dst_reg->umax_value = umax_ptr - umin_val;
}
dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0)
memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
} break; case BPF_AND: case BPF_OR: case BPF_XOR: /* bitwise ops on pointers are troublesome, prohibit. */
verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */
verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
dst, bpf_alu_string[opcode >> 4]); return -EACCES;
}
if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) return -EINVAL;
reg_bounds_sync(dst_reg);
bounds_ret = sanitize_check_bounds(env, insn, dst_reg); if (bounds_ret == -EACCES) return bounds_ret; if (sanitize_needed(opcode)) {
ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
&info, true); if (verifier_bug_if(!can_skip_alu_sanitation(env, insn)
&& !env->cur_state->speculative
&& bounds_ret
&& !ret,
env, "Pointer type unsupported by sanitize_check_bounds() not rejected by retrieve_ptr_limit() as required")) { return -EFAULT;
} if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg);
}
/* If either all additions overflow or no additions overflow, then * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = * dst_umax + src_umax. Otherwise (some additions overflow), set * the output bounds to unbounded.
*/
min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin);
max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax);
/* If either all additions overflow or no additions overflow, then * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = * dst_umax + src_umax. Otherwise (some additions overflow), set * the output bounds to unbounded.
*/
min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin);
max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax);
/* If either all subtractions underflow or no subtractions * underflow, it is okay to set: dst_umin = dst_umin - src_umax, * dst_umax = dst_umax - src_umin. Otherwise (some subtractions * underflow), set the output bounds to unbounded.
*/
min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin);
max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax);
/* If either all subtractions underflow or no subtractions * underflow, it is okay to set: dst_umin = dst_umin - src_umax, * dst_umax = dst_umax - src_umin. Otherwise (some subtractions * underflow), set the output bounds to unbounded.
*/
min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin);
max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax);
if (src_known && dst_known) {
__mark_reg32_known(dst_reg, var32_off.value); return;
}
/* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima.
*/
dst_reg->u32_min_value = var32_off.value;
dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
/* Safe to set s32 bounds by casting u32 result into s32 when u32 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
*/ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
} else {
dst_reg->s32_min_value = S32_MIN;
dst_reg->s32_max_value = S32_MAX;
}
}
if (src_known && dst_known) {
__mark_reg_known(dst_reg, dst_reg->var_off.value); return;
}
/* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima.
*/
dst_reg->umin_value = dst_reg->var_off.value;
dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
/* Safe to set s64 bounds by casting u64 result into s64 when u64 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
*/ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
} else {
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
} /* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
}
if (src_known && dst_known) {
__mark_reg32_known(dst_reg, var32_off.value); return;
}
/* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima
*/
dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val);
dst_reg->u32_max_value = var32_off.value | var32_off.mask;
/* Safe to set s32 bounds by casting u32 result into s32 when u32 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
*/ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
} else {
dst_reg->s32_min_value = S32_MIN;
dst_reg->s32_max_value = S32_MAX;
}
}
if (src_known && dst_known) {
__mark_reg_known(dst_reg, dst_reg->var_off.value); return;
}
/* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima
*/
dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
/* Safe to set s64 bounds by casting u64 result into s64 when u64 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
*/ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
} else {
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
} /* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
}
if (src_known && dst_known) {
__mark_reg32_known(dst_reg, var32_off.value); return;
}
/* We get both minimum and maximum from the var32_off. */
dst_reg->u32_min_value = var32_off.value;
dst_reg->u32_max_value = var32_off.value | var32_off.mask;
/* Safe to set s32 bounds by casting u32 result into s32 when u32 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
*/ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
} else {
dst_reg->s32_min_value = S32_MIN;
dst_reg->s32_max_value = S32_MAX;
}
}
if (src_known && dst_known) { /* dst_reg->var_off.value has been updated earlier */
__mark_reg_known(dst_reg, dst_reg->var_off.value); return;
}
/* We get both minimum and maximum from the var_off. */
dst_reg->umin_value = dst_reg->var_off.value;
dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
/* Safe to set s64 bounds by casting u64 result into s64 when u64 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
*/ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
dst_reg->smin_value = dst_reg->umin_value;
dst_reg->smax_value = dst_reg->umax_value;
} else {
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
}
__update_reg_bounds(dst_reg);
}
/* Update the 32-bit (subregister) bounds of @dst_reg for a left shift whose
 * shift amount lies in [@umin_val, @umax_val].
 *
 * The signed 32-bit bounds are always reset to [S32_MIN, S32_MAX]. The
 * unsigned 32-bit bounds are shifted in place when the check below passes
 * and are widened to [0, U32_MAX] otherwise. var_off and the 64-bit bounds
 * are not touched here.
 */
static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
				   u64 umin_val, u64 umax_val)
{
	/* We lose all sign bit information (except what we can pick
	 * up from var_off)
	 */
	dst_reg->s32_min_value = S32_MIN;
	dst_reg->s32_max_value = S32_MAX;
	/* If we might shift our top bit out, then we know nothing.
	 * 'umax_val > 31' is tested first, so the shift count
	 * '31 - umax_val' below is only evaluated when it is in [0, 31].
	 */
	if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) {
		dst_reg->u32_min_value = 0;
		dst_reg->u32_max_value = U32_MAX;
	} else {
		dst_reg->u32_min_value <<= umin_val;
		dst_reg->u32_max_value <<= umax_val;
	}
}
__scalar32_min_max_lsh(dst_reg, umin_val, umax_val);
dst_reg->var_off = tnum_subreg(tnum_lshift(subreg, umin_val)); /* Not required but being careful mark reg64 bounds as unknown so * that we are forced to pick them up from tnum and zext later and * if some path skips this step we are still safe.
*/
__mark_reg64_unbounded(dst_reg);
__update_reg32_bounds(dst_reg);
}
staticvoid __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg,
u64 umin_val, u64 umax_val)
{ /* Special case <<32 because it is a common compiler pattern to sign * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are * positive we know this shift will also be positive so we can track * bounds correctly. Otherwise we lose all sign bit information except * what we can pick up from var_off. Perhaps we can generalize this * later to shifts of any length.
*/ if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0)
dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32; else
dst_reg->smax_value = S64_MAX;
/* scalar64 calc uses 32bit unshifted bounds so must be called first */
__scalar64_min_max_lsh(dst_reg, umin_val, umax_val);
__scalar32_min_max_lsh(dst_reg, umin_val, umax_val);
dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); /* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
}
/* BPF_RSH is an unsigned shift. If the value in dst_reg might * be negative, then either: * 1) src_reg might be zero, so the sign bit of the result is * unknown, so we lose our signed bounds * 2) it's known negative, thus the unsigned bounds capture the * signed bounds * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result.
*/
dst_reg->s32_min_value = S32_MIN;
dst_reg->s32_max_value = S32_MAX;
/* BPF_RSH is an unsigned shift. If the value in dst_reg might * be negative, then either: * 1) src_reg might be zero, so the sign bit of the result is * unknown, so we lose our signed bounds * 2) it's known negative, thus the unsigned bounds capture the * signed bounds * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result.
*/
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
dst_reg->umin_value >>= umax_val;
dst_reg->umax_value >>= umin_val;
/* It's not easy to operate on alu32 bounds here because it depends
 * on bits being shifted in. Take the easy way out and mark unbounded
 * so we can recalculate later from tnum.
*/
__mark_reg32_unbounded(dst_reg);
__update_reg_bounds(dst_reg);
}
/* Upon reaching here, src_known is true and * umax_val is equal to umin_val.
*/
dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val);
dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val);
/* blow away the dst_reg umin_value/umax_value and rely on * dst_reg var_off to refine the result.
*/
dst_reg->u32_min_value = 0;
dst_reg->u32_max_value = U32_MAX;
/* blow away the dst_reg umin_value/umax_value and rely on * dst_reg var_off to refine the result.
*/
dst_reg->umin_value = 0;
dst_reg->umax_value = U64_MAX;
/* It's not easy to operate on alu32 bounds here because it depends
 * on bits being shifted in from upper 32-bits. Take the easy way out
 * and mark unbounded so we can recalculate later from tnum.
*/
__mark_reg32_unbounded(dst_reg);
__update_reg_bounds(dst_reg);
}
switch (BPF_OP(insn->code)) { case BPF_ADD: case BPF_SUB: case BPF_NEG: case BPF_AND: case BPF_XOR: case BPF_OR: case BPF_MUL: returntrue;
/* Shift operators range is only computable if shift dimension operand * is a constant. Shifts greater than 31 or 63 are undefined. This * includes shifts by a negative number.
*/ case BPF_LSH: case BPF_RSH: case BPF_ARSH: return (src_is_const && src_reg->umax_value < insn_bitness); default: returnfalse;
}
}
/* WARNING: This function does calculations on 64-bit values, but the actual * execution may occur on 32-bit values. Therefore, things like bitshifts * need extra checks in the 32-bit case.
*/ staticint adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_reg_state *dst_reg, struct bpf_reg_state src_reg)
{
u8 opcode = BPF_OP(insn->code); bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); int ret;
if (!is_safe_to_compute_dst_reg_range(insn, &src_reg)) {
__mark_reg_unknown(env, dst_reg); return 0;
}
if (sanitize_needed(opcode)) {
ret = sanitize_val_alu(env, insn); if (ret < 0) return sanitize_err(env, insn, ret, NULL, NULL);
}
/* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops. * There are two classes of instructions: The first class we track both * alu32 and alu64 sign/unsigned bounds independently this provides the * greatest amount of precision when alu operations are mixed with jmp32 * operations. These operations are BPF_ADD, BPF_SUB, BPF_MUL, BPF_ADD, * and BPF_OR. This is possible because these ops have fairly easy to * understand and calculate behavior in both 32-bit and 64-bit alu ops. * See alu32 verifier tests for examples. The second class of * operations, BPF_LSH, BPF_RSH, and BPF_ARSH, however are not so easy * with regards to tracking sign/unsigned bounds because the bits may * cross subreg boundaries in the alu64 case. When this happens we mark * the reg unbounded in the subreg bound space and use the resulting * tnum to calculate an approximation of the sign/unsigned bounds.
*/ switch (opcode) { case BPF_ADD:
scalar32_min_max_add(dst_reg, &src_reg);
scalar_min_max_add(dst_reg, &src_reg);
dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB:
scalar32_min_max_sub(dst_reg, &src_reg);
scalar_min_max_sub(dst_reg, &src_reg);
dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; case BPF_NEG:
env->fake_reg[0] = *dst_reg;
__mark_reg_known(dst_reg, 0);
scalar32_min_max_sub(dst_reg, &env->fake_reg[0]);
scalar_min_max_sub(dst_reg, &env->fake_reg[0]);
dst_reg->var_off = tnum_neg(env->fake_reg[0].var_off); break; case BPF_MUL:
dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_mul(dst_reg, &src_reg);
scalar_min_max_mul(dst_reg, &src_reg); break; case BPF_AND:
dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_and(dst_reg, &src_reg);
scalar_min_max_and(dst_reg, &src_reg); break; case BPF_OR:
dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_or(dst_reg, &src_reg);
scalar_min_max_or(dst_reg, &src_reg); break; case BPF_XOR:
dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_xor(dst_reg, &src_reg);
scalar_min_max_xor(dst_reg, &src_reg); break; case BPF_LSH: if (alu32)
scalar32_min_max_lsh(dst_reg, &src_reg); else
scalar_min_max_lsh(dst_reg, &src_reg); break; case BPF_RSH: if (alu32)
scalar32_min_max_rsh(dst_reg, &src_reg); else
scalar_min_max_rsh(dst_reg, &src_reg); break; case BPF_ARSH: if (alu32)
scalar32_min_max_arsh(dst_reg, &src_reg); else
scalar_min_max_arsh(dst_reg, &src_reg); break; default: break;
}
/* ALU32 ops are zero extended into 64bit register */ if (alu32)
zext_32_to_64(dst_reg);
reg_bounds_sync(dst_reg); return 0;
}
/* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max * and var_off.
*/ staticint adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn)
{ struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
u8 opcode = BPF_OP(insn->code); int err;
dst_reg = ®s[insn->dst_reg];
src_reg = NULL;
if (dst_reg->type == PTR_TO_ARENA) { struct bpf_insn_aux_data *aux = cur_aux(env);
if (BPF_CLASS(insn->code) == BPF_ALU64) /* * 32-bit operations zero upper bits automatically. * 64-bit operations need to be converted to 32.
*/
aux->needs_zext = true;
/* Any arithmetic operations are allowed on arena pointers */ return 0;
}
if (dst_reg->type != SCALAR_VALUE)
ptr_reg = dst_reg;
if (BPF_SRC(insn->code) == BPF_X) {
src_reg = ®s[insn->src_reg]; if (src_reg->type != SCALAR_VALUE) { if (dst_reg->type != SCALAR_VALUE) { /* Combining two pointers by any ALU op yields * an arbitrary scalar. Disallow all math except * pointer subtraction
*/ if (opcode == BPF_SUB && env->allow_ptr_leaks) {
mark_reg_unknown(env, regs, insn->dst_reg); return 0;
}
verbose(env, "R%d pointer %s pointer prohibited\n",
insn->dst_reg,
bpf_alu_string[opcode >> 4]); return -EACCES;
} else { /* scalar += pointer * This is legal, but we have to reverse our * src/dest handling in computing the range
*/
err = mark_chain_precision(env, insn->dst_reg); if (err) return err; return adjust_ptr_min_max_vals(env, insn,
src_reg, dst_reg);
}
} elseif (ptr_reg) { /* pointer += scalar */
err = mark_chain_precision(env, insn->src_reg); if (err) return err; return adjust_ptr_min_max_vals(env, insn,
dst_reg, src_reg);
} elseif (dst_reg->precise) { /* if dst_reg is precise, src_reg should be precise as well */
err = mark_chain_precision(env, insn->src_reg); if (err) return err;
}
} else { /* Pretend the src is a reg with a known value, since we only * need to be able to read from this state.
*/
off_reg.type = SCALAR_VALUE;
__mark_reg_known(&off_reg, insn->imm);
src_reg = &off_reg; if (ptr_reg) /* pointer += K */ return adjust_ptr_min_max_vals(env, insn,
ptr_reg, src_reg);
}
/* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) {
print_verifier_state(env, vstate, vstate->curframe, true);
verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EFAULT;
} if (WARN_ON(!src_reg)) {
print_verifier_state(env, vstate, vstate->curframe, true);
verbose(env, "verifier internal error: no src_reg\n"); return -EFAULT;
}
err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) return err; /* * Compilers can generate the code * r1 = r2 * r1 += 0x1 * if r2 < 1000 goto ... * use r1 in memory access * So for 64-bit alu remember constant delta between r2 and r1 and * update r1 after 'if' condition.
*/ if (env->bpf_capable &&
BPF_OP(insn->code) == BPF_ADD && !alu32 &&
dst_reg->id && is_reg_const(src_reg, false)) {
u64 val = reg_const_value(src_reg, false);
if ((dst_reg->id & BPF_ADD_CONST) || /* prevent overflow in sync_linked_regs() later */
val > (u32)S32_MAX) { /* * If the register already went through rX += val * we cannot accumulate another val into rx->off.
*/
dst_reg->off = 0;
dst_reg->id = 0;
} else {
dst_reg->id |= BPF_ADD_CONST;
dst_reg->off = val;
}
} else { /* * Make sure ID is cleared otherwise dst_reg min/max could be * incorrectly propagated into other registers by sync_linked_regs()
*/
dst_reg->id = 0;
} return 0;
}
/* check validity of 32-bit and 64-bit arithmetic operations */ staticint check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{ struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code); int err;
if (is_src_reg_u32)
assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared if src_reg is not in u32 * range otherwise dst_reg min/max could be incorrectly * propagated into src_reg by sync_linked_regs()
*/ if (!is_src_reg_u32)
dst_reg->id = 0;
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = env->insn_idx + 1;
} else { /* case: W1 = (s8, s16)W2 */ bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
if (no_sext)
assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg); if (!no_sext)
dst_reg->id = 0;
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = env->insn_idx + 1;
coerce_subreg_to_size_sx(dst_reg, insn->off >> 3);
}
} else {
mark_reg_unknown(env, regs,
insn->dst_reg);
}
zext_32_to_64(dst_reg);
reg_bounds_sync(dst_reg);
}
} else { /* case: R = imm * remember the value we stored into this reg
*/ /* clear any state __mark_reg_known doesn't set */
mark_reg_unknown(env, regs, insn->dst_reg);
regs[insn->dst_reg].type = SCALAR_VALUE; if (BPF_CLASS(insn->code) == BPF_ALU64) {
__mark_reg_known(regs + insn->dst_reg,
insn->imm);
} else {
__mark_reg_known(regs + insn->dst_reg,
(u32)insn->imm);
}
}
if (dst_reg->off < 0 ||
(dst_reg->off == 0 && range_right_open)) /* This doesn't give us any range */ return;
if (dst_reg->umax_value > MAX_PACKET_OFF ||
dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF) /* Risk of overflow. For instance, ptr + (1<<63) may be less * than pkt_end, but that's because it's also less than pkt.
*/ return;
new_range = dst_reg->off; if (range_right_open)
new_range++;
/* Examples for register markings: * * pkt_data in dst register: * * r2 = r3; * r2 += 8; * if (r2 > pkt_end) goto <handle exception> * <access okay> * * r2 = r3; * r2 += 8; * if (r2 < pkt_end) goto <access okay> * <handle exception> * * Where: * r2 == dst_reg, pkt_end == src_reg * r2=pkt(id=n,off=8,r=0) * r3=pkt(id=n,off=0,r=0) * * pkt_data in src register: * * r2 = r3; * r2 += 8; * if (pkt_end >= r2) goto <access okay> * <handle exception> * * r2 = r3; * r2 += 8; * if (pkt_end <= r2) goto <handle exception> * <access okay> * * Where: * pkt_end == dst_reg, r2 == src_reg * r2=pkt(id=n,off=8,r=0) * r3=pkt(id=n,off=0,r=0) * * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8) * and [r3, r3 + 8-1) respectively is safe to access depending on * the check.
*/
/* If our ids match, then we must have the same max_value. And we * don't care about the other reg's fixed offset, since if it's too big * the range won't allow anything. * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == type && reg->id == dst_reg->id) /* keep the maximum range already checked */
reg->range = max(reg->range, new_range);
}));
}
switch (opcode) { case BPF_JEQ: /* constants, umin/umax and smin/smax checks would be * redundant in this case because they all should match
*/ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value == t2.value; if (!tnum_overlap(t1, t2)) return 0; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 0; if (smin1 > smax2 || smax1 < smin2) return 0; if (!is_jmp32) { /* if 64-bit ranges are inconclusive, see if we can * utilize 32-bit subrange knowledge to eliminate * branches that can't be taken a priori
*/ if (reg1->u32_min_value > reg2->u32_max_value ||
reg1->u32_max_value < reg2->u32_min_value) return 0; if (reg1->s32_min_value > reg2->s32_max_value ||
reg1->s32_max_value < reg2->s32_min_value) return 0;
} break; case BPF_JNE: /* constants, umin/umax and smin/smax checks would be * redundant in this case because they all should match
*/ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value != t2.value; if (!tnum_overlap(t1, t2)) return 1; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 1; if (smin1 > smax2 || smax1 < smin2) return 1; if (!is_jmp32) { /* if 64-bit ranges are inconclusive, see if we can * utilize 32-bit subrange knowledge to eliminate * branches that can't be taken a priori
*/ if (reg1->u32_min_value > reg2->u32_max_value ||
reg1->u32_max_value < reg2->u32_min_value) return 1; if (reg1->s32_min_value > reg2->s32_max_value ||
reg1->s32_max_value < reg2->s32_min_value) return 1;
} break; case BPF_JSET: if (!is_reg_const(reg2, is_jmp32)) {
swap(reg1, reg2);
swap(t1, t2);
} if (!is_reg_const(reg2, is_jmp32)) return -1; if ((~t1.mask & t1.value) & t2.value) return 1; if (!((t1.mask | t1.value) & t2.value)) return 0; break; case BPF_JGT: if (umin1 > umax2) return 1; elseif (umax1 <= umin2) return 0; break; case BPF_JSGT: if (smin1 > smax2) return 1; elseif (smax1 <= smin2) return 0; break; case BPF_JLT: if (umax1 < umin2) return 1; elseif (umin1 >= umax2) return 0; break; case BPF_JSLT: if (smax1 < smin2) return 1; elseif (smin1 >= smax2) return 0; break; case BPF_JGE: if (umin1 >= umax2) return 1; elseif (umax1 < umin2) return 0; break; case BPF_JSGE: if (smin1 >= smax2) return 1; elseif (smax1 < smin2) return 0; break; case BPF_JLE: if (umax1 <= umin2) return 1; elseif (umin1 > umax2) return 0; break; case BPF_JSLE: if (smax1 <= smin2) return 1; elseif (smin1 > smax2) return 0; break;
}
return -1;
}
/* Return the jump opcode to use for "b <op'> a" so that it tests the same
 * condition as "a <op> b", i.e. the opcode that is valid after the two
 * operands of a comparison have been swapped.
 *
 * @opcode: BPF jump operation; only its upper nibble (opcode >> 4) is used
 *          to index the lookup table.
 *
 * Operations without an explicit entry in the table map to 0, because the
 * remaining elements of a static array are zero-initialized.
 */
static int flip_opcode(u32 opcode)
{
	/* How can we transform "a <op> b" into "b <op> a"? */
	static const u8 opcode_flip[16] = {
		/* these stay the same */
		[BPF_JEQ  >> 4] = BPF_JEQ,
		[BPF_JNE  >> 4] = BPF_JNE,
		[BPF_JSET >> 4] = BPF_JSET,
		/* these swap "lesser" and "greater" (L and G in the opcodes) */
		[BPF_JGE  >> 4] = BPF_JLE,
		[BPF_JGT  >> 4] = BPF_JLT,
		[BPF_JLE  >> 4] = BPF_JGE,
		[BPF_JLT  >> 4] = BPF_JGT,
		[BPF_JSGE >> 4] = BPF_JSLE,
		[BPF_JSGT >> 4] = BPF_JSLT,
		[BPF_JSLE >> 4] = BPF_JSGE,
		[BPF_JSLT >> 4] = BPF_JSGT
	};

	return opcode_flip[opcode >> 4];
}
switch (opcode) { case BPF_JLE: /* pkt <= pkt_end */
fallthrough; case BPF_JGT: /* pkt > pkt_end */ if (pkt->range == BEYOND_PKT_END) /* pkt has at last one extra byte beyond pkt_end */ return opcode == BPF_JGT; break; case BPF_JLT: /* pkt < pkt_end */
fallthrough; case BPF_JGE: /* pkt >= pkt_end */ if (pkt->range == BEYOND_PKT_END || pkt->range == AT_PKT_END) return opcode == BPF_JGE; break;
} return -1;
}
/* compute branch direction of the expression "if (<reg1> opcode <reg2>) goto target;"
 * and return:
 *  1 - branch will be taken and "goto target" will be executed
 *  0 - branch will not be taken and fall-through to next insn
 * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value
 *      range [0,10]
 *
 * Three cases are handled, in this order:
 *  - both registers are packet pointers (64-bit compare only): delegated to
 *    is_pkt_ptr_branch_taken();
 *  - at least one register is a pointer: only a JEQ/JNE test of a pointer
 *    that reg_not_null() accepts against the constant 0 can be decided;
 *  - otherwise: delegated to is_scalar_branch_taken().
 */
static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
			   u8 opcode, bool is_jmp32)
{
	if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32)
		return is_pkt_ptr_branch_taken(reg1, reg2, opcode);

	if (__is_pointer_value(false, reg1) || __is_pointer_value(false, reg2)) {
		u64 val;

		/* arrange that reg2 is a scalar, and reg1 is a pointer */
		if (!is_reg_const(reg2, is_jmp32)) {
			/* operands are swapped, so the comparison must be
			 * mirrored as well
			 */
			opcode = flip_opcode(opcode);
			swap(reg1, reg2);
		}
		/* and ensure that reg2 is a constant */
		if (!is_reg_const(reg2, is_jmp32))
			return -1;

		if (!reg_not_null(reg1))
			return -1;

		/* If pointer is valid tests against zero will fail so we can
		 * use this to direct branch taken.
		 */
		val = reg_const_value(reg2, is_jmp32);
		if (val != 0)
			return -1;

		switch (opcode) {
		case BPF_JEQ:
			return 0;
		case BPF_JNE:
			return 1;
		default:
			return -1;
		}
	}

	/* now deal with two scalars, but not necessarily constants */
	return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);
}
/* Map a conditional jump opcode to the opcode describing the *false*
 * (fall-through) outcome of the same test.
 * E.g., when "r1 < r2" does not hold, "r1 >= r2" does, so BPF_JLT maps to
 * BPF_JGE.
 *
 * Opcodes that are not listed below yield 0.
 */
static u8 rev_opcode(u8 opcode)
{
	u8 negated = 0;

	switch (opcode) {
	/* equality */
	case BPF_JEQ:
		negated = BPF_JNE;
		break;
	case BPF_JNE:
		negated = BPF_JEQ;
		break;
	/* BPF has no opcode for the negation of JSET, so the BPF_X flag is
	 * OR-ed in to stand for "reverse of JSET"; reversing that value
	 * again gives back plain JSET.
	 */
	case BPF_JSET:
		negated = BPF_JSET | BPF_X;
		break;
	case BPF_JSET | BPF_X:
		negated = BPF_JSET;
		break;
	/* unsigned ordering */
	case BPF_JGE:
		negated = BPF_JLT;
		break;
	case BPF_JGT:
		negated = BPF_JLE;
		break;
	case BPF_JLE:
		negated = BPF_JGT;
		break;
	case BPF_JLT:
		negated = BPF_JGE;
		break;
	/* signed ordering */
	case BPF_JSGE:
		negated = BPF_JSLT;
		break;
	case BPF_JSGT:
		negated = BPF_JSLE;
		break;
	case BPF_JSLE:
		negated = BPF_JSGT;
		break;
	case BPF_JSLT:
		negated = BPF_JSGE;
		break;
	default:
		break;
	}

	return negated;
}
/* In case of GE/GT/SGE/JST, reuse LE/LT/SLE/SLT logic from below */ switch (opcode) { case BPF_JGE: case BPF_JGT: case BPF_JSGE: case BPF_JSGT:
opcode = flip_opcode(opcode);
swap(reg1, reg2); break; default: break;
}
reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off);
reg2->var_off = reg1->var_off;
} break; case BPF_JNE: if (!is_reg_const(reg2, is_jmp32))
swap(reg1, reg2); if (!is_reg_const(reg2, is_jmp32)) break;
/* try to recompute the bound of reg1 if reg2 is a const and * is exactly the edge of reg1.
*/
val = reg_const_value(reg2, is_jmp32); if (is_jmp32) { /* u32_min_value is not equal to 0xffffffff at this point, * because otherwise u32_max_value is 0xffffffff as well, * in such a case both reg1 and reg2 would be constants, * jump would be predicted and reg_set_min_max() won't * be called. * * Same reasoning works for all {u,s}{min,max}{32,64} cases * below.
*/ if (reg1->u32_min_value == (u32)val)
reg1->u32_min_value++; if (reg1->u32_max_value == (u32)val)
reg1->u32_max_value--; if (reg1->s32_min_value == (s32)val)
reg1->s32_min_value++; if (reg1->s32_max_value == (s32)val)
reg1->s32_max_value--;
} else { if (reg1->umin_value == (u64)val)
reg1->umin_value++; if (reg1->umax_value == (u64)val)
reg1->umax_value--; if (reg1->smin_value == (s64)val)
reg1->smin_value++; if (reg1->smax_value == (s64)val)
reg1->smax_value--;
} break; case BPF_JSET: if (!is_reg_const(reg2, is_jmp32))
swap(reg1, reg2); if (!is_reg_const(reg2, is_jmp32)) break;
val = reg_const_value(reg2, is_jmp32); /* BPF_JSET (i.e., TRUE branch, *not* BPF_JSET | BPF_X) * requires single bit to learn something useful. E.g., if we * know that `r1 & 0x3` is true, then which bits (0, 1, or both) * are actually set? We can learn something definite only if * it's a single-bit value to begin with. * * BPF_JSET | BPF_X (i.e., negation of BPF_JSET) doesn't have * this restriction. I.e., !(r1 & 0x3) means neither bit 0 nor * bit 1 is set, which we can readily use in adjustments.
*/ if (!is_power_of_2(val)) break; if (is_jmp32) {
t = tnum_or(tnum_subreg(reg1->var_off), tnum_const(val));
reg1->var_off = tnum_with_subreg(reg1->var_off, t);
} else {
reg1->var_off = tnum_or(reg1->var_off, tnum_const(val));
} break; case BPF_JSET | BPF_X: /* reverse of BPF_JSET, see rev_opcode() */ if (!is_reg_const(reg2, is_jmp32))
swap(reg1, reg2); if (!is_reg_const(reg2, is_jmp32)) break;
val = reg_const_value(reg2, is_jmp32); /* Forget the ranges before narrowing tnums, to avoid invariant * violations if we're on a dead branch.
*/
__mark_reg_unbounded(reg1); if (is_jmp32) {
t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val));
reg1->var_off = tnum_with_subreg(reg1->var_off, t);
} else {
reg1->var_off = tnum_and(reg1->var_off, tnum_const(~val));
} break; case BPF_JLE: if (is_jmp32) {
reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
} else {
reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
reg2->umin_value = max(reg1->umin_value, reg2->umin_value);
} break; case BPF_JLT: if (is_jmp32) {
reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1);
reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value);
} else {
reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1);
reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value);
} break; case BPF_JSLE: if (is_jmp32) {
reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
} else {
reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
reg2->smin_value = max(reg1->smin_value, reg2->smin_value);
} break; case BPF_JSLT: if (is_jmp32) {
reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1);
reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value);
} else {
reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1);
reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
} break; default: return;
}
}
/* Adjusts the register min/max values in the case that the dst_reg and * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K * check, in which case we have a fake SCALAR_VALUE representing insn->imm). * Technically we can do similar adjustments for pointers to the same object, * but we don't support that right now.
*/ staticint reg_set_min_max(struct bpf_verifier_env *env, struct bpf_reg_state *true_reg1, struct bpf_reg_state *true_reg2, struct bpf_reg_state *false_reg1, struct bpf_reg_state *false_reg2,
u8 opcode, bool is_jmp32)
{ int err;
/* If either register is a pointer, we can't learn anything about its * variable offset from the compare (unless they were a pointer into * the same object, but we don't bother with that).
*/ if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) return 0;
/* Resolve one "pointer or NULL" register after a NULL check.
 *
 * @state:   function state the register belongs to (not used in this body)
 * @reg:     register (or spilled register) to update
 * @id:      id of the pointer that was NULL-checked; @reg is only touched
 *           when its type may be NULL and its id equals @id
 * @is_null: true when updating the branch where the pointer is NULL
 *
 * In the NULL branch the register becomes a SCALAR_VALUE with id and
 * ref_obj_id cleared. In the non-NULL branch it is passed to
 * mark_ptr_not_null_reg() and its id is cleared unless
 * reg_may_point_to_spin_lock() reports that the id is still needed.
 */
static void mark_ptr_or_null_reg(struct bpf_func_state *state,
				 struct bpf_reg_state *reg, u32 id,
				 bool is_null)
{
	if (type_may_be_null(reg->type) && reg->id == id &&
	    (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) {
		/* Old offset (both fixed and variable parts) should have been
		 * known-zero, because we don't allow pointer arithmetic on
		 * pointers that might be NULL. If we see this happening, don't
		 * convert the register.
		 *
		 * But in some cases, some helpers that return local kptrs
		 * advance offset for the returned pointer. In those cases, it
		 * is fine to expect to see reg->off.
		 */
		if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
				 !tnum_equals_const(reg->var_off, 0)))
			return;
		if (!(type_is_ptr_alloc_obj(reg->type) ||
		      type_is_non_owning_ref(reg->type)) &&
		    WARN_ON_ONCE(reg->off))
			return;

		if (is_null) {
			reg->type = SCALAR_VALUE;
			/* We don't need id and ref_obj_id from this point
			 * onwards anymore, thus we should better reset it,
			 * so that state pruning has chances to take effect.
			 */
			reg->id = 0;
			reg->ref_obj_id = 0;

			return;
		}

		mark_ptr_not_null_reg(reg);

		if (!reg_may_point_to_spin_lock(reg)) {
			/* For not-NULL ptr, reg->ref_obj_id will be reset
			 * in release_reference().
			 *
			 * reg->id is still used by spin_lock ptr. Other
			 * than spin_lock ptr type, reg->id can be reset.
			 */
			reg->id = 0;
		}
	}
}
/* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point.
*/ staticvoid mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null)
{ struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *reg;
u32 ref_obj_id = regs[regno].ref_obj_id;
u32 id = regs[regno].id;
if (ref_obj_id && ref_obj_id == id && is_null) /* regs[regno] is in the " == NULL" branch. * No one could have freed the reference state before * doing the NULL check.
*/
WARN_ON_ONCE(release_reference_nomark(vstate, id));
/* For all R being scalar registers or spilled scalar registers
 * in verifier state, save R in linked_regs if R->id == id.
 * If there are too many Rs sharing same id, reset id for leftover Rs.
 *
 * @vstate:      verifier state whose frames are scanned, from the current
 *               frame down to frame 0
 * @id:          id to look for; the BPF_ADD_CONST flag is masked off
 *               before the search
 * @linked_regs: collection handed to __collect_linked_regs() for every
 *               candidate
 *
 * The matching and the overflow handling described above live in
 * __collect_linked_regs(); this function only enumerates candidates:
 * registers R0..R(BPF_REG_FP - 1) and every spilled stack slot of each
 * frame.
 */
static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id,
				struct linked_regs *linked_regs)
{
	struct bpf_func_state *func;
	struct bpf_reg_state *reg;
	int i, j;

	id = id & ~BPF_ADD_CONST;
	for (i = vstate->curframe; i >= 0; i--) {
		func = vstate->frame[i];
		/* last argument: true for a register, false for a stack slot */
		for (j = 0; j < BPF_REG_FP; j++) {
			reg = &func->regs[j];
			__collect_linked_regs(linked_regs, reg, id, i, j, true);
		}
		for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
			if (!is_spilled_reg(&func->stack[j]))
				continue;
			reg = &func->stack[j].spilled_ptr;
			__collect_linked_regs(linked_regs, reg, id, i, j, false);
		}
	}
}
/* For all R in linked_regs, copy known_reg range into R * if R->id == known_reg->id.
*/ staticvoid sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg, struct linked_regs *linked_regs)
{ struct bpf_reg_state fake_reg; struct bpf_reg_state *reg; struct linked_reg *e; int i;
for (i = 0; i < linked_regs->cnt; ++i) {
e = &linked_regs->entries[i];
reg = e->is_reg ? &vstate->frame[e->frameno]->regs[e->regno]
: &vstate->frame[e->frameno]->stack[e->spi].spilled_ptr; if (reg->type != SCALAR_VALUE || reg == known_reg) continue; if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST)) continue; if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) ||
reg->off == known_reg->off) {
s32 saved_subreg_def = reg->subreg_def;
if (dst_reg->type == PTR_TO_STACK)
insn_flags |= INSN_F_DST_REG_STACK;
}
if (insn_flags) {
err = push_jmp_history(env, this_branch, insn_flags, 0); if (err) return err;
}
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); if (pred >= 0) { /* If we get here with a dst_reg pointer type it is because * above is_branch_taken() special cased the 0 comparison.
*/ if (!__is_pointer_value(false, dst_reg))
err = mark_chain_precision(env, insn->dst_reg); if (BPF_SRC(insn->code) == BPF_X && !err &&
!__is_pointer_value(false, src_reg))
err = mark_chain_precision(env, insn->src_reg); if (err) return err;
}
if (pred == 1) { /* Only follow the goto, ignore fall-through. If needed, push * the fall-through branch for simulation under speculative * execution.
*/ if (!env->bypass_spec_v1 &&
!sanitize_speculative_path(env, insn, *insn_idx + 1,
*insn_idx)) return -EFAULT; if (env->log.level & BPF_LOG_LEVEL)
print_insn_state(env, this_branch, this_branch->curframe);
*insn_idx += insn->off; return 0;
} elseif (pred == 0) { /* Only follow the fall-through branch, since that's where the * program will go. If needed, push the goto branch for * simulation under speculative execution.
*/ if (!env->bypass_spec_v1 &&
!sanitize_speculative_path(env, insn,
*insn_idx + insn->off + 1,
*insn_idx)) return -EFAULT; if (env->log.level & BPF_LOG_LEVEL)
print_insn_state(env, this_branch, this_branch->curframe); return 0;
}
/* Push scalar registers sharing same ID to jump history, * do this before creating 'other_branch', so that both * 'this_branch' and 'other_branch' share this history * if parent state is created.
*/ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id)
collect_linked_regs(this_branch, src_reg->id, &linked_regs); if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
collect_linked_regs(this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) {
err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); if (err) return err;
}
if (BPF_SRC(insn->code) == BPF_X) {
err = reg_set_min_max(env,
&other_branch_regs[insn->dst_reg],
&other_branch_regs[insn->src_reg],
dst_reg, src_reg, opcode, is_jmp32);
} else/* BPF_SRC(insn->code) == BPF_K */ { /* reg_set_min_max() can mangle the fake_reg. Make a copy * so that these are two different memory locations. The * src_reg is not used beyond here in context of K.
*/
memcpy(&env->fake_reg[1], &env->fake_reg[0], sizeof(env->fake_reg[0]));
err = reg_set_min_max(env,
&other_branch_regs[insn->dst_reg],
&env->fake_reg[0],
dst_reg, &env->fake_reg[1],
opcode, is_jmp32);
} if (err) return err;
/* if one pointer register is compared to another pointer * register check if PTR_MAYBE_NULL could be lifted. * E.g. register A - maybe null * register B - not null * for JNE A, B, ... - A is not null in the false branch; * for JEQ A, B, ... - A is not null in the true branch. * * Since PTR_TO_BTF_ID points to a kernel struct that does * not need to be null checked by the BPF program, i.e., * could be null even without PTR_MAYBE_NULL marking, so * only propagate nullness when neither reg is that type.
*/ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X &&
__is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) &&
type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) &&
base_type(src_reg->type) != PTR_TO_BTF_ID &&
base_type(dst_reg->type) != PTR_TO_BTF_ID) {
eq_branch_regs = NULL; switch (opcode) { case BPF_JEQ:
eq_branch_regs = other_branch_regs; break; case BPF_JNE:
eq_branch_regs = regs; break; default: /* do nothing */ break;
} if (eq_branch_regs) { if (type_may_be_null(src_reg->type))
mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]); else
mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]);
}
}
/* detect if R == 0 where R is returned from bpf_map_lookup_elem(). * NOTE: these optimizations below are related with pointer comparison * which will never be JMP32.
*/ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
type_may_be_null(dst_reg->type)) { /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional.
*/
mark_ptr_or_null_regs(this_branch, insn->dst_reg,
opcode == BPF_JNE);
mark_ptr_or_null_regs(other_branch, insn->dst_reg,
opcode == BPF_JEQ);
} elseif (!try_match_pkt_pointers(insn, dst_reg, ®s[insn->src_reg],
this_branch, other_branch) &&
is_pointer_value(env, insn->dst_reg)) {
verbose(env, "R%d pointer comparison prohibited\n",
insn->dst_reg); return -EACCES;
} if (env->log.level & BPF_LOG_LEVEL)
print_insn_state(env, this_branch, this_branch->curframe); return 0;
}
/* All special src_reg cases are listed below. From this point onwards * we either succeed and assign a corresponding dst_reg->type after * zeroing the offset, or fail and reject the program.
*/
mark_reg_known_zero(env, regs, insn->dst_reg);
if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { if (map->map_type == BPF_MAP_TYPE_ARENA) {
__mark_reg_unknown(env, dst_reg); return 0;
}
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
WARN_ON_ONCE(map->max_entries != 1); /* We want reg->id to be same (0) as map_value is not distinct */
} elseif (insn->src_reg == BPF_PSEUDO_MAP_FD ||
insn->src_reg == BPF_PSEUDO_MAP_IDX) {
dst_reg->type = CONST_PTR_TO_MAP;
} else {
verifier_bug(env, "unexpected src reg value for ldimm64"); return -EFAULT;
}
return 0;
}
staticbool may_access_skb(enum bpf_prog_type type)
{ switch (type) { case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: returntrue; default: returnfalse;
}
}
/* verify safety of LD_ABS|LD_IND instructions: * - they can only appear in the programs where ctx == skb * - since they are wrappers of function calls, they scratch R1-R5 registers, * preserve R6-R9, and store return value into R0 * * Implicit input: * ctx == skb == R6 == CTX * * Explicit input: * SRC == any register * IMM == 32-bit immediate * * Output: * R0 - 8/16/32-bit skb data converted to cpu endianness
*/ staticint check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{ struct bpf_reg_state *regs = cur_regs(env); staticconstint ctx_reg = BPF_REG_6;
u8 mode = BPF_MODE(insn->code); int i, err;
if (!may_access_skb(resolve_prog_type(env->prog))) {
verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL;
}
if (!env->ops->gen_ld_abs) {
verifier_bug(env, "gen_ld_abs is null"); return -EFAULT;
}
/* check whether implicit source operand (register R6) is readable */
err = check_reg_arg(env, ctx_reg, SRC_OP); if (err) return err;
/* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as * gen_ld_abs() may terminate the program at runtime, leading to * reference leak.
*/
err = check_resource_leak(env, false, true, "BPF_LD_[ABS|IND]"); if (err) return err;
if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL;
}
/* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
/* mark destination R0 register as readable, since it contains * the value fetched from the packet. * Already marked as written above.
*/
mark_reg_unknown(env, regs, BPF_REG_0); /* ld_abs load up to 32-bit skb data. */
regs[BPF_REG_0].subreg_def = env->insn_idx + 1; return 0;
}
/* LSM and struct_ops func-ptr's return type could be "void" */ if (!is_subprog || frame->in_exception_callback_fn) { switch (prog_type) { case BPF_PROG_TYPE_LSM: if (prog->expected_attach_type == BPF_LSM_CGROUP) /* See below, can be 0 or 0-1 depending on hook. */ break; if (!prog->aux->attach_func_proto->type) return 0; break; case BPF_PROG_TYPE_STRUCT_OPS: if (!prog->aux->attach_func_proto->type) return 0;
if (frame->in_exception_callback_fn) break;
/* Allow a struct_ops program to return a referenced kptr if it * matches the operator's return type and is in its unmodified * form. A scalar zero (i.e., a null pointer) is also allowed.
*/
reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL;
ret_type = btf_type_resolve_ptr(prog->aux->attach_btf,
prog->aux->attach_func_proto->type,
NULL); if (ret_type && ret_type == reg_type && reg->ref_obj_id) return __check_ptr_off_reg(env, reg, regno, false); break; default: break;
}
}
/* eBPF calling convention is such that R0 is used * to return the value from eBPF program. * Make sure that it's readable at this time * of bpf_exit, which means that program wrote
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.918 Sekunden
(vorverarbeitet am 2026-04-25)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.