staticbool is_imm8(int value)
{ return value <= 127 && value >= -128;
}
/* * Let us limit the positive offset to be <= 123. * This is to ensure eventual jit convergence For the following patterns: * ... * pass4, final_proglen=4391: * ... * 20e: 48 85 ff test rdi,rdi * 211: 74 7d je 0x290 * 213: 48 8b 77 00 mov rsi,QWORD PTR [rdi+0x0] * ... * 289: 48 85 ff test rdi,rdi * 28c: 74 17 je 0x2a5 * 28e: e9 7f ff ff ff jmp 0x212 * 293: bf 03 00 00 00 mov edi,0x3 * Note that insn at 0x211 is 2-byte cond jump insn for offset 0x7d (-125) * and insn at 0x28e is 5-byte jmp insn with offset -129. * * pass5, final_proglen=4392: * ... * 20e: 48 85 ff test rdi,rdi * 211: 0f 84 80 00 00 00 je 0x297 * 217: 48 8b 77 00 mov rsi,QWORD PTR [rdi+0x0] * ... * 28d: 48 85 ff test rdi,rdi * 290: 74 1a je 0x2ac * 292: eb 84 jmp 0x218 * 294: bf 03 00 00 00 mov edi,0x3 * Note that insn at 0x211 is 6-byte cond jump insn now since its offset * becomes 0x80 based on previous round (0x293 - 0x213 = 0x80). * At the same time, insn at 0x292 is a 2-byte insn since its offset is * -124. * * pass6 will repeat the same code as in pass4 and this will prevent * eventual convergence. * * To fix this issue, we need to break je (2->6 bytes) <-> jmp (5->2 bytes) * cycle in the above. In the above example je offset <= 0x7c should work. * * For other cases, je <-> je needs offset <= 0x7b to avoid no convergence * issue. For jmp <-> je and jmp <-> jmp cases, jmp offset <= 0x7c should * avoid no convergence issue. * * Overall, let us limit the positive offset for 8bit cond/uncond jmp insn * to maximum 123 (0x7b). This way, the jit pass can eventually converge.
*/ staticbool is_imm8_jmp_offset(int value)
{ return value <= 123 && value >= -128;
}
staticbool is_simm32(s64 value)
{ return value == (s64)(s32)value;
}
staticbool is_uimm32(u64 value)
{ return value == (u64)(u32)value;
}
/* mov dst, src */ #define EMIT_mov(DST, SRC) \ do { \ if (DST != SRC) \
EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
} while (0)
struct jit_context { int cleanup_addr; /* Epilogue code offset */
/* * Program specific offsets of labels in the code; these rely on the * JIT doing at least 2 passes, recording the position on the first * pass, only to generate the correct offset on the second pass.
*/ int tail_call_direct_label; int tail_call_indirect_label;
};
/* Maximum number of bytes emitted while JITing one eBPF insn */ #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64
/* Number of bytes emit_patch() needs to generate instructions */ #define X86_PATCH_SIZE 5 /* Number of bytes that will be skipped on tailcall */ #define X86_TAIL_CALL_OFFSET (12 + ENDBR_INSN_SIZE)
if (callee_regs_used[3])
EMIT2(0x41, 0x5F); /* pop r15 */ if (callee_regs_used[2])
EMIT2(0x41, 0x5E); /* pop r14 */ if (callee_regs_used[1])
EMIT2(0x41, 0x5D); /* pop r13 */ if (callee_regs_used[0])
EMIT1(0x5B); /* pop rbx */
*pprog = prog;
}
staticvoid emit_nops(u8 **pprog, int len)
{
u8 *prog = *pprog; int i, noplen;
while (len > 0) {
noplen = len;
if (noplen > ASM_NOP_MAX)
noplen = ASM_NOP_MAX;
for (i = 0; i < noplen; i++)
EMIT1(x86_nops[noplen][i]);
len -= noplen;
}
*pprog = prog;
}
/* * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT * in arch/x86/kernel/alternative.c
*/ staticint emit_call(u8 **prog, void *func, void *ip);
if (!is_subprog) { /* cmp rax, MAX_TAIL_CALL_CNT */
EMIT4(0x48, 0x83, 0xF8, MAX_TAIL_CALL_CNT);
EMIT2(X86_JA, 6); /* ja 6 */ /* rax is tail_call_cnt if <= MAX_TAIL_CALL_CNT. * case1: entry of main prog. * case2: tail callee of main prog.
*/
EMIT1(0x50); /* push rax */ /* Make rax as tail_call_cnt_ptr. */
EMIT3(0x48, 0x89, 0xE0); /* mov rax, rsp */
EMIT2(0xEB, 1); /* jmp 1 */ /* rax is tail_call_cnt_ptr if > MAX_TAIL_CALL_CNT. * case: tail callee of subprog.
*/
EMIT1(0x50); /* push rax */ /* push tail_call_cnt_ptr */
EMIT1(0x50); /* push rax */
} else { /* is_subprog */ /* rax is tail_call_cnt_ptr. */
EMIT1(0x50); /* push rax */
EMIT1(0x50); /* push rax */
}
*pprog = prog;
}
/* * Emit x86-64 prologue code for BPF program. * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes * while jumping to another program
*/ staticvoid emit_prologue(u8 **pprog, u8 *ip, u32 stack_depth, bool ebpf_from_cbpf, bool tail_call_reachable, bool is_subprog, bool is_exception_cb)
{
u8 *prog = *pprog;
if (is_subprog) {
emit_cfi(&prog, ip, cfi_bpf_subprog_hash, 5);
} else {
emit_cfi(&prog, ip, cfi_bpf_hash, 1);
} /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later
*/
emit_nops(&prog, X86_PATCH_SIZE); if (!ebpf_from_cbpf) { if (tail_call_reachable && !is_subprog) /* When it's the entry of the whole tailcall context, * zeroing rax means initialising tail_call_cnt.
*/
EMIT3(0x48, 0x31, 0xC0); /* xor rax, rax */ else /* Keep the same instruction layout. */
emit_nops(&prog, 3); /* nop3 */
} /* Exception callback receives FP as third parameter */ if (is_exception_cb) {
EMIT3(0x48, 0x89, 0xF4); /* mov rsp, rsi */
EMIT3(0x48, 0x89, 0xD5); /* mov rbp, rdx */ /* The main frame must have exception_boundary as true, so we * first restore those callee-saved regs from stack, before * reusing the stack frame.
*/
pop_callee_regs(&prog, all_callee_regs_used);
pop_r12(&prog); /* Reset the stack frame. */
EMIT3(0x48, 0x89, 0xEC); /* mov rsp, rbp */
} else {
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
}
/* X86_TAIL_CALL_OFFSET is here */
EMIT_ENDBR();
/* sub rsp, rounded_stack_depth */ if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8)); if (tail_call_reachable)
emit_prologue_tail_call(&prog, is_subprog);
*pprog = prog;
}
/* * if (prog == NULL) * goto out;
*/
EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */
offset = ctx->tail_call_indirect_label - (prog + 2 - start);
EMIT2(X86_JE, offset); /* je out */
/* Inc tail_call_cnt if the slot is populated. */
EMIT4(0x48, 0x83, 0x00, 0x01); /* add qword ptr [rax], 1 */
if (bpf_prog->aux->exception_boundary) {
pop_callee_regs(&prog, all_callee_regs_used);
pop_r12(&prog);
} else {
pop_callee_regs(&prog, callee_regs_used); if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
pop_r12(&prog);
}
/* Pop tail_call_cnt_ptr. */
EMIT1(0x58); /* pop rax */ /* Pop tail_call_cnt, if it's main prog. * Pop tail_call_cnt_ptr, if it's subprog.
*/
EMIT1(0x58); /* pop rax */ if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, /* add rsp, sd */
round_up(stack_depth, 8));
/* Inc tail_call_cnt if the slot is populated. */
EMIT4(0x48, 0x83, 0x00, 0x01); /* add qword ptr [rax], 1 */
if (bpf_prog->aux->exception_boundary) {
pop_callee_regs(&prog, all_callee_regs_used);
pop_r12(&prog);
} else {
pop_callee_regs(&prog, callee_regs_used); if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
pop_r12(&prog);
}
/* Pop tail_call_cnt_ptr. */
EMIT1(0x58); /* pop rax */ /* Pop tail_call_cnt, if it's main prog. * Pop tail_call_cnt_ptr, if it's subprog.
*/
EMIT1(0x58); /* pop rax */ if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
if (is_uimm32(imm64)) { /* * For emitting plain u32, where sign bit must not be * propagated LLVM tends to load imm64 over mov32 * directly, so save couple of bytes by just doing * 'mov %eax, imm32' instead.
*/
emit_mov_imm32(&prog, false, dst_reg, imm32_lo);
} elseif (is_simm32(imm64)) {
emit_mov_imm32(&prog, true, dst_reg, imm32_lo);
} else { /* movabsq rax, imm64 */
EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
EMIT(imm32_lo, 4);
EMIT(imm32_hi, 4);
}
/* Emit the suffix (ModR/M etc) for addressing *(ptr_reg + off) and val_reg */ staticvoid emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
{
u8 *prog = *pprog;
if (is_imm8(off)) { /* 1-byte signed displacement. * * If off == 0 we could skip this and save one extra byte, but * special case of x86 R13 which always needs an offset is not * worth the hassle
*/
EMIT2(add_2reg(0x40, ptr_reg, val_reg), off);
} else { /* 4-byte signed displacement */
EMIT1_off32(add_2reg(0x80, ptr_reg, val_reg), off);
}
*pprog = prog;
}
/* * Emit a REX byte if it will be necessary to address these registers
*/ staticvoid maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64)
{
u8 *prog = *pprog;
/* jump over faulting load and clear dest register */ if (reg != DONT_CLEAR)
*(unsignedlong *)((void *)regs + reg) = 0;
regs->ip += x->fixup & 0xff; returntrue;
}
staticvoid detect_reg_usage(struct bpf_insn *insn, int insn_cnt, bool *regs_used)
{ int i;
for (i = 1; i <= insn_cnt; i++, insn++) { if (insn->dst_reg == BPF_REG_6 || insn->src_reg == BPF_REG_6)
regs_used[0] = true; if (insn->dst_reg == BPF_REG_7 || insn->src_reg == BPF_REG_7)
regs_used[1] = true; if (insn->dst_reg == BPF_REG_8 || insn->src_reg == BPF_REG_8)
regs_used[2] = true; if (insn->dst_reg == BPF_REG_9 || insn->src_reg == BPF_REG_9)
regs_used[3] = true;
}
}
/* emit the 3-byte VEX prefix * * r: same as rex.r, extra bit for ModRM reg field * x: same as rex.x, extra bit for SIB index field * b: same as rex.b, extra bit for ModRM r/m, or SIB base * m: opcode map select, encoding escape bytes e.g. 0x0f38 * w: same as rex.w (32 bit or 64 bit) or opcode specific * src_reg2: additional source reg (encoded as BPF reg) * l: vector length (128 bit or 256 bit) or reserved * pp: opcode prefix (none, 0x66, 0xf2 or 0xf3)
*/ staticvoid emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m, bool w, u8 src_reg2, bool l, u8 pp)
{
u8 *prog = *pprog; const u8 b0 = 0xc4; /* first byte of 3-byte VEX prefix */
u8 b1, b2;
u8 vvvv = reg2hex[src_reg2];
/* reg2hex gives only the lower 3 bit of vvvv */ if (is_ereg(src_reg2))
vvvv |= 1 << 3;
if (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP)) { /* The clearing sequence clobbers eax and ecx. */
EMIT1(0x50); /* push rax */
EMIT1(0x51); /* push rcx */
ip += 2;
func = (u8 *)clear_bhb_loop;
ip += x86_call_depth_emit_accounting(&prog, func, ip);
if (emit_call(&prog, func, ip)) return -EINVAL;
EMIT1(0x59); /* pop rcx */
EMIT1(0x58); /* pop rax */
} /* Insert IBHF instruction */ if ((cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP) &&
cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) ||
cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_HW)) { /* * Add an Indirect Branch History Fence (IBHF). IBHF acts as a * fence preventing branch history from before the fence from * affecting indirect branches after the fence. This is * specifically used in cBPF jitted code to prevent Intra-mode * BHI attacks. The IBHF instruction is designed to be a NOP on * hardware that doesn't need or support it. The REP and REX.W * prefixes are required by the microcode, and they also ensure * that the NOP is unlikely to be used in existing code. * * IBHF is not a valid instruction in 32-bit mode.
*/
EMIT5(0xF3, 0x48, 0x0F, 0x1E, 0xF8); /* ibhf */
}
*pprog = prog; return 0;
}
emit_prologue(&prog, image, stack_depth,
bpf_prog_was_classic(bpf_prog), tail_call_reachable,
bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb); /* Exception callback will clobber callee regs for its own use, and * restore the original callee regs from main prog's stack frame.
*/ if (bpf_prog->aux->exception_boundary) { /* We also need to save r12, which is not mapped to any BPF * register, as we throw after entry into the kernel, which may * overwrite r12.
*/
push_r12(&prog);
push_callee_regs(&prog, all_callee_regs_used);
} else { if (arena_vm_start)
push_r12(&prog);
push_callee_regs(&prog, callee_regs_used);
} if (arena_vm_start)
emit_mov_imm64(&prog, X86_REG_R12,
arena_vm_start >> 32, (u32) arena_vm_start);
if (priv_frame_ptr)
emit_priv_frame_ptr(&prog, priv_frame_ptr);
/* test dst_reg32, dst_reg32; check if lower 32-bit are zero */
maybe_emit_mod(&prog, dst_reg, dst_reg, false);
EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg));
if (excnt >= bpf_prog->aux->num_exentries) {
pr_err("ex gen bug\n"); return -EFAULT;
}
ex = &bpf_prog->aux->extable[excnt++];
delta = _insn - (u8 *)&ex->insn; if (!is_simm32(delta)) {
pr_err("extable->insn doesn't fit into 32-bit\n"); return -EFAULT;
} /* switch ex to rw buffer for writes */
ex = (void *)rw_image + ((void *)ex - (void *)image);
ex->insn = delta;
ex->data = EX_TYPE_BPF;
if (dst_reg > BPF_REG_9) {
pr_err("verifier error\n"); return -EFAULT;
} /* * Compute size of x86 insn and its target dest x86 register. * ex_handler_bpf() will use lower 8 bits to adjust * pt_regs->ip to jump over this x86 instruction * and upper bits to figure out which pt_regs to zero out. * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" * of 4 bytes will be ignored and rbx will be zero inited.
*/
ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8);
} break;
case BPF_STX | BPF_ATOMIC | BPF_B: case BPF_STX | BPF_ATOMIC | BPF_H: if (!bpf_atomic_is_load_store(insn)) {
pr_err("bpf_jit: 1- and 2-byte RMW atomics are not supported\n"); return -EFAULT;
}
fallthrough; case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: if (insn->imm == (BPF_AND | BPF_FETCH) ||
insn->imm == (BPF_OR | BPF_FETCH) ||
insn->imm == (BPF_XOR | BPF_FETCH)) { bool is64 = BPF_SIZE(insn->code) == BPF_DW;
u32 real_src_reg = src_reg;
u32 real_dst_reg = dst_reg;
u8 *branch_target;
/* * Can't be implemented with a single x86 insn. * Need to do a CMPXCHG loop.
*/
/* Will need RAX as a CMPXCHG operand so save R0 */
emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); if (src_reg == BPF_REG_0)
real_src_reg = BPF_REG_AX; if (dst_reg == BPF_REG_0)
real_dst_reg = BPF_REG_AX;
branch_target = prog; /* Load old value */
emit_ldx(&prog, BPF_SIZE(insn->code),
BPF_REG_0, real_dst_reg, insn->off); /* * Perform the (commutative) operation locally, * put the result in the AUX_REG.
*/
emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0);
maybe_emit_mod(&prog, AUX_REG, real_src_reg, is64);
EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)],
add_2reg(0xC0, AUX_REG, real_src_reg)); /* Attempt to swap in new value */
err = emit_atomic_rmw(&prog, BPF_CMPXCHG,
real_dst_reg, AUX_REG,
insn->off,
BPF_SIZE(insn->code)); if (WARN_ON(err)) return err; /* * ZF tells us whether we won the race. If it's * cleared we need to try again.
*/
EMIT2(X86_JNE, -(prog - branch_target) - 2); /* Return the pre-modification value */
emit_mov_reg(&prog, is64, real_src_reg, BPF_REG_0); /* Restore R0 after clobbering RAX */
emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); break;
}
emit_cond_jmp: /* Convert BPF opcode to x86 */ switch (BPF_OP(insn->code)) { case BPF_JEQ:
jmp_cond = X86_JE; break; case BPF_JSET: case BPF_JNE:
jmp_cond = X86_JNE; break; case BPF_JGT: /* GT is unsigned '>', JA in x86 */
jmp_cond = X86_JA; break; case BPF_JLT: /* LT is unsigned '<', JB in x86 */
jmp_cond = X86_JB; break; case BPF_JGE: /* GE is unsigned '>=', JAE in x86 */
jmp_cond = X86_JAE; break; case BPF_JLE: /* LE is unsigned '<=', JBE in x86 */
jmp_cond = X86_JBE; break; case BPF_JSGT: /* Signed '>', GT in x86 */
jmp_cond = X86_JG; break; case BPF_JSLT: /* Signed '<', LT in x86 */
jmp_cond = X86_JL; break; case BPF_JSGE: /* Signed '>=', GE in x86 */
jmp_cond = X86_JGE; break; case BPF_JSLE: /* Signed '<=', LE in x86 */
jmp_cond = X86_JLE; break; default: /* to silence GCC warning */ return -EFAULT;
}
jmp_offset = addrs[i + insn->off] - addrs[i]; if (is_imm8_jmp_offset(jmp_offset)) { if (jmp_padding) { /* To keep the jmp_offset valid, the extra bytes are * padded before the jump insn, so we subtract the * 2 bytes of jmp_cond insn from INSN_SZ_DIFF. * * If the previous pass already emits an imm8 * jmp_cond, then this BPF insn won't shrink, so * "nops" is 0. * * On the other hand, if the previous pass emits an * imm32 jmp_cond, the extra 4 bytes(*) is padded to * keep the image from shrinking further. * * (*) imm32 jmp_cond is 6 bytes, and imm8 jmp_cond * is 2 bytes, so the size difference is 4 bytes.
*/
nops = INSN_SZ_DIFF - 2; if (nops != 0 && nops != 4) {
pr_err("unexpected jmp_cond padding: %d bytes\n",
nops); return -EFAULT;
}
emit_nops(&prog, nops);
}
EMIT2(jmp_cond, jmp_offset);
} elseif (is_simm32(jmp_offset)) {
EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
} else {
pr_err("cond_jmp gen bug %llx\n", jmp_offset); return -EFAULT;
}
break;
case BPF_JMP | BPF_JA: case BPF_JMP32 | BPF_JA: if (BPF_CLASS(insn->code) == BPF_JMP) { if (insn->off == -1) /* -1 jmp instructions will always jump * backwards two bytes. Explicitly handling * this case avoids wasting too many passes * when there are long sequences of replaced * dead code.
*/
jmp_offset = -2; else
jmp_offset = addrs[i + insn->off] - addrs[i];
} else { if (insn->imm == -1)
jmp_offset = -2; else
jmp_offset = addrs[i + insn->imm] - addrs[i];
}
if (!jmp_offset) { /* * If jmp_padding is enabled, the extra nops will * be inserted. Otherwise, optimize out nop jumps.
*/ if (jmp_padding) { /* There are 3 possible conditions. * (1) This BPF_JA is already optimized out in * the previous run, so there is no need * to pad any extra byte (0 byte). * (2) The previous pass emits an imm8 jmp, * so we pad 2 bytes to match the previous * insn size. * (3) Similarly, the previous pass emits an * imm32 jmp, and 5 bytes is padded.
*/
nops = INSN_SZ_DIFF; if (nops != 0 && nops != 2 && nops != 5) {
pr_err("unexpected nop jump padding: %d bytes\n",
nops); return -EFAULT;
}
emit_nops(&prog, nops);
} break;
}
emit_jmp: if (is_imm8_jmp_offset(jmp_offset)) { if (jmp_padding) { /* To avoid breaking jmp_offset, the extra bytes * are padded before the actual jmp insn, so * 2 bytes is subtracted from INSN_SZ_DIFF. * * If the previous pass already emits an imm8 * jmp, there is nothing to pad (0 byte). * * If it emits an imm32 jmp (5 bytes) previously * and now an imm8 jmp (2 bytes), then we pad * (5 - 2 = 3) bytes to stop the image from * shrinking further.
*/
nops = INSN_SZ_DIFF - 2; if (nops != 0 && nops != 3) {
pr_err("unexpected jump padding: %d bytes\n",
nops); return -EFAULT;
}
emit_nops(&prog, INSN_SZ_DIFF - 2);
}
EMIT2(0xEB, jmp_offset);
} elseif (is_simm32(jmp_offset)) {
EMIT1_off32(0xE9, jmp_offset);
} else {
pr_err("jmp gen bug %llx\n", jmp_offset); return -EFAULT;
} break;
default: /* * By design x86-64 JIT should support all BPF instructions. * This error will be seen if new instruction was added * to the interpreter, but not to the JIT, or if there is * junk in bpf_prog.
*/
pr_err("bpf_jit: unknown opcode %02x\n", insn->code); return -EINVAL;
}
if (image) { /* * When populating the image, assert that: * * i) We do not write beyond the allocated space, and * ii) addrs[i] did not change from the prior run, in order * to validate assumptions made for computing branch * displacements.
*/ if (unlikely(proglen + ilen > oldproglen ||
proglen + ilen != addrs[i])) {
pr_err("bpf_jit: fatal error\n"); return -EFAULT;
}
memcpy(rw_image + proglen, temp, ilen);
}
proglen += ilen;
addrs[i] = proglen;
prog = temp;
}
if (image && excnt != bpf_prog->aux->num_exentries) {
pr_err("extable is not populated\n"); return -EFAULT;
} return proglen;
}
staticvoid clean_stack_garbage(conststruct btf_func_model *m,
u8 **pprog, int nr_stack_slots, int stack_size)
{ int arg_size, off;
u8 *prog;
/* Generally speaking, the compiler will pass the arguments * on-stack with "push" instruction, which will take 8-byte * on the stack. In this case, there won't be garbage values * while we copy the arguments from origin stack frame to current * in BPF_DW. * * However, sometimes the compiler will only allocate 4-byte on * the stack for the arguments. For now, this case will only * happen if there is only one argument on-stack and its size * not more than 4 byte. In this case, there will be garbage * values on the upper 4-byte where we store the argument on * current stack frame. * * arguments on origin stack: * * stack_arg_1(4-byte) xxx(4-byte) * * what we copy: * * stack_arg_1(8-byte): stack_arg_1(origin) xxx * * and the xxx is the garbage values which we should clean here.
*/ if (nr_stack_slots != 1) return;
/* the size of the last argument */
arg_size = m->arg_size[m->nr_args - 1]; if (arg_size <= 4) {
off = -(stack_size - 4);
prog = *pprog; /* mov DWORD PTR [rbp + off], 0 */ if (!is_imm8(off))
EMIT2_off32(0xC7, 0x85, off); else
EMIT3(0xC7, 0x45, off);
EMIT(0, 4);
*pprog = prog;
}
}
/* get the count of the regs that are used to pass arguments */ staticint get_nr_used_regs(conststruct btf_func_model *m)
{ int i, arg_regs, nr_used_regs = 0;
for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) {
arg_regs = (m->arg_size[i] + 7) / 8; if (nr_used_regs + arg_regs <= 6)
nr_used_regs += arg_regs;
if (nr_used_regs >= 6) break;
}
return nr_used_regs;
}
staticvoid save_args(conststruct btf_func_model *m, u8 **prog, int stack_size, bool for_call_origin)
{ int arg_regs, first_off = 0, nr_regs = 0, nr_stack_slots = 0; int i, j;
/* Store function arguments to stack. * For a function that accepts two pointers the sequence will be: * mov QWORD PTR [rbp-0x10],rdi * mov QWORD PTR [rbp-0x8],rsi
*/ for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) {
arg_regs = (m->arg_size[i] + 7) / 8;
/* According to the research of Yonghong, struct members * should be all in register or all on the stack. * Meanwhile, the compiler will pass the argument on regs * if the remaining regs can hold the argument. * * Disorder of the args can happen. For example: * * struct foo_struct { * long a; * int b; * }; * int foo(char, char, char, char, char, struct foo_struct, * char); * * the arg1-5,arg7 will be passed by regs, and arg6 will * by stack.
*/ if (nr_regs + arg_regs > 6) { /* copy function arguments from origin stack frame * into current stack frame. * * The starting address of the arguments on-stack * is: * rbp + 8(push rbp) + * 8(return addr of origin call) + * 8(return addr of the caller) * which means: rbp + 24
*/ for (j = 0; j < arg_regs; j++) {
emit_ldx(prog, BPF_DW, BPF_REG_0, BPF_REG_FP,
nr_stack_slots * 8 + 0x18);
emit_stx(prog, BPF_DW, BPF_REG_FP, BPF_REG_0,
-stack_size);
if (!nr_stack_slots)
first_off = stack_size;
stack_size -= 8;
nr_stack_slots++;
}
} else { /* Only copy the arguments on-stack to current * 'stack_size' and ignore the regs, used to * prepare the arguments on-stack for origin call.
*/ if (for_call_origin) {
nr_regs += arg_regs; continue;
}
/* copy the arguments from regs into stack */ for (j = 0; j < arg_regs; j++) {
emit_stx(prog, BPF_DW, BPF_REG_FP,
nr_regs == 5 ? X86_REG_R9 : BPF_REG_1 + nr_regs,
-stack_size);
stack_size -= 8;
nr_regs++;
}
}
}
staticvoid restore_regs(conststruct btf_func_model *m, u8 **prog, int stack_size)
{ int i, j, arg_regs, nr_regs = 0;
/* Restore function arguments from stack. * For a function that accepts two pointers the sequence will be: * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10] * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8] * * The logic here is similar to what we do in save_args()
*/ for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) {
arg_regs = (m->arg_size[i] + 7) / 8; if (nr_regs + arg_regs <= 6) { for (j = 0; j < arg_regs; j++) {
emit_ldx(prog, BPF_DW,
nr_regs == 5 ? X86_REG_R9 : BPF_REG_1 + nr_regs,
BPF_REG_FP,
-stack_size);
stack_size -= 8;
nr_regs++;
}
} else {
stack_size -= 8 * arg_regs;
}
if (emit_rsb_call(&prog, bpf_trampoline_enter(p), image + (prog - (u8 *)rw_image))) return -EINVAL; /* remember prog start time returned by __bpf_prog_enter */
emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
/* if (__bpf_prog_enter*(prog) == 0) * goto skip_exec_of_prog;
*/
EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ /* emit 2 nops that will be replaced with JE insn */
jmp_insn = prog;
emit_nops(&prog, 2);
/* arg1: lea rdi, [rbp - stack_size] */ if (!is_imm8(-stack_size))
EMIT3_off32(0x48, 0x8D, 0xBD, -stack_size); else
EMIT4(0x48, 0x8D, 0x7D, -stack_size); /* arg2: progs[i]->insnsi for interpreter */ if (!p->jited)
emit_mov_imm64(&prog, BPF_REG_2,
(long) p->insnsi >> 32,
(u32) (long) p->insnsi); /* call JITed bpf program or interpreter */ if (emit_rsb_call(&prog, p->bpf_func, image + (prog - (u8 *)rw_image))) return -EINVAL;
/* * BPF_TRAMP_MODIFY_RETURN trampolines can modify the return * of the previous call which is then passed on the stack to * the next BPF program. * * BPF_TRAMP_FENTRY trampoline may need to return the return * value of BPF_PROG_TYPE_STRUCT_OPS prog.
*/ if (save_ret)
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
/* replace 2 nops with JE insn, since jmp target is known */
jmp_insn[0] = X86_JE;
jmp_insn[1] = prog - jmp_insn - 2;
offset = func - (ip + 2 + 4); if (!is_simm32(offset)) {
pr_err("Target %p is out of range\n", func); return -EINVAL;
}
EMIT2_off32(0x0F, jmp_cond + 0x10, offset);
*pprog = prog; return 0;
}
staticint invoke_bpf(conststruct btf_func_model *m, u8 **pprog, struct bpf_tramp_links *tl, int stack_size, int run_ctx_off, bool save_ret, void *image, void *rw_image)
{ int i;
u8 *prog = *pprog;
for (i = 0; i < tl->nr_links; i++) { if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size,
run_ctx_off, save_ret, image, rw_image)) return -EINVAL;
}
*pprog = prog; return 0;
}
staticint invoke_bpf_mod_ret(conststruct btf_func_model *m, u8 **pprog, struct bpf_tramp_links *tl, int stack_size, int run_ctx_off, u8 **branches, void *image, void *rw_image)
{
u8 *prog = *pprog; int i;
/* The first fmod_ret program will receive a garbage return value. * Set this to 0 to avoid confusing the program.
*/
emit_mov_imm32(&prog, false, BPF_REG_0, 0);
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); for (i = 0; i < tl->nr_links; i++) { if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true,
image, rw_image)) return -EINVAL;
/* Save the location of the branch and Generate 6 nops * (4 bytes for an offset and 2 bytes for the jump) These nops * are replaced with a conditional jump once do_fexit (i.e. the * start of the fexit invocation) is finalized.
*/
branches[i] = prog;
emit_nops(&prog, 4 + 2);
}
/* Example: * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); * its 'struct btf_func_model' will be nr_args=2 * The assembly code when eth_type_trans is executing after trampoline: * * push rbp * mov rbp, rsp * sub rsp, 16 // space for skb and dev * push rbx // temp regs to pass start time * mov qword ptr [rbp - 16], rdi // save skb pointer to stack * mov qword ptr [rbp - 8], rsi // save dev pointer to stack * call __bpf_prog_enter // rcu_read_lock and preempt_disable * mov rbx, rax // remember start time in bpf stats are enabled * lea rdi, [rbp - 16] // R1==ctx of bpf prog * call addr_of_jited_FENTRY_prog * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off * mov rsi, rbx // prog start time * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math * mov rdi, qword ptr [rbp - 16] // restore skb pointer from stack * mov rsi, qword ptr [rbp - 8] // restore dev pointer from stack * pop rbx * leave * ret * * eth_type_trans has 5 byte nop at the beginning. These 5 bytes will be * replaced with 'call generated_bpf_trampoline'. When it returns * eth_type_trans will continue executing with original skb and dev pointers. * * The assembly code when eth_type_trans is called from trampoline: * * push rbp * mov rbp, rsp * sub rsp, 24 // space for skb, dev, return value * push rbx // temp regs to pass start time * mov qword ptr [rbp - 24], rdi // save skb pointer to stack * mov qword ptr [rbp - 16], rsi // save dev pointer to stack * call __bpf_prog_enter // rcu_read_lock and preempt_disable * mov rbx, rax // remember start time if bpf stats are enabled * lea rdi, [rbp - 24] // R1==ctx of bpf prog * call addr_of_jited_FENTRY_prog // bpf prog can access skb and dev * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off * mov rsi, rbx // prog start time * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math * mov rdi, qword ptr [rbp - 24] // restore skb pointer from stack * mov rsi, qword ptr [rbp - 16] // restore dev pointer from stack * call eth_type_trans+5 // execute body of eth_type_trans * mov qword ptr [rbp - 8], rax // save return value * call __bpf_prog_enter // rcu_read_lock and preempt_disable * mov rbx, rax // remember start time in bpf stats are enabled * lea rdi, [rbp - 24] // R1==ctx of bpf prog * call addr_of_jited_FEXIT_prog // bpf prog can access skb, dev, return value * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off * mov rsi, rbx // prog start time * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math * mov rax, qword ptr [rbp - 8] // restore eth_type_trans's return value * pop rbx * leave * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller
*/ staticint __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image, void *rw_image_end, void *image, conststruct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr)
{ int i, ret, nr_regs = m->nr_args, stack_size = 0; int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off; struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; void *orig_call = func_addr;
u8 **branches = NULL;
u8 *prog; bool save_ret;
/* * F_INDIRECT is only compatible with F_RET_FENTRY_RET, it is * explicitly incompatible with F_CALL_ORIG | F_SKIP_FRAME | F_IP_ARG * because @func_addr.
*/
WARN_ON_ONCE((flags & BPF_TRAMP_F_INDIRECT) &&
(flags & ~(BPF_TRAMP_F_INDIRECT | BPF_TRAMP_F_RET_FENTRY_RET)));
/* extra registers for struct arguments */ for (i = 0; i < m->nr_args; i++) { if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
nr_regs += (m->arg_size[i] + 7) / 8 - 1;
}
/* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6 * are passed through regs, the remains are through stack.
*/ if (nr_regs > MAX_BPF_FUNC_ARGS) return -ENOTSUPP;
/* room for return value of orig_call or fentry prog */
save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET); if (save_ret)
stack_size += 8;
if (nr_regs > 6 && (flags & BPF_TRAMP_F_CALL_ORIG)) { /* the space that used to pass arguments on-stack */
stack_size += (nr_regs - get_nr_used_regs(m)) * 8; /* make sure the stack pointer is 16-byte aligned if we * need pass arguments on stack, which means * [stack_size + 8(rbp) + 8(rip) + 8(origin rip)] * should be 16-byte aligned. Following code depend on * that stack_size is already 8-byte aligned.
*/
stack_size += (stack_size % 16) ? 0 : 8;
}
arg_stack_off = stack_size;
if (flags & BPF_TRAMP_F_SKIP_FRAME) { /* skip patched call instruction and point orig_call to actual * body of the kernel function.
*/ if (is_endbr(orig_call))
orig_call += ENDBR_INSN_SIZE;
orig_call += X86_PATCH_SIZE;
}
prog = rw_image;
if (flags & BPF_TRAMP_F_INDIRECT) { /* * Indirect call for bpf_struct_ops
*/
emit_cfi(&prog, image,
cfi_get_func_hash(func_addr),
cfi_get_func_arity(func_addr));
} else { /* * Direct-call fentry stub, as such it needs accounting for the * __fentry__ call.
*/
x86_call_depth_emit_accounting(&prog, NULL, image);
}
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ if (!is_imm8(stack_size)) { /* sub rsp, stack_size */
EMIT3_off32(0x48, 0x81, 0xEC, stack_size);
} else { /* sub rsp, stack_size */
EMIT4(0x48, 0x83, 0xEC, stack_size);
} if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
EMIT1(0x50); /* push rax */ /* mov QWORD PTR [rbp - rbx_off], rbx */
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off);
/* Store number of argument registers of the traced function: * mov rax, nr_regs * mov QWORD PTR [rbp - nregs_off], rax
*/
emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_regs);
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -nregs_off);
if (flags & BPF_TRAMP_F_IP_ARG) { /* Store IP address of the traced function: * movabsq rax, func_addr * mov QWORD PTR [rbp - ip_off], rax
*/
emit_mov_imm64(&prog, BPF_REG_0, (long) func_addr >> 32, (u32) (long) func_addr);
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
}
save_args(m, &prog, regs_off, false);
if (flags & BPF_TRAMP_F_CALL_ORIG) { /* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); if (emit_rsb_call(&prog, __bpf_tramp_enter,
image + (prog - (u8 *)rw_image))) {
ret = -EINVAL; goto cleanup;
}
}
if (fentry->nr_links) { if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off,
flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image)) return -EINVAL;
}
if (fmod_ret->nr_links) {
branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *),
GFP_KERNEL); if (!branches) return -ENOMEM;
if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off,
run_ctx_off, branches, image, rw_image)) {
ret = -EINVAL; goto cleanup;
}
}
if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) { /* Before calling the original function, load the * tail_call_cnt_ptr from stack to rax.
*/
LOAD_TRAMP_TAIL_CALL_CNT_PTR(stack_size);
}
if (flags & BPF_TRAMP_F_ORIG_STACK) {
emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8);
EMIT2(0xff, 0xd3); /* call *rbx */
} else { /* call original function */ if (emit_rsb_call(&prog, orig_call, image + (prog - (u8 *)rw_image))) {
ret = -EINVAL; goto cleanup;
}
} /* remember return value in a stack for bpf prog to access */
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
im->ip_after_call = image + (prog - (u8 *)rw_image);
emit_nops(&prog, X86_PATCH_SIZE);
}
if (fmod_ret->nr_links) { /* From Intel 64 and IA-32 Architectures Optimization * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler * Coding Rule 11: All branch targets should be 16-byte * aligned.
*/
emit_align(&prog, 16); /* Update the branches saved in invoke_bpf_mod_ret with the * aligned address of do_fexit.
*/ for (i = 0; i < fmod_ret->nr_links; i++) {
emit_cond_near_jump(&branches[i], image + (prog - (u8 *)rw_image),
image + (branches[i] - (u8 *)rw_image), X86_JNE);
}
}
if (fexit->nr_links) { if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, false, image, rw_image)) {
ret = -EINVAL; goto cleanup;
}
}
if (flags & BPF_TRAMP_F_RESTORE_REGS)
restore_regs(m, &prog, regs_off);
/* This needs to be done regardless. If there were fmod_ret programs, * the return value is only updated on the stack and still needs to be * restored to R0.
*/ if (flags & BPF_TRAMP_F_CALL_ORIG) {
im->ip_epilogue = image + (prog - (u8 *)rw_image); /* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); if (emit_rsb_call(&prog, __bpf_tramp_exit, image + (prog - (u8 *)rw_image))) {
ret = -EINVAL; goto cleanup;
}
} elseif (flags & BPF_TRAMP_F_TAIL_CALL_CTX) { /* Before running the original function, load the * tail_call_cnt_ptr from stack to rax.
*/
LOAD_TRAMP_TAIL_CALL_CNT_PTR(stack_size);
}
/* restore return value of orig_call or fentry prog back into RAX */ if (save_ret)
emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off);
EMIT1(0xC9); /* leave */ if (flags & BPF_TRAMP_F_SKIP_FRAME) { /* skip our return address and return to parent */
EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
}
emit_return(&prog, image + (prog - (u8 *)rw_image)); /* Make sure the trampoline generation logic doesn't overflow */ if (WARN_ON_ONCE(prog > (u8 *)rw_image_end - BPF_INSN_SAFETY)) {
ret = -EFAULT; goto cleanup;
}
ret = prog - (u8 *)rw_image + BPF_INSN_SAFETY;
/* rw_image doesn't need to be in module memory range, so we can * use kvmalloc.
*/
rw_image = kvmalloc(size, GFP_KERNEL); if (!rw_image) return -ENOMEM;
ret = __arch_prepare_bpf_trampoline(im, rw_image, rw_image + size, image, m,
flags, tlinks, func_addr); if (ret < 0) goto out;
tmp = bpf_arch_text_copy(image, rw_image, size); if (IS_ERR(tmp))
ret = PTR_ERR(tmp);
out:
kvfree(rw_image); return ret;
}
int arch_bpf_trampoline_size(conststruct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr)
{ struct bpf_tramp_image im; void *image; int ret;
/* Allocate a temporary buffer for __arch_prepare_bpf_trampoline(). * This will NOT cause fragmentation in direct map, as we do not * call set_memory_*() on this buffer. * * We cannot use kvmalloc here, because we need image to be in * module memory range.
*/
image = bpf_jit_alloc_exec(PAGE_SIZE); if (!image) return -ENOMEM;
ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, image,
m, flags, tlinks, func_addr);
bpf_jit_free_exec(image); return ret;
}
staticint emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
{
u8 *jg_reloc, *prog = *pprog; int pivot, err, jg_bytes = 1;
s64 jg_offset;
if (a == b) { /* Leaf node of recursion, i.e. not a range of indices * anymore.
*/
EMIT1(add_1mod(0x48, BPF_REG_3)); /* cmp rdx,func */ if (!is_simm32(progs[a])) return -1;
EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3),
progs[a]);
err = emit_cond_near_jump(&prog, /* je func */
(void *)progs[a], image + (prog - buf),
X86_JE); if (err) return err;
/* Not a leaf node, so we pivot, and recursively descend into * the lower and upper ranges.
*/
pivot = (b - a) / 2;
EMIT1(add_1mod(0x48, BPF_REG_3)); /* cmp rdx,func */ if (!is_simm32(progs[a + pivot])) return -1;
EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), progs[a + pivot]);
tmp = bpf_jit_blind_constants(prog); /* * If blinding was requested and we failed during blinding, * we must fall back to the interpreter.
*/ if (IS_ERR(tmp)) return orig_prog; if (tmp != prog) {
tmp_blinded = true;
prog = tmp;
}
jit_data = prog->aux->jit_data; if (!jit_data) {
jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); if (!jit_data) {
prog = orig_prog; goto out;
}
prog->aux->jit_data = jit_data;
}
priv_stack_ptr = prog->aux->priv_stack_ptr; if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) { /* Allocate actual private stack size with verifier-calculated * stack size plus two memory guards to protect overflow and * underflow.
*/
priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) +
2 * PRIV_STACK_GUARD_SZ;
priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 8, GFP_KERNEL); if (!priv_stack_ptr) {
prog = orig_prog; goto out_priv_stack;
}
/* * Before first pass, make a rough estimation of addrs[] * each BPF instruction is translated to less than 64 bytes
*/ for (proglen = 0, i = 0; i <= prog->len; i++) {
proglen += 64;
addrs[i] = proglen;
}
ctx.cleanup_addr = proglen;
skip_init_addrs:
/* * JITed image shrinks with every pass and the loop iterates * until the image stops shrinking. Very large BPF programs * may converge on the last pass. In such case do one more * pass to emit the final image.
*/ for (pass = 0; pass < MAX_PASSES || image; pass++) { if (!padding && pass >= PADDING_PASSES)
padding = true;
proglen = do_jit(prog, addrs, image, rw_image, oldproglen, &ctx, padding); if (proglen <= 0) {
out_image:
image = NULL; if (header) {
bpf_arch_text_copy(&header->size, &rw_header->size, sizeof(rw_header->size));
bpf_jit_binary_pack_free(header, rw_header);
} /* Fall back to interpreter mode */
prog = orig_prog; if (extra_pass) {
prog->bpf_func = NULL;
prog->jited = 0;
prog->jited_len = 0;
} goto out_addrs;
} if (image) { if (proglen != oldproglen) {
pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
proglen, oldproglen); goto out_image;
} break;
} if (proglen == oldproglen) { /* * The number of entries in extable is the number of BPF_LDX * insns that access kernel memory via "pointer to BTF type". * The verifier changed their opcode from LDX|MEM|size * to LDX|PROBE_MEM|size to make JITing easier.
*/
u32 align = __alignof__(struct exception_table_entry);
u32 extable_size = prog->aux->num_exentries * sizeof(struct exception_table_entry);
if (bpf_jit_enable > 1)
bpf_jit_dump(prog->len, proglen, pass + 1, rw_image);
if (image) { if (!prog->is_func || extra_pass) { /* * bpf_jit_binary_pack_finalize fails in two scenarios: * 1) header is not pointing to proper module memory; * 2) the arch doesn't support bpf_arch_text_copy(). * * Both cases are serious bugs and justify WARN_ON.
*/ if (WARN_ON(bpf_jit_binary_pack_finalize(header, rw_header))) { /* header has been freed */
header = NULL; goto out_image;
}
bpf_tail_call_direct_fixup(prog);
} else {
jit_data->addrs = addrs;
jit_data->ctx = ctx;
jit_data->proglen = proglen;
jit_data->image = image;
jit_data->header = header;
jit_data->rw_header = rw_header;
} /* * ctx.prog_offset is used when CFI preambles put code *before* * the function. See emit_cfi(). For FineIBT specifically this code * can also be executed and bpf_prog_kallsyms_add() will * generate an additional symbol to cover this, hence also * decrement proglen.
*/
prog->bpf_func = (void *)image + cfi_get_offset();
prog->jited = 1;
prog->jited_len = proglen - cfi_get_offset();
} else {
prog = orig_prog;
}
/* * If we fail the final pass of JIT (from jit_subprogs), * the program may not be finalized yet. Call finalize here * before freeing it.
*/ if (jit_data) {
bpf_jit_binary_pack_finalize(jit_data->header,
jit_data->rw_header);
kvfree(jit_data->addrs);
kfree(jit_data);
}
prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
hdr = bpf_jit_binary_pack_hdr(prog);
bpf_jit_binary_pack_free(hdr, NULL);
priv_stack_ptr = prog->aux->priv_stack_ptr; if (priv_stack_ptr) {
priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) +
2 * PRIV_STACK_GUARD_SZ;
priv_stack_check_guard(priv_stack_ptr, priv_stack_alloc_sz, prog);
free_percpu(prog->aux->priv_stack_ptr);
}
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
}
bpf_prog_unlock_free(prog);
}
bool bpf_jit_supports_exceptions(void)
{ /* We unwind through both kernel frames (starting from within bpf_throw * call) and BPF frames. Therefore we require ORC unwinder to be enabled * to walk kernel frames and reach BPF frames in the stack trace.
*/ return IS_ENABLED(CONFIG_UNWINDER_ORC);
}
old_bypass_addr = old ? NULL : poke->bypass_addr;
old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
/* * On program loading or teardown, the program's kallsym entry * might not be in place, so we use __bpf_arch_text_poke to skip * the kallsyms check.
*/ if (new) {
ret = __bpf_arch_text_poke(poke->tailcall_target,
BPF_MOD_JUMP,
old_addr, new_addr);
BUG_ON(ret < 0); if (!old) {
ret = __bpf_arch_text_poke(poke->tailcall_bypass,
BPF_MOD_JUMP,
poke->bypass_addr,
NULL);
BUG_ON(ret < 0);
}
} else {
ret = __bpf_arch_text_poke(poke->tailcall_bypass,
BPF_MOD_JUMP,
old_bypass_addr,
poke->bypass_addr);
BUG_ON(ret < 0); /* let other CPUs finish the execution of program * so that it will not possible to expose them * to invalid nop, stack unwind, nop state
*/ if (!ret)
synchronize_rcu();
ret = __bpf_arch_text_poke(poke->tailcall_target,
BPF_MOD_JUMP,
old_addr, NULL);
BUG_ON(ret < 0);
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.