linux/arch/arm64/net/bpf_jit.h

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* BPF JIT compiler for ARM64
*
* Copyright (C) 2014-2016 Zi Shen Lim <zlim.lnx@gmail.com>
*/
#ifndef _BPF_JIT_H
#define _BPF_JIT_H
#include <asm/insn.h>
/* 5-bit Register Operand */
#define A64_R(x) AARCH64_INSN_REG_##x
#define A64_FP AARCH64_INSN_REG_FP
#define A64_LR AARCH64_INSN_REG_LR
#define A64_ZR AARCH64_INSN_REG_ZR
#define A64_SP AARCH64_INSN_REG_SP
#define A64_VARIANT(sf) \
((sf) ? AARCH64_INSN_VARIANT_64BIT : AARCH64_INSN_VARIANT_32BIT)
/* Compare & branch (immediate) */
#define A64_COMP_BRANCH(sf, Rt, offset, type) \
aarch64_insn_gen_comp_branch_imm(0, offset, Rt, A64_VARIANT(sf), \
AARCH64_INSN_BRANCH_COMP_##type)
#define A64_CBZ(sf, Rt, imm19) A64_COMP_BRANCH(sf, Rt, (imm19) << 2, ZERO)
#define A64_CBNZ(sf, Rt, imm19) A64_COMP_BRANCH(sf, Rt, (imm19) << 2, NONZERO)
/* Conditional branch (immediate) */
#define A64_COND_BRANCH(cond, offset) \
aarch64_insn_gen_cond_branch_imm(0, offset, cond)
#define A64_COND_EQ AARCH64_INSN_COND_EQ /* == */
#define A64_COND_NE AARCH64_INSN_COND_NE /* != */
#define A64_COND_CS AARCH64_INSN_COND_CS /* unsigned >= */
#define A64_COND_HI AARCH64_INSN_COND_HI /* unsigned > */
#define A64_COND_LS AARCH64_INSN_COND_LS /* unsigned <= */
#define A64_COND_CC AARCH64_INSN_COND_CC /* unsigned < */
#define A64_COND_GE AARCH64_INSN_COND_GE /* signed >= */
#define A64_COND_GT AARCH64_INSN_COND_GT /* signed > */
#define A64_COND_LE AARCH64_INSN_COND_LE /* signed <= */
#define A64_COND_LT AARCH64_INSN_COND_LT /* signed < */
#define A64_B_(cond, imm19) A64_COND_BRANCH(cond, (imm19) << 2)
/* Unconditional branch (immediate) */
#define A64_BRANCH(offset, type) aarch64_insn_gen_branch_imm(0, offset, \
AARCH64_INSN_BRANCH_##type)
#define A64_B(imm26) A64_BRANCH((imm26) << 2, NOLINK)
#define A64_BL(imm26) A64_BRANCH((imm26) << 2, LINK)
/* Unconditional branch (register) */
#define A64_BR(Rn) aarch64_insn_gen_branch_reg(Rn, AARCH64_INSN_BRANCH_NOLINK)
#define A64_BLR(Rn) aarch64_insn_gen_branch_reg(Rn, AARCH64_INSN_BRANCH_LINK)
#define A64_RET(Rn) aarch64_insn_gen_branch_reg(Rn, AARCH64_INSN_BRANCH_RETURN)
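/*
 * Usage sketch (illustrative, not part of the upstream header): the JIT core
 * in bpf_jit_comp.c is assumed to provide emit(u32 insn, struct jit_ctx *ctx)
 * for appending one encoded instruction, so a helper call and the program
 * epilogue come out roughly as:
 *
 *	emit(A64_BLR(tmp), ctx);	// call the address loaded into tmp
 *	...
 *	emit(A64_RET(A64_LR), ctx);	// return to the caller
 *
 * while conditional BPF jumps pair a compare (see A64_CMP below) with
 * A64_B_(A64_COND_xx, jmp_offset).
 */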
/* Load/store register (register offset) */
#define A64_LS_REG(Rt, Rn, Rm, size, type) \
aarch64_insn_gen_load_store_reg(Rt, Rn, Rm, \
AARCH64_INSN_SIZE_##size, \
AARCH64_INSN_LDST_##type##_REG_OFFSET)
#define A64_STRB(Wt, Xn, Xm) A64_LS_REG(Wt, Xn, Xm, 8, STORE)
#define A64_LDRB(Wt, Xn, Xm) A64_LS_REG(Wt, Xn, Xm, 8, LOAD)
#define A64_LDRSB(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 8, SIGNED_LOAD)
#define A64_STRH(Wt, Xn, Xm) A64_LS_REG(Wt, Xn, Xm, 16, STORE)
#define A64_LDRH(Wt, Xn, Xm) A64_LS_REG(Wt, Xn, Xm, 16, LOAD)
#define A64_LDRSH(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 16, SIGNED_LOAD)
#define A64_STR32(Wt, Xn, Xm) A64_LS_REG(Wt, Xn, Xm, 32, STORE)
#define A64_LDR32(Wt, Xn, Xm) A64_LS_REG(Wt, Xn, Xm, 32, LOAD)
#define A64_LDRSW(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 32, SIGNED_LOAD)
#define A64_STR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, STORE)
#define A64_LDR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, LOAD)
/* Load/store register (immediate offset) */
#define A64_LS_IMM(Rt, Rn, imm, size, type) \
aarch64_insn_gen_load_store_imm(Rt, Rn, imm, \
AARCH64_INSN_SIZE_##size, \
AARCH64_INSN_LDST_##type##_IMM_OFFSET)
#define A64_STRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, STORE)
#define A64_LDRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, LOAD)
#define A64_LDRSBI(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 8, SIGNED_LOAD)
#define A64_STRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, STORE)
#define A64_LDRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, LOAD)
#define A64_LDRSHI(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 16, SIGNED_LOAD)
#define A64_STR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, STORE)
#define A64_LDR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, LOAD)
#define A64_LDRSWI(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 32, SIGNED_LOAD)
#define A64_STR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, STORE)
#define A64_LDR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, LOAD)
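/*
 * Selection sketch (illustrative): for BPF_LDX/BPF_STX the JIT is assumed to
 * use the immediate-offset forms when the scaled offset fits in the encoding,
 * and otherwise to materialise the offset in a scratch register and use the
 * register-offset forms, e.g. for a 64-bit load:
 *
 *	if (offset_fits_imm)				// placeholder check
 *		emit(A64_LDR64I(dst, src, off), ctx);
 *	else {
 *		emit_a64_mov_i(1, tmp, off, ctx);	// tmp = off (helper assumed)
 *		emit(A64_LDR64(dst, src, tmp), ctx);
 *	}
 */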
/* LDR (literal) */
#define A64_LDR32LIT(Wt, offset) \
aarch64_insn_gen_load_literal(0, offset, Wt, false)
#define A64_LDR64LIT(Xt, offset) \
aarch64_insn_gen_load_literal(0, offset, Xt, true)
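/*
 * Usage sketch (illustrative): the literal forms back the PLT that the JIT
 * appends to each program so that a trampoline beyond the +/-128 MB BL range
 * can still be reached; conceptually:
 *
 *	ldr	x10, target	// A64_LDR64LIT with the byte offset of 'target'
 *	br	x10		// A64_BR(tmp)
 *   target:
 *	.quad	<trampoline or dummy_tramp>
 */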
/* Load/store register pair */
#define A64_LS_PAIR(Rt, Rt2, Rn, offset, ls, type) \
aarch64_insn_gen_load_store_pair(Rt, Rt2, Rn, offset, \
AARCH64_INSN_VARIANT_64BIT, \
AARCH64_INSN_LDST_##ls##_PAIR_##type)
/* Rn -= 16; Rn[0] = Rt; Rn[8] = Rt2; */
#define A64_PUSH(Rt, Rt2, Rn) A64_LS_PAIR(Rt, Rt2, Rn, -16, STORE, PRE_INDEX)
/* Rt = Rn[0]; Rt2 = Rn[8]; Rn += 16; */
#define A64_POP(Rt, Rt2, Rn) A64_LS_PAIR(Rt, Rt2, Rn, 16, LOAD, POST_INDEX)
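/*
 * Usage sketch (illustrative): the prologue/epilogue in bpf_jit_comp.c are
 * assumed to save and restore the frame record with these pair accessors:
 *
 *	emit(A64_PUSH(A64_FP, A64_LR, A64_SP), ctx);	// stp x29, x30, [sp, #-16]!
 *	emit(A64_MOV(1, A64_FP, A64_SP), ctx);		// mov x29, sp
 *	...
 *	emit(A64_POP(A64_FP, A64_LR, A64_SP), ctx);	// ldp x29, x30, [sp], #16
 */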
/* Load/store exclusive */
#define A64_SIZE(sf) \
((sf) ? AARCH64_INSN_SIZE_64 : AARCH64_INSN_SIZE_32)
#define A64_LSX(sf, Rt, Rn, Rs, type) \
aarch64_insn_gen_load_store_ex(Rt, Rn, Rs, A64_SIZE(sf), \
AARCH64_INSN_LDST_##type)
/* Rt = [Rn]; (atomic) */
#define A64_LDXR(sf, Rt, Rn) \
A64_LSX(sf, Rt, Rn, A64_ZR, LOAD_EX)
/* [Rn] = Rt; (atomic) Rs = [state] */
#define A64_STXR(sf, Rt, Rn, Rs) \
A64_LSX(sf, Rt, Rn, Rs, STORE_EX)
/* [Rn] = Rt (store release); (atomic) Rs = [state] */
#define A64_STLXR(sf, Rt, Rn, Rs) \
aarch64_insn_gen_load_store_ex(Rt, Rn, Rs, A64_SIZE(sf), \
AARCH64_INSN_LDST_STORE_REL_EX)
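/*
 * Usage sketch (taken from the BPF_XADD/BPF_ATOMIC lowering for CPUs without
 * LSE atomics): an atomic add becomes a load-exclusive/store-exclusive retry
 * loop, roughly (register names are placeholders):
 *
 *	emit(A64_LDXR(isdw, tmp2, addr), ctx);		// ldxr  tmp2, [addr]
 *	emit(A64_ADD(isdw, tmp2, tmp2, src), ctx);	// tmp2 += src
 *	emit(A64_STXR(isdw, tmp2, addr, tmp3), ctx);	// stxr  tmp3, tmp2, [addr]
 *	emit(A64_CBNZ(0, tmp3, -3), ctx);		// retry if the store failed
 */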
/* Load-acquire & store-release */
#define A64_LDAR(Rt, Rn, size) \
aarch64_insn_gen_load_acq_store_rel(Rt, Rn, AARCH64_INSN_SIZE_##size, \
AARCH64_INSN_LDST_LOAD_ACQ)
#define A64_STLR(Rt, Rn, size) \
aarch64_insn_gen_load_acq_store_rel(Rt, Rn, AARCH64_INSN_SIZE_##size, \
AARCH64_INSN_LDST_STORE_REL)
/* Rt = [Rn] (load acquire) */
#define A64_LDARB(Wt, Xn) A64_LDAR(Wt, Xn, 8)
#define A64_LDARH(Wt, Xn) A64_LDAR(Wt, Xn, 16)
#define A64_LDAR32(Wt, Xn) A64_LDAR(Wt, Xn, 32)
#define A64_LDAR64(Xt, Xn) A64_LDAR(Xt, Xn, 64)
/* [Rn] = Rt (store release) */
#define A64_STLRB(Wt, Xn) A64_STLR(Wt, Xn, 8)
#define A64_STLRH(Wt, Xn) A64_STLR(Wt, Xn, 16)
#define A64_STLR32(Wt, Xn) A64_STLR(Wt, Xn, 32)
#define A64_STLR64(Xt, Xn) A64_STLR(Xt, Xn, 64)
/*
* LSE atomics
*
* ST{ADD,CLR,SET,EOR} is simply encoded as an alias for
* LD{ADD,CLR,SET,EOR} with XZR as the destination register.
*/
#define A64_ST_OP(sf, Rn, Rs, op) \
aarch64_insn_gen_atomic_ld_op(A64_ZR, Rn, Rs, \
A64_SIZE(sf), AARCH64_INSN_MEM_ATOMIC_##op, \
AARCH64_INSN_MEM_ORDER_NONE)
/* [Rn] <op>= Rs */
#define A64_STADD(sf, Rn, Rs) A64_ST_OP(sf, Rn, Rs, ADD)
#define A64_STCLR(sf, Rn, Rs) A64_ST_OP(sf, Rn, Rs, CLR)
#define A64_STEOR(sf, Rn, Rs) A64_ST_OP(sf, Rn, Rs, EOR)
#define A64_STSET(sf, Rn, Rs) A64_ST_OP(sf, Rn, Rs, SET)
#define A64_LD_OP_AL(sf, Rt, Rn, Rs, op) \
aarch64_insn_gen_atomic_ld_op(Rt, Rn, Rs, \
A64_SIZE(sf), AARCH64_INSN_MEM_ATOMIC_##op, \
AARCH64_INSN_MEM_ORDER_ACQREL)
/* Rt = [Rn] (load acquire); [Rn] <op>= Rs (store release) */
#define A64_LDADDAL(sf, Rt, Rn, Rs) A64_LD_OP_AL(sf, Rt, Rn, Rs, ADD)
#define A64_LDCLRAL(sf, Rt, Rn, Rs) A64_LD_OP_AL(sf, Rt, Rn, Rs, CLR)
#define A64_LDEORAL(sf, Rt, Rn, Rs) A64_LD_OP_AL(sf, Rt, Rn, Rs, EOR)
#define A64_LDSETAL(sf, Rt, Rn, Rs) A64_LD_OP_AL(sf, Rt, Rn, Rs, SET)
/* Rt = [Rn] (load acquire); [Rn] = Rs (store release) */
#define A64_SWPAL(sf, Rt, Rn, Rs) A64_LD_OP_AL(sf, Rt, Rn, Rs, SWP)
/* Rs = CAS(Rn, Rs, Rt) (load acquire & store release) */
#define A64_CASAL(sf, Rt, Rn, Rs) \
aarch64_insn_gen_cas(Rt, Rn, Rs, A64_SIZE(sf), \
AARCH64_INSN_MEM_ORDER_ACQREL)
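/*
 * Mapping sketch (illustrative): with LSE atomics available, BPF atomic
 * operations are assumed to lower directly onto these encodings, e.g.:
 *
 *	emit(A64_STADD(isdw, addr, src), ctx);		// BPF_ADD, no fetch
 *	emit(A64_LDADDAL(isdw, src, addr, src), ctx);	// BPF_ADD | BPF_FETCH
 *	emit(A64_SWPAL(isdw, src, addr, src), ctx);	// BPF_XCHG
 *	emit(A64_CASAL(isdw, src, addr, r0), ctx);	// BPF_CMPXCHG (r0 = expected/old)
 */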
/* Add/subtract (immediate) */
#define A64_ADDSUB_IMM(sf, Rd, Rn, imm12, type) \
aarch64_insn_gen_add_sub_imm(Rd, Rn, imm12, \
A64_VARIANT(sf), AARCH64_INSN_ADSB_##type)
/* Rd = Rn OP imm12 */
#define A64_ADD_I(sf, Rd, Rn, imm12) A64_ADDSUB_IMM(sf, Rd, Rn, imm12, ADD)
#define A64_SUB_I(sf, Rd, Rn, imm12) A64_ADDSUB_IMM(sf, Rd, Rn, imm12, SUB)
#define A64_ADDS_I(sf, Rd, Rn, imm12) \
A64_ADDSUB_IMM(sf, Rd, Rn, imm12, ADD_SETFLAGS)
#define A64_SUBS_I(sf, Rd, Rn, imm12) \
A64_ADDSUB_IMM(sf, Rd, Rn, imm12, SUB_SETFLAGS)
/* Rn + imm12; set condition flags */
#define A64_CMN_I(sf, Rn, imm12) A64_ADDS_I(sf, A64_ZR, Rn, imm12)
/* Rn - imm12; set condition flags */
#define A64_CMP_I(sf, Rn, imm12) A64_SUBS_I(sf, A64_ZR, Rn, imm12)
/* Rd = Rn */
#define A64_MOV(sf, Rd, Rn) A64_ADD_I(sf, Rd, Rn, 0)
/* Bitfield move */
#define A64_BITFIELD(sf, Rd, Rn, immr, imms, type) \
aarch64_insn_gen_bitfield(Rd, Rn, immr, imms, \
A64_VARIANT(sf), AARCH64_INSN_BITFIELD_MOVE_##type)
/* Signed, with sign replication to left and zeros to right */
#define A64_SBFM(sf, Rd, Rn, ir, is) A64_BITFIELD(sf, Rd, Rn, ir, is, SIGNED)
/* Unsigned, with zeros to left and right */
#define A64_UBFM(sf, Rd, Rn, ir, is) A64_BITFIELD(sf, Rd, Rn, ir, is, UNSIGNED)
/* Rd = Rn << shift */
#define A64_LSL(sf, Rd, Rn, shift) ({ \
int sz = (sf) ? 64 : 32; \
A64_UBFM(sf, Rd, Rn, (unsigned)-(shift) % sz, sz - 1 - (shift)); \
})
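/*
 * Worked example (illustrative): A64_LSL(1, Rd, Rn, 8) computes
 * immr = (-8) % 64 = 56 and imms = 64 - 1 - 8 = 55, i.e. UBFM Rd, Rn, #56, #55,
 * which is the architectural alias for LSL Rd, Rn, #8.
 */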
/* Rd = Rn >> shift */
#define A64_LSR(sf, Rd, Rn, shift) A64_UBFM(sf, Rd, Rn, shift, (sf) ? 63 : 31)
/* Rd = Rn >> shift; signed */
#define A64_ASR(sf, Rd, Rn, shift) A64_SBFM(sf, Rd, Rn, shift, (sf) ? 63 : 31)
/* Zero extend */
#define A64_UXTH(sf, Rd, Rn) A64_UBFM(sf, Rd, Rn, 0, 15)
#define A64_UXTW(sf, Rd, Rn) A64_UBFM(sf, Rd, Rn, 0, 31)
/* Sign extend */
#define A64_SXTB(sf, Rd, Rn) A64_SBFM(sf, Rd, Rn, 0, 7)
#define A64_SXTH(sf, Rd, Rn) A64_SBFM(sf, Rd, Rn, 0, 15)
#define A64_SXTW(sf, Rd, Rn) A64_SBFM(sf, Rd, Rn, 0, 31)
/* Move wide (immediate) */
#define A64_MOVEW(sf, Rd, imm16, shift, type) \
aarch64_insn_gen_movewide(Rd, imm16, shift, \
A64_VARIANT(sf), AARCH64_INSN_MOVEWIDE_##type)
/* Rd = Zeros (for MOVZ);
* Rd |= imm16 << shift (where shift is {0, 16, 32, 48});
* Rd = ~Rd; (for MOVN); */
#define A64_MOVN(sf, Rd, imm16, shift) A64_MOVEW(sf, Rd, imm16, shift, INVERSE)
#define A64_MOVZ(sf, Rd, imm16, shift) A64_MOVEW(sf, Rd, imm16, shift, ZERO)
#define A64_MOVK(sf, Rd, imm16, shift) A64_MOVEW(sf, Rd, imm16, shift, KEEP)
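/*
 * Usage sketch (illustrative): a 64-bit immediate is assumed to be built from
 * one MOVZ (or MOVN) followed by MOVKs for the remaining non-zero 16-bit
 * chunks, e.g. for an arbitrary u64 'imm':
 *
 *	emit(A64_MOVZ(1, reg, imm & 0xffff, 0), ctx);
 *	emit(A64_MOVK(1, reg, (imm >> 16) & 0xffff, 16), ctx);
 *	emit(A64_MOVK(1, reg, (imm >> 32) & 0xffff, 32), ctx);
 *	emit(A64_MOVK(1, reg, (imm >> 48) & 0xffff, 48), ctx);
 */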
/* Add/subtract (shifted register) */
#define A64_ADDSUB_SREG(sf, Rd, Rn, Rm, type) \
aarch64_insn_gen_add_sub_shifted_reg(Rd, Rn, Rm, 0, \
A64_VARIANT(sf), AARCH64_INSN_ADSB_##type)
/* Rd = Rn OP Rm */
#define A64_ADD(sf, Rd, Rn, Rm) A64_ADDSUB_SREG(sf, Rd, Rn, Rm, ADD)
#define A64_SUB(sf, Rd, Rn, Rm) A64_ADDSUB_SREG(sf, Rd, Rn, Rm, SUB)
#define A64_SUBS(sf, Rd, Rn, Rm) A64_ADDSUB_SREG(sf, Rd, Rn, Rm, SUB_SETFLAGS)
/* Rd = -Rm */
#define A64_NEG(sf, Rd, Rm) A64_SUB(sf, Rd, A64_ZR, Rm)
/* Rn - Rm; set condition flags */
#define A64_CMP(sf, Rn, Rm) A64_SUBS(sf, A64_ZR, Rn, Rm)
/* Data-processing (1 source) */
#define A64_DATA1(sf, Rd, Rn, type) aarch64_insn_gen_data1(Rd, Rn, \
A64_VARIANT(sf), AARCH64_INSN_DATA1_##type)
/* Rd = BSWAPx(Rn) */
#define A64_REV16(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, REVERSE_16)
#define A64_REV32(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, REVERSE_32)
#define A64_REV64(Rd, Rn) A64_DATA1(1, Rd, Rn, REVERSE_64)
/* Data-processing (2 source) */
/* Rd = Rn OP Rm */
#define A64_DATA2(sf, Rd, Rn, Rm, type) aarch64_insn_gen_data2(Rd, Rn, Rm, \
A64_VARIANT(sf), AARCH64_INSN_DATA2_##type)
#define A64_UDIV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, UDIV)
#define A64_SDIV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, SDIV)
#define A64_LSLV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, LSLV)
#define A64_LSRV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, LSRV)
#define A64_ASRV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, ASRV)
/* Data-processing (3 source) */
/* Rd = Ra + Rn * Rm */
#define A64_MADD(sf, Rd, Ra, Rn, Rm) aarch64_insn_gen_data3(Rd, Ra, Rn, Rm, \
A64_VARIANT(sf), AARCH64_INSN_DATA3_MADD)
/* Rd = Ra - Rn * Rm */
#define A64_MSUB(sf, Rd, Ra, Rn, Rm) aarch64_insn_gen_data3(Rd, Ra, Rn, Rm, \
A64_VARIANT(sf), AARCH64_INSN_DATA3_MSUB)
/* Rd = Rn * Rm */
#define A64_MUL(sf, Rd, Rn, Rm) A64_MADD(sf, Rd, A64_ZR, Rn, Rm)
/* Logical (shifted register) */
#define A64_LOGIC_SREG(sf, Rd, Rn, Rm, type) \
aarch64_insn_gen_logical_shifted_reg(Rd, Rn, Rm, 0, \
A64_VARIANT(sf), AARCH64_INSN_LOGIC_##type)
/* Rd = Rn OP Rm */
#define A64_AND(sf, Rd, Rn, Rm) A64_LOGIC_SREG(sf, Rd, Rn, Rm, AND)
#define A64_ORR(sf, Rd, Rn, Rm) A64_LOGIC_SREG(sf, Rd, Rn, Rm, ORR)
#define A64_EOR(sf, Rd, Rn, Rm) A64_LOGIC_SREG(sf, Rd, Rn, Rm, EOR)
#define A64_ANDS(sf, Rd, Rn, Rm) A64_LOGIC_SREG(sf, Rd, Rn, Rm, AND_SETFLAGS)
/* Rn & Rm; set condition flags */
#define A64_TST(sf, Rn, Rm) A64_ANDS(sf, A64_ZR, Rn, Rm)
/* Rd = ~Rm (alias of ORN with A64_ZR as Rn) */
#define A64_MVN(sf, Rd, Rm) \
A64_LOGIC_SREG(sf, Rd, A64_ZR, Rm, ORN)
/* Logical (immediate) */
#define A64_LOGIC_IMM(sf, Rd, Rn, imm, type) ({ \
u64 imm64 = (sf) ? (u64)imm : (u64)(u32)imm; \
aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_##type, \
A64_VARIANT(sf), Rn, Rd, imm64); \
})
/* Rd = Rn OP imm */
#define A64_AND_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, AND)
#define A64_ORR_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, ORR)
#define A64_EOR_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, EOR)
#define A64_ANDS_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, AND_SETFLAGS)
/* Rn & imm; set condition flags */
#define A64_TST_I(sf, Rn, imm) A64_ANDS_I(sf, A64_ZR, Rn, imm)
/* HINTs */
#define A64_HINT(x) aarch64_insn_gen_hint(x)
#define A64_PACIASP A64_HINT(AARCH64_INSN_HINT_PACIASP)
#define A64_AUTIASP A64_HINT(AARCH64_INSN_HINT_AUTIASP)
/* BTI */
#define A64_BTI_C A64_HINT(AARCH64_INSN_HINT_BTIC)
#define A64_BTI_J A64_HINT(AARCH64_INSN_HINT_BTIJ)
#define A64_BTI_JC A64_HINT(AARCH64_INSN_HINT_BTIJC)
#define A64_NOP A64_HINT(AARCH64_INSN_HINT_NOP)
/* DMB */
#define A64_DMB_ISH aarch64_insn_gen_dmb(AARCH64_INSN_MB_ISH)
/* ADR */
#define A64_ADR(Rd, offset) \
aarch64_insn_gen_adr(0, offset, Rd, AARCH64_INSN_ADR_TYPE_ADR)
/* MRS */
#define A64_MRS_TPIDR_EL1(Rt) \
aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL1)
#define A64_MRS_TPIDR_EL2(Rt) \
aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL2)
#define A64_MRS_SP_EL0(Rt) \
aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_SP_EL0)
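/*
 * Usage sketch (illustrative, details assumed): the MRS encodings back inlined
 * helpers, e.g. reading the per-CPU base from TPIDR_EL1 (TPIDR_EL2 under VHE)
 * for bpf_get_smp_processor_id() and the current task pointer from SP_EL0:
 *
 *	emit(A64_MRS_TPIDR_EL1(dst), ctx);
 *	emit(A64_MRS_SP_EL0(dst), ctx);
 */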
/* Barriers */
#define A64_SB aarch64_insn_get_sb_value()
#define A64_DSB_NSH (aarch64_insn_get_dsb_base_value() | 0x7 << 8)
#define A64_ISB aarch64_insn_get_isb_value()
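/*
 * Usage sketch (illustrative): BPF_NOSPEC is assumed to emit a full
 * speculation barrier, using the single SB instruction when the CPU
 * implements FEAT_SB and the DSB NSH + ISB sequence otherwise:
 *
 *	emit(A64_SB, ctx);	// FEAT_SB available
 *	// or:
 *	emit(A64_DSB_NSH, ctx);
 *	emit(A64_ISB, ctx);
 */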
#endif /* _BPF_JIT_H */