author     David S. Miller <davem@davemloft.net>   2019-11-20 18:11:23 -0800
committer  David S. Miller <davem@davemloft.net>   2019-11-20 18:11:23 -0800
commit     ee5a489fd9645104925e5cdf8f8e455d833730b9
tree       1e46a8c460e1d51d465fe472e42cf1c16f92f9c7 /arch
parent     Merge branch 'r8169-smaller-improvements-to-firmware-handling'
parent     bpf: Switch bpf_map_{area_alloc,area_mmapable_alloc}() to u64 size
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says:

====================
pull-request: bpf-next 2019-11-20

The following pull-request contains BPF updates for your *net-next* tree.

We've added 81 non-merge commits during the last 17 day(s) which contain
a total of 120 files changed, 4958 insertions(+), 1081 deletions(-).

There are 3 trivial conflicts, resolve them by always taking the chunk
from 196e8ca74886c433:

<<<<<<< HEAD
=======
void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
>>>>>>> 196e8ca74886c433dcfc64a809707074b936aaf5

<<<<<<< HEAD
void *bpf_map_area_alloc(u64 size, int numa_node)
=======
static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
>>>>>>> 196e8ca74886c433dcfc64a809707074b936aaf5

<<<<<<< HEAD
	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
=======
	/* kmalloc()'ed memory can't be mmap()'ed */
	if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
>>>>>>> 196e8ca74886c433dcfc64a809707074b936aaf5

The main changes are:

1) Addition of BPF trampoline, which works as a bridge between kernel
   functions, BPF programs and other BPF programs, along with two new use
   cases: i) fentry/fexit BPF programs for tracing with practically zero
   overhead to call into BPF (as opposed to k[ret]probes) and ii) attachment
   of the former to networking-related programs to see the input/output of
   networking programs (covering the xdpdump use case), from Alexei
   Starovoitov.

2) BPF array map mmap support and its use in libbpf for global data maps;
   also a big batch of libbpf improvements, among others, support for
   reading bitfields in a relocatable manner (via libbpf's CO-RE helper
   API), from Andrii Nakryiko.

3) Extend the s390x JIT with relative long jumps and loads in order to lift
   the current 64/512k size limits on JITed BPF programs there, from Ilya
   Leoshkevich.

4) Add BPF audit support and emit messages upon successful prog load and
   unload in order to have a timeline of events, from Daniel Borkmann and
   Jiri Olsa.

5) Extension to libbpf and the xdpsock sample programs to demo the shared
   umem mode (XDP_SHARED_UMEM) as well as RX-only and TX-only sockets, from
   Magnus Karlsson.

6) Several follow-up bug fixes for libbpf's auto-pinning code and a new API
   call named bpf_get_link_xdp_info() for retrieving the full set of prog
   IDs attached to XDP, from Toke Høiland-Jørgensen.

7) Add BTF support for array of int, array of struct and multidimensional
   arrays and enable it for skb->cb[] access in the kfree_skb test, from
   Martin KaFai Lau.

8) Fix AF_XDP by using the correct number of channels from ethtool, from
   Luigi Rizzo.

9) Two fixes for BPF selftests to get rid of a hang in test_tc_tunnel and
   to avoid xdping being run standalone, from Jiri Benc.

10) Various BPF selftest fixes when run with latest LLVM trunk, from
    Yonghong Song.

11) Fix a memory leak in BPF fentry test run data, from Colin Ian King.

12) Various smaller misc cleanups and improvements mostly all over BPF
    selftests and samples, from Daniel T. Lee, Andre Guedes, Anders Roxell,
    Mao Wenan, Yue Haibing.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
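To make item 1) above concrete, below is a minimal sketch (not part of this
merge) of an fentry program of the kind the new BPF trampoline dispatches to.
It assumes libbpf's SEC() convention and bpf_helpers.h; the program name
trace_eth_type_trans and the counter variable "calls" are illustrative only.

/* Illustrative sketch only -- not part of this merge. */
#include <linux/types.h>
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

/* Global counter, bumped on every entry into the traced function. */
__u64 calls = 0;

/*
 * fentry programs see the traced function's arguments as an array of
 * u64 values; this sketch ignores them and only counts invocations.
 */
SEC("fentry/eth_type_trans")
int trace_eth_type_trans(__u64 *ctx)
{
	__sync_fetch_and_add(&calls, 1);
	return 0;
}

From user space, such a program would typically be loaded with libbpf and
attached via bpf_program__attach_trace(); the kernel then patches the traced
function's fentry nop into a call to the generated trampoline (cf. the
emit_prologue() and bpf_arch_text_poke() changes in the x86 diff below).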
Diffstat (limited to 'arch')
-rw-r--r--  arch/s390/net/bpf_jit_comp.c           | 502
-rw-r--r--  arch/x86/include/asm/text-patching.h   |  24
-rw-r--r--  arch/x86/kernel/alternative.c          | 132
-rw-r--r--  arch/x86/kernel/jump_label.c           |   9
-rw-r--r--  arch/x86/kernel/kprobes/opt.c          |  11
-rw-r--r--  arch/x86/net/bpf_jit_comp.c            | 424
6 files changed, 865 insertions(+), 237 deletions(-)
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index ce88211b9c6c..8d2134136290 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -23,6 +23,8 @@
#include <linux/filter.h>
#include <linux/init.h>
#include <linux/bpf.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
#include <asm/cacheflush.h>
#include <asm/dis.h>
#include <asm/facility.h>
@@ -38,10 +40,11 @@ struct bpf_jit {
int size; /* Size of program and literal pool */
int size_prg; /* Size of program */
int prg; /* Current position in program */
- int lit_start; /* Start of literal pool */
- int lit; /* Current position in literal pool */
+ int lit32_start; /* Start of 32-bit literal pool */
+ int lit32; /* Current position in 32-bit literal pool */
+ int lit64_start; /* Start of 64-bit literal pool */
+ int lit64; /* Current position in 64-bit literal pool */
int base_ip; /* Base address for literal pool */
- int ret0_ip; /* Address of return 0 */
int exit_ip; /* Address of exit */
int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */
int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */
@@ -49,14 +52,10 @@ struct bpf_jit {
int labels[1]; /* Labels for local jumps */
};
-#define BPF_SIZE_MAX 0xffff /* Max size for program (16 bit branches) */
-
-#define SEEN_MEM (1 << 0) /* use mem[] for temporary storage */
-#define SEEN_RET0 (1 << 1) /* ret0_ip points to a valid return 0 */
-#define SEEN_LITERAL (1 << 2) /* code uses literals */
-#define SEEN_FUNC (1 << 3) /* calls C functions */
-#define SEEN_TAIL_CALL (1 << 4) /* code uses tail calls */
-#define SEEN_REG_AX (1 << 5) /* code uses constant blinding */
+#define SEEN_MEM BIT(0) /* use mem[] for temporary storage */
+#define SEEN_LITERAL BIT(1) /* code uses literals */
+#define SEEN_FUNC BIT(2) /* calls C functions */
+#define SEEN_TAIL_CALL BIT(3) /* code uses tail calls */
#define SEEN_STACK (SEEN_FUNC | SEEN_MEM)
/*
@@ -131,13 +130,13 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
#define _EMIT2(op) \
({ \
if (jit->prg_buf) \
- *(u16 *) (jit->prg_buf + jit->prg) = op; \
+ *(u16 *) (jit->prg_buf + jit->prg) = (op); \
jit->prg += 2; \
})
#define EMIT2(op, b1, b2) \
({ \
- _EMIT2(op | reg(b1, b2)); \
+ _EMIT2((op) | reg(b1, b2)); \
REG_SET_SEEN(b1); \
REG_SET_SEEN(b2); \
})
@@ -145,20 +144,20 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
#define _EMIT4(op) \
({ \
if (jit->prg_buf) \
- *(u32 *) (jit->prg_buf + jit->prg) = op; \
+ *(u32 *) (jit->prg_buf + jit->prg) = (op); \
jit->prg += 4; \
})
#define EMIT4(op, b1, b2) \
({ \
- _EMIT4(op | reg(b1, b2)); \
+ _EMIT4((op) | reg(b1, b2)); \
REG_SET_SEEN(b1); \
REG_SET_SEEN(b2); \
})
#define EMIT4_RRF(op, b1, b2, b3) \
({ \
- _EMIT4(op | reg_high(b3) << 8 | reg(b1, b2)); \
+ _EMIT4((op) | reg_high(b3) << 8 | reg(b1, b2)); \
REG_SET_SEEN(b1); \
REG_SET_SEEN(b2); \
REG_SET_SEEN(b3); \
@@ -167,13 +166,13 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
#define _EMIT4_DISP(op, disp) \
({ \
unsigned int __disp = (disp) & 0xfff; \
- _EMIT4(op | __disp); \
+ _EMIT4((op) | __disp); \
})
#define EMIT4_DISP(op, b1, b2, disp) \
({ \
- _EMIT4_DISP(op | reg_high(b1) << 16 | \
- reg_high(b2) << 8, disp); \
+ _EMIT4_DISP((op) | reg_high(b1) << 16 | \
+ reg_high(b2) << 8, (disp)); \
REG_SET_SEEN(b1); \
REG_SET_SEEN(b2); \
})
@@ -181,21 +180,27 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
#define EMIT4_IMM(op, b1, imm) \
({ \
unsigned int __imm = (imm) & 0xffff; \
- _EMIT4(op | reg_high(b1) << 16 | __imm); \
+ _EMIT4((op) | reg_high(b1) << 16 | __imm); \
REG_SET_SEEN(b1); \
})
#define EMIT4_PCREL(op, pcrel) \
({ \
long __pcrel = ((pcrel) >> 1) & 0xffff; \
- _EMIT4(op | __pcrel); \
+ _EMIT4((op) | __pcrel); \
+})
+
+#define EMIT4_PCREL_RIC(op, mask, target) \
+({ \
+ int __rel = ((target) - jit->prg) / 2; \
+ _EMIT4((op) | (mask) << 20 | (__rel & 0xffff)); \
})
#define _EMIT6(op1, op2) \
({ \
if (jit->prg_buf) { \
- *(u32 *) (jit->prg_buf + jit->prg) = op1; \
- *(u16 *) (jit->prg_buf + jit->prg + 4) = op2; \
+ *(u32 *) (jit->prg_buf + jit->prg) = (op1); \
+ *(u16 *) (jit->prg_buf + jit->prg + 4) = (op2); \
} \
jit->prg += 6; \
})
@@ -203,20 +208,20 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
#define _EMIT6_DISP(op1, op2, disp) \
({ \
unsigned int __disp = (disp) & 0xfff; \
- _EMIT6(op1 | __disp, op2); \
+ _EMIT6((op1) | __disp, op2); \
})
#define _EMIT6_DISP_LH(op1, op2, disp) \
({ \
- u32 _disp = (u32) disp; \
+ u32 _disp = (u32) (disp); \
unsigned int __disp_h = _disp & 0xff000; \
unsigned int __disp_l = _disp & 0x00fff; \
- _EMIT6(op1 | __disp_l, op2 | __disp_h >> 4); \
+ _EMIT6((op1) | __disp_l, (op2) | __disp_h >> 4); \
})
#define EMIT6_DISP_LH(op1, op2, b1, b2, b3, disp) \
({ \
- _EMIT6_DISP_LH(op1 | reg(b1, b2) << 16 | \
+ _EMIT6_DISP_LH((op1) | reg(b1, b2) << 16 | \
reg_high(b3) << 8, op2, disp); \
REG_SET_SEEN(b1); \
REG_SET_SEEN(b2); \
@@ -226,8 +231,8 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
#define EMIT6_PCREL_LABEL(op1, op2, b1, b2, label, mask) \
({ \
int rel = (jit->labels[label] - jit->prg) >> 1; \
- _EMIT6(op1 | reg(b1, b2) << 16 | (rel & 0xffff), \
- op2 | mask << 12); \
+ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \
+ (op2) | (mask) << 12); \
REG_SET_SEEN(b1); \
REG_SET_SEEN(b2); \
})
@@ -235,68 +240,83 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
#define EMIT6_PCREL_IMM_LABEL(op1, op2, b1, imm, label, mask) \
({ \
int rel = (jit->labels[label] - jit->prg) >> 1; \
- _EMIT6(op1 | (reg_high(b1) | mask) << 16 | \
- (rel & 0xffff), op2 | (imm & 0xff) << 8); \
+ _EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \
+ (rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \
REG_SET_SEEN(b1); \
- BUILD_BUG_ON(((unsigned long) imm) > 0xff); \
+ BUILD_BUG_ON(((unsigned long) (imm)) > 0xff); \
})
#define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask) \
({ \
/* Branch instruction needs 6 bytes */ \
- int rel = (addrs[i + off + 1] - (addrs[i + 1] - 6)) / 2;\
- _EMIT6(op1 | reg(b1, b2) << 16 | (rel & 0xffff), op2 | mask); \
+ int rel = (addrs[(i) + (off) + 1] - (addrs[(i) + 1] - 6)) / 2;\
+ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), (op2) | (mask));\
REG_SET_SEEN(b1); \
REG_SET_SEEN(b2); \
})
#define EMIT6_PCREL_RILB(op, b, target) \
({ \
- int rel = (target - jit->prg) / 2; \
- _EMIT6(op | reg_high(b) << 16 | rel >> 16, rel & 0xffff); \
+ unsigned int rel = (int)((target) - jit->prg) / 2; \
+ _EMIT6((op) | reg_high(b) << 16 | rel >> 16, rel & 0xffff);\
REG_SET_SEEN(b); \
})
#define EMIT6_PCREL_RIL(op, target) \
({ \
- int rel = (target - jit->prg) / 2; \
- _EMIT6(op | rel >> 16, rel & 0xffff); \
+ unsigned int rel = (int)((target) - jit->prg) / 2; \
+ _EMIT6((op) | rel >> 16, rel & 0xffff); \
+})
+
+#define EMIT6_PCREL_RILC(op, mask, target) \
+({ \
+ EMIT6_PCREL_RIL((op) | (mask) << 20, (target)); \
})
#define _EMIT6_IMM(op, imm) \
({ \
unsigned int __imm = (imm); \
- _EMIT6(op | (__imm >> 16), __imm & 0xffff); \
+ _EMIT6((op) | (__imm >> 16), __imm & 0xffff); \
})
#define EMIT6_IMM(op, b1, imm) \
({ \
- _EMIT6_IMM(op | reg_high(b1) << 16, imm); \
+ _EMIT6_IMM((op) | reg_high(b1) << 16, imm); \
REG_SET_SEEN(b1); \
})
-#define EMIT_CONST_U32(val) \
+#define _EMIT_CONST_U32(val) \
({ \
unsigned int ret; \
- ret = jit->lit - jit->base_ip; \
- jit->seen |= SEEN_LITERAL; \
+ ret = jit->lit32; \
if (jit->prg_buf) \
- *(u32 *) (jit->prg_buf + jit->lit) = (u32) val; \
- jit->lit += 4; \
+ *(u32 *)(jit->prg_buf + jit->lit32) = (u32)(val);\
+ jit->lit32 += 4; \
ret; \
})
-#define EMIT_CONST_U64(val) \
+#define EMIT_CONST_U32(val) \
({ \
- unsigned int ret; \
- ret = jit->lit - jit->base_ip; \
jit->seen |= SEEN_LITERAL; \
+ _EMIT_CONST_U32(val) - jit->base_ip; \
+})
+
+#define _EMIT_CONST_U64(val) \
+({ \
+ unsigned int ret; \
+ ret = jit->lit64; \
if (jit->prg_buf) \
- *(u64 *) (jit->prg_buf + jit->lit) = (u64) val; \
- jit->lit += 8; \
+ *(u64 *)(jit->prg_buf + jit->lit64) = (u64)(val);\
+ jit->lit64 += 8; \
ret; \
})
+#define EMIT_CONST_U64(val) \
+({ \
+ jit->seen |= SEEN_LITERAL; \
+ _EMIT_CONST_U64(val) - jit->base_ip; \
+})
+
#define EMIT_ZERO(b1) \
({ \
if (!fp->aux->verifier_zext) { \
@@ -307,6 +327,67 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
})
/*
+ * Return whether this is the first pass. The first pass is special, since we
+ * don't know any sizes yet, and thus must be conservative.
+ */
+static bool is_first_pass(struct bpf_jit *jit)
+{
+ return jit->size == 0;
+}
+
+/*
+ * Return whether this is the code generation pass. The code generation pass is
+ * special, since we should change as little as possible.
+ */
+static bool is_codegen_pass(struct bpf_jit *jit)
+{
+ return jit->prg_buf;
+}
+
+/*
+ * Return whether "rel" can be encoded as a short PC-relative offset
+ */
+static bool is_valid_rel(int rel)
+{
+ return rel >= -65536 && rel <= 65534;
+}
+
+/*
+ * Return whether "off" can be reached using a short PC-relative offset
+ */
+static bool can_use_rel(struct bpf_jit *jit, int off)
+{
+ return is_valid_rel(off - jit->prg);
+}
+
+/*
+ * Return whether given displacement can be encoded using
+ * Long-Displacement Facility
+ */
+static bool is_valid_ldisp(int disp)
+{
+ return disp >= -524288 && disp <= 524287;
+}
+
+/*
+ * Return whether the next 32-bit literal pool entry can be referenced using
+ * Long-Displacement Facility
+ */
+static bool can_use_ldisp_for_lit32(struct bpf_jit *jit)
+{
+ return is_valid_ldisp(jit->lit32 - jit->base_ip);
+}
+
+/*
+ * Return whether the next 64-bit literal pool entry can be referenced using
+ * Long-Displacement Facility
+ */
+static bool can_use_ldisp_for_lit64(struct bpf_jit *jit)
+{
+ return is_valid_ldisp(jit->lit64 - jit->base_ip);
+}
+
+/*
* Fill whole space with illegal instructions
*/
static void jit_fill_hole(void *area, unsigned int size)
@@ -383,9 +464,18 @@ static int get_end(struct bpf_jit *jit, int start)
*/
static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth)
{
-
+ const int last = 15, save_restore_size = 6;
int re = 6, rs;
+ if (is_first_pass(jit)) {
+ /*
+ * We don't know yet which registers are used. Reserve space
+ * conservatively.
+ */
+ jit->prg += (last - re + 1) * save_restore_size;
+ return;
+ }
+
do {
rs = get_start(jit, re);
if (!rs)
@@ -396,7 +486,7 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth)
else
restore_regs(jit, rs, re, stack_depth);
re++;
- } while (re <= 15);
+ } while (re <= last);
}
/*
@@ -420,21 +510,28 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
/* Save registers */
save_restore_regs(jit, REGS_SAVE, stack_depth);
/* Setup literal pool */
- if (jit->seen & SEEN_LITERAL) {
- /* basr %r13,0 */
- EMIT2(0x0d00, REG_L, REG_0);
- jit->base_ip = jit->prg;
+ if (is_first_pass(jit) || (jit->seen & SEEN_LITERAL)) {
+ if (!is_first_pass(jit) &&
+ is_valid_ldisp(jit->size - (jit->prg + 2))) {
+ /* basr %l,0 */
+ EMIT2(0x0d00, REG_L, REG_0);
+ jit->base_ip = jit->prg;
+ } else {
+ /* larl %l,lit32_start */
+ EMIT6_PCREL_RILB(0xc0000000, REG_L, jit->lit32_start);
+ jit->base_ip = jit->lit32_start;
+ }
}
/* Setup stack and backchain */
- if (jit->seen & SEEN_STACK) {
- if (jit->seen & SEEN_FUNC)
+ if (is_first_pass(jit) || (jit->seen & SEEN_STACK)) {
+ if (is_first_pass(jit) || (jit->seen & SEEN_FUNC))
/* lgr %w1,%r15 (backchain) */
EMIT4(0xb9040000, REG_W1, REG_15);
/* la %bfp,STK_160_UNUSED(%r15) (BPF frame pointer) */
EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15, STK_160_UNUSED);
/* aghi %r15,-STK_OFF */
EMIT4_IMM(0xa70b0000, REG_15, -(STK_OFF + stack_depth));
- if (jit->seen & SEEN_FUNC)
+ if (is_first_pass(jit) || (jit->seen & SEEN_FUNC))
/* stg %w1,152(%r15) (backchain) */
EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
REG_15, 152);
@@ -446,12 +543,6 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
*/
static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
{
- /* Return 0 */
- if (jit->seen & SEEN_RET0) {
- jit->ret0_ip = jit->prg;
- /* lghi %b0,0 */
- EMIT4_IMM(0xa7090000, BPF_REG_0, 0);
- }
jit->exit_ip = jit->prg;
/* Load exit code: lgr %r2,%b0 */
EMIT4(0xb9040000, REG_2, BPF_REG_0);
@@ -476,7 +567,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
_EMIT2(0x07fe);
if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable &&
- (jit->seen & SEEN_FUNC)) {
+ (is_first_pass(jit) || (jit->seen & SEEN_FUNC))) {
jit->r1_thunk_ip = jit->prg;
/* Generate __s390_indirect_jump_r1 thunk */
if (test_facility(35)) {
@@ -506,16 +597,14 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
int i, bool extra_pass)
{
struct bpf_insn *insn = &fp->insnsi[i];
- int jmp_off, last, insn_count = 1;
u32 dst_reg = insn->dst_reg;
u32 src_reg = insn->src_reg;
+ int last, insn_count = 1;
u32 *addrs = jit->addrs;
s32 imm = insn->imm;
s16 off = insn->off;
unsigned int mask;
- if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX)
- jit->seen |= SEEN_REG_AX;
switch (insn->code) {
/*
* BPF_MOV
@@ -549,9 +638,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
u64 imm64;
imm64 = (u64)(u32) insn[0].imm | ((u64)(u32) insn[1].imm) << 32;
- /* lg %dst,<d(imm)>(%l) */
- EMIT6_DISP_LH(0xe3000000, 0x0004, dst_reg, REG_0, REG_L,
- EMIT_CONST_U64(imm64));
+ /* lgrl %dst,imm */
+ EMIT6_PCREL_RILB(0xc4080000, dst_reg, _EMIT_CONST_U64(imm64));
insn_count = 2;
break;
}
@@ -680,9 +768,18 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4_IMM(0xa7080000, REG_W0, 0);
/* lr %w1,%dst */
EMIT2(0x1800, REG_W1, dst_reg);
- /* dl %w0,<d(imm)>(%l) */
- EMIT6_DISP_LH(0xe3000000, 0x0097, REG_W0, REG_0, REG_L,
- EMIT_CONST_U32(imm));
+ if (!is_first_pass(jit) && can_use_ldisp_for_lit32(jit)) {
+ /* dl %w0,<d(imm)>(%l) */
+ EMIT6_DISP_LH(0xe3000000, 0x0097, REG_W0, REG_0, REG_L,
+ EMIT_CONST_U32(imm));
+ } else {
+ /* lgfrl %dst,imm */
+ EMIT6_PCREL_RILB(0xc40c0000, dst_reg,
+ _EMIT_CONST_U32(imm));
+ jit->seen |= SEEN_LITERAL;
+ /* dlr %w0,%dst */
+ EMIT4(0xb9970000, REG_W0, dst_reg);
+ }
/* llgfr %dst,%rc */
EMIT4(0xb9160000, dst_reg, rc_reg);
if (insn_is_zext(&insn[1]))
@@ -704,9 +801,18 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4_IMM(0xa7090000, REG_W0, 0);
/* lgr %w1,%dst */
EMIT4(0xb9040000, REG_W1, dst_reg);
- /* dlg %w0,<d(imm)>(%l) */
- EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0, REG_L,
- EMIT_CONST_U64(imm));
+ if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) {
+ /* dlg %w0,<d(imm)>(%l) */
+ EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0, REG_L,
+ EMIT_CONST_U64(imm));
+ } else {
+ /* lgrl %dst,imm */
+ EMIT6_PCREL_RILB(0xc4080000, dst_reg,
+ _EMIT_CONST_U64(imm));
+ jit->seen |= SEEN_LITERAL;
+ /* dlgr %w0,%dst */
+ EMIT4(0xb9870000, REG_W0, dst_reg);
+ }
/* lgr %dst,%rc */
EMIT4(0xb9040000, dst_reg, rc_reg);
break;
@@ -729,9 +835,19 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_AND | BPF_K: /* dst = dst & imm */
- /* ng %dst,<d(imm)>(%l) */
- EMIT6_DISP_LH(0xe3000000, 0x0080, dst_reg, REG_0, REG_L,
- EMIT_CONST_U64(imm));
+ if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) {
+ /* ng %dst,<d(imm)>(%l) */
+ EMIT6_DISP_LH(0xe3000000, 0x0080,
+ dst_reg, REG_0, REG_L,
+ EMIT_CONST_U64(imm));
+ } else {
+ /* lgrl %w0,imm */
+ EMIT6_PCREL_RILB(0xc4080000, REG_W0,
+ _EMIT_CONST_U64(imm));
+ jit->seen |= SEEN_LITERAL;
+ /* ngr %dst,%w0 */
+ EMIT4(0xb9800000, dst_reg, REG_W0);
+ }
break;
/*
* BPF_OR
@@ -751,9 +867,19 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_OR | BPF_K: /* dst = dst | imm */
- /* og %dst,<d(imm)>(%l) */
- EMIT6_DISP_LH(0xe3000000, 0x0081, dst_reg, REG_0, REG_L,
- EMIT_CONST_U64(imm));
+ if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) {
+ /* og %dst,<d(imm)>(%l) */
+ EMIT6_DISP_LH(0xe3000000, 0x0081,
+ dst_reg, REG_0, REG_L,
+ EMIT_CONST_U64(imm));
+ } else {
+ /* lgrl %w0,imm */
+ EMIT6_PCREL_RILB(0xc4080000, REG_W0,
+ _EMIT_CONST_U64(imm));
+ jit->seen |= SEEN_LITERAL;
+ /* ogr %dst,%w0 */
+ EMIT4(0xb9810000, dst_reg, REG_W0);
+ }
break;
/*
* BPF_XOR
@@ -775,9 +901,19 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT_ZERO(dst_reg);
break;
case BPF_ALU64 | BPF_XOR | BPF_K: /* dst = dst ^ imm */
- /* xg %dst,<d(imm)>(%l) */
- EMIT6_DISP_LH(0xe3000000, 0x0082, dst_reg, REG_0, REG_L,
- EMIT_CONST_U64(imm));
+ if (!is_first_pass(jit) && can_use_ldisp_for_lit64(jit)) {
+ /* xg %dst,<d(imm)>(%l) */
+ EMIT6_DISP_LH(0xe3000000, 0x0082,
+ dst_reg, REG_0, REG_L,
+ EMIT_CONST_U64(imm));
+ } else {
+ /* lgrl %w0,imm */
+ EMIT6_PCREL_RILB(0xc4080000, REG_W0,
+ _EMIT_CONST_U64(imm));
+ jit->seen |= SEEN_LITERAL;
+ /* xgr %dst,%w0 */
+ EMIT4(0xb9820000, dst_reg, REG_W0);
+ }
break;
/*
* BPF_LSH
@@ -1023,9 +1159,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
REG_SET_SEEN(BPF_REG_5);
jit->seen |= SEEN_FUNC;
- /* lg %w1,<d(imm)>(%l) */
- EMIT6_DISP_LH(0xe3000000, 0x0004, REG_W1, REG_0, REG_L,
- EMIT_CONST_U64(func));
+ /* lgrl %w1,func */
+ EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func));
if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) {
/* brasl %r14,__s390_indirect_jump_r1 */
EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip);
@@ -1054,9 +1189,17 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
/* llgf %w1,map.max_entries(%b2) */
EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2,
offsetof(struct bpf_array, map.max_entries));
- /* clrj %b3,%w1,0xa,label0: if (u32)%b3 >= (u32)%w1 goto out */
- EMIT6_PCREL_LABEL(0xec000000, 0x0077, BPF_REG_3,
- REG_W1, 0, 0xa);
+ /* if ((u32)%b3 >= (u32)%w1) goto out; */
+ if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) {
+ /* clrj %b3,%w1,0xa,label0 */
+ EMIT6_PCREL_LABEL(0xec000000, 0x0077, BPF_REG_3,
+ REG_W1, 0, 0xa);
+ } else {
+ /* clr %b3,%w1 */
+ EMIT2(0x1500, BPF_REG_3, REG_W1);
+ /* brcl 0xa,label0 */
+ EMIT6_PCREL_RILC(0xc0040000, 0xa, jit->labels[0]);
+ }
/*
* if (tail_call_cnt++ > MAX_TAIL_CALL_CNT)
@@ -1071,9 +1214,16 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4_IMM(0xa7080000, REG_W0, 1);
/* laal %w1,%w0,off(%r15) */
EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off);
- /* clij %w1,MAX_TAIL_CALL_CNT,0x2,label0 */
- EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007f, REG_W1,
- MAX_TAIL_CALL_CNT, 0, 0x2);
+ if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) {
+ /* clij %w1,MAX_TAIL_CALL_CNT,0x2,label0 */
+ EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007f, REG_W1,
+ MAX_TAIL_CALL_CNT, 0, 0x2);
+ } else {
+ /* clfi %w1,MAX_TAIL_CALL_CNT */
+ EMIT6_IMM(0xc20f0000, REG_W1, MAX_TAIL_CALL_CNT);
+ /* brcl 0x2,label0 */
+ EMIT6_PCREL_RILC(0xc0040000, 0x2, jit->labels[0]);
+ }
/*
* prog = array->ptrs[index];
@@ -1085,11 +1235,16 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
EMIT4(0xb9160000, REG_1, BPF_REG_3);
/* sllg %r1,%r1,3: %r1 *= 8 */
EMIT6_DISP_LH(0xeb000000, 0x000d, REG_1, REG_1, REG_0, 3);
- /* lg %r1,prog(%b2,%r1) */
- EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, BPF_REG_2,
+ /* ltg %r1,prog(%b2,%r1) */
+ EMIT6_DISP_LH(0xe3000000, 0x0002, REG_1, BPF_REG_2,
REG_1, offsetof(struct bpf_array, ptrs));
- /* clgij %r1,0,0x8,label0 */
- EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007d, REG_1, 0, 0, 0x8);
+ if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) {
+ /* brc 0x8,label0 */
+ EMIT4_PCREL_RIC(0xa7040000, 0x8, jit->labels[0]);
+ } else {
+ /* brcl 0x8,label0 */
+ EMIT6_PCREL_RILC(0xc0040000, 0x8, jit->labels[0]);
+ }
/*
* Restore registers before calling function
@@ -1110,7 +1265,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
break;
case BPF_JMP | BPF_EXIT: /* return b0 */
last = (i == fp->len - 1) ? 1 : 0;
- if (last && !(jit->seen & SEEN_RET0))
+ if (last)
break;
/* j <exit> */
EMIT4_PCREL(0xa7f40000, jit->exit_ip - jit->prg);
@@ -1246,36 +1401,83 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
goto branch_oc;
branch_ks:
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
- /* lgfi %w1,imm (load sign extend imm) */
- EMIT6_IMM(0xc0010000, REG_W1, imm);
- /* crj or cgrj %dst,%w1,mask,off */
- EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0076 : 0x0064),
- dst_reg, REG_W1, i, off, mask);
+ /* cfi or cgfi %dst,imm */
+ EMIT6_IMM(is_jmp32 ? 0xc20d0000 : 0xc20c0000,
+ dst_reg, imm);
+ if (!is_first_pass(jit) &&
+ can_use_rel(jit, addrs[i + off + 1])) {
+ /* brc mask,off */
+ EMIT4_PCREL_RIC(0xa7040000,
+ mask >> 12, addrs[i + off + 1]);
+ } else {
+ /* brcl mask,off */
+ EMIT6_PCREL_RILC(0xc0040000,
+ mask >> 12, addrs[i + off + 1]);
+ }
break;
branch_ku:
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
- /* lgfi %w1,imm (load sign extend imm) */
- EMIT6_IMM(0xc0010000, REG_W1, imm);
- /* clrj or clgrj %dst,%w1,mask,off */
- EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0077 : 0x0065),
- dst_reg, REG_W1, i, off, mask);
+ /* clfi or clgfi %dst,imm */
+ EMIT6_IMM(is_jmp32 ? 0xc20f0000 : 0xc20e0000,
+ dst_reg, imm);
+ if (!is_first_pass(jit) &&
+ can_use_rel(jit, addrs[i + off + 1])) {
+ /* brc mask,off */
+ EMIT4_PCREL_RIC(0xa7040000,
+ mask >> 12, addrs[i + off + 1]);
+ } else {
+ /* brcl mask,off */
+ EMIT6_PCREL_RILC(0xc0040000,
+ mask >> 12, addrs[i + off + 1]);
+ }
break;
branch_xs:
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
- /* crj or cgrj %dst,%src,mask,off */
- EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0076 : 0x0064),
- dst_reg, src_reg, i, off, mask);
+ if (!is_first_pass(jit) &&
+ can_use_rel(jit, addrs[i + off + 1])) {
+ /* crj or cgrj %dst,%src,mask,off */
+ EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0076 : 0x0064),
+ dst_reg, src_reg, i, off, mask);
+ } else {
+ /* cr or cgr %dst,%src */
+ if (is_jmp32)
+ EMIT2(0x1900, dst_reg, src_reg);
+ else
+ EMIT4(0xb9200000, dst_reg, src_reg);
+ /* brcl mask,off */
+ EMIT6_PCREL_RILC(0xc0040000,
+ mask >> 12, addrs[i + off + 1]);
+ }
break;
branch_xu:
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
- /* clrj or clgrj %dst,%src,mask,off */
- EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0077 : 0x0065),
- dst_reg, src_reg, i, off, mask);
+ if (!is_first_pass(jit) &&
+ can_use_rel(jit, addrs[i + off + 1])) {
+ /* clrj or clgrj %dst,%src,mask,off */
+ EMIT6_PCREL(0xec000000, (is_jmp32 ? 0x0077 : 0x0065),
+ dst_reg, src_reg, i, off, mask);
+ } else {
+ /* clr or clgr %dst,%src */
+ if (is_jmp32)
+ EMIT2(0x1500, dst_reg, src_reg);
+ else
+ EMIT4(0xb9210000, dst_reg, src_reg);
+ /* brcl mask,off */
+ EMIT6_PCREL_RILC(0xc0040000,
+ mask >> 12, addrs[i + off + 1]);
+ }
break;
branch_oc:
- /* brc mask,jmp_off (branch instruction needs 4 bytes) */
- jmp_off = addrs[i + off + 1] - (addrs[i + 1] - 4);
- EMIT4_PCREL(0xa7040000 | mask << 8, jmp_off);
+ if (!is_first_pass(jit) &&
+ can_use_rel(jit, addrs[i + off + 1])) {
+ /* brc mask,off */
+ EMIT4_PCREL_RIC(0xa7040000,
+ mask >> 12, addrs[i + off + 1]);
+ } else {
+ /* brcl mask,off */
+ EMIT6_PCREL_RILC(0xc0040000,
+ mask >> 12, addrs[i + off + 1]);
+ }
break;
}
default: /* too complex, give up */
@@ -1286,28 +1488,67 @@ branch_oc:
}
/*
+ * Return whether new i-th instruction address does not violate any invariant
+ */
+static bool bpf_is_new_addr_sane(struct bpf_jit *jit, int i)
+{
+ /* On the first pass anything goes */
+ if (is_first_pass(jit))
+ return true;
+
+ /* The codegen pass must not change anything */
+ if (is_codegen_pass(jit))
+ return jit->addrs[i] == jit->prg;
+
+ /* Passes in between must not increase code size */
+ return jit->addrs[i] >= jit->prg;
+}
+
+/*
+ * Update the address of i-th instruction
+ */
+static int bpf_set_addr(struct bpf_jit *jit, int i)
+{
+ if (!bpf_is_new_addr_sane(jit, i))
+ return -1;
+ jit->addrs[i] = jit->prg;
+ return 0;
+}
+
+/*
* Compile eBPF program into s390x code
*/
static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
bool extra_pass)
{
- int i, insn_count;
+ int i, insn_count, lit32_size, lit64_size;
- jit->lit = jit->lit_start;
+ jit->lit32 = jit->lit32_start;
+ jit->lit64 = jit->lit64_start;
jit->prg = 0;
bpf_jit_prologue(jit, fp->aux->stack_depth);
+ if (bpf_set_addr(jit, 0) < 0)
+ return -1;
for (i = 0; i < fp->len; i += insn_count) {
insn_count = bpf_jit_insn(jit, fp, i, extra_pass);
if (insn_count < 0)
return -1;
/* Next instruction address */
- jit->addrs[i + insn_count] = jit->prg;
+ if (bpf_set_addr(jit, i + insn_count) < 0)
+ return -1;
}
bpf_jit_epilogue(jit, fp->aux->stack_depth);
- jit->lit_start = jit->prg;
- jit->size = jit->lit;
+ lit32_size = jit->lit32 - jit->lit32_start;
+ lit64_size = jit->lit64 - jit->lit64_start;
+ jit->lit32_start = jit->prg;
+ if (lit32_size)
+ jit->lit32_start = ALIGN(jit->lit32_start, 4);
+ jit->lit64_start = jit->lit32_start + lit32_size;
+ if (lit64_size)
+ jit->lit64_start = ALIGN(jit->lit64_start, 8);
+ jit->size = jit->lit64_start + lit64_size;
jit->size_prg = jit->prg;
return 0;
}
@@ -1369,7 +1610,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
}
memset(&jit, 0, sizeof(jit));
- jit.addrs = kcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL);
+ jit.addrs = kvcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL);
if (jit.addrs == NULL) {
fp = orig_fp;
goto out;
@@ -1388,12 +1629,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
/*
* Final pass: Allocate and generate program
*/
- if (jit.size >= BPF_SIZE_MAX) {
- fp = orig_fp;
- goto free_addrs;
- }
-
- header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 2, jit_fill_hole);
+ header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 8, jit_fill_hole);
if (!header) {
fp = orig_fp;
goto free_addrs;
@@ -1422,7 +1658,7 @@ skip_init_ctx:
if (!fp->is_func || extra_pass) {
bpf_prog_fill_jited_linfo(fp, jit.addrs + 1);
free_addrs:
- kfree(jit.addrs);
+ kvfree(jit.addrs);
kfree(jit_data);
fp->aux->jit_data = NULL;
}
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 5e8319bb207a..23c626a742e8 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -26,10 +26,11 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
#define POKE_MAX_OPCODE_SIZE 5
struct text_poke_loc {
- void *detour;
void *addr;
- size_t len;
- const char opcode[POKE_MAX_OPCODE_SIZE];
+ int len;
+ s32 rel32;
+ u8 opcode;
+ const u8 text[POKE_MAX_OPCODE_SIZE];
};
extern void text_poke_early(void *addr, const void *opcode, size_t len);
@@ -51,8 +52,10 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len);
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
-extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
+extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries);
+extern void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
+ const void *opcode, size_t len, const void *emulate);
extern int after_bootmem;
extern __ro_after_init struct mm_struct *poking_mm;
extern __ro_after_init unsigned long poking_addr;
@@ -63,8 +66,17 @@ static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
regs->ip = ip;
}
-#define INT3_INSN_SIZE 1
-#define CALL_INSN_SIZE 5
+#define INT3_INSN_SIZE 1
+#define INT3_INSN_OPCODE 0xCC
+
+#define CALL_INSN_SIZE 5
+#define CALL_INSN_OPCODE 0xE8
+
+#define JMP32_INSN_SIZE 5
+#define JMP32_INSN_OPCODE 0xE9
+
+#define JMP8_INSN_SIZE 2
+#define JMP8_INSN_OPCODE 0xEB
static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
{
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9d3a971ea364..9ec463fe96f2 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -956,16 +956,15 @@ NOKPROBE_SYMBOL(patch_cmp);
int poke_int3_handler(struct pt_regs *regs)
{
struct text_poke_loc *tp;
- unsigned char int3 = 0xcc;
void *ip;
/*
* Having observed our INT3 instruction, we now must observe
* bp_patching.nr_entries.
*
- * nr_entries != 0 INT3
- * WMB RMB
- * write INT3 if (nr_entries)
+ * nr_entries != 0 INT3
+ * WMB RMB
+ * write INT3 if (nr_entries)
*
* Idem for other elements in bp_patching.
*/
@@ -978,9 +977,9 @@ int poke_int3_handler(struct pt_regs *regs)
return 0;
/*
- * Discount the sizeof(int3). See text_poke_bp_batch().
+ * Discount the INT3. See text_poke_bp_batch().
*/
- ip = (void *) regs->ip - sizeof(int3);
+ ip = (void *) regs->ip - INT3_INSN_SIZE;
/*
* Skip the binary search if there is a single member in the vector.
@@ -997,8 +996,28 @@ int poke_int3_handler(struct pt_regs *regs)
return 0;
}
- /* set up the specified breakpoint detour */
- regs->ip = (unsigned long) tp->detour;
+ ip += tp->len;
+
+ switch (tp->opcode) {
+ case INT3_INSN_OPCODE:
+ /*
+ * Someone poked an explicit INT3, they'll want to handle it,
+ * do not consume.
+ */
+ return 0;
+
+ case CALL_INSN_OPCODE:
+ int3_emulate_call(regs, (long)ip + tp->rel32);
+ break;
+
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ int3_emulate_jmp(regs, (long)ip + tp->rel32);
+ break;
+
+ default:
+ BUG();
+ }
return 1;
}
@@ -1014,7 +1033,7 @@ NOKPROBE_SYMBOL(poke_int3_handler);
* synchronization using int3 breakpoint.
*
* The way it is done:
- * - For each entry in the vector:
+ * - For each entry in the vector:
* - add a int3 trap to the address that will be patched
* - sync cores
* - For each entry in the vector:
@@ -1027,9 +1046,9 @@ NOKPROBE_SYMBOL(poke_int3_handler);
*/
void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
- int patched_all_but_first = 0;
- unsigned char int3 = 0xcc;
+ unsigned char int3 = INT3_INSN_OPCODE;
unsigned int i;
+ int do_sync;
lockdep_assert_held(&text_mutex);
@@ -1053,16 +1072,16 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
/*
* Second step: update all but the first byte of the patched range.
*/
- for (i = 0; i < nr_entries; i++) {
+ for (do_sync = 0, i = 0; i < nr_entries; i++) {
if (tp[i].len - sizeof(int3) > 0) {
text_poke((char *)tp[i].addr + sizeof(int3),
- (const char *)tp[i].opcode + sizeof(int3),
+ (const char *)tp[i].text + sizeof(int3),
tp[i].len - sizeof(int3));
- patched_all_but_first++;
+ do_sync++;
}
}
- if (patched_all_but_first) {
+ if (do_sync) {
/*
* According to Intel, this core syncing is very likely
* not necessary and we'd be safe even without it. But
@@ -1075,10 +1094,17 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
* Third step: replace the first byte (int3) by the first byte of
* replacing opcode.
*/
- for (i = 0; i < nr_entries; i++)
- text_poke(tp[i].addr, tp[i].opcode, sizeof(int3));
+ for (do_sync = 0, i = 0; i < nr_entries; i++) {
+ if (tp[i].text[0] == INT3_INSN_OPCODE)
+ continue;
+
+ text_poke(tp[i].addr, tp[i].text, sizeof(int3));
+ do_sync++;
+ }
+
+ if (do_sync)
+ on_each_cpu(do_sync_core, NULL, 1);
- on_each_cpu(do_sync_core, NULL, 1);
/*
* sync_core() implies an smp_mb() and orders this store against
* the writing of the new instruction.
@@ -1087,6 +1113,60 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
bp_patching.nr_entries = 0;
}
+void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
+ const void *opcode, size_t len, const void *emulate)
+{
+ struct insn insn;
+
+ if (!opcode)
+ opcode = (void *)tp->text;
+ else
+ memcpy((void *)tp->text, opcode, len);
+
+ if (!emulate)
+ emulate = opcode;
+
+ kernel_insn_init(&insn, emulate, MAX_INSN_SIZE);
+ insn_get_length(&insn);
+
+ BUG_ON(!insn_complete(&insn));
+ BUG_ON(len != insn.length);
+
+ tp->addr = addr;
+ tp->len = len;
+ tp->opcode = insn.opcode.bytes[0];
+
+ switch (tp->opcode) {
+ case INT3_INSN_OPCODE:
+ break;
+
+ case CALL_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ tp->rel32 = insn.immediate.value;
+ break;
+
+ default: /* assume NOP */
+ switch (len) {
+ case 2: /* NOP2 -- emulate as JMP8+0 */
+ BUG_ON(memcmp(emulate, ideal_nops[len], len));
+ tp->opcode = JMP8_INSN_OPCODE;
+ tp->rel32 = 0;
+ break;
+
+ case 5: /* NOP5 -- emulate as JMP32+0 */
+ BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len));
+ tp->opcode = JMP32_INSN_OPCODE;
+ tp->rel32 = 0;
+ break;
+
+ default: /* unknown instruction */
+ BUG();
+ }
+ break;
+ }
+}
+
/**
* text_poke_bp() -- update instructions on live kernel on SMP
* @addr: address to patch
@@ -1098,20 +1178,10 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
* dynamically allocated memory. This function should be used when it is
* not possible to allocate memory.
*/
-void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
{
- struct text_poke_loc tp = {
- .detour = handler,
- .addr = addr,
- .len = len,
- };
-
- if (len > POKE_MAX_OPCODE_SIZE) {
- WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE);
- return;
- }
-
- memcpy((void *)tp.opcode, opcode, len);
+ struct text_poke_loc tp;
+ text_poke_loc_init(&tp, addr, opcode, len, emulate);
text_poke_bp_batch(&tp, 1);
}
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 044053235302..c1a8b9e71408 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -89,8 +89,7 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
return;
}
- text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE,
- (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
+ text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, NULL);
}
void arch_jump_label_transform(struct jump_entry *entry,
@@ -147,11 +146,9 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
}
__jump_label_set_jump_code(entry, type,
- (union jump_code_union *) &tp->opcode, 0);
+ (union jump_code_union *)&tp->text, 0);
- tp->addr = entry_code;
- tp->detour = entry_code + JUMP_LABEL_NOP_SIZE;
- tp->len = JUMP_LABEL_NOP_SIZE;
+ text_poke_loc_init(tp, entry_code, NULL, JUMP_LABEL_NOP_SIZE, NULL);
tp_vec_nr++;
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index b348dd506d58..8900329c28a7 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -437,8 +437,7 @@ void arch_optimize_kprobes(struct list_head *oplist)
insn_buff[0] = RELATIVEJUMP_OPCODE;
*(s32 *)(&insn_buff[1]) = rel;
- text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
- op->optinsn.insn);
+ text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, NULL);
list_del_init(&op->list);
}
@@ -448,12 +447,18 @@ void arch_optimize_kprobes(struct list_head *oplist)
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
u8 insn_buff[RELATIVEJUMP_SIZE];
+ u8 emulate_buff[RELATIVEJUMP_SIZE];
/* Set int3 to first byte for kprobes */
insn_buff[0] = BREAKPOINT_INSTRUCTION;
memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+
+ emulate_buff[0] = RELATIVEJUMP_OPCODE;
+ *(s32 *)(&emulate_buff[1]) = (s32)((long)op->optinsn.insn -
+ ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
- op->optinsn.insn);
+ emulate_buff);
}
/*
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8cd23d8309bf..2e586f579945 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -9,9 +9,11 @@
#include <linux/filter.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
+#include <linux/memory.h>
#include <asm/extable.h>
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
+#include <asm/text-patching.h>
static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{
@@ -96,6 +98,7 @@ static int bpf_size_to_x86_bytes(int bpf_size)
/* Pick a register outside of BPF range for JIT internal work */
#define AUX_REG (MAX_BPF_JIT_REG + 1)
+#define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
/*
* The following table maps BPF registers to x86-64 registers.
@@ -104,8 +107,8 @@ static int bpf_size_to_x86_bytes(int bpf_size)
* register in load/store instructions, it always needs an
* extra byte of encoding and is callee saved.
*
- * Also x86-64 register R9 is unused. x86-64 register R10 is
- * used for blinding (if enabled).
+ * x86-64 register R9 is not used by BPF programs, but can be used by BPF
+ * trampoline. x86-64 register R10 is used for blinding (if enabled).
*/
static const int reg2hex[] = {
[BPF_REG_0] = 0, /* RAX */
@@ -121,6 +124,7 @@ static const int reg2hex[] = {
[BPF_REG_FP] = 5, /* RBP readonly */
[BPF_REG_AX] = 2, /* R10 temp register */
[AUX_REG] = 3, /* R11 temp register */
+ [X86_REG_R9] = 1, /* R9 register, 6th function argument */
};
static const int reg2pt_regs[] = {
@@ -148,6 +152,7 @@ static bool is_ereg(u32 reg)
BIT(BPF_REG_7) |
BIT(BPF_REG_8) |
BIT(BPF_REG_9) |
+ BIT(X86_REG_R9) |
BIT(BPF_REG_AX));
}
@@ -198,8 +203,10 @@ struct jit_context {
/* Maximum number of bytes emitted while JITing one eBPF insn */
#define BPF_MAX_INSN_SIZE 128
#define BPF_INSN_SAFETY 64
+/* number of bytes emit_call() needs to generate call instruction */
+#define X86_CALL_SIZE 5
-#define PROLOGUE_SIZE 20
+#define PROLOGUE_SIZE 25
/*
* Emit x86-64 prologue code for BPF program and check its size.
@@ -208,8 +215,13 @@ struct jit_context {
static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
{
u8 *prog = *pprog;
- int cnt = 0;
+ int cnt = X86_CALL_SIZE;
+ /* BPF trampoline can be made to work without these nops,
+ * but let's waste 5 bytes for now and optimize later
+ */
+ memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt);
+ prog += cnt;
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
/* sub rsp, rounded_stack_depth */
@@ -390,6 +402,149 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
*pprog = prog;
}
+/* LDX: dst_reg = *(u8*)(src_reg + off) */
+static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ switch (size) {
+ case BPF_B:
+ /* Emit 'movzx rax, byte ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
+ break;
+ case BPF_H:
+ /* Emit 'movzx rax, word ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
+ break;
+ case BPF_W:
+ /* Emit 'mov eax, dword ptr [rax+0x14]' */
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
+ else
+ EMIT1(0x8B);
+ break;
+ case BPF_DW:
+ /* Emit 'mov rax, qword ptr [rax+0x14]' */
+ EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
+ break;
+ }
+ /*
+ * If insn->off == 0 we can save one extra byte, but
+ * special case of x86 R13 which always needs an offset
+ * is not worth the hassle
+ */
+ if (is_imm8(off))
+ EMIT2(add_2reg(0x40, src_reg, dst_reg), off);
+ else
+ EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), off);
+ *pprog = prog;
+}
+
+/* STX: *(u8*)(dst_reg + off) = src_reg */
+static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ switch (size) {
+ case BPF_B:
+ /* Emit 'mov byte ptr [rax + off], al' */
+ if (is_ereg(dst_reg) || is_ereg(src_reg) ||
+ /* We have to add extra byte for x86 SIL, DIL regs */
+ src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
+ EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
+ else
+ EMIT1(0x88);
+ break;
+ case BPF_H:
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89);
+ else
+ EMIT2(0x66, 0x89);
+ break;
+ case BPF_W:
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89);
+ else
+ EMIT1(0x89);
+ break;
+ case BPF_DW:
+ EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
+ break;
+ }
+ if (is_imm8(off))
+ EMIT2(add_2reg(0x40, dst_reg, src_reg), off);
+ else
+ EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), off);
+ *pprog = prog;
+}
+
+static int emit_call(u8 **pprog, void *func, void *ip)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ s64 offset;
+
+ offset = func - (ip + X86_CALL_SIZE);
+ if (!is_simm32(offset)) {
+ pr_err("Target call %p is out of range\n", func);
+ return -EINVAL;
+ }
+ EMIT1_off32(0xE8, offset);
+ *pprog = prog;
+ return 0;
+}
+
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+ void *old_addr, void *new_addr)
+{
+ u8 old_insn[X86_CALL_SIZE] = {};
+ u8 new_insn[X86_CALL_SIZE] = {};
+ u8 *prog;
+ int ret;
+
+ if (!is_kernel_text((long)ip) &&
+ !is_bpf_text_address((long)ip))
+ /* BPF trampoline in modules is not supported */
+ return -EINVAL;
+
+ if (old_addr) {
+ prog = old_insn;
+ ret = emit_call(&prog, old_addr, (void *)ip);
+ if (ret)
+ return ret;
+ }
+ if (new_addr) {
+ prog = new_insn;
+ ret = emit_call(&prog, new_addr, (void *)ip);
+ if (ret)
+ return ret;
+ }
+ ret = -EBUSY;
+ mutex_lock(&text_mutex);
+ switch (t) {
+ case BPF_MOD_NOP_TO_CALL:
+ if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
+ goto out;
+ text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
+ break;
+ case BPF_MOD_CALL_TO_CALL:
+ if (memcmp(ip, old_insn, X86_CALL_SIZE))
+ goto out;
+ text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
+ break;
+ case BPF_MOD_CALL_TO_NOP:
+ if (memcmp(ip, old_insn, X86_CALL_SIZE))
+ goto out;
+ text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE, NULL);
+ break;
+ }
+ ret = 0;
+out:
+ mutex_unlock(&text_mutex);
+ return ret;
+}
static bool ex_handler_bpf(const struct exception_table_entry *x,
struct pt_regs *regs, int trapnr,
@@ -773,68 +928,22 @@ st: if (is_imm8(insn->off))
/* STX: *(u8*)(dst_reg + off) = src_reg */
case BPF_STX | BPF_MEM | BPF_B:
- /* Emit 'mov byte ptr [rax + off], al' */
- if (is_ereg(dst_reg) || is_ereg(src_reg) ||
- /* We have to add extra byte for x86 SIL, DIL regs */
- src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
- EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
- else
- EMIT1(0x88);
- goto stx;
case BPF_STX | BPF_MEM | BPF_H:
- if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89);
- else
- EMIT2(0x66, 0x89);
- goto stx;
case BPF_STX | BPF_MEM | BPF_W:
- if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89);
- else
- EMIT1(0x89);
- goto stx;
case BPF_STX | BPF_MEM | BPF_DW:
- EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
-stx: if (is_imm8(insn->off))
- EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off);
- else
- EMIT1_off32(add_2reg(0x80, dst_reg, src_reg),
- insn->off);
+ emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
break;
/* LDX: dst_reg = *(u8*)(src_reg + off) */
case BPF_LDX | BPF_MEM | BPF_B:
case BPF_LDX | BPF_PROBE_MEM | BPF_B:
- /* Emit 'movzx rax, byte ptr [rax + off]' */
- EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
- goto ldx;
case BPF_LDX | BPF_MEM | BPF_H:
case BPF_LDX | BPF_PROBE_MEM | BPF_H:
- /* Emit 'movzx rax, word ptr [rax + off]' */
- EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
- goto ldx;
case BPF_LDX | BPF_MEM | BPF_W:
case BPF_LDX | BPF_PROBE_MEM | BPF_W:
- /* Emit 'mov eax, dword ptr [rax+0x14]' */
- if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
- else
- EMIT1(0x8B);
- goto ldx;
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
- /* Emit 'mov rax, qword ptr [rax+0x14]' */
- EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
-ldx: /*
- * If insn->off == 0 we can save one extra byte, but
- * special case of x86 R13 which always needs an offset
- * is not worth the hassle
- */
- if (is_imm8(insn->off))
- EMIT2(add_2reg(0x40, src_reg, dst_reg), insn->off);
- else
- EMIT1_off32(add_2reg(0x80, src_reg, dst_reg),
- insn->off);
+ emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
struct exception_table_entry *ex;
u8 *_insn = image + proglen;
@@ -899,13 +1008,8 @@ xadd: if (is_imm8(insn->off))
/* call */
case BPF_JMP | BPF_CALL:
func = (u8 *) __bpf_call_base + imm32;
- jmp_offset = func - (image + addrs[i]);
- if (!imm32 || !is_simm32(jmp_offset)) {
- pr_err("unsupported BPF func %d addr %p image %p\n",
- imm32, func, image);
+ if (!imm32 || emit_call(&prog, func, image + addrs[i - 1]))
return -EINVAL;
- }
- EMIT1_off32(0xE8, jmp_offset);
break;
case BPF_JMP | BPF_TAIL_CALL:
@@ -1138,6 +1242,210 @@ emit_jmp:
return proglen;
}
+static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args,
+ int stack_size)
+{
+ int i;
+ /* Store function arguments to stack.
+ * For a function that accepts two pointers the sequence will be:
+ * mov QWORD PTR [rbp-0x10],rdi
+ * mov QWORD PTR [rbp-0x8],rsi
+ */
+ for (i = 0; i < min(nr_args, 6); i++)
+ emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]),
+ BPF_REG_FP,
+ i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
+ -(stack_size - i * 8));
+}
+
+static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args,
+ int stack_size)
+{
+ int i;
+
+ /* Restore function arguments from stack.
+ * For a function that accepts two pointers the sequence will be:
+ * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
+ * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
+ */
+ for (i = 0; i < min(nr_args, 6); i++)
+ emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]),
+ i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
+ BPF_REG_FP,
+ -(stack_size - i * 8));
+}
+
+static int invoke_bpf(struct btf_func_model *m, u8 **pprog,
+ struct bpf_prog **progs, int prog_cnt, int stack_size)
+{
+ u8 *prog = *pprog;
+ int cnt = 0, i;
+
+ for (i = 0; i < prog_cnt; i++) {
+ if (emit_call(&prog, __bpf_prog_enter, prog))
+ return -EINVAL;
+ /* remember prog start time returned by __bpf_prog_enter */
+ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+
+ /* arg1: lea rdi, [rbp - stack_size] */
+ EMIT4(0x48, 0x8D, 0x7D, -stack_size);
+ /* arg2: progs[i]->insnsi for interpreter */
+ if (!progs[i]->jited)
+ emit_mov_imm64(&prog, BPF_REG_2,
+ (long) progs[i]->insnsi >> 32,
+ (u32) (long) progs[i]->insnsi);
+ /* call JITed bpf program or interpreter */
+ if (emit_call(&prog, progs[i]->bpf_func, prog))
+ return -EINVAL;
+
+ /* arg1: mov rdi, progs[i] */
+ emit_mov_imm64(&prog, BPF_REG_1, (long) progs[i] >> 32,
+ (u32) (long) progs[i]);
+ /* arg2: mov rsi, rbx <- start time in nsec */
+ emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
+ if (emit_call(&prog, __bpf_prog_exit, prog))
+ return -EINVAL;
+ }
+ *pprog = prog;
+ return 0;
+}
+
+/* Example:
+ * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
+ * its 'struct btf_func_model' will be nr_args=2
+ * The assembly code when eth_type_trans is executing after trampoline:
+ *
+ * push rbp
+ * mov rbp, rsp
+ * sub rsp, 16 // space for skb and dev
+ * push rbx // temp regs to pass start time
+ * mov qword ptr [rbp - 16], rdi // save skb pointer to stack
+ * mov qword ptr [rbp - 8], rsi // save dev pointer to stack
+ * call __bpf_prog_enter // rcu_read_lock and preempt_disable
+ * mov rbx, rax // remember start time in bpf stats are enabled
+ * lea rdi, [rbp - 16] // R1==ctx of bpf prog
+ * call addr_of_jited_FENTRY_prog
+ * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off
+ * mov rsi, rbx // prog start time
+ * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math
+ * mov rdi, qword ptr [rbp - 16] // restore skb pointer from stack
+ * mov rsi, qword ptr [rbp - 8] // restore dev pointer from stack
+ * pop rbx
+ * leave
+ * ret
+ *
+ * eth_type_trans has 5 byte nop at the beginning. These 5 bytes will be
+ * replaced with 'call generated_bpf_trampoline'. When it returns
+ * eth_type_trans will continue executing with original skb and dev pointers.
+ *
+ * The assembly code when eth_type_trans is called from trampoline:
+ *
+ * push rbp
+ * mov rbp, rsp
+ * sub rsp, 24 // space for skb, dev, return value
+ * push rbx // temp regs to pass start time
+ * mov qword ptr [rbp - 24], rdi // save skb pointer to stack
+ * mov qword ptr [rbp - 16], rsi // save dev pointer to stack
+ * call __bpf_prog_enter // rcu_read_lock and preempt_disable
+ * mov rbx, rax // remember start time if bpf stats are enabled
+ * lea rdi, [rbp - 24] // R1==ctx of bpf prog
+ * call addr_of_jited_FENTRY_prog // bpf prog can access skb and dev
+ * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off
+ * mov rsi, rbx // prog start time
+ * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math
+ * mov rdi, qword ptr [rbp - 24] // restore skb pointer from stack
+ * mov rsi, qword ptr [rbp - 16] // restore dev pointer from stack
+ * call eth_type_trans+5 // execute body of eth_type_trans
+ * mov qword ptr [rbp - 8], rax // save return value
+ * call __bpf_prog_enter // rcu_read_lock and preempt_disable
+ * mov rbx, rax // remember start time in bpf stats are enabled
+ * lea rdi, [rbp - 24] // R1==ctx of bpf prog
+ * call addr_of_jited_FEXIT_prog // bpf prog can access skb, dev, return value
+ * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off
+ * mov rsi, rbx // prog start time
+ * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math
+ * mov rax, qword ptr [rbp - 8] // restore eth_type_trans's return value
+ * pop rbx
+ * leave
+ * add rsp, 8 // skip eth_type_trans's frame
+ * ret // return to its caller
+ */
+int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
+ struct bpf_prog **fentry_progs, int fentry_cnt,
+ struct bpf_prog **fexit_progs, int fexit_cnt,
+ void *orig_call)
+{
+ int cnt = 0, nr_args = m->nr_args;
+ int stack_size = nr_args * 8;
+ u8 *prog;
+
+ /* x86-64 supports up to 6 arguments. 7+ can be added in the future */
+ if (nr_args > 6)
+ return -ENOTSUPP;
+
+ if ((flags & BPF_TRAMP_F_RESTORE_REGS) &&
+ (flags & BPF_TRAMP_F_SKIP_FRAME))
+ return -EINVAL;
+
+ if (flags & BPF_TRAMP_F_CALL_ORIG)
+ stack_size += 8; /* room for return value of orig_call */
+
+ if (flags & BPF_TRAMP_F_SKIP_FRAME)
+ /* skip patched call instruction and point orig_call to actual
+ * body of the kernel function.
+ */
+ orig_call += X86_CALL_SIZE;
+
+ prog = image;
+
+ EMIT1(0x55); /* push rbp */
+ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+ EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
+ EMIT1(0x53); /* push rbx */
+
+ save_regs(m, &prog, nr_args, stack_size);
+
+ if (fentry_cnt)
+ if (invoke_bpf(m, &prog, fentry_progs, fentry_cnt, stack_size))
+ return -EINVAL;
+
+ if (flags & BPF_TRAMP_F_CALL_ORIG) {
+ if (fentry_cnt)
+ restore_regs(m, &prog, nr_args, stack_size);
+
+ /* call original function */
+ if (emit_call(&prog, orig_call, prog))
+ return -EINVAL;
+ /* remember return value in a stack for bpf prog to access */
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
+ }
+
+ if (fexit_cnt)
+ if (invoke_bpf(m, &prog, fexit_progs, fexit_cnt, stack_size))
+ return -EINVAL;
+
+ if (flags & BPF_TRAMP_F_RESTORE_REGS)
+ restore_regs(m, &prog, nr_args, stack_size);
+
+ if (flags & BPF_TRAMP_F_CALL_ORIG)
+ /* restore original return value back into RAX */
+ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
+
+ EMIT1(0x5B); /* pop rbx */
+ EMIT1(0xC9); /* leave */
+ if (flags & BPF_TRAMP_F_SKIP_FRAME)
+ /* skip our return address and return to parent */
+ EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
+ EMIT1(0xC3); /* ret */
+ /* One half of the page has active running trampoline.
+ * Another half is an area for next trampoline.
+ * Make sure the trampoline generation logic doesn't overflow.
+ */
+ if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY))
+ return -EFAULT;
+ return 0;
+}
+
struct x64_jit_data {
struct bpf_binary_header *header;
int *addrs;