From 88e69a1fcc1e67dec3025af64736a84532528242 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 24 Feb 2018 01:07:58 +0100 Subject: [PATCH 1/6] bpf, x64: save one byte per shl/shr/sar when imm is 1 When we shift by one, we can use a different encoding where imm is not explicitly needed, which saves 1 byte per such op. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4923d92f918d5..4bc36bd1b97a4 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -640,7 +640,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_RSH: b3 = 0xE8; break; case BPF_ARSH: b3 = 0xF8; break; } - EMIT3(0xC1, add_1reg(b3, dst_reg), imm32); + + if (imm32 == 1) + EMIT2(0xD1, add_1reg(b3, dst_reg)); + else + EMIT3(0xC1, add_1reg(b3, dst_reg), imm32); break; case BPF_ALU | BPF_LSH | BPF_X: From 6fe8b9c1f41dfe3209dabc5bd0726e003a065288 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 24 Feb 2018 01:07:59 +0100 Subject: [PATCH 2/6] bpf, x64: save several bytes by using mov over movabsq when possible While analyzing some of the more complex BPF programs from Cilium, I found that LLVM generally prefers to emit LD_IMM64 instead of MOV32 BPF instructions for loading unsigned 32-bit immediates into a register. Given we cannot change the current/stable LLVM versions that are already out there, lets optimize this case such that the JIT prefers to emit 'mov %eax, imm32' over 'movabsq %rax, imm64' whenever suitable in order to reduce the image size by 4-5 bytes per such load in the typical case, reducing image size on some of the bigger programs by up to 4%. emit_mov_imm32() and emit_mov_imm64() have been added as helpers. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 125 +++++++++++++++++++++--------------- 1 file changed, 74 insertions(+), 51 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4bc36bd1b97a4..f3e5cd8c1e689 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -60,7 +60,12 @@ static bool is_imm8(int value) static bool is_simm32(s64 value) { - return value == (s64) (s32) value; + return value == (s64)(s32)value; +} + +static bool is_uimm32(u64 value) +{ + return value == (u64)(u32)value; } /* mov dst, src */ @@ -355,6 +360,68 @@ static void emit_load_skb_data_hlen(u8 **pprog) *pprog = prog; } +static void emit_mov_imm32(u8 **pprog, bool sign_propagate, + u32 dst_reg, const u32 imm32) +{ + u8 *prog = *pprog; + u8 b1, b2, b3; + int cnt = 0; + + /* optimization: if imm32 is positive, use 'mov %eax, imm32' + * (which zero-extends imm32) to save 2 bytes. + */ + if (sign_propagate && (s32)imm32 < 0) { + /* 'mov %rax, imm32' sign extends imm32 */ + b1 = add_1mod(0x48, dst_reg); + b2 = 0xC7; + b3 = 0xC0; + EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32); + goto done; + } + + /* optimization: if imm32 is zero, use 'xor %eax, %eax' + * to save 3 bytes. + */ + if (imm32 == 0) { + if (is_ereg(dst_reg)) + EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + b2 = 0x31; /* xor */ + b3 = 0xC0; + EMIT2(b2, add_2reg(b3, dst_reg, dst_reg)); + goto done; + } + + /* mov %eax, imm32 */ + if (is_ereg(dst_reg)) + EMIT1(add_1mod(0x40, dst_reg)); + EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); +done: + *pprog = prog; +} + +static void emit_mov_imm64(u8 **pprog, u32 dst_reg, + const u32 imm32_hi, const u32 imm32_lo) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { + /* For emitting plain u32, where sign bit must not be + * propagated LLVM tends to load imm64 over mov32 + * directly, so save couple of bytes by just doing + * 'mov %eax, imm32' instead. + */ + emit_mov_imm32(&prog, false, dst_reg, imm32_lo); + } else { + /* movabsq %rax, imm64 */ + EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); + EMIT(imm32_lo, 4); + EMIT(imm32_hi, 4); + } + + *pprog = prog; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -377,7 +444,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, const s32 imm32 = insn->imm; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; - u8 b1 = 0, b2 = 0, b3 = 0; + u8 b2 = 0, b3 = 0; s64 jmp_offset; u8 jmp_cond; bool reload_skb_data; @@ -485,58 +552,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; case BPF_ALU64 | BPF_MOV | BPF_K: - /* optimization: if imm32 is positive, - * use 'mov eax, imm32' (which zero-extends imm32) - * to save 2 bytes - */ - if (imm32 < 0) { - /* 'mov rax, imm32' sign extends imm32 */ - b1 = add_1mod(0x48, dst_reg); - b2 = 0xC7; - b3 = 0xC0; - EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32); - break; - } - case BPF_ALU | BPF_MOV | BPF_K: - /* optimization: if imm32 is zero, use 'xor ,' - * to save 3 bytes. - */ - if (imm32 == 0) { - if (is_ereg(dst_reg)) - EMIT1(add_2mod(0x40, dst_reg, dst_reg)); - b2 = 0x31; /* xor */ - b3 = 0xC0; - EMIT2(b2, add_2reg(b3, dst_reg, dst_reg)); - break; - } - - /* mov %eax, imm32 */ - if (is_ereg(dst_reg)) - EMIT1(add_1mod(0x40, dst_reg)); - EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); + emit_mov_imm32(&prog, BPF_CLASS(insn->code) == BPF_ALU64, + dst_reg, imm32); break; case BPF_LD | BPF_IMM | BPF_DW: - /* optimization: if imm64 is zero, use 'xor ,' - * to save 7 bytes. - */ - if (insn[0].imm == 0 && insn[1].imm == 0) { - b1 = add_2mod(0x48, dst_reg, dst_reg); - b2 = 0x31; /* xor */ - b3 = 0xC0; - EMIT3(b1, b2, add_2reg(b3, dst_reg, dst_reg)); - - insn++; - i++; - break; - } - - /* movabsq %rax, imm64 */ - EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); - EMIT(insn[0].imm, 4); - EMIT(insn[1].imm, 4); - + emit_mov_imm64(&prog, dst_reg, insn[1].imm, insn[0].imm); insn++; i++; break; @@ -604,7 +626,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT_mov(BPF_REG_0, src_reg); else /* mov rax, imm32 */ - EMIT3_off32(0x48, 0xC7, 0xC0, imm32); + emit_mov_imm32(&prog, true, + BPF_REG_0, imm32); if (BPF_CLASS(insn->code) == BPF_ALU64) EMIT1(add_1mod(0x48, AUX_REG)); From d806a0cf2a1ddb97c91d902ef1c8219e1e2b2c4c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 24 Feb 2018 01:08:00 +0100 Subject: [PATCH 3/6] bpf, x64: save several bytes when mul dest is r0/r3 anyway Instead of unconditionally performing push/pop on rax/rdx in case of multiplication, we can save a few bytes in case of dest register being either BPF r0 (rax) or r3 (rdx) since the result is written in there anyway. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f3e5cd8c1e689..9895ca3830230 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -615,8 +615,10 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU64 | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_X: - EMIT1(0x50); /* push rax */ - EMIT1(0x52); /* push rdx */ + if (dst_reg != BPF_REG_0) + EMIT1(0x50); /* push rax */ + if (dst_reg != BPF_REG_3) + EMIT1(0x52); /* push rdx */ /* mov r11, dst_reg */ EMIT_mov(AUX_REG, dst_reg); @@ -636,14 +638,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, /* mul(q) r11 */ EMIT2(0xF7, add_1reg(0xE0, AUX_REG)); - /* mov r11, rax */ - EMIT_mov(AUX_REG, BPF_REG_0); - - EMIT1(0x5A); /* pop rdx */ - EMIT1(0x58); /* pop rax */ - - /* mov dst_reg, r11 */ - EMIT_mov(dst_reg, AUX_REG); + if (dst_reg != BPF_REG_3) + EMIT1(0x5A); /* pop rdx */ + if (dst_reg != BPF_REG_0) { + /* mov dst_reg, rax */ + EMIT_mov(dst_reg, BPF_REG_0); + EMIT1(0x58); /* pop rax */ + } break; /* shifts */ From 4c38e2f386b4fc5fd95d1203c74819948e2e903d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 24 Feb 2018 01:08:01 +0100 Subject: [PATCH 4/6] bpf, x64: save few bytes when mul is in alu32 Add a generic emit_mov_reg() helper in order to reuse it in BPF multiplication to load the src into rax, we can save a few bytes in alu32 while doing so. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 43 ++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9895ca3830230..5b8fc1326aa1c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -422,6 +422,24 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, *pprog = prog; } +static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is64) { + /* mov dst, src */ + EMIT_mov(dst_reg, src_reg); + } else { + /* mov32 dst, src */ + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT1(add_2mod(0x40, dst_reg, src_reg)); + EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg)); + } + + *pprog = prog; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -480,16 +498,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg)); break; - /* mov dst, src */ case BPF_ALU64 | BPF_MOV | BPF_X: - EMIT_mov(dst_reg, src_reg); - break; - - /* mov32 dst, src */ case BPF_ALU | BPF_MOV | BPF_X: - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); - EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg)); + emit_mov_reg(&prog, + BPF_CLASS(insn->code) == BPF_ALU64, + dst_reg, src_reg); break; /* neg dst */ @@ -615,6 +628,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU64 | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_X: + { + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + if (dst_reg != BPF_REG_0) EMIT1(0x50); /* push rax */ if (dst_reg != BPF_REG_3) @@ -624,14 +640,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT_mov(AUX_REG, dst_reg); if (BPF_SRC(insn->code) == BPF_X) - /* mov rax, src_reg */ - EMIT_mov(BPF_REG_0, src_reg); + emit_mov_reg(&prog, is64, BPF_REG_0, src_reg); else - /* mov rax, imm32 */ - emit_mov_imm32(&prog, true, - BPF_REG_0, imm32); + emit_mov_imm32(&prog, is64, BPF_REG_0, imm32); - if (BPF_CLASS(insn->code) == BPF_ALU64) + if (is64) EMIT1(add_1mod(0x48, AUX_REG)); else if (is_ereg(AUX_REG)) EMIT1(add_1mod(0x40, AUX_REG)); @@ -646,7 +659,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT1(0x58); /* pop rax */ } break; - + } /* shifts */ case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: From 0869175220b339b81de48872c8198c3ed75782e3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 24 Feb 2018 01:08:02 +0100 Subject: [PATCH 5/6] bpf, x64: save 5 bytes in prologue when ebpf insns came from cbpf While it's rather cumbersome to reduce prologue for cBPF->eBPF migrations wrt spill/fill for r15 which is callee saved register due to bpf_error path in bpf_jit.S that is both used by migrations as well as native eBPF, we can still trivially save 5 bytes in prologue for the former since tail calls can never be used there. cBPF->eBPF migrations also have their own custom prologue in BPF asm that xors A and X reg anyway, so it's fine we skip this here. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5b8fc1326aa1c..70f9748da7aaa 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -216,7 +216,7 @@ struct jit_context { /* emit x64 prologue code for BPF program and check it's size. * bpf_tail_call helper will skip it while jumping into another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth) +static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) { u8 *prog = *pprog; int cnt = 0; @@ -251,18 +251,21 @@ static void emit_prologue(u8 **pprog, u32 stack_depth) /* mov qword ptr [rbp+24],r15 */ EMIT4(0x4C, 0x89, 0x7D, 24); - /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls - * we need to reset the counter to 0. It's done in two instructions, - * resetting rax register to 0 (xor on eax gets 0 extended), and - * moving it to the counter location. - */ + if (!ebpf_from_cbpf) { + /* Clear the tail call counter (tail_call_cnt): for eBPF tail + * calls we need to reset the counter to 0. It's done in two + * instructions, resetting rax register to 0, and moving it + * to the counter location. + */ - /* xor eax, eax */ - EMIT2(0x31, 0xc0); - /* mov qword ptr [rbp+32], rax */ - EMIT4(0x48, 0x89, 0x45, 32); + /* xor eax, eax */ + EMIT2(0x31, 0xc0); + /* mov qword ptr [rbp+32], rax */ + EMIT4(0x48, 0x89, 0x45, 32); + + BUILD_BUG_ON(cnt != PROLOGUE_SIZE); + } - BUILD_BUG_ON(cnt != PROLOGUE_SIZE); *pprog = prog; } @@ -453,7 +456,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int proglen = 0; u8 *prog = temp; - emit_prologue(&prog, bpf_prog->aux->stack_depth); + emit_prologue(&prog, bpf_prog->aux->stack_depth, + bpf_prog_was_classic(bpf_prog)); if (seen_ld_abs) emit_load_skb_data_hlen(&prog); From 23d191a82c133c31bb85aa4b4b26851cd4a4b4ac Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 24 Feb 2018 01:08:03 +0100 Subject: [PATCH 6/6] bpf: add various jit test cases Add few test cases that check the rnu-time results under JIT. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 89 +++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 2971ba2829ace..c987d3a2426f9 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -11140,6 +11140,95 @@ static struct bpf_test tests[] = { .result = REJECT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, + { + "jit: lsh, rsh, arsh by 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_1, 0xff), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 1), + BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3fc, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 1), + BPF_ALU32_IMM(BPF_RSH, BPF_REG_1, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0xff, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x7f, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 2, + }, + { + "jit: mov32 for ldimm64, 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_LD_IMM64(BPF_REG_1, 0xfeffffffffffffffULL), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 32), + BPF_LD_IMM64(BPF_REG_2, 0xfeffffffULL), + BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 2, + }, + { + "jit: mov32 for ldimm64, 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_1, 0x1ffffffffULL), + BPF_LD_IMM64(BPF_REG_2, 0xffffffffULL), + BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 2, + }, + { + "jit: various mul tests", + .insns = { + BPF_LD_IMM64(BPF_REG_2, 0xeeff0d413122ULL), + BPF_LD_IMM64(BPF_REG_0, 0xfefefeULL), + BPF_LD_IMM64(BPF_REG_1, 0xefefefULL), + BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_1), + BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_LD_IMM64(BPF_REG_3, 0xfefefeULL), + BPF_ALU64_REG(BPF_MUL, BPF_REG_3, BPF_REG_1), + BPF_JMP_REG(BPF_JEQ, BPF_REG_3, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV32_REG(BPF_REG_2, BPF_REG_2), + BPF_LD_IMM64(BPF_REG_0, 0xfefefeULL), + BPF_ALU32_REG(BPF_MUL, BPF_REG_0, BPF_REG_1), + BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_LD_IMM64(BPF_REG_3, 0xfefefeULL), + BPF_ALU32_REG(BPF_MUL, BPF_REG_3, BPF_REG_1), + BPF_JMP_REG(BPF_JEQ, BPF_REG_3, BPF_REG_2, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_LD_IMM64(BPF_REG_0, 0x952a7bbcULL), + BPF_LD_IMM64(BPF_REG_1, 0xfefefeULL), + BPF_LD_IMM64(BPF_REG_2, 0xeeff0d413122ULL), + BPF_ALU32_REG(BPF_MUL, BPF_REG_2, BPF_REG_1), + BPF_JMP_REG(BPF_JEQ, BPF_REG_2, BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 2, + }, + }; static int probe_filter_length(const struct bpf_insn *fp)