Skip to content

Commit

Permalink
Merge branch 'ebpf-tail-call'
Browse files Browse the repository at this point in the history
Alexei Starovoitov says:

====================
bpf: introduce bpf_tail_call() helper

introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
  ...
  bpf_tail_call(ctx, &jmp_table, index);
  ...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
  ...
  if (jmp_table[index])
    return (*jmp_table[index])(ctx);
  ...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.

Use cases:
- simplify complex programs
- dispatch into other programs
  (for example: index in jump table can be syscall number or network protocol)
- build dynamic chains of programs

The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.

patch 1 - support bpf_tail_call() in interpreter
patch 2 - support in x64 JIT
We've discussed what's neccessary to support it in arm64/s390 JITs
and it looks fine.
patch 3 - sample example for tracing
patch 4 - sample example for networking

More details in every patch.

This set went through several iterations of reviews/fixes and older
attempts can be seen:
https://git.kernel.org/cgit/linux/kernel/git/ast/bpf.git/log/?h=tail_call_v[123456]
- tail_call_v1 does it without touching JITs but introduces overhead
  for all programs that don't use this helper function.
- tail_call_v2 still has some overhead and x64 JIT does full stack
  unwind (prologue skipping optimization wasn't there)
- tail_call_v3 reuses 'call' instruction encoding and has interpreter
  overhead for every normal call
- tail_call_v4 fixes above architectural shortcomings and v5,v6 fix few
  more bugs

This last tail_call_v6 approach seems to be the best.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed May 21, 2015
2 parents e7582ba + 530b2c8 commit 3f55b7e
Show file tree
Hide file tree
Showing 17 changed files with 928 additions and 45 deletions.
150 changes: 126 additions & 24 deletions arch/x86/net/bpf_jit_comp.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <linux/filter.h>
#include <linux/if_vlan.h>
#include <asm/cacheflush.h>
#include <linux/bpf.h>

int bpf_jit_enable __read_mostly;

Expand All @@ -37,7 +38,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
return ptr + len;
}

#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0)
#define EMIT(bytes, len) \
do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)

#define EMIT1(b1) EMIT(b1, 1)
#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
Expand Down Expand Up @@ -186,31 +188,31 @@ struct jit_context {
#define BPF_MAX_INSN_SIZE 128
#define BPF_INSN_SAFETY 64

static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int oldproglen, struct jit_context *ctx)
#define STACKSIZE \
(MAX_BPF_STACK + \
32 /* space for rbx, r13, r14, r15 */ + \
8 /* space for skb_copy_bits() buffer */)

#define PROLOGUE_SIZE 51

/* emit x64 prologue code for BPF program and check it's size.
* bpf_tail_call helper will skip it while jumping into another program
*/
static void emit_prologue(u8 **pprog)
{
struct bpf_insn *insn = bpf_prog->insnsi;
int insn_cnt = bpf_prog->len;
bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
int i;
int proglen = 0;
u8 *prog = temp;
int stacksize = MAX_BPF_STACK +
32 /* space for rbx, r13, r14, r15 */ +
8 /* space for skb_copy_bits() buffer */;
u8 *prog = *pprog;
int cnt = 0;

EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */

/* sub rsp, stacksize */
EMIT3_off32(0x48, 0x81, 0xEC, stacksize);
/* sub rsp, STACKSIZE */
EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE);

/* all classic BPF filters use R6(rbx) save it */

/* mov qword ptr [rbp-X],rbx */
EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE);

/* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
* as temporary, so all tcpdump filters need to spill/fill R7(r13) and
Expand All @@ -221,16 +223,112 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
*/

/* mov qword ptr [rbp-X],r13 */
EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8);
EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8);
/* mov qword ptr [rbp-X],r14 */
EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16);
EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16);
/* mov qword ptr [rbp-X],r15 */
EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24);
EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24);

/* clear A and X registers */
EMIT2(0x31, 0xc0); /* xor eax, eax */
EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */

/* clear tail_cnt: mov qword ptr [rbp-X], rax */
EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32);

BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
*pprog = prog;
}

/* generate the following code:
* ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
* if (index >= array->map.max_entries)
* goto out;
* if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
* goto out;
* prog = array->prog[index];
* if (prog == NULL)
* goto out;
* goto *(prog->bpf_func + prologue_size);
* out:
*/
static void emit_bpf_tail_call(u8 **pprog)
{
u8 *prog = *pprog;
int label1, label2, label3;
int cnt = 0;

/* rdi - pointer to ctx
* rsi - pointer to bpf_array
* rdx - index in bpf_array
*/

/* if (index >= array->map.max_entries)
* goto out;
*/
EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */
offsetof(struct bpf_array, map.max_entries));
EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */
#define OFFSET1 44 /* number of bytes to jump */
EMIT2(X86_JBE, OFFSET1); /* jbe out */
label1 = cnt;

/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
* goto out;
*/
EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
#define OFFSET2 33
EMIT2(X86_JA, OFFSET2); /* ja out */
label2 = cnt;
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */

/* prog = array->prog[index]; */
EMIT4(0x48, 0x8D, 0x44, 0xD6); /* lea rax, [rsi + rdx * 8 + 0x50] */
EMIT1(offsetof(struct bpf_array, prog));
EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */

/* if (prog == NULL)
* goto out;
*/
EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */
#define OFFSET3 10
EMIT2(X86_JE, OFFSET3); /* je out */
label3 = cnt;

/* goto *(prog->bpf_func + prologue_size); */
EMIT4(0x48, 0x8B, 0x40, /* mov rax, qword ptr [rax + 32] */
offsetof(struct bpf_prog, bpf_func));
EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */

/* now we're ready to jump into next BPF program
* rdi == ctx (1st arg)
* rax == prog->bpf_func + prologue_size
*/
EMIT2(0xFF, 0xE0); /* jmp rax */

/* out: */
BUILD_BUG_ON(cnt - label1 != OFFSET1);
BUILD_BUG_ON(cnt - label2 != OFFSET2);
BUILD_BUG_ON(cnt - label3 != OFFSET3);
*pprog = prog;
}

static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int oldproglen, struct jit_context *ctx)
{
struct bpf_insn *insn = bpf_prog->insnsi;
int insn_cnt = bpf_prog->len;
bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
int i, cnt = 0;
int proglen = 0;
u8 *prog = temp;

emit_prologue(&prog);

if (seen_ld_abs) {
/* r9d : skb->len - skb->data_len (headlen)
* r10 : skb->data
Expand Down Expand Up @@ -739,6 +837,10 @@ xadd: if (is_imm8(insn->off))
}
break;

case BPF_JMP | BPF_CALL | BPF_X:
emit_bpf_tail_call(&prog);
break;

/* cond jump */
case BPF_JMP | BPF_JEQ | BPF_X:
case BPF_JMP | BPF_JNE | BPF_X:
Expand Down Expand Up @@ -891,13 +993,13 @@ xadd: if (is_imm8(insn->off))
/* update cleanup_addr */
ctx->cleanup_addr = proglen;
/* mov rbx, qword ptr [rbp-X] */
EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize);
EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE);
/* mov r13, qword ptr [rbp-X] */
EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8);
EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8);
/* mov r14, qword ptr [rbp-X] */
EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16);
EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16);
/* mov r15, qword ptr [rbp-X] */
EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24);
EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24);

EMIT1(0xC9); /* leave */
EMIT1(0xC3); /* ret */
Expand Down
22 changes: 22 additions & 0 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,27 @@ struct bpf_prog_aux {
struct work_struct work;
};

struct bpf_array {
struct bpf_map map;
u32 elem_size;
/* 'ownership' of prog_array is claimed by the first program that
* is going to use this map or by the first program which FD is stored
* in the map to make sure that all callers and callees have the same
* prog_type and JITed flag
*/
enum bpf_prog_type owner_prog_type;
bool owner_jited;
union {
char value[0] __aligned(8);
struct bpf_prog *prog[0] __aligned(8);
};
};
#define MAX_TAIL_CALL_CNT 32

u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
void bpf_prog_array_map_clear(struct bpf_map *map);
bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);

#ifdef CONFIG_BPF_SYSCALL
void bpf_register_prog_type(struct bpf_prog_type_list *tl);
void bpf_register_map_type(struct bpf_map_type_list *tl);
Expand Down Expand Up @@ -160,5 +181,6 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto;

extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
extern const struct bpf_func_proto bpf_tail_call_proto;

#endif /* _LINUX_BPF_H */
2 changes: 1 addition & 1 deletion include/linux/filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)

int sk_filter(struct sock *sk, struct sk_buff *skb);

void bpf_prog_select_runtime(struct bpf_prog *fp);
int bpf_prog_select_runtime(struct bpf_prog *fp);
void bpf_prog_free(struct bpf_prog *fp);

struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
Expand Down
10 changes: 10 additions & 0 deletions include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_UNSPEC,
BPF_MAP_TYPE_HASH,
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
};

enum bpf_prog_type {
Expand Down Expand Up @@ -210,6 +211,15 @@ enum bpf_func_id {
* Return: 0 on success
*/
BPF_FUNC_l4_csum_replace,

/**
* bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program
* @ctx: context pointer passed to next program
* @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
* @index: index inside array that selects specific program to run
* Return: 0 on success
*/
BPF_FUNC_tail_call,
__BPF_FUNC_MAX_ID,
};

Expand Down
Loading

0 comments on commit 3f55b7e

Please sign in to comment.