From a646c9b2dabae19138926b888c979e43fb85362b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 30 Nov 2017 21:32:48 -0800 Subject: [PATCH 01/13] nfp: fix old kdoc issues Since commit 3a025e1d1c2e ("Add optional check for bad kernel-doc comments") when built with W=1 build will complain about kdoc errors. Fix the kdoc issues we have. kdoc is still confused by defines in nfp_net_ctrl.h but those are not really errors. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/nfp_net.h | 2 ++ drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c | 9 +++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h index 7f9857c276b16..3801c52098d57 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h @@ -548,6 +548,8 @@ struct nfp_net_dp { * @max_r_vecs: Number of allocated interrupt vectors for RX/TX * @max_tx_rings: Maximum number of TX rings supported by the Firmware * @max_rx_rings: Maximum number of RX rings supported by the Firmware + * @stride_rx: Queue controller RX queue spacing + * @stride_tx: Queue controller TX queue spacing * @r_vecs: Pre-allocated array of ring vectors * @irq_entries: Pre-allocated array of MSI-X entries * @lsc_handler: Handler for Link State Change interrupt diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c index 04dd5758ecf54..3fcb522d2e852 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c @@ -372,8 +372,7 @@ nfp_cpp_area_alloc(struct nfp_cpp *cpp, u32 dest, * that it can be accessed directly. * * NOTE: @address and @size must be 32-bit aligned values. - * - * NOTE: The area must also be 'released' when the structure is freed. + * The area must also be 'released' when the structure is freed. * * Return: NFP CPP Area handle, or NULL */ @@ -536,8 +535,7 @@ void nfp_cpp_area_release_free(struct nfp_cpp_area *area) * Read data from indicated CPP region. * * NOTE: @offset and @length must be 32-bit aligned values. - * - * NOTE: Area must have been locked down with an 'acquire'. + * Area must have been locked down with an 'acquire'. * * Return: length of io, or -ERRNO */ @@ -558,8 +556,7 @@ int nfp_cpp_area_read(struct nfp_cpp_area *area, * Write data to indicated CPP region. * * NOTE: @offset and @length must be 32-bit aligned values. - * - * NOTE: Area must have been locked down with an 'acquire'. + * Area must have been locked down with an 'acquire'. * * Return: length of io, or -ERRNO */ From 854dc87d1a7be1f11b2d079a80a822742d6e560e Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 30 Nov 2017 21:32:49 -0800 Subject: [PATCH 02/13] nfp: bpf: support backward jump This patch adds support for backward jump on NFP. - restrictions on backward jump in various functions have been removed. - nfp_fixup_branches now supports backward jump. There is one thing to note, currently an input eBPF JMP insn may generate several NFP insns, for example, NFP imm move insn A \ NFP compare insn B --> 3 NFP insn jited from eBPF JMP insn M NFP branch insn C / --- NFP insn X --> 1 NFP insn jited from eBPF insn N --- ... therefore, we are doing sanity check to make sure the last jited insn from an eBPF JMP is a NFP branch instruction. Once backward jump is allowed, it is possible an eBPF JMP insn is at the end of the program. This is however causing trouble for the sanity check. Because the sanity check requires the end index of the NFP insns jited from one eBPF insn while only the start index is recorded before this patch that we can only get the end index by: start_index_of_the_next_eBPF_insn - 1 or for the above example: start_index_of_eBPF_insn_N (which is the index of NFP insn X) - 1 nfp_fixup_branches was using nfp_for_each_insn_walk2 to expose *next* insn to each iteration during the traversal so the last index could be calculated from which. Now, it needs some extra code to handle the last insn. Meanwhile, the use of walk2 is actually unnecessary, we could simply use generic single instruction walk to do this, the next insn could be easily calculated using list_next_entry. So, this patch migrates the jump fixup traversal method to *list_for_each_entry*, this simplifies the code logic a little bit. The other thing to note is a new state variable "last_bpf_off" is introduced to track the index of the last jited NFP insn. This is necessary because NFP is generating special purposes epilogue sequences, so the index of the last jited NFP insn is *not* always nfp_prog->prog_len - 1. Suggested-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 66 +++++++++++-------- drivers/net/ethernet/netronome/nfp/bpf/main.h | 4 +- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 995e95410b118..20daf6b95601b 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016 Netronome Systems, Inc. + * Copyright (C) 2016-2017 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -975,9 +975,6 @@ wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, { const struct bpf_insn *insn = &meta->insn; - if (insn->off < 0) /* TODO */ - return -EOPNOTSUPP; - wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op, insn->src_reg * 2, br_mask, insn->off); wrp_test_reg_one(nfp_prog, insn->dst_reg * 2 + 1, alu_op, @@ -995,9 +992,6 @@ wrp_cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 reg = insn->dst_reg * 2; swreg tmp_reg; - if (insn->off < 0) /* TODO */ - return -EOPNOTSUPP; - tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog)); if (!swap) emit_alu(nfp_prog, reg_none(), reg_a(reg), ALU_OP_SUB, tmp_reg); @@ -1027,9 +1021,6 @@ wrp_cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, areg = insn->dst_reg * 2; breg = insn->src_reg * 2; - if (insn->off < 0) /* TODO */ - return -EOPNOTSUPP; - if (swap) { areg ^= breg; breg ^= areg; @@ -1630,8 +1621,6 @@ static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - if (meta->insn.off < 0) /* TODO */ - return -EOPNOTSUPP; emit_br(nfp_prog, BR_UNC, meta->insn.off, 0); return 0; @@ -1646,9 +1635,6 @@ static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) or1 = reg_a(insn->dst_reg * 2); or2 = reg_b(insn->dst_reg * 2 + 1); - if (insn->off < 0) /* TODO */ - return -EOPNOTSUPP; - if (imm & ~0U) { tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog)); emit_alu(nfp_prog, imm_a(nfp_prog), @@ -1695,9 +1681,6 @@ static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) u64 imm = insn->imm; /* sign extend */ swreg tmp_reg; - if (insn->off < 0) /* TODO */ - return -EOPNOTSUPP; - if (!imm) { meta->skip = true; return 0; @@ -1726,9 +1709,6 @@ static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) u64 imm = insn->imm; /* sign extend */ swreg tmp_reg; - if (insn->off < 0) /* TODO */ - return -EOPNOTSUPP; - if (!imm) { emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2), ALU_OP_OR, reg_b(insn->dst_reg * 2 + 1)); @@ -1753,9 +1733,6 @@ static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { const struct bpf_insn *insn = &meta->insn; - if (insn->off < 0) /* TODO */ - return -EOPNOTSUPP; - emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(insn->dst_reg * 2), ALU_OP_XOR, reg_b(insn->src_reg * 2)); emit_alu(nfp_prog, imm_b(nfp_prog), reg_a(insn->dst_reg * 2 + 1), @@ -1888,16 +1865,25 @@ static void br_set_offset(u64 *instr, u16 offset) static int nfp_fixup_branches(struct nfp_prog *nfp_prog) { struct nfp_insn_meta *meta, *next; - u32 off, br_idx; - u32 idx; + u32 idx, br_idx; + int off; - nfp_for_each_insn_walk2(nfp_prog, meta, next) { + list_for_each_entry(meta, &nfp_prog->insns, l) { if (meta->skip) continue; if (BPF_CLASS(meta->insn.code) != BPF_JMP) continue; - br_idx = nfp_prog_offset_to_index(nfp_prog, next->off) - 1; + if (list_is_last(&meta->l, &nfp_prog->insns)) { + next = NULL; + idx = nfp_prog->last_bpf_off; + } else { + next = list_next_entry(meta, l); + idx = next->off - 1; + } + + br_idx = nfp_prog_offset_to_index(nfp_prog, idx); + if (!nfp_is_br(nfp_prog->prog[br_idx])) { pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n", br_idx, meta->insn.code, nfp_prog->prog[br_idx]); @@ -1914,10 +1900,30 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog) return -ELOOP; } - while (off && nfp_meta_has_next(nfp_prog, next)) { + if (!next) { + /* When "next" is NULL, "meta" is the last node in the + * list. Given it is an JMP, it then must be a backward + * jump. + * + * For eBPF, the jump offset is against pc + 1, so we + * need to compensate the offset by 1 as we are pointing + * "next" to the current node "meta". + */ + if (WARN_ON_ONCE(off > -2)) + return -ELOOP; + + next = meta; + off += 1; + } + + while (off > 0 && nfp_meta_has_next(nfp_prog, next)) { next = nfp_meta_next(next); off--; } + while (off < 0 && nfp_meta_has_prev(nfp_prog, next)) { + next = nfp_meta_prev(next); + off++; + } if (off) { pr_err("Fixup found too large jump!! %d\n", off); return -ELOOP; @@ -2105,6 +2111,8 @@ static int nfp_translate(struct nfp_prog *nfp_prog) nfp_prog->n_translated++; } + nfp_prog->last_bpf_off = nfp_prog_current_offset(nfp_prog) - 1; + nfp_outro(nfp_prog); if (nfp_prog->error) return nfp_prog->error; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 082a15f6dfb5b..0f4d218fc77ae 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016 Netronome Systems, Inc. + * Copyright (C) 2016-2017 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -142,6 +142,7 @@ static inline u8 mbpf_mode(const struct nfp_insn_meta *meta) * @verifier_meta: temporary storage for verifier's insn meta * @type: BPF program type * @start_off: address of the first instruction in the memory + * @last_bpf_off: address of the last instruction translated from BPF * @tgt_out: jump target for normal exit * @tgt_abort: jump target for abort (e.g. access outside of packet buffer) * @tgt_done: jump target to get the next packet @@ -160,6 +161,7 @@ struct nfp_prog { enum bpf_prog_type type; unsigned int start_off; + unsigned int last_bpf_off; unsigned int tgt_out; unsigned int tgt_abort; unsigned int tgt_done; From 5b674140addc3c863efa227946ad7328f016a7a3 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 30 Nov 2017 21:32:50 -0800 Subject: [PATCH 03/13] nfp: bpf: record jump destination to simplify jump fixup eBPF insns are internally organized as dual-list inside NFP offload JIT. Random access to an insn needs to be done by either forward or backward traversal along the list. One place we need to do such traversal is at nfp_fixup_branches where one traversal is needed for each jump insn to find the destination. Such traversals could be avoided if jump destinations are collected through a single travesal in a pre-scan pass, and such information could also be useful in other places where jump destination info are needed. This patch adds such jump destination collection in nfp_prog_prepare. Suggested-by: Jakub Kicinski Signed-off-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 57 +++---------------- drivers/net/ethernet/netronome/nfp/bpf/main.h | 13 ++++- .../net/ethernet/netronome/nfp/bpf/offload.c | 22 ++++++- .../net/ethernet/netronome/nfp/bpf/verifier.c | 4 +- 4 files changed, 41 insertions(+), 55 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 20daf6b95601b..f76659ecb6544 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -65,12 +65,6 @@ next = nfp_meta_next(pos), \ next2 = nfp_meta_next(next)) -static bool -nfp_meta_has_next(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) -{ - return meta->l.next != &nfp_prog->insns; -} - static bool nfp_meta_has_prev(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { @@ -1864,9 +1858,8 @@ static void br_set_offset(u64 *instr, u16 offset) /* --- Assembler logic --- */ static int nfp_fixup_branches(struct nfp_prog *nfp_prog) { - struct nfp_insn_meta *meta, *next; + struct nfp_insn_meta *meta, *jmp_dst; u32 idx, br_idx; - int off; list_for_each_entry(meta, &nfp_prog->insns, l) { if (meta->skip) @@ -1874,13 +1867,10 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog) if (BPF_CLASS(meta->insn.code) != BPF_JMP) continue; - if (list_is_last(&meta->l, &nfp_prog->insns)) { - next = NULL; + if (list_is_last(&meta->l, &nfp_prog->insns)) idx = nfp_prog->last_bpf_off; - } else { - next = list_next_entry(meta, l); - idx = next->off - 1; - } + else + idx = list_next_entry(meta, l)->off - 1; br_idx = nfp_prog_offset_to_index(nfp_prog, idx); @@ -1893,43 +1883,14 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog) if (FIELD_GET(OP_BR_SPECIAL, nfp_prog->prog[br_idx])) continue; - /* Find the target offset in assembler realm */ - off = meta->insn.off; - if (!off) { - pr_err("Fixup found zero offset!!\n"); + if (!meta->jmp_dst) { + pr_err("Non-exit jump doesn't have destination info recorded!!\n"); return -ELOOP; } - if (!next) { - /* When "next" is NULL, "meta" is the last node in the - * list. Given it is an JMP, it then must be a backward - * jump. - * - * For eBPF, the jump offset is against pc + 1, so we - * need to compensate the offset by 1 as we are pointing - * "next" to the current node "meta". - */ - if (WARN_ON_ONCE(off > -2)) - return -ELOOP; - - next = meta; - off += 1; - } - - while (off > 0 && nfp_meta_has_next(nfp_prog, next)) { - next = nfp_meta_next(next); - off--; - } - while (off < 0 && nfp_meta_has_prev(nfp_prog, next)) { - next = nfp_meta_prev(next); - off++; - } - if (off) { - pr_err("Fixup found too large jump!! %d\n", off); - return -ELOOP; - } + jmp_dst = meta->jmp_dst; - if (next->skip) { + if (jmp_dst->skip) { pr_err("Branch landing on removed instruction!!\n"); return -ELOOP; } @@ -1938,7 +1899,7 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog) idx <= br_idx; idx++) { if (!nfp_is_br(nfp_prog->prog[idx])) continue; - br_set_offset(&nfp_prog->prog[idx], next->off); + br_set_offset(&nfp_prog->prog[idx], jmp_dst->off); } } diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 0f4d218fc77ae..e488656f406cd 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -94,6 +94,7 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); * @insn: BPF instruction * @ptr: pointer type for memory operations * @ptr_not_const: pointer is not always constant + * @jmp_dst: destination info for jump instructions * @off: index of first generated machine instruction (in nfp_prog.prog) * @n: eBPF instruction number * @skip: skip this instruction (optimized out) @@ -102,8 +103,13 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); */ struct nfp_insn_meta { struct bpf_insn insn; - struct bpf_reg_state ptr; - bool ptr_not_const; + union { + struct { + struct bpf_reg_state ptr; + bool ptr_not_const; + }; + struct nfp_insn_meta *jmp_dst; + }; unsigned int off; unsigned short n; bool skip; @@ -191,4 +197,7 @@ int nfp_bpf_translate(struct nfp_app *app, struct nfp_net *nn, struct bpf_prog *prog); int nfp_bpf_destroy(struct nfp_app *app, struct nfp_net *nn, struct bpf_prog *prog); +struct nfp_insn_meta * +nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + unsigned int insn_idx, unsigned int n_insns); #endif diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index bc879aeb62d4e..240db663d83fa 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016 Netronome Systems, Inc. + * Copyright (C) 2016-2017 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -55,11 +55,10 @@ static int nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog, unsigned int cnt) { + struct nfp_insn_meta *meta; unsigned int i; for (i = 0; i < cnt; i++) { - struct nfp_insn_meta *meta; - meta = kzalloc(sizeof(*meta), GFP_KERNEL); if (!meta) return -ENOMEM; @@ -70,6 +69,23 @@ nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog, list_add_tail(&meta->l, &nfp_prog->insns); } + /* Another pass to record jump information. */ + list_for_each_entry(meta, &nfp_prog->insns, l) { + u64 code = meta->insn.code; + + if (BPF_CLASS(code) == BPF_JMP && BPF_OP(code) != BPF_EXIT && + BPF_OP(code) != BPF_CALL) { + struct nfp_insn_meta *dst_meta; + unsigned short dst_indx; + + dst_indx = meta->n + 1 + meta->insn.off; + dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_indx, + cnt); + + meta->jmp_dst = dst_meta; + } + } + return 0; } diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 8d43491ddd6b8..cca67730b91f4 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016 Netronome Systems, Inc. + * Copyright (C) 2016-2017 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -40,7 +40,7 @@ #include "main.h" -static struct nfp_insn_meta * +struct nfp_insn_meta * nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, unsigned int insn_idx, unsigned int n_insns) { From a09d5c52c42129adbac2d1e39bd0e49a92729e3e Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 30 Nov 2017 21:32:51 -0800 Subject: [PATCH 04/13] nfp: bpf: flag jump destination to guide insn combine optimizations NFP eBPF offload JIT engine is doing some instruction combine based optimizations which however must not be safe if the combined sequences are across basic block boarders. Currently, there are post checks during fixing jump destinations. If the jump destination is found to be eBPF insn that has been combined into another one, then JIT engine will raise error and abort. This is not optimal. The JIT engine ought to disable the optimization on such cross-bb-border sequences instead of abort. As there is no control flow information in eBPF infrastructure that we can't do basic block based optimizations, this patch extends the existing jump destination record pass to also flag the jump destination, then in instruction combine passes we could skip the optimizations if insns in the sequence are jump targets. Suggested-by: Jakub Kicinski Signed-off-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/main.h | 4 ++++ drivers/net/ethernet/netronome/nfp/bpf/offload.c | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index e488656f406cd..99da1d34dd0ee 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -89,6 +89,8 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); #define nfp_meta_next(meta) list_next_entry(meta, l) #define nfp_meta_prev(meta) list_prev_entry(meta, l) +#define FLAG_INSN_IS_JUMP_DST BIT(0) + /** * struct nfp_insn_meta - BPF instruction wrapper * @insn: BPF instruction @@ -97,6 +99,7 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); * @jmp_dst: destination info for jump instructions * @off: index of first generated machine instruction (in nfp_prog.prog) * @n: eBPF instruction number + * @flags: eBPF instruction extra optimization flags * @skip: skip this instruction (optimized out) * @double_cb: callback for second part of the instruction * @l: link on nfp_prog->insns list @@ -112,6 +115,7 @@ struct nfp_insn_meta { }; unsigned int off; unsigned short n; + unsigned short flags; bool skip; instr_cb_t double_cb; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 240db663d83fa..377976ce92dd0 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -83,6 +83,7 @@ nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog, cnt); meta->jmp_dst = dst_meta; + dst_meta->flags |= FLAG_INSN_IS_JUMP_DST; } } From 1266f5d6559e30e9afdebfff60d60ab86a8da77b Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 30 Nov 2017 21:32:52 -0800 Subject: [PATCH 05/13] nfp: bpf: don't do ld/mask combination if mask is jump destination If the mask insn in the ld/mask pair is jump destination, then don't do combination. Signed-off-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index f76659ecb6544..f2317b7642220 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -2142,6 +2142,9 @@ static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog) if (next.src_reg || next.dst_reg) continue; + if (meta2->flags & FLAG_INSN_IS_JUMP_DST) + continue; + meta2->skip = true; } } From 29fe46efba5c3e1ac0f857a03a29f6bf0d0c5592 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 30 Nov 2017 21:32:53 -0800 Subject: [PATCH 06/13] nfp: bpf: don't do ld/shifts combination if shifts are jump destination If any of the shift insns in the ld/shift sequence is jump destination, don't do combination. Signed-off-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index f2317b7642220..54915a3b8a7e9 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -2181,6 +2181,10 @@ static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog) if (next1.imm != 0x20 || next2.imm != 0x20) continue; + if (meta2->flags & FLAG_INSN_IS_JUMP_DST || + meta3->flags & FLAG_INSN_IS_JUMP_DST) + continue; + meta2->skip = true; meta3->skip = true; } From 08859f159eaf7159a7a680c6151073f4451b222b Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 30 Nov 2017 21:32:54 -0800 Subject: [PATCH 07/13] nfp: bpf: relax source operands check The NFP normally requires the source operands to be difference addressing modes, but we should rule out the very special NN_REG_NONE type. There are instruction that ignores both A/B operands, for example: local_csr_rd For these instructions, we might pass the same operand type, NN_REG_NONE, for both A/B operands. NOTE: in current NFP ISA, it is only possible for instructions with unrestricted operands to take none operands, but in case there is new and similar instructoin in restricted form, they would follow similar rules, so swreg_to_restricted is updated as well. Signed-off-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/nfp_asm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c b/drivers/net/ethernet/netronome/nfp/nfp_asm.c index 830f6de25f475..da277386077c0 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_asm.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c @@ -120,7 +120,8 @@ int swreg_to_unrestricted(swreg dst, swreg lreg, swreg rreg, reg->dst = nfp_swreg_to_unreg(dst, true); /* Decode source operands */ - if (swreg_type(lreg) == swreg_type(rreg)) + if (swreg_type(lreg) == swreg_type(rreg) && + swreg_type(lreg) != NN_REG_NONE) return -EFAULT; if (swreg_type(lreg) == NN_REG_GPR_B || @@ -200,7 +201,8 @@ int swreg_to_restricted(swreg dst, swreg lreg, swreg rreg, reg->dst = nfp_swreg_to_rereg(dst, true, false, NULL); /* Decode source operands */ - if (swreg_type(lreg) == swreg_type(rreg)) + if (swreg_type(lreg) == swreg_type(rreg) && + swreg_type(lreg) != NN_REG_NONE) return -EFAULT; if (swreg_type(lreg) == NN_REG_GPR_B || From 3239e7bb28a8a4a96c36beeaa6439666e1ed4f8b Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 30 Nov 2017 21:32:55 -0800 Subject: [PATCH 08/13] nfp: bpf: correct the encoding for No-Dest immed When immed is used with No-Dest, the emitter should use reg.dst instead of reg.areg for the destination, using the latter will actually encode register zero. Signed-off-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 54915a3b8a7e9..024b44089623d 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -224,9 +224,11 @@ emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm, return; } - __emit_immed(nfp_prog, reg.areg, reg.breg, imm >> 8, width, - invert, shift, reg.wr_both, - reg.dst_lmextn, reg.src_lmextn); + /* Use reg.dst when destination is No-Dest. */ + __emit_immed(nfp_prog, + swreg_type(dst) == NN_REG_NONE ? reg.dst : reg.areg, + reg.breg, imm >> 8, width, invert, shift, + reg.wr_both, reg.dst_lmextn, reg.src_lmextn); } static void From 5468a8b929e6276e139405d525c963a56890b5e0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 30 Nov 2017 21:32:56 -0800 Subject: [PATCH 09/13] nfp: bpf: encode indirect commands Add support for emitting commands with field overwrites. Signed-off-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 17 +++++++++++++---- drivers/net/ethernet/netronome/nfp/nfp_asm.h | 3 ++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 024b44089623d..da4e106d3b164 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -96,7 +96,7 @@ nfp_prog_offset_to_index(struct nfp_prog *nfp_prog, unsigned int offset) /* --- Emitters --- */ static void __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, - u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, bool sync) + u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, bool sync, bool indir) { enum cmd_ctx_swap ctx; u64 insn; @@ -114,14 +114,15 @@ __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, FIELD_PREP(OP_CMD_CNT, size) | FIELD_PREP(OP_CMD_SIG, sync) | FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) | + FIELD_PREP(OP_CMD_INDIR, indir) | FIELD_PREP(OP_CMD_MODE, mode); nfp_prog_push(nfp_prog, insn); } static void -emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, - u8 mode, u8 xfer, swreg lreg, swreg rreg, u8 size, bool sync) +emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer, + swreg lreg, swreg rreg, u8 size, bool sync, bool indir) { struct nfp_insn_re_regs reg; int err; @@ -142,7 +143,15 @@ emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, return; } - __emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, sync); + __emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, sync, + indir); +} + +static void +emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer, + swreg lreg, swreg rreg, u8 size, bool sync) +{ + emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, false); } static void diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h index 74d0c11ab2f90..6ff842a15e5db 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016 Netronome Systems, Inc. + * Copyright (C) 2016-2017 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -209,6 +209,7 @@ enum alu_dst_ab { #define OP_CMD_CNT 0x0000e000000ULL #define OP_CMD_SIG 0x000f0000000ULL #define OP_CMD_TGT_CMD 0x07f00000000ULL +#define OP_CMD_INDIR 0x20000000000ULL #define OP_CMD_MODE 0x1c0000000000ULL struct cmd_tgt_act { From 5e4d6d20939f8e77b7734c0cea6886dff60c99de Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 30 Nov 2017 21:32:57 -0800 Subject: [PATCH 10/13] nfp: bpf: factor out is_mbpf_load & is_mbpf_store It is usual that we need to check if one BPF insn is for loading/storeing data from/to memory. Therefore, it makes sense to factor out related code to become common helper functions. Signed-off-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/main.h | 10 ++++++++++ drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 99da1d34dd0ee..20ef0adb29318 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -144,6 +144,16 @@ static inline u8 mbpf_mode(const struct nfp_insn_meta *meta) return BPF_MODE(meta->insn.code); } +static inline bool is_mbpf_load(const struct nfp_insn_meta *meta) +{ + return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_LDX | BPF_MEM); +} + +static inline bool is_mbpf_store(const struct nfp_insn_meta *meta) +{ + return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_MEM); +} + /** * struct nfp_prog - nfp BPF program * @prog: machine code diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index cca67730b91f4..d2bf29c902262 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -180,10 +180,10 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) if (meta->insn.code == (BPF_JMP | BPF_EXIT)) return nfp_bpf_check_exit(nfp_prog, env); - if ((meta->insn.code & ~BPF_SIZE_MASK) == (BPF_LDX | BPF_MEM)) + if (is_mbpf_load(meta)) return nfp_bpf_check_ptr(nfp_prog, meta, env, meta->insn.src_reg); - if ((meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_MEM)) + if (is_mbpf_store(meta)) return nfp_bpf_check_ptr(nfp_prog, meta, env, meta->insn.dst_reg); From 9879a3814beb3b1350755475e67a8d92ba1f7e4b Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 30 Nov 2017 21:32:58 -0800 Subject: [PATCH 11/13] nfp: bpf: implement memory bulk copy for length within 32-bytes For NFP, we want to re-group a sequence of load/store pairs lowered from memcpy/memmove into single memory bulk operation which then could be accelerated using NFP CPP bus. This patch extends the existing load/store auxiliary information by adding two new fields: struct bpf_insn *paired_st; s16 ldst_gather_len; Both fields are supposed to be carried by the the load instruction at the head of the sequence. "paired_st" is the corresponding store instruction at the head and "ldst_gather_len" is the gathered length. If "ldst_gather_len" is negative, then the sequence is doing memory load/store in descending order, otherwise it is in ascending order. We need this information to detect overlapped memory access. This patch then optimize memory bulk copy when the copy length is within 32-bytes. The strategy of read/write used is: * Read. Use read32 (direct_ref), always. * Write. - length <= 8-bytes write8 (direct_ref). - length <= 32-bytes and is 4-byte aligned write32 (direct_ref). - length <= 32-bytes but is not 4-byte aligned write8 (indirect_ref). NOTE: the optimization should not change program semantics. The destination register of the last load instruction should contain the same value before and after this optimization. Signed-off-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 113 ++++++++++++++++++ drivers/net/ethernet/netronome/nfp/bpf/main.h | 4 + drivers/net/ethernet/netronome/nfp/nfp_asm.c | 1 + drivers/net/ethernet/netronome/nfp/nfp_asm.h | 4 + 4 files changed, 122 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index da4e106d3b164..138568c0eee6c 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -154,6 +154,13 @@ emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer, emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, false); } +static void +emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer, + swreg lreg, swreg rreg, u8 size, bool sync) +{ + emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, true); +} + static void __emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip, enum br_ctx_signal_state css, u16 addr, u8 defer) @@ -515,6 +522,109 @@ static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src) wrp_mov(nfp_prog, reg_both(dst), reg_b(src)); } +/* wrp_reg_subpart() - load @field_len bytes from @offset of @src, write the + * result to @dst from low end. + */ +static void +wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len, + u8 offset) +{ + enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE; + u8 mask = (1 << field_len) - 1; + + emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true); +} + +/* NFP has Command Push Pull bus which supports bluk memory operations. */ +static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + bool descending_seq = meta->ldst_gather_len < 0; + s16 len = abs(meta->ldst_gather_len); + swreg src_base, off; + unsigned int i; + u8 xfer_num; + + if (WARN_ON_ONCE(len > 32)) + return -EOPNOTSUPP; + + off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog)); + src_base = reg_a(meta->insn.src_reg * 2); + xfer_num = round_up(len, 4) / 4; + + /* Memory read from source addr into transfer-in registers. */ + emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base, off, + xfer_num - 1, true); + + /* Move from transfer-in to transfer-out. */ + for (i = 0; i < xfer_num; i++) + wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i)); + + off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog)); + + if (len <= 8) { + /* Use single direct_ref write8. */ + emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0, + reg_a(meta->paired_st->dst_reg * 2), off, len - 1, + true); + } else if (IS_ALIGNED(len, 4)) { + /* Use single direct_ref write32. */ + emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0, + reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1, + true); + } else { + /* Use single indirect_ref write8. */ + wrp_immed(nfp_prog, reg_none(), + CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1)); + emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0, + reg_a(meta->paired_st->dst_reg * 2), off, + len - 1, true); + } + + /* TODO: The following extra load is to make sure data flow be identical + * before and after we do memory copy optimization. + * + * The load destination register is not guaranteed to be dead, so we + * need to make sure it is loaded with the value the same as before + * this transformation. + * + * These extra loads could be removed once we have accurate register + * usage information. + */ + if (descending_seq) + xfer_num = 0; + else if (BPF_SIZE(meta->insn.code) != BPF_DW) + xfer_num = xfer_num - 1; + else + xfer_num = xfer_num - 2; + + switch (BPF_SIZE(meta->insn.code)) { + case BPF_B: + wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2), + reg_xfer(xfer_num), 1, + IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1); + break; + case BPF_H: + wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2), + reg_xfer(xfer_num), 2, (len & 3) ^ 2); + break; + case BPF_W: + wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2), + reg_xfer(0)); + break; + case BPF_DW: + wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2), + reg_xfer(xfer_num)); + wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), + reg_xfer(xfer_num + 1)); + break; + } + + if (BPF_SIZE(meta->insn.code) != BPF_DW) + wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0); + + return 0; +} + static int data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size) { @@ -1490,6 +1600,9 @@ static int mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, unsigned int size) { + if (meta->ldst_gather_len) + return nfp_cpp_memcpy(nfp_prog, meta); + if (meta->ptr.type == PTR_TO_CTX) { if (nfp_prog->type == BPF_PROG_TYPE_XDP) return mem_ldx_xdp(nfp_prog, meta, size); diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 20ef0adb29318..5884291ddba52 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -95,6 +95,8 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); * struct nfp_insn_meta - BPF instruction wrapper * @insn: BPF instruction * @ptr: pointer type for memory operations + * @ldst_gather_len: memcpy length gathered from load/store sequence + * @paired_st: the paired store insn at the head of the sequence * @ptr_not_const: pointer is not always constant * @jmp_dst: destination info for jump instructions * @off: index of first generated machine instruction (in nfp_prog.prog) @@ -109,6 +111,8 @@ struct nfp_insn_meta { union { struct { struct bpf_reg_state ptr; + struct bpf_insn *paired_st; + s16 ldst_gather_len; bool ptr_not_const; }; struct nfp_insn_meta *jmp_dst; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c b/drivers/net/ethernet/netronome/nfp/nfp_asm.c index da277386077c0..d3610987fb073 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_asm.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c @@ -41,6 +41,7 @@ const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = { [CMD_TGT_WRITE8_SWAP] = { 0x02, 0x42 }, + [CMD_TGT_WRITE32_SWAP] = { 0x02, 0x5f }, [CMD_TGT_READ8] = { 0x01, 0x43 }, [CMD_TGT_READ32] = { 0x00, 0x5c }, [CMD_TGT_READ32_LE] = { 0x01, 0x5c }, diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h index 6ff842a15e5db..98803f9f40b69 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h @@ -220,6 +220,7 @@ struct cmd_tgt_act { enum cmd_tgt_map { CMD_TGT_READ8, CMD_TGT_WRITE8_SWAP, + CMD_TGT_WRITE32_SWAP, CMD_TGT_READ32, CMD_TGT_READ32_LE, CMD_TGT_READ32_SWAP, @@ -241,6 +242,9 @@ enum cmd_ctx_swap { CMD_CTX_NO_SWAP = 3, }; +#define CMD_OVE_LEN BIT(7) +#define CMD_OV_LEN GENMASK(12, 8) + #define OP_LCSR_BASE 0x0fc00000000ULL #define OP_LCSR_A_SRC 0x000000003ffULL #define OP_LCSR_B_SRC 0x000000ffc00ULL From 8c90053858fce1ca60fab7be03bb61d314ea5c1c Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 30 Nov 2017 21:32:59 -0800 Subject: [PATCH 12/13] nfp: bpf: implement memory bulk copy for length bigger than 32-bytes When the gathered copy length is bigger than 32-bytes and within 128-bytes (the maximum length a single CPP Pull/Push request can finish), the strategy of read/write are changeed into: * Read. - use direct reference mode when length is within 32-bytes. - use indirect mode when length is bigger than 32-bytes. * Write. - length <= 8-bytes use write8 (direct_ref). - length <= 32-byte and 4-bytes aligned use write32 (direct_ref). - length <= 32-bytes but not 4-bytes aligned use write8 (indirect_ref). - length > 32-bytes and 4-bytes aligned use write32 (indirect_ref). - length > 32-bytes and not 4-bytes aligned and <= 40-bytes use write32 (direct_ref) to finish the first 32-bytes. use write8 (direct_ref) to finish all remaining hanging part. - length > 32-bytes and not 4-bytes aligned use write32 (indirect_ref) to finish those 4-byte aligned parts. use write8 (direct_ref) to finish all remaining hanging part. Signed-off-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 52 +++++++++++++++++--- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 138568c0eee6c..1b98ef239605d 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -544,16 +544,18 @@ static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) unsigned int i; u8 xfer_num; - if (WARN_ON_ONCE(len > 32)) - return -EOPNOTSUPP; - off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog)); src_base = reg_a(meta->insn.src_reg * 2); xfer_num = round_up(len, 4) / 4; + /* Setup PREV_ALU fields to override memory read length. */ + if (len > 32) + wrp_immed(nfp_prog, reg_none(), + CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1)); + /* Memory read from source addr into transfer-in registers. */ - emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base, off, - xfer_num - 1, true); + emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base, + off, xfer_num - 1, true, len > 32); /* Move from transfer-in to transfer-out. */ for (i = 0; i < xfer_num; i++) @@ -566,18 +568,54 @@ static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0, reg_a(meta->paired_st->dst_reg * 2), off, len - 1, true); - } else if (IS_ALIGNED(len, 4)) { + } else if (len <= 32 && IS_ALIGNED(len, 4)) { /* Use single direct_ref write32. */ emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0, reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1, true); - } else { + } else if (len <= 32) { /* Use single indirect_ref write8. */ wrp_immed(nfp_prog, reg_none(), CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1)); emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0, reg_a(meta->paired_st->dst_reg * 2), off, len - 1, true); + } else if (IS_ALIGNED(len, 4)) { + /* Use single indirect_ref write32. */ + wrp_immed(nfp_prog, reg_none(), + CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1)); + emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0, + reg_a(meta->paired_st->dst_reg * 2), off, + xfer_num - 1, true); + } else if (len <= 40) { + /* Use one direct_ref write32 to write the first 32-bytes, then + * another direct_ref write8 to write the remaining bytes. + */ + emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0, + reg_a(meta->paired_st->dst_reg * 2), off, 7, + true); + + off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32, + imm_b(nfp_prog)); + emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8, + reg_a(meta->paired_st->dst_reg * 2), off, len - 33, + true); + } else { + /* Use one indirect_ref write32 to write 4-bytes aligned length, + * then another direct_ref write8 to write the remaining bytes. + */ + u8 new_off; + + wrp_immed(nfp_prog, reg_none(), + CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 2)); + emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0, + reg_a(meta->paired_st->dst_reg * 2), off, + xfer_num - 2, true); + new_off = meta->paired_st->off + (xfer_num - 1) * 4; + off = re_load_imm_any(nfp_prog, new_off, imm_b(nfp_prog)); + emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, + xfer_num - 1, reg_a(meta->paired_st->dst_reg * 2), off, + (len & 0x3) - 1, true); } /* TODO: The following extra load is to make sure data flow be identical From 6bc7103c89bff2b53a159e03b74c8216c79bfef8 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Thu, 30 Nov 2017 21:33:00 -0800 Subject: [PATCH 13/13] nfp: bpf: detect load/store sequences lowered from memory copy This patch add the optimization frontend, but adding a new eBPF IR scan pass "nfp_bpf_opt_ldst_gather". The pass will traverse the IR to recognize the load/store pairs sequences that come from lowering of memory copy builtins. The gathered memory copy information will be kept in the meta info structure of the first load instruction in the sequence and will be consumed by the optimization backend added in the previous patches. NOTE: a sequence with cross memory access doesn't qualify this optimization, i.e. if one load in the sequence will load from place that has been written by previous store. This is because when we turn the sequence into single CPP operation, we are reading all contents at once into NFP transfer registers, then write them out as a whole. This is not identical with what the original load/store sequence is doing. Detecting cross memory access for two random pointers will be difficult, fortunately under XDP/eBPF's restrictied runtime environment, the copy normally happen among map, packet data and stack, they do not overlap with each other. And for cases supported by NFP, cross memory access will only happen on PTR_TO_PACKET. Fortunately for this, there is ID information that we could do accurate memory alias check. Signed-off-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 237 +++++++++++++++++++ 1 file changed, 237 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 1b98ef239605d..3419ad495962f 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -2352,12 +2352,249 @@ static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog) } } +/* load/store pair that forms memory copy sould look like the following: + * + * ld_width R, [addr_src + offset_src] + * st_width [addr_dest + offset_dest], R + * + * The destination register of load and source register of store should + * be the same, load and store should also perform at the same width. + * If either of addr_src or addr_dest is stack pointer, we don't do the + * CPP optimization as stack is modelled by registers on NFP. + */ +static bool +curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta, + struct nfp_insn_meta *st_meta) +{ + struct bpf_insn *ld = &ld_meta->insn; + struct bpf_insn *st = &st_meta->insn; + + if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta)) + return false; + + if (ld_meta->ptr.type != PTR_TO_PACKET) + return false; + + if (st_meta->ptr.type != PTR_TO_PACKET) + return false; + + if (BPF_SIZE(ld->code) != BPF_SIZE(st->code)) + return false; + + if (ld->dst_reg != st->src_reg) + return false; + + /* There is jump to the store insn in this pair. */ + if (st_meta->flags & FLAG_INSN_IS_JUMP_DST) + return false; + + return true; +} + +/* Currently, we only support chaining load/store pairs if: + * + * - Their address base registers are the same. + * - Their address offsets are in the same order. + * - They operate at the same memory width. + * - There is no jump into the middle of them. + */ +static bool +curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta, + struct nfp_insn_meta *st_meta, + struct bpf_insn *prev_ld, + struct bpf_insn *prev_st) +{ + u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst; + struct bpf_insn *ld = &ld_meta->insn; + struct bpf_insn *st = &st_meta->insn; + s16 prev_ld_off, prev_st_off; + + /* This pair is the start pair. */ + if (!prev_ld) + return true; + + prev_size = BPF_LDST_BYTES(prev_ld); + curr_size = BPF_LDST_BYTES(ld); + prev_ld_base = prev_ld->src_reg; + prev_st_base = prev_st->dst_reg; + prev_ld_dst = prev_ld->dst_reg; + prev_ld_off = prev_ld->off; + prev_st_off = prev_st->off; + + if (ld->dst_reg != prev_ld_dst) + return false; + + if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base) + return false; + + if (curr_size != prev_size) + return false; + + /* There is jump to the head of this pair. */ + if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST) + return false; + + /* Both in ascending order. */ + if (prev_ld_off + prev_size == ld->off && + prev_st_off + prev_size == st->off) + return true; + + /* Both in descending order. */ + if (ld->off + curr_size == prev_ld_off && + st->off + curr_size == prev_st_off) + return true; + + return false; +} + +/* Return TRUE if cross memory access happens. Cross memory access means + * store area is overlapping with load area that a later load might load + * the value from previous store, for this case we can't treat the sequence + * as an memory copy. + */ +static bool +cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta, + struct nfp_insn_meta *head_st_meta) +{ + s16 head_ld_off, head_st_off, ld_off; + + /* Different pointer types does not overlap. */ + if (head_ld_meta->ptr.type != head_st_meta->ptr.type) + return false; + + /* load and store are both PTR_TO_PACKET, check ID info. */ + if (head_ld_meta->ptr.id != head_st_meta->ptr.id) + return true; + + /* Canonicalize the offsets. Turn all of them against the original + * base register. + */ + head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.off; + head_st_off = head_st_meta->insn.off + head_st_meta->ptr.off; + ld_off = ld->off + head_ld_meta->ptr.off; + + /* Ascending order cross. */ + if (ld_off > head_ld_off && + head_ld_off < head_st_off && ld_off >= head_st_off) + return true; + + /* Descending order cross. */ + if (ld_off < head_ld_off && + head_ld_off > head_st_off && ld_off <= head_st_off) + return true; + + return false; +} + +/* This pass try to identify the following instructoin sequences. + * + * load R, [regA + offA] + * store [regB + offB], R + * load R, [regA + offA + const_imm_A] + * store [regB + offB + const_imm_A], R + * load R, [regA + offA + 2 * const_imm_A] + * store [regB + offB + 2 * const_imm_A], R + * ... + * + * Above sequence is typically generated by compiler when lowering + * memcpy. NFP prefer using CPP instructions to accelerate it. + */ +static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog) +{ + struct nfp_insn_meta *head_ld_meta = NULL; + struct nfp_insn_meta *head_st_meta = NULL; + struct nfp_insn_meta *meta1, *meta2; + struct bpf_insn *prev_ld = NULL; + struct bpf_insn *prev_st = NULL; + u8 count = 0; + + nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) { + struct bpf_insn *ld = &meta1->insn; + struct bpf_insn *st = &meta2->insn; + + /* Reset record status if any of the following if true: + * - The current insn pair is not load/store. + * - The load/store pair doesn't chain with previous one. + * - The chained load/store pair crossed with previous pair. + * - The chained load/store pair has a total size of memory + * copy beyond 128 bytes which is the maximum length a + * single NFP CPP command can transfer. + */ + if (!curr_pair_is_memcpy(meta1, meta2) || + !curr_pair_chain_with_previous(meta1, meta2, prev_ld, + prev_st) || + (head_ld_meta && (cross_mem_access(ld, head_ld_meta, + head_st_meta) || + head_ld_meta->ldst_gather_len >= 128))) { + if (!count) + continue; + + if (count > 1) { + s16 prev_ld_off = prev_ld->off; + s16 prev_st_off = prev_st->off; + s16 head_ld_off = head_ld_meta->insn.off; + + if (prev_ld_off < head_ld_off) { + head_ld_meta->insn.off = prev_ld_off; + head_st_meta->insn.off = prev_st_off; + head_ld_meta->ldst_gather_len = + -head_ld_meta->ldst_gather_len; + } + + head_ld_meta->paired_st = &head_st_meta->insn; + head_st_meta->skip = true; + } else { + head_ld_meta->ldst_gather_len = 0; + } + + /* If the chain is ended by an load/store pair then this + * could serve as the new head of the the next chain. + */ + if (curr_pair_is_memcpy(meta1, meta2)) { + head_ld_meta = meta1; + head_st_meta = meta2; + head_ld_meta->ldst_gather_len = + BPF_LDST_BYTES(ld); + meta1 = nfp_meta_next(meta1); + meta2 = nfp_meta_next(meta2); + prev_ld = ld; + prev_st = st; + count = 1; + } else { + head_ld_meta = NULL; + head_st_meta = NULL; + prev_ld = NULL; + prev_st = NULL; + count = 0; + } + + continue; + } + + if (!head_ld_meta) { + head_ld_meta = meta1; + head_st_meta = meta2; + } else { + meta1->skip = true; + meta2->skip = true; + } + + head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld); + meta1 = nfp_meta_next(meta1); + meta2 = nfp_meta_next(meta2); + prev_ld = ld; + prev_st = st; + count++; + } +} + static int nfp_bpf_optimize(struct nfp_prog *nfp_prog) { nfp_bpf_opt_reg_init(nfp_prog); nfp_bpf_opt_ld_mask(nfp_prog); nfp_bpf_opt_ld_shift(nfp_prog); + nfp_bpf_opt_ldst_gather(nfp_prog); return 0; }