From a646c9b2dabae19138926b888c979e43fb85362b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 30 Nov 2017 21:32:48 -0800
Subject: [PATCH 01/13] nfp: fix old kdoc issues

Since commit 3a025e1d1c2e ("Add optional check for bad kernel-doc
comments"), builds with W=1 complain about kdoc errors.
Fix the kdoc issues we have.  kdoc is still confused by defines in
nfp_net_ctrl.h, but those are not really errors.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h             | 2 ++
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c | 9 +++------
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 7f9857c276b16..3801c52098d57 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -548,6 +548,8 @@ struct nfp_net_dp {
  * @max_r_vecs:		Number of allocated interrupt vectors for RX/TX
  * @max_tx_rings:       Maximum number of TX rings supported by the Firmware
  * @max_rx_rings:       Maximum number of RX rings supported by the Firmware
+ * @stride_rx:		Queue controller RX queue spacing
+ * @stride_tx:		Queue controller TX queue spacing
  * @r_vecs:             Pre-allocated array of ring vectors
  * @irq_entries:        Pre-allocated array of MSI-X entries
  * @lsc_handler:        Handler for Link State Change interrupt
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
index 04dd5758ecf54..3fcb522d2e852 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
@@ -372,8 +372,7 @@ nfp_cpp_area_alloc(struct nfp_cpp *cpp, u32 dest,
  * that it can be accessed directly.
  *
  * NOTE: @address and @size must be 32-bit aligned values.
- *
- * NOTE: The area must also be 'released' when the structure is freed.
+ * The area must also be 'released' when the structure is freed.
  *
  * Return: NFP CPP Area handle, or NULL
  */
@@ -536,8 +535,7 @@ void nfp_cpp_area_release_free(struct nfp_cpp_area *area)
  * Read data from indicated CPP region.
  *
  * NOTE: @offset and @length must be 32-bit aligned values.
- *
- * NOTE: Area must have been locked down with an 'acquire'.
+ * Area must have been locked down with an 'acquire'.
  *
  * Return: length of io, or -ERRNO
  */
@@ -558,8 +556,7 @@ int nfp_cpp_area_read(struct nfp_cpp_area *area,
  * Write data to indicated CPP region.
  *
  * NOTE: @offset and @length must be 32-bit aligned values.
- *
- * NOTE: Area must have been locked down with an 'acquire'.
+ * Area must have been locked down with an 'acquire'.
  *
  * Return: length of io, or -ERRNO
  */

From 854dc87d1a7be1f11b2d079a80a822742d6e560e Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Thu, 30 Nov 2017 21:32:49 -0800
Subject: [PATCH 02/13] nfp: bpf: support backward jump

This patch adds support for backward jump on NFP.

  - restrictions on backward jump in various functions have been removed.
  - nfp_fixup_branches now supports backward jump.

There is one thing to note, currently an input eBPF JMP insn may generate
several NFP insns, for example,

  NFP imm move insn A \
  NFP compare insn  B  --> 3 NFP insn jited from eBPF JMP insn M
  NFP branch insn   C /
  ---
  NFP insn X           --> 1 NFP insn jited from eBPF insn N
  ---
  ...

therefore, we do a sanity check to make sure the last jited insn from
an eBPF JMP is an NFP branch instruction.

Once backward jumps are allowed, it is possible for an eBPF JMP insn to be
at the end of the program. This, however, causes trouble for the sanity
check: it requires the end index of the NFP insns jited from one eBPF insn,
but before this patch only the start index is recorded, so the end index
can only be obtained by:

  start_index_of_the_next_eBPF_insn - 1

or for the above example:

  start_index_of_eBPF_insn_N (which is the index of NFP insn X) - 1

nfp_fixup_branches was using nfp_for_each_insn_walk2 to expose the *next*
insn to each iteration during the traversal so the last index could be
calculated from it. Now, it needs some extra code to handle the last
insn. Meanwhile, the use of walk2 is actually unnecessary: we can simply
use a generic single instruction walk, and the next insn can easily be
obtained using list_next_entry.

So, this patch migrates the jump fixup traversal method to
*list_for_each_entry*, this simplifies the code logic a little bit.

The other thing to note is that a new state variable "last_bpf_off" is
introduced to track the index of the last jited NFP insn. This is necessary
because NFP generates special-purpose epilogue sequences, so the index
of the last jited NFP insn is *not* always nfp_prog->prog_len - 1.

Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 66 +++++++++++--------
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  4 +-
 2 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 995e95410b118..20daf6b95601b 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016 Netronome Systems, Inc.
+ * Copyright (C) 2016-2017 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -975,9 +975,6 @@ wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 {
 	const struct bpf_insn *insn = &meta->insn;
 
-	if (insn->off < 0) /* TODO */
-		return -EOPNOTSUPP;
-
 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op,
 			 insn->src_reg * 2, br_mask, insn->off);
 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2 + 1, alu_op,
@@ -995,9 +992,6 @@ wrp_cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	u8 reg = insn->dst_reg * 2;
 	swreg tmp_reg;
 
-	if (insn->off < 0) /* TODO */
-		return -EOPNOTSUPP;
-
 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
 	if (!swap)
 		emit_alu(nfp_prog, reg_none(), reg_a(reg), ALU_OP_SUB, tmp_reg);
@@ -1027,9 +1021,6 @@ wrp_cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	areg = insn->dst_reg * 2;
 	breg = insn->src_reg * 2;
 
-	if (insn->off < 0) /* TODO */
-		return -EOPNOTSUPP;
-
 	if (swap) {
 		areg ^= breg;
 		breg ^= areg;
@@ -1630,8 +1621,6 @@ static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 
 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
-	if (meta->insn.off < 0) /* TODO */
-		return -EOPNOTSUPP;
 	emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);
 
 	return 0;
@@ -1646,9 +1635,6 @@ static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	or1 = reg_a(insn->dst_reg * 2);
 	or2 = reg_b(insn->dst_reg * 2 + 1);
 
-	if (insn->off < 0) /* TODO */
-		return -EOPNOTSUPP;
-
 	if (imm & ~0U) {
 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
 		emit_alu(nfp_prog, imm_a(nfp_prog),
@@ -1695,9 +1681,6 @@ static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	u64 imm = insn->imm; /* sign extend */
 	swreg tmp_reg;
 
-	if (insn->off < 0) /* TODO */
-		return -EOPNOTSUPP;
-
 	if (!imm) {
 		meta->skip = true;
 		return 0;
@@ -1726,9 +1709,6 @@ static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	u64 imm = insn->imm; /* sign extend */
 	swreg tmp_reg;
 
-	if (insn->off < 0) /* TODO */
-		return -EOPNOTSUPP;
-
 	if (!imm) {
 		emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2),
 			 ALU_OP_OR, reg_b(insn->dst_reg * 2 + 1));
@@ -1753,9 +1733,6 @@ static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
 
-	if (insn->off < 0) /* TODO */
-		return -EOPNOTSUPP;
-
 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(insn->dst_reg * 2),
 		 ALU_OP_XOR, reg_b(insn->src_reg * 2));
 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_a(insn->dst_reg * 2 + 1),
@@ -1888,16 +1865,25 @@ static void br_set_offset(u64 *instr, u16 offset)
 static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 {
 	struct nfp_insn_meta *meta, *next;
-	u32 off, br_idx;
-	u32 idx;
+	u32 idx, br_idx;
+	int off;
 
-	nfp_for_each_insn_walk2(nfp_prog, meta, next) {
+	list_for_each_entry(meta, &nfp_prog->insns, l) {
 		if (meta->skip)
 			continue;
 		if (BPF_CLASS(meta->insn.code) != BPF_JMP)
 			continue;
 
-		br_idx = nfp_prog_offset_to_index(nfp_prog, next->off) - 1;
+		if (list_is_last(&meta->l, &nfp_prog->insns)) {
+			next = NULL;
+			idx = nfp_prog->last_bpf_off;
+		} else {
+			next = list_next_entry(meta, l);
+			idx = next->off - 1;
+		}
+
+		br_idx = nfp_prog_offset_to_index(nfp_prog, idx);
+
 		if (!nfp_is_br(nfp_prog->prog[br_idx])) {
 			pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n",
 			       br_idx, meta->insn.code, nfp_prog->prog[br_idx]);
@@ -1914,10 +1900,30 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 			return -ELOOP;
 		}
 
-		while (off && nfp_meta_has_next(nfp_prog, next)) {
+		if (!next) {
+			/* When "next" is NULL, "meta" is the last node in the
+			 * list. Given it is an JMP, it then must be a backward
+			 * jump.
+			 *
+			 * For eBPF, the jump offset is against pc + 1, so we
+			 * need to compensate the offset by 1 as we are pointing
+			 * "next" to the current node "meta".
+			 */
+			if (WARN_ON_ONCE(off > -2))
+				return -ELOOP;
+
+			next = meta;
+			off += 1;
+		}
+
+		while (off > 0 && nfp_meta_has_next(nfp_prog, next)) {
 			next = nfp_meta_next(next);
 			off--;
 		}
+		while (off < 0 && nfp_meta_has_prev(nfp_prog, next)) {
+			next = nfp_meta_prev(next);
+			off++;
+		}
 		if (off) {
 			pr_err("Fixup found too large jump!! %d\n", off);
 			return -ELOOP;
@@ -2105,6 +2111,8 @@ static int nfp_translate(struct nfp_prog *nfp_prog)
 		nfp_prog->n_translated++;
 	}
 
+	nfp_prog->last_bpf_off = nfp_prog_current_offset(nfp_prog) - 1;
+
 	nfp_outro(nfp_prog);
 	if (nfp_prog->error)
 		return nfp_prog->error;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 082a15f6dfb5b..0f4d218fc77ae 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016 Netronome Systems, Inc.
+ * Copyright (C) 2016-2017 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -142,6 +142,7 @@ static inline u8 mbpf_mode(const struct nfp_insn_meta *meta)
  * @verifier_meta: temporary storage for verifier's insn meta
  * @type: BPF program type
  * @start_off: address of the first instruction in the memory
+ * @last_bpf_off: address of the last instruction translated from BPF
  * @tgt_out: jump target for normal exit
  * @tgt_abort: jump target for abort (e.g. access outside of packet buffer)
  * @tgt_done: jump target to get the next packet
@@ -160,6 +161,7 @@ struct nfp_prog {
 	enum bpf_prog_type type;
 
 	unsigned int start_off;
+	unsigned int last_bpf_off;
 	unsigned int tgt_out;
 	unsigned int tgt_abort;
 	unsigned int tgt_done;

From 5b674140addc3c863efa227946ad7328f016a7a3 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Thu, 30 Nov 2017 21:32:50 -0800
Subject: [PATCH 03/13] nfp: bpf: record jump destination to simplify jump
 fixup

eBPF insns are internally organized as a doubly-linked list inside the NFP
offload JIT. Random access to an insn needs to be done by either forward or
backward traversal along the list.

One place we need to do such traversal is at nfp_fixup_branches where one
traversal is needed for each jump insn to find the destination. Such
traversals could be avoided if jump destinations are collected through a
single traversal in a pre-scan pass, and such information could also be
useful in other places where jump destination info is needed.

This patch adds such jump destination collection in nfp_prog_prepare.

Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 57 +++----------------
 drivers/net/ethernet/netronome/nfp/bpf/main.h | 13 ++++-
 .../net/ethernet/netronome/nfp/bpf/offload.c  | 22 ++++++-
 .../net/ethernet/netronome/nfp/bpf/verifier.c |  4 +-
 4 files changed, 41 insertions(+), 55 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 20daf6b95601b..f76659ecb6544 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -65,12 +65,6 @@
 	     next = nfp_meta_next(pos),				\
 	     next2 = nfp_meta_next(next))
 
-static bool
-nfp_meta_has_next(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return meta->l.next != &nfp_prog->insns;
-}
-
 static bool
 nfp_meta_has_prev(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
@@ -1864,9 +1858,8 @@ static void br_set_offset(u64 *instr, u16 offset)
 /* --- Assembler logic --- */
 static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 {
-	struct nfp_insn_meta *meta, *next;
+	struct nfp_insn_meta *meta, *jmp_dst;
 	u32 idx, br_idx;
-	int off;
 
 	list_for_each_entry(meta, &nfp_prog->insns, l) {
 		if (meta->skip)
@@ -1874,13 +1867,10 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 		if (BPF_CLASS(meta->insn.code) != BPF_JMP)
 			continue;
 
-		if (list_is_last(&meta->l, &nfp_prog->insns)) {
-			next = NULL;
+		if (list_is_last(&meta->l, &nfp_prog->insns))
 			idx = nfp_prog->last_bpf_off;
-		} else {
-			next = list_next_entry(meta, l);
-			idx = next->off - 1;
-		}
+		else
+			idx = list_next_entry(meta, l)->off - 1;
 
 		br_idx = nfp_prog_offset_to_index(nfp_prog, idx);
 
@@ -1893,43 +1883,14 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 		if (FIELD_GET(OP_BR_SPECIAL, nfp_prog->prog[br_idx]))
 			continue;
 
-		/* Find the target offset in assembler realm */
-		off = meta->insn.off;
-		if (!off) {
-			pr_err("Fixup found zero offset!!\n");
+		if (!meta->jmp_dst) {
+			pr_err("Non-exit jump doesn't have destination info recorded!!\n");
 			return -ELOOP;
 		}
 
-		if (!next) {
-			/* When "next" is NULL, "meta" is the last node in the
-			 * list. Given it is an JMP, it then must be a backward
-			 * jump.
-			 *
-			 * For eBPF, the jump offset is against pc + 1, so we
-			 * need to compensate the offset by 1 as we are pointing
-			 * "next" to the current node "meta".
-			 */
-			if (WARN_ON_ONCE(off > -2))
-				return -ELOOP;
-
-			next = meta;
-			off += 1;
-		}
-
-		while (off > 0 && nfp_meta_has_next(nfp_prog, next)) {
-			next = nfp_meta_next(next);
-			off--;
-		}
-		while (off < 0 && nfp_meta_has_prev(nfp_prog, next)) {
-			next = nfp_meta_prev(next);
-			off++;
-		}
-		if (off) {
-			pr_err("Fixup found too large jump!! %d\n", off);
-			return -ELOOP;
-		}
+		jmp_dst = meta->jmp_dst;
 
-		if (next->skip) {
+		if (jmp_dst->skip) {
 			pr_err("Branch landing on removed instruction!!\n");
 			return -ELOOP;
 		}
@@ -1938,7 +1899,7 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 		     idx <= br_idx; idx++) {
 			if (!nfp_is_br(nfp_prog->prog[idx]))
 				continue;
-			br_set_offset(&nfp_prog->prog[idx], next->off);
+			br_set_offset(&nfp_prog->prog[idx], jmp_dst->off);
 		}
 	}
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 0f4d218fc77ae..e488656f406cd 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -94,6 +94,7 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
  * @insn: BPF instruction
  * @ptr: pointer type for memory operations
  * @ptr_not_const: pointer is not always constant
+ * @jmp_dst: destination info for jump instructions
  * @off: index of first generated machine instruction (in nfp_prog.prog)
  * @n: eBPF instruction number
  * @skip: skip this instruction (optimized out)
@@ -102,8 +103,13 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
  */
 struct nfp_insn_meta {
 	struct bpf_insn insn;
-	struct bpf_reg_state ptr;
-	bool ptr_not_const;
+	union {
+		struct {
+			struct bpf_reg_state ptr;
+			bool ptr_not_const;
+		};
+		struct nfp_insn_meta *jmp_dst;
+	};
 	unsigned int off;
 	unsigned short n;
 	bool skip;
@@ -191,4 +197,7 @@ int nfp_bpf_translate(struct nfp_app *app, struct nfp_net *nn,
 		      struct bpf_prog *prog);
 int nfp_bpf_destroy(struct nfp_app *app, struct nfp_net *nn,
 		    struct bpf_prog *prog);
+struct nfp_insn_meta *
+nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
+		  unsigned int insn_idx, unsigned int n_insns);
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index bc879aeb62d4e..240db663d83fa 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016 Netronome Systems, Inc.
+ * Copyright (C) 2016-2017 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -55,11 +55,10 @@ static int
 nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog,
 		 unsigned int cnt)
 {
+	struct nfp_insn_meta *meta;
 	unsigned int i;
 
 	for (i = 0; i < cnt; i++) {
-		struct nfp_insn_meta *meta;
-
 		meta = kzalloc(sizeof(*meta), GFP_KERNEL);
 		if (!meta)
 			return -ENOMEM;
@@ -70,6 +69,23 @@ nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog,
 		list_add_tail(&meta->l, &nfp_prog->insns);
 	}
 
+	/* Another pass to record jump information. */
+	list_for_each_entry(meta, &nfp_prog->insns, l) {
+		u64 code = meta->insn.code;
+
+		if (BPF_CLASS(code) == BPF_JMP && BPF_OP(code) != BPF_EXIT &&
+		    BPF_OP(code) != BPF_CALL) {
+			struct nfp_insn_meta *dst_meta;
+			unsigned short dst_indx;
+
+			dst_indx = meta->n + 1 + meta->insn.off;
+			dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_indx,
+						     cnt);
+
+			meta->jmp_dst = dst_meta;
+		}
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 8d43491ddd6b8..cca67730b91f4 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016 Netronome Systems, Inc.
+ * Copyright (C) 2016-2017 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -40,7 +40,7 @@
 
 #include "main.h"
 
-static struct nfp_insn_meta *
+struct nfp_insn_meta *
 nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 		  unsigned int insn_idx, unsigned int n_insns)
 {

From a09d5c52c42129adbac2d1e39bd0e49a92729e3e Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Thu, 30 Nov 2017 21:32:51 -0800
Subject: [PATCH 04/13] nfp: bpf: flag jump destination to guide insn combine
 optimizations

The NFP eBPF offload JIT engine does some instruction combine based
optimizations which, however, may not be safe if the combined sequences
cross basic block borders.

Currently, there are post checks during fixing jump destinations. If the
jump destination is found to be an eBPF insn that has been combined into
another one, then the JIT engine will raise an error and abort.

This is not optimal. The JIT engine ought to disable the optimization on
such cross-bb-border sequences instead of aborting.

As there is no control flow information in the eBPF infrastructure, we
can't do basic block based optimizations. Instead, this patch extends the
existing jump destination record pass to also flag the jump destination, so
that instruction combine passes can skip the optimization if insns in the
sequence are jump targets.

Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.h    | 4 ++++
 drivers/net/ethernet/netronome/nfp/bpf/offload.c | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index e488656f406cd..99da1d34dd0ee 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -89,6 +89,8 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
 #define nfp_meta_next(meta)	list_next_entry(meta, l)
 #define nfp_meta_prev(meta)	list_prev_entry(meta, l)
 
+#define FLAG_INSN_IS_JUMP_DST	BIT(0)
+
 /**
  * struct nfp_insn_meta - BPF instruction wrapper
  * @insn: BPF instruction
@@ -97,6 +99,7 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
  * @jmp_dst: destination info for jump instructions
  * @off: index of first generated machine instruction (in nfp_prog.prog)
  * @n: eBPF instruction number
+ * @flags: eBPF instruction extra optimization flags
  * @skip: skip this instruction (optimized out)
  * @double_cb: callback for second part of the instruction
  * @l: link on nfp_prog->insns list
@@ -112,6 +115,7 @@ struct nfp_insn_meta {
 	};
 	unsigned int off;
 	unsigned short n;
+	unsigned short flags;
 	bool skip;
 	instr_cb_t double_cb;
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 240db663d83fa..377976ce92dd0 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -83,6 +83,7 @@ nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog,
 						     cnt);
 
 			meta->jmp_dst = dst_meta;
+			dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
 		}
 	}
 

From 1266f5d6559e30e9afdebfff60d60ab86a8da77b Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Thu, 30 Nov 2017 21:32:52 -0800
Subject: [PATCH 05/13] nfp: bpf: don't do ld/mask combination if mask is jump
 destination

If the mask insn in the ld/mask pair is a jump destination, then don't do
the combination.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index f76659ecb6544..f2317b7642220 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -2142,6 +2142,9 @@ static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
 		if (next.src_reg || next.dst_reg)
 			continue;
 
+		if (meta2->flags & FLAG_INSN_IS_JUMP_DST)
+			continue;
+
 		meta2->skip = true;
 	}
 }

From 29fe46efba5c3e1ac0f857a03a29f6bf0d0c5592 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Thu, 30 Nov 2017 21:32:53 -0800
Subject: [PATCH 06/13] nfp: bpf: don't do ld/shifts combination if shifts are
 jump destination

If any of the shift insns in the ld/shift sequence is a jump destination,
don't do the combination.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index f2317b7642220..54915a3b8a7e9 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -2181,6 +2181,10 @@ static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
 		if (next1.imm != 0x20 || next2.imm != 0x20)
 			continue;
 
+		if (meta2->flags & FLAG_INSN_IS_JUMP_DST ||
+		    meta3->flags & FLAG_INSN_IS_JUMP_DST)
+			continue;
+
 		meta2->skip = true;
 		meta3->skip = true;
 	}

From 08859f159eaf7159a7a680c6151073f4451b222b Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Thu, 30 Nov 2017 21:32:54 -0800
Subject: [PATCH 07/13] nfp: bpf: relax source operands check

The NFP normally requires the source operands to be of different addressing
modes, but we should exclude the very special NN_REG_NONE type from this
check.

There are instructions that ignore both A/B operands, for example:

  local_csr_rd

For these instructions, we might pass the same operand type, NN_REG_NONE,
for both A/B operands.

NOTE: in the current NFP ISA, it is only possible for instructions with
unrestricted operands to take none operands, but in case there is a new and
similar instruction in restricted form, it would follow similar rules,
so swreg_to_restricted is updated as well.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_asm.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
index 830f6de25f475..da277386077c0 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
@@ -120,7 +120,8 @@ int swreg_to_unrestricted(swreg dst, swreg lreg, swreg rreg,
 	reg->dst = nfp_swreg_to_unreg(dst, true);
 
 	/* Decode source operands */
-	if (swreg_type(lreg) == swreg_type(rreg))
+	if (swreg_type(lreg) == swreg_type(rreg) &&
+	    swreg_type(lreg) != NN_REG_NONE)
 		return -EFAULT;
 
 	if (swreg_type(lreg) == NN_REG_GPR_B ||
@@ -200,7 +201,8 @@ int swreg_to_restricted(swreg dst, swreg lreg, swreg rreg,
 	reg->dst = nfp_swreg_to_rereg(dst, true, false, NULL);
 
 	/* Decode source operands */
-	if (swreg_type(lreg) == swreg_type(rreg))
+	if (swreg_type(lreg) == swreg_type(rreg) &&
+	    swreg_type(lreg) != NN_REG_NONE)
 		return -EFAULT;
 
 	if (swreg_type(lreg) == NN_REG_GPR_B ||

From 3239e7bb28a8a4a96c36beeaa6439666e1ed4f8b Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Thu, 30 Nov 2017 21:32:55 -0800
Subject: [PATCH 08/13] nfp: bpf: correct the encoding for No-Dest immed

When immed is used with No-Dest, the emitter should use reg.dst instead of
reg.areg for the destination, using the latter will actually encode
register zero.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 54915a3b8a7e9..024b44089623d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -224,9 +224,11 @@ emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
 		return;
 	}
 
-	__emit_immed(nfp_prog, reg.areg, reg.breg, imm >> 8, width,
-		     invert, shift, reg.wr_both,
-		     reg.dst_lmextn, reg.src_lmextn);
+	/* Use reg.dst when destination is No-Dest. */
+	__emit_immed(nfp_prog,
+		     swreg_type(dst) == NN_REG_NONE ? reg.dst : reg.areg,
+		     reg.breg, imm >> 8, width, invert, shift,
+		     reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
 }
 
 static void

From 5468a8b929e6276e139405d525c963a56890b5e0 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 30 Nov 2017 21:32:56 -0800
Subject: [PATCH 09/13] nfp: bpf: encode indirect commands

Add support for emitting commands with field overwrites.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 17 +++++++++++++----
 drivers/net/ethernet/netronome/nfp/nfp_asm.h |  3 ++-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 024b44089623d..da4e106d3b164 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -96,7 +96,7 @@ nfp_prog_offset_to_index(struct nfp_prog *nfp_prog, unsigned int offset)
 /* --- Emitters --- */
 static void
 __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
-	   u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, bool sync)
+	   u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, bool sync, bool indir)
 {
 	enum cmd_ctx_swap ctx;
 	u64 insn;
@@ -114,14 +114,15 @@ __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
 		FIELD_PREP(OP_CMD_CNT, size) |
 		FIELD_PREP(OP_CMD_SIG, sync) |
 		FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) |
+		FIELD_PREP(OP_CMD_INDIR, indir) |
 		FIELD_PREP(OP_CMD_MODE, mode);
 
 	nfp_prog_push(nfp_prog, insn);
 }
 
 static void
-emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
-	 u8 mode, u8 xfer, swreg lreg, swreg rreg, u8 size, bool sync)
+emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
+	     swreg lreg, swreg rreg, u8 size, bool sync, bool indir)
 {
 	struct nfp_insn_re_regs reg;
 	int err;
@@ -142,7 +143,15 @@ emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
 		return;
 	}
 
-	__emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, sync);
+	__emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, sync,
+		   indir);
+}
+
+static void
+emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
+	 swreg lreg, swreg rreg, u8 size, bool sync)
+{
+	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, false);
 }
 
 static void
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
index 74d0c11ab2f90..6ff842a15e5db 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016 Netronome Systems, Inc.
+ * Copyright (C) 2016-2017 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -209,6 +209,7 @@ enum alu_dst_ab {
 #define OP_CMD_CNT		0x0000e000000ULL
 #define OP_CMD_SIG		0x000f0000000ULL
 #define OP_CMD_TGT_CMD		0x07f00000000ULL
+#define OP_CMD_INDIR		0x20000000000ULL
 #define OP_CMD_MODE	       0x1c0000000000ULL
 
 struct cmd_tgt_act {

From 5e4d6d20939f8e77b7734c0cea6886dff60c99de Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Thu, 30 Nov 2017 21:32:57 -0800
Subject: [PATCH 10/13] nfp: bpf: factor out is_mbpf_load & is_mbpf_store

It is common that we need to check whether a BPF insn loads/stores
data from/to memory.

Therefore, it makes sense to factor out related code to become common
helper functions.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.h     | 10 ++++++++++
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c |  4 ++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 99da1d34dd0ee..20ef0adb29318 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -144,6 +144,16 @@ static inline u8 mbpf_mode(const struct nfp_insn_meta *meta)
 	return BPF_MODE(meta->insn.code);
 }
 
+static inline bool is_mbpf_load(const struct nfp_insn_meta *meta)
+{
+	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_LDX | BPF_MEM);
+}
+
+static inline bool is_mbpf_store(const struct nfp_insn_meta *meta)
+{
+	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_MEM);
+}
+
 /**
  * struct nfp_prog - nfp BPF program
  * @prog: machine code
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index cca67730b91f4..d2bf29c902262 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -180,10 +180,10 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 	if (meta->insn.code == (BPF_JMP | BPF_EXIT))
 		return nfp_bpf_check_exit(nfp_prog, env);
 
-	if ((meta->insn.code & ~BPF_SIZE_MASK) == (BPF_LDX | BPF_MEM))
+	if (is_mbpf_load(meta))
 		return nfp_bpf_check_ptr(nfp_prog, meta, env,
 					 meta->insn.src_reg);
-	if ((meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_MEM))
+	if (is_mbpf_store(meta))
 		return nfp_bpf_check_ptr(nfp_prog, meta, env,
 					 meta->insn.dst_reg);
 

From 9879a3814beb3b1350755475e67a8d92ba1f7e4b Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Thu, 30 Nov 2017 21:32:58 -0800
Subject: [PATCH 11/13] nfp: bpf: implement memory bulk copy for length within
 32-bytes

For NFP, we want to re-group a sequence of load/store pairs lowered from
memcpy/memmove into single memory bulk operation which then could be
accelerated using NFP CPP bus.

This patch extends the existing load/store auxiliary information by adding
two new fields:

	struct bpf_insn *paired_st;
	s16 ldst_gather_len;

Both fields are supposed to be carried by the load instruction at the
head of the sequence. "paired_st" is the corresponding store instruction at
the head and "ldst_gather_len" is the gathered length.

If "ldst_gather_len" is negative, then the sequence is doing memory
load/store in descending order, otherwise it is in ascending order. We need
this information to detect overlapped memory access.

This patch then optimizes memory bulk copy when the copy length is within
32-bytes.

The strategy of read/write used is:

  * Read.
    Use read32 (direct_ref), always.

  * Write.
    - length <= 8-bytes
      write8 (direct_ref).
    - length <= 32-bytes and is 4-byte aligned
      write32 (direct_ref).
    - length <= 32-bytes but is not 4-byte aligned
      write8 (indirect_ref).

NOTE: the optimization should not change program semantics. The destination
register of the last load instruction should contain the same value before
and after this optimization.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 113 ++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/bpf/main.h |   4 +
 drivers/net/ethernet/netronome/nfp/nfp_asm.c  |   1 +
 drivers/net/ethernet/netronome/nfp/nfp_asm.h  |   4 +
 4 files changed, 122 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index da4e106d3b164..138568c0eee6c 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -154,6 +154,13 @@ emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, false);
 }
 
+static void
+emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
+	       swreg lreg, swreg rreg, u8 size, bool sync)
+{
+	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, true);
+}
+
 static void
 __emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
 	  enum br_ctx_signal_state css, u16 addr, u8 defer)
@@ -515,6 +522,109 @@ static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
 	wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
 }
 
+/* wrp_reg_subpart() - load @field_len bytes from @offset of @src, write the
+ * result to @dst from low end.
+ */
+static void
+wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
+		u8 offset)
+{
+	enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
+	u8 mask = (1 << field_len) - 1;
+
+	emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
+}
+
+/* NFP has Command Push Pull bus which supports bulk memory operations. */
+static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	bool descending_seq = meta->ldst_gather_len < 0;
+	s16 len = abs(meta->ldst_gather_len);
+	swreg src_base, off;
+	unsigned int i;
+	u8 xfer_num;
+
+	if (WARN_ON_ONCE(len > 32))
+		return -EOPNOTSUPP;
+
+	off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
+	src_base = reg_a(meta->insn.src_reg * 2);
+	xfer_num = round_up(len, 4) / 4;
+
+	/* Memory read from source addr into transfer-in registers. */
+	emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base, off,
+		 xfer_num - 1, true);
+
+	/* Move from transfer-in to transfer-out. */
+	for (i = 0; i < xfer_num; i++)
+		wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));
+
+	off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));
+
+	if (len <= 8) {
+		/* Use single direct_ref write8. */
+		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
+			 reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
+			 true);
+	} else if (IS_ALIGNED(len, 4)) {
+		/* Use single direct_ref write32. */
+		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
+			 reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
+			 true);
+	} else {
+		/* Use single indirect_ref write8. */
+		wrp_immed(nfp_prog, reg_none(),
+			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
+		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
+			       reg_a(meta->paired_st->dst_reg * 2), off,
+			       len - 1, true);
+	}
+
+	/* TODO: The following extra load is to make sure data flow be identical
+	 *  before and after we do memory copy optimization.
+	 *
+	 *  The load destination register is not guaranteed to be dead, so we
+	 *  need to make sure it is loaded with the value the same as before
+	 *  this transformation.
+	 *
+	 *  These extra loads could be removed once we have accurate register
+	 *  usage information.
+	 */
+	if (descending_seq)
+		xfer_num = 0;
+	else if (BPF_SIZE(meta->insn.code) != BPF_DW)
+		xfer_num = xfer_num - 1;
+	else
+		xfer_num = xfer_num - 2;
+
+	switch (BPF_SIZE(meta->insn.code)) {
+	case BPF_B:
+		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
+				reg_xfer(xfer_num), 1,
+				IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
+		break;
+	case BPF_H:
+		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
+				reg_xfer(xfer_num), 2, (len & 3) ^ 2);
+		break;
+	case BPF_W:
+		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
+			reg_xfer(0));
+		break;
+	case BPF_DW:
+		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
+			reg_xfer(xfer_num));
+		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
+			reg_xfer(xfer_num + 1));
+		break;
+	}
+
+	if (BPF_SIZE(meta->insn.code) != BPF_DW)
+		wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
+
+	return 0;
+}
+
 static int
 data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size)
 {
@@ -1490,6 +1600,9 @@ static int
 mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	unsigned int size)
 {
+	if (meta->ldst_gather_len)
+		return nfp_cpp_memcpy(nfp_prog, meta);
+
 	if (meta->ptr.type == PTR_TO_CTX) {
 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
 			return mem_ldx_xdp(nfp_prog, meta, size);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 20ef0adb29318..5884291ddba52 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -95,6 +95,8 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
  * struct nfp_insn_meta - BPF instruction wrapper
  * @insn: BPF instruction
  * @ptr: pointer type for memory operations
+ * @ldst_gather_len: memcpy length gathered from load/store sequence
+ * @paired_st: the paired store insn at the head of the sequence
  * @ptr_not_const: pointer is not always constant
  * @jmp_dst: destination info for jump instructions
  * @off: index of first generated machine instruction (in nfp_prog.prog)
@@ -109,6 +111,8 @@ struct nfp_insn_meta {
 	union {
 		struct {
 			struct bpf_reg_state ptr;
+			struct bpf_insn *paired_st;
+			s16 ldst_gather_len;
 			bool ptr_not_const;
 		};
 		struct nfp_insn_meta *jmp_dst;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
index da277386077c0..d3610987fb073 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
@@ -41,6 +41,7 @@
 
 const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = {
 	[CMD_TGT_WRITE8_SWAP] =		{ 0x02, 0x42 },
+	[CMD_TGT_WRITE32_SWAP] =	{ 0x02, 0x5f },
 	[CMD_TGT_READ8] =		{ 0x01, 0x43 },
 	[CMD_TGT_READ32] =		{ 0x00, 0x5c },
 	[CMD_TGT_READ32_LE] =		{ 0x01, 0x5c },
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
index 6ff842a15e5db..98803f9f40b69 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
@@ -220,6 +220,7 @@ struct cmd_tgt_act {
 enum cmd_tgt_map {
 	CMD_TGT_READ8,
 	CMD_TGT_WRITE8_SWAP,
+	CMD_TGT_WRITE32_SWAP,
 	CMD_TGT_READ32,
 	CMD_TGT_READ32_LE,
 	CMD_TGT_READ32_SWAP,
@@ -241,6 +242,9 @@ enum cmd_ctx_swap {
 	CMD_CTX_NO_SWAP = 3,
 };
 
+#define CMD_OVE_LEN	BIT(7)
+#define CMD_OV_LEN	GENMASK(12, 8)
+
 #define OP_LCSR_BASE		0x0fc00000000ULL
 #define OP_LCSR_A_SRC		0x000000003ffULL
 #define OP_LCSR_B_SRC		0x000000ffc00ULL

From 8c90053858fce1ca60fab7be03bb61d314ea5c1c Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Thu, 30 Nov 2017 21:32:59 -0800
Subject: [PATCH 12/13] nfp: bpf: implement memory bulk copy for length bigger
 than 32-bytes

When the gathered copy length is bigger than 32-bytes and within 128-bytes
(the maximum length a single CPP Pull/Push request can finish), the
strategy of read/write is changed into:

  * Read.
      - use direct reference mode when length is within 32-bytes.
      - use indirect mode when length is bigger than 32-bytes.

  * Write.
      - length <= 8-bytes
        use write8 (direct_ref).
      - length <= 32-byte and 4-bytes aligned
        use write32 (direct_ref).
      - length <= 32-bytes but not 4-bytes aligned
        use write8 (indirect_ref).
      - length > 32-bytes and 4-bytes aligned
        use write32 (indirect_ref).
      - length > 32-bytes and not 4-bytes aligned and <= 40-bytes
        use write32 (direct_ref) to finish the first 32-bytes.
        use write8 (direct_ref) to finish all remaining hanging part.
      - length > 32-bytes and not 4-bytes aligned
        use write32 (indirect_ref) to finish those 4-byte aligned parts.
        use write8 (direct_ref) to finish all remaining hanging part.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 52 +++++++++++++++++---
 1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 138568c0eee6c..1b98ef239605d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -544,16 +544,18 @@ static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	unsigned int i;
 	u8 xfer_num;
 
-	if (WARN_ON_ONCE(len > 32))
-		return -EOPNOTSUPP;
-
 	off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
 	src_base = reg_a(meta->insn.src_reg * 2);
 	xfer_num = round_up(len, 4) / 4;
 
+	/* Setup PREV_ALU fields to override memory read length. */
+	if (len > 32)
+		wrp_immed(nfp_prog, reg_none(),
+			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
+
 	/* Memory read from source addr into transfer-in registers. */
-	emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base, off,
-		 xfer_num - 1, true);
+	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base,
+		     off, xfer_num - 1, true, len > 32);
 
 	/* Move from transfer-in to transfer-out. */
 	for (i = 0; i < xfer_num; i++)
@@ -566,18 +568,54 @@ static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
 			 true);
-	} else if (IS_ALIGNED(len, 4)) {
+	} else if (len <= 32 && IS_ALIGNED(len, 4)) {
 		/* Use single direct_ref write32. */
 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
 			 reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
 			 true);
-	} else {
+	} else if (len <= 32) {
 		/* Use single indirect_ref write8. */
 		wrp_immed(nfp_prog, reg_none(),
 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
 			       reg_a(meta->paired_st->dst_reg * 2), off,
 			       len - 1, true);
+	} else if (IS_ALIGNED(len, 4)) {
+		/* Use single indirect_ref write32. */
+		wrp_immed(nfp_prog, reg_none(),
+			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
+		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
+			       reg_a(meta->paired_st->dst_reg * 2), off,
+			       xfer_num - 1, true);
+	} else if (len <= 40) {
+		/* Use one direct_ref write32 to write the first 32-bytes, then
+		 * another direct_ref write8 to write the remaining bytes.
+		 */
+		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
+			 reg_a(meta->paired_st->dst_reg * 2), off, 7,
+			 true);
+
+		off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32,
+				      imm_b(nfp_prog));
+		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8,
+			 reg_a(meta->paired_st->dst_reg * 2), off, len - 33,
+			 true);
+	} else {
+		/* Use one indirect_ref write32 to write 4-bytes aligned length,
+		 * then another direct_ref write8 to write the remaining bytes.
+		 */
+		u8 new_off;
+
+		wrp_immed(nfp_prog, reg_none(),
+			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 2));
+		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
+			       reg_a(meta->paired_st->dst_reg * 2), off,
+			       xfer_num - 2, true);
+		new_off = meta->paired_st->off + (xfer_num - 1) * 4;
+		off = re_load_imm_any(nfp_prog, new_off, imm_b(nfp_prog));
+		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b,
+			 xfer_num - 1, reg_a(meta->paired_st->dst_reg * 2), off,
+			 (len & 0x3) - 1, true);
 	}
 
 	/* TODO: The following extra load is to make sure data flow be identical

From 6bc7103c89bff2b53a159e03b74c8216c79bfef8 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Thu, 30 Nov 2017 21:33:00 -0800
Subject: [PATCH 13/13] nfp: bpf: detect load/store sequences lowered from
 memory copy

This patch adds the optimization frontend, by adding a new eBPF IR scan
pass "nfp_bpf_opt_ldst_gather".

The pass will traverse the IR to recognize the load/store pairs sequences
that come from lowering of memory copy builtins.

The gathered memory copy information will be kept in the meta info
structure of the first load instruction in the sequence and will be
consumed by the optimization backend added in the previous patches.

NOTE: a sequence with cross memory access doesn't qualify for this
optimization, i.e. if one load in the sequence will load from a place that
has been written by a previous store. This is because when we turn the
sequence into single CPP operation, we are reading all contents at once
into NFP transfer registers, then write them out as a whole. This is not
identical with what the original load/store sequence is doing.

Detecting cross memory access for two random pointers will be difficult;
fortunately, under XDP/eBPF's restricted runtime environment, the copy
normally happens among map, packet data and stack, which do not overlap
with each other.

And for cases supported by NFP, cross memory access will only happen on
PTR_TO_PACKET. Fortunately for this, there is ID information that we could
do accurate memory alias check.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 237 +++++++++++++++++++
 1 file changed, 237 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 1b98ef239605d..3419ad495962f 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -2352,12 +2352,249 @@ static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
 	}
 }
 
+/* load/store pair that forms memory copy should look like the following:
+ *
+ *   ld_width R, [addr_src + offset_src]
+ *   st_width [addr_dest + offset_dest], R
+ *
+ * The destination register of load and source register of store should
+ * be the same, load and store should also perform at the same width.
+ * If either of addr_src or addr_dest is stack pointer, we don't do the
+ * CPP optimization as stack is modelled by registers on NFP.
+ */
+static bool
+curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
+		    struct nfp_insn_meta *st_meta)
+{
+	struct bpf_insn *ld = &ld_meta->insn;
+	struct bpf_insn *st = &st_meta->insn;
+
+	if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
+		return false;
+
+	if (ld_meta->ptr.type != PTR_TO_PACKET)
+		return false;
+
+	if (st_meta->ptr.type != PTR_TO_PACKET)
+		return false;
+
+	if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
+		return false;
+
+	if (ld->dst_reg != st->src_reg)
+		return false;
+
+	/* There is jump to the store insn in this pair. */
+	if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
+		return false;
+
+	return true;
+}
+
+/* Currently, we only support chaining load/store pairs if:
+ *
+ *  - Their address base registers are the same.
+ *  - Their address offsets are in the same order.
+ *  - They operate at the same memory width.
+ *  - There is no jump into the middle of them.
+ */
+static bool
+curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
+			      struct nfp_insn_meta *st_meta,
+			      struct bpf_insn *prev_ld,
+			      struct bpf_insn *prev_st)
+{
+	u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
+	struct bpf_insn *ld = &ld_meta->insn;
+	struct bpf_insn *st = &st_meta->insn;
+	s16 prev_ld_off, prev_st_off;
+
+	/* This pair is the start pair. */
+	if (!prev_ld)
+		return true;
+
+	prev_size = BPF_LDST_BYTES(prev_ld);
+	curr_size = BPF_LDST_BYTES(ld);
+	prev_ld_base = prev_ld->src_reg;
+	prev_st_base = prev_st->dst_reg;
+	prev_ld_dst = prev_ld->dst_reg;
+	prev_ld_off = prev_ld->off;
+	prev_st_off = prev_st->off;
+
+	if (ld->dst_reg != prev_ld_dst)
+		return false;
+
+	if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
+		return false;
+
+	if (curr_size != prev_size)
+		return false;
+
+	/* There is jump to the head of this pair. */
+	if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
+		return false;
+
+	/* Both in ascending order. */
+	if (prev_ld_off + prev_size == ld->off &&
+	    prev_st_off + prev_size == st->off)
+		return true;
+
+	/* Both in descending order. */
+	if (ld->off + curr_size == prev_ld_off &&
+	    st->off + curr_size == prev_st_off)
+		return true;
+
+	return false;
+}
+
+/* Return TRUE if cross memory access happens. Cross memory access means
+ * store area is overlapping with load area that a later load might load
+ * the value from previous store, for this case we can't treat the sequence
+ * as a memory copy.
+ */
+static bool
+cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta,
+		 struct nfp_insn_meta *head_st_meta)
+{
+	s16 head_ld_off, head_st_off, ld_off;
+
+	/* Different pointer types does not overlap. */
+	if (head_ld_meta->ptr.type != head_st_meta->ptr.type)
+		return false;
+
+	/* load and store are both PTR_TO_PACKET, check ID info.  */
+	if (head_ld_meta->ptr.id != head_st_meta->ptr.id)
+		return true;
+
+	/* Canonicalize the offsets. Turn all of them against the original
+	 * base register.
+	 */
+	head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.off;
+	head_st_off = head_st_meta->insn.off + head_st_meta->ptr.off;
+	ld_off = ld->off + head_ld_meta->ptr.off;
+
+	/* Ascending order cross. */
+	if (ld_off > head_ld_off &&
+	    head_ld_off < head_st_off && ld_off >= head_st_off)
+		return true;
+
+	/* Descending order cross. */
+	if (ld_off < head_ld_off &&
+	    head_ld_off > head_st_off && ld_off <= head_st_off)
+		return true;
+
+	return false;
+}
+
+/* This pass tries to identify the following instruction sequences.
+ *
+ *   load R, [regA + offA]
+ *   store [regB + offB], R
+ *   load R, [regA + offA + const_imm_A]
+ *   store [regB + offB + const_imm_A], R
+ *   load R, [regA + offA + 2 * const_imm_A]
+ *   store [regB + offB + 2 * const_imm_A], R
+ *   ...
+ *
+ * Above sequence is typically generated by compiler when lowering
+ * memcpy. NFP prefer using CPP instructions to accelerate it.
+ */
+static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
+{
+	struct nfp_insn_meta *head_ld_meta = NULL;
+	struct nfp_insn_meta *head_st_meta = NULL;
+	struct nfp_insn_meta *meta1, *meta2;
+	struct bpf_insn *prev_ld = NULL;
+	struct bpf_insn *prev_st = NULL;
+	u8 count = 0;
+
+	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
+		struct bpf_insn *ld = &meta1->insn;
+		struct bpf_insn *st = &meta2->insn;
+
+		/* Reset record status if any of the following if true:
+		 *   - The current insn pair is not load/store.
+		 *   - The load/store pair doesn't chain with previous one.
+		 *   - The chained load/store pair crossed with previous pair.
+		 *   - The chained load/store pair has a total size of memory
+		 *     copy beyond 128 bytes which is the maximum length a
+		 *     single NFP CPP command can transfer.
+		 */
+		if (!curr_pair_is_memcpy(meta1, meta2) ||
+		    !curr_pair_chain_with_previous(meta1, meta2, prev_ld,
+						   prev_st) ||
+		    (head_ld_meta && (cross_mem_access(ld, head_ld_meta,
+						       head_st_meta) ||
+				      head_ld_meta->ldst_gather_len >= 128))) {
+			if (!count)
+				continue;
+
+			if (count > 1) {
+				s16 prev_ld_off = prev_ld->off;
+				s16 prev_st_off = prev_st->off;
+				s16 head_ld_off = head_ld_meta->insn.off;
+
+				if (prev_ld_off < head_ld_off) {
+					head_ld_meta->insn.off = prev_ld_off;
+					head_st_meta->insn.off = prev_st_off;
+					head_ld_meta->ldst_gather_len =
+						-head_ld_meta->ldst_gather_len;
+				}
+
+				head_ld_meta->paired_st = &head_st_meta->insn;
+				head_st_meta->skip = true;
+			} else {
+				head_ld_meta->ldst_gather_len = 0;
+			}
+
+			/* If the chain is ended by an load/store pair then this
+			 * could serve as the new head of the the next chain.
+			 */
+			if (curr_pair_is_memcpy(meta1, meta2)) {
+				head_ld_meta = meta1;
+				head_st_meta = meta2;
+				head_ld_meta->ldst_gather_len =
+					BPF_LDST_BYTES(ld);
+				meta1 = nfp_meta_next(meta1);
+				meta2 = nfp_meta_next(meta2);
+				prev_ld = ld;
+				prev_st = st;
+				count = 1;
+			} else {
+				head_ld_meta = NULL;
+				head_st_meta = NULL;
+				prev_ld = NULL;
+				prev_st = NULL;
+				count = 0;
+			}
+
+			continue;
+		}
+
+		if (!head_ld_meta) {
+			head_ld_meta = meta1;
+			head_st_meta = meta2;
+		} else {
+			meta1->skip = true;
+			meta2->skip = true;
+		}
+
+		head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld);
+		meta1 = nfp_meta_next(meta1);
+		meta2 = nfp_meta_next(meta2);
+		prev_ld = ld;
+		prev_st = st;
+		count++;
+	}
+}
+
 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
 {
 	nfp_bpf_opt_reg_init(nfp_prog);
 
 	nfp_bpf_opt_ld_mask(nfp_prog);
 	nfp_bpf_opt_ld_shift(nfp_prog);
+	nfp_bpf_opt_ldst_gather(nfp_prog);
 
 	return 0;
 }