From c941ce9c282cc606e6517356fcc186a9da2b4ab9 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:47 +0100
Subject: [PATCH 01/12] bpf: add verifier callback to get stack usage info for
 offloaded progs

In preparation for BPF-to-BPF calls in offloaded programs, add a new
function attribute to the struct bpf_prog_offload_ops so that drivers
supporting eBPF offload can hook at the end of program verification, and
potentially extract information collected by the verifier.

Implement a minimal callback (returning 0) in the drivers providing the
structs, namely netdevsim and nfp.

This will be useful in the nfp driver, in later commits, to extract the
number of subprograms as well as the stack depth for those subprograms.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 .../net/ethernet/netronome/nfp/bpf/verifier.c  |  8 +++++++-
 drivers/net/netdevsim/bpf.c                    |  8 +++++++-
 include/linux/bpf.h                            |  1 +
 include/linux/bpf_verifier.h                   |  1 +
 kernel/bpf/offload.c                           | 18 ++++++++++++++++++
 kernel/bpf/verifier.c                          |  3 +++
 6 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index a6e9248669e14..e470489021e3e 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -640,6 +640,12 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 	return 0;
 }
 
+static int nfp_bpf_finalize(struct bpf_verifier_env *env)
+{
+	return 0;
+}
+
 const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops = {
-	.insn_hook = nfp_verify_insn,
+	.insn_hook	= nfp_verify_insn,
+	.finalize	= nfp_bpf_finalize,
 };
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 81444208b2162..cb3518474f0e4 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -86,8 +86,14 @@ nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn)
 	return 0;
 }
 
+static int nsim_bpf_finalize(struct bpf_verifier_env *env)
+{
+	return 0;
+}
+
 static const struct bpf_prog_offload_ops nsim_bpf_analyzer_ops = {
-	.insn_hook = nsim_bpf_verify_insn,
+	.insn_hook	= nsim_bpf_verify_insn,
+	.finalize	= nsim_bpf_finalize,
 };
 
 static bool nsim_xdp_offload_active(struct netdevsim *ns)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 027697b6a22f4..9b558713447fd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -263,6 +263,7 @@ struct bpf_verifier_ops {
 struct bpf_prog_offload_ops {
 	int (*insn_hook)(struct bpf_verifier_env *env,
 			 int insn_idx, int prev_insn_idx);
+	int (*finalize)(struct bpf_verifier_env *env);
 };
 
 struct bpf_prog_offload {
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7b6fd2ab3263f..9e8056ec20faa 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -245,5 +245,6 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
 int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
 int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
 				 int insn_idx, int prev_insn_idx);
+int bpf_prog_offload_finalize(struct bpf_verifier_env *env);
 
 #endif /* _LINUX_BPF_VERIFIER_H */
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 177a524363942..8e93c47f07796 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -172,6 +172,24 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
 	return ret;
 }
 
+int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
+{
+	struct bpf_prog_offload *offload;
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	offload = env->prog->aux->offload;
+	if (offload) {
+		if (offload->dev_ops->finalize)
+			ret = offload->dev_ops->finalize(env);
+		else
+			ret = 0;
+	}
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
 static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
 {
 	struct bpf_prog_offload *offload = prog->aux->offload;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 73c81bef6ae83..a0454cb299bae 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6309,6 +6309,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		env->cur_state = NULL;
 	}
 
+	if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
+		ret = bpf_prog_offload_finalize(env);
+
 skip_full_check:
 	while (!pop_stack(env, NULL, NULL));
 	free_states(env);

From 1a7e62e6329c210ff67c5706fbe91187f2452baf Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:48 +0100
Subject: [PATCH 02/12] nfp: bpf: rename nfp_prog->stack_depth as
 nfp_prog->stack_frame_depth

In preparation for support for BPF to BPF calls in offloaded programs,
rename the "stack_depth" field of the struct nfp_prog as
"stack_frame_depth". This is to make it clear that the field refers to
the maximum size of the current stack frame (as opposed to the maximum
size of the whole stack memory).

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c     | 10 +++++-----
 drivers/net/ethernet/netronome/nfp/bpf/main.h    |  4 ++--
 drivers/net/ethernet/netronome/nfp/bpf/offload.c |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index eff57f7d056a4..98a94ca36bfa0 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1137,7 +1137,7 @@ mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	     unsigned int size, unsigned int ptr_off, u8 gpr, u8 ptr_gpr,
 	     bool clr_gpr, lmem_step step)
 {
-	s32 off = nfp_prog->stack_depth + meta->insn.off + ptr_off;
+	s32 off = nfp_prog->stack_frame_depth + meta->insn.off + ptr_off;
 	bool first = true, last;
 	bool needs_inc = false;
 	swreg stack_off_reg;
@@ -1695,7 +1695,7 @@ map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	s64 lm_off;
 
 	/* We only have to reload LM0 if the key is not at start of stack */
-	lm_off = nfp_prog->stack_depth;
+	lm_off = nfp_prog->stack_frame_depth;
 	lm_off += meta->arg2.reg.var_off.value + meta->arg2.reg.off;
 	load_lm_ptr = meta->arg2.var_off || lm_off;
 
@@ -1808,10 +1808,10 @@ static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 		swreg stack_depth_reg;
 
 		stack_depth_reg = ur_load_imm_any(nfp_prog,
-						  nfp_prog->stack_depth,
+						  nfp_prog->stack_frame_depth,
 						  stack_imm(nfp_prog));
-		emit_alu(nfp_prog, reg_both(dst),
-			 stack_reg(nfp_prog), ALU_OP_ADD, stack_depth_reg);
+		emit_alu(nfp_prog, reg_both(dst), stack_reg(nfp_prog),
+			 ALU_OP_ADD, stack_depth_reg);
 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
 	} else {
 		wrp_reg_mov(nfp_prog, dst, src);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 792ebc4081a37..7050535383b84 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -436,7 +436,7 @@ static inline bool is_mbpf_div(const struct nfp_insn_meta *meta)
  * @tgt_abort: jump target for abort (e.g. access outside of packet buffer)
  * @n_translated: number of successfully translated instructions (for errors)
  * @error: error code if something went wrong
- * @stack_depth: max stack depth from the verifier
+ * @stack_frame_depth: max stack depth for current frame
  * @adjust_head_location: if program has single adjust head call - the insn no.
  * @map_records_cnt: the number of map pointers recorded for this prog
  * @map_records: the map record pointers from bpf->maps_neutral
@@ -460,7 +460,7 @@ struct nfp_prog {
 	unsigned int n_translated;
 	int error;
 
-	unsigned int stack_depth;
+	unsigned int stack_frame_depth;
 	unsigned int adjust_head_location;
 
 	unsigned int map_records_cnt;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 1ccd6371a15b5..c9519bb00f8a3 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -260,7 +260,7 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
 			prog->aux->stack_depth, stack_size);
 		return -EOPNOTSUPP;
 	}
-	nfp_prog->stack_depth = round_up(prog->aux->stack_depth, 4);
+	nfp_prog->stack_frame_depth = round_up(prog->aux->stack_depth, 4);
 
 	max_instr = nn_readw(nn, NFP_NET_CFG_BPF_MAX_LEN);
 	nfp_prog->__prog_alloc_len = max_instr * sizeof(u64);

From c5da54d93eb43461a5b79e1fdad8409abad83a77 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:49 +0100
Subject: [PATCH 03/12] nfp: bpf: copy eBPF subprograms information from kernel
 verifier

In order to support BPF-to-BPF calls in offloaded programs, the nfp
driver must collect information about the distinct subprograms: namely,
the number of subprograms composing the complete program and the stack
depth of those subprograms. The latter in particular is non-trivial to
collect, so we copy those elements from the kernel verifier via the
newly added post-verification hook. The struct nfp_prog is extended to
store this information. Stack depths are stored in an array of dedicated
structs.

Subprogram start indexes are not collected. Instead, meta instructions
associated to the start of a subprogram will be marked with a flag in a
later patch.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.h     | 12 ++++++++++++
 drivers/net/ethernet/netronome/nfp/bpf/offload.c  |  2 ++
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 15 +++++++++++++++
 3 files changed, 29 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 7050535383b84..7f6e850e42da7 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -423,6 +423,14 @@ static inline bool is_mbpf_div(const struct nfp_insn_meta *meta)
 	return is_mbpf_alu(meta) && mbpf_op(meta) == BPF_DIV;
 }
 
+/**
+ * struct nfp_bpf_subprog_info - nfp BPF sub-program (a.k.a. function) info
+ * @stack_depth:	maximum stack depth used by this sub-program
+ */
+struct nfp_bpf_subprog_info {
+	u16 stack_depth;
+};
+
 /**
  * struct nfp_prog - nfp BPF program
  * @bpf: backpointer to the bpf app priv structure
@@ -439,7 +447,9 @@ static inline bool is_mbpf_div(const struct nfp_insn_meta *meta)
  * @stack_frame_depth: max stack depth for current frame
  * @adjust_head_location: if program has single adjust head call - the insn no.
  * @map_records_cnt: the number of map pointers recorded for this prog
+ * @subprog_cnt: number of sub-programs, including main function
  * @map_records: the map record pointers from bpf->maps_neutral
+ * @subprog: pointer to an array of objects holding info about sub-programs
  * @insns: list of BPF instruction wrappers (struct nfp_insn_meta)
  */
 struct nfp_prog {
@@ -464,7 +474,9 @@ struct nfp_prog {
 	unsigned int adjust_head_location;
 
 	unsigned int map_records_cnt;
+	unsigned int subprog_cnt;
 	struct nfp_bpf_neutral_map **map_records;
+	struct nfp_bpf_subprog_info *subprog;
 
 	struct list_head insns;
 };
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index c9519bb00f8a3..b683b03efd227 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -208,6 +208,8 @@ static void nfp_prog_free(struct nfp_prog *nfp_prog)
 {
 	struct nfp_insn_meta *meta, *tmp;
 
+	kfree(nfp_prog->subprog);
+
 	list_for_each_entry_safe(meta, tmp, &nfp_prog->insns, l) {
 		list_del(&meta->l);
 		kfree(meta);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index e470489021e3e..9ef74bc1ec1d7 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -642,6 +642,21 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 
 static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 {
+	struct bpf_subprog_info *info;
+	struct nfp_prog *nfp_prog;
+	int i;
+
+	nfp_prog = env->prog->aux->offload->dev_priv;
+	nfp_prog->subprog_cnt = env->subprog_cnt;
+	nfp_prog->subprog = kcalloc(nfp_prog->subprog_cnt,
+				    sizeof(nfp_prog->subprog[0]), GFP_KERNEL);
+	if (!nfp_prog->subprog)
+		return -ENOMEM;
+
+	info = env->subprog_info;
+	for (i = 0; i < nfp_prog->subprog_cnt; i++)
+		nfp_prog->subprog[i].stack_depth = info[i].stack_depth;
+
 	return 0;
 }
 

From bcfdfb7c962d84ef07aa2b400ef1c9d70e1120e5 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:50 +0100
Subject: [PATCH 04/12] nfp: bpf: ignore helper-related checks for BPF calls in
 nfp verifier

The checks related to eBPF helper calls are performed each time the nfp
driver meets a BPF_JUMP | BPF_CALL instruction. However, these checks
are not relevant for BPF-to-BPF call (same instruction code, different
value in source register), so just skip the checks for such calls.

While at it, rename the function that runs those checks to make it clear
they apply to _helper_ calls only.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.h     | 8 ++++++++
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 9 +++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 7f6e850e42da7..853a5346378cf 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -423,6 +423,14 @@ static inline bool is_mbpf_div(const struct nfp_insn_meta *meta)
 	return is_mbpf_alu(meta) && mbpf_op(meta) == BPF_DIV;
 }
 
+static inline bool is_mbpf_helper_call(const struct nfp_insn_meta *meta)
+{
+	struct bpf_insn insn = meta->insn;
+
+	return insn.code == (BPF_JMP | BPF_CALL) &&
+		insn.src_reg != BPF_PSEUDO_CALL;
+}
+
 /**
  * struct nfp_bpf_subprog_info - nfp BPF sub-program (a.k.a. function) info
  * @stack_depth:	maximum stack depth used by this sub-program
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 9ef74bc1ec1d7..c642c2c07d964 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -155,8 +155,9 @@ nfp_bpf_map_call_ok(const char *fname, struct bpf_verifier_env *env,
 }
 
 static int
-nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env,
-		   struct nfp_insn_meta *meta)
+nfp_bpf_check_helper_call(struct nfp_prog *nfp_prog,
+			  struct bpf_verifier_env *env,
+			  struct nfp_insn_meta *meta)
 {
 	const struct bpf_reg_state *reg1 = cur_regs(env) + BPF_REG_1;
 	const struct bpf_reg_state *reg2 = cur_regs(env) + BPF_REG_2;
@@ -620,8 +621,8 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 		return -EINVAL;
 	}
 
-	if (meta->insn.code == (BPF_JMP | BPF_CALL))
-		return nfp_bpf_check_call(nfp_prog, env, meta);
+	if (is_mbpf_helper_call(meta))
+		return nfp_bpf_check_helper_call(nfp_prog, env, meta);
 	if (meta->insn.code == (BPF_JMP | BPF_EXIT))
 		return nfp_bpf_check_exit(nfp_prog, env);
 

From e3b49dc69b320ba93059509ff2b31bde9242a7fa Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:51 +0100
Subject: [PATCH 05/12] nfp: bpf: account for BPF-to-BPF calls when preparing
 nfp JIT

Similarly to "exit" or "helper call" instructions, BPF-to-BPF calls will
require additional processing before translation starts, in order to
record and mark jump destinations.

We also mark the instructions where each subprogram begins. This will be
used in a following commit to determine where to add prologues for
subprograms.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 35 +++++++++++++------
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  3 +-
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 98a94ca36bfa0..ccb80a5ac8286 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -4018,20 +4018,35 @@ void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog, unsigned int cnt)
 
 	/* Another pass to record jump information. */
 	list_for_each_entry(meta, &nfp_prog->insns, l) {
+		struct nfp_insn_meta *dst_meta;
 		u64 code = meta->insn.code;
+		unsigned int dst_idx;
+		bool pseudo_call;
 
-		if (BPF_CLASS(code) == BPF_JMP && BPF_OP(code) != BPF_EXIT &&
-		    BPF_OP(code) != BPF_CALL) {
-			struct nfp_insn_meta *dst_meta;
-			unsigned short dst_indx;
+		if (BPF_CLASS(code) != BPF_JMP)
+			continue;
+		if (BPF_OP(code) == BPF_EXIT)
+			continue;
+		if (is_mbpf_helper_call(meta))
+			continue;
 
-			dst_indx = meta->n + 1 + meta->insn.off;
-			dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_indx,
-						     cnt);
+		/* If opcode is BPF_CALL at this point, this can only be a
+		 * BPF-to-BPF call (a.k.a pseudo call).
+		 */
+		pseudo_call = BPF_OP(code) == BPF_CALL;
 
-			meta->jmp_dst = dst_meta;
-			dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
-		}
+		if (pseudo_call)
+			dst_idx = meta->n + 1 + meta->insn.imm;
+		else
+			dst_idx = meta->n + 1 + meta->insn.off;
+
+		dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_idx, cnt);
+
+		if (pseudo_call)
+			dst_meta->flags |= FLAG_INSN_IS_SUBPROG_START;
+
+		dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
+		meta->jmp_dst = dst_meta;
 	}
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 853a5346378cf..20a98ce4b3458 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -262,7 +262,8 @@ struct nfp_bpf_reg_state {
 	bool var_off;
 };
 
-#define FLAG_INSN_IS_JUMP_DST	BIT(0)
+#define FLAG_INSN_IS_JUMP_DST			BIT(0)
+#define FLAG_INSN_IS_SUBPROG_START		BIT(1)
 
 /**
  * struct nfp_insn_meta - BPF instruction wrapper

From 389f263b60c9a3168d47eeb44b6a07bd2b8c66e2 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:52 +0100
Subject: [PATCH 06/12] nfp: bpf: add main logics for BPF-to-BPF calls support
 in nfp driver

This is the main patch for the logics of BPF-to-BPF calls in the nfp
driver.

The functions called on BPF_JUMP | BPF_CALL and BPF_JUMP | BPF_EXIT were
used to call helpers and exit from the program, respectively; make them
usable for calling into, or returning from, a BPF subprogram as well.

For all calls, push the return address as well as the callee-saved
registers (R6 to R9) to the stack, and pop them upon returning from the
calls. In order to limit the overhead in terms of instruction number,
this is done through dedicated subroutines. Jumping to the callee
actually consists in jumping to the subroutine, that "returns" to the
callee: this will require some fixup for passing the address in a later
patch. Similarly, returning consists in jumping to the subroutine, which
pops registers and then return directly to the caller (but no fixup is
needed here).

Return to the caller is performed with the RTN instruction newly added
to the JIT.

For the few steps where we need to know what subprogram an instruction
belongs to, the struct nfp_insn_meta is extended with a new subprog_idx
field.

Note that checks on the available stack size, to take into account the
additional requirements associated to BPF-to-BPF calls (storing R6-R9
and return addresses), are added in a later patch.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 235 +++++++++++++++++-
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  20 ++
 .../net/ethernet/netronome/nfp/bpf/offload.c  |   1 -
 .../net/ethernet/netronome/nfp/bpf/verifier.c |  34 ++-
 drivers/net/ethernet/netronome/nfp/nfp_asm.h  |   9 +
 5 files changed, 295 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index ccb80a5ac8286..2d2c9148bd447 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -266,6 +266,38 @@ emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
 	emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL);
 }
 
+static void
+__emit_br_alu(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
+	      u8 defer, bool dst_lmextn, bool src_lmextn)
+{
+	u64 insn;
+
+	insn = OP_BR_ALU_BASE |
+		FIELD_PREP(OP_BR_ALU_A_SRC, areg) |
+		FIELD_PREP(OP_BR_ALU_B_SRC, breg) |
+		FIELD_PREP(OP_BR_ALU_DEFBR, defer) |
+		FIELD_PREP(OP_BR_ALU_IMM_HI, imm_hi) |
+		FIELD_PREP(OP_BR_ALU_SRC_LMEXTN, src_lmextn) |
+		FIELD_PREP(OP_BR_ALU_DST_LMEXTN, dst_lmextn);
+
+	nfp_prog_push(nfp_prog, insn);
+}
+
+static void emit_rtn(struct nfp_prog *nfp_prog, swreg base, u8 defer)
+{
+	struct nfp_insn_ur_regs reg;
+	int err;
+
+	err = swreg_to_unrestricted(reg_none(), base, reg_imm(0), &reg);
+	if (err) {
+		nfp_prog->error = err;
+		return;
+	}
+
+	__emit_br_alu(nfp_prog, reg.areg, reg.breg, 0, defer, reg.dst_lmextn,
+		      reg.src_lmextn);
+}
+
 static void
 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
 	     enum immed_width width, bool invert,
@@ -3081,7 +3113,73 @@ static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return wrp_test_reg(nfp_prog, meta, ALU_OP_XOR, BR_BNE);
 }
 
-static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+static int
+bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	u32 ret_tgt, stack_depth;
+	swreg tmp_reg;
+
+	stack_depth = round_up(nfp_prog->stack_frame_depth, STACK_FRAME_ALIGN);
+	/* Space for saving the return address is accounted for by the callee,
+	 * so stack_depth can be zero for the main function.
+	 */
+	if (stack_depth) {
+		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
+					  stack_imm(nfp_prog));
+		emit_alu(nfp_prog, stack_reg(nfp_prog),
+			 stack_reg(nfp_prog), ALU_OP_ADD, tmp_reg);
+		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
+			    NFP_CSR_ACT_LM_ADDR0);
+	}
+
+	/* The following steps are performed:
+	 *     1. Put the start offset of the callee into imm_b(). This will
+	 *        require a fixup step, as we do not necessarily know this
+	 *        address yet.
+	 *     2. Put the return address from the callee to the caller into
+	 *        register ret_reg().
+	 *     3. (After defer slots are consumed) Jump to the subroutine that
+	 *        pushes the registers to the stack.
+	 * The subroutine acts as a trampoline, and returns to the address in
+	 * imm_b(), i.e. jumps to the callee.
+	 *
+	 * Using ret_reg() to pass the return address to the callee is set here
+	 * as a convention. The callee can then push this address onto its
+	 * stack frame in its prologue. The advantages of passing the return
+	 * address through ret_reg(), instead of pushing it to the stack right
+	 * here, are the following:
+	 * - It looks cleaner.
+	 * - If the called function is called multiple time, we get a lower
+	 *   program size.
+	 * - We save two no-op instructions that should be added just before
+	 *   the emit_br() when stack depth is not null otherwise.
+	 * - If we ever find a register to hold the return address during whole
+	 *   execution of the callee, we will not have to push the return
+	 *   address to the stack for leaf functions.
+	 */
+	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
+	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
+		     RELO_BR_GO_CALL_PUSH_REGS);
+	wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
+	wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);
+
+	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
+		return -EINVAL;
+
+	if (stack_depth) {
+		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
+					  stack_imm(nfp_prog));
+		emit_alu(nfp_prog, stack_reg(nfp_prog),
+			 stack_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
+		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
+			    NFP_CSR_ACT_LM_ADDR0);
+		wrp_nops(nfp_prog, 3);
+	}
+
+	return 0;
+}
+
+static int helper_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	switch (meta->insn.imm) {
 	case BPF_FUNC_xdp_adjust_head:
@@ -3102,6 +3200,19 @@ static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	}
 }
 
+static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	if (is_mbpf_pseudo_call(meta))
+		return bpf_to_bpf_call(nfp_prog, meta);
+	else
+		return helper_call(nfp_prog, meta);
+}
+
+static bool nfp_is_main_function(struct nfp_insn_meta *meta)
+{
+	return meta->subprog_idx == 0;
+}
+
 static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 0, RELO_BR_GO_OUT);
@@ -3109,6 +3220,30 @@ static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
+static int
+nfp_subprog_epilogue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	/* Pop R6~R9 to the stack via related subroutine.
+	 * Pop return address for BPF-to-BPF call from the stack and load it
+	 * into ret_reg() before we jump. This means that the subroutine does
+	 * not come back here, we make it jump back to the subprogram caller
+	 * directly!
+	 */
+	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
+		     RELO_BR_GO_CALL_POP_REGS);
+	wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+
+	return 0;
+}
+
+static int jmp_exit(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	if (nfp_is_main_function(meta))
+		return goto_out(nfp_prog, meta);
+	else
+		return nfp_subprog_epilogue(nfp_prog, meta);
+}
+
 static const instr_cb_t instr_cb[256] = {
 	[BPF_ALU64 | BPF_MOV | BPF_X] =	mov_reg64,
 	[BPF_ALU64 | BPF_MOV | BPF_K] =	mov_imm64,
@@ -3197,7 +3332,7 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
 	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
 	[BPF_JMP | BPF_CALL] =		call,
-	[BPF_JMP | BPF_EXIT] =		goto_out,
+	[BPF_JMP | BPF_EXIT] =		jmp_exit,
 };
 
 /* --- Assembler logic --- */
@@ -3258,6 +3393,27 @@ static void nfp_intro(struct nfp_prog *nfp_prog)
 		 plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
 }
 
+static void
+nfp_subprog_prologue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	/* Save return address into the stack. */
+	wrp_mov(nfp_prog, reg_lm(0, 0), ret_reg(nfp_prog));
+}
+
+static void
+nfp_start_subprog(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	unsigned int depth = nfp_prog->subprog[meta->subprog_idx].stack_depth;
+
+	nfp_prog->stack_frame_depth = round_up(depth, 4);
+	nfp_subprog_prologue(nfp_prog, meta);
+}
+
+bool nfp_is_subprog_start(struct nfp_insn_meta *meta)
+{
+	return meta->flags & FLAG_INSN_IS_SUBPROG_START;
+}
+
 static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
 {
 	/* TC direct-action mode:
@@ -3348,6 +3504,56 @@ static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
 }
 
+static void nfp_push_callee_registers(struct nfp_prog *nfp_prog)
+{
+	u8 reg;
+
+	/* Subroutine: Save all callee saved registers (R6 ~ R9).
+	 * imm_b() holds the return address.
+	 */
+	nfp_prog->tgt_call_push_regs = nfp_prog_current_offset(nfp_prog);
+	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
+		u8 adj = (reg - BPF_REG_0) * 2;
+		u8 idx = (reg - BPF_REG_6) * 2;
+
+		/* The first slot in the stack frame is used to push the return
+		 * address in bpf_to_bpf_call(), start just after.
+		 */
+		wrp_mov(nfp_prog, reg_lm(0, 1 + idx), reg_b(adj));
+
+		if (reg == BPF_REG_8)
+			/* Prepare to jump back, last 3 insns use defer slots */
+			emit_rtn(nfp_prog, imm_b(nfp_prog), 3);
+
+		wrp_mov(nfp_prog, reg_lm(0, 1 + idx + 1), reg_b(adj + 1));
+	}
+}
+
+static void nfp_pop_callee_registers(struct nfp_prog *nfp_prog)
+{
+	u8 reg;
+
+	/* Subroutine: Restore all callee saved registers (R6 ~ R9).
+	 * ret_reg() holds the return address.
+	 */
+	nfp_prog->tgt_call_pop_regs = nfp_prog_current_offset(nfp_prog);
+	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
+		u8 adj = (reg - BPF_REG_0) * 2;
+		u8 idx = (reg - BPF_REG_6) * 2;
+
+		/* The first slot in the stack frame holds the return address,
+		 * start popping just after that.
+		 */
+		wrp_mov(nfp_prog, reg_both(adj), reg_lm(0, 1 + idx));
+
+		if (reg == BPF_REG_8)
+			/* Prepare to jump back, last 3 insns use defer slots */
+			emit_rtn(nfp_prog, ret_reg(nfp_prog), 3);
+
+		wrp_mov(nfp_prog, reg_both(adj + 1), reg_lm(0, 1 + idx + 1));
+	}
+}
+
 static void nfp_outro(struct nfp_prog *nfp_prog)
 {
 	switch (nfp_prog->type) {
@@ -3360,13 +3566,23 @@ static void nfp_outro(struct nfp_prog *nfp_prog)
 	default:
 		WARN_ON(1);
 	}
+
+	if (nfp_prog->subprog_cnt == 1)
+		return;
+
+	nfp_push_callee_registers(nfp_prog);
+	nfp_pop_callee_registers(nfp_prog);
 }
 
 static int nfp_translate(struct nfp_prog *nfp_prog)
 {
 	struct nfp_insn_meta *meta;
+	unsigned int depth;
 	int err;
 
+	depth = nfp_prog->subprog[0].stack_depth;
+	nfp_prog->stack_frame_depth = round_up(depth, 4);
+
 	nfp_intro(nfp_prog);
 	if (nfp_prog->error)
 		return nfp_prog->error;
@@ -3376,6 +3592,12 @@ static int nfp_translate(struct nfp_prog *nfp_prog)
 
 		meta->off = nfp_prog_current_offset(nfp_prog);
 
+		if (nfp_is_subprog_start(meta)) {
+			nfp_start_subprog(nfp_prog, meta);
+			if (nfp_prog->error)
+				return nfp_prog->error;
+		}
+
 		if (meta->skip) {
 			nfp_prog->n_translated++;
 			continue;
@@ -4069,6 +4291,7 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
 	for (i = 0; i < nfp_prog->prog_len; i++) {
 		enum nfp_relo_type special;
 		u32 val;
+		u16 off;
 
 		special = FIELD_GET(OP_RELO_TYPE, prog[i]);
 		switch (special) {
@@ -4085,6 +4308,14 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
 			br_set_offset(&prog[i],
 				      nfp_prog->tgt_abort + bv->start_off);
 			break;
+		case RELO_BR_GO_CALL_PUSH_REGS:
+			off = nfp_prog->tgt_call_push_regs + bv->start_off;
+			br_set_offset(&prog[i], off);
+			break;
+		case RELO_BR_GO_CALL_POP_REGS:
+			off = nfp_prog->tgt_call_pop_regs + bv->start_off;
+			br_set_offset(&prog[i], off);
+			break;
 		case RELO_BR_NEXT_PKT:
 			br_set_offset(&prog[i], bv->tgt_done);
 			break;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 20a98ce4b3458..d9695bc316dd3 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -61,6 +61,8 @@ enum nfp_relo_type {
 	/* internal jumps to parts of the outro */
 	RELO_BR_GO_OUT,
 	RELO_BR_GO_ABORT,
+	RELO_BR_GO_CALL_PUSH_REGS,
+	RELO_BR_GO_CALL_POP_REGS,
 	/* external jumps to fixed addresses */
 	RELO_BR_NEXT_PKT,
 	RELO_BR_HELPER,
@@ -104,6 +106,7 @@ enum pkt_vec {
 #define imma_a(np)	reg_a(STATIC_REG_IMMA)
 #define imma_b(np)	reg_b(STATIC_REG_IMMA)
 #define imm_both(np)	reg_both(STATIC_REG_IMM)
+#define ret_reg(np)	imm_a(np)
 
 #define NFP_BPF_ABI_FLAGS	reg_imm(0)
 #define   NFP_BPF_ABI_FLAG_MARK	1
@@ -290,6 +293,7 @@ struct nfp_bpf_reg_state {
  * @off: index of first generated machine instruction (in nfp_prog.prog)
  * @n: eBPF instruction number
  * @flags: eBPF instruction extra optimization flags
+ * @subprog_idx: index of subprogram to which the instruction belongs
  * @skip: skip this instruction (optimized out)
  * @double_cb: callback for second part of the instruction
  * @l: link on nfp_prog->insns list
@@ -336,6 +340,7 @@ struct nfp_insn_meta {
 	unsigned int off;
 	unsigned short n;
 	unsigned short flags;
+	unsigned short subprog_idx;
 	bool skip;
 	instr_cb_t double_cb;
 
@@ -432,6 +437,16 @@ static inline bool is_mbpf_helper_call(const struct nfp_insn_meta *meta)
 		insn.src_reg != BPF_PSEUDO_CALL;
 }
 
+static inline bool is_mbpf_pseudo_call(const struct nfp_insn_meta *meta)
+{
+	struct bpf_insn insn = meta->insn;
+
+	return insn.code == (BPF_JMP | BPF_CALL) &&
+		insn.src_reg == BPF_PSEUDO_CALL;
+}
+
+#define STACK_FRAME_ALIGN 64
+
 /**
  * struct nfp_bpf_subprog_info - nfp BPF sub-program (a.k.a. function) info
  * @stack_depth:	maximum stack depth used by this sub-program
@@ -451,6 +466,8 @@ struct nfp_bpf_subprog_info {
  * @last_bpf_off: address of the last instruction translated from BPF
  * @tgt_out: jump target for normal exit
  * @tgt_abort: jump target for abort (e.g. access outside of packet buffer)
+ * @tgt_call_push_regs: jump target for subroutine for saving R6~R9 to stack
+ * @tgt_call_pop_regs: jump target for subroutine used for restoring R6~R9
  * @n_translated: number of successfully translated instructions (for errors)
  * @error: error code if something went wrong
  * @stack_frame_depth: max stack depth for current frame
@@ -475,6 +492,8 @@ struct nfp_prog {
 	unsigned int last_bpf_off;
 	unsigned int tgt_out;
 	unsigned int tgt_abort;
+	unsigned int tgt_call_push_regs;
+	unsigned int tgt_call_pop_regs;
 
 	unsigned int n_translated;
 	int error;
@@ -502,6 +521,7 @@ struct nfp_bpf_vnic {
 	unsigned int tgt_done;
 };
 
+bool nfp_is_subprog_start(struct nfp_insn_meta *meta);
 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog, unsigned int cnt);
 int nfp_bpf_jit(struct nfp_prog *prog);
 bool nfp_bpf_supported_opcode(u8 code);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index b683b03efd227..2ebd13b29c971 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -262,7 +262,6 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
 			prog->aux->stack_depth, stack_size);
 		return -EOPNOTSUPP;
 	}
-	nfp_prog->stack_frame_depth = round_up(prog->aux->stack_depth, 4);
 
 	max_instr = nn_readw(nn, NFP_NET_CFG_BPF_MAX_LEN);
 	nfp_prog->__prog_alloc_len = max_instr * sizeof(u64);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index c642c2c07d964..cc1b2c601f4e2 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -641,6 +641,27 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 	return 0;
 }
 
+static int
+nfp_assign_subprog_idx(struct bpf_verifier_env *env, struct nfp_prog *nfp_prog)
+{
+	struct nfp_insn_meta *meta;
+	int index = 0;
+
+	list_for_each_entry(meta, &nfp_prog->insns, l) {
+		if (nfp_is_subprog_start(meta))
+			index++;
+		meta->subprog_idx = index;
+	}
+
+	if (index + 1 != nfp_prog->subprog_cnt) {
+		pr_vlog(env, "BUG: number of processed BPF functions is not consistent (processed %d, expected %d)\n",
+			index + 1, nfp_prog->subprog_cnt);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
 static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 {
 	struct bpf_subprog_info *info;
@@ -654,10 +675,21 @@ static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 	if (!nfp_prog->subprog)
 		return -ENOMEM;
 
+	nfp_assign_subprog_idx(env, nfp_prog);
+
 	info = env->subprog_info;
-	for (i = 0; i < nfp_prog->subprog_cnt; i++)
+	for (i = 0; i < nfp_prog->subprog_cnt; i++) {
 		nfp_prog->subprog[i].stack_depth = info[i].stack_depth;
 
+		if (i == 0)
+			continue;
+
+		/* Account for size of return address. */
+		nfp_prog->subprog[i].stack_depth += REG_WIDTH;
+		/* Account for size of saved registers. */
+		nfp_prog->subprog[i].stack_depth += BPF_REG_SIZE * 4;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
index fad0e62a910cc..5b257c603e911 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
@@ -82,6 +82,15 @@
 #define OP_BR_BIT_ADDR_LO	OP_BR_ADDR_LO
 #define OP_BR_BIT_ADDR_HI	OP_BR_ADDR_HI
 
+#define OP_BR_ALU_BASE		0x0e800000000ULL
+#define OP_BR_ALU_BASE_MASK	0x0ff80000000ULL
+#define OP_BR_ALU_A_SRC		0x000000003ffULL
+#define OP_BR_ALU_B_SRC		0x000000ffc00ULL
+#define OP_BR_ALU_DEFBR		0x00000300000ULL
+#define OP_BR_ALU_IMM_HI	0x0007fc00000ULL
+#define OP_BR_ALU_SRC_LMEXTN	0x40000000000ULL
+#define OP_BR_ALU_DST_LMEXTN	0x80000000000ULL
+
 static inline bool nfp_is_br(u64 insn)
 {
 	return (insn & OP_BR_BASE_MASK) == OP_BR_BASE ||

From fb1981654129e9c71af4c0f2782c5ffde8cbf37f Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:53 +0100
Subject: [PATCH 07/12] nfp: bpf: account for additional stack usage when
 checking stack limit

Offloaded programs using BPF-to-BPF calls use the stack to store the
return address when calling into a subprogram. Callees also need some
space to save eBPF registers R6 to R9. And contrarily to kernel
verifier, we align stack frames on 64 bytes (and not 32). Account for
all this when checking the stack size limit before JIT-ing the program.
This means we have to recompute maximum stack usage for the program, we
cannot get the value from the kernel.

In addition to adapting the checks on stack usage, move them to the
finalize() callback, now that we have it and because such checks are
part of the verification step rather than translation.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 .../net/ethernet/netronome/nfp/bpf/offload.c  |  8 ---
 .../net/ethernet/netronome/nfp/bpf/verifier.c | 68 +++++++++++++++++++
 2 files changed, 68 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 2ebd13b29c971..49c7bead81136 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -252,17 +252,9 @@ nfp_bpf_verifier_prep(struct nfp_app *app, struct nfp_net *nn,
 static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
 {
 	struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
-	unsigned int stack_size;
 	unsigned int max_instr;
 	int err;
 
-	stack_size = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64;
-	if (prog->aux->stack_depth > stack_size) {
-		nn_info(nn, "stack too large: program %dB > FW stack %dB\n",
-			prog->aux->stack_depth, stack_size);
-		return -EOPNOTSUPP;
-	}
-
 	max_instr = nn_readw(nn, NFP_NET_CFG_BPF_MAX_LEN);
 	nfp_prog->__prog_alloc_len = max_instr * sizeof(u64);
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index cc1b2c601f4e2..81a463726d559 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -34,10 +34,12 @@
 #include <linux/bpf.h>
 #include <linux/bpf_verifier.h>
 #include <linux/kernel.h>
+#include <linux/netdevice.h>
 #include <linux/pkt_cls.h>
 
 #include "../nfp_app.h"
 #include "../nfp_main.h"
+#include "../nfp_net.h"
 #include "fw.h"
 #include "main.h"
 
@@ -662,10 +664,67 @@ nfp_assign_subprog_idx(struct bpf_verifier_env *env, struct nfp_prog *nfp_prog)
 	return 0;
 }
 
+static unsigned int
+nfp_bpf_get_stack_usage(struct nfp_prog *nfp_prog, unsigned int cnt)
+{
+	struct nfp_insn_meta *meta = nfp_prog_first_meta(nfp_prog);
+	unsigned int max_depth = 0, depth = 0, frame = 0;
+	struct nfp_insn_meta *ret_insn[MAX_CALL_FRAMES];
+	unsigned short frame_depths[MAX_CALL_FRAMES];
+	unsigned short ret_prog[MAX_CALL_FRAMES];
+	unsigned short idx = meta->subprog_idx;
+
+	/* Inspired from check_max_stack_depth() from kernel verifier.
+	 * Starting from main subprogram, walk all instructions and recursively
+	 * walk all callees that given subprogram can call. Since recursion is
+	 * prevented by the kernel verifier, this algorithm only needs a local
+	 * stack of MAX_CALL_FRAMES to remember callsites.
+	 */
+process_subprog:
+	frame_depths[frame] = nfp_prog->subprog[idx].stack_depth;
+	frame_depths[frame] = round_up(frame_depths[frame], STACK_FRAME_ALIGN);
+	depth += frame_depths[frame];
+	max_depth = max(max_depth, depth);
+
+continue_subprog:
+	for (; meta != nfp_prog_last_meta(nfp_prog) && meta->subprog_idx == idx;
+	     meta = nfp_meta_next(meta)) {
+		if (!is_mbpf_pseudo_call(meta))
+			continue;
+
+		/* We found a call to a subprogram. Remember instruction to
+		 * return to and subprog id.
+		 */
+		ret_insn[frame] = nfp_meta_next(meta);
+		ret_prog[frame] = idx;
+
+		/* Find the callee and start processing it. */
+		meta = nfp_bpf_goto_meta(nfp_prog, meta,
+					 meta->n + 1 + meta->insn.imm, cnt);
+		idx = meta->subprog_idx;
+		frame++;
+		goto process_subprog;
+	}
+	/* End of for() loop means the last instruction of the subprog was
+	 * reached. If we popped all stack frames, return; otherwise, go on
+	 * processing remaining instructions from the caller.
+	 */
+	if (frame == 0)
+		return max_depth;
+
+	depth -= frame_depths[frame];
+	frame--;
+	meta = ret_insn[frame];
+	idx = ret_prog[frame];
+	goto continue_subprog;
+}
+
 static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 {
+	unsigned int stack_size, stack_needed;
 	struct bpf_subprog_info *info;
 	struct nfp_prog *nfp_prog;
+	struct nfp_net *nn;
 	int i;
 
 	nfp_prog = env->prog->aux->offload->dev_priv;
@@ -690,6 +749,15 @@ static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 		nfp_prog->subprog[i].stack_depth += BPF_REG_SIZE * 4;
 	}
 
+	nn = netdev_priv(env->prog->aux->offload->netdev);
+	stack_size = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64;
+	stack_needed = nfp_bpf_get_stack_usage(nfp_prog, env->prog->len);
+	if (stack_needed > stack_size) {
+		pr_vlog(env, "stack too large: program %dB > FW stack %dB\n",
+			stack_needed, stack_size);
+		return -EOPNOTSUPP;
+	}
+
 	return 0;
 }
 

From bdf4c66faf5fa6fd5ffb0b59c39c7629103d6479 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:54 +0100
Subject: [PATCH 08/12] nfp: bpf: update fixup function for BPF-to-BPF calls
 support

Relocation for targets of BPF-to-BPF calls are required at the end of
translation. Update the nfp_fixup_branches() function in that regard.

When checking that the last instruction of each bloc is a branch, we
must account for the length of the instructions required to pop the
return address from the stack.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 25 ++++++++++++++++---
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  2 ++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 2d2c9148bd447..e8b03d8f54f78 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -3116,7 +3116,7 @@ static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 static int
 bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
-	u32 ret_tgt, stack_depth;
+	u32 ret_tgt, stack_depth, offset_br;
 	swreg tmp_reg;
 
 	stack_depth = round_up(nfp_prog->stack_frame_depth, STACK_FRAME_ALIGN);
@@ -3160,6 +3160,7 @@ bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
 		     RELO_BR_GO_CALL_PUSH_REGS);
+	offset_br = nfp_prog_current_offset(nfp_prog);
 	wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
 	wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);
 
@@ -3176,6 +3177,9 @@ bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 		wrp_nops(nfp_prog, 3);
 	}
 
+	meta->num_insns_after_br = nfp_prog_current_offset(nfp_prog);
+	meta->num_insns_after_br -= offset_br;
+
 	return 0;
 }
 
@@ -3344,21 +3348,36 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 	list_for_each_entry(meta, &nfp_prog->insns, l) {
 		if (meta->skip)
 			continue;
-		if (meta->insn.code == (BPF_JMP | BPF_CALL))
-			continue;
 		if (BPF_CLASS(meta->insn.code) != BPF_JMP)
 			continue;
+		if (meta->insn.code == (BPF_JMP | BPF_EXIT) &&
+		    !nfp_is_main_function(meta))
+			continue;
+		if (is_mbpf_helper_call(meta))
+			continue;
 
 		if (list_is_last(&meta->l, &nfp_prog->insns))
 			br_idx = nfp_prog->last_bpf_off;
 		else
 			br_idx = list_next_entry(meta, l)->off - 1;
 
+		/* For BPF-to-BPF function call, a stack adjustment sequence is
+		 * generated after the return instruction. Therefore, we must
+		 * withdraw the length of this sequence to have br_idx pointing
+		 * to where the "branch" NFP instruction is expected to be.
+		 */
+		if (is_mbpf_pseudo_call(meta))
+			br_idx -= meta->num_insns_after_br;
+
 		if (!nfp_is_br(nfp_prog->prog[br_idx])) {
 			pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n",
 			       br_idx, meta->insn.code, nfp_prog->prog[br_idx]);
 			return -ELOOP;
 		}
+
+		if (meta->insn.code == (BPF_JMP | BPF_EXIT))
+			continue;
+
 		/* Leave special branches for later */
 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
 		    RELO_BR_REL)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index d9695bc316dd3..1cef5136c1984 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -283,6 +283,7 @@ struct nfp_bpf_reg_state {
  * @xadd_maybe_16bit: 16bit immediate is possible
  * @jmp_dst: destination info for jump instructions
  * @jump_neg_op: jump instruction has inverted immediate, use ADD instead of SUB
+ * @num_insns_after_br: number of insns following a branch jump, used for fixup
  * @func_id: function id for call instructions
  * @arg1: arg1 for call instructions
  * @arg2: arg2 for call instructions
@@ -319,6 +320,7 @@ struct nfp_insn_meta {
 		struct {
 			struct nfp_insn_meta *jmp_dst;
 			bool jump_neg_op;
+			u32 num_insns_after_br; /* only for BPF-to-BPF calls */
 		};
 		/* function calls */
 		struct {

From 2178f3f0dc200557312e783aa683b87794084ae2 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:55 +0100
Subject: [PATCH 09/12] nfp: bpf: fix return address from register-saving
 subroutine to callee

On performing a BPF-to-BPF call, we first jump to a subroutine that
pushes callee-saved registers (R6~R9) to the stack, and from there we
goes to the start of the callee next. In order to do so, the caller must
pass to the subroutine the address of the NFP instruction to jump to at
the end of that subroutine. This cannot be reliably implemented when
translated the caller, as we do not always know the start offset of the
callee yet.

This patch implement the required fixup step for passing the start
offset in the callee via the register used by the subroutine to hold its
return address.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 28 +++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index e8b03d8f54f78..74423d3e714d2 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -3340,10 +3340,25 @@ static const instr_cb_t instr_cb[256] = {
 };
 
 /* --- Assembler logic --- */
+static int
+nfp_fixup_immed_relo(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
+		     struct nfp_insn_meta *jmp_dst, u32 br_idx)
+{
+	if (immed_get_value(nfp_prog->prog[br_idx + 1])) {
+		pr_err("BUG: failed to fix up callee register saving\n");
+		return -EINVAL;
+	}
+
+	immed_set_value(&nfp_prog->prog[br_idx + 1], jmp_dst->off);
+
+	return 0;
+}
+
 static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 {
 	struct nfp_insn_meta *meta, *jmp_dst;
 	u32 idx, br_idx;
+	int err;
 
 	list_for_each_entry(meta, &nfp_prog->insns, l) {
 		if (meta->skip)
@@ -3380,7 +3395,7 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 
 		/* Leave special branches for later */
 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
-		    RELO_BR_REL)
+		    RELO_BR_REL && !is_mbpf_pseudo_call(meta))
 			continue;
 
 		if (!meta->jmp_dst) {
@@ -3395,6 +3410,17 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 			return -ELOOP;
 		}
 
+		if (is_mbpf_pseudo_call(meta)) {
+			err = nfp_fixup_immed_relo(nfp_prog, meta,
+						   jmp_dst, br_idx);
+			if (err)
+				return err;
+		}
+
+		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
+		    RELO_BR_REL)
+			continue;
+
 		for (idx = meta->off; idx <= br_idx; idx++) {
 			if (!nfp_is_br(nfp_prog->prog[idx]))
 				continue;

From 445496231445aad46866a858a384b428cd073977 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:56 +0100
Subject: [PATCH 10/12] nfp: bpf: optimise save/restore for R6~R9 based on
 register usage

When pre-processing the instructions, it is trivial to detect what
subprograms are using R6, R7, R8 or R9 as destination registers. If a
subprogram uses none of those, then we do not need to jump to the
subroutines dedicated to saving and restoring callee-saved registers in
its prologue and epilogue.

This patch introduces detection of callee-saved registers in subprograms
and prevents the JIT from adding calls to those subroutines whenever we
can: we save some instructions in the translated program, and some time
at runtime on BPF-to-BPF calls and returns.

If no subprogram needs to save those registers, we can avoid appending
the subroutines at the end of the program.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 85 ++++++++++++++-----
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  2 +
 .../net/ethernet/netronome/nfp/bpf/verifier.c | 14 ++-
 3 files changed, 78 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 74423d3e714d2..b393f9dea5841 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -3132,7 +3132,9 @@ bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 			    NFP_CSR_ACT_LM_ADDR0);
 	}
 
-	/* The following steps are performed:
+	/* Two cases for jumping to the callee:
+	 *
+	 * - If callee uses and needs to save R6~R9 then:
 	 *     1. Put the start offset of the callee into imm_b(). This will
 	 *        require a fixup step, as we do not necessarily know this
 	 *        address yet.
@@ -3140,8 +3142,12 @@ bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	 *        register ret_reg().
 	 *     3. (After defer slots are consumed) Jump to the subroutine that
 	 *        pushes the registers to the stack.
-	 * The subroutine acts as a trampoline, and returns to the address in
-	 * imm_b(), i.e. jumps to the callee.
+	 *   The subroutine acts as a trampoline, and returns to the address in
+	 *   imm_b(), i.e. jumps to the callee.
+	 *
+	 * - If callee does not need to save R6~R9 then just load return
+	 *   address to the caller in ret_reg(), and jump to the callee
+	 *   directly.
 	 *
 	 * Using ret_reg() to pass the return address to the callee is set here
 	 * as a convention. The callee can then push this address onto its
@@ -3157,11 +3163,21 @@ bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	 *   execution of the callee, we will not have to push the return
 	 *   address to the stack for leaf functions.
 	 */
-	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
-	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
-		     RELO_BR_GO_CALL_PUSH_REGS);
-	offset_br = nfp_prog_current_offset(nfp_prog);
-	wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
+	if (!meta->jmp_dst) {
+		pr_err("BUG: BPF-to-BPF call has no destination recorded\n");
+		return -ELOOP;
+	}
+	if (nfp_prog->subprog[meta->jmp_dst->subprog_idx].needs_reg_push) {
+		ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
+		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
+			     RELO_BR_GO_CALL_PUSH_REGS);
+		offset_br = nfp_prog_current_offset(nfp_prog);
+		wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
+	} else {
+		ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
+		emit_br(nfp_prog, BR_UNC, meta->n + 1 + meta->insn.imm, 1);
+		offset_br = nfp_prog_current_offset(nfp_prog);
+	}
 	wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);
 
 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
@@ -3227,15 +3243,24 @@ static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 static int
 nfp_subprog_epilogue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
-	/* Pop R6~R9 to the stack via related subroutine.
-	 * Pop return address for BPF-to-BPF call from the stack and load it
-	 * into ret_reg() before we jump. This means that the subroutine does
-	 * not come back here, we make it jump back to the subprogram caller
-	 * directly!
-	 */
-	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
-		     RELO_BR_GO_CALL_POP_REGS);
-	wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+	if (nfp_prog->subprog[meta->subprog_idx].needs_reg_push) {
+		/* Pop R6~R9 to the stack via related subroutine.
+		 * We loaded the return address to the caller into ret_reg().
+		 * This means that the subroutine does not come back here, we
+		 * make it jump back to the subprogram caller directly!
+		 */
+		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
+			     RELO_BR_GO_CALL_POP_REGS);
+		/* Pop return address from the stack. */
+		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+	} else {
+		/* Pop return address from the stack. */
+		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+		/* Jump back to caller if no callee-saved registers were used
+		 * by the subprogram.
+		 */
+		emit_rtn(nfp_prog, ret_reg(nfp_prog), 0);
+	}
 
 	return 0;
 }
@@ -3410,7 +3435,8 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 			return -ELOOP;
 		}
 
-		if (is_mbpf_pseudo_call(meta)) {
+		if (is_mbpf_pseudo_call(meta) &&
+		    nfp_prog->subprog[jmp_dst->subprog_idx].needs_reg_push) {
 			err = nfp_fixup_immed_relo(nfp_prog, meta,
 						   jmp_dst, br_idx);
 			if (err)
@@ -3549,6 +3575,17 @@ static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
 }
 
+static bool nfp_prog_needs_callee_reg_save(struct nfp_prog *nfp_prog)
+{
+	unsigned int idx;
+
+	for (idx = 1; idx < nfp_prog->subprog_cnt; idx++)
+		if (nfp_prog->subprog[idx].needs_reg_push)
+			return true;
+
+	return false;
+}
+
 static void nfp_push_callee_registers(struct nfp_prog *nfp_prog)
 {
 	u8 reg;
@@ -3612,7 +3649,7 @@ static void nfp_outro(struct nfp_prog *nfp_prog)
 		WARN_ON(1);
 	}
 
-	if (nfp_prog->subprog_cnt == 1)
+	if (!nfp_prog_needs_callee_reg_save(nfp_prog))
 		return;
 
 	nfp_push_callee_registers(nfp_prog);
@@ -4354,10 +4391,20 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
 				      nfp_prog->tgt_abort + bv->start_off);
 			break;
 		case RELO_BR_GO_CALL_PUSH_REGS:
+			if (!nfp_prog->tgt_call_push_regs) {
+				pr_err("BUG: failed to detect subprogram registers needs\n");
+				err = -EINVAL;
+				goto err_free_prog;
+			}
 			off = nfp_prog->tgt_call_push_regs + bv->start_off;
 			br_set_offset(&prog[i], off);
 			break;
 		case RELO_BR_GO_CALL_POP_REGS:
+			if (!nfp_prog->tgt_call_pop_regs) {
+				pr_err("BUG: failed to detect subprogram registers needs\n");
+				err = -EINVAL;
+				goto err_free_prog;
+			}
 			off = nfp_prog->tgt_call_pop_regs + bv->start_off;
 			br_set_offset(&prog[i], off);
 			break;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 1cef5136c1984..44b787a0bd4b4 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -452,9 +452,11 @@ static inline bool is_mbpf_pseudo_call(const struct nfp_insn_meta *meta)
 /**
  * struct nfp_bpf_subprog_info - nfp BPF sub-program (a.k.a. function) info
  * @stack_depth:	maximum stack depth used by this sub-program
+ * @needs_reg_push:	whether sub-program uses callee-saved registers
  */
 struct nfp_bpf_subprog_info {
 	u16 stack_depth;
+	u8 needs_reg_push : 1;
 };
 
 /**
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 81a463726d559..f31721bd1fac8 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -644,7 +644,8 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 }
 
 static int
-nfp_assign_subprog_idx(struct bpf_verifier_env *env, struct nfp_prog *nfp_prog)
+nfp_assign_subprog_idx_and_regs(struct bpf_verifier_env *env,
+				struct nfp_prog *nfp_prog)
 {
 	struct nfp_insn_meta *meta;
 	int index = 0;
@@ -653,6 +654,10 @@ nfp_assign_subprog_idx(struct bpf_verifier_env *env, struct nfp_prog *nfp_prog)
 		if (nfp_is_subprog_start(meta))
 			index++;
 		meta->subprog_idx = index;
+
+		if (meta->insn.dst_reg >= BPF_REG_6 &&
+		    meta->insn.dst_reg <= BPF_REG_9)
+			nfp_prog->subprog[index].needs_reg_push = 1;
 	}
 
 	if (index + 1 != nfp_prog->subprog_cnt) {
@@ -734,7 +739,7 @@ static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 	if (!nfp_prog->subprog)
 		return -ENOMEM;
 
-	nfp_assign_subprog_idx(env, nfp_prog);
+	nfp_assign_subprog_idx_and_regs(env, nfp_prog);
 
 	info = env->subprog_info;
 	for (i = 0; i < nfp_prog->subprog_cnt; i++) {
@@ -745,8 +750,9 @@ static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 
 		/* Account for size of return address. */
 		nfp_prog->subprog[i].stack_depth += REG_WIDTH;
-		/* Account for size of saved registers. */
-		nfp_prog->subprog[i].stack_depth += BPF_REG_SIZE * 4;
+		/* Account for size of saved registers, if necessary. */
+		if (nfp_prog->subprog[i].needs_reg_push)
+			nfp_prog->subprog[i].stack_depth += BPF_REG_SIZE * 4;
 	}
 
 	nn = netdev_priv(env->prog->aux->offload->netdev);

From 7ff0ccde43664e3de9fe60edc19466f16cda6b7a Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:57 +0100
Subject: [PATCH 11/12] nfp: bpf: support pointers to other stack frames for
 BPF-to-BPF calls

Mark instructions that use pointers to areas in the stack outside of the
current stack frame, and process them accordingly in mem_op_stack().
This way, we also support BPF-to-BPF calls where the caller passes a
pointer to data in its own stack frame to the callee (typically, when
the caller passes an address to one of its local variables located in
the stack, as an argument).

Thanks to Jakub and Jiong for figuring out how to deal with this case,
I just had to turn their email discussion into this patch.

Suggested-by: Jiong Wang <jiong.wang@netronome.com>
Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c      | 3 ++-
 drivers/net/ethernet/netronome/nfp/bpf/main.h     | 1 +
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 3 +++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index b393f9dea5841..6ed1b5207ecd9 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1178,7 +1178,8 @@ mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	bool lm3 = true;
 	int ret;
 
-	if (meta->ptr_not_const) {
+	if (meta->ptr_not_const ||
+	    meta->flags & FLAG_INSN_PTR_CALLER_STACK_FRAME) {
 		/* Use of the last encountered ptr_off is OK, they all have
 		 * the same alignment.  Depend on low bits of value being
 		 * discarded when written to LMaddr register.
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 44b787a0bd4b4..25e10cfa2678d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -267,6 +267,7 @@ struct nfp_bpf_reg_state {
 
 #define FLAG_INSN_IS_JUMP_DST			BIT(0)
 #define FLAG_INSN_IS_SUBPROG_START		BIT(1)
+#define FLAG_INSN_PTR_CALLER_STACK_FRAME	BIT(2)
 
 /**
  * struct nfp_insn_meta - BPF instruction wrapper
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index f31721bd1fac8..cddb70786a584 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -336,6 +336,9 @@ nfp_bpf_check_stack_access(struct nfp_prog *nfp_prog,
 {
 	s32 old_off, new_off;
 
+	if (reg->frameno != env->cur_state->curframe)
+		meta->flags |= FLAG_INSN_PTR_CALLER_STACK_FRAME;
+
 	if (!tnum_is_const(reg->var_off)) {
 		pr_vlog(env, "variable ptr stack access\n");
 		return -EINVAL;

From e4052d06a5195b29271a7af262711d69f9ecfd04 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:58 +0100
Subject: [PATCH 12/12] bpf: allow offload of programs with BPF-to-BPF function
 calls

Now that there is at least one driver supporting BPF-to-BPF function
calls, lift the restriction, in the verifier, on hardware offload of
eBPF programs containing such calls. But prevent jit_subprogs(), still
in the verifier, from being run for offloaded programs.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a0454cb299bae..73cc136915fe2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1009,10 +1009,6 @@ static int check_subprogs(struct bpf_verifier_env *env)
 			verbose(env, "function calls to other bpf functions are allowed for root only\n");
 			return -EPERM;
 		}
-		if (bpf_prog_is_dev_bound(env->prog->aux)) {
-			verbose(env, "function calls in offloaded programs are not supported yet\n");
-			return -EINVAL;
-		}
 		ret = add_subprog(env, i + insn[i].imm + 1);
 		if (ret < 0)
 			return ret;
@@ -5968,10 +5964,10 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 	struct bpf_insn *insn = prog->insnsi;
 	int i, depth;
 #endif
-	int err;
+	int err = 0;
 
-	err = 0;
-	if (env->prog->jit_requested) {
+	if (env->prog->jit_requested &&
+	    !bpf_prog_is_dev_bound(env->prog->aux)) {
 		err = jit_subprogs(env);
 		if (err == 0)
 			return 0;