Merge branch 'bpf-nfp-mul-div-support'

Jiong Wang says: ==================== NFP supports u16 and u32 multiplication. Multiplication is done 8-bits per step, therefore we need 2 steps for u16 and 4 steps for u32. We also need one start instruction to initialize the sequence and one or two instructions to fetch the result depending on either you need the high halve of u32 multiplication. For ALU64, if either operand is beyond u32's value range, we reject it. One thing to note, if the source operand is BPF_K, then we need to check "imm" field directly, and we'd reject it if it is negative. Because for ALU64, "imm" (with s32 type) is expected to be sign extended to s64 which NFP mul doesn't support. For ALU32, it is fine for "imm" be negative though, because the result is 32-bits and here is no difference on the low halve of result for signed/unsigned mul, so we will get correct result. NFP doesn't have integer divide instruction, this patch set uses reciprocal algorithm (the basic one, reciprocal_div) to emulate it. For each u32 divide, we would need 11 instructions to finish the operation. 7 (for multiplication) + 4 (various ALUs) = 11 Given NFP only supports multiplication no bigger than u32, we'd require divisor and dividend no bigger than that as well. Also eBPF doesn't support signed divide and has enforced this on C language level by failing compilation. However LLVM assembler hasn't enforced this, so it is possible for negative constant to leak in as a BPF_K operand through assembly code, we reject such cases as well. Meanwhile reciprocal_div.h only implemented the basic version of: "Division by Invariant Integers Using Multiplication" - Torbjörn Granlund and Peter L. Montgomery This patch set further implements the optimized version (Figure 4.2 in the paper) inside existing reciprocal_div.h. When the divider is even and the calculated reciprocal magic number doesn't fit u32, we could reduce the required ALU instructions from 4 to 2 or 1 for some cases. The advanced version requires more complex calculation to get the reciprocal multiplier and other control variables, but then could reduce the required emulation operations. It makes sense to use it for JIT divide code generation (for example eBPF JIT backends) for which we are willing to trade performance of JITed code with that of host. v2: - add warning in l == 32 code path. (Song Liu/Jakub) - jit separate insn sequence for l == 32. (Jakub/Edwin) - should use unrestricted operand for mul. - once place should use "1U << exp" instead of "1 << exp". ==================== Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
mariux64 · Jul 6, 2018 · d90c936 · d90c936
2 parents 02000b5 + 9fb410a
commit d90c936
Show file tree

Hide file tree

Showing 7 changed files with 483 additions and 37 deletions.
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -34,10 +34,11 @@
 #define pr_fmt(fmt)	"NFP net bpf: " fmt
 
 #include <linux/bug.h>
-#include <linux/kernel.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/kernel.h>
 #include <linux/pkt_cls.h>
+#include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
 
 #include "main.h"
@@ -415,6 +416,60 @@ emit_alu(struct nfp_prog *nfp_prog, swreg dst,
 		   reg.dst_lmextn, reg.src_lmextn);
 }
 
+static void
+__emit_mul(struct nfp_prog *nfp_prog, enum alu_dst_ab dst_ab, u16 areg,
+	   enum mul_type type, enum mul_step step, u16 breg, bool swap,
+	   bool wr_both, bool dst_lmextn, bool src_lmextn)
+{
+	u64 insn;
+
+	insn = OP_MUL_BASE |
+		FIELD_PREP(OP_MUL_A_SRC, areg) |
+		FIELD_PREP(OP_MUL_B_SRC, breg) |
+		FIELD_PREP(OP_MUL_STEP, step) |
+		FIELD_PREP(OP_MUL_DST_AB, dst_ab) |
+		FIELD_PREP(OP_MUL_SW, swap) |
+		FIELD_PREP(OP_MUL_TYPE, type) |
+		FIELD_PREP(OP_MUL_WR_AB, wr_both) |
+		FIELD_PREP(OP_MUL_SRC_LMEXTN, src_lmextn) |
+		FIELD_PREP(OP_MUL_DST_LMEXTN, dst_lmextn);
+
+	nfp_prog_push(nfp_prog, insn);
+}
+
+static void
+emit_mul(struct nfp_prog *nfp_prog, swreg lreg, enum mul_type type,
+	 enum mul_step step, swreg rreg)
+{
+	struct nfp_insn_ur_regs reg;
+	u16 areg;
+	int err;
+
+	if (type == MUL_TYPE_START && step != MUL_STEP_NONE) {
+		nfp_prog->error = -EINVAL;
+		return;
+	}
+
+	if (step == MUL_LAST || step == MUL_LAST_2) {
+		/* When type is step and step Number is LAST or LAST2, left
+		 * source is used as destination.
+		 */
+		err = swreg_to_unrestricted(lreg, reg_none(), rreg, &reg);
+		areg = reg.dst;
+	} else {
+		err = swreg_to_unrestricted(reg_none(), lreg, rreg, &reg);
+		areg = reg.areg;
+	}
+
+	if (err) {
+		nfp_prog->error = err;
+		return;
+	}
+
+	__emit_mul(nfp_prog, reg.dst_ab, areg, type, step, reg.breg, reg.swap,
+		   reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
+}
+
 static void
 __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
 		u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
@@ -1380,6 +1435,133 @@ static void wrp_end32(struct nfp_prog *nfp_prog, swreg reg_in, u8 gpr_out)
 		      SHF_SC_R_ROT, 16);
 }
 
+static void
+wrp_mul_u32(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
+	    swreg rreg, bool gen_high_half)
+{
+	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
+	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_1, rreg);
+	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_2, rreg);
+	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_3, rreg);
+	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_4, rreg);
+	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_32x32, MUL_LAST, reg_none());
+	if (gen_high_half)
+		emit_mul(nfp_prog, dst_hi, MUL_TYPE_STEP_32x32, MUL_LAST_2,
+			 reg_none());
+	else
+		wrp_immed(nfp_prog, dst_hi, 0);
+}
+
+static void
+wrp_mul_u16(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
+	    swreg rreg)
+{
+	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
+	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_1, rreg);
+	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_2, rreg);
+	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_16x16, MUL_LAST, reg_none());
+}
+
+static int
+wrp_mul(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
+	bool gen_high_half, bool ropnd_from_reg)
+{
+	swreg multiplier, multiplicand, dst_hi, dst_lo;
+	const struct bpf_insn *insn = &meta->insn;
+	u32 lopnd_max, ropnd_max;
+	u8 dst_reg;
+
+	dst_reg = insn->dst_reg;
+	multiplicand = reg_a(dst_reg * 2);
+	dst_hi = reg_both(dst_reg * 2 + 1);
+	dst_lo = reg_both(dst_reg * 2);
+	lopnd_max = meta->umax_dst;
+	if (ropnd_from_reg) {
+		multiplier = reg_b(insn->src_reg * 2);
+		ropnd_max = meta->umax_src;
+	} else {
+		u32 imm = insn->imm;
+
+		multiplier = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
+		ropnd_max = imm;
+	}
+	if (lopnd_max > U16_MAX || ropnd_max > U16_MAX)
+		wrp_mul_u32(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier,
+			    gen_high_half);
+	else
+		wrp_mul_u16(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier);
+
+	return 0;
+}
+
+static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
+{
+	swreg dst_both = reg_both(dst), dst_a = reg_a(dst), dst_b = reg_a(dst);
+	struct reciprocal_value_adv rvalue;
+	u8 pre_shift, exp;
+	swreg magic;
+
+	if (imm > U32_MAX) {
+		wrp_immed(nfp_prog, dst_both, 0);
+		return 0;
+	}
+
+	/* NOTE: because we are using "reciprocal_value_adv" which doesn't
+	 * support "divisor > (1u << 31)", we need to JIT separate NFP sequence
+	 * to handle such case which actually equals to the result of unsigned
+	 * comparison "dst >= imm" which could be calculated using the following
+	 * NFP sequence:
+	 *
+	 *  alu[--, dst, -, imm]
+	 *  immed[imm, 0]
+	 *  alu[dst, imm, +carry, 0]
+	 *
+	 */
+	if (imm > 1U << 31) {
+		swreg tmp_b = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
+
+		emit_alu(nfp_prog, reg_none(), dst_a, ALU_OP_SUB, tmp_b);
+		wrp_immed(nfp_prog, imm_a(nfp_prog), 0);
+		emit_alu(nfp_prog, dst_both, imm_a(nfp_prog), ALU_OP_ADD_C,
+			 reg_imm(0));
+		return 0;
+	}
+
+	rvalue = reciprocal_value_adv(imm, 32);
+	exp = rvalue.exp;
+	if (rvalue.is_wide_m && !(imm & 1)) {
+		pre_shift = fls(imm & -imm) - 1;
+		rvalue = reciprocal_value_adv(imm >> pre_shift, 32 - pre_shift);
+	} else {
+		pre_shift = 0;
+	}
+	magic = ur_load_imm_any(nfp_prog, rvalue.m, imm_b(nfp_prog));
+	if (imm == 1U << exp) {
+		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
+			 SHF_SC_R_SHF, exp);
+	} else if (rvalue.is_wide_m) {
+		wrp_mul_u32(nfp_prog, imm_both(nfp_prog), reg_none(), dst_a,
+			    magic, true);
+		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB,
+			 imm_b(nfp_prog));
+		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
+			 SHF_SC_R_SHF, 1);
+		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD,
+			 imm_b(nfp_prog));
+		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
+			 SHF_SC_R_SHF, rvalue.sh - 1);
+	} else {
+		if (pre_shift)
+			emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
+				 dst_b, SHF_SC_R_SHF, pre_shift);
+		wrp_mul_u32(nfp_prog, dst_both, reg_none(), dst_a, magic, true);
+		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
+			 dst_b, SHF_SC_R_SHF, rvalue.sh);
+	}
+
+	return 0;
+}
+
 static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
@@ -1684,6 +1866,31 @@ static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
+static int mul_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return wrp_mul(nfp_prog, meta, true, true);
+}
+
+static int mul_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return wrp_mul(nfp_prog, meta, true, false);
+}
+
+static int div_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	const struct bpf_insn *insn = &meta->insn;
+
+	return wrp_div_imm(nfp_prog, insn->dst_reg * 2, insn->imm);
+}
+
+static int div_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	/* NOTE: verifier hook has rejected cases for which verifier doesn't
+	 * know whether the source operand is constant or not.
+	 */
+	return wrp_div_imm(nfp_prog, meta->insn.dst_reg * 2, meta->umin_src);
+}
+
 static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
@@ -1772,8 +1979,8 @@ static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	u8 dst, src;
 
 	dst = insn->dst_reg * 2;
-	umin = meta->umin;
-	umax = meta->umax;
+	umin = meta->umin_src;
+	umax = meta->umax_src;
 	if (umin == umax)
 		return __shl_imm64(nfp_prog, dst, umin);
 
@@ -1881,8 +2088,8 @@ static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	u8 dst, src;
 
 	dst = insn->dst_reg * 2;
-	umin = meta->umin;
-	umax = meta->umax;
+	umin = meta->umin_src;
+	umax = meta->umax_src;
 	if (umin == umax)
 		return __shr_imm64(nfp_prog, dst, umin);
 
@@ -1995,8 +2202,8 @@ static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	u8 dst, src;
 
 	dst = insn->dst_reg * 2;
-	umin = meta->umin;
-	umax = meta->umax;
+	umin = meta->umin_src;
+	umax = meta->umax_src;
 	if (umin == umax)
 		return __ashr_imm64(nfp_prog, dst, umin);
 
@@ -2097,6 +2304,26 @@ static int sub_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB, !meta->insn.imm);
 }
 
+static int mul_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return wrp_mul(nfp_prog, meta, false, true);
+}
+
+static int mul_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return wrp_mul(nfp_prog, meta, false, false);
+}
+
+static int div_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return div_reg64(nfp_prog, meta);
+}
+
+static int div_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return div_imm64(nfp_prog, meta);
+}
+
 static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	u8 dst = meta->insn.dst_reg * 2;
@@ -2848,6 +3075,10 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_ALU64 | BPF_ADD | BPF_K] =	add_imm64,
 	[BPF_ALU64 | BPF_SUB | BPF_X] =	sub_reg64,
 	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
+	[BPF_ALU64 | BPF_MUL | BPF_X] =	mul_reg64,
+	[BPF_ALU64 | BPF_MUL | BPF_K] =	mul_imm64,
+	[BPF_ALU64 | BPF_DIV | BPF_X] =	div_reg64,
+	[BPF_ALU64 | BPF_DIV | BPF_K] =	div_imm64,
 	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
 	[BPF_ALU64 | BPF_LSH | BPF_X] =	shl_reg64,
 	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
@@ -2867,6 +3098,10 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_ALU | BPF_ADD | BPF_K] =	add_imm,
 	[BPF_ALU | BPF_SUB | BPF_X] =	sub_reg,
 	[BPF_ALU | BPF_SUB | BPF_K] =	sub_imm,
+	[BPF_ALU | BPF_MUL | BPF_X] =	mul_reg,
+	[BPF_ALU | BPF_MUL | BPF_K] =	mul_imm,
+	[BPF_ALU | BPF_DIV | BPF_X] =	div_reg,
+	[BPF_ALU | BPF_DIV | BPF_K] =	div_imm,
 	[BPF_ALU | BPF_NEG] =		neg_reg,
 	[BPF_ALU | BPF_LSH | BPF_K] =	shl_imm,
 	[BPF_ALU | BPF_END | BPF_X] =	end_reg32,

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -263,8 +263,10 @@ struct nfp_bpf_reg_state {
  * @func_id: function id for call instructions
  * @arg1: arg1 for call instructions
  * @arg2: arg2 for call instructions
- * @umin: copy of core verifier umin_value.
- * @umax: copy of core verifier umax_value.
+ * @umin_src: copy of core verifier umin_value for src opearnd.
+ * @umax_src: copy of core verifier umax_value for src operand.
+ * @umin_dst: copy of core verifier umin_value for dst opearnd.
+ * @umax_dst: copy of core verifier umax_value for dst operand.
  * @off: index of first generated machine instruction (in nfp_prog.prog)
  * @n: eBPF instruction number
  * @flags: eBPF instruction extra optimization flags
@@ -300,12 +302,15 @@ struct nfp_insn_meta {
 			struct bpf_reg_state arg1;
 			struct nfp_bpf_reg_state arg2;
 		};
-		/* We are interested in range info for some operands,
-		 * for example, the shift amount.
+		/* We are interested in range info for operands of ALU
+		 * operations. For example, shift amount, multiplicand and
+		 * multiplier etc.
 		 */
 		struct {
-			u64 umin;
-			u64 umax;
+			u64 umin_src;
+			u64 umax_src;
+			u64 umin_dst;
+			u64 umax_dst;
 		};
 	};
 	unsigned int off;
@@ -339,6 +344,11 @@ static inline u8 mbpf_mode(const struct nfp_insn_meta *meta)
 	return BPF_MODE(meta->insn.code);
 }
 
+static inline bool is_mbpf_alu(const struct nfp_insn_meta *meta)
+{
+	return mbpf_class(meta) == BPF_ALU64 || mbpf_class(meta) == BPF_ALU;
+}
+
 static inline bool is_mbpf_load(const struct nfp_insn_meta *meta)
 {
 	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_LDX | BPF_MEM);
@@ -384,23 +394,14 @@ static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta)
 	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_XADD);
 }
 
-static inline bool is_mbpf_indir_shift(const struct nfp_insn_meta *meta)
+static inline bool is_mbpf_mul(const struct nfp_insn_meta *meta)
 {
-	u8 code = meta->insn.code;
-	bool is_alu, is_shift;
-	u8 opclass, opcode;
-
-	opclass = BPF_CLASS(code);
-	is_alu = opclass == BPF_ALU64 || opclass == BPF_ALU;
-	if (!is_alu)
-		return false;
-
-	opcode = BPF_OP(code);
-	is_shift = opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH;
-	if (!is_shift)
-		return false;
+	return is_mbpf_alu(meta) && mbpf_op(meta) == BPF_MUL;
+}
 
-	return BPF_SRC(code) == BPF_X;
+static inline bool is_mbpf_div(const struct nfp_insn_meta *meta)
+{
+	return is_mbpf_alu(meta) && mbpf_op(meta) == BPF_DIV;
 }
 
 /**