From 199d983bc01513173dd9cc486dbddf4d0e414d42 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Thu, 11 Nov 2021 08:57:07 +0100
Subject: [PATCH 01/96] xsk: Fix crash on double free in buffer pool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a crash in the buffer pool allocator when a buffer is double
freed. It is possible to trigger this behavior not only from a faulty
driver, but also from user space like this: Create a zero-copy AF_XDP
socket. Load an XDP program that will issue XDP_DROP for all
packets. Put the same umem buffer into the fill ring multiple times,
then bind the socket and send some traffic. This will crash the kernel
as the XDP_DROP action triggers one call to xsk_buff_free()/xp_free()
for every packet dropped. Each call will add the corresponding buffer
entry to the free_list and increase the free_list_cnt. Some entries
will have been added multiple times due to the same buffer being
freed. The buffer allocation code will then traverse this broken list
and since the same buffer is in the list multiple times, it will try
to delete the same buffer twice from the list leading to a crash.

The fix for this is just to test that the buffer has not been added
before in xp_free(). If it has been, just return from the function and
do not put it in the free_list a second time.

Note that this bug was not present in the code before the commit
referenced in the Fixes tag. That code used one list entry per
allocated buffer, so multiple frees did not have any side effects. But
the commit below optimized the usage of the pool and only uses a
single entry per buffer in the umem, meaning that multiple
allocations/frees of the same buffer will also only use one entry,
thus leading to the problem.

Fixes: 47e4075df300 ("xsk: Batched buffer allocation for the pool")
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Björn Töpel <bjorn@kernel.org>
Link: https://lore.kernel.org/bpf/20211111075707.21922-1-magnus.karlsson@gmail.com
---
 net/xdp/xsk_buff_pool.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 90c4e1e819d38..bc4ad48ea4f09 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -500,7 +500,7 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
 		pool->free_list_cnt--;
 		xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk,
 					free_list_node);
-		list_del(&xskb->free_list_node);
+		list_del_init(&xskb->free_list_node);
 	}
 
 	xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM;
@@ -568,7 +568,7 @@ static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u3
 	i = nb_entries;
 	while (i--) {
 		xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node);
-		list_del(&xskb->free_list_node);
+		list_del_init(&xskb->free_list_node);
 
 		*xdp = &xskb->xdp;
 		xdp++;
@@ -615,6 +615,9 @@ EXPORT_SYMBOL(xp_can_alloc);
 
 void xp_free(struct xdp_buff_xsk *xskb)
 {
+	if (!list_empty(&xskb->free_list_node))
+		return;
+
 	xskb->pool->free_list_cnt++;
 	list_add(&xskb->free_list_node, &xskb->pool->free_list);
 }

From 34d11a440c6167133201b7374065b59f259730d7 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 10 Nov 2021 09:25:56 -0800
Subject: [PATCH 02/96] bpf: Fix inner map state pruning regression.

Introduction of map_uid made two lookups from outer map to be distinct.
That distinction is only necessary when inner map has an embedded timer.
Otherwise it will make the verifier state pruning to be conservative
which will cause complex programs to hit 1M insn_processed limit.
Tighten map_uid logic to apply to inner maps with timers only.

Fixes: 3e8ce29850f1 ("bpf: Prevent pointer mismatch in bpf_timer_init.")
Reported-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Lorenz Bauer <lmb@cloudflare.com>
Link: https://lore.kernel.org/bpf/CACAyw99hVEJFoiBH_ZGyy=+oO-jyydoz6v1DeKPKs2HVsUH28w@mail.gmail.com
Link: https://lore.kernel.org/bpf/20211110172556.20754-1-alexei.starovoitov@gmail.com
---
 kernel/bpf/verifier.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 890b3ec375a39..aab7482ed1c38 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1151,7 +1151,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
 			/* transfer reg's id which is unique for every map_lookup_elem
 			 * as UID of the inner map.
 			 */
-			reg->map_uid = reg->id;
+			if (map_value_has_timer(map->inner_map_meta))
+				reg->map_uid = reg->id;
 		} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
 			reg->type = PTR_TO_XDP_SOCK;
 		} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||

From 6af2e1237412ca735e3f18f2044902b0c514f2db Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Thu, 11 Nov 2021 16:14:52 +0000
Subject: [PATCH 03/96] selftests/bpf: Check map in map pruning

Ensure that two registers with a map_value loaded from a nested
map are considered equivalent for the purpose of state pruning
and don't cause the verifier to revisit a pruning point.

This uses a rather crude match on the number of insns visited by
the verifier, which might change in the future. I've therefore
tried to keep the code as "unpruneable" as possible by having
the code paths only converge on the second to last instruction.

Should you require to adjust the test in the future, reducing the
number of processed instructions should always be safe. Increasing
them could cause another regression, so proceed with caution.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/CACAyw99hVEJFoiBH_ZGyy=+oO-jyydoz6v1DeKPKs2HVsUH28w@mail.gmail.com
Link: https://lore.kernel.org/bpf/20211111161452.86864-1-lmb@cloudflare.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/verifier/map_in_map.c       | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/map_in_map.c b/tools/testing/selftests/bpf/verifier/map_in_map.c
index 2798927ee9ff4..128a348b762dc 100644
--- a/tools/testing/selftests/bpf/verifier/map_in_map.c
+++ b/tools/testing/selftests/bpf/verifier/map_in_map.c
@@ -18,6 +18,40 @@
 	.fixup_map_in_map = { 3 },
 	.result = ACCEPT,
 },
+{
+	"map in map state pruning",
+	.insns = {
+	BPF_ST_MEM(0, BPF_REG_10, -4, 0),
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -4),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+	BPF_EXIT_INSN(),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 11),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+	BPF_EXIT_INSN(),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+	BPF_EXIT_INSN(),
+	BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_in_map = { 4, 14 },
+	.flags = BPF_F_TEST_STATE_FREQ,
+	.result = VERBOSE_ACCEPT,
+	.errstr = "processed 25 insns",
+	.prog_type = BPF_PROG_TYPE_XDP,
+},
 {
 	"invalid inner map pointer",
 	.insns = {

From dc14ca4644f48b1cfa93631e35c28bdc011ad109 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alexandr.lobakin@intel.com>
Date: Thu, 11 Nov 2021 22:57:03 +0100
Subject: [PATCH 04/96] samples/bpf: Fix summary per-sec stats in
 xdp_sample_user

sample_summary_print() uses accumulated period to calculate and display
per-sec averages. This period gets incremented by sampling interval each
time a new sample is formed, and thus equals to the number of samples
collected multiplied by this interval.

However, the totals are being calculated differently, they receive current
sample statistics already divided by the interval gotten as a difference
between sample timestamps for better precision -- in other words, they are
being incremented by the per-sec values each sample.

This leads to the excessive division of summary per-secs when interval != 1
sec. It is obvious pps couldn't become two times lower just from picking a
different sampling interval value:

  $ samples/bpf/xdp_redirect_cpu -p xdp_prognum_n1_inverse_qnum -c all
    -s -d 6 -i 1
  < snip >
    Packets received    : 2,197,230,321
    Average packets/s   : 22,887,816
    Packets redirected  : 2,197,230,472
    Average redir/s     : 22,887,817
  $ samples/bpf/xdp_redirect_cpu -p xdp_prognum_n1_inverse_qnum -c all
    -s -d 6 -i 2
  < snip >
    Packets received    : 159,566,498
    Average packets/s   : 11,397,607
    Packets redirected  : 159,566,995
    Average redir/s     : 11,397,642

This can be easily fixed by treating the divisor not as a period, but rather
as a total number of samples, and thus incrementing it by 1 instead of
interval. As a nice side effect, we can now remove so-named argument from a
couple of functions. Let us also create an "alias" for sample_output::rx_cnt::pps
named 'num' using a union since this field is used to store this number (period
previously) as well, and the resulting counter-intuitive code might've been a
reason for this bug.

Fixes: 156f886cf697 ("samples: bpf: Add basic infrastructure for XDP samples")
Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Reviewed-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/bpf/20211111215703.690-1-alexandr.lobakin@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/xdp_sample_user.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/samples/bpf/xdp_sample_user.c b/samples/bpf/xdp_sample_user.c
index b32d821781990..8740838e77679 100644
--- a/samples/bpf/xdp_sample_user.c
+++ b/samples/bpf/xdp_sample_user.c
@@ -120,7 +120,10 @@ struct sample_output {
 		__u64 xmit;
 	} totals;
 	struct {
-		__u64 pps;
+		union {
+			__u64 pps;
+			__u64 num;
+		};
 		__u64 drop;
 		__u64 err;
 	} rx_cnt;
@@ -1322,7 +1325,7 @@ int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic,
 
 static void sample_summary_print(void)
 {
-	double period = sample_out.rx_cnt.pps;
+	double num = sample_out.rx_cnt.num;
 
 	if (sample_out.totals.rx) {
 		double pkts = sample_out.totals.rx;
@@ -1330,7 +1333,7 @@ static void sample_summary_print(void)
 		print_always("  Packets received    : %'-10llu\n",
 			     sample_out.totals.rx);
 		print_always("  Average packets/s   : %'-10.0f\n",
-			     sample_round(pkts / period));
+			     sample_round(pkts / num));
 	}
 	if (sample_out.totals.redir) {
 		double pkts = sample_out.totals.redir;
@@ -1338,7 +1341,7 @@ static void sample_summary_print(void)
 		print_always("  Packets redirected  : %'-10llu\n",
 			     sample_out.totals.redir);
 		print_always("  Average redir/s     : %'-10.0f\n",
-			     sample_round(pkts / period));
+			     sample_round(pkts / num));
 	}
 	if (sample_out.totals.drop)
 		print_always("  Rx dropped          : %'-10llu\n",
@@ -1355,7 +1358,7 @@ static void sample_summary_print(void)
 		print_always("  Packets transmitted : %'-10llu\n",
 			     sample_out.totals.xmit);
 		print_always("  Average transmit/s  : %'-10.0f\n",
-			     sample_round(pkts / period));
+			     sample_round(pkts / num));
 	}
 }
 
@@ -1422,7 +1425,7 @@ static int sample_stats_collect(struct stats_record *rec)
 	return 0;
 }
 
-static void sample_summary_update(struct sample_output *out, int interval)
+static void sample_summary_update(struct sample_output *out)
 {
 	sample_out.totals.rx += out->totals.rx;
 	sample_out.totals.redir += out->totals.redir;
@@ -1430,12 +1433,11 @@ static void sample_summary_update(struct sample_output *out, int interval)
 	sample_out.totals.drop_xmit += out->totals.drop_xmit;
 	sample_out.totals.err += out->totals.err;
 	sample_out.totals.xmit += out->totals.xmit;
-	sample_out.rx_cnt.pps += interval;
+	sample_out.rx_cnt.num++;
 }
 
 static void sample_stats_print(int mask, struct stats_record *cur,
-			       struct stats_record *prev, char *prog_name,
-			       int interval)
+			       struct stats_record *prev, char *prog_name)
 {
 	struct sample_output out = {};
 
@@ -1452,7 +1454,7 @@ static void sample_stats_print(int mask, struct stats_record *cur,
 	else if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI)
 		stats_get_devmap_xmit_multi(cur, prev, 0, &out,
 					    mask & SAMPLE_DEVMAP_XMIT_CNT);
-	sample_summary_update(&out, interval);
+	sample_summary_update(&out);
 
 	stats_print(prog_name, mask, cur, prev, &out);
 }
@@ -1495,7 +1497,7 @@ static void swap(struct stats_record **a, struct stats_record **b)
 }
 
 static int sample_timer_cb(int timerfd, struct stats_record **rec,
-			   struct stats_record **prev, int interval)
+			   struct stats_record **prev)
 {
 	char line[64] = "Summary";
 	int ret;
@@ -1524,7 +1526,7 @@ static int sample_timer_cb(int timerfd, struct stats_record **rec,
 		snprintf(line, sizeof(line), "%s->%s", f ?: "?", t ?: "?");
 	}
 
-	sample_stats_print(sample_mask, *rec, *prev, line, interval);
+	sample_stats_print(sample_mask, *rec, *prev, line);
 	return 0;
 }
 
@@ -1579,7 +1581,7 @@ int sample_run(int interval, void (*post_cb)(void *), void *ctx)
 		if (pfd[0].revents & POLLIN)
 			ret = sample_signal_cb();
 		else if (pfd[1].revents & POLLIN)
-			ret = sample_timer_cb(timerfd, &rec, &prev, interval);
+			ret = sample_timer_cb(timerfd, &rec, &prev);
 
 		if (ret)
 			break;

From e4ac80ef8198636a23866a59575917550328886f Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Fri, 12 Nov 2021 15:51:30 +0000
Subject: [PATCH 05/96] tools/runqslower: Fix cross-build

Commit be79505caf3f ("tools/runqslower: Install libbpf headers when
building") uses the target libbpf to build the host bpftool, which
doesn't work when cross-building:

  make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -C tools/bpf/runqslower O=/tmp/runqslower
  ...
    LINK    /tmp/runqslower/bpftool/bpftool
  /usr/bin/ld: /tmp/runqslower/libbpf/libbpf.a(libbpf-in.o): Relocations in generic ELF (EM: 183)
  /usr/bin/ld: /tmp/runqslower/libbpf/libbpf.a: error adding symbols: file in wrong format
  collect2: error: ld returned 1 exit status

When cross-building, the target architecture differs from the host. The
bpftool used for building runqslower is executed on the host, and thus
must use a different libbpf than that used for runqslower itself.
Remove the LIBBPF_OUTPUT and LIBBPF_DESTDIR parameters, so the bpftool
build makes its own library if necessary.

In the selftests, pass the host bpftool, already a prerequisite for the
runqslower recipe, as BPFTOOL_OUTPUT. The runqslower Makefile will use
the bpftool that's already built for selftests instead of making a new
one.

Fixes: be79505caf3f ("tools/runqslower: Install libbpf headers when building")
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/20211112155128.565680-1-jean-philippe@linaro.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/bpf/runqslower/Makefile        | 3 +--
 tools/testing/selftests/bpf/Makefile | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile
index bbd1150578f7a..8791d0e2762b9 100644
--- a/tools/bpf/runqslower/Makefile
+++ b/tools/bpf/runqslower/Makefile
@@ -88,5 +88,4 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OU
 
 $(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BPFTOOL_OUTPUT)
 	$(Q)$(MAKE) $(submake_extras) -C ../bpftool OUTPUT=$(BPFTOOL_OUTPUT)   \
-		    LIBBPF_OUTPUT=$(BPFOBJ_OUTPUT)			       \
-		    LIBBPF_DESTDIR=$(BPF_DESTDIR) CC=$(HOSTCC) LD=$(HOSTLD)
+		    CC=$(HOSTCC) LD=$(HOSTLD)
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 54b0a41a37750..62fafbeb46723 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -187,7 +187,7 @@ DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool
 $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT)
 	$(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower	       \
 		    OUTPUT=$(RUNQSLOWER_OUTPUT) VMLINUX_BTF=$(VMLINUX_BTF)     \
-		    BPFTOOL_OUTPUT=$(BUILD_DIR)/bpftool/		       \
+		    BPFTOOL_OUTPUT=$(HOST_BUILD_DIR)/bpftool/		       \
 		    BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf			       \
 		    BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) &&	       \
 		    cp $(RUNQSLOWER_OUTPUT)runqslower $@

From 2453afe3845523d9dfe89dbfb3d71abfa095e260 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 12 Nov 2021 07:33:01 +0530
Subject: [PATCH 06/96] samples/bpf: Fix incorrect use of strlen in
 xdp_redirect_cpu

Commit b599015f044d ("samples/bpf: Fix application of sizeof to pointer")
tried to fix a bug where sizeof was incorrectly applied to a pointer instead
of the array string was being copied to, to find the destination buffer size,
but ended up using strlen, which is still incorrect. However, on closer look
ifname_buf has no other use, hence directly use optarg.

Fixes: b599015f044d ("samples/bpf: Fix application of sizeof to pointer")
Fixes: e531a220cc59 ("samples: bpf: Convert xdp_redirect_cpu to XDP samples helper")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Tested-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Link: https://lore.kernel.org/bpf/20211112020301.528357-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/xdp_redirect_cpu_user.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c
index d84e6949007cc..a81704d3317ba 100644
--- a/samples/bpf/xdp_redirect_cpu_user.c
+++ b/samples/bpf/xdp_redirect_cpu_user.c
@@ -309,7 +309,6 @@ int main(int argc, char **argv)
 	const char *mprog_filename = NULL, *mprog_name = NULL;
 	struct xdp_redirect_cpu *skel;
 	struct bpf_map_info info = {};
-	char ifname_buf[IF_NAMESIZE];
 	struct bpf_cpumap_val value;
 	__u32 infosz = sizeof(info);
 	int ret = EXIT_FAIL_OPTION;
@@ -390,10 +389,10 @@ int main(int argc, char **argv)
 		case 'd':
 			if (strlen(optarg) >= IF_NAMESIZE) {
 				fprintf(stderr, "-d/--dev name too long\n");
+				usage(argv, long_options, __doc__, mask, true, skel->obj);
 				goto end_cpu;
 			}
-			safe_strncpy(ifname_buf, optarg, strlen(ifname_buf));
-			ifindex = if_nametoindex(ifname_buf);
+			ifindex = if_nametoindex(optarg);
 			if (!ifindex)
 				ifindex = strtoul(optarg, NULL, 0);
 			if (!ifindex) {

From ba05fd36b8512d6aeefe9c2c5b6a25b726c4bfff Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 13 Nov 2021 04:50:22 +0530
Subject: [PATCH 07/96] libbpf: Perform map fd cleanup for gen_loader in case
 of error

Alexei reported a fd leak issue in gen loader (when invoked from
bpftool) [0]. When adding ksym support, map fd allocation was moved from
stack to loader map, however I missed closing these fds (relevant when
cleanup label is jumped to on error). For the success case, the
allocated fd is returned in loader ctx, hence this problem is not
noticed.

Make three changes, first MAX_USED_MAPS in MAX_FD_ARRAY_SZ instead of
MAX_USED_PROGS, the braino was not a problem until now for this case as
we didn't try to close map fds (otherwise use of it would have tried
closing 32 additional fds in ksym btf fd range). Then, do a cleanup for
all nr_maps fds in cleanup label code, so that in case of error all
temporary map fds from bpf_gen__map_create are closed.

Then, adjust the cleanup label to only generate code for the required
number of program and map fds.  To trim code for remaining program
fds, lay out prog_fd array in stack in the end, so that we can
directly skip the remaining instances.  Still stack size remains same,
since changing that would require changes in a lot of places
(including adjustment of stack_off macro), so nr_progs_sz variable is
only used to track required number of iterations (and jump over
cleanup size calculated from that), stack offset calculation remains
unaffected.

The difference for test_ksyms_module.o is as follows:
libbpf: //prog cleanup iterations: before = 34, after = 5
libbpf: //maps cleanup iterations: before = 64, after = 2

Also, move allocation of gen->fd_array offset to bpf_gen__init. Since
offset can now be 0, and we already continue even if add_data returns 0
in case of failure, we do not need to distinguish between 0 offset and
failure case 0, as we rely on bpf_gen__finish to check errors. We can
also skip check for gen->fd_array in add_*_fd functions, since
bpf_gen__init will take care of it.

  [0]: https://lore.kernel.org/bpf/CAADnVQJ6jSitKSNKyxOrUzwY2qDRX0sPkJ=VLGHuCLVJ=qOt9g@mail.gmail.com

Fixes: 18f4fccbf314 ("libbpf: Update gen_loader to emit BTF_KIND_FUNC relocations")
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211112232022.899074-1-memxor@gmail.com
---
 tools/lib/bpf/bpf_gen_internal.h |  4 +--
 tools/lib/bpf/gen_loader.c       | 47 ++++++++++++++++++++------------
 tools/lib/bpf/libbpf.c           |  4 +--
 3 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h
index d26e5472fe501..6f3df004479b3 100644
--- a/tools/lib/bpf/bpf_gen_internal.h
+++ b/tools/lib/bpf/bpf_gen_internal.h
@@ -45,8 +45,8 @@ struct bpf_gen {
 	int nr_fd_array;
 };
 
-void bpf_gen__init(struct bpf_gen *gen, int log_level);
-int bpf_gen__finish(struct bpf_gen *gen);
+void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps);
+int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps);
 void bpf_gen__free(struct bpf_gen *gen);
 void bpf_gen__load_btf(struct bpf_gen *gen, const void *raw_data, __u32 raw_size);
 void bpf_gen__map_create(struct bpf_gen *gen, struct bpf_create_map_params *map_attr, int map_idx);
diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
index 502dea53a742d..9934851ccde76 100644
--- a/tools/lib/bpf/gen_loader.c
+++ b/tools/lib/bpf/gen_loader.c
@@ -18,7 +18,7 @@
 #define MAX_USED_MAPS	64
 #define MAX_USED_PROGS	32
 #define MAX_KFUNC_DESCS 256
-#define MAX_FD_ARRAY_SZ (MAX_USED_PROGS + MAX_KFUNC_DESCS)
+#define MAX_FD_ARRAY_SZ (MAX_USED_MAPS + MAX_KFUNC_DESCS)
 
 /* The following structure describes the stack layout of the loader program.
  * In addition R6 contains the pointer to context.
@@ -33,8 +33,8 @@
  */
 struct loader_stack {
 	__u32 btf_fd;
-	__u32 prog_fd[MAX_USED_PROGS];
 	__u32 inner_map_fd;
+	__u32 prog_fd[MAX_USED_PROGS];
 };
 
 #define stack_off(field) \
@@ -42,6 +42,11 @@ struct loader_stack {
 
 #define attr_field(attr, field) (attr + offsetof(union bpf_attr, field))
 
+static int blob_fd_array_off(struct bpf_gen *gen, int index)
+{
+	return gen->fd_array + index * sizeof(int);
+}
+
 static int realloc_insn_buf(struct bpf_gen *gen, __u32 size)
 {
 	size_t off = gen->insn_cur - gen->insn_start;
@@ -102,11 +107,15 @@ static void emit2(struct bpf_gen *gen, struct bpf_insn insn1, struct bpf_insn in
 	emit(gen, insn2);
 }
 
-void bpf_gen__init(struct bpf_gen *gen, int log_level)
+static int add_data(struct bpf_gen *gen, const void *data, __u32 size);
+static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off);
+
+void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps)
 {
-	size_t stack_sz = sizeof(struct loader_stack);
+	size_t stack_sz = sizeof(struct loader_stack), nr_progs_sz;
 	int i;
 
+	gen->fd_array = add_data(gen, NULL, MAX_FD_ARRAY_SZ * sizeof(int));
 	gen->log_level = log_level;
 	/* save ctx pointer into R6 */
 	emit(gen, BPF_MOV64_REG(BPF_REG_6, BPF_REG_1));
@@ -118,19 +127,27 @@ void bpf_gen__init(struct bpf_gen *gen, int log_level)
 	emit(gen, BPF_MOV64_IMM(BPF_REG_3, 0));
 	emit(gen, BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel));
 
+	/* amount of stack actually used, only used to calculate iterations, not stack offset */
+	nr_progs_sz = offsetof(struct loader_stack, prog_fd[nr_progs]);
 	/* jump over cleanup code */
 	emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0,
-			      /* size of cleanup code below */
-			      (stack_sz / 4) * 3 + 2));
+			      /* size of cleanup code below (including map fd cleanup) */
+			      (nr_progs_sz / 4) * 3 + 2 +
+			      /* 6 insns for emit_sys_close_blob,
+			       * 6 insns for debug_regs in emit_sys_close_blob
+			       */
+			      nr_maps * (6 + (gen->log_level ? 6 : 0))));
 
 	/* remember the label where all error branches will jump to */
 	gen->cleanup_label = gen->insn_cur - gen->insn_start;
 	/* emit cleanup code: close all temp FDs */
-	for (i = 0; i < stack_sz; i += 4) {
+	for (i = 0; i < nr_progs_sz; i += 4) {
 		emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, -stack_sz + i));
 		emit(gen, BPF_JMP_IMM(BPF_JSLE, BPF_REG_1, 0, 1));
 		emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_close));
 	}
+	for (i = 0; i < nr_maps; i++)
+		emit_sys_close_blob(gen, blob_fd_array_off(gen, i));
 	/* R7 contains the error code from sys_bpf. Copy it into R0 and exit. */
 	emit(gen, BPF_MOV64_REG(BPF_REG_0, BPF_REG_7));
 	emit(gen, BPF_EXIT_INSN());
@@ -160,8 +177,6 @@ static int add_data(struct bpf_gen *gen, const void *data, __u32 size)
  */
 static int add_map_fd(struct bpf_gen *gen)
 {
-	if (!gen->fd_array)
-		gen->fd_array = add_data(gen, NULL, MAX_FD_ARRAY_SZ * sizeof(int));
 	if (gen->nr_maps == MAX_USED_MAPS) {
 		pr_warn("Total maps exceeds %d\n", MAX_USED_MAPS);
 		gen->error = -E2BIG;
@@ -174,8 +189,6 @@ static int add_kfunc_btf_fd(struct bpf_gen *gen)
 {
 	int cur;
 
-	if (!gen->fd_array)
-		gen->fd_array = add_data(gen, NULL, MAX_FD_ARRAY_SZ * sizeof(int));
 	if (gen->nr_fd_array == MAX_KFUNC_DESCS) {
 		cur = add_data(gen, NULL, sizeof(int));
 		return (cur - gen->fd_array) / sizeof(int);
@@ -183,11 +196,6 @@ static int add_kfunc_btf_fd(struct bpf_gen *gen)
 	return MAX_USED_MAPS + gen->nr_fd_array++;
 }
 
-static int blob_fd_array_off(struct bpf_gen *gen, int index)
-{
-	return gen->fd_array + index * sizeof(int);
-}
-
 static int insn_bytes_to_bpf_size(__u32 sz)
 {
 	switch (sz) {
@@ -359,10 +367,15 @@ static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off)
 	__emit_sys_close(gen);
 }
 
-int bpf_gen__finish(struct bpf_gen *gen)
+int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps)
 {
 	int i;
 
+	if (nr_progs != gen->nr_progs || nr_maps != gen->nr_maps) {
+		pr_warn("progs/maps mismatch\n");
+		gen->error = -EFAULT;
+		return gen->error;
+	}
 	emit_sys_close_stack(gen, stack_off(btf_fd));
 	for (i = 0; i < gen->nr_progs; i++)
 		move_stack2ctx(gen,
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index a1bea1953df67..7c74342bb6680 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -7258,7 +7258,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
 	}
 
 	if (obj->gen_loader)
-		bpf_gen__init(obj->gen_loader, attr->log_level);
+		bpf_gen__init(obj->gen_loader, attr->log_level, obj->nr_programs, obj->nr_maps);
 
 	err = bpf_object__probe_loading(obj);
 	err = err ? : bpf_object__load_vmlinux_btf(obj, false);
@@ -7277,7 +7277,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
 		for (i = 0; i < obj->nr_maps; i++)
 			obj->maps[i].fd = -1;
 		if (!err)
-			err = bpf_gen__finish(obj->gen_loader);
+			err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps);
 	}
 
 	/* clean up fd_array */

From 81b1d548d00bcd028303c4f3150fa753b9b8aa71 Mon Sep 17 00:00:00 2001
From: Lin Ma <linma@zju.edu.cn>
Date: Thu, 11 Nov 2021 22:14:02 +0800
Subject: [PATCH 08/96] hamradio: remove needs_free_netdev to avoid UAF

The former patch "defer 6pack kfree after unregister_netdev" reorders
the kfree of two buffer after the unregister_netdev to prevent the race
condition. It also adds free_netdev() function in sixpack_close(), which
is a direct copy from the similar code in mkiss_close().

However, in sixpack driver, the flag needs_free_netdev is set to true in
sp_setup(), hence the unregister_netdev() will free the netdev
automatically. Therefore, as the sp is netdev_priv, use-after-free
occurs.

This patch removes the needs_free_netdev = true and just let the
free_netdev to finish this deallocation task.

Fixes: 0b9111922b1f ("hamradio: defer 6pack kfree after unregister_netdev")
Signed-off-by: Lin Ma <linma@zju.edu.cn>
Link: https://lore.kernel.org/r/20211111141402.7551-1-linma@zju.edu.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/hamradio/6pack.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index bfdf89e54752c..8a19a06b505d1 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -306,7 +306,6 @@ static void sp_setup(struct net_device *dev)
 {
 	/* Finish setting up the DEVICE info. */
 	dev->netdev_ops		= &sp_netdev_ops;
-	dev->needs_free_netdev	= true;
 	dev->mtu		= SIXP_MTU;
 	dev->hard_header_len	= AX25_MAX_HEADER_LEN;
 	dev->header_ops 	= &ax25_header_ops;

From 87530779de0440f1ca3cd727299b1a78a9e564d0 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 11 Nov 2021 07:50:34 -0800
Subject: [PATCH 09/96] ptp: ptp_clockmatrix: repair non-kernel-doc comment

Do not use "/**" to begin a comment that is not in kernel-doc format.

Prevents this docs build warning:

drivers/ptp/ptp_clockmatrix.c:1679: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
    * Maximum absolute value for write phase offset in picoseconds

Then remove the kernel-doc-like function parameter descriptions
since they don't add any useful info. (suggested by Jakub)

Fixes: 794c3dffacc16 ("ptp: ptp_clockmatrix: Add support for FW 5.2 (8A34005)")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: kernel test robot <lkp@intel.com>
Cc: Min Li <min.li.xe@renesas.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Link: https://lore.kernel.org/r/20211111155034.29153-1-rdunlap@infradead.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/ptp/ptp_clockmatrix.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/ptp/ptp_clockmatrix.c b/drivers/ptp/ptp_clockmatrix.c
index 6bc5791a7ec5b..08e429a069221 100644
--- a/drivers/ptp/ptp_clockmatrix.c
+++ b/drivers/ptp/ptp_clockmatrix.c
@@ -1699,12 +1699,9 @@ static int initialize_dco_operating_mode(struct idtcm_channel *channel)
 
 /* PTP Hardware Clock interface */
 
-/**
+/*
  * Maximum absolute value for write phase offset in picoseconds
  *
- * @channel:  channel
- * @delta_ns: delta in nanoseconds
- *
  * Destination signed register is 32-bit register in resolution of 50ps
  *
  * 0x7fffffff * 50 =  2147483647 * 50 = 107374182350

From 0cda7d4bac5fd29dceb13df26083333fa99d6bb4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 11 Nov 2021 08:29:29 -0800
Subject: [PATCH 10/96] selftests: net: switch to socat in the GSO GRE test

Commit a985442fdecb ("selftests: net: properly support IPv6 in GSO GRE test")
is not compatible with:

  Ncat: Version 7.80 ( https://nmap.org/ncat )

(which is distributed with Fedora/Red Hat), tests fail with:

  nc: invalid option -- 'N'

Let's switch to socat which is far more dependable.

Fixes: 025efa0a82df ("selftests: add simple GSO GRE test")
Fixes: a985442fdecb ("selftests: net: properly support IPv6 in GSO GRE test")
Tested-by: Andrea Righi <andrea.righi@canonical.com>
Link: https://lore.kernel.org/r/20211111162929.530470-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/gre_gso.sh | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/net/gre_gso.sh b/tools/testing/selftests/net/gre_gso.sh
index fdeb44d621eb9..3224651db97b8 100755
--- a/tools/testing/selftests/net/gre_gso.sh
+++ b/tools/testing/selftests/net/gre_gso.sh
@@ -118,16 +118,18 @@ gre_gst_test_checks()
 	local addr=$2
 	local proto=$3
 
-	$NS_EXEC nc $proto -kl $port >/dev/null &
+	[ "$proto" == 6 ] && addr="[$addr]"
+
+	$NS_EXEC socat - tcp${proto}-listen:$port,reuseaddr,fork >/dev/null &
 	PID=$!
 	while ! $NS_EXEC ss -ltn | grep -q $port; do ((i++)); sleep 0.01; done
 
-	cat $TMPFILE | timeout 1 nc $proto -N $addr $port
+	cat $TMPFILE | timeout 1 socat -u STDIN TCP:$addr:$port
 	log_test $? 0 "$name - copy file w/ TSO"
 
 	ethtool -K veth0 tso off
 
-	cat $TMPFILE | timeout 1 nc $proto -N $addr $port
+	cat $TMPFILE | timeout 1 socat -u STDIN TCP:$addr:$port
 	log_test $? 0 "$name - copy file w/ GSO"
 
 	ethtool -K veth0 tso on
@@ -155,8 +157,8 @@ gre6_gso_test()
 
 	sleep 2
 
-	gre_gst_test_checks GREv6/v4 172.16.2.2
-	gre_gst_test_checks GREv6/v6 2001:db8:1::2 -6
+	gre_gst_test_checks GREv6/v4 172.16.2.2 4
+	gre_gst_test_checks GREv6/v6 2001:db8:1::2 6
 
 	cleanup
 }
@@ -212,8 +214,8 @@ if [ ! -x "$(command -v ip)" ]; then
 	exit $ksft_skip
 fi
 
-if [ ! -x "$(command -v nc)" ]; then
-	echo "SKIP: Could not run test without nc tool"
+if [ ! -x "$(command -v socat)" ]; then
+	echo "SKIP: Could not run test without socat tool"
 	exit $ksft_skip
 fi
 

From 27df68d579c67ef6c39a5047559b6a7c08c96219 Mon Sep 17 00:00:00 2001
From: Konrad Dybcio <konrad.dybcio@somainline.org>
Date: Thu, 11 Nov 2021 19:37:24 +0100
Subject: [PATCH 11/96] net/ipa: ipa_resource: Fix wrong for loop range

The source group count was mistakenly assigned to both dst and src loops.
Fix it to make IPA probe and work again.

Fixes: 4fd704b3608a ("net: ipa: record number of groups in data")
Acked-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@somainline.org>
Reviewed-by: Marijn Suijten <marijn.suijten@somainline.org>
Signed-off-by: Konrad Dybcio <konrad.dybcio@somainline.org>
Reviewed-by: Alex Elder <elder@linaro.org>
Link: https://lore.kernel.org/r/20211111183724.593478-1-konrad.dybcio@somainline.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ipa/ipa_resource.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ipa/ipa_resource.c b/drivers/net/ipa/ipa_resource.c
index e3da95d694099..06cec71993823 100644
--- a/drivers/net/ipa/ipa_resource.c
+++ b/drivers/net/ipa/ipa_resource.c
@@ -52,7 +52,7 @@ static bool ipa_resource_limits_valid(struct ipa *ipa,
 				return false;
 	}
 
-	group_count = data->rsrc_group_src_count;
+	group_count = data->rsrc_group_dst_count;
 	if (!group_count || group_count > IPA_RESOURCE_GROUP_MAX)
 		return false;
 

From aae458725412332825f31121a5feb8fd887cac5a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 11 Nov 2021 13:08:24 -0800
Subject: [PATCH 12/96] ethernet: sis900: fix indentation

A space has snuck in.

Reported-by: kernel test robot <lkp@intel.com>
Fixes: 74fad215ee3d ("ethernet: sis900: use eth_hw_addr_set()")
Link: https://lore.kernel.org/r/20211111210824.676201-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/sis/sis900.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sis/sis900.c b/drivers/net/ethernet/sis/sis900.c
index cc2d907c4c4bc..23a336c5096ed 100644
--- a/drivers/net/ethernet/sis/sis900.c
+++ b/drivers/net/ethernet/sis/sis900.c
@@ -392,7 +392,7 @@ static int sis96x_get_mac_addr(struct pci_dev *pci_dev,
 			/* get MAC address from EEPROM */
 			for (i = 0; i < 3; i++)
 			        addr[i] = read_eeprom(ioaddr, i + EEPROMMACAddr);
-			 eth_hw_addr_set(net_dev, (u8 *)addr);
+			eth_hw_addr_set(net_dev, (u8 *)addr);
 
 			rc = 1;
 			break;

From 70701b83e208767f2720d8cd3e6a62cddafb3a30 Mon Sep 17 00:00:00 2001
From: Arjun Roy <arjunroy@google.com>
Date: Thu, 11 Nov 2021 15:52:15 -0800
Subject: [PATCH 13/96] tcp: Fix uninitialized access in skb frags array for Rx
 0cp.

TCP Receive zerocopy iterates through the SKB queue via
tcp_recv_skb(), acquiring a pointer to an SKB and an offset within
that SKB to read from. From there, it iterates the SKB frags array to
determine which offset to start remapping pages from.

However, this is built on the assumption that the offset read so far
within the SKB is smaller than the SKB length. If this assumption is
violated, we can attempt to read an invalid frags array element, which
would cause a fault.

tcp_recv_skb() can cause such an SKB to be returned when the TCP FIN
flag is set. Therefore, we must guard against this occurrence inside
skb_advance_frag().

One way that we can reproduce this error follows:
1) In a receiver program, call getsockopt(TCP_ZEROCOPY_RECEIVE) with:
char some_array[32 * 1024];
struct tcp_zerocopy_receive zc = {
  .copybuf_address  = (__u64) &some_array[0],
  .copybuf_len = 32 * 1024,
};

2) In a sender program, after a TCP handshake, send the following
sequence of packets:
  i) Seq = [X, X+4000]
  ii) Seq = [X+4000, X+5000]
  iii) Seq = [X+4000, X+5000], Flags = FIN | URG, urgptr=1000

(This can happen without URG, if we have a signal pending, but URG is
a convenient way to reproduce the behaviour).

In this case, the following event sequence will occur on the receiver:

tcp_zerocopy_receive():
-> receive_fallback_to_copy() // copybuf_len >= inq
-> tcp_recvmsg_locked() // reads 5000 bytes, then breaks due to URG
-> tcp_recv_skb() // yields skb with skb->len == offset
-> tcp_zerocopy_set_hint_for_skb()
-> skb_advance_to_frag() // will returns a frags ptr. >= nr_frags
-> find_next_mappable_frag() // will dereference this bad frags ptr.

With this patch, skb_advance_to_frag() will no longer return an
invalid frags pointer, and will return NULL instead, fixing the issue.

Signed-off-by: Arjun Roy <arjunroy@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Fixes: 05255b823a61 ("tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive")
Link: https://lore.kernel.org/r/20211111235215.2605384-1-arjunroy.kdev@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/tcp.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b7796b4cf0a09..bbb3d39c69afc 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1758,6 +1758,9 @@ static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
 {
 	skb_frag_t *frag;
 
+	if (unlikely(offset_skb >= skb->len))
+		return NULL;
+
 	offset_skb -= skb_headlen(skb);
 	if ((int)offset_skb < 0 || skb_has_frag_list(skb))
 		return NULL;

From 1aa3b2207e889a948049c9a8016cedb0218c2389 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 12 Nov 2021 18:18:10 -0500
Subject: [PATCH 14/96] net,lsm,selinux: revert the
 security_sctp_assoc_established() hook

This patch reverts two prior patches, e7310c94024c
("security: implement sctp_assoc_established hook in selinux") and
7c2ef0240e6a ("security: add sctp_assoc_established hook"), which
create the security_sctp_assoc_established() LSM hook and provide a
SELinux implementation.  Unfortunately these two patches were merged
without proper review (the Reviewed-by and Tested-by tags from
Richard Haines were for previous revisions of these patches that
were significantly different) and there are outstanding objections
from the SELinux maintainers regarding these patches.

Work is currently ongoing to correct the problems identified in the
reverted patches, as well as others that have come up during review,
but it is unclear at this point in time when that work will be ready
for inclusion in the mainline kernel.  In the interest of not keeping
objectionable code in the kernel for multiple weeks, and potentially
a kernel release, we are reverting the two problematic patches.

Signed-off-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/security/SCTP.rst | 22 ++++++++++++----------
 include/linux/lsm_hook_defs.h   |  2 --
 include/linux/lsm_hooks.h       |  5 -----
 include/linux/security.h        |  7 -------
 net/sctp/sm_statefuns.c         |  2 +-
 security/security.c             |  7 -------
 security/selinux/hooks.c        | 14 +-------------
 7 files changed, 14 insertions(+), 45 deletions(-)

diff --git a/Documentation/security/SCTP.rst b/Documentation/security/SCTP.rst
index 406cc68b88087..d5fd6ccc3dcbd 100644
--- a/Documentation/security/SCTP.rst
+++ b/Documentation/security/SCTP.rst
@@ -15,7 +15,10 @@ For security module support, three SCTP specific hooks have been implemented::
     security_sctp_assoc_request()
     security_sctp_bind_connect()
     security_sctp_sk_clone()
-    security_sctp_assoc_established()
+
+Also the following security hook has been utilised::
+
+    security_inet_conn_established()
 
 The usage of these hooks are described below with the SELinux implementation
 described in the `SCTP SELinux Support`_ chapter.
@@ -119,12 +122,11 @@ calls **sctp_peeloff**\(3).
     @newsk - pointer to new sock structure.
 
 
-security_sctp_assoc_established()
+security_inet_conn_established()
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Called when a COOKIE ACK is received, and the peer secid will be
-saved into ``@asoc->peer_secid`` for client::
+Called when a COOKIE ACK is received::
 
-    @asoc - pointer to sctp association structure.
+    @sk  - pointer to sock structure.
     @skb - pointer to skbuff of the COOKIE ACK packet.
 
 
@@ -132,7 +134,7 @@ Security Hooks used for Association Establishment
 -------------------------------------------------
 
 The following diagram shows the use of ``security_sctp_bind_connect()``,
-``security_sctp_assoc_request()``, ``security_sctp_assoc_established()`` when
+``security_sctp_assoc_request()``, ``security_inet_conn_established()`` when
 establishing an association.
 ::
 
@@ -170,7 +172,7 @@ establishing an association.
           <------------------------------------------- COOKIE ACK
           |                                               |
     sctp_sf_do_5_1E_ca                                    |
- Call security_sctp_assoc_established()                   |
+ Call security_inet_conn_established()                    |
  to set the peer label.                                   |
           |                                               |
           |                               If SCTP_SOCKET_TCP or peeled off
@@ -196,7 +198,7 @@ hooks with the SELinux specifics expanded below::
     security_sctp_assoc_request()
     security_sctp_bind_connect()
     security_sctp_sk_clone()
-    security_sctp_assoc_established()
+    security_inet_conn_established()
 
 
 security_sctp_assoc_request()
@@ -269,12 +271,12 @@ sockets sid and peer sid to that contained in the ``@asoc sid`` and
     @newsk - pointer to new sock structure.
 
 
-security_sctp_assoc_established()
+security_inet_conn_established()
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Called when a COOKIE ACK is received where it sets the connection's peer sid
 to that in ``@skb``::
 
-    @asoc - pointer to sctp association structure.
+    @sk  - pointer to sock structure.
     @skb - pointer to skbuff of the COOKIE ACK packet.
 
 
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 442a611fa0fbc..df8de62f4710f 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -335,8 +335,6 @@ LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname,
 	 struct sockaddr *address, int addrlen)
 LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc,
 	 struct sock *sk, struct sock *newsk)
-LSM_HOOK(void, LSM_RET_VOID, sctp_assoc_established, struct sctp_association *asoc,
-	 struct sk_buff *skb)
 #endif /* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_INFINIBAND
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index d6823214d5c1e..d45b6f6e27fda 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1050,11 +1050,6 @@
  *	@asoc pointer to current sctp association structure.
  *	@sk pointer to current sock structure.
  *	@newsk pointer to new sock structure.
- * @sctp_assoc_established:
- *	Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet
- *	to the security module.
- *	@asoc pointer to sctp association structure.
- *	@skb pointer to skbuff of association packet.
  *
  * Security hooks for Infiniband
  *
diff --git a/include/linux/security.h b/include/linux/security.h
index 06eac4e61a135..bbf44a4668326 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1430,8 +1430,6 @@ int security_sctp_bind_connect(struct sock *sk, int optname,
 			       struct sockaddr *address, int addrlen);
 void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
 			    struct sock *newsk);
-void security_sctp_assoc_established(struct sctp_association *asoc,
-				     struct sk_buff *skb);
 
 #else	/* CONFIG_SECURITY_NETWORK */
 static inline int security_unix_stream_connect(struct sock *sock,
@@ -1651,11 +1649,6 @@ static inline void security_sctp_sk_clone(struct sctp_association *asoc,
 					  struct sock *newsk)
 {
 }
-
-static inline void security_sctp_assoc_established(struct sctp_association *asoc,
-						   struct sk_buff *skb)
-{
-}
 #endif	/* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_INFINIBAND
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 39ba82ee87cec..354c1c4de19bd 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -946,7 +946,7 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net,
 	sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
 
 	/* Set peer label for connection. */
-	security_sctp_assoc_established((struct sctp_association *)asoc, chunk->skb);
+	security_inet_conn_established(ep->base.sk, chunk->skb);
 
 	/* RFC 2960 5.1 Normal Establishment of an Association
 	 *
diff --git a/security/security.c b/security/security.c
index 779a9edea0a03..c88167a414b41 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2388,13 +2388,6 @@ void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
 }
 EXPORT_SYMBOL(security_sctp_sk_clone);
 
-void security_sctp_assoc_established(struct sctp_association *asoc,
-				     struct sk_buff *skb)
-{
-	call_void_hook(sctp_assoc_established, asoc, skb);
-}
-EXPORT_SYMBOL(security_sctp_assoc_established);
-
 #endif	/* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_INFINIBAND
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 5e5215fe2e83e..62d30c0a30c29 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -5502,8 +5502,7 @@ static void selinux_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk
 	if (!selinux_policycap_extsockclass())
 		return selinux_sk_clone_security(sk, newsk);
 
-	if (asoc->secid != SECSID_WILD)
-		newsksec->sid = asoc->secid;
+	newsksec->sid = asoc->secid;
 	newsksec->peer_sid = asoc->peer_secid;
 	newsksec->sclass = sksec->sclass;
 	selinux_netlbl_sctp_sk_clone(sk, newsk);
@@ -5559,16 +5558,6 @@ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb)
 	selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid);
 }
 
-static void selinux_sctp_assoc_established(struct sctp_association *asoc,
-					   struct sk_buff *skb)
-{
-	struct sk_security_struct *sksec = asoc->base.sk->sk_security;
-
-	selinux_inet_conn_established(asoc->base.sk, skb);
-	asoc->peer_secid = sksec->peer_sid;
-	asoc->secid = SECSID_WILD;
-}
-
 static int selinux_secmark_relabel_packet(u32 sid)
 {
 	const struct task_security_struct *__tsec;
@@ -7239,7 +7228,6 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(sctp_assoc_request, selinux_sctp_assoc_request),
 	LSM_HOOK_INIT(sctp_sk_clone, selinux_sctp_sk_clone),
 	LSM_HOOK_INIT(sctp_bind_connect, selinux_sctp_bind_connect),
-	LSM_HOOK_INIT(sctp_assoc_established, selinux_sctp_assoc_established),
 	LSM_HOOK_INIT(inet_conn_request, selinux_inet_conn_request),
 	LSM_HOOK_INIT(inet_csk_clone, selinux_inet_csk_clone),
 	LSM_HOOK_INIT(inet_conn_established, selinux_inet_conn_established),

From 563fbefed46ae4c1f70cffb8eb54c02df480b2c2 Mon Sep 17 00:00:00 2001
From: Nguyen Dinh Phi <phind.uet@gmail.com>
Date: Thu, 28 Oct 2021 01:37:22 +0800
Subject: [PATCH 15/96] cfg80211: call cfg80211_stop_ap when switch from P2P_GO
 type

If the userspace tools switch from NL80211_IFTYPE_P2P_GO to
NL80211_IFTYPE_ADHOC via send_msg(NL80211_CMD_SET_INTERFACE), it
does not call the cleanup cfg80211_stop_ap(), this leads to the
initialization of in-use data. For example, this path re-init the
sdata->assigned_chanctx_list while it is still an element of
assigned_vifs list, and makes that linked list corrupt.

Signed-off-by: Nguyen Dinh Phi <phind.uet@gmail.com>
Reported-by: syzbot+bbf402b783eeb6d908db@syzkaller.appspotmail.com
Link: https://lore.kernel.org/r/20211027173722.777287-1-phind.uet@gmail.com
Cc: stable@vger.kernel.org
Fixes: ac800140c20e ("cfg80211: .stop_ap when interface is going down")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/util.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/wireless/util.c b/net/wireless/util.c
index 5ff1f8726faf8..41ea65deb6e1f 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1046,6 +1046,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 
 		switch (otype) {
 		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_P2P_GO:
 			cfg80211_stop_ap(rdev, dev, true);
 			break;
 		case NL80211_IFTYPE_ADHOC:

From ce6b69749961426c6d822215ded9e67154e1ad4f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 29 Oct 2021 09:25:39 +0200
Subject: [PATCH 16/96] nl80211: fix radio statistics in survey dump

Even if userspace specifies the NL80211_ATTR_SURVEY_RADIO_STATS
attribute, we cannot get the statistics because we're not really
parsing the incoming attributes properly any more.

Fix this by passing the attrbuf to nl80211_prepare_wdev_dump()
and filling it there, if given, and using a local version only
if no output is desired.

Since I'm touching it anyway, make nl80211_prepare_wdev_dump()
static.

Fixes: 50508d941c18 ("cfg80211: use parallel_ops for genl")
Reported-by: Jan Fuchs <jf@simonwunderlich.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Tested-by: Sven Eckelmann <sven@narfation.org>
Link: https://lore.kernel.org/r/20211029092539.2851b4799386.If9736d4575ee79420cbec1bd930181e1d53c7317@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 34 +++++++++++++++++++---------------
 net/wireless/nl80211.h |  6 +-----
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 81232b73df8f9..a27b3b5fa210f 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -936,33 +936,37 @@ nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = {
 	[NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 },
 };
 
-int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
-			      struct cfg80211_registered_device **rdev,
-			      struct wireless_dev **wdev)
+static int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
+				     struct cfg80211_registered_device **rdev,
+				     struct wireless_dev **wdev,
+				     struct nlattr **attrbuf)
 {
 	int err;
 
 	if (!cb->args[0]) {
-		struct nlattr **attrbuf;
+		struct nlattr **attrbuf_free = NULL;
 
-		attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf),
-				  GFP_KERNEL);
-		if (!attrbuf)
-			return -ENOMEM;
+		if (!attrbuf) {
+			attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf),
+					  GFP_KERNEL);
+			if (!attrbuf)
+				return -ENOMEM;
+			attrbuf_free = attrbuf;
+		}
 
 		err = nlmsg_parse_deprecated(cb->nlh,
 					     GENL_HDRLEN + nl80211_fam.hdrsize,
 					     attrbuf, nl80211_fam.maxattr,
 					     nl80211_policy, NULL);
 		if (err) {
-			kfree(attrbuf);
+			kfree(attrbuf_free);
 			return err;
 		}
 
 		rtnl_lock();
 		*wdev = __cfg80211_wdev_from_attrs(NULL, sock_net(cb->skb->sk),
 						   attrbuf);
-		kfree(attrbuf);
+		kfree(attrbuf_free);
 		if (IS_ERR(*wdev)) {
 			rtnl_unlock();
 			return PTR_ERR(*wdev);
@@ -6197,7 +6201,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
 	int sta_idx = cb->args[2];
 	int err;
 
-	err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
+	err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL);
 	if (err)
 		return err;
 	/* nl80211_prepare_wdev_dump acquired it in the successful case */
@@ -7092,7 +7096,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
 	int path_idx = cb->args[2];
 	int err;
 
-	err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
+	err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL);
 	if (err)
 		return err;
 	/* nl80211_prepare_wdev_dump acquired it in the successful case */
@@ -7292,7 +7296,7 @@ static int nl80211_dump_mpp(struct sk_buff *skb,
 	int path_idx = cb->args[2];
 	int err;
 
-	err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
+	err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL);
 	if (err)
 		return err;
 	/* nl80211_prepare_wdev_dump acquired it in the successful case */
@@ -9718,7 +9722,7 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
 	int start = cb->args[2], idx = 0;
 	int err;
 
-	err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
+	err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL);
 	if (err)
 		return err;
 	/* nl80211_prepare_wdev_dump acquired it in the successful case */
@@ -9851,7 +9855,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
 	if (!attrbuf)
 		return -ENOMEM;
 
-	res = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
+	res = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, attrbuf);
 	if (res) {
 		kfree(attrbuf);
 		return res;
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index a3f387770f1bf..d642e3be4ee78 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Portions of this file
- * Copyright (C) 2018, 2020 Intel Corporation
+ * Copyright (C) 2018, 2020-2021 Intel Corporation
  */
 #ifndef __NET_WIRELESS_NL80211_H
 #define __NET_WIRELESS_NL80211_H
@@ -22,10 +22,6 @@ static inline u64 wdev_id(struct wireless_dev *wdev)
 	       ((u64)wiphy_to_rdev(wdev->wiphy)->wiphy_idx << 32);
 }
 
-int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
-			      struct cfg80211_registered_device **rdev,
-			      struct wireless_dev **wdev);
-
 int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
 			  struct genl_info *info,
 			  struct cfg80211_chan_def *chandef);

From 77dfc2bc0bb4b8376ecd7a430f27a4a8fff6a5a0 Mon Sep 17 00:00:00 2001
From: Xing Song <xing.song@mediatek.com>
Date: Mon, 1 Nov 2021 10:46:57 +0800
Subject: [PATCH 17/96] mac80211: do not access the IV when it was stripped

ieee80211_get_keyid() will return false value if IV has been stripped,
such as return 0 for IP/ARP frames due to LLC header, and return -EINVAL
for disassociation frames due to its length... etc. Don't try to access
it if it's not present.

Signed-off-by: Xing Song <xing.song@mediatek.com>
Link: https://lore.kernel.org/r/20211101024657.143026-1-xing.song@mediatek.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index fc5c608d02e21..6360c664259ff 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1952,7 +1952,8 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 		int keyid = rx->sta->ptk_idx;
 		sta_ptk = rcu_dereference(rx->sta->ptk[keyid]);
 
-		if (ieee80211_has_protected(fc)) {
+		if (ieee80211_has_protected(fc) &&
+		    !(status->flag & RX_FLAG_IV_STRIPPED)) {
 			cs = rx->sta->cipher_scheme;
 			keyid = ieee80211_get_keyid(rx->skb, cs);
 

From c033a38a81bc539d6c0db8c5387e0b14d819a0cf Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 9 Nov 2021 10:02:04 +0100
Subject: [PATCH 18/96] mac80211: fix radiotap header generation

In commit 8c89f7b3d3f2 ("mac80211: Use flex-array for radiotap header
bitmap") we accidentally pointed the position to the wrong place, so
we overwrite a present bitmap, and thus cause all kinds of trouble.

To see the issue, note that the previous code read:

  pos = (void *)(it_present + 1);

The requirement now is that we need to calculate pos via it_optional,
to not trigger the compiler hardening checks, as:

  pos = (void *)&rthdr->it_optional[...];

Rewriting the original expression, we get (obviously, since that just
adds "+ x - x" terms):

  pos = (void *)(it_present + 1 + rthdr->it_optional - rthdr->it_optional)

and moving the "+ rthdr->it_optional" outside to be used as an array:

  pos = (void *)&rthdr->it_optional[it_present + 1 - rthdr->it_optional];

The original is off by one, fix it.

Cc: stable@vger.kernel.org
Fixes: 8c89f7b3d3f2 ("mac80211: Use flex-array for radiotap header bitmap")
Reported-by: Sid Hayn <sidhayn@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Tested-by: Sid Hayn <sidhayn@gmail.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20211109100203.c61007433ed6.I1dade57aba7de9c4f48d68249adbae62636fd98c@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 6360c664259ff..1b0bd642dc23b 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -364,7 +364,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 	 * the compiler to think we have walked past the end of the
 	 * struct member.
 	 */
-	pos = (void *)&rthdr->it_optional[it_present - rthdr->it_optional];
+	pos = (void *)&rthdr->it_optional[it_present + 1 - rthdr->it_optional];
 
 	/* the order of the following fields is important */
 

From f6ab25d41b18f3d26883cb9c20875e1a85c4f05b Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 10 Nov 2021 22:22:01 +0100
Subject: [PATCH 19/96] mac80211: drop check for DONT_REORDER in
 __ieee80211_select_queue

When __ieee80211_select_queue is called, skb->cb has not been cleared yet,
which means that info->control.flags can contain garbage.
In some cases this leads to IEEE80211_TX_CTRL_DONT_REORDER being set, causing
packets marked for other queues to randomly end up in BE instead.

This flag only needs to be checked in ieee80211_select_queue_80211, since
the radiotap parser is the only piece of code that sets it

Fixes: 66d06c84730c ("mac80211: adhere to Tx control flag that prevents frame reordering")
Cc: stable@vger.kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Link: https://lore.kernel.org/r/20211110212201.35452-1-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/wme.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index 9ea6004abe1be..62c6733e07923 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -143,7 +143,6 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata,
 u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
 			     struct sta_info *sta, struct sk_buff *skb)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct mac80211_qos_map *qos_map;
 	bool qos;
 
@@ -156,7 +155,7 @@ u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
 	else
 		qos = false;
 
-	if (!qos || (info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER)) {
+	if (!qos) {
 		skb->priority = 0; /* required for correct WPA/11i MIC */
 		return IEEE80211_AC_BE;
 	}

From 6dd2360334f3cb3b45fc1b8194c670090474b87c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 12 Nov 2021 13:51:44 +0100
Subject: [PATCH 20/96] mac80211: fix monitor_sdata RCU/locking assertions

Since commit a05829a7222e ("cfg80211: avoid holding the RTNL when
calling the driver") we've not only been protecting the pointer
to monitor_sdata with the RTNL, but also with the wiphy->mtx. This
is relevant in a number of lockdep assertions, e.g. the one we hit
in ieee80211_set_monitor_channel(). However, we're now protecting
all the assignments/dereferences, even the one in interface iter,
with the wiphy->mtx, so switch over the lockdep assertions to that
lock.

Fixes: a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Link: https://lore.kernel.org/r/20211112135143.cb8e8ceffef3.Iaa210f16f6904c8a7a24954fb3396da0ef86ec08@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/cfg.c   | 12 ++++++++----
 net/mac80211/iface.c |  4 +++-
 net/mac80211/util.c  |  7 ++++---
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index e2b791c37591f..bd3d3195097fa 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -80,7 +80,8 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata,
 	}
 
 	/* also validate MU-MIMO change */
-	monitor_sdata = rtnl_dereference(local->monitor_sdata);
+	monitor_sdata = wiphy_dereference(local->hw.wiphy,
+					  local->monitor_sdata);
 
 	if (!monitor_sdata &&
 	    (params->vht_mumimo_groups || params->vht_mumimo_follow_addr))
@@ -840,7 +841,8 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy,
 
 	mutex_lock(&local->mtx);
 	if (local->use_chanctx) {
-		sdata = rtnl_dereference(local->monitor_sdata);
+		sdata = wiphy_dereference(local->hw.wiphy,
+					  local->monitor_sdata);
 		if (sdata) {
 			ieee80211_vif_release_channel(sdata);
 			ret = ieee80211_vif_use_channel(sdata, chandef,
@@ -2707,7 +2709,8 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy,
 		sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
 
 		if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
-			sdata = rtnl_dereference(local->monitor_sdata);
+			sdata = wiphy_dereference(local->hw.wiphy,
+						  local->monitor_sdata);
 			if (!sdata)
 				return -EOPNOTSUPP;
 		}
@@ -2767,7 +2770,8 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy,
 	mutex_unlock(&local->iflist_mtx);
 
 	if (has_monitor) {
-		sdata = rtnl_dereference(local->monitor_sdata);
+		sdata = wiphy_dereference(local->hw.wiphy,
+					  local->monitor_sdata);
 		if (sdata) {
 			sdata->user_power_level = local->user_power_level;
 			if (txp_type != sdata->vif.bss_conf.txpower_type)
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 9a2145c8192b6..20aa5cc31f77e 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -588,7 +588,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 	 */
 	if (local->suspended) {
 		WARN_ON(local->wowlan);
-		WARN_ON(rtnl_dereference(local->monitor_sdata));
+		WARN_ON(rcu_access_pointer(local->monitor_sdata));
 		return;
 	}
 
@@ -961,6 +961,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
 		return 0;
 
 	ASSERT_RTNL();
+	lockdep_assert_wiphy(local->hw.wiphy);
 
 	if (local->monitor_sdata)
 		return 0;
@@ -1028,6 +1029,7 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local)
 		return;
 
 	ASSERT_RTNL();
+	lockdep_assert_wiphy(local->hw.wiphy);
 
 	mutex_lock(&local->iflist_mtx);
 
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 39fa2a50385d8..43df2f0c5db9c 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -796,7 +796,7 @@ static void __iterate_interfaces(struct ieee80211_local *local,
 
 	sdata = rcu_dereference_check(local->monitor_sdata,
 				      lockdep_is_held(&local->iflist_mtx) ||
-				      lockdep_rtnl_is_held());
+				      lockdep_is_held(&local->hw.wiphy->mtx));
 	if (sdata &&
 	    (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only ||
 	     sdata->flags & IEEE80211_SDATA_IN_DRIVER))
@@ -2381,7 +2381,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 				   IEEE80211_TPT_LEDTRIG_FL_RADIO, 0);
 
 	/* add interfaces */
-	sdata = rtnl_dereference(local->monitor_sdata);
+	sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata);
 	if (sdata) {
 		/* in HW restart it exists already */
 		WARN_ON(local->resuming);
@@ -2426,7 +2426,8 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 				WARN_ON(drv_add_chanctx(local, ctx));
 		mutex_unlock(&local->chanctx_mtx);
 
-		sdata = rtnl_dereference(local->monitor_sdata);
+		sdata = wiphy_dereference(local->hw.wiphy,
+					  local->monitor_sdata);
 		if (sdata && ieee80211_sdata_running(sdata))
 			ieee80211_assign_chanctx(local, sdata);
 	}

From 30f6cf96912b638d0ddfc325204b598f94efddc2 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 13 Nov 2021 07:34:15 +0100
Subject: [PATCH 21/96] mac80211: fix throughput LED trigger

The codepaths for rx with decap offload and tx with itxq were not updating
the counters for the throughput led trigger.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Link: https://lore.kernel.org/r/20211113063415.55147-1-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/led.h |  8 ++++----
 net/mac80211/rx.c  |  7 ++++---
 net/mac80211/tx.c  | 34 +++++++++++++++-------------------
 3 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/net/mac80211/led.h b/net/mac80211/led.h
index fb3aaa3c56069..b71a1428d883c 100644
--- a/net/mac80211/led.h
+++ b/net/mac80211/led.h
@@ -72,19 +72,19 @@ static inline void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
 #endif
 
 static inline void
-ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, __le16 fc, int bytes)
+ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, int bytes)
 {
 #ifdef CONFIG_MAC80211_LEDS
-	if (ieee80211_is_data(fc) && atomic_read(&local->tpt_led_active))
+	if (atomic_read(&local->tpt_led_active))
 		local->tpt_led_trigger->tx_bytes += bytes;
 #endif
 }
 
 static inline void
-ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, __le16 fc, int bytes)
+ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, int bytes)
 {
 #ifdef CONFIG_MAC80211_LEDS
-	if (ieee80211_is_data(fc) && atomic_read(&local->tpt_led_active))
+	if (atomic_read(&local->tpt_led_active))
 		local->tpt_led_trigger->rx_bytes += bytes;
 #endif
 }
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 1b0bd642dc23b..9541a4c30aca7 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4864,6 +4864,7 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
 	struct ieee80211_rate *rate = NULL;
 	struct ieee80211_supported_band *sband;
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
 
 	WARN_ON_ONCE(softirq_count() == 0);
 
@@ -4960,9 +4961,9 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
 	if (!(status->flag & RX_FLAG_8023))
 		skb = ieee80211_rx_monitor(local, skb, rate);
 	if (skb) {
-		ieee80211_tpt_led_trig_rx(local,
-					  ((struct ieee80211_hdr *)skb->data)->frame_control,
-					  skb->len);
+		if ((status->flag & RX_FLAG_8023) ||
+			ieee80211_is_data_present(hdr->frame_control))
+			ieee80211_tpt_led_trig_rx(local, skb->len);
 
 		if (status->flag & RX_FLAG_8023)
 			__ieee80211_rx_handle_8023(hw, pubsta, skb, list);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index a756a197c770f..278945e3e08ac 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1721,21 +1721,19 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
  * Returns false if the frame couldn't be transmitted but was queued instead.
  */
 static bool __ieee80211_tx(struct ieee80211_local *local,
-			   struct sk_buff_head *skbs, int led_len,
-			   struct sta_info *sta, bool txpending)
+			   struct sk_buff_head *skbs, struct sta_info *sta,
+			   bool txpending)
 {
 	struct ieee80211_tx_info *info;
 	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_vif *vif;
 	struct sk_buff *skb;
 	bool result;
-	__le16 fc;
 
 	if (WARN_ON(skb_queue_empty(skbs)))
 		return true;
 
 	skb = skb_peek(skbs);
-	fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
 	info = IEEE80211_SKB_CB(skb);
 	sdata = vif_to_sdata(info->control.vif);
 	if (sta && !sta->uploaded)
@@ -1769,8 +1767,6 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
 
 	result = ieee80211_tx_frags(local, vif, sta, skbs, txpending);
 
-	ieee80211_tpt_led_trig_tx(local, fc, led_len);
-
 	WARN_ON_ONCE(!skb_queue_empty(skbs));
 
 	return result;
@@ -1920,7 +1916,6 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 	ieee80211_tx_result res_prepare;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	bool result = true;
-	int led_len;
 
 	if (unlikely(skb->len < 10)) {
 		dev_kfree_skb(skb);
@@ -1928,7 +1923,6 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 	}
 
 	/* initialises tx */
-	led_len = skb->len;
 	res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb);
 
 	if (unlikely(res_prepare == TX_DROP)) {
@@ -1951,8 +1945,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 		return true;
 
 	if (!invoke_tx_handlers_late(&tx))
-		result = __ieee80211_tx(local, &tx.skbs, led_len,
-					tx.sta, txpending);
+		result = __ieee80211_tx(local, &tx.skbs, tx.sta, txpending);
 
 	return result;
 }
@@ -4175,6 +4168,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta;
 	struct sk_buff *next;
+	int len = skb->len;
 
 	if (unlikely(skb->len < ETH_HLEN)) {
 		kfree_skb(skb);
@@ -4221,10 +4215,8 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 		}
 	} else {
 		/* we cannot process non-linear frames on this path */
-		if (skb_linearize(skb)) {
-			kfree_skb(skb);
-			goto out;
-		}
+		if (skb_linearize(skb))
+			goto out_free;
 
 		/* the frame could be fragmented, software-encrypted, and other
 		 * things so we cannot really handle checksum offload with it -
@@ -4258,7 +4250,10 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 	goto out;
  out_free:
 	kfree_skb(skb);
+	len = 0;
  out:
+	if (len)
+		ieee80211_tpt_led_trig_tx(local, len);
 	rcu_read_unlock();
 }
 
@@ -4396,8 +4391,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
 }
 
 static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata,
-			      struct sk_buff *skb, int led_len,
-			      struct sta_info *sta,
+			      struct sk_buff *skb, struct sta_info *sta,
 			      bool txpending)
 {
 	struct ieee80211_local *local = sdata->local;
@@ -4410,6 +4404,8 @@ static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata,
 	if (sta)
 		sk_pacing_shift_update(skb->sk, local->hw.tx_sk_pacing_shift);
 
+	ieee80211_tpt_led_trig_tx(local, skb->len);
+
 	if (ieee80211_queue_skb(local, sdata, sta, skb))
 		return true;
 
@@ -4498,7 +4494,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata,
 	if (key)
 		info->control.hw_key = &key->conf;
 
-	ieee80211_tx_8023(sdata, skb, skb->len, sta, false);
+	ieee80211_tx_8023(sdata, skb, sta, false);
 
 	return;
 
@@ -4637,7 +4633,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
 		if (IS_ERR(sta) || (sta && !sta->uploaded))
 			sta = NULL;
 
-		result = ieee80211_tx_8023(sdata, skb, skb->len, sta, true);
+		result = ieee80211_tx_8023(sdata, skb, sta, true);
 	} else {
 		struct sk_buff_head skbs;
 
@@ -4647,7 +4643,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
 		hdr = (struct ieee80211_hdr *)skb->data;
 		sta = sta_info_get(sdata, hdr->addr1);
 
-		result = __ieee80211_tx(local, &skbs, skb->len, sta, true);
+		result = __ieee80211_tx(local, &skbs, sta, true);
 	}
 
 	return result;

From 86c3a3e964d910a62eeb277d60b2a60ebefa9feb Mon Sep 17 00:00:00 2001
From: Tadeusz Struk <tadeusz.struk@linaro.org>
Date: Thu, 11 Nov 2021 12:59:16 -0800
Subject: [PATCH 22/96] tipc: use consistent GFP flags

Some functions, like tipc_crypto_start use inconsisten GFP flags
when allocating memory. The mentioned function use GFP_ATOMIC to
to alloc a crypto instance, and then calls alloc_ordered_workqueue()
which allocates memory with GFP_KERNEL. tipc_aead_init() function
even uses GFP_KERNEL and GFP_ATOMIC interchangeably.
No doc comment specifies what context a function is designed to
work in, but the flags should at least be consistent within a function.

Cc: Jon Maloy <jmaloy@redhat.com>
Cc: Ying Xue <ying.xue@windriver.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: netdev@vger.kernel.org
Cc: tipc-discussion@lists.sourceforge.net
Cc: linux-kernel@vger.kernel.org

Signed-off-by: Tadeusz Struk <tadeusz.struk@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/crypto.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index dc60c32bb70df..e701651f6533a 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -524,7 +524,7 @@ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey,
 		return -EEXIST;
 
 	/* Allocate a new AEAD */
-	tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
+	tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
 	if (unlikely(!tmp))
 		return -ENOMEM;
 
@@ -1470,7 +1470,7 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net,
 		return -EEXIST;
 
 	/* Allocate crypto */
-	c = kzalloc(sizeof(*c), GFP_ATOMIC);
+	c = kzalloc(sizeof(*c), GFP_KERNEL);
 	if (!c)
 		return -ENOMEM;
 
@@ -1484,7 +1484,7 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net,
 	}
 
 	/* Allocate statistic structure */
-	c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC);
+	c->stats = alloc_percpu(struct tipc_crypto_stats);
 	if (!c->stats) {
 		if (c->wq)
 			destroy_workqueue(c->wq);
@@ -2457,7 +2457,7 @@ static void tipc_crypto_work_tx(struct work_struct *work)
 	}
 
 	/* Lets duplicate it first */
-	skey = kmemdup(aead->key, tipc_aead_key_size(aead->key), GFP_ATOMIC);
+	skey = kmemdup(aead->key, tipc_aead_key_size(aead->key), GFP_KERNEL);
 	rcu_read_unlock();
 
 	/* Now, generate new key, initiate & distribute it */

From a31d27fbed5d518734cb60956303eb15089a7634 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 12 Nov 2021 08:56:03 +0100
Subject: [PATCH 23/96] tun: fix bonding active backup with arp monitoring

As stated in the bonding doc, trans_start must be set manually for drivers
using NETIF_F_LLTX:
 Drivers that use NETIF_F_LLTX flag must also update
 netdev_queue->trans_start. If they do not, then the ARP monitor will
 immediately fail any slaves using that driver, and those slaves will stay
 down.

Link: https://www.kernel.org/doc/html/v5.15/networking/bonding.html#arp-monitor-operation
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index fecc9a1d293ae..1572878c34031 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1010,6 +1010,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	int txq = skb->queue_mapping;
+	struct netdev_queue *queue;
 	struct tun_file *tfile;
 	int len = skb->len;
 
@@ -1054,6 +1055,10 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (ptr_ring_produce(&tfile->tx_ring, skb))
 		goto drop;
 
+	/* NETIF_F_LLTX requires to do our own update of trans_start */
+	queue = netdev_get_tx_queue(dev, txq);
+	queue->trans_start = jiffies;
+
 	/* Notify and wake up reader process */
 	if (tfile->flags & TUN_FASYNC)
 		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);

From 10a2308ffb8cf262e473eb324fde42ae31b6da04 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Fri, 12 Nov 2021 18:16:34 +0800
Subject: [PATCH 24/96] net: Clean up some inconsistent indenting

Eliminate the follow smatch warning:

./include/linux/skbuff.h:4229 skb_remcsum_process() warn: inconsistent
indenting.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 686a666d073d5..c8cb7e697d479 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4226,7 +4226,7 @@ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
 		return;
 	}
 
-	 if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) {
+	if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) {
 		__skb_checksum_complete(skb);
 		skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data);
 	}

From 6e228d8cbb1cc6ba78022d406340e901e08d26e0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@linaro.org>
Date: Fri, 12 Nov 2021 16:22:09 -0600
Subject: [PATCH 25/96] net: ipa: HOLB register sometimes must be written twice

Starting with IPA v4.5, the HOL_BLOCK_EN register must be written
twice when enabling head-of-line blocking avoidance.

Fixes: 84f9bd12d46db ("soc: qcom: ipa: IPA endpoints")
Signed-off-by: Alex Elder <elder@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ipa/ipa_endpoint.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c
index 5528d97110d56..006da4642a0ba 100644
--- a/drivers/net/ipa/ipa_endpoint.c
+++ b/drivers/net/ipa/ipa_endpoint.c
@@ -868,6 +868,9 @@ ipa_endpoint_init_hol_block_enable(struct ipa_endpoint *endpoint, bool enable)
 	val = enable ? HOL_BLOCK_EN_FMASK : 0;
 	offset = IPA_REG_ENDP_INIT_HOL_BLOCK_EN_N_OFFSET(endpoint_id);
 	iowrite32(val, endpoint->ipa->reg_virt + offset);
+	/* When enabling, the register must be written twice for IPA v4.5+ */
+	if (enable && endpoint->ipa->version >= IPA_VERSION_4_5)
+		iowrite32(val, endpoint->ipa->reg_virt + offset);
 }
 
 void ipa_endpoint_modem_hol_block_clear_all(struct ipa *ipa)

From 816316cacad2b5abd5b41423cf04e4845239abd4 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@linaro.org>
Date: Fri, 12 Nov 2021 16:22:10 -0600
Subject: [PATCH 26/96] net: ipa: disable HOLB drop when updating timer

The head-of-line blocking timer should only be modified when
head-of-line drop is disabled.

One of the steps in recovering from a modem crash is to enable
dropping of packets with timeout of 0 (immediate).  We don't know
how the modem configured its endpoints, so before we program the
timer, we need to ensure HOL_BLOCK is disabled.

Fixes: 84f9bd12d46db ("soc: qcom: ipa: IPA endpoints")
Signed-off-by: Alex Elder <elder@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ipa/ipa_endpoint.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c
index 006da4642a0ba..ef790fd0ab56a 100644
--- a/drivers/net/ipa/ipa_endpoint.c
+++ b/drivers/net/ipa/ipa_endpoint.c
@@ -853,6 +853,7 @@ static void ipa_endpoint_init_hol_block_timer(struct ipa_endpoint *endpoint,
 	u32 offset;
 	u32 val;
 
+	/* This should only be changed when HOL_BLOCK_EN is disabled */
 	offset = IPA_REG_ENDP_INIT_HOL_BLOCK_TIMER_N_OFFSET(endpoint_id);
 	val = hol_block_timer_val(ipa, microseconds);
 	iowrite32(val, ipa->reg_virt + offset);
@@ -883,6 +884,7 @@ void ipa_endpoint_modem_hol_block_clear_all(struct ipa *ipa)
 		if (endpoint->toward_ipa || endpoint->ee_id != GSI_EE_MODEM)
 			continue;
 
+		ipa_endpoint_init_hol_block_enable(endpoint, false);
 		ipa_endpoint_init_hol_block_timer(endpoint, 0);
 		ipa_endpoint_init_hol_block_enable(endpoint, true);
 	}

From 2153bd1e3d3dbf6a3403572084ef6ed31c53c5f0 Mon Sep 17 00:00:00 2001
From: Wen Gu <guwen@linux.alibaba.com>
Date: Sat, 13 Nov 2021 15:33:35 +0800
Subject: [PATCH 27/96] net/smc: Transfer remaining wait queue entries during
 fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SMC fallback is incomplete currently. There may be some
wait queue entries remaining in smc socket->wq, which should
be removed to clcsocket->wq during the fallback.

For example, in nginx/wrk benchmark, this issue causes an
all-zeros test result:

server: nginx -g 'daemon off;'
client: smc_run wrk -c 1 -t 1 -d 5 http://11.200.15.93/index.html

  Running 5s test @ http://11.200.15.93/index.html
     1 threads and 1 connections
     Thread Stats   Avg      Stdev     Max   ± Stdev
     	Latency     0.00us    0.00us   0.00us    -nan%
	Req/Sec     0.00      0.00     0.00      -nan%
	0 requests in 5.00s, 0.00B read
     Requests/sec:      0.00
     Transfer/sec:       0.00B

The reason for this all-zeros result is that when wrk used SMC
to replace TCP, it added an eppoll_entry into smc socket->wq
and expected to be notified if epoll events like EPOLL_IN/
EPOLL_OUT occurred on the smc socket.

However, once a fallback occurred, wrk switches to use clcsocket.
Now it is clcsocket->wq instead of smc socket->wq which will
be woken up. The eppoll_entry remaining in smc socket->wq does
not work anymore and wrk stops the test.

This patch fixes this issue by removing remaining wait queue
entries from smc socket->wq to clcsocket->wq during the fallback.

Link: https://www.spinics.net/lists/netdev/msg779769.html
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 59284da9116d7..b61c802e3bf32 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -566,6 +566,10 @@ static void smc_stat_fallback(struct smc_sock *smc)
 
 static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
 {
+	wait_queue_head_t *smc_wait = sk_sleep(&smc->sk);
+	wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk);
+	unsigned long flags;
+
 	smc->use_fallback = true;
 	smc->fallback_rsn = reason_code;
 	smc_stat_fallback(smc);
@@ -575,6 +579,16 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
 		smc->clcsock->file->private_data = smc->clcsock;
 		smc->clcsock->wq.fasync_list =
 			smc->sk.sk_socket->wq.fasync_list;
+
+		/* There may be some entries remaining in
+		 * smc socket->wq, which should be removed
+		 * to clcsocket->wq during the fallback.
+		 */
+		spin_lock_irqsave(&smc_wait->lock, flags);
+		spin_lock(&clc_wait->lock);
+		list_splice_init(&smc_wait->head, &clc_wait->head);
+		spin_unlock(&clc_wait->lock);
+		spin_unlock_irqrestore(&smc_wait->lock, flags);
 	}
 }
 

From f8885ac89ce310570e5391fe0bf0ec9c7c9b4fdc Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin@gmail.com>
Date: Sun, 14 Nov 2021 01:36:36 +0300
Subject: [PATCH 28/96] net: bnx2x: fix variable dereferenced before check

Smatch says:
	bnx2x_init_ops.h:640 bnx2x_ilt_client_mem_op()
	warn: variable dereferenced before check 'ilt' (see line 638)

Move ilt_cli variable initialization _after_ ilt validation, because
it's unsafe to deref the pointer before validation check.

Fixes: 523224a3b3cd ("bnx2x, cnic, bnx2i: use new FW/HSI")
Signed-off-by: Pavel Skripkin <paskripkin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h
index 1835d2e451c01..fc7fce642666c 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h
@@ -635,11 +635,13 @@ static int bnx2x_ilt_client_mem_op(struct bnx2x *bp, int cli_num,
 {
 	int i, rc;
 	struct bnx2x_ilt *ilt = BP_ILT(bp);
-	struct ilt_client_info *ilt_cli = &ilt->clients[cli_num];
+	struct ilt_client_info *ilt_cli;
 
 	if (!ilt || !ilt->lines)
 		return -1;
 
+	ilt_cli = &ilt->clients[cli_num];
+
 	if (ilt_cli->flags & (ILT_CLIENT_SKIP_INIT | ILT_CLIENT_SKIP_MEM))
 		return 0;
 

From b922f622592af76b57cbc566eaeccda0b31a3496 Mon Sep 17 00:00:00 2001
From: Zekun Shen <bruceshenzk@gmail.com>
Date: Sat, 13 Nov 2021 22:24:40 -0500
Subject: [PATCH 29/96] atlantic: Fix OOB read and write in
 hw_atl_utils_fw_rpc_wait

This bug report shows up when running our research tools. The
reports is SOOB read, but it seems SOOB write is also possible
a few lines below.

In details, fw.len and sw.len are inputs coming from io. A len
over the size of self->rpc triggers SOOB. The patch fixes the
bugs by adding sanity checks.

The bugs are triggerable with compromised/malfunctioning devices.
They are potentially exploitable given they first leak up to
0xffff bytes and able to overwrite the region later.

The patch is tested with QEMU emulater.
This is NOT tested with a real device.

Attached is the log we found by fuzzing.

BUG: KASAN: slab-out-of-bounds in
	hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic]
Read of size 4 at addr ffff888016260b08 by task modprobe/213
CPU: 0 PID: 213 Comm: modprobe Not tainted 5.6.0 #1
Call Trace:
 dump_stack+0x76/0xa0
 print_address_description.constprop.0+0x16/0x200
 ? hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic]
 ? hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic]
 __kasan_report.cold+0x37/0x7c
 ? aq_hw_read_reg_bit+0x60/0x70 [atlantic]
 ? hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic]
 kasan_report+0xe/0x20
 hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic]
 hw_atl_utils_fw_rpc_call+0x95/0x130 [atlantic]
 hw_atl_utils_fw_rpc_wait+0x176/0x210 [atlantic]
 hw_atl_utils_mpi_create+0x229/0x2e0 [atlantic]
 ? hw_atl_utils_fw_rpc_wait+0x210/0x210 [atlantic]
 ? hw_atl_utils_initfw+0x9f/0x1c8 [atlantic]
 hw_atl_utils_initfw+0x12a/0x1c8 [atlantic]
 aq_nic_ndev_register+0x88/0x650 [atlantic]
 ? aq_nic_ndev_init+0x235/0x3c0 [atlantic]
 aq_pci_probe+0x731/0x9b0 [atlantic]
 ? aq_pci_func_init+0xc0/0xc0 [atlantic]
 local_pci_probe+0xd3/0x160
 pci_device_probe+0x23f/0x3e0

Reported-by: Brendan Dolan-Gavitt <brendandg@nyu.edu>
Signed-off-by: Zekun Shen <bruceshenzk@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c   | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
index fc0e660066446..3f1704cbe1cb9 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
@@ -559,6 +559,11 @@ int hw_atl_utils_fw_rpc_wait(struct aq_hw_s *self,
 			goto err_exit;
 
 		if (fw.len == 0xFFFFU) {
+			if (sw.len > sizeof(self->rpc)) {
+				printk(KERN_INFO "Invalid sw len: %x\n", sw.len);
+				err = -EINVAL;
+				goto err_exit;
+			}
 			err = hw_atl_utils_fw_rpc_call(self, sw.len);
 			if (err < 0)
 				goto err_exit;
@@ -567,6 +572,11 @@ int hw_atl_utils_fw_rpc_wait(struct aq_hw_s *self,
 
 	if (rpc) {
 		if (fw.len) {
+			if (fw.len > sizeof(self->rpc)) {
+				printk(KERN_INFO "Invalid fw len: %x\n", fw.len);
+				err = -EINVAL;
+				goto err_exit;
+			}
 			err =
 			hw_atl_utils_fw_downld_dwords(self,
 						      self->rpc_addr,

From e97b21e94652f5f0d1c196452c111151f6d15883 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 14 Nov 2021 17:02:29 -0800
Subject: [PATCH 30/96] net: ethernet: lantiq_etop: fix build errors/warnings

Fix build error and warnings reported by kernel test robot:

drivers/net/ethernet/lantiq_etop.c: In function 'ltq_etop_probe':
drivers/net/ethernet/lantiq_etop.c:673:15: error: implicit declaration of function 'device_property_read_u32' [-Werror=implicit-function-declaration]
     673 |         err = device_property_read_u32(&pdev->dev, "lantiq,tx-burst-length", &priv->tx_burst_len);

   drivers/net/ethernet/lantiq_etop.c: At top level:
   drivers/net/ethernet/lantiq_etop.c:730:1: warning: no previous prototype for 'init_ltq_etop' [-Wmissing-prototypes]
     730 | init_ltq_etop(void)

   drivers/net/ethernet/lantiq_etop.c: In function 'ltq_etop_hw_init':
   drivers/net/ethernet/lantiq_etop.c:276:25: warning: ignoring return value of 'request_irq' declared with attribute 'warn_unused_result' [-Wunused-result]
     276 |                         request_irq(irq, ltq_etop_dma_irq, 0, "etop_tx", priv);
   drivers/net/ethernet/lantiq_etop.c:284:25: warning: ignoring return value of 'request_irq' declared with attribute 'warn_unused_result' [-Wunused-result]
     284 |                         request_irq(irq, ltq_etop_dma_irq, 0, "etop_rx", priv);

Fixes: 14d4e308e0aa ("net: lantiq: configure the burst length in ethernet drivers")
Fixes: dddb29e42770 ("net: lantiq_etop: remove deprecated IRQF_DISABLED")
Fixes: 504d4721ee8e ("MIPS: Lantiq: Add ethernet driver")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: kernel test robot <lkp@intel.com>
Link: lore.kernel.org/r/202111090621.yjr9xuVj-lkp@intel.com
To: netdev@vger.kernel.org
Cc: Aleksander Jan Bajkowski <olek2@wp.pl>
Cc: Hauke Mehrtens <hauke@hauke-m.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: John Crispin <john@phrozen.org>
Cc: linux-mips@vger.kernel.org
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Michael Opdenacker <michael.opdenacker@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/lantiq_etop.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c
index 6433c909c6b26..072391c494ce4 100644
--- a/drivers/net/ethernet/lantiq_etop.c
+++ b/drivers/net/ethernet/lantiq_etop.c
@@ -25,6 +25,7 @@
 #include <linux/io.h>
 #include <linux/dma-mapping.h>
 #include <linux/module.h>
+#include <linux/property.h>
 
 #include <asm/checksum.h>
 
@@ -239,6 +240,7 @@ ltq_etop_hw_init(struct net_device *dev)
 {
 	struct ltq_etop_priv *priv = netdev_priv(dev);
 	int i;
+	int err;
 
 	ltq_pmu_enable(PMU_PPE);
 
@@ -273,7 +275,13 @@ ltq_etop_hw_init(struct net_device *dev)
 
 		if (IS_TX(i)) {
 			ltq_dma_alloc_tx(&ch->dma);
-			request_irq(irq, ltq_etop_dma_irq, 0, "etop_tx", priv);
+			err = request_irq(irq, ltq_etop_dma_irq, 0, "etop_tx", priv);
+			if (err) {
+				netdev_err(dev,
+					   "Unable to get Tx DMA IRQ %d\n",
+					   irq);
+				return err;
+			}
 		} else if (IS_RX(i)) {
 			ltq_dma_alloc_rx(&ch->dma);
 			for (ch->dma.desc = 0; ch->dma.desc < LTQ_DESC_NUM;
@@ -281,7 +289,13 @@ ltq_etop_hw_init(struct net_device *dev)
 				if (ltq_etop_alloc_skb(ch))
 					return -ENOMEM;
 			ch->dma.desc = 0;
-			request_irq(irq, ltq_etop_dma_irq, 0, "etop_rx", priv);
+			err = request_irq(irq, ltq_etop_dma_irq, 0, "etop_rx", priv);
+			if (err) {
+				netdev_err(dev,
+					   "Unable to get Rx DMA IRQ %d\n",
+					   irq);
+				return err;
+			}
 		}
 		ch->dma.irq = irq;
 	}
@@ -726,7 +740,7 @@ static struct platform_driver ltq_mii_driver = {
 	},
 };
 
-int __init
+static int __init
 init_ltq_etop(void)
 {
 	int ret = platform_driver_probe(&ltq_mii_driver, ltq_etop_probe);

From 46d08f55d24e69e921456b5a40717da09199267b Mon Sep 17 00:00:00 2001
From: Edwin Peer <edwin.peer@broadcom.com>
Date: Mon, 15 Nov 2021 02:37:59 -0500
Subject: [PATCH 31/96] bnxt_en: extend RTNL to VF check in devlink
 driver_reinit

The fixes the race condition between configuring SR-IOV and devlink
reload.  The SR-IOV configure logic already takes the RTNL lock,
setting sriov_cfg under the lock while changes are underway. Extend
the lock scope in devlink driver_reinit to cover the VF check and
don't run concurrently with SR-IOV configure.

Reported-by: Leon Romanovsky <leon@kernel.org>
Fixes: 228ea8c187d8 ("bnxt_en: implement devlink dev reload driver_reinit")
Cc: Leon Romanovsky <leon@kernel.org>
Reviewed-by: Somnath Kotur <somnath.kotur@broadcom.com>
Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
Reviewed-by: Andy Gospodarek <gospo@broadcom.com>
Signed-off-by: Edwin Peer <edwin.peer@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index 5c464ea73576f..a0a9af4026422 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -441,12 +441,13 @@ static int bnxt_dl_reload_down(struct devlink *dl, bool netns_change,
 
 	switch (action) {
 	case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: {
-		if (BNXT_PF(bp) && bp->pf.active_vfs) {
+		rtnl_lock();
+		if (BNXT_PF(bp) && (bp->pf.active_vfs || bp->sriov_cfg)) {
 			NL_SET_ERR_MSG_MOD(extack,
-					   "reload is unsupported when VFs are allocated");
+					   "reload is unsupported while VFs are allocated or being configured");
+			rtnl_unlock();
 			return -EOPNOTSUPP;
 		}
-		rtnl_lock();
 		if (bp->dev->reg_state == NETREG_UNREGISTERED) {
 			rtnl_unlock();
 			return -ENODEV;

From b68a1a933fe4a52a8316d214e3421f2a89bc113e Mon Sep 17 00:00:00 2001
From: Edwin Peer <edwin.peer@broadcom.com>
Date: Mon, 15 Nov 2021 02:38:00 -0500
Subject: [PATCH 32/96] bnxt_en: fix format specifier in live patch error
 message

This fixes type mismatch warning.

Reported-by: kernel test robot <lkp@intel.com>
Fixes: 3c4153394e2c ("bnxt_en: implement firmware live patching")
Signed-off-by: Edwin Peer <edwin.peer@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index a0a9af4026422..6fe9e9b59f835 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -360,7 +360,7 @@ bnxt_dl_livepatch_report_err(struct bnxt *bp, struct netlink_ext_ack *extack,
 		NL_SET_ERR_MSG_MOD(extack, "Live patch already applied");
 		break;
 	default:
-		netdev_err(bp->dev, "Unexpected live patch error: %hhd\n", err);
+		netdev_err(bp->dev, "Unexpected live patch error: %d\n", err);
 		NL_SET_ERR_MSG_MOD(extack, "Failed to activate live patch");
 		break;
 	}

From b0757491a118ae5727cf9f1c3a11544397d46596 Mon Sep 17 00:00:00 2001
From: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
Date: Mon, 15 Nov 2021 02:38:01 -0500
Subject: [PATCH 33/96] bnxt_en: reject indirect blk offload when hw-tc-offload
 is off

The driver does not check if hw-tc-offload is enabled for the device
before offloading a flow in the context of indirect block callback.
Fix this by checking NETIF_F_HW_TC in the features flag and rejecting
the offload request.  This will avoid unnecessary dmesg error logs when
hw-tc-offload is disabled, such as these:

bnxt_en 0000:19:00.1 eno2np1: dev(ifindex=294) not on same switch
bnxt_en 0000:19:00.1 eno2np1: Error: bnxt_tc_add_flow: cookie=0xffff8dace1c88000 error=-22
bnxt_en 0000:19:00.0 eno1np0: dev(ifindex=294) not on same switch
bnxt_en 0000:19:00.0 eno1np0: Error: bnxt_tc_add_flow: cookie=0xffff8dace1c88000 error=-22

Reported-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
Fixes: 627c89d00fb9 ("bnxt_en: flow_offload: offload tunnel decap rules via indirect callbacks")
Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index e6a4a768b10b2..1471b6130a2b9 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -1868,7 +1868,7 @@ static int bnxt_tc_setup_indr_block_cb(enum tc_setup_type type,
 	struct flow_cls_offload *flower = type_data;
 	struct bnxt *bp = priv->bp;
 
-	if (flower->common.chain_index)
+	if (!tc_cls_can_offload_and_chain0(bp->dev, type_data))
 		return -EOPNOTSUPP;
 
 	switch (type) {

From 9119570039481d56350af1c636f040fb300b8cf3 Mon Sep 17 00:00:00 2001
From: Meng Li <meng.li@windriver.com>
Date: Mon, 15 Nov 2021 15:04:23 +0800
Subject: [PATCH 34/96] net: stmmac: socfpga: add runtime suspend/resume
 callback for stratix10 platform

According to upstream commit 5ec55823438e("net: stmmac:
add clocks management for gmac driver"), it improve clocks
management for stmmac driver. So, it is necessary to implement
the runtime callback in dwmac-socfpga driver because it doesn't
use the common stmmac_pltfr_pm_ops instance. Otherwise, clocks
are not disabled when system enters suspend status.

Fixes: 5ec55823438e ("net: stmmac: add clocks management for gmac driver")
Cc: stable@vger.kernel.org
Signed-off-by: Meng Li <Meng.Li@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/stmicro/stmmac/dwmac-socfpga.c   | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
index 85208128f135c..b7c2579c963b6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
@@ -485,8 +485,28 @@ static int socfpga_dwmac_resume(struct device *dev)
 }
 #endif /* CONFIG_PM_SLEEP */
 
-static SIMPLE_DEV_PM_OPS(socfpga_dwmac_pm_ops, stmmac_suspend,
-					       socfpga_dwmac_resume);
+static int __maybe_unused socfpga_dwmac_runtime_suspend(struct device *dev)
+{
+	struct net_device *ndev = dev_get_drvdata(dev);
+	struct stmmac_priv *priv = netdev_priv(ndev);
+
+	stmmac_bus_clks_config(priv, false);
+
+	return 0;
+}
+
+static int __maybe_unused socfpga_dwmac_runtime_resume(struct device *dev)
+{
+	struct net_device *ndev = dev_get_drvdata(dev);
+	struct stmmac_priv *priv = netdev_priv(ndev);
+
+	return stmmac_bus_clks_config(priv, true);
+}
+
+static const struct dev_pm_ops socfpga_dwmac_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(stmmac_suspend, socfpga_dwmac_resume)
+	SET_RUNTIME_PM_OPS(socfpga_dwmac_runtime_suspend, socfpga_dwmac_runtime_resume, NULL)
+};
 
 static const struct socfpga_dwmac_ops socfpga_gen5_ops = {
 	.set_phy_mode = socfpga_gen5_set_phy_mode,

From 6def480181f15f6d9ec812bca8cbc62451ba314c Mon Sep 17 00:00:00 2001
From: liuguoqiang <liuguoqiang@uniontech.com>
Date: Mon, 15 Nov 2021 16:14:48 +0800
Subject: [PATCH 35/96] net: return correct error code

When kmemdup called failed and register_net_sysctl return NULL, should
return ENOMEM instead of ENOBUFS

Signed-off-by: liuguoqiang <liuguoqiang@uniontech.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index ec73a0d52d3e3..323e622ff9b74 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2591,7 +2591,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
 free:
 	kfree(t);
 out:
-	return -ENOBUFS;
+	return -ENOMEM;
 }
 
 static void __devinet_sysctl_unregister(struct net *net,

From 271351d255b09e39c7f6437738cba595f9b235be Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 15 Nov 2021 07:45:24 -0500
Subject: [PATCH 36/96] tipc: only accept encrypted MSG_CRYPTO msgs

The MSG_CRYPTO msgs are always encrypted and sent to other nodes
for keys' deployment. But when receiving in peers, if those nodes
do not validate it and make sure it's encrypted, one could craft
a malicious MSG_CRYPTO msg to deploy its key with no need to know
other nodes' keys.

This patch is to do that by checking TIPC_SKB_CB(skb)->decrypted
and discard it if this packet never got decrypted.

Note that this is also a supplementary fix to CVE-2021-43267 that
can be triggered by an unencrypted malicious MSG_CRYPTO msg.

Fixes: 1ef6f7c9390f ("tipc: add automatic session key exchange")
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/link.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 1b7a487c88419..09ae8448f394f 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1298,8 +1298,11 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
 		return false;
 #ifdef CONFIG_TIPC_CRYPTO
 	case MSG_CRYPTO:
-		tipc_crypto_msg_rcv(l->net, skb);
-		return true;
+		if (TIPC_SKB_CB(skb)->decrypted) {
+			tipc_crypto_msg_rcv(l->net, skb);
+			return true;
+		}
+		fallthrough;
 #endif
 	default:
 		pr_warn("Dropping received illegal msg type\n");

From 938cca9e4109b30ee1d476904538225a825e54eb Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Mon, 15 Nov 2021 19:16:56 +0900
Subject: [PATCH 37/96] sock: fix /proc/net/sockstat underflow in
 sk_clone_lock()

sk_clone_lock() needs to call sock_inuse_add(1) before entering the
sk_free_unlock_clone() error path, for __sk_free() from sk_free() from
sk_free_unlock_clone() calls sock_inuse_add(-1).

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Fixes: 648845ab7e200993 ("sock: Move the socket inuse to namespace.")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 8f2b2f2c0e7b1..41e91d0f70614 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2124,8 +2124,10 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 	newsk->sk_prot_creator = prot;
 
 	/* SANITY */
-	if (likely(newsk->sk_net_refcnt))
+	if (likely(newsk->sk_net_refcnt)) {
 		get_net(sock_net(newsk));
+		sock_inuse_add(sock_net(newsk), 1);
+	}
 	sk_node_init(&newsk->sk_node);
 	sock_lock_init(newsk);
 	bh_lock_sock(newsk);
@@ -2197,8 +2199,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 	newsk->sk_err_soft = 0;
 	newsk->sk_priority = 0;
 	newsk->sk_incoming_cpu = raw_smp_processor_id();
-	if (likely(newsk->sk_net_refcnt))
-		sock_inuse_add(sock_net(newsk), 1);
 
 	/* Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.rst for details)

From cf4f5530bb55ef7d5a91036b26676643b80b1616 Mon Sep 17 00:00:00 2001
From: Wen Gu <guwen@linux.alibaba.com>
Date: Mon, 15 Nov 2021 17:45:07 +0800
Subject: [PATCH 38/96] net/smc: Make sure the link_id is unique

The link_id is supposed to be unique, but smcr_next_link_id() doesn't
skip the used link_id as expected. So the patch fixes this.

Fixes: 026c381fb477 ("net/smc: introduce link_idx for link group array")
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 49b8ba3bb6835..25ebd30feecd6 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -708,13 +708,14 @@ static u8 smcr_next_link_id(struct smc_link_group *lgr)
 	int i;
 
 	while (1) {
+again:
 		link_id = ++lgr->next_link_id;
 		if (!link_id)	/* skip zero as link_id */
 			link_id = ++lgr->next_link_id;
 		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
 			if (smc_link_usable(&lgr->lnk[i]) &&
 			    lgr->lnk[i].link_id == link_id)
-				continue;
+				goto again;
 		}
 		break;
 	}

From 4e5e6b5d9d1334d3490326b6922a2daaf56a867f Mon Sep 17 00:00:00 2001
From: Mateusz Palczewski <mateusz.palczewski@intel.com>
Date: Tue, 9 Feb 2021 11:59:38 +0000
Subject: [PATCH 39/96] iavf: Fix return of set the new channel count

Fixed return correct code from set the new channel count.
Implemented by check if reset is done in appropriate time.
This solution give a extra time to pf for reset vf in case
when user want set new channel count for all vfs.
Without this patch it is possible to return misleading output
code to user and vf reset not to be correctly performed by pf.

Fixes: 5520deb15326 ("iavf: Enable support for up to 16 queues")
Signed-off-by: Grzegorz Szczurek <grzegorzx.szczurek@intel.com>
Signed-off-by: Mateusz Palczewski <mateusz.palczewski@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
index 5a359a0a20ecc..136c801f5584a 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
@@ -1776,6 +1776,7 @@ static int iavf_set_channels(struct net_device *netdev,
 {
 	struct iavf_adapter *adapter = netdev_priv(netdev);
 	u32 num_req = ch->combined_count;
+	int i;
 
 	if ((adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_ADQ) &&
 	    adapter->num_tc) {
@@ -1798,6 +1799,20 @@ static int iavf_set_channels(struct net_device *netdev,
 	adapter->num_req_queues = num_req;
 	adapter->flags |= IAVF_FLAG_REINIT_ITR_NEEDED;
 	iavf_schedule_reset(adapter);
+
+	/* wait for the reset is done */
+	for (i = 0; i < IAVF_RESET_WAIT_COMPLETE_COUNT; i++) {
+		msleep(IAVF_RESET_WAIT_MS);
+		if (adapter->flags & IAVF_FLAG_RESET_PENDING)
+			continue;
+		break;
+	}
+	if (i == IAVF_RESET_WAIT_COMPLETE_COUNT) {
+		adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED;
+		adapter->num_active_queues = num_req;
+		return -EOPNOTSUPP;
+	}
+
 	return 0;
 }
 

From 8a4a126f4be88eb8b5f00a165ab58c35edf4ef76 Mon Sep 17 00:00:00 2001
From: Nicholas Nunley <nicholas.d.nunley@intel.com>
Date: Fri, 4 Jun 2021 09:48:53 -0700
Subject: [PATCH 40/96] iavf: check for null in iavf_fix_features

If the driver has lost contact with the PF then it enters a disabled state
and frees adapter->vf_res. However, ndo_fix_features can still be called on
the interface, so we need to check for this condition first. Since we have
no information on the features at this time simply leave them unmodified
and return.

Fixes: c4445aedfe09 ("i40evf: Fix VLAN features")
Signed-off-by: Nicholas Nunley <nicholas.d.nunley@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 847d67e32a540..561171507cda5 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -3503,7 +3503,8 @@ static netdev_features_t iavf_fix_features(struct net_device *netdev,
 {
 	struct iavf_adapter *adapter = netdev_priv(netdev);
 
-	if (!(adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN))
+	if (adapter->vf_res &&
+	    !(adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN))
 		features &= ~(NETIF_F_HW_VLAN_CTAG_TX |
 			      NETIF_F_HW_VLAN_CTAG_RX |
 			      NETIF_F_HW_VLAN_CTAG_FILTER);

From 89f22f129696ab53cfbc608e0a2184d0fea46ac1 Mon Sep 17 00:00:00 2001
From: Nicholas Nunley <nicholas.d.nunley@intel.com>
Date: Fri, 4 Jun 2021 09:48:54 -0700
Subject: [PATCH 41/96] iavf: free q_vectors before queues in iavf_disable_vf

iavf_free_queues() clears adapter->num_active_queues, which
iavf_free_q_vectors() relies on, so swap the order of these two function
calls in iavf_disable_vf(). This resolves a panic encountered when the
interface is disabled and then later brought up again after PF
communication is restored.

Fixes: 65c7006f234c ("i40evf: assign num_active_queues inside i40evf_alloc_queues")
Signed-off-by: Nicholas Nunley <nicholas.d.nunley@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 561171507cda5..c23fff5a4bd9c 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -2123,8 +2123,8 @@ static void iavf_disable_vf(struct iavf_adapter *adapter)
 
 	iavf_free_misc_irq(adapter);
 	iavf_reset_interrupt_capability(adapter);
-	iavf_free_queues(adapter);
 	iavf_free_q_vectors(adapter);
+	iavf_free_queues(adapter);
 	memset(adapter->vf_res, 0, IAVF_VIRTCHNL_VF_RESOURCE_SIZE);
 	iavf_shutdown_adminq(&adapter->hw);
 	adapter->netdev->flags &= ~IFF_UP;

From 2135a8d5c8186bc92901dc00f179ffd50e54c2ac Mon Sep 17 00:00:00 2001
From: Nicholas Nunley <nicholas.d.nunley@intel.com>
Date: Fri, 4 Jun 2021 09:48:55 -0700
Subject: [PATCH 42/96] iavf: don't clear a lock we don't hold

In iavf_configure_clsflower() the function will bail out if it is unable
to obtain the crit_section lock in a reasonable time. However, it will
clear the lock when exiting, so fix this.

Fixes: 640a8af5841f ("i40evf: Reorder configure_clsflower to avoid deadlock on error")
Signed-off-by: Nicholas Nunley <nicholas.d.nunley@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_main.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index c23fff5a4bd9c..28661e4425f10 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -3095,8 +3095,10 @@ static int iavf_configure_clsflower(struct iavf_adapter *adapter,
 		return -ENOMEM;
 
 	while (!mutex_trylock(&adapter->crit_lock)) {
-		if (--count == 0)
-			goto err;
+		if (--count == 0) {
+			kfree(filter);
+			return err;
+		}
 		udelay(1);
 	}
 

From 8905072a192fffe9389255489db250c73ecab008 Mon Sep 17 00:00:00 2001
From: Piotr Marczak <piotr.marczak@intel.com>
Date: Fri, 4 Jun 2021 09:48:56 -0700
Subject: [PATCH 43/96] iavf: Fix failure to exit out from last all-multicast
 mode

The driver could only quit allmulti when allmulti and promisc modes are
turn on at the same time. If promisc had been off there was no way to turn
off allmulti mode.
The patch corrects this behavior. Switching allmulti does not depends on
promisc state mode anymore

Fixes: f42a5c74da99 ("i40e: Add allmulti support for the VF")
Signed-off-by: Piotr Marczak <piotr.marczak@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_main.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 28661e4425f10..76c4ca0f055ee 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -1639,8 +1639,7 @@ static int iavf_process_aq_command(struct iavf_adapter *adapter)
 		iavf_set_promiscuous(adapter, FLAG_VF_MULTICAST_PROMISC);
 		return 0;
 	}
-
-	if ((adapter->aq_required & IAVF_FLAG_AQ_RELEASE_PROMISC) &&
+	if ((adapter->aq_required & IAVF_FLAG_AQ_RELEASE_PROMISC) ||
 	    (adapter->aq_required & IAVF_FLAG_AQ_RELEASE_ALLMULTI)) {
 		iavf_set_promiscuous(adapter, 0);
 		return 0;

From 4f0400803818f2642f066d3eacaf013f23554cc7 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Fri, 4 Jun 2021 09:48:57 -0700
Subject: [PATCH 44/96] iavf: prevent accidental free of filter structure

In iavf_config_clsflower, the filter structure could be accidentally
released at the end, if iavf_parse_cls_flower or iavf_handle_tclass ever
return a non-zero but positive value.

In this case, the function continues through to the end, and will call
kfree() on the filter structure even though it has been added to the
linked list.

This can actually happen because iavf_parse_cls_flower will return
a positive IAVF_ERR_CONFIG value instead of the traditional negative
error codes.

Fix this by ensuring that the kfree() check and error checks are
similar. Use the more idiomatic "if (err)" to catch all non-zero error
codes.

Fixes: 0075fa0fadd0 ("i40evf: Add support to apply cloud filters")
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 76c4ca0f055ee..9c68c8628512e 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -3108,11 +3108,11 @@ static int iavf_configure_clsflower(struct iavf_adapter *adapter,
 	/* start out with flow type and eth type IPv4 to begin with */
 	filter->f.flow_type = VIRTCHNL_TCP_V4_FLOW;
 	err = iavf_parse_cls_flower(adapter, cls_flower, filter);
-	if (err < 0)
+	if (err)
 		goto err;
 
 	err = iavf_handle_tclass(adapter, tc, filter);
-	if (err < 0)
+	if (err)
 		goto err;
 
 	/* add filter to the list */

From 131b0edc4028bb88bb472456b1ddba526cfb7036 Mon Sep 17 00:00:00 2001
From: Mitch Williams <mitch.a.williams@intel.com>
Date: Fri, 4 Jun 2021 09:48:58 -0700
Subject: [PATCH 45/96] iavf: validate pointers

In some cases, the ethtool get_rxfh handler may be called with a null
key or indir parameter. So check these pointers, or you will have a very
bad day.

Fixes: 43a3d9ba34c9 ("i40evf: Allow PF driver to configure RSS")
Signed-off-by: Mitch Williams <mitch.a.williams@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
index 136c801f5584a..25ee0606e625f 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
@@ -1859,14 +1859,13 @@ static int iavf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
 
 	if (hfunc)
 		*hfunc = ETH_RSS_HASH_TOP;
-	if (!indir)
-		return 0;
-
-	memcpy(key, adapter->rss_key, adapter->rss_key_size);
+	if (key)
+		memcpy(key, adapter->rss_key, adapter->rss_key_size);
 
-	/* Each 32 bits pointed by 'indir' is stored with a lut entry */
-	for (i = 0; i < adapter->rss_lut_size; i++)
-		indir[i] = (u32)adapter->rss_lut[i];
+	if (indir)
+		/* Each 32 bits pointed by 'indir' is stored with a lut entry */
+		for (i = 0; i < adapter->rss_lut_size; i++)
+			indir[i] = (u32)adapter->rss_lut[i];
 
 	return 0;
 }

From 321421b57a12e933f92b228e0e6d0b2c6541f41d Mon Sep 17 00:00:00 2001
From: Surabhi Boob <surabhi.boob@intel.com>
Date: Fri, 4 Jun 2021 09:48:59 -0700
Subject: [PATCH 46/96] iavf: Fix for the false positive ASQ/ARQ errors while
 issuing VF reset

While issuing VF Reset from the guest OS, the VF driver prints
logs about critical / Overflow error detection. This is not an
actual error since the VF_MBX_ARQLEN register is set to all FF's
for a short period of time and the VF would catch the bits set if
it was reading the register during that spike of time.
This patch introduces an additional check to ignore this condition
since the VF is in reset.

Fixes: 19b73d8efaa4 ("i40evf: Add additional check for reset")
Signed-off-by: Surabhi Boob <surabhi.boob@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 9c68c8628512e..9ca9208aa8965 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -2409,7 +2409,7 @@ static void iavf_adminq_task(struct work_struct *work)
 
 	/* check for error indications */
 	val = rd32(hw, hw->aq.arq.len);
-	if (val == 0xdeadbeef) /* indicates device in reset */
+	if (val == 0xdeadbeef || val == 0xffffffff) /* device in reset */
 		goto freedom;
 	oldval = val;
 	if (val & IAVF_VF_ARQLEN1_ARQVFE_MASK) {

From 9a6e9e483a9684a34573fd9f9e30ecfb047cb8cb Mon Sep 17 00:00:00 2001
From: Grzegorz Szczurek <grzegorzx.szczurek@intel.com>
Date: Fri, 4 Jun 2021 09:49:00 -0700
Subject: [PATCH 47/96] iavf: Fix for setting queues to 0

Now setting combine to 0 will be rejected with the
appropriate error code.
This has been implemented by adding a condition that checks
the value of combine equal to zero.
Without this patch, when the user requested it, no error was
returned and combine was set to the default value for VF.

Fixes: 5520deb15326 ("iavf: Enable support for up to 16 queues")
Signed-off-by: Grzegorz Szczurek <grzegorzx.szczurek@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
index 25ee0606e625f..144a776793597 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c
@@ -1787,7 +1787,7 @@ static int iavf_set_channels(struct net_device *netdev,
 	/* All of these should have already been checked by ethtool before this
 	 * even gets to us, but just to be sure.
 	 */
-	if (num_req > adapter->vsi_res->num_queue_pairs)
+	if (num_req == 0 || num_req > adapter->vsi_res->num_queue_pairs)
 		return -EINVAL;
 
 	if (num_req == adapter->num_active_queues)

From 4293014230b887d94b68aa460ff00153454a3709 Mon Sep 17 00:00:00 2001
From: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
Date: Fri, 4 Jun 2021 09:53:27 -0700
Subject: [PATCH 48/96] iavf: Restore VLAN filters after link down

Restore VLAN filters after the link is brought down, and up - since all
filters are deleted from HW during the netdev link down routine.

Fixes: ed1f5b58ea01 ("i40evf: remove VLAN filters on close")
Signed-off-by: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
Tested-by: George Kuruvinakunnel <george.kuruvinakunnel@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf.h      |  1 +
 drivers/net/ethernet/intel/iavf/iavf_main.c | 35 ++++++++++++++++++---
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h
index e6e7c1da47fbe..75635bd57cf6d 100644
--- a/drivers/net/ethernet/intel/iavf/iavf.h
+++ b/drivers/net/ethernet/intel/iavf/iavf.h
@@ -39,6 +39,7 @@
 #include "iavf_txrx.h"
 #include "iavf_fdir.h"
 #include "iavf_adv_rss.h"
+#include <linux/bitmap.h>
 
 #define DEFAULT_DEBUG_LEVEL_SHIFT 3
 #define PFX "iavf: "
diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 9ca9208aa8965..336e6bf95e48f 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -696,6 +696,23 @@ static void iavf_del_vlan(struct iavf_adapter *adapter, u16 vlan)
 	spin_unlock_bh(&adapter->mac_vlan_list_lock);
 }
 
+/**
+ * iavf_restore_filters
+ * @adapter: board private structure
+ *
+ * Restore existing non MAC filters when VF netdev comes back up
+ **/
+static void iavf_restore_filters(struct iavf_adapter *adapter)
+{
+	/* re-add all VLAN filters */
+	if (VLAN_ALLOWED(adapter)) {
+		u16 vid;
+
+		for_each_set_bit(vid, adapter->vsi.active_vlans, VLAN_N_VID)
+			iavf_add_vlan(adapter, vid);
+	}
+}
+
 /**
  * iavf_vlan_rx_add_vid - Add a VLAN filter to a device
  * @netdev: network device struct
@@ -709,8 +726,11 @@ static int iavf_vlan_rx_add_vid(struct net_device *netdev,
 
 	if (!VLAN_ALLOWED(adapter))
 		return -EIO;
+
 	if (iavf_add_vlan(adapter, vid) == NULL)
 		return -ENOMEM;
+
+	set_bit(vid, adapter->vsi.active_vlans);
 	return 0;
 }
 
@@ -725,11 +745,13 @@ static int iavf_vlan_rx_kill_vid(struct net_device *netdev,
 {
 	struct iavf_adapter *adapter = netdev_priv(netdev);
 
-	if (VLAN_ALLOWED(adapter)) {
-		iavf_del_vlan(adapter, vid);
-		return 0;
-	}
-	return -EIO;
+	if (!VLAN_ALLOWED(adapter))
+		return -EIO;
+
+	iavf_del_vlan(adapter, vid);
+	clear_bit(vid, adapter->vsi.active_vlans);
+
+	return 0;
 }
 
 /**
@@ -3309,6 +3331,9 @@ static int iavf_open(struct net_device *netdev)
 
 	spin_unlock_bh(&adapter->mac_vlan_list_lock);
 
+	/* Restore VLAN filters that were removed with IFF_DOWN */
+	iavf_restore_filters(adapter);
+
 	iavf_configure(adapter);
 
 	iavf_up_complete(adapter);

From 5e0bc3082e2e403ac0753e099c2b01446bb35578 Mon Sep 17 00:00:00 2001
From: Dmitrii Banshchikov <me@ubique.spb.ru>
Date: Sat, 13 Nov 2021 18:22:26 +0400
Subject: [PATCH 49/96] bpf: Forbid bpf_ktime_get_coarse_ns and bpf_timer_* in
 tracing progs

Use of bpf_ktime_get_coarse_ns() and bpf_timer_* helpers in tracing
progs may result in locking issues.

bpf_ktime_get_coarse_ns() uses ktime_get_coarse_ns() time accessor that
isn't safe for any context:
======================================================
WARNING: possible circular locking dependency detected
5.15.0-syzkaller #0 Not tainted
------------------------------------------------------
syz-executor.4/14877 is trying to acquire lock:
ffffffff8cb30008 (tk_core.seq.seqcount){----}-{0:0}, at: ktime_get_coarse_ts64+0x25/0x110 kernel/time/timekeeping.c:2255

but task is already holding lock:
ffffffff90dbf200 (&obj_hash[i].lock){-.-.}-{2:2}, at: debug_object_deactivate+0x61/0x400 lib/debugobjects.c:735

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (&obj_hash[i].lock){-.-.}-{2:2}:
       lock_acquire+0x19f/0x4d0 kernel/locking/lockdep.c:5625
       __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
       _raw_spin_lock_irqsave+0xd1/0x120 kernel/locking/spinlock.c:162
       __debug_object_init+0xd9/0x1860 lib/debugobjects.c:569
       debug_hrtimer_init kernel/time/hrtimer.c:414 [inline]
       debug_init kernel/time/hrtimer.c:468 [inline]
       hrtimer_init+0x20/0x40 kernel/time/hrtimer.c:1592
       ntp_init_cmos_sync kernel/time/ntp.c:676 [inline]
       ntp_init+0xa1/0xad kernel/time/ntp.c:1095
       timekeeping_init+0x512/0x6bf kernel/time/timekeeping.c:1639
       start_kernel+0x267/0x56e init/main.c:1030
       secondary_startup_64_no_verify+0xb1/0xbb

-> #0 (tk_core.seq.seqcount){----}-{0:0}:
       check_prev_add kernel/locking/lockdep.c:3051 [inline]
       check_prevs_add kernel/locking/lockdep.c:3174 [inline]
       validate_chain+0x1dfb/0x8240 kernel/locking/lockdep.c:3789
       __lock_acquire+0x1382/0x2b00 kernel/locking/lockdep.c:5015
       lock_acquire+0x19f/0x4d0 kernel/locking/lockdep.c:5625
       seqcount_lockdep_reader_access+0xfe/0x230 include/linux/seqlock.h:103
       ktime_get_coarse_ts64+0x25/0x110 kernel/time/timekeeping.c:2255
       ktime_get_coarse include/linux/timekeeping.h:120 [inline]
       ktime_get_coarse_ns include/linux/timekeeping.h:126 [inline]
       ____bpf_ktime_get_coarse_ns kernel/bpf/helpers.c:173 [inline]
       bpf_ktime_get_coarse_ns+0x7e/0x130 kernel/bpf/helpers.c:171
       bpf_prog_a99735ebafdda2f1+0x10/0xb50
       bpf_dispatcher_nop_func include/linux/bpf.h:721 [inline]
       __bpf_prog_run include/linux/filter.h:626 [inline]
       bpf_prog_run include/linux/filter.h:633 [inline]
       BPF_PROG_RUN_ARRAY include/linux/bpf.h:1294 [inline]
       trace_call_bpf+0x2cf/0x5d0 kernel/trace/bpf_trace.c:127
       perf_trace_run_bpf_submit+0x7b/0x1d0 kernel/events/core.c:9708
       perf_trace_lock+0x37c/0x440 include/trace/events/lock.h:39
       trace_lock_release+0x128/0x150 include/trace/events/lock.h:58
       lock_release+0x82/0x810 kernel/locking/lockdep.c:5636
       __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:149 [inline]
       _raw_spin_unlock_irqrestore+0x75/0x130 kernel/locking/spinlock.c:194
       debug_hrtimer_deactivate kernel/time/hrtimer.c:425 [inline]
       debug_deactivate kernel/time/hrtimer.c:481 [inline]
       __run_hrtimer kernel/time/hrtimer.c:1653 [inline]
       __hrtimer_run_queues+0x2f9/0xa60 kernel/time/hrtimer.c:1749
       hrtimer_interrupt+0x3b3/0x1040 kernel/time/hrtimer.c:1811
       local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1086 [inline]
       __sysvec_apic_timer_interrupt+0xf9/0x270 arch/x86/kernel/apic/apic.c:1103
       sysvec_apic_timer_interrupt+0x8c/0xb0 arch/x86/kernel/apic/apic.c:1097
       asm_sysvec_apic_timer_interrupt+0x12/0x20
       __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline]
       _raw_spin_unlock_irqrestore+0xd4/0x130 kernel/locking/spinlock.c:194
       try_to_wake_up+0x702/0xd20 kernel/sched/core.c:4118
       wake_up_process kernel/sched/core.c:4200 [inline]
       wake_up_q+0x9a/0xf0 kernel/sched/core.c:953
       futex_wake+0x50f/0x5b0 kernel/futex/waitwake.c:184
       do_futex+0x367/0x560 kernel/futex/syscalls.c:127
       __do_sys_futex kernel/futex/syscalls.c:199 [inline]
       __se_sys_futex+0x401/0x4b0 kernel/futex/syscalls.c:180
       do_syscall_x64 arch/x86/entry/common.c:50 [inline]
       do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80
       entry_SYSCALL_64_after_hwframe+0x44/0xae

There is a possible deadlock with bpf_timer_* set of helpers:
hrtimer_start()
  lock_base();
  trace_hrtimer...()
    perf_event()
      bpf_run()
        bpf_timer_start()
          hrtimer_start()
            lock_base()         <- DEADLOCK

Forbid use of bpf_ktime_get_coarse_ns() and bpf_timer_* helpers in
BPF_PROG_TYPE_KPROBE, BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_PERF_EVENT
and BPF_PROG_TYPE_RAW_TRACEPOINT prog types.

Fixes: d05512618056 ("bpf: Add bpf_ktime_get_coarse_ns helper")
Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.")
Reported-by: syzbot+43fd005b5a1b4d10781e@syzkaller.appspotmail.com
Signed-off-by: Dmitrii Banshchikov <me@ubique.spb.ru>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211113142227.566439-2-me@ubique.spb.ru
---
 kernel/bpf/cgroup.c      | 2 ++
 kernel/bpf/helpers.c     | 2 --
 kernel/bpf/verifier.c    | 7 +++++++
 kernel/trace/bpf_trace.c | 2 --
 net/core/filter.c        | 6 ++++++
 net/ipv4/bpf_tcp_ca.c    | 2 ++
 6 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 2ca643af9a548..43eb3501721b7 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1809,6 +1809,8 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sysctl_get_new_value_proto;
 	case BPF_FUNC_sysctl_set_new_value:
 		return &bpf_sysctl_set_new_value_proto;
+	case BPF_FUNC_ktime_get_coarse_ns:
+		return &bpf_ktime_get_coarse_ns_proto;
 	default:
 		return cgroup_base_func_proto(func_id, prog);
 	}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1ffd469c217fa..649f07623df6c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1364,8 +1364,6 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_ktime_get_ns_proto;
 	case BPF_FUNC_ktime_get_boot_ns:
 		return &bpf_ktime_get_boot_ns_proto;
-	case BPF_FUNC_ktime_get_coarse_ns:
-		return &bpf_ktime_get_coarse_ns_proto;
 	case BPF_FUNC_ringbuf_output:
 		return &bpf_ringbuf_output_proto;
 	case BPF_FUNC_ringbuf_reserve:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index aab7482ed1c38..65d2f93b70307 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11632,6 +11632,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 		}
 	}
 
+	if (map_value_has_timer(map)) {
+		if (is_tracing_prog_type(prog_type)) {
+			verbose(env, "tracing progs cannot use bpf_timer yet\n");
+			return -EINVAL;
+		}
+	}
+
 	if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
 	    !bpf_offload_prog_map_match(prog, map)) {
 		verbose(env, "offload device mismatch between prog and map\n");
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7396488793ff7..ae9755037b7ee 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1111,8 +1111,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_ktime_get_ns_proto;
 	case BPF_FUNC_ktime_get_boot_ns:
 		return &bpf_ktime_get_boot_ns_proto;
-	case BPF_FUNC_ktime_get_coarse_ns:
-		return &bpf_ktime_get_coarse_ns_proto;
 	case BPF_FUNC_tail_call:
 		return &bpf_tail_call_proto;
 	case BPF_FUNC_get_current_pid_tgid:
diff --git a/net/core/filter.c b/net/core/filter.c
index e471c9b096705..6102f093d59a5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7162,6 +7162,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
 	case BPF_FUNC_sk_storage_get:
 		return &bpf_sk_storage_get_cg_sock_proto;
+	case BPF_FUNC_ktime_get_coarse_ns:
+		return &bpf_ktime_get_coarse_ns_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -10327,6 +10329,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
 		return &sk_reuseport_load_bytes_relative_proto;
 	case BPF_FUNC_get_socket_cookie:
 		return &bpf_get_socket_ptr_cookie_proto;
+	case BPF_FUNC_ktime_get_coarse_ns:
+		return &bpf_ktime_get_coarse_ns_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -10833,6 +10837,8 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_skc_to_unix_sock:
 		func = &bpf_skc_to_unix_sock_proto;
 		break;
+	case BPF_FUNC_ktime_get_coarse_ns:
+		return &bpf_ktime_get_coarse_ns_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 2cf02b4d77fbf..4bb9401b0a3fe 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -205,6 +205,8 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
 		    offsetof(struct tcp_congestion_ops, release))
 			return &bpf_sk_getsockopt_proto;
 		return NULL;
+	case BPF_FUNC_ktime_get_coarse_ns:
+		return &bpf_ktime_get_coarse_ns_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}

From e60e6962c503f337531f80e2752423b5bd885443 Mon Sep 17 00:00:00 2001
From: Dmitrii Banshchikov <me@ubique.spb.ru>
Date: Sat, 13 Nov 2021 18:22:27 +0400
Subject: [PATCH 50/96] selftests/bpf: Add tests for restricted helpers

This patch adds tests that bpf_ktime_get_coarse_ns(), bpf_timer_* and
bpf_spin_lock()/bpf_spin_unlock() helpers are forbidden in tracing progs
as their use there may result in various locking issues.

Signed-off-by: Dmitrii Banshchikov <me@ubique.spb.ru>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20211113142227.566439-3-me@ubique.spb.ru
---
 .../bpf/prog_tests/helper_restricted.c        |  33 +++
 .../bpf/progs/test_helper_restricted.c        | 123 +++++++++++
 tools/testing/selftests/bpf/test_verifier.c   |  46 +++-
 .../bpf/verifier/helper_restricted.c          | 196 ++++++++++++++++++
 4 files changed, 397 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/helper_restricted.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_helper_restricted.c
 create mode 100644 tools/testing/selftests/bpf/verifier/helper_restricted.c

diff --git a/tools/testing/selftests/bpf/prog_tests/helper_restricted.c b/tools/testing/selftests/bpf/prog_tests/helper_restricted.c
new file mode 100644
index 0000000000000..e1de5f80c3b23
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/helper_restricted.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include "test_helper_restricted.skel.h"
+
+void test_helper_restricted(void)
+{
+	int prog_i = 0, prog_cnt;
+	int duration = 0;
+
+	do {
+		struct test_helper_restricted *test;
+		int maybeOK;
+
+		test = test_helper_restricted__open();
+		if (!ASSERT_OK_PTR(test, "open"))
+			return;
+
+		prog_cnt = test->skeleton->prog_cnt;
+
+		for (int j = 0; j < prog_cnt; ++j) {
+			struct bpf_program *prog = *test->skeleton->progs[j].prog;
+
+			maybeOK = bpf_program__set_autoload(prog, prog_i == j);
+			ASSERT_OK(maybeOK, "set autoload");
+		}
+
+		maybeOK = test_helper_restricted__load(test);
+		CHECK(!maybeOK, test->skeleton->progs[prog_i].name, "helper isn't restricted");
+
+		test_helper_restricted__destroy(test);
+	} while (++prog_i < prog_cnt);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_helper_restricted.c b/tools/testing/selftests/bpf/progs/test_helper_restricted.c
new file mode 100644
index 0000000000000..68d64c365f905
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_helper_restricted.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <time.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+struct timer {
+	struct bpf_timer t;
+};
+
+struct lock {
+	struct bpf_spin_lock l;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, struct timer);
+} timers SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, struct lock);
+} locks SEC(".maps");
+
+static int timer_cb(void *map, int *key, struct timer *timer)
+{
+	return 0;
+}
+
+static void timer_work(void)
+{
+	struct timer *timer;
+	const int key = 0;
+
+	timer  = bpf_map_lookup_elem(&timers, &key);
+	if (timer) {
+		bpf_timer_init(&timer->t, &timers, CLOCK_MONOTONIC);
+		bpf_timer_set_callback(&timer->t, timer_cb);
+		bpf_timer_start(&timer->t, 10E9, 0);
+		bpf_timer_cancel(&timer->t);
+	}
+}
+
+static void spin_lock_work(void)
+{
+	const int key = 0;
+	struct lock *lock;
+
+	lock = bpf_map_lookup_elem(&locks, &key);
+	if (lock) {
+		bpf_spin_lock(&lock->l);
+		bpf_spin_unlock(&lock->l);
+	}
+}
+
+SEC("raw_tp/sys_enter")
+int raw_tp_timer(void *ctx)
+{
+	timer_work();
+
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_nanosleep")
+int tp_timer(void *ctx)
+{
+	timer_work();
+
+	return 0;
+}
+
+SEC("kprobe/sys_nanosleep")
+int kprobe_timer(void *ctx)
+{
+	timer_work();
+
+	return 0;
+}
+
+SEC("perf_event")
+int perf_event_timer(void *ctx)
+{
+	timer_work();
+
+	return 0;
+}
+
+SEC("raw_tp/sys_enter")
+int raw_tp_spin_lock(void *ctx)
+{
+	spin_lock_work();
+
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_nanosleep")
+int tp_spin_lock(void *ctx)
+{
+	spin_lock_work();
+
+	return 0;
+}
+
+SEC("kprobe/sys_nanosleep")
+int kprobe_spin_lock(void *ctx)
+{
+	spin_lock_work();
+
+	return 0;
+}
+
+SEC("perf_event")
+int perf_event_spin_lock(void *ctx)
+{
+	spin_lock_work();
+
+	return 0;
+}
+
+const char LICENSE[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 25afe423b3f06..465ef3f112c0c 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -92,6 +92,7 @@ struct bpf_test {
 	int fixup_map_event_output[MAX_FIXUPS];
 	int fixup_map_reuseport_array[MAX_FIXUPS];
 	int fixup_map_ringbuf[MAX_FIXUPS];
+	int fixup_map_timer[MAX_FIXUPS];
 	/* Expected verifier log output for result REJECT or VERBOSE_ACCEPT.
 	 * Can be a tab-separated sequence of expected strings. An empty string
 	 * means no log verification.
@@ -604,8 +605,15 @@ static int create_cgroup_storage(bool percpu)
  *   int cnt;
  *   struct bpf_spin_lock l;
  * };
+ * struct bpf_timer {
+ *   __u64 :64;
+ *   __u64 :64;
+ * } __attribute__((aligned(8)));
+ * struct timer {
+ *   struct bpf_timer t;
+ * };
  */
-static const char btf_str_sec[] = "\0bpf_spin_lock\0val\0cnt\0l";
+static const char btf_str_sec[] = "\0bpf_spin_lock\0val\0cnt\0l\0bpf_timer\0timer\0t";
 static __u32 btf_raw_types[] = {
 	/* int */
 	BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
@@ -616,6 +624,11 @@ static __u32 btf_raw_types[] = {
 	BTF_TYPE_ENC(15, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
 	BTF_MEMBER_ENC(19, 1, 0), /* int cnt; */
 	BTF_MEMBER_ENC(23, 2, 32),/* struct bpf_spin_lock l; */
+	/* struct bpf_timer */                          /* [4] */
+	BTF_TYPE_ENC(25, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0), 16),
+	/* struct timer */                              /* [5] */
+	BTF_TYPE_ENC(35, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 16),
+	BTF_MEMBER_ENC(41, 4, 0), /* struct bpf_timer t; */
 };
 
 static int load_btf(void)
@@ -696,6 +709,29 @@ static int create_sk_storage_map(void)
 	return fd;
 }
 
+static int create_map_timer(void)
+{
+	struct bpf_create_map_attr attr = {
+		.name = "test_map",
+		.map_type = BPF_MAP_TYPE_ARRAY,
+		.key_size = 4,
+		.value_size = 16,
+		.max_entries = 1,
+		.btf_key_type_id = 1,
+		.btf_value_type_id = 5,
+	};
+	int fd, btf_fd;
+
+	btf_fd = load_btf();
+	if (btf_fd < 0)
+		return -1;
+	attr.btf_fd = btf_fd;
+	fd = bpf_create_map_xattr(&attr);
+	if (fd < 0)
+		printf("Failed to create map with timer\n");
+	return fd;
+}
+
 static char bpf_vlog[UINT_MAX >> 8];
 
 static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
@@ -722,6 +758,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
 	int *fixup_map_event_output = test->fixup_map_event_output;
 	int *fixup_map_reuseport_array = test->fixup_map_reuseport_array;
 	int *fixup_map_ringbuf = test->fixup_map_ringbuf;
+	int *fixup_map_timer = test->fixup_map_timer;
 
 	if (test->fill_helper) {
 		test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn));
@@ -907,6 +944,13 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
 			fixup_map_ringbuf++;
 		} while (*fixup_map_ringbuf);
 	}
+	if (*fixup_map_timer) {
+		map_fds[21] = create_map_timer();
+		do {
+			prog[*fixup_map_timer].imm = map_fds[21];
+			fixup_map_timer++;
+		} while (*fixup_map_timer);
+	}
 }
 
 struct libcap {
diff --git a/tools/testing/selftests/bpf/verifier/helper_restricted.c b/tools/testing/selftests/bpf/verifier/helper_restricted.c
new file mode 100644
index 0000000000000..a067b7098b970
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/helper_restricted.c
@@ -0,0 +1,196 @@
+{
+	"bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_KPROBE",
+	.insns = {
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ktime_get_coarse_ns),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.errstr = "unknown func bpf_ktime_get_coarse_ns",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_KPROBE,
+},
+{
+	"bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_TRACEPOINT",
+	.insns = {
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ktime_get_coarse_ns),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.errstr = "unknown func bpf_ktime_get_coarse_ns",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+	"bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_PERF_EVENT",
+	.insns = {
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ktime_get_coarse_ns),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.errstr = "unknown func bpf_ktime_get_coarse_ns",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_PERF_EVENT,
+},
+{
+	"bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT",
+	.insns = {
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ktime_get_coarse_ns),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.errstr = "unknown func bpf_ktime_get_coarse_ns",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT,
+},
+{
+	"bpf_timer_init isn restricted in BPF_PROG_TYPE_KPROBE",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_LD_MAP_FD(BPF_REG_2, 0),
+	BPF_MOV64_IMM(BPF_REG_3, 1),
+	BPF_EMIT_CALL(BPF_FUNC_timer_init),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_timer = { 3, 8 },
+	.errstr = "tracing progs cannot use bpf_timer yet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_KPROBE,
+},
+{
+	"bpf_timer_init is forbidden in BPF_PROG_TYPE_PERF_EVENT",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_LD_MAP_FD(BPF_REG_2, 0),
+	BPF_MOV64_IMM(BPF_REG_3, 1),
+	BPF_EMIT_CALL(BPF_FUNC_timer_init),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_timer = { 3, 8 },
+	.errstr = "tracing progs cannot use bpf_timer yet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_PERF_EVENT,
+},
+{
+	"bpf_timer_init is forbidden in BPF_PROG_TYPE_TRACEPOINT",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_LD_MAP_FD(BPF_REG_2, 0),
+	BPF_MOV64_IMM(BPF_REG_3, 1),
+	BPF_EMIT_CALL(BPF_FUNC_timer_init),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_timer = { 3, 8 },
+	.errstr = "tracing progs cannot use bpf_timer yet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+	"bpf_timer_init is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_LD_MAP_FD(BPF_REG_2, 0),
+	BPF_MOV64_IMM(BPF_REG_3, 1),
+	BPF_EMIT_CALL(BPF_FUNC_timer_init),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_timer = { 3, 8 },
+	.errstr = "tracing progs cannot use bpf_timer yet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT,
+},
+{
+	"bpf_spin_lock is forbidden in BPF_PROG_TYPE_KPROBE",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_EMIT_CALL(BPF_FUNC_spin_lock),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_spin_lock = { 3 },
+	.errstr = "tracing progs cannot use bpf_spin_lock yet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_KPROBE,
+},
+{
+	"bpf_spin_lock is forbidden in BPF_PROG_TYPE_TRACEPOINT",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_EMIT_CALL(BPF_FUNC_spin_lock),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_spin_lock = { 3 },
+	.errstr = "tracing progs cannot use bpf_spin_lock yet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+	"bpf_spin_lock is forbidden in BPF_PROG_TYPE_PERF_EVENT",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_EMIT_CALL(BPF_FUNC_spin_lock),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_spin_lock = { 3 },
+	.errstr = "tracing progs cannot use bpf_spin_lock yet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_PERF_EVENT,
+},
+{
+	"bpf_spin_lock is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT",
+	.insns = {
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_EMIT_CALL(BPF_FUNC_spin_lock),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_spin_lock = { 3 },
+	.errstr = "tracing progs cannot use bpf_spin_lock yet",
+	.result = REJECT,
+	.prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT,
+},

From 6060a6cb05e3223146a3c30a1977f136da6c85e7 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alexandr.lobakin@intel.com>
Date: Mon, 15 Nov 2021 14:07:41 +0100
Subject: [PATCH 51/96] samples/bpf: Fix build error due to -isystem removal

Since recent Kbuild updates we no longer include files from compiler
directories. However, samples/bpf/hbm_kern.h hasn't been tuned for
this (LLVM 13):

  CLANG-bpf  samples/bpf/hbm_out_kern.o
In file included from samples/bpf/hbm_out_kern.c:55:
samples/bpf/hbm_kern.h:12:10: fatal error: 'stddef.h' file not found
         ^~~~~~~~~~
1 error generated.
  CLANG-bpf  samples/bpf/hbm_edt_kern.o
In file included from samples/bpf/hbm_edt_kern.c:53:
samples/bpf/hbm_kern.h:12:10: fatal error: 'stddef.h' file not found
         ^~~~~~~~~~
1 error generated.

It is enough to just drop both stdbool.h and stddef.h from includes
to fix those.

Fixes: 04e85bbf71c9 ("isystem: delete global -isystem compile option")
Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Link: https://lore.kernel.org/bpf/20211115130741.3584-1-alexandr.lobakin@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/hbm_kern.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h
index 722b3fadb4675..1752a46a2b056 100644
--- a/samples/bpf/hbm_kern.h
+++ b/samples/bpf/hbm_kern.h
@@ -9,8 +9,6 @@
  * Include file for sample Host Bandwidth Manager (HBM) BPF programs
  */
 #define KBUILD_MODNAME "foo"
-#include <stddef.h>
-#include <stdbool.h>
 #include <uapi/linux/bpf.h>
 #include <uapi/linux/if_ether.h>
 #include <uapi/linux/if_packet.h>

From 353050be4c19e102178ccc05988101887c25ae53 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 9 Nov 2021 18:48:08 +0000
Subject: [PATCH 52/96] bpf: Fix toctou on read-only map's constant scalar
 tracking

Commit a23740ec43ba ("bpf: Track contents of read-only maps as scalars") is
checking whether maps are read-only both from BPF program side and user space
side, and then, given their content is constant, reading out their data via
map->ops->map_direct_value_addr() which is then subsequently used as known
scalar value for the register, that is, it is marked as __mark_reg_known()
with the read value at verification time. Before a23740ec43ba, the register
content was marked as an unknown scalar so the verifier could not make any
assumptions about the map content.

The current implementation however is prone to a TOCTOU race, meaning, the
value read as known scalar for the register is not guaranteed to be exactly
the same at a later point when the program is executed, and as such, the
prior made assumptions of the verifier with regards to the program will be
invalid which can cause issues such as OOB access, etc.

While the BPF_F_RDONLY_PROG map flag is always fixed and required to be
specified at map creation time, the map->frozen property is initially set to
false for the map given the map value needs to be populated, e.g. for global
data sections. Once complete, the loader "freezes" the map from user space
such that no subsequent updates/deletes are possible anymore. For the rest
of the lifetime of the map, this freeze one-time trigger cannot be undone
anymore after a successful BPF_MAP_FREEZE cmd return. Meaning, any new BPF_*
cmd calls which would update/delete map entries will be rejected with -EPERM
since map_get_sys_perms() removes the FMODE_CAN_WRITE permission. This also
means that pending update/delete map entries must still complete before this
guarantee is given. This corner case is not an issue for loaders since they
create and prepare such program private map in successive steps.

However, a malicious user is able to trigger this TOCTOU race in two different
ways: i) via userfaultfd, and ii) via batched updates. For i) userfaultfd is
used to expand the competition interval, so that map_update_elem() can modify
the contents of the map after map_freeze() and bpf_prog_load() were executed.
This works, because userfaultfd halts the parallel thread which triggered a
map_update_elem() at the time where we copy key/value from the user buffer and
this already passed the FMODE_CAN_WRITE capability test given at that time the
map was not "frozen". Then, the main thread performs the map_freeze() and
bpf_prog_load(), and once that had completed successfully, the other thread
is woken up to complete the pending map_update_elem() which then changes the
map content. For ii) the idea of the batched update is similar, meaning, when
there are a large number of updates to be processed, it can increase the
competition interval between the two. It is therefore possible in practice to
modify the contents of the map after executing map_freeze() and bpf_prog_load().

One way to fix both i) and ii) at the same time is to expand the use of the
map's map->writecnt. The latter was introduced in fc9702273e2e ("bpf: Add mmap()
support for BPF_MAP_TYPE_ARRAY") and further refined in 1f6cb19be2e2 ("bpf:
Prevent re-mmap()'ing BPF map as writable for initially r/o mapping") with
the rationale to make a writable mmap()'ing of a map mutually exclusive with
read-only freezing. The counter indicates writable mmap() mappings and then
prevents/fails the freeze operation. Its semantics can be expanded beyond
just mmap() by generally indicating ongoing write phases. This would essentially
span any parallel regular and batched flavor of update/delete operation and
then also have map_freeze() fail with -EBUSY. For the check_mem_access() in
the verifier we expand upon the bpf_map_is_rdonly() check ensuring that all
last pending writes have completed via bpf_map_write_active() test. Once the
map->frozen is set and bpf_map_write_active() indicates a map->writecnt of 0
only then we are really guaranteed to use the map's data as known constants.
For map->frozen being set and pending writes in process of still being completed
we fall back to marking that register as unknown scalar so we don't end up
making assumptions about it. With this, both TOCTOU reproducers from i) and
ii) are fixed.

Note that the map->writecnt has been converted into a atomic64 in the fix in
order to avoid a double freeze_mutex mutex_{un,}lock() pair when updating
map->writecnt in the various map update/delete BPF_* cmd flavors. Spanning
the freeze_mutex over entire map update/delete operations in syscall side
would not be possible due to then causing everything to be serialized.
Similarly, something like synchronize_rcu() after setting map->frozen to wait
for update/deletes to complete is not possible either since it would also
have to span the user copy which can sleep. On the libbpf side, this won't
break d66562fba1ce ("libbpf: Add BPF object skeleton support") as the
anonymous mmap()-ed "map initialization image" is remapped as a BPF map-backed
mmap()-ed memory where for .rodata it's non-writable.

Fixes: a23740ec43ba ("bpf: Track contents of read-only maps as scalars")
Reported-by: w1tcher.bupt@gmail.com
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  3 ++-
 kernel/bpf/syscall.c  | 57 +++++++++++++++++++++++++++----------------
 kernel/bpf/verifier.c | 17 ++++++++++++-
 3 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f715e8863f4de..e7a163a3146b6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -193,7 +193,7 @@ struct bpf_map {
 	atomic64_t usercnt;
 	struct work_struct work;
 	struct mutex freeze_mutex;
-	u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */
+	atomic64_t writecnt;
 };
 
 static inline bool map_value_has_spin_lock(const struct bpf_map *map)
@@ -1419,6 +1419,7 @@ void bpf_map_put(struct bpf_map *map);
 void *bpf_map_area_alloc(u64 size, int numa_node);
 void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
 void bpf_map_area_free(void *base);
+bool bpf_map_write_active(const struct bpf_map *map);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
 int  generic_map_lookup_batch(struct bpf_map *map,
 			      const union bpf_attr *attr,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 50f96ea4452a2..1033ee8c0caf0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -132,6 +132,21 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 	return map;
 }
 
+static void bpf_map_write_active_inc(struct bpf_map *map)
+{
+	atomic64_inc(&map->writecnt);
+}
+
+static void bpf_map_write_active_dec(struct bpf_map *map)
+{
+	atomic64_dec(&map->writecnt);
+}
+
+bool bpf_map_write_active(const struct bpf_map *map)
+{
+	return atomic64_read(&map->writecnt) != 0;
+}
+
 static u32 bpf_map_value_size(const struct bpf_map *map)
 {
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -601,11 +616,8 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma)
 {
 	struct bpf_map *map = vma->vm_file->private_data;
 
-	if (vma->vm_flags & VM_MAYWRITE) {
-		mutex_lock(&map->freeze_mutex);
-		map->writecnt++;
-		mutex_unlock(&map->freeze_mutex);
-	}
+	if (vma->vm_flags & VM_MAYWRITE)
+		bpf_map_write_active_inc(map);
 }
 
 /* called for all unmapped memory region (including initial) */
@@ -613,11 +625,8 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma)
 {
 	struct bpf_map *map = vma->vm_file->private_data;
 
-	if (vma->vm_flags & VM_MAYWRITE) {
-		mutex_lock(&map->freeze_mutex);
-		map->writecnt--;
-		mutex_unlock(&map->freeze_mutex);
-	}
+	if (vma->vm_flags & VM_MAYWRITE)
+		bpf_map_write_active_dec(map);
 }
 
 static const struct vm_operations_struct bpf_map_default_vmops = {
@@ -668,7 +677,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
 		goto out;
 
 	if (vma->vm_flags & VM_MAYWRITE)
-		map->writecnt++;
+		bpf_map_write_active_inc(map);
 out:
 	mutex_unlock(&map->freeze_mutex);
 	return err;
@@ -1139,6 +1148,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
+	bpf_map_write_active_inc(map);
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
 		goto err_put;
@@ -1174,6 +1184,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 free_key:
 	kvfree(key);
 err_put:
+	bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
@@ -1196,6 +1207,7 @@ static int map_delete_elem(union bpf_attr *attr)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
+	bpf_map_write_active_inc(map);
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
 		goto err_put;
@@ -1226,6 +1238,7 @@ static int map_delete_elem(union bpf_attr *attr)
 out:
 	kvfree(key);
 err_put:
+	bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
@@ -1533,6 +1546,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
+	bpf_map_write_active_inc(map);
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
 	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
@@ -1597,6 +1611,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 free_key:
 	kvfree(key);
 err_put:
+	bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
@@ -1624,8 +1639,7 @@ static int map_freeze(const union bpf_attr *attr)
 	}
 
 	mutex_lock(&map->freeze_mutex);
-
-	if (map->writecnt) {
+	if (bpf_map_write_active(map)) {
 		err = -EBUSY;
 		goto err_put;
 	}
@@ -4171,6 +4185,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 			    union bpf_attr __user *uattr,
 			    int cmd)
 {
+	bool has_read  = cmd == BPF_MAP_LOOKUP_BATCH ||
+			 cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
+	bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
 	struct bpf_map *map;
 	int err, ufd;
 	struct fd f;
@@ -4183,16 +4200,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-
-	if ((cmd == BPF_MAP_LOOKUP_BATCH ||
-	     cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
-	    !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+	if (has_write)
+		bpf_map_write_active_inc(map);
+	if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
 		err = -EPERM;
 		goto err_put;
 	}
-
-	if (cmd != BPF_MAP_LOOKUP_BATCH &&
-	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+	if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
 		goto err_put;
 	}
@@ -4205,8 +4219,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 		BPF_DO_BATCH(map->ops->map_update_batch);
 	else
 		BPF_DO_BATCH(map->ops->map_delete_batch);
-
 err_put:
+	if (has_write)
+		bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 65d2f93b70307..50efda51515b5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4056,7 +4056,22 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
 
 static bool bpf_map_is_rdonly(const struct bpf_map *map)
 {
-	return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen;
+	/* A map is considered read-only if the following condition are true:
+	 *
+	 * 1) BPF program side cannot change any of the map content. The
+	 *    BPF_F_RDONLY_PROG flag is throughout the lifetime of a map
+	 *    and was set at map creation time.
+	 * 2) The map value(s) have been initialized from user space by a
+	 *    loader and then "frozen", such that no new map update/delete
+	 *    operations from syscall side are possible for the rest of
+	 *    the map's lifetime from that point onwards.
+	 * 3) Any parallel/pending map update/delete operations from syscall
+	 *    side have been completed. Only after that point, it's safe to
+	 *    assume that map value(s) are immutable.
+	 */
+	return (map->map_flags & BPF_F_RDONLY_PROG) &&
+	       READ_ONCE(map->frozen) &&
+	       !bpf_map_write_active(map);
 }
 
 static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)

From 099f896f498a2b26d84f4ddae039b2c542c18b48 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Sun, 14 Nov 2021 20:40:06 -0800
Subject: [PATCH 53/96] udp: Validate checksum in udp_read_sock()

It turns out the skb's in sock receive queue could have bad checksums, as
both ->poll() and ->recvmsg() validate checksums. We have to do the same
for ->read_sock() path too before they are redirected in sockmap.

Fixes: d7f571188ecf ("udp: Implement ->read_sock() for sockmap")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20211115044006.26068-1-xiyou.wangcong@gmail.com
---
 net/ipv4/udp.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 319dd7bbfe33d..8bcecdd6aeda8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1807,6 +1807,17 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		skb = skb_recv_udp(sk, 0, 1, &err);
 		if (!skb)
 			return err;
+
+		if (udp_lib_checksum_complete(skb)) {
+			__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
+					IS_UDPLITE(sk));
+			__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
+					IS_UDPLITE(sk));
+			atomic_inc(&sk->sk_drops);
+			kfree_skb(skb);
+			continue;
+		}
+
 		used = recv_actor(desc, skb, 0, skb->len);
 		if (used <= 0) {
 			if (!copied)

From cc4a9cc03faa6d8db1a6954bb536f2c1e63bdff6 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Wed, 15 Sep 2021 13:25:31 +0300
Subject: [PATCH 54/96] net/mlx5e: kTLS, Fix crash in RX resync flow

For the TLS RX resync flow, we maintain a list of TLS contexts
that require some attention, to communicate their resync information
to the HW.
Here we fix list corruptions, by protecting the entries against
movements coming from resync_handle_seq_match(), until their resync
handling in napi is fully completed.

Fixes: e9ce991bce5b ("net/mlx5e: kTLS, Add resiliency to RX resync failures")
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../mellanox/mlx5/core/en_accel/ktls_rx.c     | 23 ++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
index 62abce008c7b8..a2a9f68579dd8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
@@ -55,6 +55,7 @@ struct mlx5e_ktls_offload_context_rx {
 	DECLARE_BITMAP(flags, MLX5E_NUM_PRIV_RX_FLAGS);
 
 	/* resync */
+	spinlock_t lock; /* protects resync fields */
 	struct mlx5e_ktls_rx_resync_ctx resync;
 	struct list_head list;
 };
@@ -386,14 +387,18 @@ static void resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_r
 	struct mlx5e_icosq *sq;
 	bool trigger_poll;
 
-	memcpy(info->rec_seq, &priv_rx->resync.sw_rcd_sn_be, sizeof(info->rec_seq));
-
 	sq = &c->async_icosq;
 	ktls_resync = sq->ktls_resync;
+	trigger_poll = false;
 
 	spin_lock_bh(&ktls_resync->lock);
-	list_add_tail(&priv_rx->list, &ktls_resync->list);
-	trigger_poll = !test_and_set_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &sq->state);
+	spin_lock_bh(&priv_rx->lock);
+	memcpy(info->rec_seq, &priv_rx->resync.sw_rcd_sn_be, sizeof(info->rec_seq));
+	if (list_empty(&priv_rx->list)) {
+		list_add_tail(&priv_rx->list, &ktls_resync->list);
+		trigger_poll = !test_and_set_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &sq->state);
+	}
+	spin_unlock_bh(&priv_rx->lock);
 	spin_unlock_bh(&ktls_resync->lock);
 
 	if (!trigger_poll)
@@ -617,6 +622,8 @@ int mlx5e_ktls_add_rx(struct net_device *netdev, struct sock *sk,
 	if (err)
 		goto err_create_key;
 
+	INIT_LIST_HEAD(&priv_rx->list);
+	spin_lock_init(&priv_rx->lock);
 	priv_rx->crypto_info  =
 		*(struct tls12_crypto_info_aes_gcm_128 *)crypto_info;
 
@@ -730,10 +737,14 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget)
 		priv_rx = list_first_entry(&local_list,
 					   struct mlx5e_ktls_offload_context_rx,
 					   list);
+		spin_lock(&priv_rx->lock);
 		cseg = post_static_params(sq, priv_rx);
-		if (IS_ERR(cseg))
+		if (IS_ERR(cseg)) {
+			spin_unlock(&priv_rx->lock);
 			break;
-		list_del(&priv_rx->list);
+		}
+		list_del_init(&priv_rx->list);
+		spin_unlock(&priv_rx->lock);
 		db_cseg = cseg;
 	}
 	if (db_cseg)

From 362980eada85b5ea691e5e0d9257a991aa7ade47 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@nvidia.com>
Date: Thu, 21 Oct 2021 18:15:10 +0300
Subject: [PATCH 55/96] net/mlx5e: Wait for concurrent flow deletion during
 neigh/fib events

Function mlx5e_take_tmp_flow() skips flows with zero reference count. This
can cause syndrome 0x179e84 when the called from neigh or route update code
and the skipped flow is not removed from the hardware by the time
underlying encap/decap resource is deleted. Add new completion
'del_hw_done' that is completed when flow is unoffloaded. This is safe to
do because flow with reference count zero needs to be detached from
encap/decap entry before its memory is deallocated, which requires taking
the encap_tbl_lock mutex that is held by the event handlers code.

Fixes: 8914add2c9e5 ("net/mlx5e: Handle FIB events to update tunnel endpoint device")
Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h      | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c | 8 +++++++-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c           | 2 ++
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
index 8f64f2c8895a9..b689701ac7d81 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
@@ -102,6 +102,7 @@ struct mlx5e_tc_flow {
 	refcount_t refcnt;
 	struct rcu_head rcu_head;
 	struct completion init_done;
+	struct completion del_hw_done;
 	int tunnel_id; /* the mapped tunnel id of this flow */
 	struct mlx5_flow_attr *attr;
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
index 660cca73c36c8..042b1abe1437f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
@@ -245,8 +245,14 @@ static void mlx5e_take_tmp_flow(struct mlx5e_tc_flow *flow,
 				struct list_head *flow_list,
 				int index)
 {
-	if (IS_ERR(mlx5e_flow_get(flow)))
+	if (IS_ERR(mlx5e_flow_get(flow))) {
+		/* Flow is being deleted concurrently. Wait for it to be
+		 * unoffloaded from hardware, otherwise deleting encap will
+		 * fail.
+		 */
+		wait_for_completion(&flow->del_hw_done);
 		return;
+	}
 	wait_for_completion(&flow->init_done);
 
 	flow->tmp_entry_index = index;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 835caa1c7b745..cb76c41fe163a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1600,6 +1600,7 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
 		else
 			mlx5e_tc_unoffload_fdb_rules(esw, flow, attr);
 	}
+	complete_all(&flow->del_hw_done);
 
 	if (mlx5_flow_has_geneve_opt(flow))
 		mlx5_geneve_tlv_option_del(priv->mdev->geneve);
@@ -4465,6 +4466,7 @@ mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size,
 	INIT_LIST_HEAD(&flow->l3_to_l2_reformat);
 	refcount_set(&flow->refcnt, 1);
 	init_completion(&flow->init_done);
+	init_completion(&flow->del_hw_done);
 
 	*__flow = flow;
 	*__parse_attr = parse_attr;

From d7751d6476185ff754b9dad2cba0c0a6e43ecadc Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@nvidia.com>
Date: Thu, 20 May 2021 17:09:58 +0300
Subject: [PATCH 56/96] net/mlx5: E-Switch, Fix resetting of encap mode when
 entering switchdev

E-Switch encap mode is relevant only when in switchdev mode.
The RDMA driver can query the encap configuration via
mlx5_eswitch_get_encap_mode(). Make sure it returns the currently
used mode and not the set one.

This reverts the cited commit which reset the encap mode
on entering switchdev and fixes the original issue properly.

Fixes: 9a64144d683a ("net/mlx5: E-Switch, Fix default encap mode")
Signed-off-by: Paul Blakey <paulb@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c        | 9 +++++++--
 .../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c   | 7 -------
 include/linux/mlx5/eswitch.h                             | 4 ++--
 3 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index ec136b4992045..5872cc8bf9532 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1572,6 +1572,11 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 	esw->enabled_vports = 0;
 	esw->mode = MLX5_ESWITCH_NONE;
 	esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
+	if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, reformat) &&
+	    MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap))
+		esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_BASIC;
+	else
+		esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE;
 
 	dev->priv.eswitch = esw;
 	BLOCKING_INIT_NOTIFIER_HEAD(&esw->n_head);
@@ -1934,7 +1939,7 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
 	return err;
 }
 
-u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev)
+u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev)
 {
 	struct mlx5_eswitch *esw = dev->priv.eswitch;
 
@@ -1948,7 +1953,7 @@ mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev)
 	struct mlx5_eswitch *esw;
 
 	esw = dev->priv.eswitch;
-	return mlx5_esw_allowed(esw) ? esw->offloads.encap :
+	return (mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS)  ? esw->offloads.encap :
 		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
 }
 EXPORT_SYMBOL(mlx5_eswitch_get_encap_mode);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index f4eaa58938860..80fa76f60e1e6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3183,12 +3183,6 @@ int esw_offloads_enable(struct mlx5_eswitch *esw)
 	u64 mapping_id;
 	int err;
 
-	if (MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, reformat) &&
-	    MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, decap))
-		esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_BASIC;
-	else
-		esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE;
-
 	mutex_init(&esw->offloads.termtbl_mutex);
 	mlx5_rdma_enable_roce(esw->dev);
 
@@ -3286,7 +3280,6 @@ void esw_offloads_disable(struct mlx5_eswitch *esw)
 	esw_offloads_metadata_uninit(esw);
 	mlx5_rdma_disable_roce(esw->dev);
 	mutex_destroy(&esw->offloads.termtbl_mutex);
-	esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE;
 }
 
 static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode)
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 97afcea39a7bf..8b18fe9771f97 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -145,13 +145,13 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw,
 	GENMASK(31 - ESW_TUN_ID_BITS - ESW_RESERVED_BITS, \
 		ESW_TUN_OPTS_OFFSET + 1)
 
-u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev);
+u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev);
 u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev);
 struct mlx5_core_dev *mlx5_eswitch_get_core_dev(struct mlx5_eswitch *esw);
 
 #else  /* CONFIG_MLX5_ESWITCH */
 
-static inline u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev)
+static inline u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev)
 {
 	return MLX5_ESWITCH_NONE;
 }

From 76ded29d3fcda4928da8849ffc446ea46871c1c2 Mon Sep 17 00:00:00 2001
From: Valentine Fatiev <valentinef@nvidia.com>
Date: Tue, 26 Oct 2021 11:42:41 +0300
Subject: [PATCH 57/96] net/mlx5e: nullify cq->dbg pointer in
 mlx5_debug_cq_remove()

Prior to this patch in case mlx5_core_destroy_cq() failed it proceeds
to rest of destroy operations. mlx5_core_destroy_cq() could be called again
by user and cause additional call of mlx5_debug_cq_remove().
cq->dbg was not nullify in previous call and cause the crash.

Fix it by nullify cq->dbg pointer after removal.

Also proceed to destroy operations only if FW return 0
for MLX5_CMD_OP_DESTROY_CQ command.

general protection fault, probably for non-canonical address 0x2000300004058: 0000 [#1] SMP PTI
CPU: 5 PID: 1228 Comm: python Not tainted 5.15.0-rc5_for_upstream_min_debug_2021_10_14_11_06 #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:lockref_get+0x1/0x60
Code: 5d e9 53 ff ff ff 48 8d 7f 70 e8 0a 2e 48 00 c7 85 d0 00 00 00 02
00 00 00 c6 45 70 00 fb 5d c3 c3 cc cc cc cc cc cc cc cc 53 <48> 8b 17
48 89 fb 85 d2 75 3d 48 89 d0 bf 64 00 00 00 48 89 c1 48
RSP: 0018:ffff888137dd7a38 EFLAGS: 00010206
RAX: 0000000000000000 RBX: ffff888107d5f458 RCX: 00000000fffffffe
RDX: 000000000002c2b0 RSI: ffffffff8155e2e0 RDI: 0002000300004058
RBP: ffff888137dd7a88 R08: 0002000300004058 R09: ffff8881144a9f88
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8881141d4000
R13: ffff888137dd7c68 R14: ffff888137dd7d58 R15: ffff888137dd7cc0
FS:  00007f4644f2a4c0(0000) GS:ffff8887a2d40000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055b4500f4380 CR3: 0000000114f7a003 CR4: 0000000000170ea0
Call Trace:
  simple_recursive_removal+0x33/0x2e0
  ? debugfs_remove+0x60/0x60
  debugfs_remove+0x40/0x60
  mlx5_debug_cq_remove+0x32/0x70 [mlx5_core]
  mlx5_core_destroy_cq+0x41/0x1d0 [mlx5_core]
  devx_obj_cleanup+0x151/0x330 [mlx5_ib]
  ? __pollwait+0xd0/0xd0
  ? xas_load+0x5/0x70
  ? xa_load+0x62/0xa0
  destroy_hw_idr_uobject+0x20/0x80 [ib_uverbs]
  uverbs_destroy_uobject+0x3b/0x360 [ib_uverbs]
  uobj_destroy+0x54/0xa0 [ib_uverbs]
  ib_uverbs_cmd_verbs+0xaf2/0x1160 [ib_uverbs]
  ? uverbs_finalize_object+0xd0/0xd0 [ib_uverbs]
  ib_uverbs_ioctl+0xc4/0x1b0 [ib_uverbs]
  __x64_sys_ioctl+0x3e4/0x8e0

Fixes: 94b960b9deff ("net/mlx5e: Fix memory leak in mlx5_core_destroy_cq() error path")
Signed-off-by: Valentine Fatiev <valentinef@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cq.c      | 5 +++--
 drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 4 +++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
index 02e77ffe5c3e4..5371ad0a12eb5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -164,13 +164,14 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
 	MLX5_SET(destroy_cq_in, in, cqn, cq->cqn);
 	MLX5_SET(destroy_cq_in, in, uid, cq->uid);
 	err = mlx5_cmd_exec_in(dev, destroy_cq, in);
+	if (err)
+		return err;
 
 	synchronize_irq(cq->irqn);
-
 	mlx5_cq_put(cq);
 	wait_for_completion(&cq->free);
 
-	return err;
+	return 0;
 }
 EXPORT_SYMBOL(mlx5_core_destroy_cq);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
index 07c8d9811bc81..10d195042ab55 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
@@ -507,6 +507,8 @@ void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
 	if (!mlx5_debugfs_root)
 		return;
 
-	if (cq->dbg)
+	if (cq->dbg) {
 		rem_res_tree(cq->dbg);
+		cq->dbg = NULL;
+	}
 }

From 9091b821aaa4c2d107ca8f97c32baefcb1e7e40d Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Wed, 3 Nov 2021 01:09:04 +0200
Subject: [PATCH 58/96] net/mlx5: DR, Handle eswitch manager and uplink vports
 separately

When querying eswitch manager vport capabilities as "other = 1",
we encounter a FW compatibility issue with older FW versions.
To maintain backward compatibility, eswitch manager vport should
be queried as "other = 0" vport both for ECPF and non-ECPF cases.

This patch fixes these queries and improves the code readability
by handling eswitch manager and uplink vports separately, avoiding
the excessive 'if' conditions. Also, uplink caps are stored similar
to esw manager and not as part of xarray.

Fixes: dd4acb2a0954 ("net/mlx5: DR, Add missing query for vport 0")
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../mellanox/mlx5/core/steering/dr_domain.c   | 56 ++++++++-----------
 .../mellanox/mlx5/core/steering/dr_types.h    |  1 +
 2 files changed, 24 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c
index 49089cbe897c6..8cbd36c82b3b8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c
@@ -135,25 +135,14 @@ static void dr_domain_fill_uplink_caps(struct mlx5dr_domain *dmn,
 
 static int dr_domain_query_vport(struct mlx5dr_domain *dmn,
 				 u16 vport_number,
+				 bool other_vport,
 				 struct mlx5dr_cmd_vport_cap *vport_caps)
 {
-	u16 cmd_vport = vport_number;
-	bool other_vport = true;
 	int ret;
 
-	if (vport_number == MLX5_VPORT_UPLINK) {
-		dr_domain_fill_uplink_caps(dmn, vport_caps);
-		return 0;
-	}
-
-	if (dmn->info.caps.is_ecpf && vport_number == MLX5_VPORT_ECPF) {
-		other_vport = false;
-		cmd_vport = 0;
-	}
-
 	ret = mlx5dr_cmd_query_esw_vport_context(dmn->mdev,
 						 other_vport,
-						 cmd_vport,
+						 vport_number,
 						 &vport_caps->icm_address_rx,
 						 &vport_caps->icm_address_tx);
 	if (ret)
@@ -161,7 +150,7 @@ static int dr_domain_query_vport(struct mlx5dr_domain *dmn,
 
 	ret = mlx5dr_cmd_query_gvmi(dmn->mdev,
 				    other_vport,
-				    cmd_vport,
+				    vport_number,
 				    &vport_caps->vport_gvmi);
 	if (ret)
 		return ret;
@@ -176,9 +165,15 @@ static int dr_domain_query_esw_mngr(struct mlx5dr_domain *dmn)
 {
 	return dr_domain_query_vport(dmn,
 				     dmn->info.caps.is_ecpf ? MLX5_VPORT_ECPF : 0,
+				     false,
 				     &dmn->info.caps.vports.esw_manager_caps);
 }
 
+static void dr_domain_query_uplink(struct mlx5dr_domain *dmn)
+{
+	dr_domain_fill_uplink_caps(dmn, &dmn->info.caps.vports.uplink_caps);
+}
+
 static struct mlx5dr_cmd_vport_cap *
 dr_domain_add_vport_cap(struct mlx5dr_domain *dmn, u16 vport)
 {
@@ -190,7 +185,7 @@ dr_domain_add_vport_cap(struct mlx5dr_domain *dmn, u16 vport)
 	if (!vport_caps)
 		return NULL;
 
-	ret = dr_domain_query_vport(dmn, vport, vport_caps);
+	ret = dr_domain_query_vport(dmn, vport, true, vport_caps);
 	if (ret) {
 		kvfree(vport_caps);
 		return NULL;
@@ -207,16 +202,26 @@ dr_domain_add_vport_cap(struct mlx5dr_domain *dmn, u16 vport)
 	return vport_caps;
 }
 
+static bool dr_domain_is_esw_mgr_vport(struct mlx5dr_domain *dmn, u16 vport)
+{
+	struct mlx5dr_cmd_caps *caps = &dmn->info.caps;
+
+	return (caps->is_ecpf && vport == MLX5_VPORT_ECPF) ||
+	       (!caps->is_ecpf && vport == 0);
+}
+
 struct mlx5dr_cmd_vport_cap *
 mlx5dr_domain_get_vport_cap(struct mlx5dr_domain *dmn, u16 vport)
 {
 	struct mlx5dr_cmd_caps *caps = &dmn->info.caps;
 	struct mlx5dr_cmd_vport_cap *vport_caps;
 
-	if ((caps->is_ecpf && vport == MLX5_VPORT_ECPF) ||
-	    (!caps->is_ecpf && vport == 0))
+	if (dr_domain_is_esw_mgr_vport(dmn, vport))
 		return &caps->vports.esw_manager_caps;
 
+	if (vport == MLX5_VPORT_UPLINK)
+		return &caps->vports.uplink_caps;
+
 vport_load:
 	vport_caps = xa_load(&caps->vports.vports_caps_xa, vport);
 	if (vport_caps)
@@ -241,17 +246,6 @@ static void dr_domain_clear_vports(struct mlx5dr_domain *dmn)
 	}
 }
 
-static int dr_domain_query_uplink(struct mlx5dr_domain *dmn)
-{
-	struct mlx5dr_cmd_vport_cap *vport_caps;
-
-	vport_caps = mlx5dr_domain_get_vport_cap(dmn, MLX5_VPORT_UPLINK);
-	if (!vport_caps)
-		return -EINVAL;
-
-	return 0;
-}
-
 static int dr_domain_query_fdb_caps(struct mlx5_core_dev *mdev,
 				    struct mlx5dr_domain *dmn)
 {
@@ -281,11 +275,7 @@ static int dr_domain_query_fdb_caps(struct mlx5_core_dev *mdev,
 		goto free_vports_caps_xa;
 	}
 
-	ret = dr_domain_query_uplink(dmn);
-	if (ret) {
-		mlx5dr_err(dmn, "Failed to query uplink vport caps (err: %d)", ret);
-		goto free_vports_caps_xa;
-	}
+	dr_domain_query_uplink(dmn);
 
 	return 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
index 3028b776da00c..2333c2439c287 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
@@ -764,6 +764,7 @@ struct mlx5dr_roce_cap {
 
 struct mlx5dr_vports {
 	struct mlx5dr_cmd_vport_cap esw_manager_caps;
+	struct mlx5dr_cmd_vport_cap uplink_caps;
 	struct xarray vports_caps_xa;
 };
 

From 455832d49666e1765acf812be79710b9f84a8cbf Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Wed, 3 Nov 2021 17:51:03 +0200
Subject: [PATCH 59/96] net/mlx5: DR, Fix check for unsupported fields in match
 param

The existing loop doesn't cast the buffer while scanning it, which
results in out-of-bounds read and failure to create the matcher.

Fixes: 941f19798a11 ("net/mlx5: DR, Add check for unsupported fields in match param")
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../ethernet/mellanox/mlx5/core/steering/dr_matcher.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c
index 75c775bee3510..793365242e852 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c
@@ -924,11 +924,12 @@ static int dr_matcher_init(struct mlx5dr_matcher *matcher,
 
 	/* Check that all mask data was consumed */
 	for (i = 0; i < consumed_mask.match_sz; i++) {
-		if (consumed_mask.match_buf[i]) {
-			mlx5dr_dbg(dmn, "Match param mask contains unsupported parameters\n");
-			ret = -EOPNOTSUPP;
-			goto free_consumed_mask;
-		}
+		if (!((u8 *)consumed_mask.match_buf)[i])
+			continue;
+
+		mlx5dr_dbg(dmn, "Match param mask contains unsupported parameters\n");
+		ret = -EOPNOTSUPP;
+		goto free_consumed_mask;
 	}
 
 	ret =  0;

From ba50cd9451f6c49cf0841c0a4a146ff6a2822699 Mon Sep 17 00:00:00 2001
From: Neta Ostrovsky <netao@nvidia.com>
Date: Wed, 27 Oct 2021 15:16:14 +0300
Subject: [PATCH 60/96] net/mlx5: Update error handler for UCTX and UMEM

In the fast unload flow, the device state is set to internal error,
which indicates that the driver started the destroy process.
In this case, when a destroy command is being executed, it should return
MLX5_CMD_STAT_OK.
Fix MLX5_CMD_OP_DESTROY_UCTX and MLX5_CMD_OP_DESTROY_UMEM to return OK
instead of EIO.

This fixes a call trace in the umem release process -
[ 2633.536695] Call Trace:
[ 2633.537518]  ib_uverbs_remove_one+0xc3/0x140 [ib_uverbs]
[ 2633.538596]  remove_client_context+0x8b/0xd0 [ib_core]
[ 2633.539641]  disable_device+0x8c/0x130 [ib_core]
[ 2633.540615]  __ib_unregister_device+0x35/0xa0 [ib_core]
[ 2633.541640]  ib_unregister_device+0x21/0x30 [ib_core]
[ 2633.542663]  __mlx5_ib_remove+0x38/0x90 [mlx5_ib]
[ 2633.543640]  auxiliary_bus_remove+0x1e/0x30 [auxiliary]
[ 2633.544661]  device_release_driver_internal+0x103/0x1f0
[ 2633.545679]  bus_remove_device+0xf7/0x170
[ 2633.546640]  device_del+0x181/0x410
[ 2633.547606]  mlx5_rescan_drivers_locked.part.10+0x63/0x160 [mlx5_core]
[ 2633.548777]  mlx5_unregister_device+0x27/0x40 [mlx5_core]
[ 2633.549841]  mlx5_uninit_one+0x21/0xc0 [mlx5_core]
[ 2633.550864]  remove_one+0x69/0xe0 [mlx5_core]
[ 2633.551819]  pci_device_remove+0x3b/0xc0
[ 2633.552731]  device_release_driver_internal+0x103/0x1f0
[ 2633.553746]  unbind_store+0xf6/0x130
[ 2633.554657]  kernfs_fop_write+0x116/0x190
[ 2633.555567]  vfs_write+0xa5/0x1a0
[ 2633.556407]  ksys_write+0x4f/0xb0
[ 2633.557233]  do_syscall_64+0x5b/0x1a0
[ 2633.558071]  entry_SYSCALL_64_after_hwframe+0x65/0xca
[ 2633.559018] RIP: 0033:0x7f9977132648
[ 2633.559821] Code: 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 55 6f 2d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 49 89 d4 55
[ 2633.562332] RSP: 002b:00007fffb1a83888 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[ 2633.563472] RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007f9977132648
[ 2633.564541] RDX: 000000000000000c RSI: 000055b90546e230 RDI: 0000000000000001
[ 2633.565596] RBP: 000055b90546e230 R08: 00007f9977406860 R09: 00007f9977a54740
[ 2633.566653] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f99774056e0
[ 2633.567692] R13: 000000000000000c R14: 00007f9977400880 R15: 000000000000000c
[ 2633.568725] ---[ end trace 10b4fe52945e544d ]---

Fixes: 6a6fabbfa3e8 ("net/mlx5: Update pci error handler entries and command translation")
Signed-off-by: Neta Ostrovsky <netao@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index f71ec4d9d68e3..8eaa24d865c55 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -339,6 +339,8 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_PAGE_FAULT_RESUME:
 	case MLX5_CMD_OP_QUERY_ESW_FUNCTIONS:
 	case MLX5_CMD_OP_DEALLOC_SF:
+	case MLX5_CMD_OP_DESTROY_UCTX:
+	case MLX5_CMD_OP_DESTROY_UMEM:
 		return MLX5_CMD_STAT_OK;
 
 	case MLX5_CMD_OP_QUERY_HCA_CAP:
@@ -464,9 +466,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT:
 	case MLX5_CMD_OP_QUERY_GENERAL_OBJECT:
 	case MLX5_CMD_OP_CREATE_UCTX:
-	case MLX5_CMD_OP_DESTROY_UCTX:
 	case MLX5_CMD_OP_CREATE_UMEM:
-	case MLX5_CMD_OP_DESTROY_UMEM:
 	case MLX5_CMD_OP_ALLOC_MEMIC:
 	case MLX5_CMD_OP_MODIFY_XRQ:
 	case MLX5_CMD_OP_RELEASE_XRQ_ERROR:

From 2eb0cb31bc4ce2ede5460cf3ef433b40cf5f040d Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Wed, 10 Nov 2021 15:19:12 +0000
Subject: [PATCH 61/96] net/mlx5: E-Switch, rebuild lag only when needed

A user can enable VFs without changing E-Switch mode, this can happen
when a user moves straight to switchdev mode and only once in switchdev
VFs are enabled via the sysfs interface.

The cited commit assumed this isn't possible and exposed a single
API function where the E-switch calls into the lag code, breaks the lag
and prevents any other lag operations to take place until the
E-switch update has ended.

Breaking the hardware lag when it isn't needed can make it such that
hardware lag can't be enabled again.

In the sysfs call path check if the current E-Switch mode is NONE,
in the context of the function it can only mean the E-Switch is moving
out of NONE mode and the hardware lag should be disabled and enabled
once the mode change has ended. If the mode isn't NONE it means
VFs are about to be enabled and such operation doesn't require
toggling the hardware lag.

Fixes: cac1eb2cf2e3 ("net/mlx5: Lag, properly lock eswitch if needed")
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 5872cc8bf9532..51a8cecc4a7ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1305,12 +1305,17 @@ int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int mode, int num_vfs)
  */
 int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
 {
+	bool toggle_lag;
 	int ret;
 
 	if (!mlx5_esw_allowed(esw))
 		return 0;
 
-	mlx5_lag_disable_change(esw->dev);
+	toggle_lag = esw->mode == MLX5_ESWITCH_NONE;
+
+	if (toggle_lag)
+		mlx5_lag_disable_change(esw->dev);
+
 	down_write(&esw->mode_lock);
 	if (esw->mode == MLX5_ESWITCH_NONE) {
 		ret = mlx5_eswitch_enable_locked(esw, MLX5_ESWITCH_LEGACY, num_vfs);
@@ -1324,7 +1329,10 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
 			esw->esw_funcs.num_vfs = num_vfs;
 	}
 	up_write(&esw->mode_lock);
-	mlx5_lag_enable_change(esw->dev);
+
+	if (toggle_lag)
+		mlx5_lag_enable_change(esw->dev);
+
 	return ret;
 }
 

From 38a54cae6f76c3e6a1e6c1e52c2e43a069fa78cb Mon Sep 17 00:00:00 2001
From: Avihai Horon <avihaih@nvidia.com>
Date: Wed, 3 Nov 2021 13:04:23 +0200
Subject: [PATCH 62/96] net/mlx5: Fix flow counters SF bulk query len

When doing a flow counters bulk query, the number of counters to query
must be aligned to 4. Current SF bulk query len is not aligned to 4,
which leads to an error when trying to query more than 4 counters.

Fix it by aligning SF bulk query len to 4.

Fixes: 2fdeb4f4c2ae ("net/mlx5: Reduce flow counters bulk query buffer size for SFs")
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 31c99d53faf79..7e0e04cf26f86 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -40,7 +40,7 @@
 #define MLX5_FC_STATS_PERIOD msecs_to_jiffies(1000)
 /* Max number of counters to query in bulk read is 32K */
 #define MLX5_SW_MAX_COUNTERS_BULK BIT(15)
-#define MLX5_SF_NUM_COUNTERS_BULK 6
+#define MLX5_SF_NUM_COUNTERS_BULK 8
 #define MLX5_FC_POOL_MAX_THRESHOLD BIT(18)
 #define MLX5_FC_POOL_USED_BUFF_RATIO 10
 

From 806401c20a0f9c51b6c8fd7035671e6ca841f6c2 Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@nvidia.com>
Date: Mon, 8 Nov 2021 16:41:05 +0200
Subject: [PATCH 63/96] net/mlx5e: CT, Fix multiple allocations and memleak of
 mod acts

CT clear action offload adds additional mod hdr actions to the
flow's original mod actions in order to clear the registers which
hold ct_state.
When such flow also includes encap action, a neigh update event
can cause the driver to unoffload the flow and then reoffload it.

Each time this happens, the ct clear handling adds that same set
of mod hdr actions to reset ct_state until the max of mod hdr
actions is reached.

Also the driver never releases the allocated mod hdr actions and
causing a memleak.

Fix above two issues by moving CT clear mod acts allocation
into the parsing actions phase and only use it when offloading the rule.
The release of mod acts will be done in the normal flow_put().

 backtrace:
    [<000000007316e2f3>] krealloc+0x83/0xd0
    [<00000000ef157de1>] mlx5e_mod_hdr_alloc+0x147/0x300 [mlx5_core]
    [<00000000970ce4ae>] mlx5e_tc_match_to_reg_set_and_get_id+0xd7/0x240 [mlx5_core]
    [<0000000067c5fa17>] mlx5e_tc_match_to_reg_set+0xa/0x20 [mlx5_core]
    [<00000000d032eb98>] mlx5_tc_ct_entry_set_registers.isra.0+0x36/0xc0 [mlx5_core]
    [<00000000fd23b869>] mlx5_tc_ct_flow_offload+0x272/0x1f10 [mlx5_core]
    [<000000004fc24acc>] mlx5e_tc_offload_fdb_rules.part.0+0x150/0x620 [mlx5_core]
    [<00000000dc741c17>] mlx5e_tc_encap_flows_add+0x489/0x690 [mlx5_core]
    [<00000000e92e49d7>] mlx5e_rep_update_flows+0x6e4/0x9b0 [mlx5_core]
    [<00000000f60f5602>] mlx5e_rep_neigh_update+0x39a/0x5d0 [mlx5_core]

Fixes: 1ef3018f5af3 ("net/mlx5e: CT: Support clear action")
Signed-off-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Paul Blakey <paulb@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../ethernet/mellanox/mlx5/core/en/tc_ct.c    | 26 ++++++++++++-------
 .../ethernet/mellanox/mlx5/core/en/tc_ct.h    |  2 ++
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   |  8 ++++--
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
index c1c6e74c79c4f..2445e2ae3324e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
@@ -1356,9 +1356,13 @@ mlx5_tc_ct_match_add(struct mlx5_tc_ct_priv *priv,
 int
 mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv,
 			struct mlx5_flow_attr *attr,
+			struct mlx5e_tc_mod_hdr_acts *mod_acts,
 			const struct flow_action_entry *act,
 			struct netlink_ext_ack *extack)
 {
+	bool clear_action = act->ct.action & TCA_CT_ACT_CLEAR;
+	int err;
+
 	if (!priv) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "offload of ct action isn't available");
@@ -1369,6 +1373,17 @@ mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv,
 	attr->ct_attr.ct_action = act->ct.action;
 	attr->ct_attr.nf_ft = act->ct.flow_table;
 
+	if (!clear_action)
+		goto out;
+
+	err = mlx5_tc_ct_entry_set_registers(priv, mod_acts, 0, 0, 0, 0);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Failed to set registers for ct clear");
+		return err;
+	}
+	attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+
+out:
 	return 0;
 }
 
@@ -1898,23 +1913,16 @@ __mlx5_tc_ct_flow_offload_clear(struct mlx5_tc_ct_priv *ct_priv,
 
 	memcpy(pre_ct_attr, attr, attr_sz);
 
-	err = mlx5_tc_ct_entry_set_registers(ct_priv, mod_acts, 0, 0, 0, 0);
-	if (err) {
-		ct_dbg("Failed to set register for ct clear");
-		goto err_set_registers;
-	}
-
 	mod_hdr = mlx5_modify_header_alloc(priv->mdev, ct_priv->ns_type,
 					   mod_acts->num_actions,
 					   mod_acts->actions);
 	if (IS_ERR(mod_hdr)) {
 		err = PTR_ERR(mod_hdr);
 		ct_dbg("Failed to add create ct clear mod hdr");
-		goto err_set_registers;
+		goto err_mod_hdr;
 	}
 
 	pre_ct_attr->modify_hdr = mod_hdr;
-	pre_ct_attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
 
 	rule = mlx5_tc_rule_insert(priv, orig_spec, pre_ct_attr);
 	if (IS_ERR(rule)) {
@@ -1930,7 +1938,7 @@ __mlx5_tc_ct_flow_offload_clear(struct mlx5_tc_ct_priv *ct_priv,
 
 err_insert:
 	mlx5_modify_header_dealloc(priv->mdev, mod_hdr);
-err_set_registers:
+err_mod_hdr:
 	netdev_warn(priv->netdev,
 		    "Failed to offload ct clear flow, err %d\n", err);
 	kfree(pre_ct_attr);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h
index 363329f4aac61..99662af1e41a7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h
@@ -110,6 +110,7 @@ int mlx5_tc_ct_add_no_trk_match(struct mlx5_flow_spec *spec);
 int
 mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv,
 			struct mlx5_flow_attr *attr,
+			struct mlx5e_tc_mod_hdr_acts *mod_acts,
 			const struct flow_action_entry *act,
 			struct netlink_ext_ack *extack);
 
@@ -172,6 +173,7 @@ mlx5_tc_ct_add_no_trk_match(struct mlx5_flow_spec *spec)
 static inline int
 mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv,
 			struct mlx5_flow_attr *attr,
+			struct mlx5e_tc_mod_hdr_acts *mod_acts,
 			const struct flow_action_entry *act,
 			struct netlink_ext_ack *extack)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index cb76c41fe163a..3d45f4ae80c0a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -3608,7 +3608,9 @@ parse_tc_nic_actions(struct mlx5e_priv *priv,
 			attr->dest_chain = act->chain_index;
 			break;
 		case FLOW_ACTION_CT:
-			err = mlx5_tc_ct_parse_action(get_ct_priv(priv), attr, act, extack);
+			err = mlx5_tc_ct_parse_action(get_ct_priv(priv), attr,
+						      &parse_attr->mod_hdr_acts,
+						      act, extack);
 			if (err)
 				return err;
 
@@ -4277,7 +4279,9 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv,
 				NL_SET_ERR_MSG_MOD(extack, "Sample action with connection tracking is not supported");
 				return -EOPNOTSUPP;
 			}
-			err = mlx5_tc_ct_parse_action(get_ct_priv(priv), attr, act, extack);
+			err = mlx5_tc_ct_parse_action(get_ct_priv(priv), attr,
+						      &parse_attr->mod_hdr_acts,
+						      act, extack);
 			if (err)
 				return err;
 

From ae396d85c01c7bdc9eeceecde1f493d03f793465 Mon Sep 17 00:00:00 2001
From: Maher Sanalla <msanalla@nvidia.com>
Date: Fri, 5 Nov 2021 11:19:48 +0200
Subject: [PATCH 64/96] net/mlx5: Lag, update tracker when state change event
 received

Currently, In NETDEV_CHANGELOWERSTATE/NETDEV_CHANGEUPPERSTATE events
handling, tracking is not fully completed if the LAG device is not ready
at the time the events occur. But, we must keep track of the upper and
lower states after receiving the events because RoCE needs this info in
mlx5_lag_get_roce_netdev() - in order to return the corresponding port
that its running on. Returning the wrong (not most recent) port will lead
to gids table being incorrect.

For example: If during the attachment of a slave to the bond, the other
non-attached port performs pci_reload, then the LAG device is not ready,
but that should not result in dismissing attached slave tracker update
automatically (which is performed in mlx5_handle_changelowerstate()), Since
these events might not come later, which can lead to both bond ports
having tx_enabled=0 - which is not a valid state of LAG bond.

Fixes: 9b412cc35f00 ("net/mlx5e: Add LAG warning if bond slave is not lag master")
Signed-off-by: Maher Sanalla <msanalla@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 28 +++++++++----------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 48d2ea690d7ad..4ddf6b330a442 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -615,6 +615,7 @@ static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
 	bool is_bonded, is_in_lag, mode_supported;
 	int bond_status = 0;
 	int num_slaves = 0;
+	int changed = 0;
 	int idx;
 
 	if (!netif_is_lag_master(upper))
@@ -653,27 +654,27 @@ static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
 	 */
 	is_in_lag = num_slaves == MLX5_MAX_PORTS && bond_status == 0x3;
 
-	if (!mlx5_lag_is_ready(ldev) && is_in_lag) {
-		NL_SET_ERR_MSG_MOD(info->info.extack,
-				   "Can't activate LAG offload, PF is configured with more than 64 VFs");
-		return 0;
-	}
-
 	/* Lag mode must be activebackup or hash. */
 	mode_supported = tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP ||
 			 tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH;
 
-	if (is_in_lag && !mode_supported)
-		NL_SET_ERR_MSG_MOD(info->info.extack,
-				   "Can't activate LAG offload, TX type isn't supported");
-
 	is_bonded = is_in_lag && mode_supported;
 	if (tracker->is_bonded != is_bonded) {
 		tracker->is_bonded = is_bonded;
-		return 1;
+		changed = 1;
 	}
 
-	return 0;
+	if (!is_in_lag)
+		return changed;
+
+	if (!mlx5_lag_is_ready(ldev))
+		NL_SET_ERR_MSG_MOD(info->info.extack,
+				   "Can't activate LAG offload, PF is configured with more than 64 VFs");
+	else if (!mode_supported)
+		NL_SET_ERR_MSG_MOD(info->info.extack,
+				   "Can't activate LAG offload, TX type isn't supported");
+
+	return changed;
 }
 
 static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev,
@@ -716,9 +717,6 @@ static int mlx5_lag_netdev_event(struct notifier_block *this,
 
 	ldev    = container_of(this, struct mlx5_lag, nb);
 
-	if (!mlx5_lag_is_ready(ldev) && event == NETDEV_CHANGELOWERSTATE)
-		return NOTIFY_DONE;
-
 	tracker = ldev->tracker;
 
 	switch (event) {

From c4c3176739dfa6efcc5b1d1de4b3fd2b51b048c7 Mon Sep 17 00:00:00 2001
From: Raed Salem <raeds@nvidia.com>
Date: Mon, 1 Nov 2021 16:18:53 +0200
Subject: [PATCH 65/96] net/mlx5: E-Switch, return error if encap isn't
 supported

On regular ConnectX HCAs getting encap mode isn't supported when the
E-Switch is in NONE mode. Current code would return no error code when
trying to get encap mode in such case which is wrong.

Fix by returning error value to indicate failure to caller in such case.

Fixes: 8e0aa4bc959c ("net/mlx5: E-switch, Protect eswitch mode changes")
Signed-off-by: Raed Salem <raeds@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 80fa76f60e1e6..a46455694f7ae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3623,7 +3623,7 @@ int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink,
 	*encap = esw->offloads.encap;
 unlock:
 	up_write(&esw->mode_lock);
-	return 0;
+	return err;
 }
 
 static bool

From 2460386bef0b9b98b71728d3c173e15558b78d82 Mon Sep 17 00:00:00 2001
From: Marcin Wojtas <mw@semihalf.com>
Date: Mon, 15 Nov 2021 16:30:24 +0100
Subject: [PATCH 66/96] net: mvmdio: fix compilation warning

The kernel test robot reported a following issue:

>> drivers/net/ethernet/marvell/mvmdio.c:426:36: warning:
unused variable 'orion_mdio_acpi_match' [-Wunused-const-variable]
   static const struct acpi_device_id orion_mdio_acpi_match[] = {
                                      ^
   1 warning generated.

Fix that by surrounding the variable by appropriate ifdef.

Fixes: c54da4c1acb1 ("net: mvmdio: add ACPI support")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Marcin Wojtas <mw@semihalf.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20211115153024.209083-1-mw@semihalf.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/marvell/mvmdio.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c
index 62a97c46fba05..ef878973b8597 100644
--- a/drivers/net/ethernet/marvell/mvmdio.c
+++ b/drivers/net/ethernet/marvell/mvmdio.c
@@ -429,12 +429,14 @@ static const struct of_device_id orion_mdio_match[] = {
 };
 MODULE_DEVICE_TABLE(of, orion_mdio_match);
 
+#ifdef CONFIG_ACPI
 static const struct acpi_device_id orion_mdio_acpi_match[] = {
 	{ "MRVL0100", BUS_TYPE_SMI },
 	{ "MRVL0101", BUS_TYPE_XSMI },
 	{ },
 };
 MODULE_DEVICE_TABLE(acpi, orion_mdio_acpi_match);
+#endif
 
 static struct platform_driver orion_mdio_driver = {
 	.probe = orion_mdio_probe,

From 9f5363916a5099e618e6e40606e91b8ce0833754 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Tue, 16 Nov 2021 14:26:10 -0500
Subject: [PATCH 67/96] bnxt_en: Fix compile error regression when
 CONFIG_BNXT_SRIOV is not set

bp->sriov_cfg is not defined when CONFIG_BNXT_SRIOV is not set.  Fix
it by adding a helper function bnxt_sriov_cfg() to handle the logic
with or without the config option.

Fixes: 46d08f55d24e ("bnxt_en: extend RTNL to VF check in devlink driver_reinit")
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Edwin Peer <edwin.peer@broadcom.com>
Reviewed-by: Andy Gospodarek <gospo@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Link: https://lore.kernel.org/r/1637090770-22835-1-git-send-email-michael.chan@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         | 10 ++++++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index d0d5da9b78f8b..4c9507d82fd0d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -2258,6 +2258,16 @@ static inline void bnxt_db_write(struct bnxt *bp, struct bnxt_db_info *db,
 	}
 }
 
+/* Must hold rtnl_lock */
+static inline bool bnxt_sriov_cfg(struct bnxt *bp)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+	return BNXT_PF(bp) && (bp->pf.active_vfs || bp->sriov_cfg);
+#else
+	return false;
+#endif
+}
+
 extern const u16 bnxt_lhint_arr[];
 
 int bnxt_alloc_rx_data(struct bnxt *bp, struct bnxt_rx_ring_info *rxr,
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index 6fe9e9b59f835..951c4c569a9b3 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -442,7 +442,7 @@ static int bnxt_dl_reload_down(struct devlink *dl, bool netns_change,
 	switch (action) {
 	case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: {
 		rtnl_lock();
-		if (BNXT_PF(bp) && (bp->pf.active_vfs || bp->sriov_cfg)) {
+		if (bnxt_sriov_cfg(bp)) {
 			NL_SET_ERR_MSG_MOD(extack,
 					   "reload is unsupported while VFs are allocated or being configured");
 			rtnl_unlock();

From 0a83f96f8709f65a6498a012ba49f608925dfae6 Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin@gmail.com>
Date: Tue, 16 Nov 2021 17:13:03 +0300
Subject: [PATCH 68/96] MAINTAINERS: remove GR-everest-linux-l2@marvell.com

I've sent a patch to GR-everest-linux-l2@marvell.com few days ago and
got a reply from postmaster@marvell.com:

	Delivery has failed to these recipients or groups:

	gr-everest-linux-l2@marvell.com<mailto:gr-everest-linux-l2@marvell.com>
	The email address you entered couldn't be found. Please check the
	recipient's email address and try to resend the message. If the problem
	continues, please contact your helpdesk.

As requested by Alok Prasad, replacing GR-everest-linux-l2@marvell.com
with Manish Chopra's email address. [0]

Link: https://lore.kernel.org/all/20211116081601.11208-1-palok@marvell.com/ [0]
Signed-off-by: Pavel Skripkin <paskripkin@gmail.com>
Link: https://lore.kernel.org/r/20211116141303.32180-1-paskripkin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 MAINTAINERS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 4c74516e4353f..ae8b503ed64da 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3733,7 +3733,7 @@ F:	drivers/scsi/bnx2i/
 BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER
 M:	Ariel Elior <aelior@marvell.com>
 M:	Sudarsana Kalluru <skalluru@marvell.com>
-M:	GR-everest-linux-l2@marvell.com
+M:	Manish Chopra <manishc@marvell.com>
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/broadcom/bnx2x/
@@ -15592,7 +15592,7 @@ F:	drivers/scsi/qedi/
 
 QLOGIC QL4xxx ETHERNET DRIVER
 M:	Ariel Elior <aelior@marvell.com>
-M:	GR-everest-linux-l2@marvell.com
+M:	Manish Chopra <manishc@marvell.com>
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/qlogic/qed/

From b0024a04e48837b6556a080ff37ecd8351632596 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Tue, 16 Nov 2021 16:09:23 +0000
Subject: [PATCH 69/96] amt: cancel delayed_work synchronously in amt_fini()

When the amt module is being removed, it calls cancel_delayed_work()
to cancel pending delayed_work. But this function doesn't wait for
canceling delayed_work.
So, workers can be still doing after module delete.

In order to avoid this, cancel_delayed_work_sync() should be used instead.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Fixes: bc54e49c140b ("amt: add multicast(IGMP) report message handler")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Link: https://lore.kernel.org/r/20211116160923.25258-1-ap420073@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/amt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/amt.c b/drivers/net/amt.c
index 47a04c330885e..b732ee9a50ef9 100644
--- a/drivers/net/amt.c
+++ b/drivers/net/amt.c
@@ -3286,7 +3286,7 @@ static void __exit amt_fini(void)
 {
 	rtnl_link_unregister(&amt_link_ops);
 	unregister_netdevice_notifier(&amt_notifier_block);
-	cancel_delayed_work(&source_gc_wq);
+	cancel_delayed_work_sync(&source_gc_wq);
 	__amt_source_gc_work();
 	destroy_workqueue(amt_wq);
 }

From f799ada6bf2397c351220088b9b0980125c77280 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 12 Nov 2021 11:33:11 -0500
Subject: [PATCH 70/96] net: sched: act_mirred: drop dst for the direction from
 egress to ingress

Without dropping dst, the packets sent from local mirred/redirected
to ingress will may still use the old dst. ip_rcv() will drop it as
the old dst is for output and its .input is dst_discard.

This patch is to fix by also dropping dst for those packets that are
mirred or redirected from egress to ingress in act_mirred.

Note that we don't drop it for the direction change from ingress to
egress, as on which there might be a user case attaching a metadata
dst by act_tunnel_key that would be used later.

Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Cong Wang <cong.wang@bytedance.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/sched/act_mirred.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index d64b0eeccbe4d..efc963ab995a3 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -19,6 +19,7 @@
 #include <linux/if_arp.h>
 #include <net/net_namespace.h>
 #include <net/netlink.h>
+#include <net/dst.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
 #include <linux/tc_act/tc_mirred.h>
@@ -228,6 +229,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 	bool want_ingress;
 	bool is_redirect;
 	bool expects_nh;
+	bool at_ingress;
 	int m_eaction;
 	int mac_len;
 	bool at_nh;
@@ -263,7 +265,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 	 * ingress - that covers the TC S/W datapath.
 	 */
 	is_redirect = tcf_mirred_is_act_redirect(m_eaction);
-	use_reinsert = skb_at_tc_ingress(skb) && is_redirect &&
+	at_ingress = skb_at_tc_ingress(skb);
+	use_reinsert = at_ingress && is_redirect &&
 		       tcf_mirred_can_reinsert(retval);
 	if (!use_reinsert) {
 		skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -271,10 +274,12 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 			goto out;
 	}
 
+	want_ingress = tcf_mirred_act_wants_ingress(m_eaction);
+
 	/* All mirred/redirected skbs should clear previous ct info */
 	nf_reset_ct(skb2);
-
-	want_ingress = tcf_mirred_act_wants_ingress(m_eaction);
+	if (want_ingress && !at_ingress) /* drop dst for egress -> ingress */
+		skb_dst_drop(skb2);
 
 	expects_nh = want_ingress || !m_mac_header_xmit;
 	at_nh = skb->data == skb_network_header(skb);

From 1d127effdc1750d1f43de42100185430ea0c90bf Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Fri, 12 Nov 2021 11:33:12 -0500
Subject: [PATCH 71/96] selftests: add a test case for mirred egress to ingress

add a selftest that verifies the correct behavior of TC act_mirred egress
to ingress: in particular, it checks if the dst_entry is removed from skb
before redirect egress -> ingress. The correct behavior is: an ICMP 'echo
request' generated by ping will be received and generate a reply the same
way as the one generated by mausezahn.

Suggested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Acked-by: Cong Wang <cong.wang@bytedance.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/forwarding/config |  1 +
 .../selftests/net/forwarding/tc_actions.sh    | 47 ++++++++++++++++++-
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/forwarding/config b/tools/testing/selftests/net/forwarding/config
index a4bd1b0873035..697994a9278bb 100644
--- a/tools/testing/selftests/net/forwarding/config
+++ b/tools/testing/selftests/net/forwarding/config
@@ -6,6 +6,7 @@ CONFIG_IPV6_MULTIPLE_TABLES=y
 CONFIG_NET_VRF=m
 CONFIG_BPF_SYSCALL=y
 CONFIG_CGROUP_BPF=y
+CONFIG_NET_ACT_CT=m
 CONFIG_NET_ACT_MIRRED=m
 CONFIG_NET_ACT_MPLS=m
 CONFIG_NET_ACT_VLAN=m
diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh
index d9eca227136bb..de19eb6c38f04 100755
--- a/tools/testing/selftests/net/forwarding/tc_actions.sh
+++ b/tools/testing/selftests/net/forwarding/tc_actions.sh
@@ -3,7 +3,7 @@
 
 ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \
 	mirred_egress_mirror_test matchall_mirred_egress_mirror_test \
-	gact_trap_test"
+	gact_trap_test mirred_egress_to_ingress_test"
 NUM_NETIFS=4
 source tc_common.sh
 source lib.sh
@@ -13,10 +13,12 @@ tcflags="skip_hw"
 h1_create()
 {
 	simple_if_init $h1 192.0.2.1/24
+	tc qdisc add dev $h1 clsact
 }
 
 h1_destroy()
 {
+	tc qdisc del dev $h1 clsact
 	simple_if_fini $h1 192.0.2.1/24
 }
 
@@ -153,6 +155,49 @@ gact_trap_test()
 	log_test "trap ($tcflags)"
 }
 
+mirred_egress_to_ingress_test()
+{
+	RET=0
+
+	tc filter add dev $h1 protocol ip pref 100 handle 100 egress flower \
+		ip_proto icmp src_ip 192.0.2.1 dst_ip 192.0.2.2 type 8 action \
+			ct commit nat src addr 192.0.2.2 pipe \
+			ct clear pipe \
+			ct commit nat dst addr 192.0.2.1 pipe \
+			mirred ingress redirect dev $h1
+
+	tc filter add dev $swp1 protocol ip pref 11 handle 111 ingress flower \
+		ip_proto icmp src_ip 192.0.2.1 dst_ip 192.0.2.2 type 8 action drop
+	tc filter add dev $swp1 protocol ip pref 12 handle 112 ingress flower \
+		ip_proto icmp src_ip 192.0.2.1 dst_ip 192.0.2.2 type 0 action pass
+
+	$MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+		-t icmp "ping,id=42,seq=10" -q
+
+	tc_check_packets "dev $h1 egress" 100 1
+	check_err $? "didn't mirror first packet"
+
+	tc_check_packets "dev $swp1 ingress" 111 1
+	check_fail $? "didn't redirect first packet"
+	tc_check_packets "dev $swp1 ingress" 112 1
+	check_err $? "didn't receive reply to first packet"
+
+	ping 192.0.2.2 -I$h1 -c1 -w1 -q 1>/dev/null 2>&1
+
+	tc_check_packets "dev $h1 egress" 100 2
+	check_err $? "didn't mirror second packet"
+	tc_check_packets "dev $swp1 ingress" 111 1
+	check_fail $? "didn't redirect second packet"
+	tc_check_packets "dev $swp1 ingress" 112 2
+	check_err $? "didn't receive reply to second packet"
+
+	tc filter del dev $h1 egress protocol ip pref 100 handle 100 flower
+	tc filter del dev $swp1 ingress protocol ip pref 11 handle 111 flower
+	tc filter del dev $swp1 ingress protocol ip pref 12 handle 112 flower
+
+	log_test "mirred_egress_to_ingress ($tcflags)"
+}
+
 setup_prepare()
 {
 	h1=${NETIFS[p1]}

From 3751c3d34cd5a750c86d1c8eaf217d8faf7f9325 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 15 Nov 2021 16:21:23 +0100
Subject: [PATCH 72/96] net: stmmac: Fix signed/unsigned wreckage

The recent addition of timestamp correction to compensate the CDC error
introduced a subtle signed/unsigned bug in stmmac_get_tx_hwtstamp() while
it managed for some obscure reason to avoid that in stmmac_get_rx_hwtstamp().

The issue is:

    s64 adjust = 0;
    u64 ns;

    adjust += -(2 * (NSEC_PER_SEC / priv->plat->clk_ptp_rate));
    ns += adjust;

works by chance on 64bit, but falls apart on 32bit because the compiler
knows that adjust fits into 32bit and then treats the addition as a u64 +
u32 resulting in an off by ~2 seconds failure.

The RX variant uses an u64 for adjust and does the adjustment via

    ns -= adjust;

because consistency is obviously overrated.

Get rid of the pointless zero initialized adjust variable and do:

	ns -= (2 * NSEC_PER_SEC) / priv->plat->clk_ptp_rate;

which is obviously correct and spares the adjust obfuscation. Aside of that
it yields a more accurate result because the multiplication takes place
before the integer divide truncation and not afterwards.

Stick the calculation into an inline so it can't be accidentally
disimproved. Return an u32 from that inline as the result is guaranteed
to fit which lets the compiler optimize the substraction.

Cc: stable@vger.kernel.org
Fixes: 3600be5f58c1 ("net: stmmac: add timestamp correction to rid CDC sync error")
Reported-by: Benedikt Spranger <b.spranger@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Benedikt Spranger <b.spranger@linutronix.de>
Tested-by: Kurt Kanzenbach <kurt@linutronix.de> # Intel EHL
Link: https://lore.kernel.org/r/87mtm578cs.ffs@tglx
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/stmicro/stmmac/stmmac_main.c | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index d3f350c25b9b6..2eb2845763367 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -511,6 +511,14 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 	return true;
 }
 
+static inline u32 stmmac_cdc_adjust(struct stmmac_priv *priv)
+{
+	/* Correct the clk domain crossing(CDC) error */
+	if (priv->plat->has_gmac4 && priv->plat->clk_ptp_rate)
+		return (2 * NSEC_PER_SEC) / priv->plat->clk_ptp_rate;
+	return 0;
+}
+
 /* stmmac_get_tx_hwtstamp - get HW TX timestamps
  * @priv: driver private structure
  * @p : descriptor pointer
@@ -524,7 +532,6 @@ static void stmmac_get_tx_hwtstamp(struct stmmac_priv *priv,
 {
 	struct skb_shared_hwtstamps shhwtstamp;
 	bool found = false;
-	s64 adjust = 0;
 	u64 ns = 0;
 
 	if (!priv->hwts_tx_en)
@@ -543,12 +550,7 @@ static void stmmac_get_tx_hwtstamp(struct stmmac_priv *priv,
 	}
 
 	if (found) {
-		/* Correct the clk domain crossing(CDC) error */
-		if (priv->plat->has_gmac4 && priv->plat->clk_ptp_rate) {
-			adjust += -(2 * (NSEC_PER_SEC /
-					 priv->plat->clk_ptp_rate));
-			ns += adjust;
-		}
+		ns -= stmmac_cdc_adjust(priv);
 
 		memset(&shhwtstamp, 0, sizeof(struct skb_shared_hwtstamps));
 		shhwtstamp.hwtstamp = ns_to_ktime(ns);
@@ -573,7 +575,6 @@ static void stmmac_get_rx_hwtstamp(struct stmmac_priv *priv, struct dma_desc *p,
 {
 	struct skb_shared_hwtstamps *shhwtstamp = NULL;
 	struct dma_desc *desc = p;
-	u64 adjust = 0;
 	u64 ns = 0;
 
 	if (!priv->hwts_rx_en)
@@ -586,11 +587,7 @@ static void stmmac_get_rx_hwtstamp(struct stmmac_priv *priv, struct dma_desc *p,
 	if (stmmac_get_rx_timestamp_status(priv, p, np, priv->adv_ts)) {
 		stmmac_get_timestamp(priv, desc, priv->adv_ts, &ns);
 
-		/* Correct the clk domain crossing(CDC) error */
-		if (priv->plat->has_gmac4 && priv->plat->clk_ptp_rate) {
-			adjust += 2 * (NSEC_PER_SEC / priv->plat->clk_ptp_rate);
-			ns -= adjust;
-		}
+		ns -= stmmac_cdc_adjust(priv);
 
 		netdev_dbg(priv->dev, "get valid RX hw timestamp %llu\n", ns);
 		shhwtstamp = skb_hwtstamps(skb);

From f77b83b5bbab53d2be339184838b19ed2c62c0a5 Mon Sep 17 00:00:00 2001
From: Aaron Ma <aaron.ma@canonical.com>
Date: Tue, 16 Nov 2021 22:19:17 +0800
Subject: [PATCH 73/96] net: usb: r8152: Add MAC passthrough support for more
 Lenovo Docks

Like ThinkaPad Thunderbolt 4 Dock, more Lenovo docks start to use the original
Realtek USB ethernet chip ID 0bda:8153.

Lenovo Docks always use their own IDs for usb hub, even for older Docks.
If parent hub is from Lenovo, then r8152 should try MAC passthrough.
Verified on Lenovo TBT3 dock too.

Signed-off-by: Aaron Ma <aaron.ma@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/r8152.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index 4a02f33f0643d..f9877a3e83acf 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -9603,12 +9603,9 @@ static int rtl8152_probe(struct usb_interface *intf,
 		netdev->hw_features &= ~NETIF_F_RXCSUM;
 	}
 
-	if (le16_to_cpu(udev->descriptor.idVendor) == VENDOR_ID_LENOVO) {
-		switch (le16_to_cpu(udev->descriptor.idProduct)) {
-		case DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2:
-		case DEVICE_ID_THINKPAD_USB_C_DOCK_GEN2:
-			tp->lenovo_macpassthru = 1;
-		}
+	if (udev->parent &&
+			le16_to_cpu(udev->parent->descriptor.idVendor) == VENDOR_ID_LENOVO) {
+		tp->lenovo_macpassthru = 1;
 	}
 
 	if (le16_to_cpu(udev->descriptor.bcdDevice) == 0x3011 && udev->serial &&

From 9b5a333272a48c2f8b30add7a874e46e8b26129c Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin@gmail.com>
Date: Tue, 16 Nov 2021 18:17:12 +0300
Subject: [PATCH 74/96] net: dpaa2-eth: fix use-after-free in dpaa2_eth_remove

Access to netdev after free_netdev() will cause use-after-free bug.
Move debug log before free_netdev() call to avoid it.

Fixes: 7472dd9f6499 ("staging: fsl-dpaa2/eth: Move print message")
Signed-off-by: Pavel Skripkin <paskripkin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 714e961e7a77a..6451c8383639f 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -4550,10 +4550,10 @@ static int dpaa2_eth_remove(struct fsl_mc_device *ls_dev)
 
 	fsl_mc_portal_free(priv->mc_io);
 
-	free_netdev(net_dev);
-
 	dev_dbg(net_dev->dev.parent, "Removed interface %s\n", net_dev->name);
 
+	free_netdev(net_dev);
+
 	return 0;
 }
 

From cf9acc90c80ecbee00334aa85d92f4e74014bcff Mon Sep 17 00:00:00 2001
From: Jonathan Davies <jonathan.davies@nutanix.com>
Date: Tue, 16 Nov 2021 17:42:42 +0000
Subject: [PATCH 75/96] net: virtio_net_hdr_to_skb: count transport header in
 UFO

virtio_net_hdr_to_skb does not set the skb's gso_size and gso_type
correctly for UFO packets received via virtio-net that are a little over
the GSO size. This can lead to problems elsewhere in the networking
stack, e.g. ovs_vport_send dropping over-sized packets if gso_size is
not set.

This is due to the comparison

  if (skb->len - p_off > gso_size)

not properly accounting for the transport layer header.

p_off includes the size of the transport layer header (thlen), so
skb->len - p_off is the size of the TCP/UDP payload.

gso_size is read from the virtio-net header. For UFO, fragmentation
happens at the IP level so does not need to include the UDP header.

Hence the calculation could be comparing a TCP/UDP payload length with
an IP payload length, causing legitimate virtio-net packets to have
lack gso_type/gso_size information.

Example: a UDP packet with payload size 1473 has IP payload size 1481.
If the guest used UFO, it is not fragmented and the virtio-net header's
flags indicate that it is a GSO frame (VIRTIO_NET_HDR_GSO_UDP), with
gso_size = 1480 for an MTU of 1500.  skb->len will be 1515 and p_off
will be 42, so skb->len - p_off = 1473.  Hence the comparison fails, and
shinfo->gso_size and gso_type are not set as they should be.

Instead, add the UDP header length before comparing to gso_size when
using UFO. In this way, it is the size of the IP payload that is
compared to gso_size.

Fixes: 6dd912f82680 ("net: check untrusted gso_size at kernel entry")
Signed-off-by: Jonathan Davies <jonathan.davies@nutanix.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index b465f8f3e554f..04e87f4b9417c 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -120,10 +120,15 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 
 	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
 		u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size);
+		unsigned int nh_off = p_off;
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
 
+		/* UFO may not include transport header in gso_size. */
+		if (gso_type & SKB_GSO_UDP)
+			nh_off -= thlen;
+
 		/* Too small packets are not really GSO ones. */
-		if (skb->len - p_off > gso_size) {
+		if (skb->len - nh_off > gso_size) {
 			shinfo->gso_size = gso_size;
 			shinfo->gso_type = gso_type;
 

From c366ce28750e9633f8d4b07829a9cde0e59034eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Stelmach?= <l.stelmach@samsung.com>
Date: Tue, 16 Nov 2021 22:29:15 +0100
Subject: [PATCH 76/96] net: ax88796c: use bit numbers insetad of bit masks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Change the values of EVENT_* constants from bit masks to bit numbers as
accepted by {clear,set,test}_bit() functions.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/asix/ax88796c_main.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/asix/ax88796c_main.h b/drivers/net/ethernet/asix/ax88796c_main.h
index 80263c3cef757..4a83c991dcbed 100644
--- a/drivers/net/ethernet/asix/ax88796c_main.h
+++ b/drivers/net/ethernet/asix/ax88796c_main.h
@@ -127,9 +127,9 @@ struct ax88796c_device {
 		#define AX_PRIV_FLAGS_MASK	(AX_CAP_COMP)
 
 	unsigned long		flags;
-		#define EVENT_INTR		BIT(0)
-		#define EVENT_TX		BIT(1)
-		#define EVENT_SET_MULTI		BIT(2)
+		#define EVENT_INTR		0
+		#define EVENT_TX		1
+		#define EVENT_SET_MULTI		2
 
 };
 

From 6afbd7b3c53cb7417189f476e99d431daccb85b0 Mon Sep 17 00:00:00 2001
From: Eryk Rybak <eryk.roch.rybak@intel.com>
Date: Thu, 21 Jan 2021 16:17:22 +0000
Subject: [PATCH 77/96] i40e: Fix correct max_pkt_size on VF RX queue

Setting VLAN port increasing RX queue max_pkt_size
by 4 bytes to take VLAN tag into account.
Trigger the VF reset when setting port VLAN for
VF to renegotiate its capabilities and reinitialize.

Fixes: ba4e003d29c1 ("i40e: don't hold spinlock while resetting VF")
Signed-off-by: Sylwester Dziedziuch <sylwesterx.dziedziuch@intel.com>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Eryk Rybak <eryk.roch.rybak@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 .../ethernet/intel/i40e/i40e_virtchnl_pf.c    | 53 ++++---------------
 1 file changed, 9 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 472f56b360b8c..815661632e7a7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -674,14 +674,13 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id,
 				    u16 vsi_queue_id,
 				    struct virtchnl_rxq_info *info)
 {
+	u16 pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id, vsi_queue_id);
 	struct i40e_pf *pf = vf->pf;
+	struct i40e_vsi *vsi = pf->vsi[vf->lan_vsi_idx];
 	struct i40e_hw *hw = &pf->hw;
 	struct i40e_hmc_obj_rxq rx_ctx;
-	u16 pf_queue_id;
 	int ret = 0;
 
-	pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id, vsi_queue_id);
-
 	/* clear the context structure first */
 	memset(&rx_ctx, 0, sizeof(struct i40e_hmc_obj_rxq));
 
@@ -719,6 +718,10 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id,
 	}
 	rx_ctx.rxmax = info->max_pkt_size;
 
+	/* if port VLAN is configured increase the max packet size */
+	if (vsi->info.pvid)
+		rx_ctx.rxmax += VLAN_HLEN;
+
 	/* enable 32bytes desc always */
 	rx_ctx.dsize = 1;
 
@@ -4169,34 +4172,6 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac)
 	return ret;
 }
 
-/**
- * i40e_vsi_has_vlans - True if VSI has configured VLANs
- * @vsi: pointer to the vsi
- *
- * Check if a VSI has configured any VLANs. False if we have a port VLAN or if
- * we have no configured VLANs. Do not call while holding the
- * mac_filter_hash_lock.
- */
-static bool i40e_vsi_has_vlans(struct i40e_vsi *vsi)
-{
-	bool have_vlans;
-
-	/* If we have a port VLAN, then the VSI cannot have any VLANs
-	 * configured, as all MAC/VLAN filters will be assigned to the PVID.
-	 */
-	if (vsi->info.pvid)
-		return false;
-
-	/* Since we don't have a PVID, we know that if the device is in VLAN
-	 * mode it must be because of a VLAN filter configured on this VSI.
-	 */
-	spin_lock_bh(&vsi->mac_filter_hash_lock);
-	have_vlans = i40e_is_vsi_in_vlan(vsi);
-	spin_unlock_bh(&vsi->mac_filter_hash_lock);
-
-	return have_vlans;
-}
-
 /**
  * i40e_ndo_set_vf_port_vlan
  * @netdev: network interface device structure
@@ -4253,19 +4228,9 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id,
 		/* duplicate request, so just return success */
 		goto error_pvid;
 
-	if (i40e_vsi_has_vlans(vsi)) {
-		dev_err(&pf->pdev->dev,
-			"VF %d has already configured VLAN filters and the administrator is requesting a port VLAN override.\nPlease unload and reload the VF driver for this change to take effect.\n",
-			vf_id);
-		/* Administrator Error - knock the VF offline until he does
-		 * the right thing by reconfiguring his network correctly
-		 * and then reloading the VF driver.
-		 */
-		i40e_vc_disable_vf(vf);
-		/* During reset the VF got a new VSI, so refresh the pointer. */
-		vsi = pf->vsi[vf->lan_vsi_idx];
-	}
-
+	i40e_vc_disable_vf(vf);
+	/* During reset the VF got a new VSI, so refresh a pointer. */
+	vsi = pf->vsi[vf->lan_vsi_idx];
 	/* Locked once because multiple functions below iterate list */
 	spin_lock_bh(&vsi->mac_filter_hash_lock);
 

From 37d9e304acd903a445df8208b8a13d707902dea6 Mon Sep 17 00:00:00 2001
From: Michal Maloszewski <michal.maloszewski@intel.com>
Date: Wed, 24 Feb 2021 12:07:48 +0000
Subject: [PATCH 78/96] i40e: Fix NULL ptr dereference on VSI filter sync

Remove the reason of null pointer dereference in sync VSI filters.
Added new I40E_VSI_RELEASING flag to signalize deleting and releasing
of VSI resources to sync this thread with sync filters subtask.
Without this patch it is possible to start update the VSI filter list
after VSI is removed, that's causing a kernel oops.

Fixes: 41c445ff0f48 ("i40e: main driver core")
Signed-off-by: Grzegorz Szczurek <grzegorzx.szczurek@intel.com>
Signed-off-by: Michal Maloszewski <michal.maloszewski@intel.com>
Reviewed-by: Przemyslaw Patynowski <przemyslawx.patynowski@intel.com>
Reviewed-by: Witold Fijalkowski <witoldx.fijalkowski@intel.com>
Reviewed-by: Jaroslaw Gawin <jaroslawx.gawin@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h      | 1 +
 drivers/net/ethernet/intel/i40e/i40e_main.c | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 3d528fba754b4..35a83a161b6f7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -161,6 +161,7 @@ enum i40e_vsi_state_t {
 	__I40E_VSI_OVERFLOW_PROMISC,
 	__I40E_VSI_REINIT_REQUESTED,
 	__I40E_VSI_DOWN_REQUESTED,
+	__I40E_VSI_RELEASING,
 	/* This must be last as it determines the size of the BITMAP */
 	__I40E_VSI_STATE_SIZE__,
 };
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index ba862131b9bdf..6e309d6ce37dc 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -2623,7 +2623,8 @@ static void i40e_sync_filters_subtask(struct i40e_pf *pf)
 
 	for (v = 0; v < pf->num_alloc_vsi; v++) {
 		if (pf->vsi[v] &&
-		    (pf->vsi[v]->flags & I40E_VSI_FLAG_FILTER_CHANGED)) {
+		    (pf->vsi[v]->flags & I40E_VSI_FLAG_FILTER_CHANGED) &&
+		    !test_bit(__I40E_VSI_RELEASING, pf->vsi[v]->state)) {
 			int ret = i40e_sync_vsi_filters(pf->vsi[v]);
 
 			if (ret) {
@@ -13771,7 +13772,7 @@ int i40e_vsi_release(struct i40e_vsi *vsi)
 		dev_info(&pf->pdev->dev, "Can't remove PF VSI\n");
 		return -ENODEV;
 	}
-
+	set_bit(__I40E_VSI_RELEASING, vsi->state);
 	uplink_seid = vsi->uplink_seid;
 	if (vsi->type != I40E_VSI_SRIOV) {
 		if (vsi->netdev_registered) {

From d2a69fefd75683004ffe87166de5635b3267ee07 Mon Sep 17 00:00:00 2001
From: Eryk Rybak <eryk.roch.rybak@intel.com>
Date: Fri, 23 Apr 2021 13:43:25 +0200
Subject: [PATCH 79/96] i40e: Fix changing previously set num_queue_pairs for
 PFs

Currently, the i40e_vsi_setup_queue_map is basing the count of queues in
TCs on a VSI's alloc_queue_pairs member which is not changed throughout
any user's action (for example via ethtool's set_channels callback).

This implies that vsi->tc_config.tc_info[n].qcount value that is given
to the kernel via netdev_set_tc_queue() that notifies about the count of
queues per particular traffic class is constant even if user has changed
the total count of queues.

This in turn caused the kernel warning after setting the queue count to
the lower value than the initial one:

$ ethtool -l ens801f0
Channel parameters for ens801f0:
Pre-set maximums:
RX:             0
TX:             0
Other:          1
Combined:       64
Current hardware settings:
RX:             0
TX:             0
Other:          1
Combined:       64

$ ethtool -L ens801f0 combined 40

[dmesg]
Number of in use tx queues changed invalidating tc mappings. Priority
traffic classification disabled!

Reason was that vsi->alloc_queue_pairs stayed at 64 value which was used
to set the qcount on TC0 (by default only TC0 exists so all of the
existing queues are assigned to TC0). we update the offset/qcount via
netdev_set_tc_queue() back to the old value but then the
netif_set_real_num_tx_queues() is using the vsi->num_queue_pairs as a
value which got set to 40.

Fix it by using vsi->req_queue_pairs as a queue count that will be
distributed across TCs. Do it only for non-zero values, which implies
that user actually requested the new count of queues.

For VSIs other than main, stay with the vsi->alloc_queue_pairs as we
only allow manipulating the queue count on main VSI.

Fixes: bc6d33c8d93f ("i40e: Fix the number of queues available to be mapped for use")
Co-developed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Co-developed-by: Przemyslaw Patynowski <przemyslawx.patynowski@intel.com>
Signed-off-by: Przemyslaw Patynowski <przemyslawx.patynowski@intel.com>
Signed-off-by: Eryk Rybak <eryk.roch.rybak@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 35 ++++++++++++++-------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 6e309d6ce37dc..8437cc14bfc69 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1790,6 +1790,7 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 				     bool is_add)
 {
 	struct i40e_pf *pf = vsi->back;
+	u16 num_tc_qps = 0;
 	u16 sections = 0;
 	u8 netdev_tc = 0;
 	u16 numtc = 1;
@@ -1797,13 +1798,29 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 	u8 offset;
 	u16 qmap;
 	int i;
-	u16 num_tc_qps = 0;
 
 	sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID;
 	offset = 0;
 
+	if (vsi->type == I40E_VSI_MAIN) {
+		/* This code helps add more queue to the VSI if we have
+		 * more cores than RSS can support, the higher cores will
+		 * be served by ATR or other filters. Furthermore, the
+		 * non-zero req_queue_pairs says that user requested a new
+		 * queue count via ethtool's set_channels, so use this
+		 * value for queues distribution across traffic classes
+		 */
+		if (vsi->req_queue_pairs > 0)
+			vsi->num_queue_pairs = vsi->req_queue_pairs;
+		else if (pf->flags & I40E_FLAG_MSIX_ENABLED)
+			vsi->num_queue_pairs = pf->num_lan_msix;
+	}
+
 	/* Number of queues per enabled TC */
-	num_tc_qps = vsi->alloc_queue_pairs;
+	if (vsi->type == I40E_VSI_MAIN)
+		num_tc_qps = vsi->num_queue_pairs;
+	else
+		num_tc_qps = vsi->alloc_queue_pairs;
 	if (enabled_tc && (vsi->back->flags & I40E_FLAG_DCB_ENABLED)) {
 		/* Find numtc from enabled TC bitmap */
 		for (i = 0, numtc = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
@@ -1881,16 +1898,10 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 		}
 		ctxt->info.tc_mapping[i] = cpu_to_le16(qmap);
 	}
-
-	/* Set actual Tx/Rx queue pairs */
-	vsi->num_queue_pairs = offset;
-	if ((vsi->type == I40E_VSI_MAIN) && (numtc == 1)) {
-		if (vsi->req_queue_pairs > 0)
-			vsi->num_queue_pairs = vsi->req_queue_pairs;
-		else if (pf->flags & I40E_FLAG_MSIX_ENABLED)
-			vsi->num_queue_pairs = pf->num_lan_msix;
-	}
-
+	/* Do not change previously set num_queue_pairs for PFs */
+	if ((vsi->type == I40E_VSI_MAIN && numtc != 1) ||
+	    vsi->type != I40E_VSI_MAIN)
+		vsi->num_queue_pairs = offset;
 	/* Scheduler section valid can only be set for ADD VSI */
 	if (is_add) {
 		sections |= I40E_AQ_VSI_PROP_SCHED_VALID;

From 9e0a603cb7dce2a19d98116d42de84b6db26d716 Mon Sep 17 00:00:00 2001
From: Eryk Rybak <eryk.roch.rybak@intel.com>
Date: Fri, 23 Apr 2021 13:43:26 +0200
Subject: [PATCH 80/96] i40e: Fix ping is lost after configuring ADq on VF

Properly reconfigure VF VSIs after VF request ADQ.
Created new function to update queue mapping and queue pairs per TC
with AQ update VSI. This sets proper RSS size on NIC.
VFs num_queue_pairs should not be changed during setup of queue maps.
Previously, VF main VSI in ADQ had configured too many queues and had
wrong RSS size, which lead to packets not being consumed and drops in
connectivity.

Fixes: bc6d33c8d93f ("i40e: Fix the number of queues available to be mapped for use")
Co-developed-by: Przemyslaw Patynowski <przemyslawx.patynowski@intel.com>
Signed-off-by: Przemyslaw Patynowski <przemyslawx.patynowski@intel.com>
Signed-off-by: Eryk Rybak <eryk.roch.rybak@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h        |  1 +
 drivers/net/ethernet/intel/i40e/i40e_main.c   | 64 ++++++++++++++++++-
 .../ethernet/intel/i40e/i40e_virtchnl_pf.c    | 17 +++--
 3 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 35a83a161b6f7..4d939af0a626c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -1248,6 +1248,7 @@ void i40e_ptp_restore_hw_time(struct i40e_pf *pf);
 void i40e_ptp_init(struct i40e_pf *pf);
 void i40e_ptp_stop(struct i40e_pf *pf);
 int i40e_ptp_alloc_pins(struct i40e_pf *pf);
+int i40e_update_adq_vsi_queues(struct i40e_vsi *vsi, int vsi_offset);
 int i40e_is_vsi_uplink_mode_veb(struct i40e_vsi *vsi);
 i40e_status i40e_get_partition_bw_setting(struct i40e_pf *pf);
 i40e_status i40e_set_partition_bw_setting(struct i40e_pf *pf);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 8437cc14bfc69..37386a270db54 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1801,6 +1801,8 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 
 	sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID;
 	offset = 0;
+	/* zero out queue mapping, it will get updated on the end of the function */
+	memset(ctxt->info.queue_mapping, 0, sizeof(ctxt->info.queue_mapping));
 
 	if (vsi->type == I40E_VSI_MAIN) {
 		/* This code helps add more queue to the VSI if we have
@@ -1817,10 +1819,12 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 	}
 
 	/* Number of queues per enabled TC */
-	if (vsi->type == I40E_VSI_MAIN)
+	if (vsi->type == I40E_VSI_MAIN ||
+	    (vsi->type == I40E_VSI_SRIOV && vsi->num_queue_pairs != 0))
 		num_tc_qps = vsi->num_queue_pairs;
 	else
 		num_tc_qps = vsi->alloc_queue_pairs;
+
 	if (enabled_tc && (vsi->back->flags & I40E_FLAG_DCB_ENABLED)) {
 		/* Find numtc from enabled TC bitmap */
 		for (i = 0, numtc = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
@@ -1898,10 +1902,12 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 		}
 		ctxt->info.tc_mapping[i] = cpu_to_le16(qmap);
 	}
-	/* Do not change previously set num_queue_pairs for PFs */
+	/* Do not change previously set num_queue_pairs for PFs and VFs*/
 	if ((vsi->type == I40E_VSI_MAIN && numtc != 1) ||
-	    vsi->type != I40E_VSI_MAIN)
+	    (vsi->type == I40E_VSI_SRIOV && vsi->num_queue_pairs == 0) ||
+	    (vsi->type != I40E_VSI_MAIN && vsi->type != I40E_VSI_SRIOV))
 		vsi->num_queue_pairs = offset;
+
 	/* Scheduler section valid can only be set for ADD VSI */
 	if (is_add) {
 		sections |= I40E_AQ_VSI_PROP_SCHED_VALID;
@@ -5438,6 +5444,58 @@ static void i40e_vsi_update_queue_map(struct i40e_vsi *vsi,
 	       sizeof(vsi->info.tc_mapping));
 }
 
+/**
+ * i40e_update_adq_vsi_queues - update queue mapping for ADq VSI
+ * @vsi: the VSI being reconfigured
+ * @vsi_offset: offset from main VF VSI
+ */
+int i40e_update_adq_vsi_queues(struct i40e_vsi *vsi, int vsi_offset)
+{
+	struct i40e_vsi_context ctxt = {};
+	struct i40e_pf *pf;
+	struct i40e_hw *hw;
+	int ret;
+
+	if (!vsi)
+		return I40E_ERR_PARAM;
+	pf = vsi->back;
+	hw = &pf->hw;
+
+	ctxt.seid = vsi->seid;
+	ctxt.pf_num = hw->pf_id;
+	ctxt.vf_num = vsi->vf_id + hw->func_caps.vf_base_id + vsi_offset;
+	ctxt.uplink_seid = vsi->uplink_seid;
+	ctxt.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL;
+	ctxt.flags = I40E_AQ_VSI_TYPE_VF;
+	ctxt.info = vsi->info;
+
+	i40e_vsi_setup_queue_map(vsi, &ctxt, vsi->tc_config.enabled_tc,
+				 false);
+	if (vsi->reconfig_rss) {
+		vsi->rss_size = min_t(int, pf->alloc_rss_size,
+				      vsi->num_queue_pairs);
+		ret = i40e_vsi_config_rss(vsi);
+		if (ret) {
+			dev_info(&pf->pdev->dev, "Failed to reconfig rss for num_queues\n");
+			return ret;
+		}
+		vsi->reconfig_rss = false;
+	}
+
+	ret = i40e_aq_update_vsi_params(hw, &ctxt, NULL);
+	if (ret) {
+		dev_info(&pf->pdev->dev, "Update vsi config failed, err %s aq_err %s\n",
+			 i40e_stat_str(hw, ret),
+			 i40e_aq_str(hw, hw->aq.asq_last_status));
+		return ret;
+	}
+	/* update the local VSI info with updated queue map */
+	i40e_vsi_update_queue_map(vsi, &ctxt);
+	vsi->info.valid_sections = 0;
+
+	return ret;
+}
+
 /**
  * i40e_vsi_config_tc - Configure VSI Tx Scheduler for given TC map
  * @vsi: VSI to be configured
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 815661632e7a7..2102db11972a7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -2220,11 +2220,12 @@ static int i40e_vc_config_queues_msg(struct i40e_vf *vf, u8 *msg)
 	struct virtchnl_vsi_queue_config_info *qci =
 	    (struct virtchnl_vsi_queue_config_info *)msg;
 	struct virtchnl_queue_pair_info *qpi;
-	struct i40e_pf *pf = vf->pf;
 	u16 vsi_id, vsi_queue_id = 0;
-	u16 num_qps_all = 0;
+	struct i40e_pf *pf = vf->pf;
 	i40e_status aq_ret = 0;
 	int i, j = 0, idx = 0;
+	struct i40e_vsi *vsi;
+	u16 num_qps_all = 0;
 
 	if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) {
 		aq_ret = I40E_ERR_PARAM;
@@ -2313,9 +2314,15 @@ static int i40e_vc_config_queues_msg(struct i40e_vf *vf, u8 *msg)
 		pf->vsi[vf->lan_vsi_idx]->num_queue_pairs =
 			qci->num_queue_pairs;
 	} else {
-		for (i = 0; i < vf->num_tc; i++)
-			pf->vsi[vf->ch[i].vsi_idx]->num_queue_pairs =
-			       vf->ch[i].num_qps;
+		for (i = 0; i < vf->num_tc; i++) {
+			vsi = pf->vsi[vf->ch[i].vsi_idx];
+			vsi->num_queue_pairs = vf->ch[i].num_qps;
+
+			if (i40e_update_adq_vsi_queues(vsi, i)) {
+				aq_ret = I40E_ERR_CONFIG;
+				goto error_param;
+			}
+		}
 	}
 
 error_param:

From 3a3b311e3881172fc8e019b6508f04bc40c92d9d Mon Sep 17 00:00:00 2001
From: Karen Sornek <karen.sornek@intel.com>
Date: Wed, 28 Apr 2021 10:19:41 +0200
Subject: [PATCH 81/96] i40e: Fix warning message and call stack during rmmod
 i40e driver

Restore part of reset functionality used when reset is called
from the VF to reset itself. Without this fix warning message
is displayed when VF is being removed via sysfs.

Fix the crash of the VF during reset by ensuring
that the PF receives the reset message successfully.
Refactor code to use one function instead of two.

Fixes: 5c3c48ac6bf5 ("i40e: implement virtual device interface")
Signed-off-by: Grzegorz Szczurek <grzegorzx.szczurek@intel.com>
Signed-off-by: Karen Sornek <karen.sornek@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 .../ethernet/intel/i40e/i40e_virtchnl_pf.c    | 53 ++++++++-----------
 1 file changed, 21 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 2102db11972a7..80ae264c99ba0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -183,17 +183,18 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf)
 /***********************misc routines*****************************/
 
 /**
- * i40e_vc_disable_vf
+ * i40e_vc_reset_vf
  * @vf: pointer to the VF info
- *
- * Disable the VF through a SW reset.
+ * @notify_vf: notify vf about reset or not
+ * Reset VF handler.
  **/
-static inline void i40e_vc_disable_vf(struct i40e_vf *vf)
+static void i40e_vc_reset_vf(struct i40e_vf *vf, bool notify_vf)
 {
 	struct i40e_pf *pf = vf->pf;
 	int i;
 
-	i40e_vc_notify_vf_reset(vf);
+	if (notify_vf)
+		i40e_vc_notify_vf_reset(vf);
 
 	/* We want to ensure that an actual reset occurs initiated after this
 	 * function was called. However, we do not want to wait forever, so
@@ -211,9 +212,14 @@ static inline void i40e_vc_disable_vf(struct i40e_vf *vf)
 		usleep_range(10000, 20000);
 	}
 
-	dev_warn(&vf->pf->pdev->dev,
-		 "Failed to initiate reset for VF %d after 200 milliseconds\n",
-		 vf->vf_id);
+	if (notify_vf)
+		dev_warn(&vf->pf->pdev->dev,
+			 "Failed to initiate reset for VF %d after 200 milliseconds\n",
+			 vf->vf_id);
+	else
+		dev_dbg(&vf->pf->pdev->dev,
+			"Failed to initiate reset for VF %d after 200 milliseconds\n",
+			vf->vf_id);
 }
 
 /**
@@ -2108,20 +2114,6 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg)
 	return ret;
 }
 
-/**
- * i40e_vc_reset_vf_msg
- * @vf: pointer to the VF info
- *
- * called from the VF to reset itself,
- * unlike other virtchnl messages, PF driver
- * doesn't send the response back to the VF
- **/
-static void i40e_vc_reset_vf_msg(struct i40e_vf *vf)
-{
-	if (test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states))
-		i40e_reset_vf(vf, false);
-}
-
 /**
  * i40e_vc_config_promiscuous_mode_msg
  * @vf: pointer to the VF info
@@ -2617,8 +2609,7 @@ static int i40e_vc_request_queues_msg(struct i40e_vf *vf, u8 *msg)
 	} else {
 		/* successful request */
 		vf->num_req_queues = req_pairs;
-		i40e_vc_notify_vf_reset(vf);
-		i40e_reset_vf(vf, false);
+		i40e_vc_reset_vf(vf, true);
 		return 0;
 	}
 
@@ -3813,8 +3804,7 @@ static int i40e_vc_add_qch_msg(struct i40e_vf *vf, u8 *msg)
 	vf->num_req_queues = 0;
 
 	/* reset the VF in order to allocate resources */
-	i40e_vc_notify_vf_reset(vf);
-	i40e_reset_vf(vf, false);
+	i40e_vc_reset_vf(vf, true);
 
 	return I40E_SUCCESS;
 
@@ -3854,8 +3844,7 @@ static int i40e_vc_del_qch_msg(struct i40e_vf *vf, u8 *msg)
 	}
 
 	/* reset the VF in order to allocate resources */
-	i40e_vc_notify_vf_reset(vf);
-	i40e_reset_vf(vf, false);
+	i40e_vc_reset_vf(vf, true);
 
 	return I40E_SUCCESS;
 
@@ -3917,7 +3906,7 @@ int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode,
 		i40e_vc_notify_vf_link_state(vf);
 		break;
 	case VIRTCHNL_OP_RESET_VF:
-		i40e_vc_reset_vf_msg(vf);
+		i40e_vc_reset_vf(vf, false);
 		ret = 0;
 		break;
 	case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE:
@@ -4171,7 +4160,7 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac)
 	/* Force the VF interface down so it has to bring up with new MAC
 	 * address
 	 */
-	i40e_vc_disable_vf(vf);
+	i40e_vc_reset_vf(vf, true);
 	dev_info(&pf->pdev->dev, "Bring down and up the VF interface to make this change effective.\n");
 
 error_param:
@@ -4235,7 +4224,7 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id,
 		/* duplicate request, so just return success */
 		goto error_pvid;
 
-	i40e_vc_disable_vf(vf);
+	i40e_vc_reset_vf(vf, true);
 	/* During reset the VF got a new VSI, so refresh a pointer. */
 	vsi = pf->vsi[vf->lan_vsi_idx];
 	/* Locked once because multiple functions below iterate list */
@@ -4613,7 +4602,7 @@ int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting)
 		goto out;
 
 	vf->trusted = setting;
-	i40e_vc_disable_vf(vf);
+	i40e_vc_reset_vf(vf, true);
 	dev_info(&pf->pdev->dev, "VF %u is now %strusted\n",
 		 vf_id, setting ? "" : "un");
 

From 2e6d218c1ec6fb9cd70693b78134cbc35ae0b5a9 Mon Sep 17 00:00:00 2001
From: Jedrzej Jagielski <jedrzej.jagielski@intel.com>
Date: Mon, 21 Jun 2021 08:37:31 +0000
Subject: [PATCH 82/96] i40e: Fix creation of first queue by omitting it if is
 not power of two

Reject TCs creation with proper message if the first queue
assignment is not equal to the power of two.
The first queue number was checked too late in the second queue
iteration, if second queue was configured at all. Now if first queue value
is not a power of two, then trying to create qdisc will be rejected.

Fixes: 8f88b3034db3 ("i40e: Add infrastructure for queue channel support")
Signed-off-by: Grzegorz Szczurek <grzegorzx.szczurek@intel.com>
Signed-off-by: Jedrzej Jagielski <jedrzej.jagielski@intel.com>
Tested-by: Tony Brelinski <tony.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 59 +++++++--------------
 1 file changed, 19 insertions(+), 40 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 37386a270db54..0a98fab6d019b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5786,24 +5786,6 @@ static void i40e_remove_queue_channels(struct i40e_vsi *vsi)
 	INIT_LIST_HEAD(&vsi->ch_list);
 }
 
-/**
- * i40e_is_any_channel - channel exist or not
- * @vsi: ptr to VSI to which channels are associated with
- *
- * Returns true or false if channel(s) exist for associated VSI or not
- **/
-static bool i40e_is_any_channel(struct i40e_vsi *vsi)
-{
-	struct i40e_channel *ch, *ch_tmp;
-
-	list_for_each_entry_safe(ch, ch_tmp, &vsi->ch_list, list) {
-		if (ch->initialized)
-			return true;
-	}
-
-	return false;
-}
-
 /**
  * i40e_get_max_queues_for_channel
  * @vsi: ptr to VSI to which channels are associated with
@@ -6310,26 +6292,15 @@ int i40e_create_queue_channel(struct i40e_vsi *vsi,
 	/* By default we are in VEPA mode, if this is the first VF/VMDq
 	 * VSI to be added switch to VEB mode.
 	 */
-	if ((!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) ||
-	    (!i40e_is_any_channel(vsi))) {
-		if (!is_power_of_2(vsi->tc_config.tc_info[0].qcount)) {
-			dev_dbg(&pf->pdev->dev,
-				"Failed to create channel. Override queues (%u) not power of 2\n",
-				vsi->tc_config.tc_info[0].qcount);
-			return -EINVAL;
-		}
 
-		if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) {
-			pf->flags |= I40E_FLAG_VEB_MODE_ENABLED;
+	if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) {
+		pf->flags |= I40E_FLAG_VEB_MODE_ENABLED;
 
-			if (vsi->type == I40E_VSI_MAIN) {
-				if (pf->flags & I40E_FLAG_TC_MQPRIO)
-					i40e_do_reset(pf, I40E_PF_RESET_FLAG,
-						      true);
-				else
-					i40e_do_reset_safe(pf,
-							   I40E_PF_RESET_FLAG);
-			}
+		if (vsi->type == I40E_VSI_MAIN) {
+			if (pf->flags & I40E_FLAG_TC_MQPRIO)
+				i40e_do_reset(pf, I40E_PF_RESET_FLAG, true);
+			else
+				i40e_do_reset_safe(pf, I40E_PF_RESET_FLAG);
 		}
 		/* now onwards for main VSI, number of queues will be value
 		 * of TC0's queue count
@@ -7982,12 +7953,20 @@ static int i40e_setup_tc(struct net_device *netdev, void *type_data)
 			    vsi->seid);
 		need_reset = true;
 		goto exit;
-	} else {
-		dev_info(&vsi->back->pdev->dev,
-			 "Setup channel (id:%u) utilizing num_queues %d\n",
-			 vsi->seid, vsi->tc_config.tc_info[0].qcount);
+	} else if (enabled_tc &&
+		   (!is_power_of_2(vsi->tc_config.tc_info[0].qcount))) {
+		netdev_info(netdev,
+			    "Failed to create channel. Override queues (%u) not power of 2\n",
+			    vsi->tc_config.tc_info[0].qcount);
+		ret = -EINVAL;
+		need_reset = true;
+		goto exit;
 	}
 
+	dev_info(&vsi->back->pdev->dev,
+		 "Setup channel (id:%u) utilizing num_queues %d\n",
+		 vsi->seid, vsi->tc_config.tc_info[0].qcount);
+
 	if (pf->flags & I40E_FLAG_TC_MQPRIO) {
 		if (vsi->mqprio_qopt.max_rate[0]) {
 			u64 max_tx_rate = vsi->mqprio_qopt.max_rate[0];

From 5aff430d4e33a0b48a6b3d5beb06f79da23f9916 Mon Sep 17 00:00:00 2001
From: Grzegorz Szczurek <grzegorzx.szczurek@intel.com>
Date: Fri, 29 Oct 2021 11:26:01 +0200
Subject: [PATCH 83/96] i40e: Fix display error code in dmesg

Fix misleading display error in dmesg if tc filter return fail.
Only i40e status error code should be converted to string, not linux
error code. Otherwise, we return false information about the error.

Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower")
Signed-off-by: Grzegorz Szczurek <grzegorzx.szczurek@intel.com>
Signed-off-by: Mateusz Palczewski <mateusz.palczewski@intel.com>
Tested-by: Dave Switzer <david.switzer@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 0a98fab6d019b..e118cf9265c79 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -8531,9 +8531,8 @@ static int i40e_configure_clsflower(struct i40e_vsi *vsi,
 		err = i40e_add_del_cloud_filter(vsi, filter, true);
 
 	if (err) {
-		dev_err(&pf->pdev->dev,
-			"Failed to add cloud filter, err %s\n",
-			i40e_stat_str(&pf->hw, err));
+		dev_err(&pf->pdev->dev, "Failed to add cloud filter, err %d\n",
+			err);
 		goto err;
 	}
 

From 3e6db079751afd527bf3db32314ae938dc571916 Mon Sep 17 00:00:00 2001
From: Tadeusz Struk <tadeusz.struk@linaro.org>
Date: Mon, 15 Nov 2021 08:01:43 -0800
Subject: [PATCH 84/96] tipc: check for null after calling kmemdup

kmemdup can return a null pointer so need to check for it, otherwise
the null key will be dereferenced later in tipc_crypto_key_xmit as
can be seen in the trace [1].

Cc: tipc-discussion@lists.sourceforge.net
Cc: stable@vger.kernel.org # 5.15, 5.14, 5.10

[1] https://syzkaller.appspot.com/bug?id=bca180abb29567b189efdbdb34cbf7ba851c2a58

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Tadeusz Struk <tadeusz.struk@linaro.org>
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Link: https://lore.kernel.org/r/20211115160143.5099-1-tadeusz.struk@linaro.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/tipc/crypto.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index e701651f6533a..b4d9419a015b1 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -597,6 +597,10 @@ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey,
 	tmp->cloned = NULL;
 	tmp->authsize = TIPC_AES_GCM_TAG_SIZE;
 	tmp->key = kmemdup(ukey, tipc_aead_key_size(ukey), GFP_KERNEL);
+	if (!tmp->key) {
+		tipc_aead_free(&tmp->rcu);
+		return -ENOMEM;
+	}
 	memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE);
 	atomic_set(&tmp->users, 0);
 	atomic64_set(&tmp->seqno, 0);

From 86cdf8e38792545161dbe3350a7eced558ba4d15 Mon Sep 17 00:00:00 2001
From: Lin Ma <linma@zju.edu.cn>
Date: Mon, 15 Nov 2021 22:56:00 +0800
Subject: [PATCH 85/96] NFC: reorganize the functions in nci_request

There is a possible data race as shown below:

thread-A in nci_request()       | thread-B in nci_close_device()
                                | mutex_lock(&ndev->req_lock);
test_bit(NCI_UP, &ndev->flags); |
...                             | test_and_clear_bit(NCI_UP, &ndev->flags)
mutex_lock(&ndev->req_lock);    |
                                |

This race will allow __nci_request() to be awaked while the device is
getting removed.

Similar to commit e2cb6b891ad2 ("bluetooth: eliminate the potential race
condition when removing the HCI controller"). this patch alters the
function sequence in nci_request() to prevent the data races between the
nci_close_device().

Signed-off-by: Lin Ma <linma@zju.edu.cn>
Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation")
Link: https://lore.kernel.org/r/20211115145600.8320-1-linma@zju.edu.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/nfc/nci/core.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index 6fd873aa86bee..1dd0269c1a723 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -144,12 +144,15 @@ inline int nci_request(struct nci_dev *ndev,
 {
 	int rc;
 
-	if (!test_bit(NCI_UP, &ndev->flags))
-		return -ENETDOWN;
-
 	/* Serialize all requests */
 	mutex_lock(&ndev->req_lock);
-	rc = __nci_request(ndev, req, opt, timeout);
+	/* check the state after obtaing the lock against any races
+	 * from nci_close_device when the device gets removed.
+	 */
+	if (test_bit(NCI_UP, &ndev->flags))
+		rc = __nci_request(ndev, req, opt, timeout);
+	else
+		rc = -ENETDOWN;
 	mutex_unlock(&ndev->req_lock);
 
 	return rc;

From 3e3b5dfcd16a3e254aab61bd1e8c417dd4503102 Mon Sep 17 00:00:00 2001
From: Lin Ma <linma@zju.edu.cn>
Date: Tue, 16 Nov 2021 23:26:52 +0800
Subject: [PATCH 86/96] NFC: reorder the logic in nfc_{un,}register_device

There is a potential UAF between the unregistration routine and the NFC
netlink operations.

The race that cause that UAF can be shown as below:

 (FREE)                      |  (USE)
nfcmrvl_nci_unregister_dev   |  nfc_genl_dev_up
  nci_close_device           |
  nci_unregister_device      |    nfc_get_device
    nfc_unregister_device    |    nfc_dev_up
      rfkill_destory         |
      device_del             |      rfkill_blocked
  ...                        |    ...

The root cause for this race is concluded below:
1. The rfkill_blocked (USE) in nfc_dev_up is supposed to be placed after
the device_is_registered check.
2. Since the netlink operations are possible just after the device_add
in nfc_register_device, the nfc_dev_up() can happen anywhere during the
rfkill creation process, which leads to data race.

This patch reorder these actions to permit
1. Once device_del is finished, the nfc_dev_up cannot dereference the
rfkill object.
2. The rfkill_register need to be placed after the device_add of nfc_dev
because the parent device need to be created first. So this patch keeps
the order but inject device_lock to prevent the data race.

Signed-off-by: Lin Ma <linma@zju.edu.cn>
Fixes: be055b2f89b5 ("NFC: RFKILL support")
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Link: https://lore.kernel.org/r/20211116152652.19217-1-linma@zju.edu.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/nfc/core.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/net/nfc/core.c b/net/nfc/core.c
index 3c645c1d99c9b..dc7a2404efdf9 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -94,13 +94,13 @@ int nfc_dev_up(struct nfc_dev *dev)
 
 	device_lock(&dev->dev);
 
-	if (dev->rfkill && rfkill_blocked(dev->rfkill)) {
-		rc = -ERFKILL;
+	if (!device_is_registered(&dev->dev)) {
+		rc = -ENODEV;
 		goto error;
 	}
 
-	if (!device_is_registered(&dev->dev)) {
-		rc = -ENODEV;
+	if (dev->rfkill && rfkill_blocked(dev->rfkill)) {
+		rc = -ERFKILL;
 		goto error;
 	}
 
@@ -1125,11 +1125,7 @@ int nfc_register_device(struct nfc_dev *dev)
 	if (rc)
 		pr_err("Could not register llcp device\n");
 
-	rc = nfc_genl_device_added(dev);
-	if (rc)
-		pr_debug("The userspace won't be notified that the device %s was added\n",
-			 dev_name(&dev->dev));
-
+	device_lock(&dev->dev);
 	dev->rfkill = rfkill_alloc(dev_name(&dev->dev), &dev->dev,
 				   RFKILL_TYPE_NFC, &nfc_rfkill_ops, dev);
 	if (dev->rfkill) {
@@ -1138,6 +1134,12 @@ int nfc_register_device(struct nfc_dev *dev)
 			dev->rfkill = NULL;
 		}
 	}
+	device_unlock(&dev->dev);
+
+	rc = nfc_genl_device_added(dev);
+	if (rc)
+		pr_debug("The userspace won't be notified that the device %s was added\n",
+			 dev_name(&dev->dev));
 
 	return 0;
 }
@@ -1154,10 +1156,17 @@ void nfc_unregister_device(struct nfc_dev *dev)
 
 	pr_debug("dev_name=%s\n", dev_name(&dev->dev));
 
+	rc = nfc_genl_device_removed(dev);
+	if (rc)
+		pr_debug("The userspace won't be notified that the device %s "
+			 "was removed\n", dev_name(&dev->dev));
+
+	device_lock(&dev->dev);
 	if (dev->rfkill) {
 		rfkill_unregister(dev->rfkill);
 		rfkill_destroy(dev->rfkill);
 	}
+	device_unlock(&dev->dev);
 
 	if (dev->ops->check_presence) {
 		device_lock(&dev->dev);
@@ -1167,11 +1176,6 @@ void nfc_unregister_device(struct nfc_dev *dev)
 		cancel_work_sync(&dev->check_pres_work);
 	}
 
-	rc = nfc_genl_device_removed(dev);
-	if (rc)
-		pr_debug("The userspace won't be notified that the device %s "
-			 "was removed\n", dev_name(&dev->dev));
-
 	nfc_llcp_unregister_device(dev);
 
 	mutex_lock(&nfc_devlist_mutex);

From 48b71a9e66c2eab60564b1b1c85f4928ed04e406 Mon Sep 17 00:00:00 2001
From: Lin Ma <linma@zju.edu.cn>
Date: Tue, 16 Nov 2021 23:27:32 +0800
Subject: [PATCH 87/96] NFC: add NCI_UNREG flag to eliminate the race

There are two sites that calls queue_work() after the
destroy_workqueue() and lead to possible UAF.

The first site is nci_send_cmd(), which can happen after the
nci_close_device as below

nfcmrvl_nci_unregister_dev   |  nfc_genl_dev_up
  nci_close_device           |
    flush_workqueue          |
    del_timer_sync           |
  nci_unregister_device      |    nfc_get_device
    destroy_workqueue        |    nfc_dev_up
    nfc_unregister_device    |      nci_dev_up
      device_del             |        nci_open_device
                             |          __nci_request
                             |            nci_send_cmd
                             |              queue_work !!!

Another site is nci_cmd_timer, awaked by the nci_cmd_work from the
nci_send_cmd.

  ...                        |  ...
  nci_unregister_device      |  queue_work
    destroy_workqueue        |
    nfc_unregister_device    |  ...
      device_del             |  nci_cmd_work
                             |  mod_timer
                             |  ...
                             |  nci_cmd_timer
                             |    queue_work !!!

For the above two UAF, the root cause is that the nfc_dev_up can race
between the nci_unregister_device routine. Therefore, this patch
introduce NCI_UNREG flag to easily eliminate the possible race. In
addition, the mutex_lock in nci_close_device can act as a barrier.

Signed-off-by: Lin Ma <linma@zju.edu.cn>
Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation")
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Link: https://lore.kernel.org/r/20211116152732.19238-1-linma@zju.edu.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/nfc/nci_core.h |  1 +
 net/nfc/nci/core.c         | 19 +++++++++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h
index a964daedc17b6..ea8595651c384 100644
--- a/include/net/nfc/nci_core.h
+++ b/include/net/nfc/nci_core.h
@@ -30,6 +30,7 @@ enum nci_flag {
 	NCI_UP,
 	NCI_DATA_EXCHANGE,
 	NCI_DATA_EXCHANGE_TO,
+	NCI_UNREG,
 };
 
 /* NCI device states */
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index 1dd0269c1a723..d2537383a3e89 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -476,6 +476,11 @@ static int nci_open_device(struct nci_dev *ndev)
 
 	mutex_lock(&ndev->req_lock);
 
+	if (test_bit(NCI_UNREG, &ndev->flags)) {
+		rc = -ENODEV;
+		goto done;
+	}
+
 	if (test_bit(NCI_UP, &ndev->flags)) {
 		rc = -EALREADY;
 		goto done;
@@ -548,6 +553,10 @@ static int nci_open_device(struct nci_dev *ndev)
 static int nci_close_device(struct nci_dev *ndev)
 {
 	nci_req_cancel(ndev, ENODEV);
+
+	/* This mutex needs to be held as a barrier for
+	 * caller nci_unregister_device
+	 */
 	mutex_lock(&ndev->req_lock);
 
 	if (!test_and_clear_bit(NCI_UP, &ndev->flags)) {
@@ -585,8 +594,8 @@ static int nci_close_device(struct nci_dev *ndev)
 
 	del_timer_sync(&ndev->cmd_timer);
 
-	/* Clear flags */
-	ndev->flags = 0;
+	/* Clear flags except NCI_UNREG */
+	ndev->flags &= BIT(NCI_UNREG);
 
 	mutex_unlock(&ndev->req_lock);
 
@@ -1269,6 +1278,12 @@ void nci_unregister_device(struct nci_dev *ndev)
 {
 	struct nci_conn_info *conn_info, *n;
 
+	/* This set_bit is not protected with specialized barrier,
+	 * However, it is fine because the mutex_lock(&ndev->req_lock);
+	 * in nci_close_device() will help to emit one.
+	 */
+	set_bit(NCI_UNREG, &ndev->flags);
+
 	nci_close_device(ndev);
 
 	destroy_workqueue(ndev->cmd_wq);

From a280ef90af01dc133d0a52387e563015686d6294 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 17 Nov 2021 10:34:54 +0300
Subject: [PATCH 88/96] octeontx2-af: debugfs: don't corrupt user memory

The user supplies the "count" value to say how big its read buffer is.
The rvu_dbg_lmtst_map_table_display() function does not take the "count"
into account but instead just copies the whole table, potentially
corrupting the user's data.

Introduce the "ret" variable to store how many bytes we can copy.  Also
I changed the type of "off" to size_t to make using min() simpler.

Fixes: 0daa55d033b0 ("octeontx2-af: cn10k: debugfs for dumping LMTST map table")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Link: https://lore.kernel.org/r/20211117073454.GD5237@kili
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../ethernet/marvell/octeontx2/af/rvu_debugfs.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
index c7fd466a0efdc..a09a507369ac3 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
@@ -236,10 +236,11 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp,
 	u64 lmt_addr, val, tbl_base;
 	int pf, vf, num_vfs, hw_vfs;
 	void __iomem *lmt_map_base;
-	int index = 0, off = 0;
-	int bytes_not_copied;
 	int buf_size = 10240;
+	size_t off = 0;
+	int index = 0;
 	char *buf;
+	int ret;
 
 	/* don't allow partial reads */
 	if (*ppos != 0)
@@ -303,15 +304,17 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp,
 	}
 	off +=	scnprintf(&buf[off], buf_size - 1 - off, "\n");
 
-	bytes_not_copied = copy_to_user(buffer, buf, off);
+	ret = min(off, count);
+	if (copy_to_user(buffer, buf, ret))
+		ret = -EFAULT;
 	kfree(buf);
 
 	iounmap(lmt_map_base);
-	if (bytes_not_copied)
-		return -EFAULT;
+	if (ret < 0)
+		return ret;
 
-	*ppos = off;
-	return off;
+	*ppos = ret;
+	return ret;
 }
 
 RVU_DEBUG_FOPS(lmtst_map_table, lmtst_map_table_display, NULL);

From a66998e0fbf213d47d02813b9679426129d0d114 Mon Sep 17 00:00:00 2001
From: Teng Qi <starmiku1207184332@gmail.com>
Date: Wed, 17 Nov 2021 11:44:53 +0800
Subject: [PATCH 89/96] ethernet: hisilicon: hns: hns_dsaf_misc: fix a possible
 array overflow in hns_dsaf_ge_srst_by_port()

The if statement:
  if (port >= DSAF_GE_NUM)
        return;

limits the value of port less than DSAF_GE_NUM (i.e., 8).
However, if the value of port is 6 or 7, an array overflow could occur:
  port_rst_off = dsaf_dev->mac_cb[port]->port_rst_off;

because the length of dsaf_dev->mac_cb is DSAF_MAX_PORT_NUM (i.e., 6).

To fix this possible array overflow, we first check port and if it is
greater than or equal to DSAF_MAX_PORT_NUM, the function returns.

Reported-by: TOTE Robot <oslab@tsinghua.edu.cn>
Signed-off-by: Teng Qi <starmiku1207184332@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
index 23d9cbf262c32..740850b64aff5 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
@@ -400,6 +400,10 @@ static void hns_dsaf_ge_srst_by_port(struct dsaf_device *dsaf_dev, u32 port,
 		return;
 
 	if (!HNS_DSAF_IS_DEBUG(dsaf_dev)) {
+		/* DSAF_MAX_PORT_NUM is 6, but DSAF_GE_NUM is 8.
+		   We need check to prevent array overflow */
+		if (port >= DSAF_MAX_PORT_NUM)
+			return;
 		reg_val_1  = 0x1 << port;
 		port_rst_off = dsaf_dev->mac_cb[port]->port_rst_off;
 		/* there is difference between V1 and V2 in register.*/

From f915b75bffb7257bd8d26376b8e1cc67771927f8 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Wed, 17 Nov 2021 15:56:52 +0800
Subject: [PATCH 90/96] page_pool: Revert "page_pool: disable dma mapping
 support..."

This reverts commit d00e60ee54b12de945b8493cf18c1ada9e422514.

As reported by Guillaume in [1]:
Enabling LPAE always enables CONFIG_ARCH_DMA_ADDR_T_64BIT
in 32-bit systems, which breaks the bootup proceess when a
ethernet driver is using page pool with PP_FLAG_DMA_MAP flag.
As we were hoping we had no active consumers for such system
when we removed the dma mapping support, and LPAE seems like
a common feature for 32 bits system, so revert it.

1. https://www.spinics.net/lists/netdev/msg779890.html

Fixes: d00e60ee54b1 ("page_pool: disable dma mapping support for 32-bit arch with 64-bit DMA")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Reported-by: "kernelci.org bot" <bot@kernelci.org>
Tested-by: "kernelci.org bot" <bot@kernelci.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mm_types.h | 13 ++++++++++++-
 include/net/page_pool.h  | 12 +++++++++++-
 net/core/page_pool.c     | 10 ++++------
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bb8c6f5f19bca..c3a6e62096006 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -105,7 +105,18 @@ struct page {
 			struct page_pool *pp;
 			unsigned long _pp_mapping_pad;
 			unsigned long dma_addr;
-			atomic_long_t pp_frag_count;
+			union {
+				/**
+				 * dma_addr_upper: might require a 64-bit
+				 * value on 32-bit architectures.
+				 */
+				unsigned long dma_addr_upper;
+				/**
+				 * For frag page support, not supported in
+				 * 32-bit architectures with 64-bit DMA.
+				 */
+				atomic_long_t pp_frag_count;
+			};
 		};
 		struct {	/* slab, slob and slub */
 			union {
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 3855f069627f4..a4082406a0039 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -216,14 +216,24 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	page_pool_put_full_page(pool, page, true);
 }
 
+#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT	\
+		(sizeof(dma_addr_t) > sizeof(unsigned long))
+
 static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 {
-	return page->dma_addr;
+	dma_addr_t ret = page->dma_addr;
+
+	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
+		ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16;
+
+	return ret;
 }
 
 static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
 {
 	page->dma_addr = addr;
+	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
+		page->dma_addr_upper = upper_32_bits(addr);
 }
 
 static inline void page_pool_set_frag_count(struct page *page, long nr)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 9b60e4301a44f..1a6978427d6c8 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -49,12 +49,6 @@ static int page_pool_init(struct page_pool *pool,
 	 * which is the XDP_TX use-case.
 	 */
 	if (pool->p.flags & PP_FLAG_DMA_MAP) {
-		/* DMA-mapping is not supported on 32-bit systems with
-		 * 64-bit DMA mapping.
-		 */
-		if (sizeof(dma_addr_t) > sizeof(unsigned long))
-			return -EOPNOTSUPP;
-
 		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
 		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
 			return -EINVAL;
@@ -75,6 +69,10 @@ static int page_pool_init(struct page_pool *pool,
 		 */
 	}
 
+	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
+	    pool->p.flags & PP_FLAG_PAGE_FRAG)
+		return -EINVAL;
+
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;
 

From fec1faf221f61118aa52f44c65a13c3e173a64c2 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Wed, 17 Nov 2021 16:49:09 +0200
Subject: [PATCH 91/96] devlink: Don't throw an error if flash notification
 sent before devlink visible

The mlxsw driver calls to various devlink flash routines even before
users can get any access to the devlink instance itself. For example,
mlxsw_core_fw_rev_validate() one of such functions.

__mlxsw_core_bus_device_register
 -> mlxsw_core_fw_rev_validate
  -> mlxsw_core_fw_flash
   -> mlxfw_firmware_flash
    -> mlxfw_status_notify
     -> devlink_flash_update_status_notify
      -> __devlink_flash_update_notify
       -> WARN_ON(...)

It causes to the WARN_ON to trigger warning about devlink not registered.

Fixes: cf530217408e ("devlink: Notify users when objects are accessible")
Reported-by: Danielle Ratson <danieller@nvidia.com>
Tested-by: Danielle Ratson <danieller@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 5ba4f9434acda..5ad72dbfcd079 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4229,7 +4229,9 @@ static void __devlink_flash_update_notify(struct devlink *devlink,
 	WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE &&
 		cmd != DEVLINK_CMD_FLASH_UPDATE_END &&
 		cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS);
-	WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED));
+
+	if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
+		return;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg)

From 5d2ca2e12dfb2aff3388ca57b06f570fa6206ced Mon Sep 17 00:00:00 2001
From: Jesse Brandeburg <jesse.brandeburg@intel.com>
Date: Wed, 17 Nov 2021 12:59:52 -0800
Subject: [PATCH 92/96] e100: fix device suspend/resume

As reported in [1], e100 was no longer working for suspend/resume
cycles. The previous commit mentioned in the fixes appears to have
broken things and this attempts to practice best known methods for
device power management and keep wake-up working while allowing
suspend/resume to work. To do this, I reorder a little bit of code
and fix the resume path to make sure the device is enabled.

[1] https://bugzilla.kernel.org/show_bug.cgi?id=214933

Fixes: 69a74aef8a18 ("e100: use generic power management")
Cc: Vaibhav Gupta <vaibhavgupta40@gmail.com>
Reported-by: Alexey Kuznetsov <axet@me.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Alexey Kuznetsov <axet@me.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/e100.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c
index 5039a25369517..0bf3d47bb90dc 100644
--- a/drivers/net/ethernet/intel/e100.c
+++ b/drivers/net/ethernet/intel/e100.c
@@ -3003,9 +3003,10 @@ static void __e100_shutdown(struct pci_dev *pdev, bool *enable_wake)
 	struct net_device *netdev = pci_get_drvdata(pdev);
 	struct nic *nic = netdev_priv(netdev);
 
+	netif_device_detach(netdev);
+
 	if (netif_running(netdev))
 		e100_down(nic);
-	netif_device_detach(netdev);
 
 	if ((nic->flags & wol_magic) | e100_asf(nic)) {
 		/* enable reverse auto-negotiation */
@@ -3022,7 +3023,7 @@ static void __e100_shutdown(struct pci_dev *pdev, bool *enable_wake)
 		*enable_wake = false;
 	}
 
-	pci_clear_master(pdev);
+	pci_disable_device(pdev);
 }
 
 static int __e100_power_off(struct pci_dev *pdev, bool wake)
@@ -3042,8 +3043,6 @@ static int __maybe_unused e100_suspend(struct device *dev_d)
 
 	__e100_shutdown(to_pci_dev(dev_d), &wake);
 
-	device_wakeup_disable(dev_d);
-
 	return 0;
 }
 
@@ -3051,6 +3050,14 @@ static int __maybe_unused e100_resume(struct device *dev_d)
 {
 	struct net_device *netdev = dev_get_drvdata(dev_d);
 	struct nic *nic = netdev_priv(netdev);
+	int err;
+
+	err = pci_enable_device(to_pci_dev(dev_d));
+	if (err) {
+		netdev_err(netdev, "Resume cannot enable PCI device, aborting\n");
+		return err;
+	}
+	pci_set_master(to_pci_dev(dev_d));
 
 	/* disable reverse auto-negotiation */
 	if (nic->phy == phy_82552_v) {
@@ -3062,10 +3069,11 @@ static int __maybe_unused e100_resume(struct device *dev_d)
 		           smartspeed & ~(E100_82552_REV_ANEG));
 	}
 
-	netif_device_attach(netdev);
 	if (netif_running(netdev))
 		e100_up(nic);
 
+	netif_device_attach(netdev);
+
 	return 0;
 }
 

From 5f9c55c8066bcd93ac25234a02585701fe2e31df Mon Sep 17 00:00:00 2001
From: Jordy Zomer <jordy@pwning.systems>
Date: Wed, 17 Nov 2021 20:06:48 +0100
Subject: [PATCH 93/96] ipv6: check return value of ipv6_skip_exthdr

The offset value is used in pointer math on skb->data.
Since ipv6_skip_exthdr may return -1 the pointer to uh and th
may not point to the actual udp and tcp headers and potentially
overwrite other stuff. This is why I think this should be checked.

EDIT:  added {}'s, thanks Kees

Signed-off-by: Jordy Zomer <jordy@pwning.systems>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/esp6.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index ed2f061b87685..f0bac6f7ab6bb 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -808,6 +808,12 @@ int esp6_input_done2(struct sk_buff *skb, int err)
 		struct tcphdr *th;
 
 		offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
+
+		if (offset < 0) {
+			err = -EINVAL;
+			goto out;
+		}
+
 		uh = (void *)(skb->data + offset);
 		th = (void *)(skb->data + offset);
 		hdr_len += offset;

From 61217be886b5f7402843677e4be7e7e83de9cb41 Mon Sep 17 00:00:00 2001
From: zhangyue <zhangyue1@kylinos.cn>
Date: Thu, 18 Nov 2021 13:46:32 +0800
Subject: [PATCH 94/96] net: tulip: de4x5: fix the problem that the array
 'lp->phy[8]' may be out of bound

In line 5001, if all id in the array 'lp->phy[8]' is not 0, when the
'for' end, the 'k' is 8.

At this time, the array 'lp->phy[8]' may be out of bound.

Signed-off-by: zhangyue <zhangyue1@kylinos.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/dec/tulip/de4x5.c | 30 +++++++++++++++-----------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c
index 13121c4dcfe6f..bb334042f8e13 100644
--- a/drivers/net/ethernet/dec/tulip/de4x5.c
+++ b/drivers/net/ethernet/dec/tulip/de4x5.c
@@ -5000,19 +5000,23 @@ mii_get_phy(struct net_device *dev)
 	}
 	if ((j == limit) && (i < DE4X5_MAX_MII)) {
 	    for (k=0; k < DE4X5_MAX_PHY && lp->phy[k].id; k++);
-	    lp->phy[k].addr = i;
-	    lp->phy[k].id = id;
-	    lp->phy[k].spd.reg = GENERIC_REG;      /* ANLPA register         */
-	    lp->phy[k].spd.mask = GENERIC_MASK;    /* 100Mb/s technologies   */
-	    lp->phy[k].spd.value = GENERIC_VALUE;  /* TX & T4, H/F Duplex    */
-	    lp->mii_cnt++;
-	    lp->active++;
-	    printk("%s: Using generic MII device control. If the board doesn't operate,\nplease mail the following dump to the author:\n", dev->name);
-	    j = de4x5_debug;
-	    de4x5_debug |= DEBUG_MII;
-	    de4x5_dbg_mii(dev, k);
-	    de4x5_debug = j;
-	    printk("\n");
+	    if (k < DE4X5_MAX_PHY) {
+		lp->phy[k].addr = i;
+		lp->phy[k].id = id;
+		lp->phy[k].spd.reg = GENERIC_REG;      /* ANLPA register         */
+		lp->phy[k].spd.mask = GENERIC_MASK;    /* 100Mb/s technologies   */
+		lp->phy[k].spd.value = GENERIC_VALUE;  /* TX & T4, H/F Duplex    */
+		lp->mii_cnt++;
+		lp->active++;
+		printk("%s: Using generic MII device control. If the board doesn't operate,\nplease mail the following dump to the author:\n", dev->name);
+		j = de4x5_debug;
+		de4x5_debug |= DEBUG_MII;
+		de4x5_dbg_mii(dev, k);
+		de4x5_debug = j;
+		printk("\n");
+	    } else {
+		goto purgatory;
+	    }
 	}
     }
   purgatory:

From 0fa68da72c3be09e06dd833258ee89c33374195f Mon Sep 17 00:00:00 2001
From: Teng Qi <starmiku1207184332@gmail.com>
Date: Thu, 18 Nov 2021 15:01:18 +0800
Subject: [PATCH 95/96] net: ethernet: dec: tulip: de4x5: fix possible array
 overflows in type3_infoblock()

The definition of macro MOTO_SROM_BUG is:
  #define MOTO_SROM_BUG    (lp->active == 8 && (get_unaligned_le32(
  dev->dev_addr) & 0x00ffffff) == 0x3e0008)

and the if statement
  if (MOTO_SROM_BUG) lp->active = 0;

using this macro indicates lp->active could be 8. If lp->active is 8 and
the second comparison of this macro is false. lp->active will remain 8 in:
  lp->phy[lp->active].gep = (*p ? p : NULL); p += (2 * (*p) + 1);
  lp->phy[lp->active].rst = (*p ? p : NULL); p += (2 * (*p) + 1);
  lp->phy[lp->active].mc  = get_unaligned_le16(p); p += 2;
  lp->phy[lp->active].ana = get_unaligned_le16(p); p += 2;
  lp->phy[lp->active].fdx = get_unaligned_le16(p); p += 2;
  lp->phy[lp->active].ttm = get_unaligned_le16(p); p += 2;
  lp->phy[lp->active].mci = *p;

However, the length of array lp->phy is 8, so array overflows can occur.
To fix these possible array overflows, we first check lp->active and then
return -EINVAL if it is greater or equal to ARRAY_SIZE(lp->phy) (i.e. 8).

Reported-by: TOTE Robot <oslab@tsinghua.edu.cn>
Signed-off-by: Teng Qi <starmiku1207184332@gmail.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/dec/tulip/de4x5.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c
index bb334042f8e13..71730ef4cd570 100644
--- a/drivers/net/ethernet/dec/tulip/de4x5.c
+++ b/drivers/net/ethernet/dec/tulip/de4x5.c
@@ -4709,6 +4709,10 @@ type3_infoblock(struct net_device *dev, u_char count, u_char *p)
         lp->ibn = 3;
         lp->active = *p++;
 	if (MOTO_SROM_BUG) lp->active = 0;
+	/* if (MOTO_SROM_BUG) statement indicates lp->active could
+	 * be 8 (i.e. the size of array lp->phy) */
+	if (WARN_ON(lp->active >= ARRAY_SIZE(lp->phy)))
+		return -EINVAL;
 	lp->phy[lp->active].gep = (*p ? p : NULL); p += (2 * (*p) + 1);
 	lp->phy[lp->active].rst = (*p ? p : NULL); p += (2 * (*p) + 1);
 	lp->phy[lp->active].mc  = get_unaligned_le16(p); p += 2;

From c7521d3aa2fa7fc785682758c99b5bcae503f6be Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 18 Nov 2021 14:22:11 +0300
Subject: [PATCH 96/96] ptp: ocp: Fix a couple NULL vs IS_ERR() checks

The ptp_ocp_get_mem() function does not return NULL, it returns error
pointers.

Fixes: 773bda964921 ("ptp: ocp: Expose various resources on the timecard.")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_ocp.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c
index 34f943c8c9fd8..0f1b5a7d2a89c 100644
--- a/drivers/ptp/ptp_ocp.c
+++ b/drivers/ptp/ptp_ocp.c
@@ -1304,10 +1304,11 @@ ptp_ocp_register_ext(struct ptp_ocp *bp, struct ocp_resource *r)
 	if (!ext)
 		return -ENOMEM;
 
-	err = -EINVAL;
 	ext->mem = ptp_ocp_get_mem(bp, r);
-	if (!ext->mem)
+	if (IS_ERR(ext->mem)) {
+		err = PTR_ERR(ext->mem);
 		goto out;
+	}
 
 	ext->bp = bp;
 	ext->info = r->extra;
@@ -1371,8 +1372,8 @@ ptp_ocp_register_mem(struct ptp_ocp *bp, struct ocp_resource *r)
 	void __iomem *mem;
 
 	mem = ptp_ocp_get_mem(bp, r);
-	if (!mem)
-		return -EINVAL;
+	if (IS_ERR(mem))
+		return PTR_ERR(mem);
 
 	bp_assign_entry(bp, r, mem);