From b1fd94e704571f98b21027340eecf821b2bdffba Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 18 May 2022 20:15:31 +0200
Subject: [PATCH 01/47] netfilter: use get_random_u32 instead of prandom

bh might occur while updating the per-cpu rnd_state from user context,
i.e. the local_out path.

BUG: using smp_processor_id() in preemptible [00000000] code: nginx/2725
caller is nft_ng_random_eval+0x24/0x54 [nft_numgen]
Call Trace:
 check_preemption_disabled+0xde/0xe0
 nft_ng_random_eval+0x24/0x54 [nft_numgen]

Use the random driver instead; this also avoids the need for a local prandom
state. Moreover, prandom itself uses the random driver since d4150779e60f
("random32: use real rng for non-deterministic randomness").

Based on an earlier patch from Pablo Neira.

Fixes: 6b2faee0ca91 ("netfilter: nft_meta: place prandom handling in a helper")
Fixes: 978d8f9055c3 ("netfilter: nft_numgen: add map lookups for numgen random operations")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_meta.c   | 13 ++-----------
 net/netfilter/nft_numgen.c | 12 +++---------
 2 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index ac4859241e177..55d2d49c34259 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -14,6 +14,7 @@
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
+#include <linux/random.h>
 #include <linux/smp.h>
 #include <linux/static_key.h>
 #include <net/dst.h>
@@ -32,8 +33,6 @@
 #define NFT_META_SECS_PER_DAY		86400
 #define NFT_META_DAYS_PER_WEEK		7
 
-static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
-
 static u8 nft_meta_weekday(void)
 {
 	time64_t secs = ktime_get_real_seconds();
@@ -271,13 +270,6 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest,
 	return true;
 }
 
-static noinline u32 nft_prandom_u32(void)
-{
-	struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
-
-	return prandom_u32_state(state);
-}
-
 #ifdef CONFIG_IP_ROUTE_CLASSID
 static noinline bool
 nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest)
@@ -389,7 +381,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 		break;
 #endif
 	case NFT_META_PRANDOM:
-		*dest = nft_prandom_u32();
+		*dest = get_random_u32();
 		break;
 #ifdef CONFIG_XFRM
 	case NFT_META_SECPATH:
@@ -518,7 +510,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 		len = IFNAMSIZ;
 		break;
 	case NFT_META_PRANDOM:
-		prandom_init_once(&nft_prandom_state);
 		len = sizeof(u32);
 		break;
 #ifdef CONFIG_XFRM
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 81b40c663d86a..45d3dc9e96f2c 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -9,12 +9,11 @@
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
+#include <linux/random.h>
 #include <linux/static_key.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables_core.h>
 
-static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state);
-
 struct nft_ng_inc {
 	u8			dreg;
 	u32			modulus;
@@ -135,12 +134,9 @@ struct nft_ng_random {
 	u32			offset;
 };
 
-static u32 nft_ng_random_gen(struct nft_ng_random *priv)
+static u32 nft_ng_random_gen(const struct nft_ng_random *priv)
 {
-	struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
-
-	return reciprocal_scale(prandom_u32_state(state), priv->modulus) +
-	       priv->offset;
+	return reciprocal_scale(get_random_u32(), priv->modulus) + priv->offset;
 }
 
 static void nft_ng_random_eval(const struct nft_expr *expr,
@@ -168,8 +164,6 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
 	if (priv->offset + priv->modulus - 1 < priv->offset)
 		return -EOVERFLOW;
 
-	prandom_init_once(&nft_numgen_prandom_state);
-
 	return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg,
 					NULL, NFT_DATA_VALUE, sizeof(u32));
 }

From a6e944f25cdbe6b82275402b8bc9a55ad7aac10b Mon Sep 17 00:00:00 2001
From: Ciara Loftus <ciara.loftus@intel.com>
Date: Tue, 14 Jun 2022 07:07:46 +0000
Subject: [PATCH 02/47] xsk: Fix generic transmit when completion queue
 reservation fails

Two points of potential failure in the generic transmit function are:

  1. completion queue (cq) reservation failure.
  2. skb allocation failure.

Originally the cq reservation was performed first, followed by the skb
allocation. Commit 675716400da6 ("xdp: fix possible cq entry leak")
reversed the order because at the time there was no mechanism available
to undo the cq reservation, which could have led to cq entry leaks in the
event of skb allocation failure. However, if the skb allocation is
performed first and the cq reservation then fails, the xsk skb destructor
is called, which blindly adds the skb address to the already full cq,
leading to undefined behavior.

This commit restores the original order (cq reservation followed by skb
allocation) and uses the xskq_prod_cancel helper to undo the cq reservation
in the event of skb allocation failure.

Fixes: 675716400da6 ("xdp: fix possible cq entry leak")
Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Link: https://lore.kernel.org/bpf/20220614070746.8871-1-ciara.loftus@intel.com
---
 net/xdp/xsk.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 19ac872a66240..09002387987ea 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -538,12 +538,6 @@ static int xsk_generic_xmit(struct sock *sk)
 			goto out;
 		}
 
-		skb = xsk_build_skb(xs, &desc);
-		if (IS_ERR(skb)) {
-			err = PTR_ERR(skb);
-			goto out;
-		}
-
 		/* This is the backpressure mechanism for the Tx path.
 		 * Reserve space in the completion queue and only proceed
 		 * if there is space in it. This avoids having to implement
@@ -552,11 +546,19 @@ static int xsk_generic_xmit(struct sock *sk)
 		spin_lock_irqsave(&xs->pool->cq_lock, flags);
 		if (xskq_prod_reserve(xs->pool->cq)) {
 			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
-			kfree_skb(skb);
 			goto out;
 		}
 		spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 
+		skb = xsk_build_skb(xs, &desc);
+		if (IS_ERR(skb)) {
+			err = PTR_ERR(skb);
+			spin_lock_irqsave(&xs->pool->cq_lock, flags);
+			xskq_prod_cancel(xs->pool->cq);
+			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
+			goto out;
+		}
+
 		err = __dev_direct_xmit(skb, xs->queue_id);
 		if  (err == NETDEV_TX_BUSY) {
 			/* Tell user-space to retry the send */

From 3046a827316c0e55fc563b4fb78c93b9ca5c7c37 Mon Sep 17 00:00:00 2001
From: Jon Maxwell <jmaxwell37@gmail.com>
Date: Wed, 15 Jun 2022 11:15:40 +1000
Subject: [PATCH 03/47] bpf: Fix request_sock leak in sk lookup helpers

A customer reported a request_sock leak in a Calico cloud environment. We
found that a BPF program was doing a socket lookup which takes a refcnt on
the socket, and that it was finding the request_sock but returning the parent
LISTEN socket via sk_to_full_sk() without decrementing the child request socket
first, resulting in a request_sock slab object leak. This patch retains the
existing behaviour of returning full socks to the caller, but it also decrements
the child request_sock, if one is present, before doing so to prevent the leak.

Thanks to Curtis Taylor for all the help in diagnosing and testing this. And
thanks to Antoine Tenart for the reproducer and patch input.

v2 of this patch contains a refactor, as per Daniel Borkmann's suggestion, to
validate the RCU flags on the listen socket so that it balances with
bpf_sk_release(), and updated comments as per Martin KaFai Lau's suggestion.
One small change to Daniel's suggestion: put "sk = sk2" under "if (sk2 != sk)"
to avoid an extra instruction.

Fixes: f7355a6c0497 ("bpf: Check sk_fullsock() before returning from bpf_sk_lookup()")
Fixes: edbf8c01de5a ("bpf: add skc_lookup_tcp helper")
Co-developed-by: Antoine Tenart <atenart@kernel.org>
Signed-off-by: Antoine Tenart <atenart@kernel.org>
Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Curtis Taylor <cutaylor-pub@yahoo.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/56d6f898-bde0-bb25-3427-12a330b29fb8@iogearbox.net
Link: https://lore.kernel.org/bpf/20220615011540.813025-1-jmaxwell37@gmail.com
---
 net/core/filter.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 5af58eb485875..5d16d66727fc8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6516,10 +6516,21 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
 					   ifindex, proto, netns_id, flags);
 
 	if (sk) {
-		sk = sk_to_full_sk(sk);
-		if (!sk_fullsock(sk)) {
+		struct sock *sk2 = sk_to_full_sk(sk);
+
+		/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+		 * sock refcnt is decremented to prevent a request_sock leak.
+		 */
+		if (!sk_fullsock(sk2))
+			sk2 = NULL;
+		if (sk2 != sk) {
 			sock_gen_put(sk);
-			return NULL;
+			/* Ensure there is no need to bump sk2 refcnt */
+			if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+				WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+				return NULL;
+			}
+			sk = sk2;
 		}
 	}
 
@@ -6553,10 +6564,21 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
 					 flags);
 
 	if (sk) {
-		sk = sk_to_full_sk(sk);
-		if (!sk_fullsock(sk)) {
+		struct sock *sk2 = sk_to_full_sk(sk);
+
+		/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+		 * sock refcnt is decremented to prevent a request_sock leak.
+		 */
+		if (!sk_fullsock(sk2))
+			sk2 = NULL;
+		if (sk2 != sk) {
 			sock_gen_put(sk);
-			return NULL;
+			/* Ensure there is no need to bump sk2 refcnt */
+			if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+				WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+				return NULL;
+			}
+			sk = sk2;
 		}
 	}
 

From d1a374a1aeb7e31191448e225ed2f9c5e894f280 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 15 Jun 2022 09:51:51 +0530
Subject: [PATCH 04/47] bpf: Limit maximum modifier chain length in
 btf_check_type_tags

When processing the module BTF of a module built for an older kernel, we might
sometimes find that some type points to itself, forming a loop. If such a
type is a modifier, the while loop in btf_check_type_tags that follows the
modifier chain will be caught in an infinite loop.

Fix this by defining a maximum chain length and bailing out if we spin
any longer than that.

Fixes: eb596b090558 ("bpf: Ensure type tags precede modifiers in BTF")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220615042151.2266537-1-memxor@gmail.com
---
 kernel/bpf/btf.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 63d0ac7dfe2fb..eb12d4f705cce 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4815,6 +4815,7 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
 	n = btf_nr_types(btf);
 	for (i = start_id; i < n; i++) {
 		const struct btf_type *t;
+		int chain_limit = 32;
 		u32 cur_id = i;
 
 		t = btf_type_by_id(btf, i);
@@ -4827,6 +4828,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
 
 		in_tags = btf_type_is_type_tag(t);
 		while (btf_type_is_modifier(t)) {
+			if (!chain_limit--) {
+				btf_verifier_log(env, "Max chain length or cycle detected");
+				return -ELOOP;
+			}
 			if (btf_type_is_type_tag(t)) {
 				if (!in_tags) {
 					btf_verifier_log(env, "Type tags don't precede modifiers");

From ff672c67ee7635ca1e28fb13729e8ef0d1f08ce5 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Thu, 16 Jun 2022 18:20:36 +0200
Subject: [PATCH 05/47] bpf, x86: Fix tail call count offset calculation on
 bpf2bpf call

On x86-64 the tail call count is passed from one BPF function to another
through %rax. Additionally, on function entry, the tail call count value
is stored on stack right after the BPF program stack, due to register
shortage.

The stored count is later loaded from stack either when performing a tail
call - to check if we have not reached the tail call limit - or before
calling another BPF function, in order to pass it via %rax.

In the latter case, we miscalculate the offset at which the tail call count
was stored on function entry. The JIT does not take into account that the
allocated BPF program stack is always a multiple of 8 on x86, while the
actual stack depth does not have to be.

This leads to a load from an offset that belongs to the BPF stack, as shown
in the example below:

SEC("tc")
int entry(struct __sk_buff *skb)
{
	/* Have data on stack which size is not a multiple of 8 */
	volatile char arr[1] = {};
	return subprog_tail(skb);
}

int entry(struct __sk_buff * skb):
   0: (b4) w2 = 0
   1: (73) *(u8 *)(r10 -1) = r2
   2: (85) call pc+1#bpf_prog_ce2f79bb5f3e06dd_F
   3: (95) exit

int entry(struct __sk_buff * skb):
   0xffffffffa0201788:  nop    DWORD PTR [rax+rax*1+0x0]
   0xffffffffa020178d:  xor    eax,eax
   0xffffffffa020178f:  push   rbp
   0xffffffffa0201790:  mov    rbp,rsp
   0xffffffffa0201793:  sub    rsp,0x8
   0xffffffffa020179a:  push   rax
   0xffffffffa020179b:  xor    esi,esi
   0xffffffffa020179d:  mov    BYTE PTR [rbp-0x1],sil
   0xffffffffa02017a1:  mov    rax,QWORD PTR [rbp-0x9]	!!! tail call count
   0xffffffffa02017a8:  call   0xffffffffa02017d8       !!! is at rbp-0x10
   0xffffffffa02017ad:  leave
   0xffffffffa02017ae:  ret

Fix it by rounding up the BPF stack depth to a multiple of 8, when
calculating the tail call count offset on stack.

Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT")
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220616162037.535469-2-jakub@cloudflare.com
---
 arch/x86/net/bpf_jit_comp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index f298b18a9a3d1..c98b8c0ed3b83 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1420,8 +1420,9 @@ st:			if (is_imm8(insn->off))
 		case BPF_JMP | BPF_CALL:
 			func = (u8 *) __bpf_call_base + imm32;
 			if (tail_call_reachable) {
+				/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
 				EMIT3_off32(0x48, 0x8B, 0x85,
-					    -(bpf_prog->aux->stack_depth + 8));
+					    -round_up(bpf_prog->aux->stack_depth, 8) - 8);
 				if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7))
 					return -EINVAL;
 			} else {

From 5e0b0a4c52d30bb09659446f40b77a692361600d Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Thu, 16 Jun 2022 18:20:37 +0200
Subject: [PATCH 06/47] selftests/bpf: Test tail call counting with bpf2bpf and
 data on stack

Cover the case when the tail call count needs to be passed from one BPF
function to another, and the caller has data on the stack. Specifically, the
case when the size of the data allocated on the BPF stack is not a multiple of 8.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220616162037.535469-3-jakub@cloudflare.com
---
 .../selftests/bpf/prog_tests/tailcalls.c      | 55 +++++++++++++++++++
 .../selftests/bpf/progs/tailcall_bpf2bpf6.c   | 42 ++++++++++++++
 2 files changed, 97 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/tailcall_bpf2bpf6.c

diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
index c4da87ec3ba47..19c70880cfb3a 100644
--- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c
+++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
@@ -831,6 +831,59 @@ static void test_tailcall_bpf2bpf_4(bool noise)
 	bpf_object__close(obj);
 }
 
+#include "tailcall_bpf2bpf6.skel.h"
+
+/* Tail call counting works even when there is data on stack which is
+ * not aligned to 8 bytes.
+ */
+static void test_tailcall_bpf2bpf_6(void)
+{
+	struct tailcall_bpf2bpf6 *obj;
+	int err, map_fd, prog_fd, main_fd, data_fd, i, val;
+	LIBBPF_OPTS(bpf_test_run_opts, topts,
+		.data_in = &pkt_v4,
+		.data_size_in = sizeof(pkt_v4),
+		.repeat = 1,
+	);
+
+	obj = tailcall_bpf2bpf6__open_and_load();
+	if (!ASSERT_OK_PTR(obj, "open and load"))
+		return;
+
+	main_fd = bpf_program__fd(obj->progs.entry);
+	if (!ASSERT_GE(main_fd, 0, "entry prog fd"))
+		goto out;
+
+	map_fd = bpf_map__fd(obj->maps.jmp_table);
+	if (!ASSERT_GE(map_fd, 0, "jmp_table map fd"))
+		goto out;
+
+	prog_fd = bpf_program__fd(obj->progs.classifier_0);
+	if (!ASSERT_GE(prog_fd, 0, "classifier_0 prog fd"))
+		goto out;
+
+	i = 0;
+	err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+	if (!ASSERT_OK(err, "jmp_table map update"))
+		goto out;
+
+	err = bpf_prog_test_run_opts(main_fd, &topts);
+	ASSERT_OK(err, "entry prog test run");
+	ASSERT_EQ(topts.retval, 0, "tailcall retval");
+
+	data_fd = bpf_map__fd(obj->maps.bss);
+	if (!ASSERT_GE(map_fd, 0, "bss map fd"))
+		goto out;
+
+	i = 0;
+	err = bpf_map_lookup_elem(data_fd, &i, &val);
+	ASSERT_OK(err, "bss map lookup");
+	ASSERT_EQ(val, 1, "done flag is set");
+
+out:
+	tailcall_bpf2bpf6__destroy(obj);
+}
+
 void test_tailcalls(void)
 {
 	if (test__start_subtest("tailcall_1"))
@@ -855,4 +908,6 @@ void test_tailcalls(void)
 		test_tailcall_bpf2bpf_4(false);
 	if (test__start_subtest("tailcall_bpf2bpf_5"))
 		test_tailcall_bpf2bpf_4(true);
+	if (test__start_subtest("tailcall_bpf2bpf_6"))
+		test_tailcall_bpf2bpf_6();
 }
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf6.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf6.c
new file mode 100644
index 0000000000000..41ce83da78e8b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf6.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#define __unused __attribute__((unused))
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__uint(max_entries, 1);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
+
+int done = 0;
+
+SEC("tc")
+int classifier_0(struct __sk_buff *skb __unused)
+{
+	done = 1;
+	return 0;
+}
+
+static __noinline
+int subprog_tail(struct __sk_buff *skb)
+{
+	/* Don't propagate the constant to the caller */
+	volatile int ret = 1;
+
+	bpf_tail_call_static(skb, &jmp_table, 0);
+	return ret;
+}
+
+SEC("tc")
+int entry(struct __sk_buff *skb)
+{
+	/* Have data on stack which size is not a multiple of 8 */
+	volatile char arr[1] = {};
+
+	return subprog_tail(skb);
+}
+
+char __license[] SEC("license") = "GPL";

From ad8848535e97f4a5374fc68f7a5d16e2565940cc Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 15 Jun 2022 13:21:15 +0200
Subject: [PATCH 07/47] selftests/bpf: Shuffle cookies symbols in kprobe multi
 test

There's a kernel bug that causes cookies to be misplaced, and
the reason we did not catch it with this test is that we
provide the bpf_fentry_test* functions already sorted by name.

Shuffling the bpf_fentry_test2 function deeper into the list while
keeping the current cookie values as before will trigger
the bug.

The kernel fix is coming in the following changes.

Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220615112118.497303-2-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/bpf_cookie.c     | 78 +++++++++----------
 .../selftests/bpf/progs/kprobe_multi.c        | 24 +++---
 2 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
index 83ef55e3caa4a..2974b44f80faf 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
@@ -121,24 +121,24 @@ static void kprobe_multi_link_api_subtest(void)
 })
 
 	GET_ADDR("bpf_fentry_test1", addrs[0]);
-	GET_ADDR("bpf_fentry_test2", addrs[1]);
-	GET_ADDR("bpf_fentry_test3", addrs[2]);
-	GET_ADDR("bpf_fentry_test4", addrs[3]);
-	GET_ADDR("bpf_fentry_test5", addrs[4]);
-	GET_ADDR("bpf_fentry_test6", addrs[5]);
-	GET_ADDR("bpf_fentry_test7", addrs[6]);
+	GET_ADDR("bpf_fentry_test3", addrs[1]);
+	GET_ADDR("bpf_fentry_test4", addrs[2]);
+	GET_ADDR("bpf_fentry_test5", addrs[3]);
+	GET_ADDR("bpf_fentry_test6", addrs[4]);
+	GET_ADDR("bpf_fentry_test7", addrs[5]);
+	GET_ADDR("bpf_fentry_test2", addrs[6]);
 	GET_ADDR("bpf_fentry_test8", addrs[7]);
 
 #undef GET_ADDR
 
-	cookies[0] = 1;
-	cookies[1] = 2;
-	cookies[2] = 3;
-	cookies[3] = 4;
-	cookies[4] = 5;
-	cookies[5] = 6;
-	cookies[6] = 7;
-	cookies[7] = 8;
+	cookies[0] = 1; /* bpf_fentry_test1 */
+	cookies[1] = 2; /* bpf_fentry_test3 */
+	cookies[2] = 3; /* bpf_fentry_test4 */
+	cookies[3] = 4; /* bpf_fentry_test5 */
+	cookies[4] = 5; /* bpf_fentry_test6 */
+	cookies[5] = 6; /* bpf_fentry_test7 */
+	cookies[6] = 7; /* bpf_fentry_test2 */
+	cookies[7] = 8; /* bpf_fentry_test8 */
 
 	opts.kprobe_multi.addrs = (const unsigned long *) &addrs;
 	opts.kprobe_multi.cnt = ARRAY_SIZE(addrs);
@@ -149,14 +149,14 @@ static void kprobe_multi_link_api_subtest(void)
 	if (!ASSERT_GE(link1_fd, 0, "link1_fd"))
 		goto cleanup;
 
-	cookies[0] = 8;
-	cookies[1] = 7;
-	cookies[2] = 6;
-	cookies[3] = 5;
-	cookies[4] = 4;
-	cookies[5] = 3;
-	cookies[6] = 2;
-	cookies[7] = 1;
+	cookies[0] = 8; /* bpf_fentry_test1 */
+	cookies[1] = 7; /* bpf_fentry_test3 */
+	cookies[2] = 6; /* bpf_fentry_test4 */
+	cookies[3] = 5; /* bpf_fentry_test5 */
+	cookies[4] = 4; /* bpf_fentry_test6 */
+	cookies[5] = 3; /* bpf_fentry_test7 */
+	cookies[6] = 2; /* bpf_fentry_test2 */
+	cookies[7] = 1; /* bpf_fentry_test8 */
 
 	opts.kprobe_multi.flags = BPF_F_KPROBE_MULTI_RETURN;
 	prog_fd = bpf_program__fd(skel->progs.test_kretprobe);
@@ -181,12 +181,12 @@ static void kprobe_multi_attach_api_subtest(void)
 	struct kprobe_multi *skel = NULL;
 	const char *syms[8] = {
 		"bpf_fentry_test1",
-		"bpf_fentry_test2",
 		"bpf_fentry_test3",
 		"bpf_fentry_test4",
 		"bpf_fentry_test5",
 		"bpf_fentry_test6",
 		"bpf_fentry_test7",
+		"bpf_fentry_test2",
 		"bpf_fentry_test8",
 	};
 	__u64 cookies[8];
@@ -198,14 +198,14 @@ static void kprobe_multi_attach_api_subtest(void)
 	skel->bss->pid = getpid();
 	skel->bss->test_cookie = true;
 
-	cookies[0] = 1;
-	cookies[1] = 2;
-	cookies[2] = 3;
-	cookies[3] = 4;
-	cookies[4] = 5;
-	cookies[5] = 6;
-	cookies[6] = 7;
-	cookies[7] = 8;
+	cookies[0] = 1; /* bpf_fentry_test1 */
+	cookies[1] = 2; /* bpf_fentry_test3 */
+	cookies[2] = 3; /* bpf_fentry_test4 */
+	cookies[3] = 4; /* bpf_fentry_test5 */
+	cookies[4] = 5; /* bpf_fentry_test6 */
+	cookies[5] = 6; /* bpf_fentry_test7 */
+	cookies[6] = 7; /* bpf_fentry_test2 */
+	cookies[7] = 8; /* bpf_fentry_test8 */
 
 	opts.syms = syms;
 	opts.cnt = ARRAY_SIZE(syms);
@@ -216,14 +216,14 @@ static void kprobe_multi_attach_api_subtest(void)
 	if (!ASSERT_OK_PTR(link1, "bpf_program__attach_kprobe_multi_opts"))
 		goto cleanup;
 
-	cookies[0] = 8;
-	cookies[1] = 7;
-	cookies[2] = 6;
-	cookies[3] = 5;
-	cookies[4] = 4;
-	cookies[5] = 3;
-	cookies[6] = 2;
-	cookies[7] = 1;
+	cookies[0] = 8; /* bpf_fentry_test1 */
+	cookies[1] = 7; /* bpf_fentry_test3 */
+	cookies[2] = 6; /* bpf_fentry_test4 */
+	cookies[3] = 5; /* bpf_fentry_test5 */
+	cookies[4] = 4; /* bpf_fentry_test6 */
+	cookies[5] = 3; /* bpf_fentry_test7 */
+	cookies[6] = 2; /* bpf_fentry_test2 */
+	cookies[7] = 1; /* bpf_fentry_test8 */
 
 	opts.retprobe = true;
 
diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi.c b/tools/testing/selftests/bpf/progs/kprobe_multi.c
index 93510f4f0f3aa..08f95a8155d1b 100644
--- a/tools/testing/selftests/bpf/progs/kprobe_multi.c
+++ b/tools/testing/selftests/bpf/progs/kprobe_multi.c
@@ -54,21 +54,21 @@ static void kprobe_multi_check(void *ctx, bool is_return)
 
 	if (is_return) {
 		SET(kretprobe_test1_result, &bpf_fentry_test1, 8);
-		SET(kretprobe_test2_result, &bpf_fentry_test2, 7);
-		SET(kretprobe_test3_result, &bpf_fentry_test3, 6);
-		SET(kretprobe_test4_result, &bpf_fentry_test4, 5);
-		SET(kretprobe_test5_result, &bpf_fentry_test5, 4);
-		SET(kretprobe_test6_result, &bpf_fentry_test6, 3);
-		SET(kretprobe_test7_result, &bpf_fentry_test7, 2);
+		SET(kretprobe_test2_result, &bpf_fentry_test2, 2);
+		SET(kretprobe_test3_result, &bpf_fentry_test3, 7);
+		SET(kretprobe_test4_result, &bpf_fentry_test4, 6);
+		SET(kretprobe_test5_result, &bpf_fentry_test5, 5);
+		SET(kretprobe_test6_result, &bpf_fentry_test6, 4);
+		SET(kretprobe_test7_result, &bpf_fentry_test7, 3);
 		SET(kretprobe_test8_result, &bpf_fentry_test8, 1);
 	} else {
 		SET(kprobe_test1_result, &bpf_fentry_test1, 1);
-		SET(kprobe_test2_result, &bpf_fentry_test2, 2);
-		SET(kprobe_test3_result, &bpf_fentry_test3, 3);
-		SET(kprobe_test4_result, &bpf_fentry_test4, 4);
-		SET(kprobe_test5_result, &bpf_fentry_test5, 5);
-		SET(kprobe_test6_result, &bpf_fentry_test6, 6);
-		SET(kprobe_test7_result, &bpf_fentry_test7, 7);
+		SET(kprobe_test2_result, &bpf_fentry_test2, 7);
+		SET(kprobe_test3_result, &bpf_fentry_test3, 2);
+		SET(kprobe_test4_result, &bpf_fentry_test4, 3);
+		SET(kprobe_test5_result, &bpf_fentry_test5, 4);
+		SET(kprobe_test6_result, &bpf_fentry_test6, 5);
+		SET(kprobe_test7_result, &bpf_fentry_test7, 6);
 		SET(kprobe_test8_result, &bpf_fentry_test8, 8);
 	}
 

From eb1b2985fe5c5f02e43e4c0d47bbe7ed835007f3 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 15 Jun 2022 13:21:16 +0200
Subject: [PATCH 08/47] ftrace: Keep address offset in ftrace_lookup_symbols

We want to store the resolved address at the same index as
the symbol string, because that is what the user (bpf kprobe link)
code assumes.

Also make sure we don't store duplicates that might be
present in kallsyms.

Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Fixes: bed0d9a50dac ("ftrace: Add ftrace_lookup_symbols function")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220615112118.497303-3-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/ftrace.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e750fe141a606..601ccf1b2f091 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -8029,15 +8029,23 @@ static int kallsyms_callback(void *data, const char *name,
 			     struct module *mod, unsigned long addr)
 {
 	struct kallsyms_data *args = data;
+	const char **sym;
+	int idx;
 
-	if (!bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp))
+	sym = bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp);
+	if (!sym)
+		return 0;
+
+	idx = sym - args->syms;
+	if (args->addrs[idx])
 		return 0;
 
 	addr = ftrace_location(addr);
 	if (!addr)
 		return 0;
 
-	args->addrs[args->found++] = addr;
+	args->addrs[idx] = addr;
+	args->found++;
 	return args->found == args->cnt ? 1 : 0;
 }
 
@@ -8062,6 +8070,7 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a
 	struct kallsyms_data args;
 	int err;
 
+	memset(addrs, 0, sizeof(*addrs) * cnt);
 	args.addrs = addrs;
 	args.syms = sorted_syms;
 	args.cnt = cnt;

From eb5fb0325698d05f0bf78d322de82c451a3685a2 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 15 Jun 2022 13:21:17 +0200
Subject: [PATCH 09/47] bpf: Force cookies array to follow symbols sorting

When the user specifies symbols and cookies for the kprobe_multi link
interface, it's very likely the cookies will be misplaced and
returned for the wrong functions (via the get_attach_cookie helper).

The reason is that to resolve the provided functions we sort
them before passing them to ftrace_lookup_symbols, but we do
not do the same sort on the cookie values.

Fix this by using the sort_r function with a custom swap callback
that swaps the cookie values as well.

Fixes: 0236fec57a15 ("bpf: Resolve symbols with ftrace_lookup_symbols for kprobe multi link")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220615112118.497303-4-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/bpf_trace.c | 60 ++++++++++++++++++++++++++++++----------
 1 file changed, 45 insertions(+), 15 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7a13e6ac6327c..88589d74a892e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2423,7 +2423,7 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
 	kprobe_multi_link_prog_run(link, entry_ip, regs);
 }
 
-static int symbols_cmp(const void *a, const void *b)
+static int symbols_cmp_r(const void *a, const void *b, const void *priv)
 {
 	const char **str_a = (const char **) a;
 	const char **str_b = (const char **) b;
@@ -2431,6 +2431,28 @@ static int symbols_cmp(const void *a, const void *b)
 	return strcmp(*str_a, *str_b);
 }
 
+struct multi_symbols_sort {
+	const char **funcs;
+	u64 *cookies;
+};
+
+static void symbols_swap_r(void *a, void *b, int size, const void *priv)
+{
+	const struct multi_symbols_sort *data = priv;
+	const char **name_a = a, **name_b = b;
+
+	swap(*name_a, *name_b);
+
+	/* If defined, swap also related cookies. */
+	if (data->cookies) {
+		u64 *cookie_a, *cookie_b;
+
+		cookie_a = data->cookies + (name_a - data->funcs);
+		cookie_b = data->cookies + (name_b - data->funcs);
+		swap(*cookie_a, *cookie_b);
+	}
+}
+
 int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 {
 	struct bpf_kprobe_multi_link *link = NULL;
@@ -2468,38 +2490,46 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	if (!addrs)
 		return -ENOMEM;
 
+	ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
+	if (ucookies) {
+		cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
+		if (!cookies) {
+			err = -ENOMEM;
+			goto error;
+		}
+		if (copy_from_user(cookies, ucookies, size)) {
+			err = -EFAULT;
+			goto error;
+		}
+	}
+
 	if (uaddrs) {
 		if (copy_from_user(addrs, uaddrs, size)) {
 			err = -EFAULT;
 			goto error;
 		}
 	} else {
+		struct multi_symbols_sort data = {
+			.cookies = cookies,
+		};
 		struct user_syms us;
 
 		err = copy_user_syms(&us, usyms, cnt);
 		if (err)
 			goto error;
 
-		sort(us.syms, cnt, sizeof(*us.syms), symbols_cmp, NULL);
+		if (cookies)
+			data.funcs = us.syms;
+
+		sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r,
+		       symbols_swap_r, &data);
+
 		err = ftrace_lookup_symbols(us.syms, cnt, addrs);
 		free_user_syms(&us);
 		if (err)
 			goto error;
 	}
 
-	ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
-	if (ucookies) {
-		cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
-		if (!cookies) {
-			err = -ENOMEM;
-			goto error;
-		}
-		if (copy_from_user(cookies, ucookies, size)) {
-			err = -EFAULT;
-			goto error;
-		}
-	}
-
 	link = kzalloc(sizeof(*link), GFP_KERNEL);
 	if (!link) {
 		err = -ENOMEM;

From 730067022c0137691b27726377c2d088f7f8e33c Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 15 Jun 2022 13:21:18 +0200
Subject: [PATCH 10/47] selftest/bpf: Fix kprobe_multi bench test

With [1] the available_filter_functions file contains records
starting with __ftrace_invalid_address___ that mark disabled
entries.

We need to filter them out so that the bench test passes only
resolvable symbols to the kernel.

[1] commit b39181f7c690 ("ftrace: Add FTRACE_MCOUNT_MAX_OFFSET to avoid adding weak function")

Fixes: b39181f7c690 ("ftrace: Add FTRACE_MCOUNT_MAX_OFFSET to avoid adding weak function")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220615112118.497303-5-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
index 586dc52d6fb96..5b93d5d0bd931 100644
--- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
@@ -364,6 +364,9 @@ static int get_syms(char ***symsp, size_t *cntp)
 			continue;
 		if (!strncmp(name, "rcu_", 4))
 			continue;
+		if (!strncmp(name, "__ftrace_invalid_address__",
+			     sizeof("__ftrace_invalid_address__") - 1))
+			continue;
 		err = hashmap__add(map, name, NULL);
 		if (err) {
 			free(name);

From 9b7fd1670a94a57d974795acebde843a5c1a354e Mon Sep 17 00:00:00 2001
From: Claudiu Manoil <claudiu.manoil@nxp.com>
Date: Fri, 10 Jun 2022 11:40:37 +0300
Subject: [PATCH 11/47] phy: aquantia: Fix AN when higher speeds than 1G are
 not advertised

Even when the eth port is restricted to speeds no higher than 1G,
and so the eth driver requests the phy (via phylink) to advertise up
to 1000BASET support, the aquantia phy device still advertises the 2.5G
and 5G speeds.
Clear these advertising defaults when requested.

Cc: Ondrej Spacek <ondrej.spacek@nxp.com>
Fixes: 09c4c57f7bc41 ("net: phy: aquantia: add support for auto-negotiation configuration")
Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
Link: https://lore.kernel.org/r/20220610084037.7625-1-claudiu.manoil@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/aquantia_main.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/aquantia_main.c b/drivers/net/phy/aquantia_main.c
index a8db1a19011bd..c7047f5d7a9b0 100644
--- a/drivers/net/phy/aquantia_main.c
+++ b/drivers/net/phy/aquantia_main.c
@@ -34,6 +34,8 @@
 #define MDIO_AN_VEND_PROV			0xc400
 #define MDIO_AN_VEND_PROV_1000BASET_FULL	BIT(15)
 #define MDIO_AN_VEND_PROV_1000BASET_HALF	BIT(14)
+#define MDIO_AN_VEND_PROV_5000BASET_FULL	BIT(11)
+#define MDIO_AN_VEND_PROV_2500BASET_FULL	BIT(10)
 #define MDIO_AN_VEND_PROV_DOWNSHIFT_EN		BIT(4)
 #define MDIO_AN_VEND_PROV_DOWNSHIFT_MASK	GENMASK(3, 0)
 #define MDIO_AN_VEND_PROV_DOWNSHIFT_DFLT	4
@@ -231,9 +233,20 @@ static int aqr_config_aneg(struct phy_device *phydev)
 			      phydev->advertising))
 		reg |= MDIO_AN_VEND_PROV_1000BASET_HALF;
 
+	/* Handle the case when the 2.5G and 5G speeds are not advertised */
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT,
+			      phydev->advertising))
+		reg |= MDIO_AN_VEND_PROV_2500BASET_FULL;
+
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT,
+			      phydev->advertising))
+		reg |= MDIO_AN_VEND_PROV_5000BASET_FULL;
+
 	ret = phy_modify_mmd_changed(phydev, MDIO_MMD_AN, MDIO_AN_VEND_PROV,
 				     MDIO_AN_VEND_PROV_1000BASET_HALF |
-				     MDIO_AN_VEND_PROV_1000BASET_FULL, reg);
+				     MDIO_AN_VEND_PROV_1000BASET_FULL |
+				     MDIO_AN_VEND_PROV_2500BASET_FULL |
+				     MDIO_AN_VEND_PROV_5000BASET_FULL, reg);
 	if (ret < 0)
 		return ret;
 	if (ret > 0)

From cc26c2661fefea215f41edb665193324a5f99021 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 16 Jun 2022 00:34:34 -0700
Subject: [PATCH 12/47] net: fix data-race in dev_isalive()

dev_isalive() is called under RTNL or dev_base_lock protection.

This means that changes to dev->reg_state should be done with both locks held.

syzbot reported:

BUG: KCSAN: data-race in register_netdevice / type_show

write to 0xffff888144ecf518 of 1 bytes by task 20886 on cpu 0:
register_netdevice+0xb9f/0xdf0 net/core/dev.c:10050
lapbeth_new_device drivers/net/wan/lapbether.c:414 [inline]
lapbeth_device_event+0x4a0/0x6c0 drivers/net/wan/lapbether.c:456
notifier_call_chain kernel/notifier.c:87 [inline]
raw_notifier_call_chain+0x53/0xb0 kernel/notifier.c:455
__dev_notify_flags+0x1d6/0x3a0
dev_change_flags+0xa2/0xc0 net/core/dev.c:8607
do_setlink+0x778/0x2230 net/core/rtnetlink.c:2780
__rtnl_newlink net/core/rtnetlink.c:3546 [inline]
rtnl_newlink+0x114c/0x16a0 net/core/rtnetlink.c:3593
rtnetlink_rcv_msg+0x811/0x8c0 net/core/rtnetlink.c:6089
netlink_rcv_skb+0x13e/0x240 net/netlink/af_netlink.c:2501
rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:6107
netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline]
netlink_unicast+0x58a/0x660 net/netlink/af_netlink.c:1345
netlink_sendmsg+0x661/0x750 net/netlink/af_netlink.c:1921
sock_sendmsg_nosec net/socket.c:714 [inline]
sock_sendmsg net/socket.c:734 [inline]
__sys_sendto+0x21e/0x2c0 net/socket.c:2119
__do_sys_sendto net/socket.c:2131 [inline]
__se_sys_sendto net/socket.c:2127 [inline]
__x64_sys_sendto+0x74/0x90 net/socket.c:2127
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x46/0xb0

read to 0xffff888144ecf518 of 1 bytes by task 20423 on cpu 1:
dev_isalive net/core/net-sysfs.c:38 [inline]
netdev_show net/core/net-sysfs.c:50 [inline]
type_show+0x24/0x90 net/core/net-sysfs.c:112
dev_attr_show+0x35/0x90 drivers/base/core.c:2095
sysfs_kf_seq_show+0x175/0x240 fs/sysfs/file.c:59
kernfs_seq_show+0x75/0x80 fs/kernfs/file.c:162
seq_read_iter+0x2c3/0x8e0 fs/seq_file.c:230
kernfs_fop_read_iter+0xd1/0x2f0 fs/kernfs/file.c:235
call_read_iter include/linux/fs.h:2052 [inline]
new_sync_read fs/read_write.c:401 [inline]
vfs_read+0x5a5/0x6a0 fs/read_write.c:482
ksys_read+0xe8/0x1a0 fs/read_write.c:620
__do_sys_read fs/read_write.c:630 [inline]
__se_sys_read fs/read_write.c:628 [inline]
__x64_sys_read+0x3e/0x50 fs/read_write.c:628
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x46/0xb0

value changed: 0x00 -> 0x01

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 20423 Comm: udevd Tainted: G W 5.19.0-rc2-syzkaller-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c       | 25 +++++++++++++++----------
 net/core/net-sysfs.c |  1 +
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 08ce317fcec89..8e6f229612066 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -397,16 +397,18 @@ static void list_netdevice(struct net_device *dev)
 /* Device list removal
  * caller must respect a RCU grace period before freeing/reusing dev
  */
-static void unlist_netdevice(struct net_device *dev)
+static void unlist_netdevice(struct net_device *dev, bool lock)
 {
 	ASSERT_RTNL();
 
 	/* Unlink dev from the device chain */
-	write_lock(&dev_base_lock);
+	if (lock)
+		write_lock(&dev_base_lock);
 	list_del_rcu(&dev->dev_list);
 	netdev_name_node_del(dev->name_node);
 	hlist_del_rcu(&dev->index_hlist);
-	write_unlock(&dev_base_lock);
+	if (lock)
+		write_unlock(&dev_base_lock);
 
 	dev_base_seq_inc(dev_net(dev));
 }
@@ -10043,11 +10045,11 @@ int register_netdevice(struct net_device *dev)
 		goto err_uninit;
 
 	ret = netdev_register_kobject(dev);
-	if (ret) {
-		dev->reg_state = NETREG_UNREGISTERED;
+	write_lock(&dev_base_lock);
+	dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
+	write_unlock(&dev_base_lock);
+	if (ret)
 		goto err_uninit;
-	}
-	dev->reg_state = NETREG_REGISTERED;
 
 	__netdev_update_features(dev);
 
@@ -10329,7 +10331,9 @@ void netdev_run_todo(void)
 			continue;
 		}
 
+		write_lock(&dev_base_lock);
 		dev->reg_state = NETREG_UNREGISTERED;
+		write_unlock(&dev_base_lock);
 		linkwatch_forget_dev(dev);
 	}
 
@@ -10810,9 +10814,10 @@ void unregister_netdevice_many(struct list_head *head)
 
 	list_for_each_entry(dev, head, unreg_list) {
 		/* And unlink it from device chain. */
-		unlist_netdevice(dev);
-
+		write_lock(&dev_base_lock);
+		unlist_netdevice(dev, false);
 		dev->reg_state = NETREG_UNREGISTERING;
+		write_unlock(&dev_base_lock);
 	}
 	flush_all_backlogs();
 
@@ -10959,7 +10964,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
 	dev_close(dev);
 
 	/* And unlink it from device chain */
-	unlist_netdevice(dev);
+	unlist_netdevice(dev, true);
 
 	synchronize_net();
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e319e242dddf0..a3642569fe535 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -33,6 +33,7 @@ static const char fmt_dec[] = "%d\n";
 static const char fmt_ulong[] = "%lu\n";
 static const char fmt_u64[] = "%llu\n";
 
+/* Caller holds RTNL or dev_base_lock */
 static inline int dev_isalive(const struct net_device *dev)
 {
 	return dev->reg_state <= NETREG_REGISTERED;

From e66e257a5d8368d9c0ba13d4630f474436533e8b Mon Sep 17 00:00:00 2001
From: Jay Vosburgh <jay.vosburgh@canonical.com>
Date: Thu, 16 Jun 2022 12:26:30 -0700
Subject: [PATCH 13/47] veth: Add updating of trans_start

Since commit 21a75f0915dd ("bonding: Fix ARP monitor validation"),
the bonding ARP / ND link monitors depend on the trans_start time to
determine link availability.  NETIF_F_LLTX drivers must update trans_start
directly, which veth does not do.  This prevents use of the ARP or ND link
monitors with veth interfaces in a bond.

	Resolve this by having veth_xmit update the trans_start time.

Reported-by: Jonathan Toppins <jtoppins@redhat.com>
Tested-by: Jonathan Toppins <jtoppins@redhat.com>
Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Fixes: 21a75f0915dd ("bonding: Fix ARP monitor validation")
Link: https://lore.kernel.org/netdev/b2fd4147-8f50-bebd-963a-1a3e8d1d9715@redhat.com/
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/veth.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 466da01ba2e3e..2cb833b3006a7 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -312,6 +312,7 @@ static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
+	struct netdev_queue *queue = NULL;
 	struct veth_rq *rq = NULL;
 	struct net_device *rcv;
 	int length = skb->len;
@@ -329,6 +330,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 	rxq = skb_get_queue_mapping(skb);
 	if (rxq < rcv->real_num_rx_queues) {
 		rq = &rcv_priv->rq[rxq];
+		queue = netdev_get_tx_queue(dev, rxq);
 
 		/* The napi pointer is available when an XDP program is
 		 * attached or when GRO is enabled
@@ -340,6 +342,8 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	skb_tx_timestamp(skb);
 	if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
+		if (queue)
+			txq_trans_cond_update(queue);
 		if (!use_napi)
 			dev_lstats_add(dev, length);
 	} else {

From 911600bf5a5e84bfda4d33ee32acc75ecf6159f0 Mon Sep 17 00:00:00 2001
From: Hoang Le <hoang.h.le@dektech.com.au>
Date: Fri, 17 Jun 2022 08:45:51 +0700
Subject: [PATCH 14/47] tipc: fix use-after-free Read in tipc_named_reinit

syzbot found the following issue on:
==================================================================
BUG: KASAN: use-after-free in tipc_named_reinit+0x94f/0x9b0
net/tipc/name_distr.c:413
Read of size 8 at addr ffff88805299a000 by task kworker/1:9/23764

CPU: 1 PID: 23764 Comm: kworker/1:9 Not tainted
5.18.0-rc4-syzkaller-00878-g17d49e6e8012 #0
Hardware name: Google Compute Engine/Google Compute Engine,
BIOS Google 01/01/2011
Workqueue: events tipc_net_finalize_work
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
 print_address_description.constprop.0.cold+0xeb/0x495
mm/kasan/report.c:313
 print_report mm/kasan/report.c:429 [inline]
 kasan_report.cold+0xf4/0x1c6 mm/kasan/report.c:491
 tipc_named_reinit+0x94f/0x9b0 net/tipc/name_distr.c:413
 tipc_net_finalize+0x234/0x3d0 net/tipc/net.c:138
 process_one_work+0x996/0x1610 kernel/workqueue.c:2289
 worker_thread+0x665/0x1080 kernel/workqueue.c:2436
 kthread+0x2e9/0x3a0 kernel/kthread.c:376
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:298
 </TASK>
[...]
==================================================================

In commit
d966ddcc3821 ("tipc: fix a deadlock when flushing scheduled work"),
cancel_work_sync() is called only to make sure that any
tipc_net_finalize_work() executing or pending on any CPU has completed
before the tipc namespace is destroyed through tipc_exit_net(). But this
function does not guarantee that the work is the last one queued. So the
destroyed instance may still be accessed by the work if it is enqueued later.

In order to fix this completely, we re-order the calls so that
tipc_net_finalize_work() is guaranteed to be the last work queued, and then
make sure it has completed by calling cancel_work_sync().

Reported-by: syzbot+47af19f3307fc9c5c82e@syzkaller.appspotmail.com
Fixes: d966ddcc3821 ("tipc: fix a deadlock when flushing scheduled work")
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Hoang Le <hoang.h.le@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/tipc/core.c b/net/tipc/core.c
index 3f4542e0f0650..434e70eabe081 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -109,10 +109,9 @@ static void __net_exit tipc_exit_net(struct net *net)
 	struct tipc_net *tn = tipc_net(net);
 
 	tipc_detach_loopback(net);
+	tipc_net_stop(net);
 	/* Make sure the tipc_net_finalize_work() finished */
 	cancel_work_sync(&tn->work);
-	tipc_net_stop(net);
-
 	tipc_bcast_stop(net);
 	tipc_nametbl_stop(net);
 	tipc_sk_rht_destroy(net);

From 2b04495e21cdb9b45c28c6aeb2da560184de20a3 Mon Sep 17 00:00:00 2001
From: Xu Jia <xujia39@huawei.com>
Date: Fri, 17 Jun 2022 17:31:06 +0800
Subject: [PATCH 15/47] hamradio: 6pack: fix array-index-out-of-bounds in
 decode_std_command()

Hulk Robot reports an incorrect sp->rx_count_cooked value in decode_std_command().
This is likely caused by the earlier subtraction from sp->rx_count_cooked.
It seems that the sp->rx_count_cooked value is changed to 0 concurrently, which
bypasses the previous check.

The situation is shown below:

         (Thread 1)			|  (Thread 2)
decode_std_command()		| resync_tnc()
...					|
if (rest == 2)			|
	sp->rx_count_cooked -= 2;	|
else if (rest == 3)			| ...
					| sp->rx_count_cooked = 0;
	sp->rx_count_cooked -= 1;	|
for (i = 0; i < sp->rx_count_cooked; i++) // report error
	checksum += sp->cooked_buf[i];

sp->rx_count_cooked is a shared variable but is not protected by a lock.
The same applies to sp->rx_count. This patch adds a lock to fix the bug.

The fail log is shown below:
=======================================================================
UBSAN: array-index-out-of-bounds in drivers/net/hamradio/6pack.c:925:31
index 400 is out of range for type 'unsigned char [400]'
CPU: 3 PID: 7433 Comm: kworker/u10:1 Not tainted 5.18.0-rc5-00163-g4b97bac0756a #2
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
Workqueue: events_unbound flush_to_ldisc
Call Trace:
 <TASK>
 dump_stack_lvl+0xcd/0x134
 ubsan_epilogue+0xb/0x50
 __ubsan_handle_out_of_bounds.cold+0x62/0x6c
 sixpack_receive_buf+0xfda/0x1330
 tty_ldisc_receive_buf+0x13e/0x180
 tty_port_default_receive_buf+0x6d/0xa0
 flush_to_ldisc+0x213/0x3f0
 process_one_work+0x98f/0x1620
 worker_thread+0x665/0x1080
 kthread+0x2e9/0x3a0
 ret_from_fork+0x1f/0x30
 ...

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Xu Jia <xujia39@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hamradio/6pack.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 45c3c4a1101b7..9fb5675242207 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -99,6 +99,7 @@ struct sixpack {
 
 	unsigned int		rx_count;
 	unsigned int		rx_count_cooked;
+	spinlock_t		rxlock;
 
 	int			mtu;		/* Our mtu (to spot changes!) */
 	int			buffsize;       /* Max buffers sizes */
@@ -565,6 +566,7 @@ static int sixpack_open(struct tty_struct *tty)
 	sp->dev = dev;
 
 	spin_lock_init(&sp->lock);
+	spin_lock_init(&sp->rxlock);
 	refcount_set(&sp->refcnt, 1);
 	init_completion(&sp->dead);
 
@@ -913,6 +915,7 @@ static void decode_std_command(struct sixpack *sp, unsigned char cmd)
 			sp->led_state = 0x60;
 			/* fill trailing bytes with zeroes */
 			sp->tty->ops->write(sp->tty, &sp->led_state, 1);
+			spin_lock_bh(&sp->rxlock);
 			rest = sp->rx_count;
 			if (rest != 0)
 				 for (i = rest; i <= 3; i++)
@@ -930,6 +933,7 @@ static void decode_std_command(struct sixpack *sp, unsigned char cmd)
 				sp_bump(sp, 0);
 			}
 			sp->rx_count_cooked = 0;
+			spin_unlock_bh(&sp->rxlock);
 		}
 		break;
 	case SIXP_TX_URUN: printk(KERN_DEBUG "6pack: TX underrun\n");
@@ -959,8 +963,11 @@ sixpack_decode(struct sixpack *sp, const unsigned char *pre_rbuff, int count)
 			decode_prio_command(sp, inbyte);
 		else if ((inbyte & SIXP_STD_CMD_MASK) != 0)
 			decode_std_command(sp, inbyte);
-		else if ((sp->status & SIXP_RX_DCD_MASK) == SIXP_RX_DCD_MASK)
+		else if ((sp->status & SIXP_RX_DCD_MASK) == SIXP_RX_DCD_MASK) {
+			spin_lock_bh(&sp->rxlock);
 			decode_data(sp, inbyte);
+			spin_unlock_bh(&sp->rxlock);
+		}
 	}
 }
 

From b4a028c4d031c27704ad73b1195ca69a1206941e Mon Sep 17 00:00:00 2001
From: Riccardo Paolo Bestetti <pbl@bestov.io>
Date: Fri, 17 Jun 2022 10:54:35 +0200
Subject: [PATCH 16/47] ipv4: ping: fix bind address validity check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 8ff978b8b222 ("ipv4/raw: support binding to nonlocal addresses")
introduced a helper function to fold duplicated validity checks of bind
addresses into inet_addr_valid_or_nonlocal(). However, this caused an
unintended regression in ping_check_bind_addr(), which previously would
reject binding to multicast and broadcast addresses, but now these are
both incorrectly allowed, as reported in [1].

This patch restores the original check. A simple reordering is done to
improve readability and make it evident that multicast and broadcast
addresses should not be allowed. Also, add an early exit for INADDR_ANY,
which restores the behavior added by commit 0ce779a9f501 ("net: Avoid
unnecessary inet_addr_type() call when addr is INADDR_ANY").

Furthermore, this patch introduces regression selftests to catch these
specific cases.

[1] https://lore.kernel.org/netdev/CANP3RGdkAcDyAZoT1h8Gtuu0saq+eOrrTiWbxnOs+5zn+cpyKg@mail.gmail.com/

Fixes: 8ff978b8b222 ("ipv4/raw: support binding to nonlocal addresses")
Cc: Miaohe Lin <linmiaohe@huawei.com>
Reported-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Carlos Llamas <cmllamas@google.com>
Signed-off-by: Riccardo Paolo Bestetti <pbl@bestov.io>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ping.c                           | 10 ++++---
 tools/testing/selftests/net/fcnal-test.sh | 33 +++++++++++++++++++++++
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 1a43ca73f94d6..3c6101def7d6b 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -319,12 +319,16 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
 		pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
 			 sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
 
+		if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
+			return 0;
+
 		tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
 		chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
 
-		if (!inet_addr_valid_or_nonlocal(net, inet_sk(sk),
-					         addr->sin_addr.s_addr,
-	                                         chk_addr_ret))
+		if (chk_addr_ret == RTN_MULTICAST ||
+		    chk_addr_ret == RTN_BROADCAST ||
+		    (chk_addr_ret != RTN_LOCAL &&
+		     !inet_can_nonlocal_bind(net, isk)))
 			return -EADDRNOTAVAIL;
 
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index 54701c8b0cd70..75223b63e3c88 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -70,6 +70,10 @@ NSB_LO_IP6=2001:db8:2::2
 NL_IP=172.17.1.1
 NL_IP6=2001:db8:4::1
 
+# multicast and broadcast addresses
+MCAST_IP=224.0.0.1
+BCAST_IP=255.255.255.255
+
 MD5_PW=abc123
 MD5_WRONG_PW=abc1234
 
@@ -308,6 +312,9 @@ addr2str()
 	127.0.0.1) echo "loopback";;
 	::1) echo "IPv6 loopback";;
 
+	${BCAST_IP}) echo "broadcast";;
+	${MCAST_IP}) echo "multicast";;
+
 	${NSA_IP})	echo "ns-A IP";;
 	${NSA_IP6})	echo "ns-A IPv6";;
 	${NSA_LO_IP})	echo "ns-A loopback IP";;
@@ -1800,6 +1807,19 @@ ipv4_addr_bind_novrf()
 	run_cmd nettest -s -R -P icmp -f -l ${a} -I ${NSA_DEV} -b
 	log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address after device bind"
 
+	#
+	# check that ICMP sockets cannot bind to broadcast and multicast addresses
+	#
+	a=${BCAST_IP}
+	log_start
+	run_cmd nettest -s -R -P icmp -l ${a} -b
+	log_test_addr ${a} $? 1 "ICMP socket bind to broadcast address"
+
+	a=${MCAST_IP}
+	log_start
+	run_cmd nettest -s -R -P icmp -f -l ${a} -b
+	log_test_addr ${a} $? 1 "ICMP socket bind to multicast address"
+
 	#
 	# tcp sockets
 	#
@@ -1857,6 +1877,19 @@ ipv4_addr_bind_vrf()
 	run_cmd nettest -s -R -P icmp -f -l ${a} -I ${VRF} -b
 	log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address after VRF bind"
 
+	#
+	# check that ICMP sockets cannot bind to broadcast and multicast addresses
+	#
+	a=${BCAST_IP}
+	log_start
+	run_cmd nettest -s -R -P icmp -l ${a} -I ${VRF} -b
+	log_test_addr ${a} $? 1 "ICMP socket bind to broadcast address after VRF bind"
+
+	a=${MCAST_IP}
+	log_start
+	run_cmd nettest -s -R -P icmp -f -l ${a} -I ${VRF} -b
+	log_test_addr ${a} $? 1 "ICMP socket bind to multicast address after VRF bind"
+
 	#
 	# tcp sockets
 	#

From 63ce81d1c40459e2d9d28f90e2a3e3863e2f63d4 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 17 Jun 2022 21:42:33 +0200
Subject: [PATCH 17/47] bpf, docs: Update some of the JIT/maintenance entries

Various minor updates around some of the BPF-related entries:

JITs for ARM32/NFP/SPARC/X86-32 haven't seen updates in quite a while, thus
for now, mark them as 'Odd Fixes' until they become more actively developed.

JITs for POWERPC/S390 are in good shape and receive active development and
review, thus bump them to 'Supported', similar to what we have with X86-64/ARM64.

JITs for MIPS/RISC-V are in similarly good shape as the ones mentioned above,
but are looked after mostly in spare time, thus leave them in the 'Maintained'
state for now.

Add Michael to the PPC JIT given he's picking up the patches there, so it better
reflects today's state.

Also, I haven't done much reviewing around BPF sockmap/kTLS since John and I
did the big rework back in the day to integrate sockmap with kTLS.

These days, most of this is taken care of by John, Jakub {Sitnicki,Kicinski} and
others in the community, so remove myself from these two.

Lastly, move all BPF-related entries into one place, that is, move the sockmap
one over near the rest of BPF.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/r/f9b8a63a0b48dc764bd4c50f87632889f5813f69.1655494758.git.daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 MAINTAINERS | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 96158b337b407..795f7e40230cf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3662,7 +3662,7 @@ BPF JIT for ARM
 M:	Shubham Bansal <illusionist.neo@gmail.com>
 L:	netdev@vger.kernel.org
 L:	bpf@vger.kernel.org
-S:	Maintained
+S:	Odd Fixes
 F:	arch/arm/net/
 
 BPF JIT for ARM64
@@ -3686,14 +3686,15 @@ BPF JIT for NFP NICs
 M:	Jakub Kicinski <kuba@kernel.org>
 L:	netdev@vger.kernel.org
 L:	bpf@vger.kernel.org
-S:	Supported
+S:	Odd Fixes
 F:	drivers/net/ethernet/netronome/nfp/bpf/
 
 BPF JIT for POWERPC (32-BIT AND 64-BIT)
 M:	Naveen N. Rao <naveen.n.rao@linux.ibm.com>
+M:	Michael Ellerman <mpe@ellerman.id.au>
 L:	netdev@vger.kernel.org
 L:	bpf@vger.kernel.org
-S:	Maintained
+S:	Supported
 F:	arch/powerpc/net/
 
 BPF JIT for RISC-V (32-bit)
@@ -3719,7 +3720,7 @@ M:	Heiko Carstens <hca@linux.ibm.com>
 M:	Vasily Gorbik <gor@linux.ibm.com>
 L:	netdev@vger.kernel.org
 L:	bpf@vger.kernel.org
-S:	Maintained
+S:	Supported
 F:	arch/s390/net/
 X:	arch/s390/net/pnet.c
 
@@ -3727,14 +3728,14 @@ BPF JIT for SPARC (32-BIT AND 64-BIT)
 M:	David S. Miller <davem@davemloft.net>
 L:	netdev@vger.kernel.org
 L:	bpf@vger.kernel.org
-S:	Maintained
+S:	Odd Fixes
 F:	arch/sparc/net/
 
 BPF JIT for X86 32-BIT
 M:	Wang YanQing <udknight@gmail.com>
 L:	netdev@vger.kernel.org
 L:	bpf@vger.kernel.org
-S:	Maintained
+S:	Odd Fixes
 F:	arch/x86/net/bpf_jit_comp32.c
 
 BPF JIT for X86 64-BIT
@@ -3757,6 +3758,19 @@ F:	include/linux/bpf_lsm.h
 F:	kernel/bpf/bpf_lsm.c
 F:	security/bpf/
 
+BPF L7 FRAMEWORK
+M:	John Fastabend <john.fastabend@gmail.com>
+M:	Jakub Sitnicki <jakub@cloudflare.com>
+L:	netdev@vger.kernel.org
+L:	bpf@vger.kernel.org
+S:	Maintained
+F:	include/linux/skmsg.h
+F:	net/core/skmsg.c
+F:	net/core/sock_map.c
+F:	net/ipv4/tcp_bpf.c
+F:	net/ipv4/udp_bpf.c
+F:	net/unix/unix_bpf.c
+
 BPFTOOL
 M:	Quentin Monnet <quentin@isovalent.com>
 L:	bpf@vger.kernel.org
@@ -11095,20 +11109,6 @@ S:	Maintained
 F:	include/net/l3mdev.h
 F:	net/l3mdev
 
-L7 BPF FRAMEWORK
-M:	John Fastabend <john.fastabend@gmail.com>
-M:	Daniel Borkmann <daniel@iogearbox.net>
-M:	Jakub Sitnicki <jakub@cloudflare.com>
-L:	netdev@vger.kernel.org
-L:	bpf@vger.kernel.org
-S:	Maintained
-F:	include/linux/skmsg.h
-F:	net/core/skmsg.c
-F:	net/core/sock_map.c
-F:	net/ipv4/tcp_bpf.c
-F:	net/ipv4/udp_bpf.c
-F:	net/unix/unix_bpf.c
-
 LANDLOCK SECURITY MODULE
 M:	Mickaël Salaün <mic@digikod.net>
 L:	linux-security-module@vger.kernel.org
@@ -13950,7 +13950,6 @@ F:	net/ipv6/tcp*.c
 NETWORKING [TLS]
 M:	Boris Pismenny <borisp@nvidia.com>
 M:	John Fastabend <john.fastabend@gmail.com>
-M:	Daniel Borkmann <daniel@iogearbox.net>
 M:	Jakub Kicinski <kuba@kernel.org>
 L:	netdev@vger.kernel.org
 S:	Maintained

From c88dbbcd88c233cb759ec857b57864c5bfcea26a Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Wed, 8 Jun 2022 01:11:02 +0900
Subject: [PATCH 18/47] fprobe, samples: Add use_trace option and show
 hit/missed counter

Add a use_trace option to use trace_printk() instead of pr_info()
so that the handler does not involve RCU operations.
Also show the hit and missed counters so that the user can check
how many times the probe handler hit and how many times it missed.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/165461826247.280167.11939123218334322352.stgit@devnote2
---
 samples/fprobe/fprobe_example.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c
index 24d3cf1091407..01ee6c8c83829 100644
--- a/samples/fprobe/fprobe_example.c
+++ b/samples/fprobe/fprobe_example.c
@@ -21,6 +21,7 @@
 #define BACKTRACE_DEPTH 16
 #define MAX_SYMBOL_LEN 4096
 struct fprobe sample_probe;
+static unsigned long nhit;
 
 static char symbol[MAX_SYMBOL_LEN] = "kernel_clone";
 module_param_string(symbol, symbol, sizeof(symbol), 0644);
@@ -28,6 +29,8 @@ static char nosymbol[MAX_SYMBOL_LEN] = "";
 module_param_string(nosymbol, nosymbol, sizeof(nosymbol), 0644);
 static bool stackdump = true;
 module_param(stackdump, bool, 0644);
+static bool use_trace = false;
+module_param(use_trace, bool, 0644);
 
 static void show_backtrace(void)
 {
@@ -40,7 +43,15 @@ static void show_backtrace(void)
 
 static void sample_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
 {
-	pr_info("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip);
+	if (use_trace)
+		/*
+		 * This is just an example, no kernel code should call
+		 * trace_printk() except when actively debugging.
+		 */
+		trace_printk("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip);
+	else
+		pr_info("Enter <%pS> ip = 0x%p\n", (void *)ip, (void *)ip);
+	nhit++;
 	if (stackdump)
 		show_backtrace();
 }
@@ -49,8 +60,17 @@ static void sample_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_r
 {
 	unsigned long rip = instruction_pointer(regs);
 
-	pr_info("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n",
-		(void *)ip, (void *)ip, (void *)rip, (void *)rip);
+	if (use_trace)
+		/*
+		 * This is just an example, no kernel code should call
+		 * trace_printk() except when actively debugging.
+		 */
+		trace_printk("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n",
+			(void *)ip, (void *)ip, (void *)rip, (void *)rip);
+	else
+		pr_info("Return from <%pS> ip = 0x%p to rip = 0x%p (%pS)\n",
+			(void *)ip, (void *)ip, (void *)rip, (void *)rip);
+	nhit++;
 	if (stackdump)
 		show_backtrace();
 }
@@ -112,7 +132,8 @@ static void __exit fprobe_exit(void)
 {
 	unregister_fprobe(&sample_probe);
 
-	pr_info("fprobe at %s unregistered\n", symbol);
+	pr_info("fprobe at %s unregistered. %ld times hit, %ld times missed\n",
+		symbol, nhit, sample_probe.nmissed);
 }
 
 module_init(fprobe_init)

From c0f3bb4054ef036e5f67e27f2e3cad9e6512cf00 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Wed, 8 Jun 2022 01:11:12 +0900
Subject: [PATCH 19/47] rethook: Reject getting a rethook if RCU is not
 watching

Since rethook_recycle() involves call_rcu() to reclaim the
rethook_instance, the rethook must be set up in an RCU-available
(non-idle) context. This rethook_recycle() call in the rethook trampoline
handler is unavoidable, thus the RCU-available check must be done before
setting up the rethook trampoline.

Add an rcu_is_watching() check to rethook_try_get() so that it
returns NULL when it is called while !rcu_is_watching().

Fixes: 54ecbe6f1ed5 ("rethook: Add a generic return hook")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/165461827269.280167.7379263615545598958.stgit@devnote2
---
 kernel/trace/rethook.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index b56833700d23f..c69d82273ce78 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -154,6 +154,15 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
 	if (unlikely(!handler))
 		return NULL;
 
+	/*
+	 * This expects the caller will set up a rethook on a function entry.
+	 * When the function returns, the rethook will eventually be reclaimed
+	 * or released in the rethook_recycle() with call_rcu().
+	 * This means the caller must be run in the RCU-available context.
+	 */
+	if (unlikely(!rcu_is_watching()))
+		return NULL;
+
 	fn = freelist_try_get(&rh->pool);
 	if (!fn)
 		return NULL;

From 394e771684f7a2cd4e154647bff50084c31bc7cf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 15 Jun 2022 15:36:54 +0200
Subject: [PATCH 20/47] netfilter: cttimeout: fix slab-out-of-bounds read typo
 in cttimeout_net_exit

syzbot reports:
  BUG: KASAN: slab-out-of-bounds in __list_del_entry_valid+0xcc/0xf0 lib/list_debug.c:42
  [..]
  list_del include/linux/list.h:148 [inline]
  cttimeout_net_exit+0x211/0x540 net/netfilter/nfnetlink_cttimeout.c:617

The problem is that the wrong list member name is used, so the container_of() result is wrong.
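
For illustration, a minimal userspace sketch (the struct layout below is a
simplified stand-in, not the kernel's actual timeout object) of why walking a
list through the wrong member name makes container_of() return a bogus pointer:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct list_head { struct list_head *next, *prev; };

struct timeout_obj {
	struct list_head head;		/* linked into the main list */
	struct list_head free_head;	/* linked into the free list */
	int refcnt;
};

int main(void)
{
	struct timeout_obj t = { .refcnt = 1 };
	struct list_head *node = &t.free_head;	/* node taken from the free list */

	/* Wrong member: subtracts the offset of 'head', yielding a bogus pointer */
	struct timeout_obj *bad = container_of(node, struct timeout_obj, head);
	/* Right member: recovers the enclosing object */
	struct timeout_obj *good = container_of(node, struct timeout_obj, free_head);

	printf("obj=%p good=%p bad=%p\n", (void *)&t, (void *)good, (void *)bad);
	return 0;
}

The loop in cttimeout_net_exit() hits exactly this: the entries are linked
through free_head, so iterating with 'head' reads outside the object, which
KASAN flags as a slab-out-of-bounds access.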

Reported-by: <syzbot+92968395eedbdbd3617d@syzkaller.appspotmail.com>
Fixes: 78222bacfca9 ("netfilter: cttimeout: decouple unlink and free on netns destruction")
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nfnetlink_cttimeout.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index af15102bc696f..f466af4f85317 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -614,7 +614,7 @@ static void __net_exit cttimeout_net_exit(struct net *net)
 
 	nf_ct_untimeout(net, NULL);
 
-	list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, head) {
+	list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, free_head) {
 		list_del(&cur->free_head);
 
 		if (refcount_dec_and_test(&cur->refcnt))

From 3f6a57ee8544ec3982f8a3cbcbf4aea7d47eb9ec Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Thu, 16 Jun 2022 16:13:20 +0200
Subject: [PATCH 21/47] igb: fix a use-after-free issue in igb_clean_tx_ring

Fix the following use-after-free bug in the igb_clean_tx_ring routine when
the NIC is running in XDP mode. The issue can be triggered by redirecting
traffic into the igb NIC and then closing the device while the traffic
is flowing.

[   73.322719] CPU: 1 PID: 487 Comm: xdp_redirect Not tainted 5.18.3-apu2 #9
[   73.330639] Hardware name: PC Engines APU2/APU2, BIOS 4.0.7 02/28/2017
[   73.337434] RIP: 0010:refcount_warn_saturate+0xa7/0xf0
[   73.362283] RSP: 0018:ffffc9000081f798 EFLAGS: 00010282
[   73.367761] RAX: 0000000000000000 RBX: ffffc90000420f80 RCX: 0000000000000000
[   73.375200] RDX: ffff88811ad22d00 RSI: ffff88811ad171e0 RDI: ffff88811ad171e0
[   73.382590] RBP: 0000000000000900 R08: ffffffff82298f28 R09: 0000000000000058
[   73.390008] R10: 0000000000000219 R11: ffffffff82280f40 R12: 0000000000000090
[   73.397356] R13: ffff888102343a40 R14: ffff88810359e0e4 R15: 0000000000000000
[   73.404806] FS:  00007ff38d31d740(0000) GS:ffff88811ad00000(0000) knlGS:0000000000000000
[   73.413129] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   73.419096] CR2: 000055cff35f13f8 CR3: 0000000106391000 CR4: 00000000000406e0
[   73.426565] Call Trace:
[   73.429087]  <TASK>
[   73.431314]  igb_clean_tx_ring+0x43/0x140 [igb]
[   73.436002]  igb_down+0x1d7/0x220 [igb]
[   73.439974]  __igb_close+0x3c/0x120 [igb]
[   73.444118]  igb_xdp+0x10c/0x150 [igb]
[   73.447983]  ? igb_pci_sriov_configure+0x70/0x70 [igb]
[   73.453362]  dev_xdp_install+0xda/0x110
[   73.457371]  dev_xdp_attach+0x1da/0x550
[   73.461369]  do_setlink+0xfd0/0x10f0
[   73.465166]  ? __nla_validate_parse+0x89/0xc70
[   73.469714]  rtnl_setlink+0x11a/0x1e0
[   73.473547]  rtnetlink_rcv_msg+0x145/0x3d0
[   73.477709]  ? rtnl_calcit.isra.0+0x130/0x130
[   73.482258]  netlink_rcv_skb+0x8d/0x110
[   73.486229]  netlink_unicast+0x230/0x340
[   73.490317]  netlink_sendmsg+0x215/0x470
[   73.494395]  __sys_sendto+0x179/0x190
[   73.498268]  ? move_addr_to_user+0x37/0x70
[   73.502547]  ? __sys_getsockname+0x84/0xe0
[   73.506853]  ? netlink_setsockopt+0x1c1/0x4a0
[   73.511349]  ? __sys_setsockopt+0xc8/0x1d0
[   73.515636]  __x64_sys_sendto+0x20/0x30
[   73.519603]  do_syscall_64+0x3b/0x80
[   73.523399]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   73.528712] RIP: 0033:0x7ff38d41f20c
[   73.551866] RSP: 002b:00007fff3b945a68 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
[   73.559640] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007ff38d41f20c
[   73.567066] RDX: 0000000000000034 RSI: 00007fff3b945b30 RDI: 0000000000000003
[   73.574457] RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000
[   73.581852] R10: 0000000000000000 R11: 0000000000000246 R12: 00007fff3b945ab0
[   73.589179] R13: 0000000000000000 R14: 0000000000000003 R15: 00007fff3b945b30
[   73.596545]  </TASK>
[   73.598842] ---[ end trace 0000000000000000 ]---

Fixes: 9cbc948b5a20c ("igb: add XDP support")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/r/e5c01d549dc37bff18e46aeabd6fb28a7bcf84be.1655388571.git.lorenzo@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 68be2976f539f..1c26bec7d6fa7 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -4819,8 +4819,11 @@ static void igb_clean_tx_ring(struct igb_ring *tx_ring)
 	while (i != tx_ring->next_to_use) {
 		union e1000_adv_tx_desc *eop_desc, *tx_desc;
 
-		/* Free all the Tx ring sk_buffs */
-		dev_kfree_skb_any(tx_buffer->skb);
+		/* Free all the Tx ring sk_buffs or xdp frames */
+		if (tx_buffer->type == IGB_TYPE_SKB)
+			dev_kfree_skb_any(tx_buffer->skb);
+		else
+			xdp_return_frame(tx_buffer->xdpf);
 
 		/* unmap skb header data */
 		dma_unmap_single(tx_ring->dev,

From 7a9214f3d88cfdb099f3896e102a306b316d8707 Mon Sep 17 00:00:00 2001
From: Jay Vosburgh <jay.vosburgh@canonical.com>
Date: Thu, 16 Jun 2022 12:32:40 -0700
Subject: [PATCH 22/47] bonding: ARP monitor spams NETDEV_NOTIFY_PEERS
 notifiers

The bonding ARP monitor fails to decrement send_peer_notif, the
number of peer notifications (gratuitous ARP or ND) to be sent. This
results in a continuous series of notifications.

Correct this by decrementing the counter for each notification.

Reported-by: Jonathan Toppins <jtoppins@redhat.com>
Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Fixes: b0929915e035 ("bonding: Fix RTNL: assertion failed at net/core/rtnetlink.c for ab arp monitor")
Link: https://lore.kernel.org/netdev/b2fd4147-8f50-bebd-963a-1a3e8d1d9715@redhat.com/
Tested-by: Jonathan Toppins <jtoppins@redhat.com>
Reviewed-by: Jonathan Toppins <jtoppins@redhat.com>
Link: https://lore.kernel.org/r/9400.1655407960@famine
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/bonding/bond_main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index f85372adf0426..6ba4c83fe5fc0 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3684,9 +3684,11 @@ static void bond_activebackup_arp_mon(struct bonding *bond)
 		if (!rtnl_trylock())
 			return;
 
-		if (should_notify_peers)
+		if (should_notify_peers) {
+			bond->send_peer_notif--;
 			call_netdevice_notifiers(NETDEV_NOTIFY_PEERS,
 						 bond->dev);
+		}
 		if (should_notify_rtnl) {
 			bond_slave_state_notify(bond);
 			bond_slave_link_notify(bond);

From a3bb7b63813f674fb62bac321cdd897cc62de094 Mon Sep 17 00:00:00 2001
From: Ivan Vecera <ivecera@redhat.com>
Date: Thu, 16 Jun 2022 18:08:55 +0200
Subject: [PATCH 23/47] ethtool: Fix get module eeprom fallback

Function fallback_set_params() checks whether the module type returned
by a driver is ETH_MODULE_SFF_8079 and, in that case, assumes that the
buffer contains the concatenated contents of pages A0h and A2h.
The check is wrong because the correct type is ETH_MODULE_SFF_8472.
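
As a rough sketch of the offset math the fallback performs (assuming the
usual 128-byte ETH_MODULE_EEPROM_PAGE_LEN; the helper name below is made up
for illustration):

#include <stdio.h>

#define ETH_MODULE_EEPROM_PAGE_LEN 128

/* Map (i2c_address, page, offset) onto the flat SFF-8472 buffer, which
 * concatenates the 256-byte A0h space (i2c 0x50) and the A2h space (0x51).
 */
static unsigned int sff8472_flat_offset(unsigned int i2c_address,
					unsigned int page, unsigned int offset)
{
	if (page)
		offset = page * ETH_MODULE_EEPROM_PAGE_LEN + offset;
	if (i2c_address == 0x51)	/* A2h data follows the 256-byte A0h space */
		offset += ETH_MODULE_EEPROM_PAGE_LEN * 2;
	return offset;
}

int main(void)
{
	printf("%u\n", sff8472_flat_offset(0x50, 0, 10));	/* 10  */
	printf("%u\n", sff8472_flat_offset(0x51, 0, 0));	/* 256 */
	return 0;
}

With the broken check comparing against ETH_MODULE_SFF_8079, this adjustment
was never applied for SFF-8472 modules, so reads addressed to 0x51 landed in
the A0h data instead.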

Fixes: 96d971e307cc ("ethtool: Add fallback to get_module_eeprom from netlink command")
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://lore.kernel.org/r/20220616160856.3623273-1-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ethtool/eeprom.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c
index 7e6b37a54add3..1c94bb8ea03f2 100644
--- a/net/ethtool/eeprom.c
+++ b/net/ethtool/eeprom.c
@@ -36,7 +36,7 @@ static int fallback_set_params(struct eeprom_req_info *request,
 	if (request->page)
 		offset = request->page * ETH_MODULE_EEPROM_PAGE_LEN + offset;
 
-	if (modinfo->type == ETH_MODULE_SFF_8079 &&
+	if (modinfo->type == ETH_MODULE_SFF_8472 &&
 	    request->i2c_address == 0x51)
 		offset += ETH_MODULE_EEPROM_PAGE_LEN * 2;
 

From a2b1a5d40bd12b44322c2ccd40bb0ec1699708b6 Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye@bytedance.com>
Date: Thu, 16 Jun 2022 16:43:36 -0700
Subject: [PATCH 24/47] net/sched: sch_netem: Fix arithmetic in netem_dump()
 for 32-bit platforms

As reported by Yuming, tc currently always shows a latency of UINT_MAX
for netem Qdiscs on 32-bit platforms:

    $ tc qdisc add dev dummy0 root netem latency 100ms
    $ tc qdisc show dev dummy0
    qdisc netem 8001: root refcnt 2 limit 1000 delay 275s  275s
                                               ^^^^^^^^^^^^^^^^

Let us take a closer look at netem_dump():

        qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
                             UINT_MAX);

qopt.latency is __u32, psched_tdiff_t is a signed long, and
(psched_tdiff_t)(UINT_MAX) is negative on 32-bit platforms, so
qopt.latency always ends up being UINT_MAX.

Fix it by using psched_time_t (u64) instead.
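
A minimal userspace sketch of the cast behaviour, assuming an ILP32 target
where long is 32 bits wide (the min_t() below only imitates the kernel macro
using a GNU C statement expression):

#include <stdio.h>
#include <limits.h>

#define min_t(type, x, y) ({ type _x = (x); type _y = (y); _x < _y ? _x : _y; })

int main(void)
{
	unsigned int latency;	/* plays the role of qopt.latency (__u32) */
	long ticks = 100000;	/* some positive latency, in scheduler ticks */

	/* (long)UINT_MAX == -1 on ILP32, so the "clamp" always wins ... */
	latency = min_t(long, ticks, UINT_MAX);
	printf("%u\n", latency);	/* 4294967295, i.e. UINT_MAX */

	/* ... whereas with an unsigned 64-bit type the clamp works as intended */
	latency = min_t(unsigned long long, ticks, UINT_MAX);
	printf("%u\n", latency);	/* 100000 */
	return 0;
}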

Note: confusingly, users have two ways to specify 'latency':

  1. normally, via '__u32 latency' in struct tc_netem_qopt;
  2. via the TCA_NETEM_LATENCY64 attribute, which is s64.

For the second case, 'latency' could theoretically be negative. This
patch ignores that corner case, since it is broken anyway (i.e. assigning a
negative s64 to a __u32) and should be handled separately.

Thanks to Ted Lin for the analysis [1].

[1] https://github.com/raspberrypi/linux/issues/3512

Reported-by: Yuming Chen <chenyuming.junnan@bytedance.com>
Fixes: 112f9cb65643 ("netem: convert to qdisc_watchdog_schedule_ns")
Reviewed-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Link: https://lore.kernel.org/r/20220616234336.2443-1-yepeilin.cs@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/sched/sch_netem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index ed4ccef5d6a82..5449ed114e406 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1146,9 +1146,9 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct tc_netem_rate rate;
 	struct tc_netem_slot slot;
 
-	qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
+	qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
 			     UINT_MAX);
-	qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
+	qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
 			    UINT_MAX);
 	qopt.limit = q->limit;
 	qopt.loss = q->loss;

From 8fc74d18639a2402ca52b177e990428e26ea881f Mon Sep 17 00:00:00 2001
From: Wentao_Liang <Wentao_Liang_g@163.com>
Date: Sun, 19 Jun 2022 22:14:54 +0800
Subject: [PATCH 25/47] drivers/net/ethernet/neterion/vxge: Fix a
 use-after-free bug in vxge-main.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pointer vdev points to a memory region adjacent to the net_device
structure ndev, which is a field of hldev. At line 4740, the call to
vxge_device_unregister unregisters the device hldev and also releases
the memory region pointed to by vdev->bar0. At line 4743, the freed memory
region is referenced (i.e., iounmap(vdev->bar0)), resulting in a
use-after-free. We can fix the bug by calling iounmap
before vxge_device_unregister.

4721.      static void vxge_remove(struct pci_dev *pdev)
4722.      {
4723.             struct __vxge_hw_device *hldev;
4724.             struct vxgedev *vdev;
…
4731.             vdev = netdev_priv(hldev->ndev);
…
4740.             vxge_device_unregister(hldev);
4741.             /* Do not call pci_disable_sriov here, as it will break child devices */
4742.             vxge_hw_device_terminate(hldev);
4743.             iounmap(vdev->bar0);
…
4749.             vxge_debug_init(vdev->level_trace, "%s:%d Device unregistered",
4750.                             __func__, __LINE__);
4751.             vxge_debug_entryexit(vdev->level_trace, "%s:%d Exiting...",
4752.                             __func__, __LINE__);
4753.      }

This is the report when the vulnerability is triggered with KASAN enabled.
We can see that KASAN reports a use-after-free.

/***************************start**************************/

root@kernel:~# echo 1 > /sys/bus/pci/devices/0000:00:03.0/remove
[  178.296316] vxge_remove
[  182.057081]
 ==================================================================
[  182.057548] BUG: KASAN: use-after-free in vxge_remove+0xe0/0x15c
[  182.057760] Read of size 8 at addr ffff888006c76598 by task bash/119
[  182.057983]
[  182.058747] CPU: 0 PID: 119 Comm: bash Not tainted 5.18.0 #5
[  182.058919] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[  182.059463] Call Trace:
[  182.059726]  <TASK>
[  182.060017]  dump_stack_lvl+0x34/0x44
[  182.060316]  print_report.cold+0xb2/0x6b7
[  182.060401]  ? kfree+0x89/0x290
[  182.060478]  ? vxge_remove+0xe0/0x15c
[  182.060545]  kasan_report+0xa9/0x120
[  182.060629]  ? vxge_remove+0xe0/0x15c
[  182.060706]  vxge_remove+0xe0/0x15c
[  182.060793]  pci_device_remove+0x5d/0xe0
[  182.060968]  device_release_driver_internal+0xf1/0x180
[  182.061063]  pci_stop_bus_device+0xae/0xe0
[  182.061150]  pci_stop_and_remove_bus_device_locked+0x11/0x20
[  182.061236]  remove_store+0xc6/0xe0
[  182.061297]  ? subordinate_bus_number_show+0xc0/0xc0
[  182.061359]  ? __mutex_lock_slowpath+0x10/0x10
[  182.061438]  ? sysfs_kf_write+0x6d/0xa0
[  182.061525]  kernfs_fop_write_iter+0x1b0/0x260
[  182.061610]  ? sysfs_kf_bin_read+0xf0/0xf0
[  182.061695]  new_sync_write+0x209/0x310
[  182.061789]  ? new_sync_read+0x310/0x310
[  182.061865]  ? cgroup_rstat_updated+0x5c/0x170
[  182.061937]  ? preempt_count_sub+0xf/0xb0
[  182.061995]  ? pick_next_entity+0x13a/0x220
[  182.062063]  ? __inode_security_revalidate+0x44/0x80
[  182.062155]  ? security_file_permission+0x46/0x2a0
[  182.062230]  vfs_write+0x33f/0x3e0
[  182.062303]  ksys_write+0xb4/0x150
[  182.062369]  ? __ia32_sys_read+0x40/0x40
[  182.062451]  do_syscall_64+0x3b/0x90
[  182.062531]  entry_SYSCALL_64_after_hwframe+0x46/0xb0
[  182.062894] RIP: 0033:0x7f3f37d17274
[  182.063558] Code: 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f
80 00 00 00 00 48 8d 05 89 54 0d 00 8b 00 85 c0 75 13 b8 01 00 00 00 0f
05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 41 54 49 89 d4 55 48 89 f5 53
[  182.063797] RSP: 002b:00007ffd5ba9e178 EFLAGS: 00000246
ORIG_RAX: 0000000000000001
[  182.064117] RAX: ffffffffffffffda RBX: 0000000000000002
RCX: 00007f3f37d17274
[  182.064219] RDX: 0000000000000002 RSI: 000055bbec327180
RDI: 0000000000000001
[  182.064315] RBP: 000055bbec327180 R08: 000000000000000a
R09: 00007f3f37de7cf0
[  182.064414] R10: 000000000000000a R11: 0000000000000246
R12: 00007f3f37de8760
[  182.064513] R13: 0000000000000002 R14: 00007f3f37de3760
R15: 0000000000000002
[  182.064691]  </TASK>
[  182.064916]
[  182.065224] The buggy address belongs to the physical page:
[  182.065804] page:00000000ef31e4f4 refcount:0 mapcount:0
mapping:0000000000000000 index:0x0 pfn:0x6c76
[  182.067419] flags: 0x100000000000000(node=0|zone=1)
[  182.068997] raw: 0100000000000000 0000000000000000
ffffea00001b1d88 0000000000000000
[  182.069118] raw: 0000000000000000 0000000000000000
00000000ffffffff 0000000000000000
[  182.069294] page dumped because: kasan: bad access detected
[  182.069331]
[  182.069360] Memory state around the buggy address:
[  182.070006]  ffff888006c76480: ff ff ff ff ff ff ff ff ff ff ff
 ff ff ff ff ff
[  182.070136]  ffff888006c76500: ff ff ff ff ff ff ff ff ff ff ff
 ff ff ff ff ff
[  182.070230] >ffff888006c76580: ff ff ff ff ff ff ff ff ff ff ff
 ff ff ff ff ff
[  182.070305]                             ^
[  182.070456]  ffff888006c76600: ff ff ff ff ff ff ff ff ff ff ff
 ff ff ff ff ff
[  182.070505]  ffff888006c76680: ff ff ff ff ff ff ff ff ff ff ff
 ff ff ff ff ff
[  182.070606]
==================================================================
[  182.071374] Disabling lock debugging due to kernel taint

/*****************************end*****************************/

After fixing the bug as done in the patch, KASAN no longer reports the bug
and the device (00:03.0) is successfully removed.

/*****************************start***************************/

root@kernel:~# echo 1 > /sys/bus/pci/devices/0000:00:03.0/remove
root@kernel:~#

/******************************end****************************/

Signed-off-by: Wentao_Liang <Wentao_Liang_g@163.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/neterion/vxge/vxge-main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index fa5d4ddf429b6..092fd0ae58314 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -4736,10 +4736,10 @@ static void vxge_remove(struct pci_dev *pdev)
 	for (i = 0; i < vdev->no_of_vpath; i++)
 		vxge_free_mac_add_list(&vdev->vpaths[i]);
 
+	iounmap(vdev->bar0);
 	vxge_device_unregister(hldev);
 	/* Do not call pci_disable_sriov here, as it will break child devices */
 	vxge_hw_device_terminate(hldev);
-	iounmap(vdev->bar0);
 	pci_release_region(pdev, 0);
 	pci_disable_device(pdev);
 	driver_config->config_dev_cnt--;

From 9926de7315be3d606cc011a305ad9adb9e8e14c9 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Sat, 18 Jun 2022 14:23:33 +0200
Subject: [PATCH 26/47] net: phy: at803x: fix NULL pointer dereference on
 AR9331 PHY

The latest kernel crashes in the PHY interrupt configuration, since it now
depends on an allocated priv. So, run probe to allocate priv to fix it.

 ar9331_switch ethernet.1:10 lan0 (uninitialized): PHY [!ahb!ethernet@1a000000!mdio!switch@10:00] driver [Qualcomm Atheros AR9331 built-in PHY] (irq=13)
 CPU 0 Unable to handle kernel paging request at virtual address 0000000a, epc == 8050e8a8, ra == 80504b34
         ...
 Call Trace:
 [<8050e8a8>] at803x_config_intr+0x5c/0xd0
 [<80504b34>] phy_request_interrupt+0xa8/0xd0
 [<8050289c>] phylink_bringup_phy+0x2d8/0x3ac
 [<80502b68>] phylink_fwnode_phy_connect+0x118/0x130
 [<8074d8ec>] dsa_slave_create+0x270/0x420
 [<80743b04>] dsa_port_setup+0x12c/0x148
 [<8074580c>] dsa_register_switch+0xaf0/0xcc0
 [<80511344>] ar9331_sw_probe+0x370/0x388
 [<8050cb78>] mdio_probe+0x44/0x70
 [<804df300>] really_probe+0x200/0x424
 [<804df7b4>] __driver_probe_device+0x290/0x298
 [<804df810>] driver_probe_device+0x54/0xe4
 [<804dfd50>] __device_attach_driver+0xe4/0x130
 [<804dcb00>] bus_for_each_drv+0xb4/0xd8
 [<804dfac4>] __device_attach+0x104/0x1a4
 [<804ddd24>] bus_probe_device+0x48/0xc4
 [<804deb44>] deferred_probe_work_func+0xf0/0x10c
 [<800a0ffc>] process_one_work+0x314/0x4d4
 [<800a17fc>] worker_thread+0x2a4/0x354
 [<800a9a54>] kthread+0x134/0x13c
 [<8006306c>] ret_from_kernel_thread+0x14/0x1c

The same issue affects some other PHYs (QCA8081, QCA9561), so fix them
too.

Fixes: 3265f4218878 ("net: phy: at803x: add fiber support")
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/at803x.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
index 6a467e7817a6a..59fe356942b51 100644
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -2072,6 +2072,8 @@ static struct phy_driver at803x_driver[] = {
 	/* ATHEROS AR9331 */
 	PHY_ID_MATCH_EXACT(ATH9331_PHY_ID),
 	.name			= "Qualcomm Atheros AR9331 built-in PHY",
+	.probe			= at803x_probe,
+	.remove			= at803x_remove,
 	.suspend		= at803x_suspend,
 	.resume			= at803x_resume,
 	.flags			= PHY_POLL_CABLE_TEST,
@@ -2087,6 +2089,8 @@ static struct phy_driver at803x_driver[] = {
 	/* Qualcomm Atheros QCA9561 */
 	PHY_ID_MATCH_EXACT(QCA9561_PHY_ID),
 	.name			= "Qualcomm Atheros QCA9561 built-in PHY",
+	.probe			= at803x_probe,
+	.remove			= at803x_remove,
 	.suspend		= at803x_suspend,
 	.resume			= at803x_resume,
 	.flags			= PHY_POLL_CABLE_TEST,
@@ -2151,6 +2155,8 @@ static struct phy_driver at803x_driver[] = {
 	PHY_ID_MATCH_EXACT(QCA8081_PHY_ID),
 	.name			= "Qualcomm QCA8081",
 	.flags			= PHY_POLL_CABLE_TEST,
+	.probe			= at803x_probe,
+	.remove			= at803x_remove,
 	.config_intr		= at803x_config_intr,
 	.handle_interrupt	= at803x_handle_interrupt,
 	.get_tunable		= at803x_get_tunable,

From 313c502fa3b3494159cb8f18d4a6444d06c5c9a5 Mon Sep 17 00:00:00 2001
From: Riccardo Paolo Bestetti <pbl@bestov.io>
Date: Sun, 19 Jun 2022 18:27:35 +0200
Subject: [PATCH 27/47] ipv4: fix bind address validity regression tests

Commit 8ff978b8b222 ("ipv4/raw: support binding to nonlocal addresses")
introduces support for binding to nonlocal addresses, as well as some
basic test coverage for some of the related cases.

Commit b4a028c4d031 ("ipv4: ping: fix bind address validity check")
fixes a regression which incorrectly removed some checks for bind
address validation. In addition, it introduces regression tests for
those specific checks. However, those regression tests are defective, in
that they perform the tests using an incorrect combination of bind
flags. As a result, those tests fail when they should succeed.

This commit introduces additional regression tests for nonlocal binding
and fixes the defective regression tests. It also introduces new
set_sysctl calls for the ipv4_bind test group, since performing the ICMP
binding tests requires allowing ICMP socket creation by setting
the net.ipv4.ping_group_range knob.

Fixes: b4a028c4d031 ("ipv4: ping: fix bind address validity check")
Reported-by: Riccardo Paolo Bestetti <pbl@bestov.io>
Signed-off-by: Riccardo Paolo Bestetti <pbl@bestov.io>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fcnal-test.sh | 36 +++++++++++++++++------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index 75223b63e3c88..03b586760164a 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -1800,24 +1800,32 @@ ipv4_addr_bind_novrf()
 	done
 
 	#
-	# raw socket with nonlocal bind
+	# tests for nonlocal bind
 	#
 	a=${NL_IP}
 	log_start
-	run_cmd nettest -s -R -P icmp -f -l ${a} -I ${NSA_DEV} -b
-	log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address after device bind"
+	run_cmd nettest -s -R -f -l ${a} -b
+	log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address"
+
+	log_start
+	run_cmd nettest -s -f -l ${a} -b
+	log_test_addr ${a} $? 0 "TCP socket bind to nonlocal address"
+
+	log_start
+	run_cmd nettest -s -D -P icmp -f -l ${a} -b
+	log_test_addr ${a} $? 0 "ICMP socket bind to nonlocal address"
 
 	#
 	# check that ICMP sockets cannot bind to broadcast and multicast addresses
 	#
 	a=${BCAST_IP}
 	log_start
-	run_cmd nettest -s -R -P icmp -l ${a} -b
+	run_cmd nettest -s -D -P icmp -l ${a} -b
 	log_test_addr ${a} $? 1 "ICMP socket bind to broadcast address"
 
 	a=${MCAST_IP}
 	log_start
-	run_cmd nettest -s -R -P icmp -f -l ${a} -b
+	run_cmd nettest -s -D -P icmp -l ${a} -b
 	log_test_addr ${a} $? 1 "ICMP socket bind to multicast address"
 
 	#
@@ -1870,24 +1878,32 @@ ipv4_addr_bind_vrf()
 	log_test_addr ${a} $? 1 "Raw socket bind to out of scope address after VRF bind"
 
 	#
-	# raw socket with nonlocal bind
+	# tests for nonlocal bind
 	#
 	a=${NL_IP}
 	log_start
-	run_cmd nettest -s -R -P icmp -f -l ${a} -I ${VRF} -b
+	run_cmd nettest -s -R -f -l ${a} -I ${VRF} -b
 	log_test_addr ${a} $? 0 "Raw socket bind to nonlocal address after VRF bind"
 
+	log_start
+	run_cmd nettest -s -f -l ${a} -I ${VRF} -b
+	log_test_addr ${a} $? 0 "TCP socket bind to nonlocal address after VRF bind"
+
+	log_start
+	run_cmd nettest -s -D -P icmp -f -l ${a} -I ${VRF} -b
+	log_test_addr ${a} $? 0 "ICMP socket bind to nonlocal address after VRF bind"
+
 	#
 	# check that ICMP sockets cannot bind to broadcast and multicast addresses
 	#
 	a=${BCAST_IP}
 	log_start
-	run_cmd nettest -s -R -P icmp -l ${a} -I ${VRF} -b
+	run_cmd nettest -s -D -P icmp -l ${a} -I ${VRF} -b
 	log_test_addr ${a} $? 1 "ICMP socket bind to broadcast address after VRF bind"
 
 	a=${MCAST_IP}
 	log_start
-	run_cmd nettest -s -R -P icmp -f -l ${a} -I ${VRF} -b
+	run_cmd nettest -s -D -P icmp -l ${a} -I ${VRF} -b
 	log_test_addr ${a} $? 1 "ICMP socket bind to multicast address after VRF bind"
 
 	#
@@ -1922,10 +1938,12 @@ ipv4_addr_bind()
 
 	log_subsection "No VRF"
 	setup
+	set_sysctl net.ipv4.ping_group_range='0 2147483647' 2>/dev/null
 	ipv4_addr_bind_novrf
 
 	log_subsection "With VRF"
 	setup "yes"
+	set_sysctl net.ipv4.ping_group_range='0 2147483647' 2>/dev/null
 	ipv4_addr_bind_vrf
 }
 

From 301bd140ed0b24f0da660874c7e8a47dad8c8222 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 20 Jun 2022 01:35:06 -0700
Subject: [PATCH 28/47] erspan: do not assume transport header is always set

Rewrite tests in ip6erspan_tunnel_xmit() and
erspan_fb_xmit() to not assume transport header is set.

syzbot reported:

WARNING: CPU: 0 PID: 1350 at include/linux/skbuff.h:2911 skb_transport_header include/linux/skbuff.h:2911 [inline]
WARNING: CPU: 0 PID: 1350 at include/linux/skbuff.h:2911 ip6erspan_tunnel_xmit+0x15af/0x2eb0 net/ipv6/ip6_gre.c:963
Modules linked in:
CPU: 0 PID: 1350 Comm: aoe_tx0 Not tainted 5.19.0-rc2-syzkaller-00160-g274295c6e53f #0
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
RIP: 0010:skb_transport_header include/linux/skbuff.h:2911 [inline]
RIP: 0010:ip6erspan_tunnel_xmit+0x15af/0x2eb0 net/ipv6/ip6_gre.c:963
Code: 0f 47 f0 40 88 b5 7f fe ff ff e8 8c 16 4b f9 89 de bf ff ff ff ff e8 a0 12 4b f9 66 83 fb ff 0f 85 1d f1 ff ff e8 71 16 4b f9 <0f> 0b e9 43 f0 ff ff e8 65 16 4b f9 48 8d 85 30 ff ff ff ba 60 00
RSP: 0018:ffffc90005daf910 EFLAGS: 00010293
RAX: 0000000000000000 RBX: 000000000000ffff RCX: 0000000000000000
RDX: ffff88801f032100 RSI: ffffffff882e8d3f RDI: 0000000000000003
RBP: ffffc90005dafab8 R08: 0000000000000003 R09: 000000000000ffff
R10: 000000000000ffff R11: 0000000000000000 R12: ffff888024f21d40
R13: 000000000000a288 R14: 00000000000000b0 R15: ffff888025a2e000
FS: 0000000000000000(0000) GS:ffff88802c800000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000001b2e425000 CR3: 000000006d099000 CR4: 0000000000152ef0
Call Trace:
<TASK>
__netdev_start_xmit include/linux/netdevice.h:4805 [inline]
netdev_start_xmit include/linux/netdevice.h:4819 [inline]
xmit_one net/core/dev.c:3588 [inline]
dev_hard_start_xmit+0x188/0x880 net/core/dev.c:3604
sch_direct_xmit+0x19f/0xbe0 net/sched/sch_generic.c:342
__dev_xmit_skb net/core/dev.c:3815 [inline]
__dev_queue_xmit+0x14a1/0x3900 net/core/dev.c:4219
dev_queue_xmit include/linux/netdevice.h:2994 [inline]
tx+0x6a/0xc0 drivers/block/aoe/aoenet.c:63
kthread+0x1e7/0x3b0 drivers/block/aoe/aoecmd.c:1229
kthread+0x2e9/0x3a0 kernel/kthread.c:376
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:302
</TASK>

Fixes: d5db21a3e697 ("erspan: auto detect truncated ipv6 packets.")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c  | 15 ++++++++++-----
 net/ipv6/ip6_gre.c | 15 ++++++++++-----
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 3b9cd487075af..5c58e21f724e9 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -524,7 +524,6 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 	int tunnel_hlen;
 	int version;
 	int nhoff;
-	int thoff;
 
 	tun_info = skb_tunnel_info(skb);
 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -558,10 +557,16 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 	    (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
 		truncate = true;
 
-	thoff = skb_transport_header(skb) - skb_mac_header(skb);
-	if (skb->protocol == htons(ETH_P_IPV6) &&
-	    (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
-		truncate = true;
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		int thoff;
+
+		if (skb_transport_header_was_set(skb))
+			thoff = skb_transport_header(skb) - skb_mac_header(skb);
+		else
+			thoff = nhoff + sizeof(struct ipv6hdr);
+		if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
+			truncate = true;
+	}
 
 	if (version == 1) {
 		erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 4e37f7c299004..a9051df0625dc 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -939,7 +939,6 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 	__be16 proto;
 	__u32 mtu;
 	int nhoff;
-	int thoff;
 
 	if (!pskb_inet_may_pull(skb))
 		goto tx_err;
@@ -960,10 +959,16 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 	    (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
 		truncate = true;
 
-	thoff = skb_transport_header(skb) - skb_mac_header(skb);
-	if (skb->protocol == htons(ETH_P_IPV6) &&
-	    (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
-		truncate = true;
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		int thoff;
+
+		if (skb_transport_header_was_set(skb))
+			thoff = skb_transport_header(skb) - skb_mac_header(skb);
+		else
+			thoff = nhoff + sizeof(struct ipv6hdr);
+		if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
+			truncate = true;
+	}
 
 	if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen))
 		goto tx_err;

From 69135c572d1f84261a6de2a1268513a7e71753e2 Mon Sep 17 00:00:00 2001
From: Ziyang Xuan <william.xuanziyang@huawei.com>
Date: Mon, 20 Jun 2022 12:35:08 +0800
Subject: [PATCH 29/47] net/tls: fix tls_sk_proto_close executed repeatedly

After the sock is set up for kTLS, tls_update() updates ctx->sk_proto to
sock->sk_prot, so ctx->sk_proto->close is now tls_sk_proto_close(). When
the sock is closed, tls_sk_proto_close() is called because
sock->sk_prot->close is tls_sk_proto_close(); but ctx->sk_proto->close()
is then executed again later inside tls_sk_proto_close(). Thus
tls_sk_proto_close() is executed repeatedly, which triggers the following bug.

=================================================================
KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017]
RIP: 0010:tls_sk_proto_close+0xd8/0xaf0 net/tls/tls_main.c:306
Call Trace:
 <TASK>
 tls_sk_proto_close+0x356/0xaf0 net/tls/tls_main.c:329
 inet_release+0x12e/0x280 net/ipv4/af_inet.c:428
 __sock_release+0xcd/0x280 net/socket.c:650
 sock_close+0x18/0x20 net/socket.c:1365

Updating to a proto that is the same as sock->sk_prot is incorrect. Fix it by
adding a check for equality between proto and sock->sk_prot at the head of
tls_update().
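
As an illustration only, a tiny userspace sketch of the re-entry (the types
and names are simplified stand-ins, not the kernel's TLS code), assuming
ctx->sk_proto ends up pointing at the TLS proto itself:

#include <stdio.h>

struct proto { void (*close)(void); };

static struct proto tls_prot;
static struct proto *ctx_sk_proto;	/* what the buggy tls_update() stored */

static void tls_sk_proto_close(void)
{
	static int depth;

	printf("tls_sk_proto_close, depth=%d\n", ++depth);
	if (depth > 3)			/* cap the demo; the kernel keeps recursing */
		return;
	ctx_sk_proto->close();		/* re-enters itself: ctx_sk_proto == &tls_prot */
}

int main(void)
{
	tls_prot.close = tls_sk_proto_close;
	ctx_sk_proto = &tls_prot;	/* the incorrect update */
	tls_prot.close();		/* stands in for sock->sk_prot->close() */
	return 0;
}

With the equality check in place, tls_update() leaves ctx->sk_proto untouched
in this case, so the close chain terminates in the original proto's close().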

Fixes: 95fa145479fb ("bpf: sockmap/tls, close can race with map free")
Reported-by: syzbot+29c3c12f3214b85ad081@syzkaller.appspotmail.com
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index da176411c1b5f..46bd5f26338bf 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -921,6 +921,9 @@ static void tls_update(struct sock *sk, struct proto *p,
 {
 	struct tls_context *ctx;
 
+	if (sk->sk_prot == p)
+		return;
+
 	ctx = tls_get_ctx(sk);
 	if (likely(ctx)) {
 		ctx->sk_write_space = write_space;

From 5d79d8af8dec58bf709b3124d09d9572edd9c617 Mon Sep 17 00:00:00 2001
From: Jie2x Zhou <jie2x.zhou@intel.com>
Date: Thu, 16 Jun 2022 15:40:46 +0800
Subject: [PATCH 30/47] selftests: netfilter: correct PKTGEN_SCRIPT_PATHS in
 nft_concat_range.sh

Before change:
make -C netfilter
 TEST: performance
   net,port                                                      [SKIP]
   perf not supported
   port,net                                                      [SKIP]
   perf not supported
   net6,port                                                     [SKIP]
   perf not supported
   port,proto                                                    [SKIP]
   perf not supported
   net6,port,mac                                                 [SKIP]
   perf not supported
   net6,port,mac,proto                                           [SKIP]
   perf not supported
   net,mac                                                       [SKIP]
   perf not supported

After change:
   net,mac                                                       [ OK ]
     baseline (drop from netdev hook):               2061098pps
     baseline hash (non-ranged entries):             1606741pps
     baseline rbtree (match on first field only):    1191607pps
     set with  1000 full, ranged entries:            1639119pps
ok 8 selftests: netfilter: nft_concat_range.sh

Fixes: 611973c1e06f ("selftests: netfilter: Introduce tests for sets with range concatenation")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Jie2x Zhou <jie2x.zhou@intel.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/nft_concat_range.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/netfilter/nft_concat_range.sh
index b35010cc7f6ae..a6991877e50cd 100755
--- a/tools/testing/selftests/netfilter/nft_concat_range.sh
+++ b/tools/testing/selftests/netfilter/nft_concat_range.sh
@@ -31,7 +31,7 @@ BUGS="flush_remove_add reload"
 
 # List of possible paths to pktgen script from kernel tree for performance tests
 PKTGEN_SCRIPT_PATHS="
-	../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh
+	../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh
 	pktgen/pktgen_bench_xmit_mode_netif_receive.sh"
 
 # Definition of set types:

From 574a5b85dc3b9ab672ff3fba0ee020f927960648 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 20 Jun 2022 16:17:30 +0200
Subject: [PATCH 31/47] netfilter: nf_dup_netdev: do not push mac header a
 second time

Eric reports skb_under_panic when using dup/fwd via bond+egress hook.
Before pushing the MAC header, we should make sure that we're called from
ingress, to put back what was pulled earlier.

In the egress case, the MAC header is already there; we should leave the
skb alone.

While at it, be more careful here: the skb might have been altered and its
headroom reduced, so add a skb_cow_head() call first so that the headroom
is increased if necessary.

nf_do_netdev_egress() assumes skb ownership (it normally ends with
a call to dev_queue_xmit), so we must free the packet on error.

Fixes: f87b9464d152 ("netfilter: nft_fwd_netdev: Support egress hook")
Reported-by: Eric Garver <eric@garver.life>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_dup_netdev.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 7873bd1389c36..13b7f6a660865 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -13,10 +13,16 @@
 #include <net/netfilter/nf_tables_offload.h>
 #include <net/netfilter/nf_dup_netdev.h>
 
-static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
+static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
+				enum nf_dev_hooks hook)
 {
-	if (skb_mac_header_was_set(skb))
+	if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) {
+		if (skb_cow_head(skb, skb->mac_len)) {
+			kfree_skb(skb);
+			return;
+		}
 		skb_push(skb, skb->mac_len);
+	}
 
 	skb->dev = dev;
 	skb_clear_tstamp(skb);
@@ -33,7 +39,7 @@ void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
 		return;
 	}
 
-	nf_do_netdev_egress(pkt->skb, dev);
+	nf_do_netdev_egress(pkt->skb, dev, nft_hook(pkt));
 }
 EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress);
 
@@ -48,7 +54,7 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
 
 	skb = skb_clone(pkt->skb, GFP_ATOMIC);
 	if (skb)
-		nf_do_netdev_egress(skb, dev);
+		nf_do_netdev_egress(skb, dev, nft_hook(pkt));
 }
 EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
 

From fcd53c51d03709bc429822086f1e9b3e88904284 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 20 Jun 2022 16:17:31 +0200
Subject: [PATCH 32/47] netfilter: nf_dup_netdev: add and use recursion counter

Now that the egress function can be called from the egress hook, we need
to avoid recursive calls into the nf_tables traverser, otherwise we crash.

Fixes: f87b9464d152 ("netfilter: nft_fwd_netdev: Support egress hook")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_dup_netdev.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 13b7f6a660865..a8e2425e43b0d 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -13,20 +13,31 @@
 #include <net/netfilter/nf_tables_offload.h>
 #include <net/netfilter/nf_dup_netdev.h>
 
+#define NF_RECURSION_LIMIT	2
+
+static DEFINE_PER_CPU(u8, nf_dup_skb_recursion);
+
 static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
 				enum nf_dev_hooks hook)
 {
+	if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT)
+		goto err;
+
 	if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) {
-		if (skb_cow_head(skb, skb->mac_len)) {
-			kfree_skb(skb);
-			return;
-		}
+		if (skb_cow_head(skb, skb->mac_len))
+			goto err;
+
 		skb_push(skb, skb->mac_len);
 	}
 
 	skb->dev = dev;
 	skb_clear_tstamp(skb);
+	__this_cpu_inc(nf_dup_skb_recursion);
 	dev_queue_xmit(skb);
+	__this_cpu_dec(nf_dup_skb_recursion);
+	return;
+err:
+	kfree_skb(skb);
 }
 
 void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)

From d4ea6f6373ef56d1d795a24f1f5874f4a6019199 Mon Sep 17 00:00:00 2001
From: Marcin Szycik <marcin.szycik@linux.intel.com>
Date: Mon, 9 May 2022 21:01:18 +0200
Subject: [PATCH 33/47] ice: ignore protocol field in GTP offload

Commit 34a897758efe ("ice: Add support for inner etype in switchdev")
added the ability to match on inner ethertype. A side effect of that change
is that it is now impossible to add some filters for protocols which do not
contain an inner ethertype field. tc requires the protocol field to be
specified when providing certain other options, e.g. src_ip. This is a
problem in the case of GTP: when the user wants to specify e.g. src_ip, they
also need to specify the protocol in the tc command (otherwise tc fails
with: Illegal "src_ip"). Because GTP is a tunnel, the protocol field is
treated as the inner protocol. GTP does not contain an inner ethertype
field, so the filter cannot be added.

To fix this, ignore the ethertype field in the case of GTP filters.

Fixes: 9a225f81f540 ("ice: Support GTP-U and GTP-C offload in switchdev")
Signed-off-by: Marcin Szycik <marcin.szycik@linux.intel.com>
Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_tc_lib.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c
index 0a0c55fb86993..b677c62a244ca 100644
--- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c
@@ -993,7 +993,9 @@ ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi,
 		n_proto_key = ntohs(match.key->n_proto);
 		n_proto_mask = ntohs(match.mask->n_proto);
 
-		if (n_proto_key == ETH_P_ALL || n_proto_key == 0) {
+		if (n_proto_key == ETH_P_ALL || n_proto_key == 0 ||
+		    fltr->tunnel_type == TNL_GTPU ||
+		    fltr->tunnel_type == TNL_GTPC) {
 			n_proto_key = 0;
 			n_proto_mask = 0;
 		} else {

From 3578dc90013b1fa20da996cdadd8515802716132 Mon Sep 17 00:00:00 2001
From: Wojciech Drewek <wojciech.drewek@intel.com>
Date: Mon, 23 May 2022 11:41:55 +0200
Subject: [PATCH 34/47] ice: Fix switchdev rules book keeping

Adding two filters with the same matching criteria ends up with
one rule in hardware with act = ICE_FWD_TO_VSI_LIST.
In order to remove them properly, we have to keep the
information about the VSI handle, which is used in the VSI bitmap
(ice_adv_fltr_mgmt_list_entry::vsi_list_info::vsi_map).

Fixes: 0d08a441fb1a ("ice: ndo_setup_tc implementation for PF")
Reported-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Signed-off-by: Wojciech Drewek <wojciech.drewek@intel.com>
Tested-by: Sandeep Penigalapati <sandeep.penigalapati@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_tc_lib.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c
index b677c62a244ca..b803f2ab3cc76 100644
--- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c
@@ -524,6 +524,7 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr)
 	 */
 	fltr->rid = rule_added.rid;
 	fltr->rule_id = rule_added.rule_id;
+	fltr->dest_id = rule_added.vsi_handle;
 
 exit:
 	kfree(list);

From c3d184c83ff4b80167e34edfc3d21df424bf27ff Mon Sep 17 00:00:00 2001
From: Anatolii Gerasymenko <anatolii.gerasymenko@intel.com>
Date: Mon, 20 Jun 2022 09:47:05 +0200
Subject: [PATCH 35/47] ice: ethtool: advertise 1000M speeds properly

In the current implementation, ice_update_phy_type enables all link modes
for the selected speed. This approach doesn't work for 1000M speeds,
because the copper (1000baseT) and optical (1000baseX) standards cannot
both be enabled at once.

Fix this by adding the function `ice_set_phy_type_from_speed()`, which
handles 1000M speeds specially.

Fixes: 48cb27f2fd18 ("ice: Implement handlers for ethtool PHY/link operations")
Signed-off-by: Anatolii Gerasymenko <anatolii.gerasymenko@intel.com>
Tested-by: Gurucharan <gurucharanx.g@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_ethtool.c | 39 +++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index 1e71b70f0e523..8078618ce1b61 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -2189,6 +2189,42 @@ ice_setup_autoneg(struct ice_port_info *p, struct ethtool_link_ksettings *ks,
 	return err;
 }
 
+/**
+ * ice_set_phy_type_from_speed - set phy_types based on speeds
+ * and advertised modes
+ * @ks: ethtool link ksettings struct
+ * @phy_type_low: pointer to the lower part of phy_type
+ * @phy_type_high: pointer to the higher part of phy_type
+ * @adv_link_speed: targeted link speeds bitmap
+ */
+static void
+ice_set_phy_type_from_speed(const struct ethtool_link_ksettings *ks,
+			    u64 *phy_type_low, u64 *phy_type_high,
+			    u16 adv_link_speed)
+{
+	/* Handle 1000M speed in a special way because ice_update_phy_type
+	 * enables all link modes, but having mixed copper and optical
+	 * standards is not supported.
+	 */
+	adv_link_speed &= ~ICE_AQ_LINK_SPEED_1000MB;
+
+	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  1000baseT_Full))
+		*phy_type_low |= ICE_PHY_TYPE_LOW_1000BASE_T |
+				 ICE_PHY_TYPE_LOW_1G_SGMII;
+
+	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  1000baseKX_Full))
+		*phy_type_low |= ICE_PHY_TYPE_LOW_1000BASE_KX;
+
+	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  1000baseX_Full))
+		*phy_type_low |= ICE_PHY_TYPE_LOW_1000BASE_SX |
+				 ICE_PHY_TYPE_LOW_1000BASE_LX;
+
+	ice_update_phy_type(phy_type_low, phy_type_high, adv_link_speed);
+}
+
 /**
  * ice_set_link_ksettings - Set Speed and Duplex
  * @netdev: network interface device structure
@@ -2320,7 +2356,8 @@ ice_set_link_ksettings(struct net_device *netdev,
 		adv_link_speed = curr_link_speed;
 
 	/* Convert the advertise link speeds to their corresponded PHY_TYPE */
-	ice_update_phy_type(&phy_type_low, &phy_type_high, adv_link_speed);
+	ice_set_phy_type_from_speed(ks, &phy_type_low, &phy_type_high,
+				    adv_link_speed);
 
 	if (!autoneg_changed && adv_link_speed == curr_link_speed) {
 		netdev_info(netdev, "Nothing changed, exiting without setting anything.\n");

From a632b2a4c920ce5af29410fb091f7ee6d2e77dc6 Mon Sep 17 00:00:00 2001
From: Anatolii Gerasymenko <anatolii.gerasymenko@intel.com>
Date: Mon, 6 Jun 2022 09:01:21 +0200
Subject: [PATCH 36/47] ice: ethtool: Prohibit improper channel config for DCB

Do not allow setting fewer channels via ethtool than there are Traffic
Classes. There must be at least one channel per Traffic Class.

If you set fewer channels than there are Traffic Classes, then during
ice_vsi_rebuild only the requested number of Tx/Rx rings is allocated in
ice_vsi_alloc_arrays. But later, ice_vsi_setup_q_map requests at least one
channel per Traffic Class. This results in num_rxq > alloc_rxq and
num_txq > alloc_txq (for example, with 3 Traffic Classes and a request for
2 combined channels, only 2 rings are allocated but at least 3 queues get
mapped). Later, there is a NULL pointer dereference in
ice_vsi_map_rings_to_vectors, because we go beyond the end of the rx_rings
or tx_rings arrays.

Change ice_set_channels() to return an error if you try to allocate fewer
channels than there are Traffic Classes.
Change ice_vsi_setup_q_map() and ice_vsi_setup_q_map_mqprio() to return
a status code instead of void.
Add error handling for ice_vsi_setup_q_map() and
ice_vsi_setup_q_map_mqprio() in ice_vsi_init() and ice_vsi_cfg_tc().

[53753.889983] INFO: Flow control is disabled for this traffic class (0) on this vsi.
[53763.984862] BUG: unable to handle kernel NULL pointer dereference at 0000000000000028
[53763.992915] PGD 14b45f5067 P4D 0
[53763.996444] Oops: 0002 [#1] SMP NOPTI
[53764.000312] CPU: 12 PID: 30661 Comm: ethtool Kdump: loaded Tainted: GOE    --------- -  - 4.18.0-240.el8.x86_64 #1
[53764.011825] Hardware name: Intel Corporation WilsonCity/WilsonCity, BIOS WLYDCRB1.SYS.0020.P21.2012150710 12/15/2020
[53764.022584] RIP: 0010:ice_vsi_map_rings_to_vectors+0x7e/0x120 [ice]
[53764.029089] Code: 41 0d 0f b7 b7 12 05 00 00 0f b6 d0 44 29 de 44 0f b7 c6 44 01 c2 41 39 d0 7d 2d 4c 8b 47 28 44 0f b7 ce 83 c6 01 4f 8b 04 c8 <49> 89 48 28 4c 8b 89 b8 01 00 00 4d 89 08 4c 89 81 b8 01 00 00 44
[53764.048379] RSP: 0018:ff550dd88ea47b20 EFLAGS: 00010206
[53764.053884] RAX: 0000000000000002 RBX: 0000000000000004 RCX: ff385ea42fa4a018
[53764.061301] RDX: 0000000000000006 RSI: 0000000000000005 RDI: ff385e9baeedd018
[53764.068717] RBP: 0000000000000010 R08: 0000000000000000 R09: 0000000000000004
[53764.076133] R10: 0000000000000002 R11: 0000000000000004 R12: 0000000000000000
[53764.083553] R13: 0000000000000000 R14: ff385e658fdd9000 R15: ff385e9baeedd018
[53764.090976] FS:  000014872c5b5740(0000) GS:ff385e847f100000(0000) knlGS:0000000000000000
[53764.099362] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[53764.105409] CR2: 0000000000000028 CR3: 0000000a820fa002 CR4: 0000000000761ee0
[53764.112851] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[53764.120301] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[53764.127747] PKRU: 55555554
[53764.130781] Call Trace:
[53764.133564]  ice_vsi_rebuild+0x611/0x870 [ice]
[53764.138341]  ice_vsi_recfg_qs+0x94/0x100 [ice]
[53764.143116]  ice_set_channels+0x1a8/0x3e0 [ice]
[53764.147975]  ethtool_set_channels+0x14e/0x240
[53764.152667]  dev_ethtool+0xd74/0x2a10
[53764.156665]  ? __mod_lruvec_state+0x44/0x110
[53764.161280]  ? __mod_lruvec_state+0x44/0x110
[53764.165893]  ? page_add_file_rmap+0x15/0x170
[53764.170518]  ? inet_ioctl+0xd1/0x220
[53764.174445]  ? netdev_run_todo+0x5e/0x290
[53764.178808]  dev_ioctl+0xb5/0x550
[53764.182485]  sock_do_ioctl+0xa0/0x140
[53764.186512]  sock_ioctl+0x1a8/0x300
[53764.190367]  ? selinux_file_ioctl+0x161/0x200
[53764.195090]  do_vfs_ioctl+0xa4/0x640
[53764.199035]  ksys_ioctl+0x60/0x90
[53764.202722]  __x64_sys_ioctl+0x16/0x20
[53764.206845]  do_syscall_64+0x5b/0x1a0
[53764.210887]  entry_SYSCALL_64_after_hwframe+0x65/0xca

Fixes: 87324e747fde ("ice: Implement ethtool ops for channels")
Signed-off-by: Anatolii Gerasymenko <anatolii.gerasymenko@intel.com>
Tested-by: Gurucharan <gurucharanx.g@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_ethtool.c | 10 +++++
 drivers/net/ethernet/intel/ice/ice_lib.c     | 42 +++++++++++++++++---
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index 8078618ce1b61..70335f6e85243 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -3507,6 +3507,16 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch)
 	new_rx = ch->combined_count + ch->rx_count;
 	new_tx = ch->combined_count + ch->tx_count;
 
+	if (new_rx < vsi->tc_cfg.numtc) {
+		netdev_err(dev, "Cannot set less Rx channels, than Traffic Classes you have (%u)\n",
+			   vsi->tc_cfg.numtc);
+		return -EINVAL;
+	}
+	if (new_tx < vsi->tc_cfg.numtc) {
+		netdev_err(dev, "Cannot set less Tx channels, than Traffic Classes you have (%u)\n",
+			   vsi->tc_cfg.numtc);
+		return -EINVAL;
+	}
 	if (new_rx > ice_get_max_rxq(pf)) {
 		netdev_err(dev, "Maximum allowed Rx channels is %d\n",
 			   ice_get_max_rxq(pf));
diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 454e01ae09b97..f7f9c973ec54d 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -909,7 +909,7 @@ static void ice_set_dflt_vsi_ctx(struct ice_hw *hw, struct ice_vsi_ctx *ctxt)
  * @vsi: the VSI being configured
  * @ctxt: VSI context structure
  */
-static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt)
+static int ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt)
 {
 	u16 offset = 0, qmap = 0, tx_count = 0, pow = 0;
 	u16 num_txq_per_tc, num_rxq_per_tc;
@@ -982,7 +982,18 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt)
 	else
 		vsi->num_rxq = num_rxq_per_tc;
 
+	if (vsi->num_rxq > vsi->alloc_rxq) {
+		dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Rx queues (%u), than were allocated (%u)!\n",
+			vsi->num_rxq, vsi->alloc_rxq);
+		return -EINVAL;
+	}
+
 	vsi->num_txq = tx_count;
+	if (vsi->num_txq > vsi->alloc_txq) {
+		dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Tx queues (%u), than were allocated (%u)!\n",
+			vsi->num_txq, vsi->alloc_txq);
+		return -EINVAL;
+	}
 
 	if (vsi->type == ICE_VSI_VF && vsi->num_txq != vsi->num_rxq) {
 		dev_dbg(ice_pf_to_dev(vsi->back), "VF VSI should have same number of Tx and Rx queues. Hence making them equal\n");
@@ -1000,6 +1011,8 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt)
 	 */
 	ctxt->info.q_mapping[0] = cpu_to_le16(vsi->rxq_map[0]);
 	ctxt->info.q_mapping[1] = cpu_to_le16(vsi->num_rxq);
+
+	return 0;
 }
 
 /**
@@ -1187,7 +1200,10 @@ static int ice_vsi_init(struct ice_vsi *vsi, bool init_vsi)
 	if (vsi->type == ICE_VSI_CHNL) {
 		ice_chnl_vsi_setup_q_map(vsi, ctxt);
 	} else {
-		ice_vsi_setup_q_map(vsi, ctxt);
+		ret = ice_vsi_setup_q_map(vsi, ctxt);
+		if (ret)
+			goto out;
+
 		if (!init_vsi) /* means VSI being updated */
 			/* must to indicate which section of VSI context are
 			 * being modified
@@ -3464,7 +3480,7 @@ void ice_vsi_cfg_netdev_tc(struct ice_vsi *vsi, u8 ena_tc)
  *
  * Prepares VSI tc_config to have queue configurations based on MQPRIO options.
  */
-static void
+static int
 ice_vsi_setup_q_map_mqprio(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt,
 			   u8 ena_tc)
 {
@@ -3513,7 +3529,18 @@ ice_vsi_setup_q_map_mqprio(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt,
 
 	/* Set actual Tx/Rx queue pairs */
 	vsi->num_txq = offset + qcount_tx;
+	if (vsi->num_txq > vsi->alloc_txq) {
+		dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Tx queues (%u), than were allocated (%u)!\n",
+			vsi->num_txq, vsi->alloc_txq);
+		return -EINVAL;
+	}
+
 	vsi->num_rxq = offset + qcount_rx;
+	if (vsi->num_rxq > vsi->alloc_rxq) {
+		dev_err(ice_pf_to_dev(vsi->back), "Trying to use more Rx queues (%u), than were allocated (%u)!\n",
+			vsi->num_rxq, vsi->alloc_rxq);
+		return -EINVAL;
+	}
 
 	/* Setup queue TC[0].qmap for given VSI context */
 	ctxt->info.tc_mapping[0] = cpu_to_le16(qmap);
@@ -3531,6 +3558,8 @@ ice_vsi_setup_q_map_mqprio(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt,
 	dev_dbg(ice_pf_to_dev(vsi->back), "vsi->num_rxq = %d\n",  vsi->num_rxq);
 	dev_dbg(ice_pf_to_dev(vsi->back), "all_numtc %u, all_enatc: 0x%04x, tc_cfg.numtc %u\n",
 		vsi->all_numtc, vsi->all_enatc, vsi->tc_cfg.numtc);
+
+	return 0;
 }
 
 /**
@@ -3580,9 +3609,12 @@ int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc)
 
 	if (vsi->type == ICE_VSI_PF &&
 	    test_bit(ICE_FLAG_TC_MQPRIO, pf->flags))
-		ice_vsi_setup_q_map_mqprio(vsi, ctx, ena_tc);
+		ret = ice_vsi_setup_q_map_mqprio(vsi, ctx, ena_tc);
 	else
-		ice_vsi_setup_q_map(vsi, ctx);
+		ret = ice_vsi_setup_q_map(vsi, ctx);
+
+	if (ret)
+		goto out;
 
 	/* must to indicate which section of VSI context are being modified */
 	ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_RXQ_MAP_VALID);

From 2642cc6c3bbe0900ba15bab078fd15ad8baccbc5 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Mon, 20 Jun 2022 13:04:50 +0200
Subject: [PATCH 37/47] net: phy: smsc: Disable Energy Detect Power-Down in
 interrupt mode

Simon reports that if two LAN9514 USB adapters are directly connected
without an intermediate switch, the link fails to come up and link LEDs
remain dark.  The issue was introduced by commit 1ce8b37241ed ("usbnet:
smsc95xx: Forward PHY interrupts to PHY driver to avoid polling").

The PHY suffers from a known erratum wherein link detection becomes
unreliable if Energy Detect Power-Down is used.  In poll mode, the
driver works around the erratum by briefly disabling EDPD for 640 msec
to detect a neighbor, then re-enabling it to save power.

In interrupt mode, no interrupt is signaled if EDPD is used by both link
partners, so it must not be enabled at all.

We'll recoup the power savings by enabling SUSPEND1 mode on affected
LAN95xx chips in a forthcoming commit.

Fixes: 1ce8b37241ed ("usbnet: smsc95xx: Forward PHY interrupts to PHY driver to avoid polling")
Reported-by: Simon Han <z.han@kunbus.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Link: https://lore.kernel.org/r/439a3f3168c2f9d44b5fd9bb8d2b551711316be6.1655714438.git.lukas@wunner.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/smsc.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c
index 1b54684b68a07..96d3c40932d8c 100644
--- a/drivers/net/phy/smsc.c
+++ b/drivers/net/phy/smsc.c
@@ -110,7 +110,7 @@ static int smsc_phy_config_init(struct phy_device *phydev)
 	struct smsc_phy_priv *priv = phydev->priv;
 	int rc;
 
-	if (!priv->energy_enable)
+	if (!priv->energy_enable || phydev->irq != PHY_POLL)
 		return 0;
 
 	rc = phy_read(phydev, MII_LAN83C185_CTRL_STATUS);
@@ -210,6 +210,8 @@ static int lan95xx_config_aneg_ext(struct phy_device *phydev)
  * response on link pulses to detect presence of plugged Ethernet cable.
  * The Energy Detect Power-Down mode is enabled again in the end of procedure to
  * save approximately 220 mW of power if cable is unplugged.
+ * The workaround is only applicable to poll mode. Energy Detect Power-Down may
+ * not be used in interrupt mode lest link change detection becomes unreliable.
  */
 static int lan87xx_read_status(struct phy_device *phydev)
 {
@@ -217,7 +219,7 @@ static int lan87xx_read_status(struct phy_device *phydev)
 
 	int err = genphy_read_status(phydev);
 
-	if (!phydev->link && priv->energy_enable) {
+	if (!phydev->link && priv->energy_enable && phydev->irq == PHY_POLL) {
 		/* Disable EDPD to wake up PHY */
 		int rc = phy_read(phydev, MII_LAN83C185_CTRL_STATUS);
 		if (rc < 0)

From 877fe9d49b74e5f84346f9df34e2c7f8086dbceb Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 20 Jun 2022 12:12:37 -0700
Subject: [PATCH 38/47] Revert "drivers/net/ethernet/neterion/vxge: Fix a
 use-after-free bug in vxge-main.c"

This reverts commit 8fc74d18639a2402ca52b177e990428e26ea881f.

BAR0 is the main (only?) register bank for this device. We most
obviously can't unmap it before the netdev is unregistered.
This was pointed out in review but the patch got reposted and
merged, anyway.

The author of the patch was only testing it with a QEMU model,
which I presume does not emulate enough for the netdev to be brought
up (author's replies are not visible in lore because they kept sending
their emails in HTML).
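
As a general illustration (hypothetical driver structure, not the vxge code
itself): the usual teardown order that this revert restores for a PCI netdev
driver is to unregister the netdev first and unmap the register BAR only
afterwards, since ndo_* callbacks may still touch the hardware until
unregistration completes.

  static void example_remove(struct pci_dev *pdev)
  {
          struct example_priv *priv = pci_get_drvdata(pdev); /* hypothetical */

          unregister_netdev(priv->netdev); /* no more ndo_* callbacks after this */
          iounmap(priv->bar0);             /* only now safe to drop the mapping */
          pci_release_region(pdev, 0);
          pci_disable_device(pdev);
          free_netdev(priv->netdev);
  }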

Link: https://lore.kernel.org/all/20220616085059.680dc215@kernel.org/
Fixes: 8fc74d18639a ("drivers/net/ethernet/neterion/vxge: Fix a use-after-free bug in vxge-main.c")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/neterion/vxge/vxge-main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index 092fd0ae58314..fa5d4ddf429b6 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -4736,10 +4736,10 @@ static void vxge_remove(struct pci_dev *pdev)
 	for (i = 0; i < vdev->no_of_vpath; i++)
 		vxge_free_mac_add_list(&vdev->vpaths[i]);
 
-	iounmap(vdev->bar0);
 	vxge_device_unregister(hldev);
 	/* Do not call pci_disable_sriov here, as it will break child devices */
 	vxge_hw_device_terminate(hldev);
+	iounmap(vdev->bar0);
 	pci_release_region(pdev, 0);
 	pci_disable_device(pdev);
 	driver_config->config_dev_cnt--;

From 1e70212e031528918066a631c9fdccda93a1ffaa Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 15 Jun 2022 22:23:12 -0700
Subject: [PATCH 39/47] hinic: Replace memcpy() with direct assignment

Under CONFIG_FORTIFY_SOURCE=y and CONFIG_UBSAN_BOUNDS=y, Clang miscalculates
the size of the destination buffer here (0x10 instead of 0x14). This copy is
a fixed size (sizeof(struct fw_section_info_st)), with
the source and dest being struct fw_section_info_st, so the memcpy should
be safe, assuming the index is within bounds, which is UBSAN_BOUNDS's
responsibility to figure out.

Avoid the whole thing and just do a direct assignment. This results in
no change to the executable code.
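
For illustration only (a stand-in struct, not the hinic code): a minimal
user-space sketch showing that a plain struct assignment copies the whole
object just like a sizeof-based memcpy(), while keeping the destination
fully typed for the compiler:

  #include <stdio.h>
  #include <string.h>

  struct section_info {            /* stand-in for struct fw_section_info_st */
          unsigned int type;
          unsigned int len;
          unsigned int offset;
  };

  int main(void)
  {
          struct section_info src[2] = { { 1, 128, 0 }, { 2, 256, 128 } };
          struct section_info dst_memcpy[2], dst_assign[2];
          int i;

          for (i = 0; i < 2; i++) {
                  /* old style: byte copy with an explicit size */
                  memcpy(&dst_memcpy[i], &src[i], sizeof(struct section_info));
                  /* new style: direct assignment, same bytes end up copied */
                  dst_assign[i] = src[i];
          }

          /* prints 1: both destination arrays hold identical data */
          printf("%d\n", !memcmp(dst_memcpy, dst_assign, sizeof(dst_assign)));
          return 0;
  }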

[This is a duplicate of commit 2c0ab32b73cf ("hinic: Replace memcpy()
 with direct assignment") which was applied to net-next.]

Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Tom Rix <trix@redhat.com>
Cc: llvm@lists.linux.dev
Link: https://github.com/ClangBuiltLinux/linux/issues/1592
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org> # build
Link: https://lore.kernel.org/r/20220616052312.292861-1-keescook@chromium.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/huawei/hinic/hinic_devlink.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c
index 60ae8bfc5f69a..1749d26f4befc 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c
@@ -43,9 +43,7 @@ static bool check_image_valid(struct hinic_devlink_priv *priv, const u8 *buf,
 
 	for (i = 0; i < fw_image->fw_info.fw_section_cnt; i++) {
 		len += fw_image->fw_section_info[i].fw_section_len;
-		memcpy(&host_image->image_section_info[i],
-		       &fw_image->fw_section_info[i],
-		       sizeof(struct fw_section_info_st));
+		host_image->image_section_info[i] = fw_image->fw_section_info[i];
 	}
 
 	if (len != fw_image->fw_len ||

From 13f28c2cf0701eaf3b251a57b46bca0fe17248ca Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadfed@fb.com>
Date: Wed, 22 Jun 2022 02:31:31 +0300
Subject: [PATCH 40/47] MAINTAINERS: Add a maintainer for OCP Time Card

I've been contributing and reviewing patches for the ptp_ocp driver for
some time and I'm taking care of its GitHub mirror. On Jakub's
suggestion, I would like to step forward and become a maintainer for
this driver. This patch adds a dedicated entry to MAINTAINERS.

Signed-off-by: Vadim Fedorenko <vadfed@fb.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Link: https://lore.kernel.org/r/20220621233131.21240-1-vfedorenko@novek.ru
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 15a2341936ea5..d922601f02ae5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14868,6 +14868,7 @@ F:	include/dt-bindings/
 
 OPENCOMPUTE PTP CLOCK DRIVER
 M:	Jonathan Lemon <jonathan.lemon@gmail.com>
+M:	Vadim Fedorenko <vadfed@fb.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/ptp/ptp_ocp.c

From 386228c694bf1e7a7688e44412cb33500b0ac585 Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Tue, 21 Jun 2022 17:11:22 +0200
Subject: [PATCH 41/47] net: dsa: qca8k: reset cpu port on MTU change

It was discovered that the documentation lacks a fundamental detail on
how to correctly change the MAX_FRAME_SIZE of the switch.

If the MAX_FRAME_SIZE is changed while the cpu port is on, the switch
panics and ceases to send any packets. This causes the mgmt ethernet
system to not receive any packets (the slow fallback still works) and
makes the device unreachable. To recover from this a switch reset is
required.

To correctly handle this, turn off the cpu ports before changing the
MAX_FRAME_SIZE and turn them on again after the new value is applied.

Fixes: f58d2598cf70 ("net: dsa: qca8k: implement the port MTU callbacks")
Cc: stable@vger.kernel.org
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Link: https://lore.kernel.org/r/20220621151122.10220-1-ansuelsmth@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/qca8k.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index 2727d3169c252..1cbb05b0323f0 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -2334,6 +2334,7 @@ static int
 qca8k_port_change_mtu(struct dsa_switch *ds, int port, int new_mtu)
 {
 	struct qca8k_priv *priv = ds->priv;
+	int ret;
 
 	/* We have only have a general MTU setting.
 	 * DSA always set the CPU port's MTU to the largest MTU of the slave
@@ -2344,8 +2345,27 @@ qca8k_port_change_mtu(struct dsa_switch *ds, int port, int new_mtu)
 	if (!dsa_is_cpu_port(ds, port))
 		return 0;
 
+	/* To change the MAX_FRAME_SIZE the cpu ports must be off or
+	 * the switch panics.
+	 * Turn off both cpu ports before applying the new value to prevent
+	 * this.
+	 */
+	if (priv->port_enabled_map & BIT(0))
+		qca8k_port_set_status(priv, 0, 0);
+
+	if (priv->port_enabled_map & BIT(6))
+		qca8k_port_set_status(priv, 6, 0);
+
 	/* Include L2 header / FCS length */
-	return qca8k_write(priv, QCA8K_MAX_FRAME_SIZE, new_mtu + ETH_HLEN + ETH_FCS_LEN);
+	ret = qca8k_write(priv, QCA8K_MAX_FRAME_SIZE, new_mtu + ETH_HLEN + ETH_FCS_LEN);
+
+	if (priv->port_enabled_map & BIT(0))
+		qca8k_port_set_status(priv, 0, 1);
+
+	if (priv->port_enabled_map & BIT(6))
+		qca8k_port_set_status(priv, 6, 1);
+
+	return ret;
 }
 
 static int

From 85467f7da18992311deafdbf32a8d163cb1e98d7 Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Tue, 21 Jun 2022 17:16:33 +0200
Subject: [PATCH 42/47] net: dsa: qca8k: reduce mgmt ethernet timeout

The current mgmt ethernet timeout is set to 100ms. This value is too
big and would slow down every mdio command if the mgmt ethernet packet
runs into problems on the receiving side.
Reduce it to just 5ms to handle the case where some operation on the
master port temporarily causes the mgmt ethernet to stop working.

Fixes: 5950c7c0a68c ("net: dsa: qca8k: add support for mgmt read/write in Ethernet packet")
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Link: https://lore.kernel.org/r/20220621151633.11741-1-ansuelsmth@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/qca8k.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/qca8k.h b/drivers/net/dsa/qca8k.h
index 04408e11402a9..ec58d0e80a70f 100644
--- a/drivers/net/dsa/qca8k.h
+++ b/drivers/net/dsa/qca8k.h
@@ -15,7 +15,7 @@
 
 #define QCA8K_ETHERNET_MDIO_PRIORITY			7
 #define QCA8K_ETHERNET_PHY_PRIORITY			6
-#define QCA8K_ETHERNET_TIMEOUT				100
+#define QCA8K_ETHERNET_TIMEOUT				5
 
 #define QCA8K_NUM_PORTS					7
 #define QCA8K_NUM_CPU_PORTS				2

From 4e0effd9007ea0be31f7488611eb3824b4541554 Mon Sep 17 00:00:00 2001
From: Kai-Heng Feng <kai.heng.feng@canonical.com>
Date: Tue, 21 Jun 2022 15:10:56 -0700
Subject: [PATCH 43/47] igb: Make DMA faster when CPU is active on the PCIe
 link

Intel I210 on some Intel Alder Lake platforms can only achieve ~750Mbps
Tx speed via iperf. The RR2DCDELAY shows around 0x2xxx DMA delay, which
will be significantly lower when 1) ASPM is disabled or 2) SoC package
c-state stays above PC3. When the RR2DCDELAY is around 0x1xxx the Tx
speed can reach ~950Mbps.

According to the I210 datasheet "8.26.1 PCIe Misc. Register - PCIEMISC",
"DMA Idle Indication" doesn't seem to tie to DMA coalesce anymore, so
set it to 1b for "DMA is considered idle when there is no Rx or Tx AND
when there are no TLPs indicating that CPU is active detected on the
PCIe link (such as the host executes CSR or Configuration register read
or write operation)" and performing Tx should also fall under "active
CPU on PCIe link" case.

In addition to that, commit b6e0c419f040 ("igb: Move DMA Coalescing init
code to separate function.") seems to have wrongly changed from enabling
E1000_PCIEMISC_LX_DECISION to disabling it; fix that as well.

Fixes: b6e0c419f040 ("igb: Move DMA Coalescing init code to separate function.")
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Tested-by: Gurucharan <gurucharanx.g@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Link: https://lore.kernel.org/r/20220621221056.604304-1-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 1c26bec7d6fa7..c5f04c40284bf 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -9901,11 +9901,10 @@ static void igb_init_dmac(struct igb_adapter *adapter, u32 pba)
 	struct e1000_hw *hw = &adapter->hw;
 	u32 dmac_thr;
 	u16 hwm;
+	u32 reg;
 
 	if (hw->mac.type > e1000_82580) {
 		if (adapter->flags & IGB_FLAG_DMAC) {
-			u32 reg;
-
 			/* force threshold to 0. */
 			wr32(E1000_DMCTXTH, 0);
 
@@ -9938,7 +9937,6 @@ static void igb_init_dmac(struct igb_adapter *adapter, u32 pba)
 			/* Disable BMC-to-OS Watchdog Enable */
 			if (hw->mac.type != e1000_i354)
 				reg &= ~E1000_DMACR_DC_BMC2OSW_EN;
-
 			wr32(E1000_DMACR, reg);
 
 			/* no lower threshold to disable
@@ -9955,12 +9953,12 @@ static void igb_init_dmac(struct igb_adapter *adapter, u32 pba)
 			 */
 			wr32(E1000_DMCTXTH, (IGB_MIN_TXPBSIZE -
 			     (IGB_TX_BUF_4096 + adapter->max_frame_size)) >> 6);
+		}
 
-			/* make low power state decision controlled
-			 * by DMA coal
-			 */
+		if (hw->mac.type >= e1000_i210 ||
+		    (adapter->flags & IGB_FLAG_DMAC)) {
 			reg = rd32(E1000_PCIEMISC);
-			reg &= ~E1000_PCIEMISC_LX_DECISION;
+			reg |= E1000_PCIEMISC_LX_DECISION;
 			wr32(E1000_PCIEMISC, reg);
 		} /* endif adapter->dmac is not disabled */
 	} else if (hw->mac.type == e1000_82580) {

From 8af52fe9fd3bf5e7478da99193c0632276e1dfce Mon Sep 17 00:00:00 2001
From: Stephan Gerhold <stephan.gerhold@kernkonzept.com>
Date: Tue, 21 Jun 2022 13:48:44 +0200
Subject: [PATCH 44/47] virtio_net: fix xdp_rxq_info bug after suspend/resume

The following sequence currently causes a driver bug warning
when using virtio_net:

  # ip link set eth0 up
  # echo mem > /sys/power/state (or e.g. # rtcwake -s 10 -m mem)
  <resume>
  # ip link set eth0 down

  Missing register, driver bug
  WARNING: CPU: 0 PID: 375 at net/core/xdp.c:138 xdp_rxq_info_unreg+0x58/0x60
  Call trace:
   xdp_rxq_info_unreg+0x58/0x60
   virtnet_close+0x58/0xac
   __dev_close_many+0xac/0x140
   __dev_change_flags+0xd8/0x210
   dev_change_flags+0x24/0x64
   do_setlink+0x230/0xdd0
   ...

This happens because virtnet_freeze() frees the receive_queue
completely (including struct xdp_rxq_info) but does not call
xdp_rxq_info_unreg(). Similarly, virtnet_restore() sets up the
receive_queue again but does not call xdp_rxq_info_reg().

Actually, parts of virtnet_freeze_down() and virtnet_restore_up()
are almost identical to virtnet_close() and virtnet_open(): only
the calls to xdp_rxq_info_(un)reg() are missing. This means that
we can fix this easily and avoid such problems in the future by
just calling virtnet_close()/open() from the freeze/restore handlers.

Aside from adding the missing xdp_rxq_info calls the only difference
is that the refill work is only cancelled if netif_running(). However,
this should not make any functional difference since the refill work
should only be active if the network interface is actually up.

Fixes: 754b8a21a96d ("virtio_net: setup xdp_rxq_info")
Signed-off-by: Stephan Gerhold <stephan.gerhold@kernkonzept.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20220621114845.3650258-1-stephan.gerhold@kernkonzept.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/virtio_net.c | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index db05b5e930bed..969a67970e71c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2768,7 +2768,6 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
 static void virtnet_freeze_down(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
-	int i;
 
 	/* Make sure no work handler is accessing the device */
 	flush_work(&vi->config_work);
@@ -2776,14 +2775,8 @@ static void virtnet_freeze_down(struct virtio_device *vdev)
 	netif_tx_lock_bh(vi->dev);
 	netif_device_detach(vi->dev);
 	netif_tx_unlock_bh(vi->dev);
-	cancel_delayed_work_sync(&vi->refill);
-
-	if (netif_running(vi->dev)) {
-		for (i = 0; i < vi->max_queue_pairs; i++) {
-			napi_disable(&vi->rq[i].napi);
-			virtnet_napi_tx_disable(&vi->sq[i].napi);
-		}
-	}
+	if (netif_running(vi->dev))
+		virtnet_close(vi->dev);
 }
 
 static int init_vqs(struct virtnet_info *vi);
@@ -2791,7 +2784,7 @@ static int init_vqs(struct virtnet_info *vi);
 static int virtnet_restore_up(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
-	int err, i;
+	int err;
 
 	err = init_vqs(vi);
 	if (err)
@@ -2800,15 +2793,9 @@ static int virtnet_restore_up(struct virtio_device *vdev)
 	virtio_device_ready(vdev);
 
 	if (netif_running(vi->dev)) {
-		for (i = 0; i < vi->curr_queue_pairs; i++)
-			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
-				schedule_delayed_work(&vi->refill, 0);
-
-		for (i = 0; i < vi->max_queue_pairs; i++) {
-			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
-			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
-					       &vi->sq[i].napi);
-		}
+		err = virtnet_open(vi->dev);
+		if (err)
+			return err;
 	}
 
 	netif_tx_lock_bh(vi->dev);

From 1b205d948fbb06a7613d87dcea0ff5fd8a08ed91 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 20 Jun 2022 12:13:52 -0700
Subject: [PATCH 45/47] Revert "net/tls: fix tls_sk_proto_close executed
 repeatedly"

This reverts commit 69135c572d1f84261a6de2a1268513a7e71753e2.

This commit was just papering over the issue, ULP should not
get ->update() called with its own sk_prot. Each ULP would
need to add this check.

Fixes: 69135c572d1f ("net/tls: fix tls_sk_proto_close executed repeatedly")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/r/20220620191353.1184629-1-kuba@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/tls/tls_main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 46bd5f26338bf..da176411c1b5f 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -921,9 +921,6 @@ static void tls_update(struct sock *sk, struct proto *p,
 {
 	struct tls_context *ctx;
 
-	if (sk->sk_prot == p)
-		return;
-
 	ctx = tls_get_ctx(sk);
 	if (likely(ctx)) {
 		ctx->sk_write_space = write_space;

From e34a07c0ae3906f97eb18df50902e2a01c1015b6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 20 Jun 2022 12:13:53 -0700
Subject: [PATCH 46/47] sock: redo the psock vs ULP protection check

Commit 8a59f9d1e3d4 ("sock: Introduce sk->sk_prot->psock_update_sk_prot()")
has moved the inet_csk_has_ulp(sk) check from sk_psock_init() to
the new tcp_bpf_update_proto() function. I'm guessing that this
was done to allow creating psocks for non-inet sockets.

Unfortunately the destruction path for psock includes the ULP
unwind, so we need to fail the sk_psock_init() itself.
Otherwise if ULP is already present we'll notice that later,
and call tcp_update_ulp() with the sk_proto of the ULP
itself, which will most likely result in the ULP looping
its callbacks.

Fixes: 8a59f9d1e3d4 ("sock: Introduce sk->sk_prot->psock_update_sk_prot()")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Tested-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/r/20220620191353.1184629-2-kuba@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/inet_sock.h | 5 +++++
 net/core/skmsg.c        | 5 +++++
 net/ipv4/tcp_bpf.c      | 3 ---
 net/tls/tls_main.c      | 2 ++
 4 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index c1b5dcd6597c6..daead5fb389ae 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -253,6 +253,11 @@ struct inet_sock {
 #define IP_CMSG_CHECKSUM	BIT(7)
 #define IP_CMSG_RECVFRAGSIZE	BIT(8)
 
+static inline bool sk_is_inet(struct sock *sk)
+{
+	return sk->sk_family == AF_INET || sk->sk_family == AF_INET6;
+}
+
 /**
  * sk_to_full_sk - Access to a full socket
  * @sk: pointer to a socket
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 22b983ade0e7d..b0fcd0200e84a 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -699,6 +699,11 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 
 	write_lock_bh(&sk->sk_callback_lock);
 
+	if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) {
+		psock = ERR_PTR(-EINVAL);
+		goto out;
+	}
+
 	if (sk->sk_user_data) {
 		psock = ERR_PTR(-EBUSY);
 		goto out;
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index be3947e70fec5..0d3f68bb51c05 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -611,9 +611,6 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
 		return 0;
 	}
 
-	if (inet_csk_has_ulp(sk))
-		return -EINVAL;
-
 	if (sk->sk_family == AF_INET6) {
 		if (tcp_bpf_assert_proto_ops(psock->sk_proto))
 			return -EINVAL;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index da176411c1b5f..2ffede463e4a5 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -921,6 +921,8 @@ static void tls_update(struct sock *sk, struct proto *p,
 {
 	struct tls_context *ctx;
 
+	WARN_ON_ONCE(sk->sk_prot == p);
+
 	ctx = tls_get_ctx(sk);
 	if (likely(ctx)) {
 		ctx->sk_write_space = write_space;

From 12378a5a75e33f34f8586706eb61cca9e6d4690c Mon Sep 17 00:00:00 2001
From: Rosemarie O'Riorden <roriorden@redhat.com>
Date: Tue, 21 Jun 2022 16:48:45 -0400
Subject: [PATCH 47/47] net: openvswitch: fix parsing of nw_proto for IPv6
 fragments

When a packet enters the OVS datapath and does not match any existing
flows installed in the kernel flow cache, the packet will be sent to
userspace to be parsed, and a new flow will be created. The kernel and
OVS rely on each other to parse packet fields in the same way so that
packets will be handled properly.

As per the design document linked below, OVS expects all later IPv6
fragments to have nw_proto=44 in the flow key, so they can be correctly
matched on OpenFlow rules. OpenFlow controllers create pipelines based
on this design.

This behavior was changed by the commit in the Fixes tag so that
nw_proto equals the next_header field of the last extension header.
However, there is no counterpart for this change in OVS userspace,
meaning that this field is parsed differently between OVS and the
kernel. This is a problem because OVS creates actions based on what is
parsed in userspace, but the kernel-provided flow key is used as a match
criteria, as described in Documentation/networking/openvswitch.rst. This
leads to issues such as packets incorrectly matching on a flow and thus
the wrong list of actions being applied to the packet. Such changes in
packet parsing cannot be implemented without breaking the userspace.

The offending commit is partially reverted to restore the expected
behavior.

The change technically made sense and there is a good reason that it was
implemented, but it does not comply with the original design of OVS.
If in the future someone wants to implement such a change, then it must
be user-configurable and disabled by default to preserve backwards
compatibility with existing OVS versions.
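
For illustration (a simplified stand-alone sketch, not the kernel parser,
ignoring nested extension headers): how the flow key protocol is expected
to be derived for IPv6 fragments per the design document. Later fragments
report 44, the Fragment extension header number, while the first fragment
keeps the real upper-layer protocol:

  #include <arpa/inet.h>
  #include <stdint.h>
  #include <stdio.h>

  #define NEXTHDR_FRAGMENT 44

  struct frag_hdr {                /* IPv6 Fragment extension header (RFC 8200) */
          uint8_t  nexthdr;        /* protocol of the fragmented payload */
          uint8_t  reserved;
          uint16_t frag_off;       /* offset in upper 13 bits, flags in low bits */
          uint32_t id;
  };

  static uint8_t flow_key_nw_proto(const struct frag_hdr *fh)
  {
          uint16_t offset = ntohs(fh->frag_off) & 0xfff8; /* mask off the flags */

          /* a non-zero offset means a later fragment: key it on nw_proto 44 */
          return offset ? NEXTHDR_FRAGMENT : fh->nexthdr;
  }

  int main(void)
  {
          struct frag_hdr later = { .nexthdr = 6 /* TCP */, .frag_off = htons(8) };
          struct frag_hdr first = { .nexthdr = 6 /* TCP */, .frag_off = 0 };

          /* prints "44 6": later fragment keys on 44, first keeps TCP */
          printf("%u %u\n", flow_key_nw_proto(&later), flow_key_nw_proto(&first));
          return 0;
  }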

Cc: stable@vger.kernel.org
Fixes: fa642f08839b ("openvswitch: Derive IP protocol number for IPv6 later frags")
Link: https://docs.openvswitch.org/en/latest/topics/design/#fragments
Signed-off-by: Rosemarie O'Riorden <roriorden@redhat.com>
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Link: https://lore.kernel.org/r/20220621204845.9721-1-roriorden@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/openvswitch/flow.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 372bf54a0ca9e..e20d1a9734175 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -407,7 +407,7 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
 	if (flags & IP6_FH_F_FRAG) {
 		if (frag_off) {
 			key->ip.frag = OVS_FRAG_TYPE_LATER;
-			key->ip.proto = nexthdr;
+			key->ip.proto = NEXTHDR_FRAGMENT;
 			return 0;
 		}
 		key->ip.frag = OVS_FRAG_TYPE_FIRST;