From 580031ff9952b7dbf48dedba6b56a100ae002bef Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 27 Mar 2023 17:42:32 -0700 Subject: [PATCH 01/50] bpf: tcp: Use sock_gen_put instead of sock_put in bpf_iter_tcp While reviewing the udp-iter batching patches, noticed the bpf_iter_tcp calling sock_put() is incorrect. It should call sock_gen_put instead because bpf_iter_tcp is iterating the ehash table which has the req sk and tw sk. This patch replaces all sock_put with sock_gen_put in the bpf_iter_tcp codepath. Fixes: 04c7820b776f ("bpf: tcp: Bpf iter batching and lock_sock") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230328004232.2134233-1-martin.lau@linux.dev --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ea370afa70ed9..b9d55277cb858 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2780,7 +2780,7 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) { while (iter->cur_sk < iter->end_sk) - sock_put(iter->batch[iter->cur_sk++]); + sock_gen_put(iter->batch[iter->cur_sk++]); } static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, @@ -2941,7 +2941,7 @@ static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) * st->bucket. See tcp_seek_last_pos(). */ st->offset++; - sock_put(iter->batch[iter->cur_sk++]); + sock_gen_put(iter->batch[iter->cur_sk++]); } if (iter->cur_sk < iter->end_sk) From a6f6a95f25803500079513780d11a911ce551d76 Mon Sep 17 00:00:00 2001 From: George Guo Date: Tue, 28 Mar 2023 15:13:35 +0800 Subject: [PATCH 02/50] LoongArch, bpf: Fix jit to skip speculation barrier opcode Just skip the opcode(BPF_ST | BPF_NOSPEC) in the BPF JIT instead of failing to JIT the entire program, given LoongArch currently has no counterpart of a speculation barrier instruction.
To verify the issue, use the ltp testcase as shown below. Also, Wang says: I can confirm there's currently no speculation barrier equivalent on LoongArch. (Loongson says there are builtin mitigations for Spectre-V1 and V2 on their chips, and AFAIK efforts to port the exploits to mips/LoongArch have all failed a few years ago.) Without this patch: $ ./bpf_prog02 [...] bpf_common.c:123: TBROK: Failed verification: ??? (524) [...] Summary: passed 0 failed 0 broken 1 skipped 0 warnings 0 With this patch: $ ./bpf_prog02 [...] Summary: passed 0 failed 0 broken 0 skipped 0 warnings 0 Fixes: 5dc615520c4d ("LoongArch: Add BPF JIT support") Signed-off-by: George Guo Signed-off-by: Daniel Borkmann Acked-by: WANG Xuerui Cc: Tiezhu Yang Link: https://lore.kernel.org/bpf/20230328071335.2664966-1-guodongtai@kylinos.cn --- arch/loongarch/net/bpf_jit.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index 288003a9f0cae..d586df48ecc64 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -1022,6 +1022,10 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool ext emit_atomic(insn, ctx); break; + /* Speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; + default: pr_err("bpf_jit: unknown opcode %02x\n", code); return -EINVAL; From 738a96c4a8c36950803fdd27e7c30aca92dccefd Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Sat, 1 Apr 2023 19:41:44 -0400 Subject: [PATCH 03/50] bpf, arm64: Fixed a BTI error on returning to patched function When BPF_TRAMP_F_CALL_ORIG is set, BPF trampoline uses BLR to jump back to the instruction next to call site to call the patched function. For BTI-enabled kernel, the instruction next to call site is usually PACIASP, in this case, it's safe to jump back with BLR. But when the call site is not followed by a PACIASP or bti, a BTI exception is triggered.
Here is a fault log: Unhandled 64-bit el1h sync exception on CPU0, ESR 0x0000000034000002 -- BTI CPU: 0 PID: 263 Comm: test_progs Tainted: GF Hardware name: linux,dummy-virt (DT) pstate: 40400805 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=-c) pc : bpf_fentry_test1+0xc/0x30 lr : bpf_trampoline_6442573892_0+0x48/0x1000 sp : ffff80000c0c3a50 x29: ffff80000c0c3a90 x28: ffff0000c2e6c080 x27: 0000000000000000 x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000050 x23: 0000000000000000 x22: 0000ffffcfd2a7f0 x21: 000000000000000a x20: 0000ffffcfd2a7f0 x19: 0000000000000000 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000ffffcfd2a7f0 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000000 x10: ffff80000914f5e4 x9 : ffff8000082a1528 x8 : 0000000000000000 x7 : 0000000000000000 x6 : 0101010101010101 x5 : 0000000000000000 x4 : 00000000fffffff2 x3 : 0000000000000001 x2 : ffff8001f4b82000 x1 : 0000000000000000 x0 : 0000000000000001 Kernel panic - not syncing: Unhandled exception CPU: 0 PID: 263 Comm: test_progs Tainted: GF Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0xec/0x144 show_stack+0x24/0x7c dump_stack_lvl+0x8c/0xb8 dump_stack+0x18/0x34 panic+0x1cc/0x3ec __el0_error_handler_common+0x0/0x130 el1h_64_sync_handler+0x60/0xd0 el1h_64_sync+0x78/0x7c bpf_fentry_test1+0xc/0x30 bpf_fentry_test1+0xc/0x30 bpf_prog_test_run_tracing+0xdc/0x2a0 __sys_bpf+0x438/0x22a0 __arm64_sys_bpf+0x30/0x54 invoke_syscall+0x78/0x110 el0_svc_common.constprop.0+0x6c/0x1d0 do_el0_svc+0x38/0xe0 el0_svc+0x30/0xd0 el0t_64_sync_handler+0x1ac/0x1b0 el0t_64_sync+0x1a0/0x1a4 Kernel Offset: disabled CPU features: 0x0000,00034c24,f994fdab Memory Limit: none And the instruction next to call site of bpf_fentry_test1 is ADD, not PACIASP: : bti c nop nop add w0, w0, #0x1 paciasp For BPF prog, JIT always puts a PACIASP after call site for BTI-enabled kernel, so there is no problem. 
To fix it, replace BLR with RET to bypass the branch target check. Fixes: efc9909fdce0 ("bpf, arm64: Add bpf trampoline for arm64") Reported-by: Florent Revest Signed-off-by: Xu Kuohai Signed-off-by: Daniel Borkmann Tested-by: Florent Revest Acked-by: Florent Revest Link: https://lore.kernel.org/bpf/20230401234144.3719742-1-xukuohai@huaweicloud.com --- arch/arm64/net/bpf_jit.h | 4 ++++ arch/arm64/net/bpf_jit_comp.c | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index a6acb94ea3d63..c2edadb8ec6a3 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -281,4 +281,8 @@ /* DMB */ #define A64_DMB_ISH aarch64_insn_gen_dmb(AARCH64_INSN_MB_ISH) +/* ADR */ +#define A64_ADR(Rd, offset) \ + aarch64_insn_gen_adr(0, offset, Rd, AARCH64_INSN_ADR_TYPE_ADR) + #endif /* _BPF_JIT_H */ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 62f805f427b79..b26da8efa616e 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1900,7 +1900,8 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, restore_args(ctx, args_off, nargs); /* call original func */ emit(A64_LDR64I(A64_R(10), A64_SP, retaddr_off), ctx); - emit(A64_BLR(A64_R(10)), ctx); + emit(A64_ADR(A64_LR, AARCH64_INSN_SIZE * 2), ctx); + emit(A64_RET(A64_R(10)), ctx); /* store return value */ emit(A64_STR64I(A64_R(0), A64_SP, retval_off), ctx); /* reserve a nop for bpf_tramp_image_put */ From 919e659ed12568b5b8ba6c2ffdd82d8d31fc28af Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 6 Apr 2023 12:40:19 +0200 Subject: [PATCH 04/50] selftests/bpf: fix xdp_redirect xdp-features selftest for veth driver xdp-features supported by the veth driver are no longer static; they depend on the veth configuration (e.g. whether GRO is enabled or disabled, or the TX/RX queue configuration). Take this into account in the xdp_redirect xdp-features selftest for the veth driver.
Fixes: fccca038f300 ("veth: take into account device reconfiguration for xdp_features flag") Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/bc35455cfbb1d4f7f52536955ded81ad47d8dc54.1680777371.git.lorenzo@kernel.org Signed-off-by: Martin KaFai Lau --- .../bpf/prog_tests/xdp_do_redirect.c | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c index 7271a18ab3e22..8251a0fc6ee94 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c @@ -167,8 +167,7 @@ void test_xdp_do_redirect(void) if (!ASSERT_EQ(query_opts.feature_flags, NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | - NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_RX_SG | - NETDEV_XDP_ACT_NDO_XMIT_SG, + NETDEV_XDP_ACT_RX_SG, "veth_src query_opts.feature_flags")) goto out; @@ -176,11 +175,36 @@ void test_xdp_do_redirect(void) if (!ASSERT_OK(err, "veth_dst bpf_xdp_query")) goto out; + if (!ASSERT_EQ(query_opts.feature_flags, + NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_RX_SG, + "veth_dst query_opts.feature_flags")) + goto out; + + /* Enable GRO */ + SYS("ethtool -K veth_src gro on"); + SYS("ethtool -K veth_dst gro on"); + + err = bpf_xdp_query(ifindex_src, XDP_FLAGS_DRV_MODE, &query_opts); + if (!ASSERT_OK(err, "veth_src bpf_xdp_query gro on")) + goto out; + if (!ASSERT_EQ(query_opts.feature_flags, NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_RX_SG | NETDEV_XDP_ACT_NDO_XMIT_SG, - "veth_dst query_opts.feature_flags")) + "veth_src query_opts.feature_flags gro on")) + goto out; + + err = bpf_xdp_query(ifindex_dst, XDP_FLAGS_DRV_MODE, &query_opts); + if (!ASSERT_OK(err, "veth_dst bpf_xdp_query gro on")) + goto out; + + if (!ASSERT_EQ(query_opts.feature_flags, + NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | + 
NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_RX_SG | + NETDEV_XDP_ACT_NDO_XMIT_SG, + "veth_dst query_opts.feature_flags gro on")) goto out; memcpy(skel->rodata->expect_dst, &pkt_udp.eth.h_dest, ETH_ALEN); From 8ce07be703456acb00e83d99f3b8036252c33b02 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Wed, 5 Apr 2023 23:31:18 -0700 Subject: [PATCH 05/50] niu: Fix missing unwind goto in niu_alloc_channels() Smatch reports: drivers/net/ethernet/sun/niu.c:4525 niu_alloc_channels() warn: missing unwind goto? If niu_rbr_fill() fails, then we are directly returning 'err' without freeing the channels. Fix this by changing direct return to a goto 'out_err'. Fixes: a3138df9f20e ("[NIU]: Add Sun Neptune ethernet driver.") Signed-off-by: Harshit Mogalapalli Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/niu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index ab8b09a9ef61d..7a2e767762974 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -4522,7 +4522,7 @@ static int niu_alloc_channels(struct niu *np) err = niu_rbr_fill(np, rp, GFP_KERNEL); if (err) - return err; + goto out_err; } tx_rings = kcalloc(num_tx_rings, sizeof(struct tx_ring_info), From dc5110c2d959c1707e12df5f792f41d90614adaa Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 6 Apr 2023 14:34:50 +0800 Subject: [PATCH 06/50] tcp: restrict net.ipv4.tcp_app_win UBSAN: shift-out-of-bounds in net/ipv4/tcp_input.c:555:23 shift exponent 255 is too large for 32-bit type 'int' CPU: 1 PID: 7907 Comm: ssh Not tainted 6.3.0-rc4-00161-g62bad54b26db-dirty #206 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: dump_stack_lvl+0x136/0x150 __ubsan_handle_shift_out_of_bounds+0x21f/0x5a0 tcp_init_transfer.cold+0x3a/0xb9 tcp_finish_connect+0x1d0/0x620 tcp_rcv_state_process+0xd78/0x4d60 tcp_v4_do_rcv+0x33d/0x9d0 __release_sock+0x133/0x3b0 
release_sock+0x58/0x1b0 'maxwin' is int, shifting int for 32 or more bits is undefined behaviour. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: YueHaibing Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 2 ++ net/ipv4/sysctl_net_ipv4.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 87dd1c5283e61..58a78a3166978 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -340,6 +340,8 @@ tcp_app_win - INTEGER Reserve max(window/2^tcp_app_win, mss) of window for application buffer. Value 0 is special, it means that nothing is reserved. + Possible values are [0, 31], inclusive. + Default: 31 tcp_autocorking - BOOLEAN diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0d0cc4ef2b85a..40fe70fc2015d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -25,6 +25,7 @@ static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; +static int tcp_app_win_max = 31; static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS; static int tcp_min_snd_mss_max = 65535; static int ip_privileged_port_min; @@ -1198,6 +1199,8 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_app_win_max, }, { .procname = "tcp_adv_win_scale", From 4598380f9c548aa161eb4e990a1583f0a7d1e0d7 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 6 Apr 2023 16:23:50 +0800 Subject: [PATCH 07/50] bonding: fix ns validation on backup slaves When arp_validate is set to 2, 3, or 6, validation is performed for backup slaves as well. 
As stated in the bond documentation, validation involves checking the broadcast ARP request sent out via the active slave. This helps determine which slaves are more likely to function in the event of an active slave failure. However, when the target is an IPv6 address, the NS message sent from the active interface is not checked on backup slaves. Additionally, based on the bond_arp_rcv() rule b, we must reverse the saddr and daddr when checking the NS message. Note that when checking the NS message, the destination address is a multicast address. Therefore, we must convert the target address to solicited multicast in the bond_get_targets_ip6() function. Prior to the fix, the backup slaves had a mii status of "down", but after the fix, all of the slaves' mii status was updated to "UP". Fixes: 4e24be018eb9 ("bonding: add new parameter ns_targets") Reviewed-by: Jonathan Toppins Acked-by: Jay Vosburgh Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 5 +++-- include/net/bonding.h | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 236e5219c8112..8cc9a74789b79 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3269,7 +3269,8 @@ static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond, combined = skb_header_pointer(skb, 0, sizeof(_combined), &_combined); if (!combined || combined->ip6.nexthdr != NEXTHDR_ICMP || - combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT) + (combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION && + combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT)) goto out; saddr = &combined->ip6.saddr; @@ -3291,7 +3292,7 @@ static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond, else if (curr_active_slave && time_after(slave_last_rx(bond, curr_active_slave), curr_active_slave->last_link_up)) - bond_validate_na(bond, slave, saddr, 
daddr); + bond_validate_na(bond, slave, daddr, saddr); else if (curr_arp_slave && bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1)) bond_validate_na(bond, slave, saddr, daddr); diff --git a/include/net/bonding.h b/include/net/bonding.h index ea36ab7f9e724..c3843239517d5 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -761,13 +761,17 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) #if IS_ENABLED(CONFIG_IPV6) static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip) { + struct in6_addr mcaddr; int i; - for (i = 0; i < BOND_MAX_NS_TARGETS; i++) - if (ipv6_addr_equal(&targets[i], ip)) + for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { + addrconf_addr_solict_mult(&targets[i], &mcaddr); + if ((ipv6_addr_equal(&targets[i], ip)) || + (ipv6_addr_equal(&mcaddr, ip))) return i; else if (ipv6_addr_any(&targets[i])) break; + } return -1; } From 481b56e0391ea46d6bf1a2604422a21063615901 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 6 Apr 2023 16:23:51 +0800 Subject: [PATCH 08/50] selftests: bonding: re-format bond option tests To improve the testing process for bond options, A new bond topology lib is added to our testing setup. The current option_prio.sh file will be renamed to bond_options.sh so that all bonding options can be tested here. Specifically, for priority testing, we will run all tests using modes 1, 5, and 6. These changes will help us streamline the testing process and ensure that our bond options are rigorously evaluated. Acked-by: Jay Vosburgh Signed-off-by: Hangbin Liu Acked-by: Jonathan Toppins Signed-off-by: David S. 
Miller --- .../selftests/drivers/net/bonding/Makefile | 3 +- .../drivers/net/bonding/bond_options.sh | 209 +++++++++++++++ .../drivers/net/bonding/bond_topo_3d1c.sh | 143 ++++++++++ .../drivers/net/bonding/option_prio.sh | 245 ------------------ 4 files changed, 354 insertions(+), 246 deletions(-) create mode 100755 tools/testing/selftests/drivers/net/bonding/bond_options.sh create mode 100644 tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh delete mode 100755 tools/testing/selftests/drivers/net/bonding/option_prio.sh diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index a39bb2560d9bf..03f92d7aeb19b 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -8,11 +8,12 @@ TEST_PROGS := \ dev_addr_lists.sh \ mode-1-recovery-updelay.sh \ mode-2-recovery-updelay.sh \ - option_prio.sh \ + bond_options.sh \ bond-eth-type-change.sh TEST_FILES := \ lag_lib.sh \ + bond_topo_3d1c.sh \ net_forwarding_lib.sh include ../../../lib.mk diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh new file mode 100755 index 0000000000000..7213211d0bde2 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -0,0 +1,209 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test bonding options with mode 1,5,6 + +ALL_TESTS=" + prio +" + +REQUIRE_MZ=no +NUM_NETIFS=0 +lib_dir=$(dirname "$0") +source ${lib_dir}/net_forwarding_lib.sh +source ${lib_dir}/bond_topo_3d1c.sh + +skip_prio() +{ + local skip=1 + + # check if iproute support prio option + ip -n ${s_ns} link set eth0 type bond_slave prio 10 + [[ $? -ne 0 ]] && skip=0 + + # check if kernel support prio option + ip -n ${s_ns} -d link show eth0 | grep -q "prio 10" + [[ $? 
-ne 0 ]] && skip=0 + + return $skip +} + +skip_ns() +{ + local skip=1 + + # check if iproute support ns_ip6_target option + ip -n ${s_ns} link add bond1 type bond ns_ip6_target ${g_ip6} + [[ $? -ne 0 ]] && skip=0 + + # check if kernel support ns_ip6_target option + ip -n ${s_ns} -d link show bond1 | grep -q "ns_ip6_target ${g_ip6}" + [[ $? -ne 0 ]] && skip=0 + + ip -n ${s_ns} link del bond1 + + return $skip +} + +active_slave="" +check_active_slave() +{ + local target_active_slave=$1 + active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + test "$active_slave" = "$target_active_slave" + check_err $? "Current active slave is $active_slave but not $target_active_slave" +} + + +# Test bonding prio option +prio_test() +{ + local param="$1" + RET=0 + + # create bond + bond_reset "${param}" + + # check bonding member prio value + ip -n ${s_ns} link set eth0 type bond_slave prio 0 + ip -n ${s_ns} link set eth1 type bond_slave prio 10 + ip -n ${s_ns} link set eth2 type bond_slave prio 11 + cmd_jq "ip -n ${s_ns} -d -j link show eth0" \ + ".[].linkinfo.info_slave_data | select (.prio == 0)" "-e" &> /dev/null + check_err $? "eth0 prio is not 0" + cmd_jq "ip -n ${s_ns} -d -j link show eth1" \ + ".[].linkinfo.info_slave_data | select (.prio == 10)" "-e" &> /dev/null + check_err $? "eth1 prio is not 10" + cmd_jq "ip -n ${s_ns} -d -j link show eth2" \ + ".[].linkinfo.info_slave_data | select (.prio == 11)" "-e" &> /dev/null + check_err $? 
"eth2 prio is not 11" + + bond_check_connection "setup" + + # active slave should be the primary slave + check_active_slave eth1 + + # active slave should be the higher prio slave + ip -n ${s_ns} link set $active_slave down + bond_check_connection "fail over" + check_active_slave eth2 + + # when only 1 slave is up + ip -n ${s_ns} link set $active_slave down + bond_check_connection "only 1 slave up" + check_active_slave eth0 + + # when a higher prio slave change to up + ip -n ${s_ns} link set eth2 up + bond_check_connection "higher prio slave up" + case $primary_reselect in + "0") + check_active_slave "eth2" + ;; + "1") + check_active_slave "eth0" + ;; + "2") + check_active_slave "eth0" + ;; + esac + local pre_active_slave=$active_slave + + # when the primary slave change to up + ip -n ${s_ns} link set eth1 up + bond_check_connection "primary slave up" + case $primary_reselect in + "0") + check_active_slave "eth1" + ;; + "1") + check_active_slave "$pre_active_slave" + ;; + "2") + check_active_slave "$pre_active_slave" + ip -n ${s_ns} link set $active_slave down + bond_check_connection "pre_active slave down" + check_active_slave "eth1" + ;; + esac + + # Test changing bond slave prio + if [[ "$primary_reselect" == "0" ]];then + ip -n ${s_ns} link set eth0 type bond_slave prio 1000000 + ip -n ${s_ns} link set eth1 type bond_slave prio 0 + ip -n ${s_ns} link set eth2 type bond_slave prio -50 + ip -n ${s_ns} -d link show eth0 | grep -q 'prio 1000000' + check_err $? "eth0 prio is not 1000000" + ip -n ${s_ns} -d link show eth1 | grep -q 'prio 0' + check_err $? "eth1 prio is not 0" + ip -n ${s_ns} -d link show eth2 | grep -q 'prio -50' + check_err $? 
"eth3 prio is not -50" + check_active_slave "eth1" + + ip -n ${s_ns} link set $active_slave down + bond_check_connection "change slave prio" + check_active_slave "eth0" + fi +} + +prio_miimon() +{ + local primary_reselect + local mode=$1 + + for primary_reselect in 0 1 2; do + prio_test "mode $mode miimon 100 primary eth1 primary_reselect $primary_reselect" + log_test "prio" "$mode miimon primary_reselect $primary_reselect" + done +} + +prio_arp() +{ + local primary_reselect + local mode=$1 + + for primary_reselect in 0 1 2; do + prio_test "mode active-backup arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect" + log_test "prio" "$mode arp_ip_target primary_reselect $primary_reselect" + done +} + +prio_ns() +{ + local primary_reselect + local mode=$1 + + if skip_ns; then + log_test_skip "prio ns" "Current iproute or kernel doesn't support bond option 'ns_ip6_target'." + return 0 + fi + + for primary_reselect in 0 1 2; do + prio_test "mode active-backup arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect" + log_test "prio" "$mode ns_ip6_target primary_reselect $primary_reselect" + done +} + +prio() +{ + local mode modes="active-backup balance-tlb balance-alb" + + if skip_prio; then + log_test_skip "prio" "Current iproute or kernel doesn't support bond option 'prio'." 
+ return 0 + fi + + for mode in $modes; do + prio_miimon $mode + prio_arp $mode + prio_ns $mode + done +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh new file mode 100644 index 0000000000000..4045ca97fb22d --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh @@ -0,0 +1,143 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Topology for Bond mode 1,5,6 testing +# +# +-------------------------------------+ +# | bond0 | +# | + | Server +# | eth0 | eth1 eth2 | 192.0.2.1/24 +# | +-------------------+ | 2001:db8::1/24 +# | | | | | +# +-------------------------------------+ +# | | | +# +-------------------------------------+ +# | | | | | +# | +---+---------+---------+---+ | Gateway +# | | br0 | | 192.0.2.254/24 +# | +-------------+-------------+ | 2001:db8::254/24 +# | | | +# +-------------------------------------+ +# | +# +-------------------------------------+ +# | | | Client +# | + | 192.0.2.10/24 +# | eth0 | 2001:db8::10/24 +# +-------------------------------------+ + +s_ns="s-$(mktemp -u XXXXXX)" +c_ns="c-$(mktemp -u XXXXXX)" +g_ns="g-$(mktemp -u XXXXXX)" +s_ip4="192.0.2.1" +c_ip4="192.0.2.10" +g_ip4="192.0.2.254" +s_ip6="2001:db8::1" +c_ip6="2001:db8::10" +g_ip6="2001:db8::254" + +gateway_create() +{ + ip netns add ${g_ns} + ip -n ${g_ns} link add br0 type bridge + ip -n ${g_ns} link set br0 up + ip -n ${g_ns} addr add ${g_ip4}/24 dev br0 + ip -n ${g_ns} addr add ${g_ip6}/24 dev br0 +} + +gateway_destroy() +{ + ip -n ${g_ns} link del br0 + ip netns del ${g_ns} +} + +server_create() +{ + ip netns add ${s_ns} + ip -n ${s_ns} link add bond0 type bond mode active-backup miimon 100 + + for i in $(seq 0 2); do + ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + + ip -n ${g_ns} link set s${i} up + ip -n ${g_ns} link set s${i} master br0 + ip 
-n ${s_ns} link set eth${i} master bond0 + done + + ip -n ${s_ns} link set bond0 up + ip -n ${s_ns} addr add ${s_ip4}/24 dev bond0 + ip -n ${s_ns} addr add ${s_ip6}/24 dev bond0 + sleep 2 +} + +# Reset bond with new mode and options +bond_reset() +{ + local param="$1" + + ip -n ${s_ns} link set bond0 down + ip -n ${s_ns} link del bond0 + + ip -n ${s_ns} link add bond0 type bond $param + for i in $(seq 0 2); do + ip -n ${s_ns} link set eth$i master bond0 + done + + ip -n ${s_ns} link set bond0 up + ip -n ${s_ns} addr add ${s_ip4}/24 dev bond0 + ip -n ${s_ns} addr add ${s_ip6}/24 dev bond0 + sleep 2 +} + +server_destroy() +{ + for i in $(seq 0 2); do + ip -n ${s_ns} link del eth${i} + done + ip netns del ${s_ns} +} + +client_create() +{ + ip netns add ${c_ns} + ip -n ${c_ns} link add eth0 type veth peer name c0 netns ${g_ns} + + ip -n ${g_ns} link set c0 up + ip -n ${g_ns} link set c0 master br0 + + ip -n ${c_ns} link set eth0 up + ip -n ${c_ns} addr add ${c_ip4}/24 dev eth0 + ip -n ${c_ns} addr add ${c_ip6}/24 dev eth0 +} + +client_destroy() +{ + ip -n ${c_ns} link del eth0 + ip netns del ${c_ns} +} + +setup_prepare() +{ + gateway_create + server_create + client_create +} + +cleanup() +{ + pre_cleanup + + client_destroy + server_destroy + gateway_destroy +} + +bond_check_connection() +{ + local msg=${1:-"check connection"} + + sleep 2 + ip netns exec ${s_ns} ping ${c_ip4} -c5 -i 0.1 &>/dev/null + check_err $? "${msg}: ping failed" + ip netns exec ${s_ns} ping6 ${c_ip6} -c5 -i 0.1 &>/dev/null + check_err $? 
"${msg}: ping6 failed" +} diff --git a/tools/testing/selftests/drivers/net/bonding/option_prio.sh b/tools/testing/selftests/drivers/net/bonding/option_prio.sh deleted file mode 100755 index c32eebff5005d..0000000000000 --- a/tools/testing/selftests/drivers/net/bonding/option_prio.sh +++ /dev/null @@ -1,245 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Test bonding option prio -# - -ALL_TESTS=" - prio_arp_ip_target_test - prio_miimon_test -" - -REQUIRE_MZ=no -REQUIRE_JQ=no -NUM_NETIFS=0 -lib_dir=$(dirname "$0") -source "$lib_dir"/net_forwarding_lib.sh - -destroy() -{ - ip link del bond0 &>/dev/null - ip link del br0 &>/dev/null - ip link del veth0 &>/dev/null - ip link del veth1 &>/dev/null - ip link del veth2 &>/dev/null - ip netns del ns1 &>/dev/null - ip link del veth3 &>/dev/null -} - -cleanup() -{ - pre_cleanup - - destroy -} - -skip() -{ - local skip=1 - ip link add name bond0 type bond mode 1 miimon 100 &>/dev/null - ip link add name veth0 type veth peer name veth0_p - ip link set veth0 master bond0 - - # check if iproute support prio option - ip link set dev veth0 type bond_slave prio 10 - [[ $? -ne 0 ]] && skip=0 - - # check if bonding support prio option - ip -d link show veth0 | grep -q "prio 10" - [[ $? -ne 0 ]] && skip=0 - - ip link del bond0 &>/dev/null - ip link del veth0 - - return $skip -} - -active_slave="" -check_active_slave() -{ - local target_active_slave=$1 - active_slave="$(cat /sys/class/net/bond0/bonding/active_slave)" - test "$active_slave" = "$target_active_slave" - check_err $? 
"Current active slave is $active_slave but not $target_active_slave" -} - - -# Test bonding prio option with mode=$mode monitor=$monitor -# and primary_reselect=$primary_reselect -prio_test() -{ - RET=0 - - local monitor=$1 - local mode=$2 - local primary_reselect=$3 - - local bond_ip4="192.169.1.2" - local peer_ip4="192.169.1.1" - local bond_ip6="2009:0a:0b::02" - local peer_ip6="2009:0a:0b::01" - - - # create veths - ip link add name veth0 type veth peer name veth0_p - ip link add name veth1 type veth peer name veth1_p - ip link add name veth2 type veth peer name veth2_p - - # create bond - if [[ "$monitor" == "miimon" ]];then - ip link add name bond0 type bond mode $mode miimon 100 primary veth1 primary_reselect $primary_reselect - elif [[ "$monitor" == "arp_ip_target" ]];then - ip link add name bond0 type bond mode $mode arp_interval 1000 arp_ip_target $peer_ip4 primary veth1 primary_reselect $primary_reselect - elif [[ "$monitor" == "ns_ip6_target" ]];then - ip link add name bond0 type bond mode $mode arp_interval 1000 ns_ip6_target $peer_ip6 primary veth1 primary_reselect $primary_reselect - fi - ip link set bond0 up - ip link set veth0 master bond0 - ip link set veth1 master bond0 - ip link set veth2 master bond0 - # check bonding member prio value - ip link set dev veth0 type bond_slave prio 0 - ip link set dev veth1 type bond_slave prio 10 - ip link set dev veth2 type bond_slave prio 11 - ip -d link show veth0 | grep -q 'prio 0' - check_err $? "veth0 prio is not 0" - ip -d link show veth1 | grep -q 'prio 10' - check_err $? "veth0 prio is not 10" - ip -d link show veth2 | grep -q 'prio 11' - check_err $? 
"veth0 prio is not 11" - - ip link set veth0 up - ip link set veth1 up - ip link set veth2 up - ip link set veth0_p up - ip link set veth1_p up - ip link set veth2_p up - - # prepare ping target - ip link add name br0 type bridge - ip link set br0 up - ip link set veth0_p master br0 - ip link set veth1_p master br0 - ip link set veth2_p master br0 - ip link add name veth3 type veth peer name veth3_p - ip netns add ns1 - ip link set veth3_p master br0 up - ip link set veth3 netns ns1 up - ip netns exec ns1 ip addr add $peer_ip4/24 dev veth3 - ip netns exec ns1 ip addr add $peer_ip6/64 dev veth3 - ip addr add $bond_ip4/24 dev bond0 - ip addr add $bond_ip6/64 dev bond0 - sleep 5 - - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? "ping failed 1." - ping6 $peer_ip6 -c5 -I bond0 &>/dev/null - check_err $? "ping6 failed 1." - - # active salve should be the primary slave - check_active_slave veth1 - - # active slave should be the higher prio slave - ip link set $active_slave down - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? "ping failed 2." - check_active_slave veth2 - - # when only 1 slave is up - ip link set $active_slave down - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? "ping failed 3." - check_active_slave veth0 - - # when a higher prio slave change to up - ip link set veth2 up - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? "ping failed 4." - case $primary_reselect in - "0") - check_active_slave "veth2" - ;; - "1") - check_active_slave "veth0" - ;; - "2") - check_active_slave "veth0" - ;; - esac - local pre_active_slave=$active_slave - - # when the primary slave change to up - ip link set veth1 up - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? "ping failed 5." - case $primary_reselect in - "0") - check_active_slave "veth1" - ;; - "1") - check_active_slave "$pre_active_slave" - ;; - "2") - check_active_slave "$pre_active_slave" - ip link set $active_slave down - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? 
"ping failed 6." - check_active_slave "veth1" - ;; - esac - - # Test changing bond salve prio - if [[ "$primary_reselect" == "0" ]];then - ip link set dev veth0 type bond_slave prio 1000000 - ip link set dev veth1 type bond_slave prio 0 - ip link set dev veth2 type bond_slave prio -50 - ip -d link show veth0 | grep -q 'prio 1000000' - check_err $? "veth0 prio is not 1000000" - ip -d link show veth1 | grep -q 'prio 0' - check_err $? "veth1 prio is not 0" - ip -d link show veth2 | grep -q 'prio -50' - check_err $? "veth3 prio is not -50" - check_active_slave "veth1" - - ip link set $active_slave down - ping $peer_ip4 -c5 -I bond0 &>/dev/null - check_err $? "ping failed 7." - check_active_slave "veth0" - fi - - cleanup - - log_test "prio_test" "Test bonding option 'prio' with mode=$mode monitor=$monitor and primary_reselect=$primary_reselect" -} - -prio_miimon_test() -{ - local mode - local primary_reselect - - for mode in 1 5 6; do - for primary_reselect in 0 1 2; do - prio_test "miimon" $mode $primary_reselect - done - done -} - -prio_arp_ip_target_test() -{ - local primary_reselect - - for primary_reselect in 0 1 2; do - prio_test "arp_ip_target" 1 $primary_reselect - done -} - -if skip;then - log_test_skip "option_prio.sh" "Current iproute doesn't support 'prio'." - exit 0 -fi - -trap cleanup EXIT - -tests_run - -exit "$EXIT_STATUS" From 2e825f8accb4491466677162cd9893fe77aea2f9 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 6 Apr 2023 16:23:52 +0800 Subject: [PATCH 09/50] selftests: bonding: add arp validate test This patch add bonding arp validate tests with mode active backup, monitor arp_ip_target and ns_ip6_target. It also checks mii_status to make sure all slaves are UP. Acked-by: Jonathan Toppins Acked-by: Jay Vosburgh Signed-off-by: Hangbin Liu Signed-off-by: David S. 
Miller --- .../drivers/net/bonding/bond_options.sh | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh index 7213211d0bde2..db29a3146a861 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -5,6 +5,7 @@ ALL_TESTS=" prio + arp_validate " REQUIRE_MZ=no @@ -200,6 +201,60 @@ prio() done } +arp_validate_test() +{ + local param="$1" + RET=0 + + # create bond + bond_reset "${param}" + + bond_check_connection + [ $RET -ne 0 ] && log_test "arp_validate" "$retmsg" + + # wait for a while to make sure the mii status stable + sleep 5 + for i in $(seq 0 2); do + mii_status=$(cmd_jq "ip -n ${s_ns} -j -d link show eth$i" ".[].linkinfo.info_slave_data.mii_status") + if [ ${mii_status} != "UP" ]; then + RET=1 + log_test "arp_validate" "interface eth$i mii_status $mii_status" + fi + done +} + +arp_validate_arp() +{ + local mode=$1 + local val + for val in $(seq 0 6); do + arp_validate_test "mode $mode arp_interval 100 arp_ip_target ${g_ip4} arp_validate $val" + log_test "arp_validate" "$mode arp_ip_target arp_validate $val" + done +} + +arp_validate_ns() +{ + local mode=$1 + local val + + if skip_ns; then + log_test_skip "arp_validate ns" "Current iproute or kernel doesn't support bond option 'ns_ip6_target'." 
+ return 0 + fi + + for val in $(seq 0 6); do + arp_validate_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6} arp_validate $val" + log_test "arp_validate" "$mode ns_ip6_target arp_validate $val" + done +} + +arp_validate() +{ + arp_validate_arp "active-backup" + arp_validate_ns "active-backup" +} + trap cleanup EXIT setup_prepare From 0c0da0e951053fda20412cd284e2714bbbb31bff Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Thu, 6 Apr 2023 15:35:27 -0600 Subject: [PATCH 10/50] iavf: refactor VLAN filter states The VLAN filter states are currently being saved as individual bits. This is error prone as multiple bits might be mistakenly set. Fix by replacing the bits with a single state enum. Also, add an "ACTIVE" state for filters that are accepted by the PF. Signed-off-by: Ahmed Zaki Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 15 +++++---- drivers/net/ethernet/intel/iavf/iavf_main.c | 8 ++--- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 31 +++++++++---------- 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 232bc61d9eee9..692d6a6421183 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -158,15 +158,18 @@ struct iavf_vlan { u16 tpid; }; +enum iavf_vlan_state_t { + IAVF_VLAN_INVALID, + IAVF_VLAN_ADD, /* filter needs to be added */ + IAVF_VLAN_IS_NEW, /* filter is new, wait for PF answer */ + IAVF_VLAN_ACTIVE, /* filter is accepted by PF */ + IAVF_VLAN_REMOVE, /* filter needs to be removed */ +}; + struct iavf_vlan_filter { struct list_head list; struct iavf_vlan vlan; - struct { - u8 is_new_vlan:1; /* filter is new, wait for PF answer */ - u8 remove:1; /* filter needs to be removed */ - u8 add:1; /* filter needs to be added */ - u8 padding:5; - }; + enum iavf_vlan_state_t state; }; #define IAVF_MAX_TRAFFIC_CLASS 4 diff --git 
a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 095201e83c9db..fe9798e4b4ace 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -791,7 +791,7 @@ iavf_vlan_filter *iavf_add_vlan(struct iavf_adapter *adapter, f->vlan = vlan; list_add_tail(&f->list, &adapter->vlan_filter_list); - f->add = true; + f->state = IAVF_VLAN_ADD; adapter->aq_required |= IAVF_FLAG_AQ_ADD_VLAN_FILTER; } @@ -813,7 +813,7 @@ static void iavf_del_vlan(struct iavf_adapter *adapter, struct iavf_vlan vlan) f = iavf_find_vlan(adapter, vlan); if (f) { - f->remove = true; + f->state = IAVF_VLAN_REMOVE; adapter->aq_required |= IAVF_FLAG_AQ_DEL_VLAN_FILTER; } @@ -1296,11 +1296,11 @@ static void iavf_clear_mac_vlan_filters(struct iavf_adapter *adapter) /* remove all VLAN filters */ list_for_each_entry_safe(vlf, vlftmp, &adapter->vlan_filter_list, list) { - if (vlf->add) { + if (vlf->state == IAVF_VLAN_ADD) { list_del(&vlf->list); kfree(vlf); } else { - vlf->remove = true; + vlf->state = IAVF_VLAN_REMOVE; } } spin_unlock_bh(&adapter->mac_vlan_list_lock); diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 4e17d006c52d4..5047b4c83718d 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -642,7 +642,7 @@ static void iavf_vlan_add_reject(struct iavf_adapter *adapter) spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->is_new_vlan) { + if (f->state == IAVF_VLAN_IS_NEW) { if (f->vlan.tpid == ETH_P_8021Q) clear_bit(f->vlan.vid, adapter->vsi.active_cvlans); @@ -679,7 +679,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry(f, &adapter->vlan_filter_list, list) { - if (f->add) + if (f->state == IAVF_VLAN_ADD) count++; } if (!count || 
!VLAN_FILTERING_ALLOWED(adapter)) { @@ -710,11 +710,10 @@ void iavf_add_vlans(struct iavf_adapter *adapter) vvfl->vsi_id = adapter->vsi_res->vsi_id; vvfl->num_elements = count; list_for_each_entry(f, &adapter->vlan_filter_list, list) { - if (f->add) { + if (f->state == IAVF_VLAN_ADD) { vvfl->vlan_id[i] = f->vlan.vid; i++; - f->add = false; - f->is_new_vlan = true; + f->state = IAVF_VLAN_IS_NEW; if (i == count) break; } @@ -760,7 +759,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) vvfl_v2->vport_id = adapter->vsi_res->vsi_id; vvfl_v2->num_elements = count; list_for_each_entry(f, &adapter->vlan_filter_list, list) { - if (f->add) { + if (f->state == IAVF_VLAN_ADD) { struct virtchnl_vlan_supported_caps *filtering_support = &adapter->vlan_v2_caps.filtering.filtering_support; struct virtchnl_vlan *vlan; @@ -778,8 +777,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) vlan->tpid = f->vlan.tpid; i++; - f->add = false; - f->is_new_vlan = true; + f->state = IAVF_VLAN_IS_NEW; } } @@ -822,10 +820,11 @@ void iavf_del_vlans(struct iavf_adapter *adapter) * filters marked for removal to enable bailing out before * sending a virtchnl message */ - if (f->remove && !VLAN_FILTERING_ALLOWED(adapter)) { + if (f->state == IAVF_VLAN_REMOVE && + !VLAN_FILTERING_ALLOWED(adapter)) { list_del(&f->list); kfree(f); - } else if (f->remove) { + } else if (f->state == IAVF_VLAN_REMOVE) { count++; } } @@ -857,7 +856,7 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vvfl->vsi_id = adapter->vsi_res->vsi_id; vvfl->num_elements = count; list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->remove) { + if (f->state == IAVF_VLAN_REMOVE) { vvfl->vlan_id[i] = f->vlan.vid; i++; list_del(&f->list); @@ -901,7 +900,7 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vvfl_v2->vport_id = adapter->vsi_res->vsi_id; vvfl_v2->num_elements = count; list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->remove) { + if (f->state == IAVF_VLAN_REMOVE) 
{ struct virtchnl_vlan_supported_caps *filtering_support = &adapter->vlan_v2_caps.filtering.filtering_support; struct virtchnl_vlan *vlan; @@ -2192,7 +2191,7 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, list_for_each_entry(vlf, &adapter->vlan_filter_list, list) - vlf->add = true; + vlf->state = IAVF_VLAN_ADD; adapter->aq_required |= IAVF_FLAG_AQ_ADD_VLAN_FILTER; @@ -2260,7 +2259,7 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, list_for_each_entry(vlf, &adapter->vlan_filter_list, list) - vlf->add = true; + vlf->state = IAVF_VLAN_ADD; aq_required |= IAVF_FLAG_AQ_ADD_VLAN_FILTER; } @@ -2444,8 +2443,8 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry(f, &adapter->vlan_filter_list, list) { - if (f->is_new_vlan) { - f->is_new_vlan = false; + if (f->state == IAVF_VLAN_IS_NEW) { + f->state = IAVF_VLAN_ACTIVE; if (f->vlan.tpid == ETH_P_8021Q) set_bit(f->vlan.vid, adapter->vsi.active_cvlans); From 9c85b7fa12ef2e4fc11a4e31ac595fb5f9d0ddf9 Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Thu, 6 Apr 2023 15:35:28 -0600 Subject: [PATCH 11/50] iavf: remove active_cvlans and active_svlans bitmaps The VLAN filters info is currently being held in a list and 2 bitmaps (active_cvlans and active_svlans). We are experiencing races where the data in the list and the bitmaps gets out of sync. For example, the VLAN is initially added to the list, but it is added to the bitmap only when the PF replies. If a user adds many V2 VLANS before the PF responds: while [ $((i++)) ] ip l add l eth0 name eth0.$i type vlan id $i we might end up with more VLAN list entries than the designated limit. Also, "ip link show" will show more links added than the PF limit. On the other hand, the bitmaps are only used to check the number of VLAN filters and to re-enable the filters when the interface goes from DOWN to UP. This patch gets rid of the bitmaps and uses the list only.
To do that, the states of the VLAN filter are modified: 1 - IAVF_VLAN_REMOVE: the entry needs to be totally removed after informing the PF. This is the "ip link del eth0.$i" path. 2 - IAVF_VLAN_DISABLE: (new) the netdev went down. The filter needs to be removed from the PF and then marked INACTIVE. 3 - IAVF_VLAN_INACTIVE: (new) no PF filter exists, but the user did not delete the VLAN. Fixes: 48ccc43ecf10 ("iavf: Add support VIRTCHNL_VF_OFFLOAD_VLAN_V2 during netdev config") Signed-off-by: Ahmed Zaki Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 7 +-- drivers/net/ethernet/intel/iavf/iavf_main.c | 40 +++++++---------- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 45 ++++++++++--------- 3 files changed, 45 insertions(+), 47 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 692d6a6421183..746ff76f2fb1e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -59,8 +59,6 @@ enum iavf_vsi_state_t { struct iavf_vsi { struct iavf_adapter *back; struct net_device *netdev; - unsigned long active_cvlans[BITS_TO_LONGS(VLAN_N_VID)]; - unsigned long active_svlans[BITS_TO_LONGS(VLAN_N_VID)]; u16 seid; u16 id; DECLARE_BITMAP(state, __IAVF_VSI_STATE_SIZE__); @@ -163,7 +161,9 @@ enum iavf_vlan_state_t { IAVF_VLAN_ADD, /* filter needs to be added */ IAVF_VLAN_IS_NEW, /* filter is new, wait for PF answer */ IAVF_VLAN_ACTIVE, /* filter is accepted by PF */ - IAVF_VLAN_REMOVE, /* filter needs to be removed */ + IAVF_VLAN_DISABLE, /* filter needs to be deleted by PF, then marked INACTIVE */ + IAVF_VLAN_INACTIVE, /* filter is inactive, we are in IFF_DOWN */ + IAVF_VLAN_REMOVE, /* filter needs to be removed from list */ }; struct iavf_vlan_filter { @@ -261,6 +261,7 @@ struct iavf_adapter { wait_queue_head_t vc_waitqueue; struct iavf_q_vector *q_vectors; struct list_head vlan_filter_list; + int num_vlan_filters; struct list_head 
mac_filter_list; struct mutex crit_lock; struct mutex client_lock; diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index fe9798e4b4ace..2de4baff4c205 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -792,6 +792,7 @@ iavf_vlan_filter *iavf_add_vlan(struct iavf_adapter *adapter, list_add_tail(&f->list, &adapter->vlan_filter_list); f->state = IAVF_VLAN_ADD; + adapter->num_vlan_filters++; adapter->aq_required |= IAVF_FLAG_AQ_ADD_VLAN_FILTER; } @@ -828,14 +829,18 @@ static void iavf_del_vlan(struct iavf_adapter *adapter, struct iavf_vlan vlan) **/ static void iavf_restore_filters(struct iavf_adapter *adapter) { - u16 vid; + struct iavf_vlan_filter *f; /* re-add all VLAN filters */ - for_each_set_bit(vid, adapter->vsi.active_cvlans, VLAN_N_VID) - iavf_add_vlan(adapter, IAVF_VLAN(vid, ETH_P_8021Q)); + spin_lock_bh(&adapter->mac_vlan_list_lock); - for_each_set_bit(vid, adapter->vsi.active_svlans, VLAN_N_VID) - iavf_add_vlan(adapter, IAVF_VLAN(vid, ETH_P_8021AD)); + list_for_each_entry(f, &adapter->vlan_filter_list, list) { + if (f->state == IAVF_VLAN_INACTIVE) + f->state = IAVF_VLAN_ADD; + } + + spin_unlock_bh(&adapter->mac_vlan_list_lock); + adapter->aq_required |= IAVF_FLAG_AQ_ADD_VLAN_FILTER; } /** @@ -844,8 +849,7 @@ static void iavf_restore_filters(struct iavf_adapter *adapter) */ u16 iavf_get_num_vlans_added(struct iavf_adapter *adapter) { - return bitmap_weight(adapter->vsi.active_cvlans, VLAN_N_VID) + - bitmap_weight(adapter->vsi.active_svlans, VLAN_N_VID); + return adapter->num_vlan_filters; } /** @@ -928,11 +932,6 @@ static int iavf_vlan_rx_kill_vid(struct net_device *netdev, return 0; iavf_del_vlan(adapter, IAVF_VLAN(vid, be16_to_cpu(proto))); - if (proto == cpu_to_be16(ETH_P_8021Q)) - clear_bit(vid, adapter->vsi.active_cvlans); - else - clear_bit(vid, adapter->vsi.active_svlans); - return 0; } @@ -1293,16 +1292,11 @@ static void 
iavf_clear_mac_vlan_filters(struct iavf_adapter *adapter) } } - /* remove all VLAN filters */ + /* disable all VLAN filters */ list_for_each_entry_safe(vlf, vlftmp, &adapter->vlan_filter_list, - list) { - if (vlf->state == IAVF_VLAN_ADD) { - list_del(&vlf->list); - kfree(vlf); - } else { - vlf->state = IAVF_VLAN_REMOVE; - } - } + list) + vlf->state = IAVF_VLAN_DISABLE; + spin_unlock_bh(&adapter->mac_vlan_list_lock); } @@ -2914,6 +2908,7 @@ static void iavf_disable_vf(struct iavf_adapter *adapter) list_del(&fv->list); kfree(fv); } + adapter->num_vlan_filters = 0; spin_unlock_bh(&adapter->mac_vlan_list_lock); @@ -3131,9 +3126,6 @@ static void iavf_reset_task(struct work_struct *work) adapter->aq_required |= IAVF_FLAG_AQ_ADD_CLOUD_FILTER; iavf_misc_irq_enable(adapter); - bitmap_clear(adapter->vsi.active_cvlans, 0, VLAN_N_VID); - bitmap_clear(adapter->vsi.active_svlans, 0, VLAN_N_VID); - mod_delayed_work(adapter->wq, &adapter->watchdog_task, 2); /* We were running when the reset started, so we need to restore some diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 5047b4c83718d..9afbbdac35903 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -643,15 +643,9 @@ static void iavf_vlan_add_reject(struct iavf_adapter *adapter) spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { if (f->state == IAVF_VLAN_IS_NEW) { - if (f->vlan.tpid == ETH_P_8021Q) - clear_bit(f->vlan.vid, - adapter->vsi.active_cvlans); - else - clear_bit(f->vlan.vid, - adapter->vsi.active_svlans); - list_del(&f->list); kfree(f); + adapter->num_vlan_filters--; } } spin_unlock_bh(&adapter->mac_vlan_list_lock); @@ -824,7 +818,12 @@ void iavf_del_vlans(struct iavf_adapter *adapter) !VLAN_FILTERING_ALLOWED(adapter)) { list_del(&f->list); kfree(f); - } else if (f->state == IAVF_VLAN_REMOVE) { + 
adapter->num_vlan_filters--; + } else if (f->state == IAVF_VLAN_DISABLE && + !VLAN_FILTERING_ALLOWED(adapter)) { + f->state = IAVF_VLAN_INACTIVE; + } else if (f->state == IAVF_VLAN_REMOVE || + f->state == IAVF_VLAN_DISABLE) { count++; } } @@ -856,11 +855,18 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vvfl->vsi_id = adapter->vsi_res->vsi_id; vvfl->num_elements = count; list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->state == IAVF_VLAN_REMOVE) { + if (f->state == IAVF_VLAN_DISABLE) { vvfl->vlan_id[i] = f->vlan.vid; + f->state = IAVF_VLAN_INACTIVE; i++; + if (i == count) + break; + } else if (f->state == IAVF_VLAN_REMOVE) { + vvfl->vlan_id[i] = f->vlan.vid; list_del(&f->list); kfree(f); + adapter->num_vlan_filters--; + i++; if (i == count) break; } @@ -900,7 +906,8 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vvfl_v2->vport_id = adapter->vsi_res->vsi_id; vvfl_v2->num_elements = count; list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->state == IAVF_VLAN_REMOVE) { + if (f->state == IAVF_VLAN_DISABLE || + f->state == IAVF_VLAN_REMOVE) { struct virtchnl_vlan_supported_caps *filtering_support = &adapter->vlan_v2_caps.filtering.filtering_support; struct virtchnl_vlan *vlan; @@ -914,8 +921,13 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vlan->tci = f->vlan.vid; vlan->tpid = f->vlan.tpid; - list_del(&f->list); - kfree(f); + if (f->state == IAVF_VLAN_DISABLE) { + f->state = IAVF_VLAN_INACTIVE; + } else { + list_del(&f->list); + kfree(f); + adapter->num_vlan_filters--; + } i++; if (i == count) break; @@ -2443,15 +2455,8 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry(f, &adapter->vlan_filter_list, list) { - if (f->state == IAVF_VLAN_IS_NEW) { + if (f->state == IAVF_VLAN_IS_NEW) f->state = IAVF_VLAN_ACTIVE; - if (f->vlan.tpid == ETH_P_8021Q) - set_bit(f->vlan.vid, - adapter->vsi.active_cvlans); - else - 
set_bit(f->vlan.vid, - adapter->vsi.active_svlans); - } } spin_unlock_bh(&adapter->mac_vlan_list_lock); } From 066b86787fa3d97b7aefb5ac0a99a22dad2d15f8 Mon Sep 17 00:00:00 2001 From: Felix Huettner Date: Wed, 5 Apr 2023 07:53:41 +0000 Subject: [PATCH 12/50] net: openvswitch: fix race on port output Assume the following setup on a single machine: 1. An openvswitch instance with one bridge and default flows 2. two network namespaces "server" and "client" 3. two ovs interfaces "server" and "client" on the bridge 4. for each ovs interface a veth pair with a matching name and 32 rx and tx queues 5. move the ends of the veth pairs to the respective network namespaces 6. assign ip addresses to each of the veth ends in the namespaces (needs to be the same subnet) 7. start some http server on the server network namespace 8. test if a client in the client namespace can reach the http server When following the actions below, the host has a chance of getting a cpu stuck in an infinite loop: 1. send a large number of parallel requests to the http server (around 3000 curls should work) 2. in parallel delete the network namespace (do not delete interfaces or stop the server, just kill the namespace) There is a low chance that this will cause the below kernel cpu stuck message. If this does not happen just retry. Below there is also the output of bpftrace for the functions mentioned in the output. The series of events happening here is: 1. the network namespace is deleted calling `unregister_netdevice_many_notify` somewhere in the process 2. this first sets `NETREG_UNREGISTERING` on both ends of the veth and then runs `synchronize_net` 3. it then calls `call_netdevice_notifiers` with `NETDEV_UNREGISTER` 4. this is then handled by `dp_device_event` which calls `ovs_netdev_detach_dev` (if a vport is found, which is the case for the veth interface attached to ovs) 5. this removes the rx_handlers of the device but does not prevent packets from being sent to the device 6.
`dp_device_event` then queues the vport deletion as background work, as an ovs_lock is needed that we do not hold in the unregistration path 7. `unregister_netdevice_many_notify` continues to call `netdev_unregister_kobject` which sets `real_num_tx_queues` to 0 8. port deletion continues (but details are not relevant for this issue) 9. at some future point the background task deletes the vport The problem occurs if, after 7. but before 9., a packet is sent to the ovs vport (which is not deleted at this point in time), which forwards it to the `dev_queue_xmit` flow even though the device is unregistering. In the `skb_tx_hash` path (which is called from `dev_queue_xmit`) there is a while loop (if the packet has an rx_queue recorded) that is infinite if `dev->real_num_tx_queues` is zero. To prevent this from happening we update `do_output` to handle devices without carrier the same as if the device is not found (which would be the code path after 9. is done). Additionally we now produce a warning in `skb_tx_hash` if we will hit the infinite loop.
bpftrace (first word is function name): __dev_queue_xmit server: real_num_tx_queues: 1, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 1 netdev_core_pick_tx server: addr: 0xffff9f0a46d4a000 real_num_tx_queues: 1, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 1 dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 2, reg_state: 1 synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024 synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024 synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024 synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024 dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 6, reg_state: 2 ovs_netdev_detach_dev server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, reg_state: 2 netdev_rx_handler_unregister server: real_num_tx_queues: 1, cpu: 9, pid: 21024, tid: 21024, reg_state: 2 synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024 netdev_rx_handler_unregister ret server: real_num_tx_queues: 1, cpu: 9, pid: 21024, tid: 21024, reg_state: 2 dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 27, reg_state: 2 dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 22, reg_state: 2 dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 18, reg_state: 2 netdev_unregister_kobject: real_num_tx_queues: 1, cpu: 9, pid: 21024, tid: 21024 synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024 ovs_vport_send server: real_num_tx_queues: 0, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 2 __dev_queue_xmit server: real_num_tx_queues: 0, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 2 netdev_core_pick_tx server: addr: 0xffff9f0a46d4a000 real_num_tx_queues: 0, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 2 broken device server: real_num_tx_queues: 0, cpu: 2, pid: 
28024, tid: 28024 ovs_dp_detach_port server: real_num_tx_queues: 0 cpu 9, pid: 9124, tid: 9124, reg_state: 2 synchronize_rcu_expedited: cpu 9, pid: 33604, tid: 33604 stuck message: watchdog: BUG: soft lockup - CPU#5 stuck for 26s! [curl:1929279] Modules linked in: veth pktgen bridge stp llc ip_set_hash_net nft_counter xt_set nft_compat nf_tables ip_set_hash_ip ip_set nfnetlink_cttimeout nfnetlink openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 tls binfmt_misc nls_iso8859_1 input_leds joydev serio_raw dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua sch_fq_codel drm efi_pstore virtio_rng ip_tables x_tables autofs4 btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear hid_generic usbhid hid crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel virtio_net ahci net_failover crypto_simd cryptd psmouse libahci virtio_blk failover CPU: 5 PID: 1929279 Comm: curl Not tainted 5.15.0-67-generic #74-Ubuntu Hardware name: OpenStack Foundation OpenStack Nova, BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:netdev_pick_tx+0xf1/0x320 Code: 00 00 8d 48 ff 0f b7 c1 66 39 ca 0f 86 e9 01 00 00 45 0f b7 ff 41 39 c7 0f 87 5b 01 00 00 44 29 f8 41 39 c7 0f 87 4f 01 00 00 f2 0f 1f 44 00 00 49 8b 94 24 28 04 00 00 48 85 d2 0f 84 53 01 RSP: 0018:ffffb78b40298820 EFLAGS: 00000246 RAX: 0000000000000000 RBX: ffff9c8773adc2e0 RCX: 000000000000083f RDX: 0000000000000000 RSI: ffff9c8773adc2e0 RDI: ffff9c870a25e000 RBP: ffffb78b40298858 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9c870a25e000 R13: ffff9c870a25e000 R14: ffff9c87fe043480 R15: 0000000000000000 FS: 00007f7b80008f00(0000) GS:ffff9c8e5f740000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7b80f6a0b0 CR3: 0000000329d66000 CR4: 0000000000350ee0 Call Trace: 
netdev_core_pick_tx+0xa4/0xb0 __dev_queue_xmit+0xf8/0x510 ? __bpf_prog_exit+0x1e/0x30 dev_queue_xmit+0x10/0x20 ovs_vport_send+0xad/0x170 [openvswitch] do_output+0x59/0x180 [openvswitch] do_execute_actions+0xa80/0xaa0 [openvswitch] ? kfree+0x1/0x250 ? kfree+0x1/0x250 ? kprobe_perf_func+0x4f/0x2b0 ? flow_lookup.constprop.0+0x5c/0x110 [openvswitch] ovs_execute_actions+0x4c/0x120 [openvswitch] ovs_dp_process_packet+0xa1/0x200 [openvswitch] ? ovs_ct_update_key.isra.0+0xa8/0x120 [openvswitch] ? ovs_ct_fill_key+0x1d/0x30 [openvswitch] ? ovs_flow_key_extract+0x2db/0x350 [openvswitch] ovs_vport_receive+0x77/0xd0 [openvswitch] ? __htab_map_lookup_elem+0x4e/0x60 ? bpf_prog_680e8aff8547aec1_kfree+0x3b/0x714 ? trace_call_bpf+0xc8/0x150 ? kfree+0x1/0x250 ? kfree+0x1/0x250 ? kprobe_perf_func+0x4f/0x2b0 ? kprobe_perf_func+0x4f/0x2b0 ? __mod_memcg_lruvec_state+0x63/0xe0 netdev_port_receive+0xc4/0x180 [openvswitch] ? netdev_port_receive+0x180/0x180 [openvswitch] netdev_frame_hook+0x1f/0x40 [openvswitch] __netif_receive_skb_core.constprop.0+0x23d/0xf00 __netif_receive_skb_one_core+0x3f/0xa0 __netif_receive_skb+0x15/0x60 process_backlog+0x9e/0x170 __napi_poll+0x33/0x180 net_rx_action+0x126/0x280 ? ttwu_do_activate+0x72/0xf0 __do_softirq+0xd9/0x2e7 ? rcu_report_exp_cpu_mult+0x1b0/0x1b0 do_softirq+0x7d/0xb0 __local_bh_enable_ip+0x54/0x60 ip_finish_output2+0x191/0x460 __ip_finish_output+0xb7/0x180 ip_finish_output+0x2e/0xc0 ip_output+0x78/0x100 ? __ip_finish_output+0x180/0x180 ip_local_out+0x5e/0x70 __ip_queue_xmit+0x184/0x440 ? tcp_syn_options+0x1f9/0x300 ip_queue_xmit+0x15/0x20 __tcp_transmit_skb+0x910/0x9c0 ? __mod_memcg_state+0x44/0xa0 tcp_connect+0x437/0x4e0 ? ktime_get_with_offset+0x60/0xf0 tcp_v4_connect+0x436/0x530 __inet_stream_connect+0xd4/0x3a0 ? kprobe_perf_func+0x4f/0x2b0 ? aa_sk_perm+0x43/0x1c0 inet_stream_connect+0x3b/0x60 __sys_connect_file+0x63/0x70 __sys_connect+0xa6/0xd0 ? setfl+0x108/0x170 ? do_fcntl+0xe8/0x5a0 __x64_sys_connect+0x18/0x20 do_syscall_64+0x5c/0xc0 ? 
__x64_sys_fcntl+0xa9/0xd0 ? exit_to_user_mode_prepare+0x37/0xb0 ? syscall_exit_to_user_mode+0x27/0x50 ? do_syscall_64+0x69/0xc0 ? __sys_setsockopt+0xea/0x1e0 ? exit_to_user_mode_prepare+0x37/0xb0 ? syscall_exit_to_user_mode+0x27/0x50 ? __x64_sys_setsockopt+0x1f/0x30 ? do_syscall_64+0x69/0xc0 ? irqentry_exit+0x1d/0x30 ? exc_page_fault+0x89/0x170 entry_SYSCALL_64_after_hwframe+0x61/0xcb RIP: 0033:0x7f7b8101c6a7 Code: 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2a 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 18 89 54 24 0c 48 89 34 24 89 RSP: 002b:00007ffffd6b2198 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f7b8101c6a7 RDX: 0000000000000010 RSI: 00007ffffd6b2360 RDI: 0000000000000005 RBP: 0000561f1370d560 R08: 00002795ad21d1ac R09: 0030312e302e302e R10: 00007ffffd73f080 R11: 0000000000000246 R12: 0000561f1370c410 R13: 0000000000000000 R14: 0000000000000005 R15: 0000000000000000 Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action") Co-developed-by: Luca Czesla Signed-off-by: Luca Czesla Signed-off-by: Felix Huettner Reviewed-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/ZC0pBXBAgh7c76CA@kernel-bug-kernel-bug Signed-off-by: Jakub Kicinski --- net/core/dev.c | 1 + net/openvswitch/actions.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 253584777101f..48067321c0dba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3199,6 +3199,7 @@ static u16 skb_tx_hash(const struct net_device *dev, } if (skb_rx_queue_recorded(skb)) { + DEBUG_NET_WARN_ON_ONCE(qcount == 0); hash = skb_get_rx_queue(skb); if (hash >= qoffset) hash -= qoffset; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index ca3ebfdb30231..a8cf9a88758ef 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -913,7 +913,7 @@ static void do_output(struct datapath *dp, 
struct sk_buff *skb, int out_port, { struct vport *vport = ovs_vport_rcu(dp, out_port); - if (likely(vport)) { + if (likely(vport && netif_carrier_ok(vport->dev))) { u16 mru = OVS_CB(skb)->mru; u32 cutlen = OVS_CB(skb)->cutlen; From bdaaecc127d471c422ee9e994978617c8aa79e1e Mon Sep 17 00:00:00 2001 From: "Radu Pirea (OSS)" Date: Thu, 6 Apr 2023 12:59:53 +0300 Subject: [PATCH 13/50] net: phy: nxp-c45-tja11xx: fix unsigned long multiplication overflow Any multiplication between GENMASK(31, 0) and a number bigger than 1 will be truncated because of the overflow, if the size of unsigned long is 32 bits. Replaced GENMASK with GENMASK_ULL to make sure that multiplication will be between 64 bits values. Cc: # 5.15+ Fixes: 514def5dd339 ("phy: nxp-c45-tja11xx: add timestamping support") Signed-off-by: Radu Pirea (OSS) Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20230406095953.75622-1-radu-nicolae.pirea@oss.nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/nxp-c45-tja11xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c index 5813b07242ce1..1b7d6941b0f35 100644 --- a/drivers/net/phy/nxp-c45-tja11xx.c +++ b/drivers/net/phy/nxp-c45-tja11xx.c @@ -191,7 +191,7 @@ #define MAX_ID_PS 2260U #define DEFAULT_ID_PS 2000U -#define PPM_TO_SUBNS_INC(ppb) div_u64(GENMASK(31, 0) * (ppb) * \ +#define PPM_TO_SUBNS_INC(ppb) div_u64(GENMASK_ULL(31, 0) * (ppb) * \ PTP_CLK_PERIOD_100BT1, NSEC_PER_SEC) #define NXP_C45_SKB_CB(skb) ((struct nxp_c45_skb_cb *)(skb)->cb) From 5cc33f139e11b893ff6dc60d8a0ae865a65521ac Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 6 Apr 2023 17:14:26 -0700 Subject: [PATCH 14/50] r8152: Add __GFP_NOWARN to big allocations When memory is a little tight on my system, it's pretty easy to see warnings that look like this. ksoftirqd/0: page allocation failure: order:3, mode:0x40a20(GFP_ATOMIC|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0 ... 
Call trace: dump_backtrace+0x0/0x1e8 show_stack+0x20/0x2c dump_stack_lvl+0x60/0x78 dump_stack+0x18/0x38 warn_alloc+0x104/0x174 __alloc_pages+0x588/0x67c alloc_rx_agg+0xa0/0x190 [r8152 ...] r8152_poll+0x270/0x760 [r8152 ...] __napi_poll+0x44/0x1ec net_rx_action+0x100/0x300 __do_softirq+0xec/0x38c run_ksoftirqd+0x38/0xec smpboot_thread_fn+0xb8/0x248 kthread+0x134/0x154 ret_from_fork+0x10/0x20 On a fragmented system it's normal that order 3 allocations will sometimes fail, especially atomic ones. The driver handles these failures fine and the WARN just creates spam in the logs for this case. The __GFP_NOWARN flag is exactly for this situation, so add it to the allocation. NOTE: my testing is on a 5.15 system, but there should be no reason that this would be fundamentally different on a mainline kernel. Signed-off-by: Douglas Anderson Acked-by: Hayes Wang Link: https://lore.kernel.org/r/20230406171411.1.I84dbef45786af440fd269b71e9436a96a8e7a152@changeid Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index decb5ba56a259..0fc4b959edc18 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -1943,7 +1943,7 @@ static struct rx_agg *alloc_rx_agg(struct r8152 *tp, gfp_t mflags) if (!rx_agg) return NULL; - rx_agg->page = alloc_pages(mflags | __GFP_COMP, order); + rx_agg->page = alloc_pages(mflags | __GFP_COMP | __GFP_NOWARN, order); if (!rx_agg->page) goto free_rx; From 813c2dd78618f108fdcf9cd726ea90f081ee2881 Mon Sep 17 00:00:00 2001 From: Ivan Bornyakov Date: Thu, 6 Apr 2023 16:08:32 +0300 Subject: [PATCH 15/50] net: sfp: initialize sfp->i2c_block_size at sfp allocation sfp->i2c_block_size is initialized at SFP module insertion in sfp_sm_mod_probe(). Because of that, if SFP module was never inserted since boot, sfp_read() call will lead to zero-length I2C read attempt, and not all I2C controllers are happy with zero-length reads. 
One way to issue sfp_read() on empty SFP cage is to execute ethtool -m. If SFP module was never plugged since boot, there will be a zero-length I2C read attempt. # ethtool -m xge0 i2c i2c-3: adapter quirk: no zero length (addr 0x0050, size 0, read) Cannot get Module EEPROM data: Operation not supported If SFP module was plugged then removed at least once, sfp->i2c_block_size will be initialized and ethtool -m will fail with different exit code and without I2C error # ethtool -m xge0 Cannot get Module EEPROM data: Remote I/O error Fix this by initializing sfp->i2c_block_size at struct sfp allocation stage so no wild sfp_read() could issue zero-length I2C read. Signed-off-by: Ivan Bornyakov Fixes: 0d035bed2a4a ("net: sfp: VSOL V2801F / CarlitoxxPro CPGOS03-0490 v2.0 workaround") Cc: stable@vger.kernel.org Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/sfp.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index 8af10bb53e57b..81edc457c5a14 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -210,6 +210,12 @@ static const enum gpiod_flags gpio_flags[] = { #define SFP_PHY_ADDR 22 #define SFP_PHY_ADDR_ROLLBALL 17 +/* SFP_EEPROM_BLOCK_SIZE is the size of data chunk to read the EEPROM + * at a time. Some SFP modules and also some Linux I2C drivers do not like + * reads longer than 16 bytes. + */ +#define SFP_EEPROM_BLOCK_SIZE 16 + struct sff_data { unsigned int gpios; bool (*module_supported)(const struct sfp_eeprom_id *id); @@ -1929,11 +1935,7 @@ static int sfp_sm_mod_probe(struct sfp *sfp, bool report) u8 check; int ret; - /* Some SFP modules and also some Linux I2C drivers do not like reads - * longer than 16 bytes, so read the EEPROM in chunks of 16 bytes at - * a time.
- */ - sfp->i2c_block_size = 16; + sfp->i2c_block_size = SFP_EEPROM_BLOCK_SIZE; ret = sfp_read(sfp, false, 0, &id.base, sizeof(id.base)); if (ret < 0) { @@ -2621,6 +2623,7 @@ static struct sfp *sfp_alloc(struct device *dev) return ERR_PTR(-ENOMEM); sfp->dev = dev; + sfp->i2c_block_size = SFP_EEPROM_BLOCK_SIZE; mutex_init(&sfp->sm_mutex); mutex_init(&sfp->st_mutex); From bef227c1537cb8005311c0842bc5449e8c7a5973 Mon Sep 17 00:00:00 2001 From: Ivan Bornyakov Date: Thu, 6 Apr 2023 16:08:33 +0300 Subject: [PATCH 16/50] net: sfp: avoid EEPROM read of absent SFP module If SFP module is not present, it is sensible to fail sfp_module_eeprom() and sfp_module_eeprom_by_page() early to avoid excessive I2C transfers which are guaranteed to fail. Suggested-by: Andrew Lunn Signed-off-by: Ivan Bornyakov Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/sfp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index 81edc457c5a14..bf345032d450c 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -2487,6 +2487,9 @@ static int sfp_module_eeprom(struct sfp *sfp, struct ethtool_eeprom *ee, unsigned int first, last, len; int ret; + if (!(sfp->state & SFP_F_PRESENT)) + return -ENODEV; + if (ee->len == 0) return -EINVAL; @@ -2519,6 +2522,9 @@ static int sfp_module_eeprom_by_page(struct sfp *sfp, const struct ethtool_module_eeprom *page, struct netlink_ext_ack *extack) { + if (!(sfp->state & SFP_F_PRESENT)) + return -ENODEV; + if (page->bank) { NL_SET_ERR_MSG(extack, "Banks not supported"); return -EOPNOTSUPP; From 19cf60bf63cbaf5262eac400c707966e19999b83 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 24 Mar 2023 10:57:55 -0700 Subject: [PATCH 17/50] Bluetooth: hci_conn: Fix not cleaning up on LE Connection failure hci_connect_le_scan_cleanup shall always be invoked to cleanup the states and re-enable passive scanning if necessary, otherwise it may cause the pending action to stay
active causing multiple attempts to connect. Fixes: 9b3628d79b46 ("Bluetooth: hci_sync: Cleanup hci_conn if it cannot be aborted") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 52 +++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 17b946f9ba317..5af3f6b011c95 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -68,7 +68,7 @@ static const struct sco_param esco_param_msbc[] = { }; /* This function requires the caller holds hdev->lock */ -static void hci_connect_le_scan_cleanup(struct hci_conn *conn) +static void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status) { struct hci_conn_params *params; struct hci_dev *hdev = conn->hdev; @@ -88,9 +88,28 @@ static void hci_connect_le_scan_cleanup(struct hci_conn *conn) params = hci_pend_le_action_lookup(&hdev->pend_le_conns, bdaddr, bdaddr_type); - if (!params || !params->explicit_connect) + if (!params) return; + if (params->conn) { + hci_conn_drop(params->conn); + hci_conn_put(params->conn); + params->conn = NULL; + } + + if (!params->explicit_connect) + return; + + /* If the status indicates successful cancellation of + * the attempt (i.e. Unknown Connection Id) there's no point of + * notifying failure since we'll go back to keep trying to + * connect. The only exception is explicit connect requests + * where a timeout + cancel does indicate an actual failure. + */ + if (status && status != HCI_ERROR_UNKNOWN_CONN_ID) + mgmt_connect_failed(hdev, &conn->dst, conn->type, + conn->dst_type, status); + /* The connection attempt was doing scan for new RPA, and is * in scan phase. If params are not associated with any other * autoconnect action, remove them completely. 
If they are, just unmark @@ -178,7 +197,7 @@ static void le_scan_cleanup(struct work_struct *work) rcu_read_unlock(); if (c == conn) { - hci_connect_le_scan_cleanup(conn); + hci_connect_le_scan_cleanup(conn, 0x00); hci_conn_cleanup(conn); } @@ -1179,31 +1198,8 @@ EXPORT_SYMBOL(hci_get_route); static void hci_le_conn_failed(struct hci_conn *conn, u8 status) { struct hci_dev *hdev = conn->hdev; - struct hci_conn_params *params; - params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst, - conn->dst_type); - if (params && params->conn) { - hci_conn_drop(params->conn); - hci_conn_put(params->conn); - params->conn = NULL; - } - - /* If the status indicates successful cancellation of - * the attempt (i.e. Unknown Connection Id) there's no point of - * notifying failure since we'll go back to keep trying to - * connect. The only exception is explicit connect requests - * where a timeout + cancel does indicate an actual failure. - */ - if (status != HCI_ERROR_UNKNOWN_CONN_ID || - (params && params->explicit_connect)) - mgmt_connect_failed(hdev, &conn->dst, conn->type, - conn->dst_type, status); - - /* Since we may have temporarily stopped the background scanning in - * favor of connection establishment, we should restart it. - */ - hci_update_passive_scan(hdev); + hci_connect_le_scan_cleanup(conn, status); /* Enable advertising in case this was a failed connection * attempt as a peripheral. 
@@ -1240,7 +1236,7 @@ static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) hci_dev_lock(hdev); if (!err) { - hci_connect_le_scan_cleanup(conn); + hci_connect_le_scan_cleanup(conn, 0x00); goto done; } From b62e72200eaad523f08d8319bba50fc652e032a8 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 24 Mar 2023 13:18:20 -0700 Subject: [PATCH 18/50] Bluetooth: Fix printing errors if LE Connection times out This fixes errors like below when LE Connection times out since that is actually not a controller error: Bluetooth: hci0: Opcode 0x200d failed: -110 Bluetooth: hci0: request failed to create LE connection: err -110 Instead the code shall properly detect if -ETIMEDOUT is returned and send HCI_OP_LE_CREATE_CONN_CANCEL to give up on the connection. Link: https://github.com/bluez/bluez/issues/340 Fixes: 8e8b92ee60de ("Bluetooth: hci_sync: Add hci_le_create_conn_sync") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_conn.c | 7 +++++-- net/bluetooth/hci_event.c | 16 ++++++---------- net/bluetooth/hci_sync.c | 13 ++++++++++--- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6ed9b4d546a7a..d5311ceb21c62 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -954,6 +954,7 @@ enum { HCI_CONN_STK_ENCRYPT, HCI_CONN_AUTH_INITIATOR, HCI_CONN_DROP, + HCI_CONN_CANCEL, HCI_CONN_PARAM_REMOVAL_PEND, HCI_CONN_NEW_LINK_KEY, HCI_CONN_SCANNING, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 5af3f6b011c95..e4aee5950c36a 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1233,6 +1233,8 @@ static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn = data; + bt_dev_dbg(hdev, "err %d", err); + hci_dev_lock(hdev); if (!err) { @@ -1240,8 +1242,6 @@ static void create_le_conn_complete(struct hci_dev
*hdev, void *data, int err) goto done; } - bt_dev_err(hdev, "request failed to create LE connection: err %d", err); - /* Check if connection is still pending */ if (conn != hci_lookup_le_connect(hdev)) goto done; @@ -2771,6 +2771,9 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) { int r = 0; + if (test_and_set_bit(HCI_CONN_CANCEL, &conn->flags)) + return 0; + switch (conn->state) { case BT_CONNECTED: case BT_CONFIG: diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ad92a4be58517..e68f2a7d863ac 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2881,16 +2881,6 @@ static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr, conn->resp_addr_type = peer_addr_type; bacpy(&conn->resp_addr, peer_addr); - - /* We don't want the connection attempt to stick around - * indefinitely since LE doesn't have a page timeout concept - * like BR/EDR. Set a timer for any connection that doesn't use - * the accept list for connecting. - */ - if (filter_policy == HCI_LE_USE_PEER_ADDR) - queue_delayed_work(conn->hdev->workqueue, - &conn->le_conn_timeout, - conn->conn_timeout); } static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) @@ -5902,6 +5892,12 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, if (status) goto unlock; + /* Drop the connection if it has been aborted */ + if (test_bit(HCI_CONN_CANCEL, &conn->flags)) { + hci_conn_drop(conn); + goto unlock; + } + if (conn->dst_type == ADDR_LE_DEV_PUBLIC) addr_type = BDADDR_LE_PUBLIC; else diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 5a6aa1627791b..632be12672887 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -246,8 +246,9 @@ int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen, skb = __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, sk); if (IS_ERR(skb)) { - bt_dev_err(hdev, "Opcode 0x%4x failed: %ld", opcode, - PTR_ERR(skb)); + if (!event) + bt_dev_err(hdev, 
"Opcode 0x%4x failed: %ld", opcode, + PTR_ERR(skb)); return PTR_ERR(skb); } @@ -5126,8 +5127,11 @@ static int hci_le_connect_cancel_sync(struct hci_dev *hdev, if (test_bit(HCI_CONN_SCANNING, &conn->flags)) return 0; + if (test_and_set_bit(HCI_CONN_CANCEL, &conn->flags)) + return 0; + return __hci_cmd_sync_status(hdev, HCI_OP_LE_CREATE_CONN_CANCEL, - 6, &conn->dst, HCI_CMD_TIMEOUT); + 0, NULL, HCI_CMD_TIMEOUT); } static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn) @@ -6102,6 +6106,9 @@ int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) conn->conn_timeout, NULL); done: + if (err == -ETIMEDOUT) + hci_le_connect_cancel_sync(hdev, conn); + /* Re-enable advertising after the connection attempt is finished. */ hci_resume_advertising_sync(hdev); return err; From c95930abd687fcd1aa040dc4fe90dff947916460 Mon Sep 17 00:00:00 2001 From: Min Li Date: Sat, 4 Mar 2023 22:23:30 +0800 Subject: [PATCH 19/50] Bluetooth: Fix race condition in hidp_session_thread There is a potential race condition in hidp_session_thread that may lead to use-after-free. For instance, the timer is active while hidp_del_timer is called in hidp_session_thread(). After hidp_session_put, then 'session' will be freed, causing kernel panic when hidp_idle_timeout is running. The solution is to use del_timer_sync instead of del_timer. Here is the call trace: ? 
hidp_session_probe+0x780/0x780 call_timer_fn+0x2d/0x1e0 __run_timers.part.0+0x569/0x940 hidp_session_probe+0x780/0x780 call_timer_fn+0x1e0/0x1e0 ktime_get+0x5c/0xf0 lapic_next_deadline+0x2c/0x40 clockevents_program_event+0x205/0x320 run_timer_softirq+0xa9/0x1b0 __do_softirq+0x1b9/0x641 __irq_exit_rcu+0xdc/0x190 irq_exit_rcu+0xe/0x20 sysvec_apic_timer_interrupt+0xa1/0xc0 Cc: stable@vger.kernel.org Signed-off-by: Min Li Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hidp/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index bed1a7b9205c2..707f229f896a1 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -433,7 +433,7 @@ static void hidp_set_timer(struct hidp_session *session) static void hidp_del_timer(struct hidp_session *session) { if (session->idle_to > 0) - del_timer(&session->timer); + del_timer_sync(&session->timer); } static void hidp_process_report(struct hidp_session *session, int type, From 73f7b171b7c09139eb3c6a5677c200dc1be5f318 Mon Sep 17 00:00:00 2001 From: Zheng Wang Date: Thu, 9 Mar 2023 00:45:01 +0800 Subject: [PATCH 20/50] Bluetooth: btsdio: fix use after free bug in btsdio_remove due to race condition In btsdio_probe, the data->work is bound with btsdio_work. It will be started in btsdio_send_frame. If the btsdio_remove runs with an unfinished work, there may be a race condition that hdev is freed but used in btsdio_work. Fix it by canceling the work before doing cleanup in btsdio_remove.
Signed-off-by: Zheng Wang Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btsdio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c index 02893600db390..51000320e1ea8 100644 --- a/drivers/bluetooth/btsdio.c +++ b/drivers/bluetooth/btsdio.c @@ -358,6 +358,7 @@ static void btsdio_remove(struct sdio_func *func) if (!data) return; + cancel_work_sync(&data->work); hdev = data->hdev; sdio_set_drvdata(func, NULL); From b76abe4648c1acc791a207e7c08d1719eb9f4ea8 Mon Sep 17 00:00:00 2001 From: Sasha Finkelstein Date: Fri, 10 Mar 2023 11:28:42 +0100 Subject: [PATCH 21/50] bluetooth: btbcm: Fix logic error in forming the board name. This patch fixes an incorrect loop exit condition in code that replaces '/' symbols in the board name. There might also be a memory corruption issue here, but it is unlikely to be a real problem. Cc: Signed-off-by: Sasha Finkelstein Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btbcm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index 3006e2a0f37e1..43e98a598bd9a 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -511,7 +511,7 @@ static const char *btbcm_get_board_name(struct device *dev) len = strlen(tmp) + 1; board_type = devm_kzalloc(dev, len, GFP_KERNEL); strscpy(board_type, tmp, len); - for (i = 0; i < board_type[i]; i++) { + for (i = 0; i < len; i++) { if (board_type[i] == '/') board_type[i] = '-'; } From 9a8ec9e8ebb5a7c0cfbce2d6b4a6b67b2b78e8f3 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 30 Mar 2023 14:15:50 -0700 Subject: [PATCH 22/50] Bluetooth: SCO: Fix possible circular locking dependency on sco_connect_cfm This attempts to fix the following trace: ====================================================== WARNING: possible circular locking dependency detected 6.3.0-rc2-g0b93eeba4454 #4703 Not tainted 
------------------------------------------------------ kworker/u3:0/46 is trying to acquire lock: ffff888001fd9130 (sk_lock-AF_BLUETOOTH-BTPROTO_SCO){+.+.}-{0:0}, at: sco_connect_cfm+0x118/0x4a0 but task is already holding lock: ffffffff831e3340 (hci_cb_list_lock){+.+.}-{3:3}, at: hci_sync_conn_complete_evt+0x1ad/0x3d0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (hci_cb_list_lock){+.+.}-{3:3}: __mutex_lock+0x13b/0xcc0 hci_sync_conn_complete_evt+0x1ad/0x3d0 hci_event_packet+0x55c/0x7c0 hci_rx_work+0x34c/0xa00 process_one_work+0x575/0x910 worker_thread+0x89/0x6f0 kthread+0x14e/0x180 ret_from_fork+0x2b/0x50 -> #1 (&hdev->lock){+.+.}-{3:3}: __mutex_lock+0x13b/0xcc0 sco_sock_connect+0xfc/0x630 __sys_connect+0x197/0x1b0 __x64_sys_connect+0x37/0x50 do_syscall_64+0x42/0x90 entry_SYSCALL_64_after_hwframe+0x70/0xda -> #0 (sk_lock-AF_BLUETOOTH-BTPROTO_SCO){+.+.}-{0:0}: __lock_acquire+0x18cc/0x3740 lock_acquire+0x151/0x3a0 lock_sock_nested+0x32/0x80 sco_connect_cfm+0x118/0x4a0 hci_sync_conn_complete_evt+0x1e6/0x3d0 hci_event_packet+0x55c/0x7c0 hci_rx_work+0x34c/0xa00 process_one_work+0x575/0x910 worker_thread+0x89/0x6f0 kthread+0x14e/0x180 ret_from_fork+0x2b/0x50 other info that might help us debug this: Chain exists of: sk_lock-AF_BLUETOOTH-BTPROTO_SCO --> &hdev->lock --> hci_cb_list_lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(hci_cb_list_lock); lock(&hdev->lock); lock(hci_cb_list_lock); lock(sk_lock-AF_BLUETOOTH-BTPROTO_SCO); *** DEADLOCK *** 4 locks held by kworker/u3:0/46: #0: ffff8880028d1130 ((wq_completion)hci0#2){+.+.}-{0:0}, at: process_one_work+0x4c0/0x910 #1: ffff8880013dfde0 ((work_completion)(&hdev->rx_work)){+.+.}-{0:0}, at: process_one_work+0x4c0/0x910 #2: ffff8880025d8070 (&hdev->lock){+.+.}-{3:3}, at: hci_sync_conn_complete_evt+0xa6/0x3d0 #3: ffffffffb79e3340 (hci_cb_list_lock){+.+.}-{3:3}, at: hci_sync_conn_complete_evt+0x1ad/0x3d0 Signed-off-by: Luiz Augusto von Dentz --- 
net/bluetooth/sco.c | 69 ++++++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 1111da4e2f2bd..f3a5ab9e4fa41 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -235,27 +235,41 @@ static int sco_chan_add(struct sco_conn *conn, struct sock *sk, return err; } -static int sco_connect(struct hci_dev *hdev, struct sock *sk) +static int sco_connect(struct sock *sk) { struct sco_conn *conn; struct hci_conn *hcon; + struct hci_dev *hdev; int err, type; BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst); + hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); + if (!hdev) + return -EHOSTUNREACH; + + hci_dev_lock(hdev); + if (lmp_esco_capable(hdev) && !disable_esco) type = ESCO_LINK; else type = SCO_LINK; if (sco_pi(sk)->setting == BT_VOICE_TRANSPARENT && - (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) - return -EOPNOTSUPP; + (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) { + err = -EOPNOTSUPP; + goto unlock; + } hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst, sco_pi(sk)->setting, &sco_pi(sk)->codec); - if (IS_ERR(hcon)) - return PTR_ERR(hcon); + if (IS_ERR(hcon)) { + err = PTR_ERR(hcon); + goto unlock; + } + + hci_dev_unlock(hdev); + hci_dev_put(hdev); conn = sco_conn_add(hcon); if (!conn) { @@ -263,13 +277,15 @@ static int sco_connect(struct hci_dev *hdev, struct sock *sk) return -ENOMEM; } - /* Update source addr of the socket */ - bacpy(&sco_pi(sk)->src, &hcon->src); - err = sco_chan_add(conn, sk, NULL); if (err) return err; + lock_sock(sk); + + /* Update source addr of the socket */ + bacpy(&sco_pi(sk)->src, &hcon->src); + if (hcon->state == BT_CONNECTED) { sco_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; @@ -278,6 +294,13 @@ static int sco_connect(struct hci_dev *hdev, struct sock *sk) sco_sock_set_timer(sk, sk->sk_sndtimeo); } + release_sock(sk); + + return err; + +unlock: + hci_dev_unlock(hdev); + 
hci_dev_put(hdev); return err; } @@ -565,7 +588,6 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; - struct hci_dev *hdev; int err; BT_DBG("sk %p", sk); @@ -574,37 +596,26 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen addr->sa_family != AF_BLUETOOTH) return -EINVAL; - lock_sock(sk); - if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) { - err = -EBADFD; - goto done; - } + if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) + return -EBADFD; - if (sk->sk_type != SOCK_SEQPACKET) { + if (sk->sk_type != SOCK_SEQPACKET) err = -EINVAL; - goto done; - } - - hdev = hci_get_route(&sa->sco_bdaddr, &sco_pi(sk)->src, BDADDR_BREDR); - if (!hdev) { - err = -EHOSTUNREACH; - goto done; - } - hci_dev_lock(hdev); + lock_sock(sk); /* Set destination address and psm */ bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr); + release_sock(sk); - err = sco_connect(hdev, sk); - hci_dev_unlock(hdev); - hci_dev_put(hdev); + err = sco_connect(sk); if (err) - goto done; + return err; + + lock_sock(sk); err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); -done: release_sock(sk); return err; } From 975abc0c90fc485ff9b4a6afa475c3b1398d5d47 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 30 Mar 2023 14:45:03 -0700 Subject: [PATCH 23/50] Bluetooth: SCO: Fix possible circular locking dependency sco_sock_getsockopt This attempts to fix the following trace: ====================================================== WARNING: possible circular locking dependency detected 6.3.0-rc2-g68fcb3a7bf97 #4706 Not tainted ------------------------------------------------------ sco-tester/31 is trying to acquire lock: ffff8880025b8070 (&hdev->lock){+.+.}-{3:3}, at: sco_sock_getsockopt+0x1fc/0xa90 but task is already holding lock: ffff888001eeb130 (sk_lock-AF_BLUETOOTH-BTPROTO_SCO){+.+.}-{0:0}, at: 
sco_sock_getsockopt+0x104/0xa90 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (sk_lock-AF_BLUETOOTH-BTPROTO_SCO){+.+.}-{0:0}: lock_sock_nested+0x32/0x80 sco_connect_cfm+0x118/0x4a0 hci_sync_conn_complete_evt+0x1e6/0x3d0 hci_event_packet+0x55c/0x7c0 hci_rx_work+0x34c/0xa00 process_one_work+0x575/0x910 worker_thread+0x89/0x6f0 kthread+0x14e/0x180 ret_from_fork+0x2b/0x50 -> #1 (hci_cb_list_lock){+.+.}-{3:3}: __mutex_lock+0x13b/0xcc0 hci_sync_conn_complete_evt+0x1ad/0x3d0 hci_event_packet+0x55c/0x7c0 hci_rx_work+0x34c/0xa00 process_one_work+0x575/0x910 worker_thread+0x89/0x6f0 kthread+0x14e/0x180 ret_from_fork+0x2b/0x50 -> #0 (&hdev->lock){+.+.}-{3:3}: __lock_acquire+0x18cc/0x3740 lock_acquire+0x151/0x3a0 __mutex_lock+0x13b/0xcc0 sco_sock_getsockopt+0x1fc/0xa90 __sys_getsockopt+0xe9/0x190 __x64_sys_getsockopt+0x5b/0x70 do_syscall_64+0x42/0x90 entry_SYSCALL_64_after_hwframe+0x70/0xda other info that might help us debug this: Chain exists of: &hdev->lock --> hci_cb_list_lock --> sk_lock-AF_BLUETOOTH-BTPROTO_SCO Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sk_lock-AF_BLUETOOTH-BTPROTO_SCO); lock(hci_cb_list_lock); lock(sk_lock-AF_BLUETOOTH-BTPROTO_SCO); lock(&hdev->lock); *** DEADLOCK *** 1 lock held by sco-tester/31: #0: ffff888001eeb130 (sk_lock-AF_BLUETOOTH-BTPROTO_SCO){+.+.}-{0:0}, at: sco_sock_getsockopt+0x104/0xa90 Fixes: 248733e87d50 ("Bluetooth: Allow querying of supported offload codecs over SCO socket") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/sco.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index f3a5ab9e4fa41..cd1a27ac555d0 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1140,6 +1140,8 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, break; } + release_sock(sk); + /* find total buffer size required to copy codec + caps */ hci_dev_lock(hdev); 
list_for_each_entry(c, &hdev->local_codecs, list) { @@ -1157,15 +1159,13 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, buf_len += sizeof(struct bt_codecs); if (buf_len > len) { hci_dev_put(hdev); - err = -ENOBUFS; - break; + return -ENOBUFS; } ptr = optval; if (put_user(num_codecs, ptr)) { hci_dev_put(hdev); - err = -EFAULT; - break; + return -EFAULT; } ptr += sizeof(num_codecs); @@ -1205,12 +1205,14 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, ptr += len; } - if (!err && put_user(buf_len, optlen)) - err = -EFAULT; - hci_dev_unlock(hdev); hci_dev_put(hdev); + lock_sock(sk); + + if (!err && put_user(buf_len, optlen)) + err = -EFAULT; + break; default: From 5dc7d23e167e2882ef118456ceccd57873e876d8 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 3 Apr 2023 14:19:14 -0700 Subject: [PATCH 24/50] Bluetooth: hci_conn: Fix possible UAF This fixes the following trace: ================================================================== BUG: KASAN: slab-use-after-free in hci_conn_del+0xba/0x3a0 Write of size 8 at addr ffff88800208e9c8 by task iso-tester/31 CPU: 0 PID: 31 Comm: iso-tester Not tainted 6.3.0-rc2-g991aa4a69a47 #4716 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.1-2.fc36 04/01/2014 Call Trace: dump_stack_lvl+0x1d/0x70 print_report+0xce/0x610 ? __virt_addr_valid+0xd4/0x150 ? hci_conn_del+0xba/0x3a0 kasan_report+0xdd/0x110 ? hci_conn_del+0xba/0x3a0 hci_conn_del+0xba/0x3a0 hci_conn_hash_flush+0xf2/0x120 hci_dev_close_sync+0x388/0x920 hci_unregister_dev+0x122/0x260 vhci_release+0x4f/0x90 __fput+0x102/0x430 task_work_run+0xf1/0x160 ? __pfx_task_work_run+0x10/0x10 ? 
mark_held_locks+0x24/0x90 exit_to_user_mode_prepare+0x170/0x180 syscall_exit_to_user_mode+0x19/0x50 do_syscall_64+0x4e/0x90 entry_SYSCALL_64_after_hwframe+0x70/0xda Fixes: 0f00cd322d22 ("Bluetooth: Free potentially unfreed SCO connection") Link: https://syzkaller.appspot.com/bug?extid=8bb72f86fc823817bc5d Cc: Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index e4aee5950c36a..8455ba141ee61 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1068,6 +1068,17 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, return conn; } +static bool hci_conn_unlink(struct hci_conn *conn) +{ + if (!conn->link) + return false; + + conn->link->link = NULL; + conn->link = NULL; + + return true; +} + int hci_conn_del(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; @@ -1079,15 +1090,16 @@ int hci_conn_del(struct hci_conn *conn) cancel_delayed_work_sync(&conn->idle_work); if (conn->type == ACL_LINK) { - struct hci_conn *sco = conn->link; - if (sco) { - sco->link = NULL; + struct hci_conn *link = conn->link; + + if (link) { + hci_conn_unlink(conn); /* Due to race, SCO connection might be not established * yet at this point. Delete it now, otherwise it is * possible for it to be stuck and can't be deleted. 
*/ - if (sco->handle == HCI_CONN_HANDLE_UNSET) - hci_conn_del(sco); + if (link->handle == HCI_CONN_HANDLE_UNSET) + hci_conn_del(link); } /* Unacked frames */ @@ -1103,7 +1115,7 @@ int hci_conn_del(struct hci_conn *conn) struct hci_conn *acl = conn->link; if (acl) { - acl->link = NULL; + hci_conn_unlink(conn); hci_conn_drop(acl); } @@ -2434,6 +2446,12 @@ void hci_conn_hash_flush(struct hci_dev *hdev) c->state = BT_CLOSED; hci_disconn_cfm(c, HCI_ERROR_LOCAL_HOST_TERM); + + /* Unlink before deleting otherwise it is possible that + * hci_conn_del removes the link which may cause the list to + * contain items already freed. + */ + hci_conn_unlink(c); hci_conn_del(c); } } From d2e4f1b1cba8742db66aaf77374cab7c0c7c8656 Mon Sep 17 00:00:00 2001 From: Claudia Draghicescu Date: Wed, 5 Apr 2023 14:19:18 +0300 Subject: [PATCH 25/50] Bluetooth: Set ISO Data Path on broadcast sink This patch enables ISO data rx on broadcast sink. Fixes: eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections") Signed-off-by: Claudia Draghicescu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index e68f2a7d863ac..e87c928c9e17a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6991,7 +6991,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bis->iso_qos.in.latency = le16_to_cpu(ev->interval) * 125 / 100; bis->iso_qos.in.sdu = le16_to_cpu(ev->max_pdu); - hci_connect_cfm(bis, ev->status); + hci_iso_setup_path(bis); } hci_dev_unlock(hdev); From a2a9339e1c9deb7e1e079e12e27a0265aea8421a Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 6 Apr 2023 09:33:09 -0700 Subject: [PATCH 26/50] Bluetooth: L2CAP: Fix use-after-free in l2cap_disconnect_{req,rsp} Similar to commit d0be8347c623 ("Bluetooth: L2CAP: Fix use-after-free caused by l2cap_chan_put"), just use 
l2cap_chan_hold_unless_zero to prevent referencing a channel that is about to be destroyed. conn->chan_lock is now only taken around l2cap_chan_del, so the lookup-failure paths no longer unlock it. Cc: stable@kernel.org Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Min Li --- net/bluetooth/l2cap_core.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 49926f59cc123..55a7226233f96 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -4652,33 +4652,27 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, BT_DBG("scid 0x%4.4x dcid 0x%4.4x", scid, dcid); - mutex_lock(&conn->chan_lock); - - chan = __l2cap_get_chan_by_scid(conn, dcid); + chan = l2cap_get_chan_by_scid(conn, dcid); if (!chan) { - mutex_unlock(&conn->chan_lock); cmd_reject_invalid_cid(conn, cmd->ident, dcid, scid); return 0; } - l2cap_chan_hold(chan); - l2cap_chan_lock(chan); - rsp.dcid = cpu_to_le16(chan->scid); rsp.scid = cpu_to_le16(chan->dcid); l2cap_send_cmd(conn, cmd->ident, L2CAP_DISCONN_RSP, sizeof(rsp), &rsp); chan->ops->set_shutdown(chan); + mutex_lock(&conn->chan_lock); l2cap_chan_del(chan, ECONNRESET); + mutex_unlock(&conn->chan_lock); chan->ops->close(chan); l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); - return 0; } @@ -4698,33 +4692,26 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, BT_DBG("dcid 0x%4.4x scid 0x%4.4x", dcid, scid); - mutex_lock(&conn->chan_lock); - - chan = __l2cap_get_chan_by_scid(conn, scid); + chan = l2cap_get_chan_by_scid(conn, scid); if (!chan) { - mutex_unlock(&conn->chan_lock); return 0; } - l2cap_chan_hold(chan); - l2cap_chan_lock(chan); - if (chan->state != BT_DISCONN) { l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); return 0; } + mutex_lock(&conn->chan_lock); l2cap_chan_del(chan, 0); + mutex_unlock(&conn->chan_lock); chan->ops->close(chan); l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); - return 0; } From 
a4506722dc39ca840593f14e3faa4c9ba9408211 Mon Sep 17 00:00:00 2001 From: "Radu Pirea (OSS)" Date: Thu, 6 Apr 2023 12:59:04 +0300 Subject: [PATCH 27/50] net: phy: nxp-c45-tja11xx: add remove callback Unregister PTP clock when the driver is removed. Purge the RX and TX skb queues. Fixes: 514def5dd339 ("phy: nxp-c45-tja11xx: add timestamping support") CC: stable@vger.kernel.org # 5.15+ Signed-off-by: Radu Pirea (OSS) Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20230406095904.75456-1-radu-nicolae.pirea@oss.nxp.com Signed-off-by: Paolo Abeni --- drivers/net/phy/nxp-c45-tja11xx.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c index 1b7d6941b0f35..029875a59ff89 100644 --- a/drivers/net/phy/nxp-c45-tja11xx.c +++ b/drivers/net/phy/nxp-c45-tja11xx.c @@ -1337,6 +1337,17 @@ static int nxp_c45_probe(struct phy_device *phydev) return ret; } +static void nxp_c45_remove(struct phy_device *phydev) +{ + struct nxp_c45_phy *priv = phydev->priv; + + if (priv->ptp_clock) + ptp_clock_unregister(priv->ptp_clock); + + skb_queue_purge(&priv->tx_queue); + skb_queue_purge(&priv->rx_queue); +} + static struct phy_driver nxp_c45_driver[] = { { PHY_ID_MATCH_MODEL(PHY_ID_TJA_1103), @@ -1359,6 +1370,7 @@ static struct phy_driver nxp_c45_driver[] = { .set_loopback = genphy_c45_loopback, .get_sqi = nxp_c45_get_sqi, .get_sqi_max = nxp_c45_get_sqi_max, + .remove = nxp_c45_remove, }, }; From 7573099e10ca69c3be33995c1fcd0d241226816d Mon Sep 17 00:00:00 2001 From: Denis Plotnikov Date: Fri, 7 Apr 2023 10:18:49 +0300 Subject: [PATCH 28/50] qlcnic: check pci_reset_function result Static code analyzer complains to unchecked return value. The result of pci_reset_function() is unchecked. Despite, the issue is on the FLR supported code path and in that case reset can be done with pcie_flr(), the patch uses less invasive approach by adding the result check of pci_reset_function(). 
Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 7e2cf4feba05 ("qlcnic: change driver hardware interface mechanism") Signed-off-by: Denis Plotnikov Reviewed-by: Simon Horman Reviewed-by: Bjorn Helgaas Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c index 87f76bac2e463..eb827b86ecae8 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c @@ -628,7 +628,13 @@ int qlcnic_fw_create_ctx(struct qlcnic_adapter *dev) int i, err, ring; if (dev->flags & QLCNIC_NEED_FLR) { - pci_reset_function(dev->pdev); + err = pci_reset_function(dev->pdev); + if (err) { + dev_err(&dev->pdev->dev, + "Adapter reset failed (%d). Please reboot\n", + err); + return err; + } dev->flags &= ~QLCNIC_NEED_FLR; } From 9744d2bf19762703704ecba885b7ac282c02eacf Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 8 Apr 2023 11:49:43 -0700 Subject: [PATCH 29/50] smc: Fix use-after-free in tcp_write_timer_handler(). With Eric's ref tracker, syzbot finally found a repro for use-after-free in tcp_write_timer_handler() by kernel TCP sockets. [0] If SMC creates a kernel socket in __smc_create(), the kernel socket is supposed to be freed in smc_clcsock_release() by calling sock_release() when we close() the parent SMC socket. However, at the end of smc_clcsock_release(), the kernel socket's sk_state might not be TCP_CLOSE. This means that we have not called inet_csk_destroy_sock() in __tcp_close() and have not stopped the TCP timers. The kernel socket's TCP timers can be fired later, so we need to hold a refcnt for net as we do for MPTCP subflows in mptcp_subflow_create_socket(). [0]: leaked reference. 
sk_alloc (./include/net/net_namespace.h:335 net/core/sock.c:2108) inet_create (net/ipv4/af_inet.c:319 net/ipv4/af_inet.c:244) __sock_create (net/socket.c:1546) smc_create (net/smc/af_smc.c:3269 net/smc/af_smc.c:3284) __sock_create (net/socket.c:1546) __sys_socket (net/socket.c:1634 net/socket.c:1618 net/socket.c:1661) __x64_sys_socket (net/socket.c:1672) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) ================================================================== BUG: KASAN: slab-use-after-free in tcp_write_timer_handler (net/ipv4/tcp_timer.c:378 net/ipv4/tcp_timer.c:624 net/ipv4/tcp_timer.c:594) Read of size 1 at addr ffff888052b65e0d by task syzrepro/18091 CPU: 0 PID: 18091 Comm: syzrepro Tainted: G W 6.3.0-rc4-01174-gb5d54eb5899a #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.amzn2022.0.1 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:107) print_report (mm/kasan/report.c:320 mm/kasan/report.c:430) kasan_report (mm/kasan/report.c:538) tcp_write_timer_handler (net/ipv4/tcp_timer.c:378 net/ipv4/tcp_timer.c:624 net/ipv4/tcp_timer.c:594) tcp_write_timer (./include/linux/spinlock.h:390 net/ipv4/tcp_timer.c:643) call_timer_fn (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:207 ./include/trace/events/timer.h:127 kernel/time/timer.c:1701) __run_timers.part.0 (kernel/time/timer.c:1752 kernel/time/timer.c:2022) run_timer_softirq (kernel/time/timer.c:2037) __do_softirq (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:207 ./include/trace/events/irq.h:142 kernel/softirq.c:572) __irq_exit_rcu (kernel/softirq.c:445 kernel/softirq.c:650) irq_exit_rcu (kernel/softirq.c:664) sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1107 (discriminator 14)) Fixes: ac7138746e14 ("smc: establish new socket family") Reported-by: syzbot+7e1e1bdb852961150198@syzkaller.appspotmail.com Link: 
https://lore.kernel.org/netdev/000000000000a3f51805f8bcc43a@google.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/af_smc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c6b4a62276f6d..50c38b624f772 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3270,6 +3270,17 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, sk_common_release(sk); goto out; } + + /* smc_clcsock_release() does not wait smc->clcsock->sk's + * destruction; its sk_state might not be TCP_CLOSE after + * smc->sk is close()d, and TCP timers can be fired later, + * which need net ref. + */ + sk = smc->clcsock->sk; + __netns_tracker_free(net, &sk->ns_tracker, false); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); } else { smc->clcsock = clcsock; } From a56ef25619e079bd7d744636cf18d054d1e91982 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Sat, 8 Apr 2023 12:43:21 -0700 Subject: [PATCH 30/50] net: wwan: iosm: Fix error handling path in ipc_pcie_probe() Smatch reports: drivers/net/wwan/iosm/iosm_ipc_pcie.c:298 ipc_pcie_probe() warn: missing unwind goto? When dma_set_mask fails it directly returns without disabling pci device and freeing ipc_pcie. Fix this by calling the correct goto label. As dma_set_mask returns either 0 or -EIO, we can use a goto label, as it finally returns -EIO. Add a set_mask_fail goto label which stands consistent with other goto labels in this function. Fixes: 035e3befc191 ("net: wwan: iosm: fix driver not working with INTEL_IOMMU disabled") Reviewed-by: Simon Horman Signed-off-by: Harshit Mogalapalli Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- drivers/net/wwan/iosm/iosm_ipc_pcie.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wwan/iosm/iosm_ipc_pcie.c b/drivers/net/wwan/iosm/iosm_ipc_pcie.c index 5bf5a93937c9c..04517bd3325a2 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_pcie.c +++ b/drivers/net/wwan/iosm/iosm_ipc_pcie.c @@ -295,7 +295,7 @@ static int ipc_pcie_probe(struct pci_dev *pci, ret = dma_set_mask(ipc_pcie->dev, DMA_BIT_MASK(64)); if (ret) { dev_err(ipc_pcie->dev, "Could not set PCI DMA mask: %d", ret); - return ret; + goto set_mask_fail; } ipc_pcie_config_aspm(ipc_pcie); @@ -323,6 +323,7 @@ static int ipc_pcie_probe(struct pci_dev *pci, imem_init_fail: ipc_pcie_resources_release(ipc_pcie); resources_req_fail: +set_mask_fail: pci_disable_device(pci); pci_enable_fail: kfree(ipc_pcie); From 136f36c74b0345d5d0087d4094894a006470bbd5 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 10 Apr 2023 18:27:19 -0500 Subject: [PATCH 31/50] net: ti/cpsw: Add explicit platform_device.h and of_platform.h includes TI CPSW uses of_platform_* functions which are declared in of_platform.h. of_platform.h gets implicitly included by of_device.h, but that is going to be removed soon. Nothing else depends on of_device.h so it can be dropped. of_platform.h also implicitly includes platform_device.h, so add an explicit include for it, too. Signed-off-by: Rob Herring Reviewed-by: Jesse Brandeburg Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/cpsw.c | 2 +- drivers/net/ethernet/ti/cpsw_new.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 37f0b62ec5d6a..f9cd566d1c9b5 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 35128dd45ffce..c61e4e44a78f0 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -23,7 +24,7 @@ #include #include #include -#include +#include #include #include #include From 59d3efd27c11c59b32291e5ebc307bed2edb65ee Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 11 Apr 2023 09:43:19 +0200 Subject: [PATCH 32/50] rtnetlink: Restore RTM_NEW/DELLINK notification behavior The commits referenced below allow userspace to use the NLM_F_ECHO flag for RTM_NEW/DELLINK operations to receive unicast notifications for the affected link. Prior to these changes, applications may have relied on multicast notifications to learn the same information without specifying the NLM_F_ECHO flag. For such applications, the mentioned commits changed the behavior for requests not using NLM_F_ECHO. Multicast notifications are still received, but now use the portid of the requester and the sequence number of the request instead of zero values used previously. For the application, this message may be unexpected and likely handled as a response to the NLM_F_ACKed request, especially if it uses the same socket to handle requests and notifications. To fix existing applications relying on the old notification behavior, set the portid and sequence number in the notification only if the request included the NLM_F_ECHO flag. 
This restores the old behavior for applications not using it, but allows unicasted notifications for others. Fixes: f3a63cce1b4f ("rtnetlink: Honour NLM_F_ECHO flag in rtnl_delete_link") Fixes: d88e136cab37 ("rtnetlink: Honour NLM_F_ECHO flag in rtnl_newlink_create") Signed-off-by: Martin Willi Acked-by: Guillaume Nault Acked-by: Hangbin Liu Link: https://lore.kernel.org/r/20230411074319.24133-1-martin@strongswan.org Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 3 ++- net/core/dev.c | 2 +- net/core/rtnetlink.c | 11 +++++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 92ad75549e9cd..b6e6378dcbbd7 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -25,7 +25,8 @@ void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, gfp_t flags, int *new_nsid, - int new_ifindex, u32 portid, u32 seq); + int new_ifindex, u32 portid, + const struct nlmsghdr *nlh); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags, u32 portid, const struct nlmsghdr *nlh); diff --git a/net/core/dev.c b/net/core/dev.c index 48067321c0dba..1488f700bf819 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10847,7 +10847,7 @@ void unregister_netdevice_many_notify(struct list_head *head, dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, - portid, nlmsg_seq(nlh)); + portid, nlh); /* * Flush the unicast and multicast chains diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5d8eb57867a96..6e44e92ebdf5d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3972,16 +3972,23 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, u32 
event, gfp_t flags, int *new_nsid, - int new_ifindex, u32 portid, u32 seq) + int new_ifindex, u32 portid, + const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; + u32 seq = 0; skb = nlmsg_new(if_nlmsg_size(dev, 0), flags); if (skb == NULL) goto errout; + if (nlmsg_report(nlh)) + seq = nlmsg_seq(nlh); + else + portid = 0; + err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, portid, seq, change, 0, 0, event, new_nsid, new_ifindex, -1, flags); @@ -4017,7 +4024,7 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, return; skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, - new_ifindex, portid, nlmsg_seq(nlh)); + new_ifindex, portid, nlh); if (skb) rtmsg_ifinfo_send(skb, dev, flags, portid, nlh); } From 6417070918de3bcdbe0646e7256dae58fd8083ba Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Mon, 10 Apr 2023 09:23:52 +0800 Subject: [PATCH 33/50] net: qrtr: Fix an uninit variable access bug in qrtr_tx_resume() Syzbot reported a bug as following: ===================================================== BUG: KMSAN: uninit-value in qrtr_tx_resume+0x185/0x1f0 net/qrtr/af_qrtr.c:230 qrtr_tx_resume+0x185/0x1f0 net/qrtr/af_qrtr.c:230 qrtr_endpoint_post+0xf85/0x11b0 net/qrtr/af_qrtr.c:519 qrtr_tun_write_iter+0x270/0x400 net/qrtr/tun.c:108 call_write_iter include/linux/fs.h:2189 [inline] aio_write+0x63a/0x950 fs/aio.c:1600 io_submit_one+0x1d1c/0x3bf0 fs/aio.c:2019 __do_sys_io_submit fs/aio.c:2078 [inline] __se_sys_io_submit+0x293/0x770 fs/aio.c:2048 __x64_sys_io_submit+0x92/0xd0 fs/aio.c:2048 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Uninit was created at: slab_post_alloc_hook mm/slab.h:766 [inline] slab_alloc_node mm/slub.c:3452 [inline] __kmem_cache_alloc_node+0x71f/0xce0 mm/slub.c:3491 __do_kmalloc_node mm/slab_common.c:967 [inline] __kmalloc_node_track_caller+0x114/0x3b0 mm/slab_common.c:988 
kmalloc_reserve net/core/skbuff.c:492 [inline] __alloc_skb+0x3af/0x8f0 net/core/skbuff.c:565 __netdev_alloc_skb+0x120/0x7d0 net/core/skbuff.c:630 qrtr_endpoint_post+0xbd/0x11b0 net/qrtr/af_qrtr.c:446 qrtr_tun_write_iter+0x270/0x400 net/qrtr/tun.c:108 call_write_iter include/linux/fs.h:2189 [inline] aio_write+0x63a/0x950 fs/aio.c:1600 io_submit_one+0x1d1c/0x3bf0 fs/aio.c:2019 __do_sys_io_submit fs/aio.c:2078 [inline] __se_sys_io_submit+0x293/0x770 fs/aio.c:2048 __x64_sys_io_submit+0x92/0xd0 fs/aio.c:2048 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd It is because that skb->len requires at least sizeof(struct qrtr_ctrl_pkt) in qrtr_tx_resume(). And skb->len equals to size in qrtr_endpoint_post(). But size is less than sizeof(struct qrtr_ctrl_pkt) when qrtr_cb->type equals to QRTR_TYPE_RESUME_TX in qrtr_endpoint_post() under the syzbot scenario. This triggers the uninit variable access bug. Add size check when qrtr_cb->type equals to QRTR_TYPE_RESUME_TX in qrtr_endpoint_post() to fix the bug. 
Fixes: 5fdeb0d372ab ("net: qrtr: Implement outgoing flow control") Reported-by: syzbot+4436c9630a45820fda76@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?id=c14607f0963d27d5a3d5f4c8639b500909e43540 Suggested-by: Manivannan Sadhasivam Signed-off-by: Ziyang Xuan Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230410012352.3997823-1-william.xuanziyang@huawei.com Signed-off-by: Paolo Abeni --- net/qrtr/af_qrtr.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 3a70255c8d02f..76f0434d3d06a 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -498,6 +498,11 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) if (!size || len != ALIGN(size, 4) + hdrlen) goto err; + if ((cb->type == QRTR_TYPE_NEW_SERVER || + cb->type == QRTR_TYPE_RESUME_TX) && + size < sizeof(struct qrtr_ctrl_pkt)) + goto err; + if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA && cb->type != QRTR_TYPE_RESUME_TX) goto err; @@ -510,9 +515,6 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) /* Remote node endpoint can bridge other distant nodes */ const struct qrtr_ctrl_pkt *pkt; - if (size < sizeof(*pkt)) - goto err; - pkt = data + hdrlen; qrtr_node_assign(node, le32_to_cpu(pkt->server.node)); } From 32832a2caf82663870126c5186cf8f86c8b2a649 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 10 Apr 2023 15:43:30 -0400 Subject: [PATCH 34/50] sctp: fix a potential overflow in sctp_ifwdtsn_skip Currently, when traversing ifwdtsn skips with _sctp_walk_ifwdtsn, it only checks the pos against the end of the chunk. However, the data left for the last pos may be < sizeof(struct sctp_ifwdtsn_skip), and dereferencing it as struct sctp_ifwdtsn_skip may cause an overflow. This patch fixes it by checking the pos against "the end of the chunk - sizeof(struct sctp_ifwdtsn_skip)" in sctp_ifwdtsn_skip, similar to sctp_fwdtsn_skip. 
Fixes: 0fc2ea922c8a ("sctp: implement validate_ftsn for sctp_stream_interleave") Signed-off-by: Xin Long Link: https://lore.kernel.org/r/2a71bffcd80b4f2c61fac6d344bb2f11c8fd74f7.1681155810.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- net/sctp/stream_interleave.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 94727feb07b3e..b046b11200c93 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -1154,7 +1154,8 @@ static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) #define _sctp_walk_ifwdtsn(pos, chunk, end) \ for (pos = chunk->subh.ifwdtsn_hdr->skip; \ - (void *)pos < (void *)chunk->subh.ifwdtsn_hdr->skip + (end); pos++) + (void *)pos <= (void *)chunk->subh.ifwdtsn_hdr->skip + (end) - \ + sizeof(struct sctp_ifwdtsn_skip); pos++) #define sctp_walk_ifwdtsn(pos, ch) \ _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \ From 5b7be2d4fd6eb8bec14c2de96c664e07c7d0bd82 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 11 Apr 2023 22:26:45 +0300 Subject: [PATCH 35/50] net: enetc: workaround for unresponsive pMAC after receiving express traffic I have observed an issue where the RX direction of the LS1028A ENETC pMAC seems unresponsive. The minimal procedure to reproduce the issue is: 1. Connect ENETC port 0 with a loopback RJ45 cable to one of the Felix switch ports (0). 2. Bring the ports up (MAC Merge layer is not enabled on either end). 3. Send a large quantity of unidirectional (express) traffic from Felix to ENETC. I tried altering frame size and frame count, and it doesn't appear to be specific to either of them, but rather, to the quantity of octets received. Lowering the frame count, the minimum quantity of packets to reproduce relatively consistently seems to be around 37000 frames at 1514 octets (w/o FCS) each. 4. 
Using ethtool --set-mm, enable the pMAC in the Felix and in the ENETC ports, in both RX and TX directions, and with verification on both ends. 5. Wait for verification to complete on both sides. 6. Configure a traffic class as preemptible on both ends. 7. Send some packets again. The issue is at step 5, where the verification process of ENETC ends (meaning that Felix responds with an SMD-R and ENETC sees the response), but the verification process of Felix never ends (it remains VERIFYING). If step 3 is skipped or if ENETC receives less traffic than approximately that threshold, the test runs all the way through (verification succeeds on both ends, preemptible traffic passes fine). If, between step 4 and 5, the step below is also introduced: 4.1. Disable and re-enable PM0_COMMAND_CONFIG bit RX_EN then again, the sequence of steps runs all the way through, and verification succeeds, even if there was the previous RX traffic injected into ENETC. Traffic sent *by* the ENETC port prior to enabling the MAC Merge layer does not seem to influence the verification result, only received traffic does. The LS1028A manual does not mention any relationship between PM0_COMMAND_CONFIG and MMCSR, and the hardware people don't seem to know for now either. The bit that is toggled to work around the issue is also toggled by enetc_mac_enable(), called from phylink's mac_link_down() and mac_link_up() methods - which is how the workaround was found: verification would work after a link down/up. 
Fixes: c7b9e8086902 ("net: enetc: add support for MAC Merge layer") Signed-off-by: Vladimir Oltean Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20230411192645.1896048-1-vladimir.oltean@nxp.com Signed-off-by: Paolo Abeni --- .../net/ethernet/freescale/enetc/enetc_ethtool.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index da9d4b310fcdd..838750a03cf68 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -989,6 +989,20 @@ static int enetc_get_mm(struct net_device *ndev, struct ethtool_mm_state *state) return 0; } +/* FIXME: Workaround for the link partner's verification failing if ENETC + * priorly received too much express traffic. The documentation doesn't + * suggest this is needed. + */ +static void enetc_restart_emac_rx(struct enetc_si *si) +{ + u32 val = enetc_port_rd(&si->hw, ENETC_PM0_CMD_CFG); + + enetc_port_wr(&si->hw, ENETC_PM0_CMD_CFG, val & ~ENETC_PM0_RX_EN); + + if (val & ENETC_PM0_RX_EN) + enetc_port_wr(&si->hw, ENETC_PM0_CMD_CFG, val); +} + static int enetc_set_mm(struct net_device *ndev, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack) { @@ -1040,6 +1054,8 @@ static int enetc_set_mm(struct net_device *ndev, struct ethtool_mm_cfg *cfg, enetc_port_wr(hw, ENETC_MMCSR, val); + enetc_restart_emac_rx(priv->si); + mutex_unlock(&priv->mm_lock); return 0; From a5cb752b125766524c921faab1a45cc96065b0a7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 11 Apr 2023 22:42:09 +0200 Subject: [PATCH 36/50] mptcp: use mptcp_schedule_work instead of open-coding it Beyond reducing code duplication this also avoids scheduling the mptcp_worker on a closed socket on some edge scenarios. The addressed issue is actually older than the blamed commit below, but this fix needs it as a pre-requisite. 
Fixes: ba8f48f7a4d7 ("mptcp: introduce mptcp_schedule_work") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 5 ++--- net/mptcp/subflow.c | 18 ++++++------------ 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index b30cea2fbf3fd..355f798d575a4 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1192,9 +1192,8 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { if (mp_opt.data_fin && mp_opt.data_len == 1 && - mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64) && - schedule_work(&msk->work)) - sock_hold(subflow->conn); + mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64)) + mptcp_schedule_work((struct sock *)msk); return true; } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a0041360ee9d9..d345888505457 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -408,9 +408,8 @@ void mptcp_subflow_reset(struct sock *ssk) tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); - if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && - schedule_work(&mptcp_sk(sk)->work)) - return; /* worker will put sk for us */ + if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) + mptcp_schedule_work(sk); sock_put(sk); } @@ -1118,8 +1117,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk, skb_ext_del(skb, SKB_EXT_MPTCP); return MAPPING_OK; } else { - if (updated && schedule_work(&msk->work)) - sock_hold((struct sock *)msk); + if (updated) + mptcp_schedule_work((struct sock *)msk); return MAPPING_DATA_FIN; } @@ -1222,17 +1221,12 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, /* sched mptcp worker to remove the subflow if no more data is pending */ static void subflow_sched_work_if_closed(struct 
mptcp_sock *msk, struct sock *ssk) { - struct sock *sk = (struct sock *)msk; - if (likely(ssk->sk_state != TCP_CLOSE)) return; if (skb_queue_empty(&ssk->sk_receive_queue) && - !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) { - sock_hold(sk); - if (!schedule_work(&msk->work)) - sock_put(sk); - } + !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + mptcp_schedule_work((struct sock *)msk); } static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) From d6a0443733434408f2cbd4c53fea6910599bab9e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 11 Apr 2023 22:42:10 +0200 Subject: [PATCH 37/50] mptcp: stricter state check in mptcp_worker As reported by Christoph, the mptcp protocol can run the worker when the relevant msk socket is in an unexpected state: connect() // incoming reset + fastclose // the mptcp worker is scheduled mptcp_disconnect() // msk is now CLOSED listen() mptcp_worker() Leading to the following splat: divide error: 0000 [#1] PREEMPT SMP CPU: 1 PID: 21 Comm: kworker/1:0 Not tainted 6.3.0-rc1-gde5e8fd0123c #11 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 Workqueue: events mptcp_worker RIP: 0010:__tcp_select_window+0x22c/0x4b0 net/ipv4/tcp_output.c:3018 RSP: 0018:ffffc900000b3c98 EFLAGS: 00010293 RAX: 000000000000ffd7 RBX: 000000000000ffd7 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff8214ce97 RDI: 0000000000000004 RBP: 000000000000ffd7 R08: 0000000000000004 R09: 0000000000010000 R10: 000000000000ffd7 R11: ffff888005afa148 R12: 000000000000ffd7 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88803ed00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000405270 CR3: 000000003011e006 CR4: 0000000000370ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcp_select_window 
net/ipv4/tcp_output.c:262 [inline] __tcp_transmit_skb+0x356/0x1280 net/ipv4/tcp_output.c:1345 tcp_transmit_skb net/ipv4/tcp_output.c:1417 [inline] tcp_send_active_reset+0x13e/0x320 net/ipv4/tcp_output.c:3459 mptcp_check_fastclose net/mptcp/protocol.c:2530 [inline] mptcp_worker+0x6c7/0x800 net/mptcp/protocol.c:2705 process_one_work+0x3bd/0x950 kernel/workqueue.c:2390 worker_thread+0x5b/0x610 kernel/workqueue.c:2537 kthread+0x138/0x170 kernel/kthread.c:376 ret_from_fork+0x2c/0x50 arch/x86/entry/entry_64.S:308 This change addresses the issue explicitly checking for bad states before running the mptcp worker. Fixes: e16163b6e2b7 ("mptcp: refactor shutdown and close") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Link: https://github.com/multipath-tcp/mptcp_net-next/issues/374 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Tested-by: Christoph Paasch Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 60b23b2716c40..06c5872e3b003 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2626,7 +2626,7 @@ static void mptcp_worker(struct work_struct *work) lock_sock(sk); state = sk->sk_state; - if (unlikely(state == TCP_CLOSE)) + if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN))) goto unlock; mptcp_check_data_fin_ack(sk); From c0ff6f6da66a7791a32c0234388b1bdc00244917 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 11 Apr 2023 22:42:11 +0200 Subject: [PATCH 38/50] mptcp: fix NULL pointer dereference on fastopen early fallback In case of early fallback to TCP, subflow_syn_recv_sock() deletes the subflow context before returning the newly allocated sock to the caller. The fastopen path does not cope with the above unconditionally dereferencing the subflow context. 
Fixes: 36b122baf6a8 ("mptcp: add subflow_v(4,6)_send_synack()") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/fastopen.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index d237d142171c5..bceaab8dd8e46 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -9,11 +9,18 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req) { - struct sock *ssk = subflow->tcp_sock; - struct sock *sk = subflow->conn; + struct sock *sk, *ssk; struct sk_buff *skb; struct tcp_sock *tp; + /* on early fallback the subflow context is deleted by + * subflow_syn_recv_sock() + */ + if (!subflow) + return; + + ssk = subflow->tcp_sock; + sk = subflow->conn; tp = tcp_sk(ssk); subflow->is_mptfo = 1; From 711ae788cbbb82818531b55e32b09518ee09a11a Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Tue, 11 Apr 2023 22:42:12 +0200 Subject: [PATCH 39/50] selftests: mptcp: userspace pm: uniform verify events Simply adding a "sleep" before checking something is usually not a good idea because the time that has been picked may be too short or too long. The best is to wait for events with a timeout. In this selftest, 'sleep 0.5' is used more than 40 times. It is always used before calling a 'verify_*' function except for this verify_listener_events which has been added later. At the end, using all these 'sleep 0.5' seems to work: the slow CIs don't complain so far. Also because it doesn't take too much time, we can just add two more 'sleep 0.5' to make what is done before calling a 'verify_*' function uniform. For the same reasons, we can also delay a bigger refactoring to replace all these 'sleep 0.5' by functions waiting for events instead of waiting for a fixed time and hoping for the best. 
Fixes: 6c73008aa301 ("selftests: mptcp: listener test for userspace PM") Cc: stable@vger.kernel.org Suggested-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/userspace_pm.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 48e52f995a98c..b1eb7bce599dc 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -913,6 +913,7 @@ test_listener() $client4_port > /dev/null 2>&1 & local listener_pid=$! + sleep 0.5 verify_listener_events $client_evts $LISTENER_CREATED $AF_INET 10.0.2.2 $client4_port # ADD_ADDR from client to server machine reusing the subflow port @@ -928,6 +929,7 @@ test_listener() # Delete the listener from the client ns, if one was created kill_wait $listener_pid + sleep 0.5 verify_listener_events $client_evts $LISTENER_CLOSED $AF_INET 10.0.2.2 $client4_port } From 306dc21361993f4fe50a15d4db6b1a4de5d0adb0 Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Wed, 12 Apr 2023 07:58:28 -0400 Subject: [PATCH 40/50] selftests: openvswitch: adjust datapath NL message declaration The netlink message for creating a new datapath takes an array of ports for the PID creation. This shouldn't cause much issue but correct it for future cases where we need to decode datapath information that could include the per-cpu PID map.
Fixes: 25f16c873fb1 ("selftests: add openvswitch selftest suite") Signed-off-by: Aaron Conole Link: https://lore.kernel.org/r/20230412115828.3991806-1-aconole@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/openvswitch/ovs-dpctl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py index 3243c90d449e6..5d467d1993cb1 100644 --- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py +++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py @@ -62,7 +62,7 @@ class dp_cmd_msg(ovs_dp_msg): nla_map = ( ("OVS_DP_ATTR_UNSPEC", "none"), ("OVS_DP_ATTR_NAME", "asciiz"), - ("OVS_DP_ATTR_UPCALL_PID", "uint32"), + ("OVS_DP_ATTR_UPCALL_PID", "array(uint32)"), ("OVS_DP_ATTR_STATS", "dpstats"), ("OVS_DP_ATTR_MEGAFLOW_STATS", "megaflowstats"), ("OVS_DP_ATTR_USER_FEATURES", "uint32"), From 1c5950fc6fe996235f1d18539b9c6b64b597f50f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Apr 2023 13:03:08 +0000 Subject: [PATCH 41/50] udp6: fix potential access to stale information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lena wang reported an issue caused by udpv6_sendmsg() mangling msg->msg_name and msg->msg_namelen, which are later read from ____sys_sendmsg() : /* * If this is sendmmsg() and sending to current destination address was * successful, remember it. */ if (used_address && err >= 0) { used_address->name_len = msg_sys->msg_namelen; if (msg_sys->msg_name) memcpy(&used_address->name, msg_sys->msg_name, used_address->name_len); } udpv6_sendmsg() wants to pretend the remote address family is AF_INET in order to call udp_sendmsg(). A fix would be to modify the address in-place, instead of using a local variable, but this could have other side effects. Instead, restore initial values before we return from udpv6_sendmsg(). 
Fixes: c71d8ebe7a44 ("net: Fix security_socket_sendmsg() bypass problem.") Reported-by: lena wang Signed-off-by: Eric Dumazet Reviewed-by: Maciej Żenczykowski Link: https://lore.kernel.org/r/20230412130308.1202254-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/udp.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9fb2f33ee3a76..a675acfb901d1 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1395,9 +1395,11 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) msg->msg_name = &sin; msg->msg_namelen = sizeof(sin); do_udp_sendmsg: - if (ipv6_only_sock(sk)) - return -ENETUNREACH; - return udp_sendmsg(sk, msg, len); + err = ipv6_only_sock(sk) ? + -ENETUNREACH : udp_sendmsg(sk, msg, len); + msg->msg_name = sin6; + msg->msg_namelen = addr_len; + return err; } } From 3a0385be133e7091cc9a9a998c7ec712bb9585db Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 12 Apr 2023 11:13:06 -0400 Subject: [PATCH 42/50] selftests: add the missing CONFIG_IP_SCTP in net config The selftest sctp_vrf needs CONFIG_IP_SCTP set in config when building the kernel, so add it. 
Fixes: a61bd7b9fef3 ("selftests: add a selftest for sctp vrf") Reported-by: Naresh Kamboju Signed-off-by: Xin Long Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/61dddebc4d2dd98fe7fb145e24d4b2430e42b572.1681312386.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index cc9fd55ab8699..2529226ce87ca 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -48,3 +48,4 @@ CONFIG_BAREUDP=m CONFIG_IPV6_IOAM6_LWTUNNEL=y CONFIG_CRYPTO_SM4_GENERIC=y CONFIG_AMT=m +CONFIG_IP_SCTP=m From e8b74453555872851bdd7ea43a7c0ec39659834f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 12 Apr 2023 16:21:44 -0700 Subject: [PATCH 43/50] net: macb: fix a memory corruption in extended buffer descriptor mode For quite some time we were chasing a bug which looked like a sudden permanent failure of networking and mmc on some of our devices. The bug was very sensitive to any software changes and even more to any kernel debug options. 
Finally we got a setup where the problem was reproducible with CONFIG_DMA_API_DEBUG=y and it revealed the issue with the rx dma: [ 16.992082] ------------[ cut here ]------------ [ 16.996779] DMA-API: macb ff0b0000.ethernet: device driver tries to free DMA memory it has not allocated [device address=0x0000000875e3e244] [size=1536 bytes] [ 17.011049] WARNING: CPU: 0 PID: 85 at kernel/dma/debug.c:1011 check_unmap+0x6a0/0x900 [ 17.018977] Modules linked in: xxxxx [ 17.038823] CPU: 0 PID: 85 Comm: irq/55-8000f000 Not tainted 5.4.0 #28 [ 17.045345] Hardware name: xxxxx [ 17.049528] pstate: 60000005 (nZCv daif -PAN -UAO) [ 17.054322] pc : check_unmap+0x6a0/0x900 [ 17.058243] lr : check_unmap+0x6a0/0x900 [ 17.062163] sp : ffffffc010003c40 [ 17.065470] x29: ffffffc010003c40 x28: 000000004000c03c [ 17.070783] x27: ffffffc010da7048 x26: ffffff8878e38800 [ 17.076095] x25: ffffff8879d22810 x24: ffffffc010003cc8 [ 17.081407] x23: 0000000000000000 x22: ffffffc010a08750 [ 17.086719] x21: ffffff8878e3c7c0 x20: ffffffc010acb000 [ 17.092032] x19: 0000000875e3e244 x18: 0000000000000010 [ 17.097343] x17: 0000000000000000 x16: 0000000000000000 [ 17.102647] x15: ffffff8879e4a988 x14: 0720072007200720 [ 17.107959] x13: 0720072007200720 x12: 0720072007200720 [ 17.113261] x11: 0720072007200720 x10: 0720072007200720 [ 17.118565] x9 : 0720072007200720 x8 : 000000000000022d [ 17.123869] x7 : 0000000000000015 x6 : 0000000000000098 [ 17.129173] x5 : 0000000000000000 x4 : 0000000000000000 [ 17.134475] x3 : 00000000ffffffff x2 : ffffffc010a1d370 [ 17.139778] x1 : b420c9d75d27bb00 x0 : 0000000000000000 [ 17.145082] Call trace: [ 17.147524] check_unmap+0x6a0/0x900 [ 17.151091] debug_dma_unmap_page+0x88/0x90 [ 17.155266] gem_rx+0x114/0x2f0 [ 17.158396] macb_poll+0x58/0x100 [ 17.161705] net_rx_action+0x118/0x400 [ 17.165445] __do_softirq+0x138/0x36c [ 17.169100] irq_exit+0x98/0xc0 [ 17.172234] __handle_domain_irq+0x64/0xc0 [ 17.176320] gic_handle_irq+0x5c/0xc0 [ 17.179974] el1_irq+0xb8/0x140 [ 
17.183109] xiic_process+0x5c/0xe30 [ 17.186677] irq_thread_fn+0x28/0x90 [ 17.190244] irq_thread+0x208/0x2a0 [ 17.193724] kthread+0x130/0x140 [ 17.196945] ret_from_fork+0x10/0x20 [ 17.200510] ---[ end trace 7240980785f81d6f ]--- [ 237.021490] ------------[ cut here ]------------ [ 237.026129] DMA-API: exceeded 7 overlapping mappings of cacheline 0x0000000021d79e7b [ 237.033886] WARNING: CPU: 0 PID: 0 at kernel/dma/debug.c:499 add_dma_entry+0x214/0x240 [ 237.041802] Modules linked in: xxxxx [ 237.061637] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W 5.4.0 #28 [ 237.068941] Hardware name: xxxxx [ 237.073116] pstate: 80000085 (Nzcv daIf -PAN -UAO) [ 237.077900] pc : add_dma_entry+0x214/0x240 [ 237.081986] lr : add_dma_entry+0x214/0x240 [ 237.086072] sp : ffffffc010003c30 [ 237.089379] x29: ffffffc010003c30 x28: ffffff8878a0be00 [ 237.094683] x27: 0000000000000180 x26: ffffff8878e387c0 [ 237.099987] x25: 0000000000000002 x24: 0000000000000000 [ 237.105290] x23: 000000000000003b x22: ffffffc010a0fa00 [ 237.110594] x21: 0000000021d79e7b x20: ffffffc010abe600 [ 237.115897] x19: 00000000ffffffef x18: 0000000000000010 [ 237.121201] x17: 0000000000000000 x16: 0000000000000000 [ 237.126504] x15: ffffffc010a0fdc8 x14: 0720072007200720 [ 237.131807] x13: 0720072007200720 x12: 0720072007200720 [ 237.137111] x11: 0720072007200720 x10: 0720072007200720 [ 237.142415] x9 : 0720072007200720 x8 : 0000000000000259 [ 237.147718] x7 : 0000000000000001 x6 : 0000000000000000 [ 237.153022] x5 : ffffffc010003a20 x4 : 0000000000000001 [ 237.158325] x3 : 0000000000000006 x2 : 0000000000000007 [ 237.163628] x1 : 8ac721b3a7dc1c00 x0 : 0000000000000000 [ 237.168932] Call trace: [ 237.171373] add_dma_entry+0x214/0x240 [ 237.175115] debug_dma_map_page+0xf8/0x120 [ 237.179203] gem_rx_refill+0x190/0x280 [ 237.182942] gem_rx+0x224/0x2f0 [ 237.186075] macb_poll+0x58/0x100 [ 237.189384] net_rx_action+0x118/0x400 [ 237.193125] __do_softirq+0x138/0x36c [ 237.196780] irq_exit+0x98/0xc0 [ 237.199914] 
__handle_domain_irq+0x64/0xc0 [ 237.204000] gic_handle_irq+0x5c/0xc0 [ 237.207654] el1_irq+0xb8/0x140 [ 237.210789] arch_cpu_idle+0x40/0x200 [ 237.214444] default_idle_call+0x18/0x30 [ 237.218359] do_idle+0x200/0x280 [ 237.221578] cpu_startup_entry+0x20/0x30 [ 237.225493] rest_init+0xe4/0xf0 [ 237.228713] arch_call_rest_init+0xc/0x14 [ 237.232714] start_kernel+0x47c/0x4a8 [ 237.236367] ---[ end trace 7240980785f81d70 ]--- Lars was fast to find an explanation: according to the datasheet bit 2 of the rx buffer descriptor entry has a different meaning in the extended mode: Address [2] of beginning of buffer, or in extended buffer descriptor mode (DMA configuration register [28] = 1), indicates a valid timestamp in the buffer descriptor entry. The macb driver didn't mask this bit while getting an address and it eventually caused a memory corruption and a dma failure. The problem is resolved by explicitly clearing the problematic bit if hw timestamping is used. Fixes: 7b4296148066 ("net: macb: Add support for PTP timestamps in DMA descriptors") Signed-off-by: Roman Gushchin Co-developed-by: Lars-Peter Clausen Signed-off-by: Lars-Peter Clausen Acked-by: Nicolas Ferre Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20230412232144.770336-1-roman.gushchin@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 66e30561569eb..e43d99ec50ba2 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1064,6 +1064,10 @@ static dma_addr_t macb_get_addr(struct macb *bp, struct macb_dma_desc *desc) } #endif addr |= MACB_BF(RX_WADDR, MACB_BFEXT(RX_WADDR, desc->addr)); +#ifdef CONFIG_MACB_USE_HWSTAMP + if (bp->hw_dma_cap & HW_DMA_CAP_PTP) + addr &= ~GEM_BIT(DMA_RXVALID); +#endif return addr; } From 0646dc31ca886693274df5749cd0c8c1eaaeb5ca Mon Sep 17 00:00:00 
2001 From: Liang Chen Date: Thu, 13 Apr 2023 17:03:53 +0800 Subject: [PATCH 44/50] skbuff: Fix a race between coalescing and releasing SKBs Commit 1effe8ca4e34 ("skbuff: fix coalescing for page_pool fragment recycling") allowed coalescing to proceed with non page pool page and page pool page when @from is cloned, i.e. to->pp_recycle --> false from->pp_recycle --> true skb_cloned(from) --> true However, it actually requires skb_cloned(@from) to hold true until coalescing finishes in this situation. If the other cloned SKB is released while the merging is in process, from_shinfo->nr_frags will be set to 0 toward the end of the function, causing the increment of frag page _refcount to be unexpectedly skipped resulting in inconsistent reference counts. Later when SKB(@to) is released, it frees the page directly even though the page pool page is still in use, leading to use-after-free or double-free errors. So it should be prohibited. The double-free error message below prompted us to investigate: BUG: Bad page state in process swapper/1 pfn:0e0d1 page:00000000c6548b28 refcount:-1 mapcount:0 mapping:0000000000000000 index:0x2 pfn:0xe0d1 flags: 0xfffffc0000000(node=0|zone=1|lastcpupid=0x1fffff) raw: 000fffffc0000000 0000000000000000 ffffffff00000101 0000000000000000 raw: 0000000000000002 0000000000000000 ffffffffffffffff 0000000000000000 page dumped because: nonzero _refcount CPU: 1 PID: 0 Comm: swapper/1 Tainted: G E 6.2.0+ Call Trace: dump_stack_lvl+0x32/0x50 bad_page+0x69/0xf0 free_pcp_prepare+0x260/0x2f0 free_unref_page+0x20/0x1c0 skb_release_data+0x10b/0x1a0 napi_consume_skb+0x56/0x150 net_rx_action+0xf0/0x350 ? 
__napi_schedule+0x79/0x90 __do_softirq+0xc8/0x2b1 __irq_exit_rcu+0xb9/0xf0 common_interrupt+0x82/0xa0 asm_common_interrupt+0x22/0x40 RIP: 0010:default_idle+0xb/0x20 Fixes: 53e0961da1c7 ("page_pool: add frag page recycling support in page pool") Signed-off-by: Liang Chen Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230413090353.14448-1-liangchen.linux@gmail.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1a31815104d61..4c0879798eb8a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5599,18 +5599,18 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_cloned(to)) return false; - /* In general, avoid mixing slab allocated and page_pool allocated - * pages within the same SKB. However when @to is not pp_recycle and - * @from is cloned, we can transition frag pages from page_pool to - * reference counted. - * - * On the other hand, don't allow coalescing two pp_recycle SKBs if - * @from is cloned, in case the SKB is using page_pool fragment + /* In general, avoid mixing page_pool and non-page_pool allocated + * pages within the same SKB. Additionally avoid dealing with clones + * with page_pool pages, in case the SKB is using page_pool fragment * references (PP_FLAG_PAGE_FRAG). Since we only take full page * references for cloned SKBs at the moment that would result in * inconsistent reference counts. + * In theory we could take full references if @from is cloned and + * !@to->pp_recycle but its tricky (due to potential race with + * the clone disappearing) and rare, so not worth dealing with. 
*/ - if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from))) + if (to->pp_recycle != from->pp_recycle || + (from->pp_recycle && skb_cloned(from))) return false; if (len <= skb_tailroom(to)) { From e8163b98d96c4d87c870689f560c53be7ccd55c8 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 12 Apr 2023 21:48:35 +0200 Subject: [PATCH 45/50] selftests/bpf: xdp_hw_metadata remove bpf_printk and add counters The tool xdp_hw_metadata can be used by driver developers implementing XDP-hints metadata kfuncs. Remove all bpf_printk calls, as the tool already transfers all the XDP-hints related information via metadata area to AF_XDP userspace process. Add counters for providing remaining information about failure and skipped packet events. Signed-off-by: Jesper Dangaard Brouer Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/168132891533.340624.7313781245316405141.stgit@firesoul Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/xdp_hw_metadata.c | 36 +++++++++++-------- tools/testing/selftests/bpf/xdp_hw_metadata.c | 4 ++- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c index 4c55b4d79d3d4..0687d11162f6b 100644 --- a/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c @@ -12,6 +12,10 @@ struct { __type(value, __u32); } xsk SEC(".maps"); +__u64 pkts_skip = 0; +__u64 pkts_fail = 0; +__u64 pkts_redir = 0; + extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, __u64 *timestamp) __ksym; extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, @@ -26,7 +30,7 @@ int rx(struct xdp_md *ctx) struct udphdr *udp = NULL; struct iphdr *iph = NULL; struct xdp_meta *meta; - int ret; + int err; data = (void *)(long)ctx->data; data_end = (void *)(long)ctx->data_end; @@ -46,17 +50,20 @@ int rx(struct xdp_md *ctx) udp = NULL; } - if (!udp) + if (!udp) { + 
__sync_add_and_fetch(&pkts_skip, 1); return XDP_PASS; + } - if (udp->dest != bpf_htons(9091)) + /* Forwarding UDP:9091 to AF_XDP */ + if (udp->dest != bpf_htons(9091)) { + __sync_add_and_fetch(&pkts_skip, 1); return XDP_PASS; + } - bpf_printk("forwarding UDP:9091 to AF_XDP"); - - ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(struct xdp_meta)); - if (ret != 0) { - bpf_printk("bpf_xdp_adjust_meta returned %d", ret); + err = bpf_xdp_adjust_meta(ctx, -(int)sizeof(struct xdp_meta)); + if (err) { + __sync_add_and_fetch(&pkts_fail, 1); return XDP_PASS; } @@ -65,20 +72,19 @@ int rx(struct xdp_md *ctx) meta = data_meta; if (meta + 1 > data) { - bpf_printk("bpf_xdp_adjust_meta doesn't appear to work"); + __sync_add_and_fetch(&pkts_fail, 1); return XDP_PASS; } - if (!bpf_xdp_metadata_rx_timestamp(ctx, &meta->rx_timestamp)) - bpf_printk("populated rx_timestamp with %llu", meta->rx_timestamp); - else + err = bpf_xdp_metadata_rx_timestamp(ctx, &meta->rx_timestamp); + if (err) meta->rx_timestamp = 0; /* Used by AF_XDP as not avail signal */ - if (!bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash)) - bpf_printk("populated rx_hash with %u", meta->rx_hash); - else + err = bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash); + if (err) meta->rx_hash = 0; /* Used by AF_XDP as not avail signal */ + __sync_add_and_fetch(&pkts_redir, 1); return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS); } diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 1c8acb68b977c..3b942ef7297bf 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -212,7 +212,9 @@ static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd) while (true) { errno = 0; ret = poll(fds, rxq + 1, 1000); - printf("poll: %d (%d)\n", ret, errno); + printf("poll: %d (%d) skip=%llu fail=%llu redir=%llu\n", + ret, errno, bpf_obj->bss->pkts_skip, + bpf_obj->bss->pkts_fail, bpf_obj->bss->pkts_redir); if (ret < 0) 
break; if (ret == 0) From 0cd917a4a8ace70ff9082d797c899f6bf10de910 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 12 Apr 2023 21:48:40 +0200 Subject: [PATCH 46/50] xdp: rss hash types representation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RSS hash type specifies what portion of packet data NIC hardware used when calculating RSS hash value. The RSS types are focused on Internet traffic protocols at OSI layers L3 and L4. L2 (e.g. ARP) often gets hash value zero and no RSS type. For L3 the focus is on IPv4 vs. IPv6, and for L4 primarily on TCP vs. UDP, but some hardware supports SCTP. Hardware RSS types are differently encoded for each hardware NIC. Most hardware represents RSS hash type as a number. Determining L3 vs L4 often requires a mapping table as there often isn't a pattern or sorting according to OSI layer. This patch introduces an XDP RSS hash type (enum xdp_rss_hash_type) that contains both BITs for the L3/L4 types, and combinations to be used by drivers for their mapping tables. The enum xdp_rss_hash_type gets exposed to BPF via BTF, and it is up to the BPF-programmer to match using these defines. This proposal changes the kfunc API bpf_xdp_metadata_rx_hash() by adding a pointer argument to provide the RSS hash type. Change signature for all xmo_rx_hash calls in drivers to make it compile. The RSS type implementations for each driver come as separate patches.
Fixes: 3d76a4d3d4e5 ("bpf: XDP metadata RX kfuncs") Signed-off-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/168132892042.340624.582563003880565460.stgit@firesoul Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 3 +- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 +- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 3 +- drivers/net/veth.c | 3 +- include/linux/netdevice.h | 3 +- include/net/xdp.h | 45 +++++++++++++++++++ net/core/xdp.c | 10 ++++- 7 files changed, 64 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 4b5e459b6d49f..73d10aa4c503f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -681,7 +681,8 @@ int mlx4_en_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) return 0; } -int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash) +int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) { struct mlx4_en_xdp_buff *_ctx = (void *)ctx; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 544e09b97483c..4ac4d883047b1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -798,7 +798,8 @@ int mlx4_en_netdev_event(struct notifier_block *this, struct xdp_md; int mlx4_en_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp); -int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash); +int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type); /* * Functions for time stamping diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index c5dae48b7932f..efe609f8e3aac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -169,7 +169,8 @@ static int mlx5e_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) return 0; } -static int mlx5e_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash) +static int mlx5e_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) { const struct mlx5e_xdp_buff *_ctx = (void *)ctx; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index c1178915496d8..424e8876a16b6 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1648,7 +1648,8 @@ static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) return 0; } -static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash) +static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) { struct veth_xdp_buff *_ctx = (void *)ctx; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 470085b121d3c..c35f04f636f15 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1624,7 +1624,8 @@ struct net_device_ops { struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); - int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash); + int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type); }; /** diff --git a/include/net/xdp.h b/include/net/xdp.h index 41c57b8b16714..a76c4ea203eaa 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -8,6 +8,7 @@ #include /* skb_shared_info */ #include +#include /** * DOC: XDP RX-queue information @@ -425,6 +426,50 @@ XDP_METADATA_KFUNC_xxx MAX_XDP_METADATA_KFUNC, }; +enum xdp_rss_hash_type { + /* First part: Individual bits for L3/L4 types */ + XDP_RSS_L3_IPV4 = BIT(0), + XDP_RSS_L3_IPV6 = BIT(1), + + /* The fixed (L3) IPv4 and IPv6 headers can both be followed by + * variable/dynamic headers, IPv4 called Options and IPv6 called + * Extension Headers. HW RSS type can contain this info. 
+ */ + XDP_RSS_L3_DYNHDR = BIT(2), + + /* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in + * addition to the protocol specific bit. This ease interaction with + * SKBs and avoids reserving a fixed mask for future L4 protocol bits. + */ + XDP_RSS_L4 = BIT(3), /* L4 based hash, proto can be unknown */ + XDP_RSS_L4_TCP = BIT(4), + XDP_RSS_L4_UDP = BIT(5), + XDP_RSS_L4_SCTP = BIT(6), + XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ + + /* Second part: RSS hash type combinations used for driver HW mapping */ + XDP_RSS_TYPE_NONE = 0, + XDP_RSS_TYPE_L2 = XDP_RSS_TYPE_NONE, + + XDP_RSS_TYPE_L3_IPV4 = XDP_RSS_L3_IPV4, + XDP_RSS_TYPE_L3_IPV6 = XDP_RSS_L3_IPV6, + XDP_RSS_TYPE_L3_IPV4_OPT = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR, + XDP_RSS_TYPE_L3_IPV6_EX = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR, + + XDP_RSS_TYPE_L4_ANY = XDP_RSS_L4, + XDP_RSS_TYPE_L4_IPV4_TCP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP, + XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, + XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, + + XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, + XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, + XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, + + XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, + XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, + XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR, +}; + #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); diff --git a/net/core/xdp.c b/net/core/xdp.c index 528d4b37983df..fb85aca819619 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -734,13 +734,21 @@ __bpf_kfunc int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *tim * bpf_xdp_metadata_rx_hash - Read XDP frame RX hash. * @ctx: XDP context pointer. 
* @hash: Return value pointer. + * @rss_type: Return value pointer for RSS type. + * + * The RSS hash type (@rss_type) specifies what portion of packet headers NIC + * hardware used when calculating RSS hash value. The RSS type can be decoded + * via &enum xdp_rss_hash_type either matching on individual L3/L4 bits + * ``XDP_RSS_L*`` or by combined traditional *RSS Hashing Types* + * ``XDP_RSS_TYPE_L*``. * * Return: * * Returns 0 on success or ``-errno`` on error. * * ``-EOPNOTSUPP`` : means device driver doesn't implement kfunc * * ``-ENODATA`` : means no RX-hash available for this frame */ -__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash) +__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) { return -EOPNOTSUPP; } From 67f245c2ec0af17d7a90c78910e28bc8b206297c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 12 Apr 2023 21:48:45 +0200 Subject: [PATCH 47/50] mlx5: bpf_xdp_metadata_rx_hash add xdp rss hash type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update API for bpf_xdp_metadata_rx_hash() with arg for xdp rss hash type via mapping table. The mlx5 hardware can also identify and RSS hash IPSEC. This indicates that the hash includes the SPI (Security Parameters Index) as part of the IPSEC hash. Extend xdp core enum xdp_rss_hash_type with IPSEC hash type.
Fixes: bc8d405b1ba9 ("net/mlx5e: Support RX XDP metadata") Signed-off-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/168132892548.340624.11185734579430124869.stgit@firesoul Signed-off-by: Alexei Starovoitov --- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 60 ++++++++++++++++++- include/linux/mlx5/device.h | 14 ++++- include/net/xdp.h | 2 + 3 files changed, 73 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index efe609f8e3aac..d9d3b9e1f15aa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -34,6 +34,7 @@ #include #include "en/xdp.h" #include "en/params.h" +#include int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk) { @@ -169,15 +170,72 @@ static int mlx5e_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp) return 0; } +/* Mapping HW RSS Type bits CQE_RSS_HTYPE_IP + CQE_RSS_HTYPE_L4 into 4-bits*/ +#define RSS_TYPE_MAX_TABLE 16 /* 4-bits max 16 entries */ +#define RSS_L4 GENMASK(1, 0) +#define RSS_L3 GENMASK(3, 2) /* Same as CQE_RSS_HTYPE_IP */ + +/* Valid combinations of CQE_RSS_HTYPE_IP + CQE_RSS_HTYPE_L4 sorted numerical */ +enum mlx5_rss_hash_type { + RSS_TYPE_NO_HASH = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IP_NONE) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_NONE)), + RSS_TYPE_L3_IPV4 = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV4) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_NONE)), + RSS_TYPE_L4_IPV4_TCP = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV4) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_TCP)), + RSS_TYPE_L4_IPV4_UDP = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV4) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_UDP)), + RSS_TYPE_L4_IPV4_IPSEC = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV4) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_IPSEC)), + RSS_TYPE_L3_IPV6 = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV6) | + FIELD_PREP_CONST(RSS_L4, 
CQE_RSS_L4_NONE)), + RSS_TYPE_L4_IPV6_TCP = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV6) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_TCP)), + RSS_TYPE_L4_IPV6_UDP = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV6) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_UDP)), + RSS_TYPE_L4_IPV6_IPSEC = (FIELD_PREP_CONST(RSS_L3, CQE_RSS_IPV6) | + FIELD_PREP_CONST(RSS_L4, CQE_RSS_L4_IPSEC)), +}; + +/* Invalid combinations will simply return zero, allows no boundary checks */ +static const enum xdp_rss_hash_type mlx5_xdp_rss_type[RSS_TYPE_MAX_TABLE] = { + [RSS_TYPE_NO_HASH] = XDP_RSS_TYPE_NONE, + [1] = XDP_RSS_TYPE_NONE, /* Implicit zero */ + [2] = XDP_RSS_TYPE_NONE, /* Implicit zero */ + [3] = XDP_RSS_TYPE_NONE, /* Implicit zero */ + [RSS_TYPE_L3_IPV4] = XDP_RSS_TYPE_L3_IPV4, + [RSS_TYPE_L4_IPV4_TCP] = XDP_RSS_TYPE_L4_IPV4_TCP, + [RSS_TYPE_L4_IPV4_UDP] = XDP_RSS_TYPE_L4_IPV4_UDP, + [RSS_TYPE_L4_IPV4_IPSEC] = XDP_RSS_TYPE_L4_IPV4_IPSEC, + [RSS_TYPE_L3_IPV6] = XDP_RSS_TYPE_L3_IPV6, + [RSS_TYPE_L4_IPV6_TCP] = XDP_RSS_TYPE_L4_IPV6_TCP, + [RSS_TYPE_L4_IPV6_UDP] = XDP_RSS_TYPE_L4_IPV6_UDP, + [RSS_TYPE_L4_IPV6_IPSEC] = XDP_RSS_TYPE_L4_IPV6_IPSEC, + [12] = XDP_RSS_TYPE_NONE, /* Implicit zero */ + [13] = XDP_RSS_TYPE_NONE, /* Implicit zero */ + [14] = XDP_RSS_TYPE_NONE, /* Implicit zero */ + [15] = XDP_RSS_TYPE_NONE, /* Implicit zero */ +}; + static int mlx5e_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type) { const struct mlx5e_xdp_buff *_ctx = (void *)ctx; + const struct mlx5_cqe64 *cqe = _ctx->cqe; + u32 hash_type, l4_type, ip_type, lookup; if (unlikely(!(_ctx->xdp.rxq->dev->features & NETIF_F_RXHASH))) return -ENODATA; - *hash = be32_to_cpu(_ctx->cqe->rss_hash_result); + *hash = be32_to_cpu(cqe->rss_hash_result); + + hash_type = cqe->rss_hash_type; + BUILD_BUG_ON(CQE_RSS_HTYPE_IP != RSS_L3); /* same mask */ + ip_type = hash_type & CQE_RSS_HTYPE_IP; + l4_type = FIELD_GET(CQE_RSS_HTYPE_L4, hash_type); + lookup = ip_type | l4_type; + *rss_type = mlx5_xdp_rss_type[lookup]; + 
return 0; } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 71b06ebad4024..1db19a9d26e32 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -36,6 +36,7 @@ #include #include #include +#include #if defined(__LITTLE_ENDIAN) #define MLX5_SET_HOST_ENDIANNESS 0 @@ -980,14 +981,23 @@ enum { }; enum { - CQE_RSS_HTYPE_IP = 0x3 << 2, + CQE_RSS_HTYPE_IP = GENMASK(3, 2), /* cqe->rss_hash_type[3:2] - IP destination selected for hash * (00 = none, 01 = IPv4, 10 = IPv6, 11 = Reserved) */ - CQE_RSS_HTYPE_L4 = 0x3 << 6, + CQE_RSS_IP_NONE = 0x0, + CQE_RSS_IPV4 = 0x1, + CQE_RSS_IPV6 = 0x2, + CQE_RSS_RESERVED = 0x3, + + CQE_RSS_HTYPE_L4 = GENMASK(7, 6), /* cqe->rss_hash_type[7:6] - L4 destination selected for hash * (00 = none, 01 = TCP. 10 = UDP, 11 = IPSEC.SPI */ + CQE_RSS_L4_NONE = 0x0, + CQE_RSS_L4_TCP = 0x1, + CQE_RSS_L4_UDP = 0x2, + CQE_RSS_L4_IPSEC = 0x3, }; enum { diff --git a/include/net/xdp.h b/include/net/xdp.h index a76c4ea203eaa..76aa748e79237 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -460,10 +460,12 @@ enum xdp_rss_hash_type { XDP_RSS_TYPE_L4_IPV4_TCP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, + XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, + XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, From 96b1a098f3db06223a6b6268e756f980d5c07f10 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 12 Apr 2023 
21:48:50 +0200 Subject: [PATCH 48/50] veth: bpf_xdp_metadata_rx_hash add xdp rss hash type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update API for bpf_xdp_metadata_rx_hash() with arg for xdp rss hash type. The veth driver currently only supports XDP-hints based on the SKB code path. The SKB has lost information about the RSS hash type, by compressing the information down to a single bitfield skb->l4_hash, that only knows whether this was an L4 hash value. In preparation for veth, the xdp_rss_hash_type has an L4 indication bit that allows us to return a meaningful L4 indication when working with SKB-based packets. Fixes: 306531f0249f ("veth: Support RX XDP metadata") Signed-off-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/168132893055.340624.16209448340644513469.stgit@firesoul Signed-off-by: Alexei Starovoitov --- drivers/net/veth.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 424e8876a16b6..e1b38fbf1dd95 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1652,11 +1652,14 @@ static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type) { struct veth_xdp_buff *_ctx = (void *)ctx; + struct sk_buff *skb = _ctx->skb; - if (!_ctx->skb) + if (!skb) return -ENODATA; - *hash = skb_get_hash(_ctx->skb); + *hash = skb_get_hash(skb); + *rss_type = skb->l4_hash ? 
XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE; + return 0; } From 9123397aeeb4f93dda5828e37c35312f1b62231e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 12 Apr 2023 21:48:55 +0200 Subject: [PATCH 49/50] mlx4: bpf_xdp_metadata_rx_hash add xdp rss hash type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update API for bpf_xdp_metadata_rx_hash() with arg for xdp rss hash type via matching individual Completion Queue Entry (CQE) status bits. Fixes: ab46182d0dcb ("net/mlx4_en: Support RX XDP metadata") Signed-off-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/168132893562.340624.12779118462402031248.stgit@firesoul Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 73d10aa4c503f..332472fe49902 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -685,11 +685,28 @@ int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type) { struct mlx4_en_xdp_buff *_ctx = (void *)ctx; + struct mlx4_cqe *cqe = _ctx->cqe; + enum xdp_rss_hash_type xht = 0; + __be16 status; if (unlikely(!(_ctx->dev->features & NETIF_F_RXHASH))) return -ENODATA; - *hash = be32_to_cpu(_ctx->cqe->immed_rss_invalid); + *hash = be32_to_cpu(cqe->immed_rss_invalid); + status = cqe->status; + if (status & cpu_to_be16(MLX4_CQE_STATUS_TCP)) + xht = XDP_RSS_L4_TCP; + if (status & cpu_to_be16(MLX4_CQE_STATUS_UDP)) + xht = XDP_RSS_L4_UDP; + if (status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_IPV4F)) + xht |= XDP_RSS_L3_IPV4; + if (status & cpu_to_be16(MLX4_CQE_STATUS_IPV6)) { + xht |= XDP_RSS_L3_IPV6; + if (cqe->ipv6_ext_mask) + xht |= XDP_RSS_L3_DYNHDR; + } + *rss_type = xht; + return 
0; } From 0f26b74e7d071b0dc18e2c43d79d496c2b144035 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 12 Apr 2023 21:49:00 +0200 Subject: [PATCH 50/50] selftests/bpf: Adjust bpf_xdp_metadata_rx_hash for new arg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update BPF selftests to use the new RSS type argument for kfunc bpf_xdp_metadata_rx_hash. Signed-off-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/168132894068.340624.8914711185697163690.stgit@firesoul Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/xdp_metadata.c | 2 ++ tools/testing/selftests/bpf/progs/xdp_hw_metadata.c | 10 +++++----- tools/testing/selftests/bpf/progs/xdp_metadata.c | 6 +++--- tools/testing/selftests/bpf/progs/xdp_metadata2.c | 7 ++++--- tools/testing/selftests/bpf/xdp_hw_metadata.c | 6 +++++- tools/testing/selftests/bpf/xdp_metadata.h | 4 ++++ 6 files changed, 23 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index aa4beae99f4f6..8c5e98da9ae9f 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -273,6 +273,8 @@ static int verify_xsk_metadata(struct xsk *xsk) if (!ASSERT_NEQ(meta->rx_hash, 0, "rx_hash")) return -1; + ASSERT_EQ(meta->rx_hash_type, 0, "rx_hash_type"); + xsk_ring_cons__release(&xsk->rx, 1); refill_rx(xsk, comp_addr); diff --git a/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c index 0687d11162f6b..e1c787815e44b 100644 --- a/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/progs/xdp_hw_metadata.c @@ -18,8 +18,8 @@ __u64 pkts_redir = 0; extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, __u64 *timestamp) __ksym; -extern int 
bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, - __u32 *hash) __ksym; +extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, __u32 *hash, + enum xdp_rss_hash_type *rss_type) __ksym; SEC("xdp") int rx(struct xdp_md *ctx) @@ -80,9 +80,9 @@ int rx(struct xdp_md *ctx) if (err) meta->rx_timestamp = 0; /* Used by AF_XDP as not avail signal */ - err = bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash); - if (err) - meta->rx_hash = 0; /* Used by AF_XDP as not avail signal */ + err = bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash, &meta->rx_hash_type); + if (err < 0) + meta->rx_hash_err = err; /* Used by AF_XDP as no hash signal */ __sync_add_and_fetch(&pkts_redir, 1); return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS); diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata.c b/tools/testing/selftests/bpf/progs/xdp_metadata.c index 77678b0343897..d151d406a123e 100644 --- a/tools/testing/selftests/bpf/progs/xdp_metadata.c +++ b/tools/testing/selftests/bpf/progs/xdp_metadata.c @@ -21,8 +21,8 @@ struct { extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, __u64 *timestamp) __ksym; -extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, - __u32 *hash) __ksym; +extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, __u32 *hash, + enum xdp_rss_hash_type *rss_type) __ksym; SEC("xdp") int rx(struct xdp_md *ctx) @@ -56,7 +56,7 @@ int rx(struct xdp_md *ctx) if (timestamp == 0) meta->rx_timestamp = 1; - bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash); + bpf_xdp_metadata_rx_hash(ctx, &meta->rx_hash, &meta->rx_hash_type); return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS); } diff --git a/tools/testing/selftests/bpf/progs/xdp_metadata2.c b/tools/testing/selftests/bpf/progs/xdp_metadata2.c index cf69d05451c39..85f88d9d7a785 100644 --- a/tools/testing/selftests/bpf/progs/xdp_metadata2.c +++ b/tools/testing/selftests/bpf/progs/xdp_metadata2.c @@ -5,17 +5,18 @@ #include #include -extern int bpf_xdp_metadata_rx_hash(const struct xdp_md 
*ctx, - __u32 *hash) __ksym; +extern int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, __u32 *hash, + enum xdp_rss_hash_type *rss_type) __ksym; int called; SEC("freplace/rx") int freplace_rx(struct xdp_md *ctx) { + enum xdp_rss_hash_type type = 0; u32 hash = 0; /* Call _any_ metadata function to make sure we don't crash. */ - bpf_xdp_metadata_rx_hash(ctx, &hash); + bpf_xdp_metadata_rx_hash(ctx, &hash, &type); called++; return XDP_PASS; } diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 3b942ef7297bf..987cf0db5ebc8 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -141,7 +141,11 @@ static void verify_xdp_metadata(void *data) meta = data - sizeof(*meta); printf("rx_timestamp: %llu\n", meta->rx_timestamp); - printf("rx_hash: %u\n", meta->rx_hash); + if (meta->rx_hash_err < 0) + printf("No rx_hash err=%d\n", meta->rx_hash_err); + else + printf("rx_hash: 0x%X with RSS type:0x%X\n", + meta->rx_hash, meta->rx_hash_type); } static void verify_skb_metadata(int fd) diff --git a/tools/testing/selftests/bpf/xdp_metadata.h b/tools/testing/selftests/bpf/xdp_metadata.h index f6780fbb0a214..0c4624dc6f2f7 100644 --- a/tools/testing/selftests/bpf/xdp_metadata.h +++ b/tools/testing/selftests/bpf/xdp_metadata.h @@ -12,4 +12,8 @@ struct xdp_meta { __u64 rx_timestamp; __u32 rx_hash; + union { + __u32 rx_hash_type; + __s32 rx_hash_err; + }; };