From c05c5e5aa163f4682ca97a2f0536575fc7dbdecb Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Tue, 12 Nov 2024 14:10:31 +0200 Subject: [PATCH 01/88] xfrm: replay: Fix the update of replay_esn->oseq_hi for GSO When skb needs GSO and wrap around happens, if xo->seq.low (seqno of the first skb segment) is before the last seq number but oseq (seqno of the last segment) is after it, xo->seq.low is still bigger than replay_esn->oseq while oseq is smaller than it, so the update of replay_esn->oseq_hi is missed for this case wrap around because of the change in the cited commit. For example, if sending a packet with gso_segs=3 while old replay_esn->oseq=0xfffffffe, we calculate: xo->seq.low = 0xfffffffe + 1 = 0x0xffffffff oseq = 0xfffffffe + 3 = 0x1 (oseq < replay_esn->oseq) is true, but (xo->seq.low < replay_esn->oseq) is false, so replay_esn->oseq_hi is not incremented. To fix this issue, change the outer checking back for the update of replay_esn->oseq_hi. And add new checking inside for the update of packet's oseq_hi. Fixes: 4b549ccce941 ("xfrm: replay: Fix ESN wrap around for GSO") Signed-off-by: Jianbo Liu Reviewed-by: Patrisious Haddad Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_replay.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index bc56c63057252..235bbefc2abae 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -714,10 +714,12 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff oseq += skb_shinfo(skb)->gso_segs; } - if (unlikely(xo->seq.low < replay_esn->oseq)) { - XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi; - xo->seq.hi = oseq_hi; - replay_esn->oseq_hi = oseq_hi; + if (unlikely(oseq < replay_esn->oseq)) { + replay_esn->oseq_hi = ++oseq_hi; + if (xo->seq.low < replay_esn->oseq) { + XFRM_SKB_CB(skb)->seq.output.hi = oseq_hi; + xo->seq.hi = oseq_hi; + } if (replay_esn->oseq_hi == 0) { replay_esn->oseq--; replay_esn->oseq_hi--; From e952837f3ddb0ff726d5b582aa1aad9aa38d024d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 28 Nov 2024 15:26:25 +0100 Subject: [PATCH 02/88] xfrm: state: fix out-of-bounds read during lookup lookup and resize can run in parallel. The xfrm_state_hash_generation seqlock ensures a retry, but the hash functions can observe a hmask value that is too large for the new hlist array. rehash does: rcu_assign_pointer(net->xfrm.state_bydst, ndst) [..] net->xfrm.state_hmask = nhashmask; While state lookup does: h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { This is only safe in case the update to state_bydst is larger than net->xfrm.xfrm_state_hmask (or if the lookup function gets serialized via state spinlock again). Fix this by prefetching state_hmask and the associated pointers. The xfrm_state_hash_generation seqlock retry will ensure that the pointer and the hmask will be consistent. The existing helpers, like xfrm_dst_hash(), are now unsafe for RCU side, add lockdep assertions to document that they are only safe for insert side. xfrm_state_lookup_byaddr() uses the spinlock rather than RCU. AFAICS this is an oversight from back when state lookup was converted to RCU, this lock should be replaced with RCU in a future patch. Reported-by: syzbot+5f9f31cb7d985f584d8e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/CACT4Y+azwfrE3uz6A5ZErov5YN2LYBN5KrsymBerT36VU8qzBA@mail.gmail.com/ Diagnosed-by: Dmitry Vyukov Fixes: c2f672fc9464 ("xfrm: state lookup can be lockless") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 89 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 19 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 67ca7ac955a37..1781728ca4285 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -34,6 +34,8 @@ #define xfrm_state_deref_prot(table, net) \ rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) +#define xfrm_state_deref_check(table, net) \ + rcu_dereference_check((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) static void xfrm_state_gc_task(struct work_struct *work); @@ -62,6 +64,8 @@ static inline unsigned int xfrm_dst_hash(struct net *net, u32 reqid, unsigned short family) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_dst_hash(daddr, saddr, reqid, family, net->xfrm.state_hmask); } @@ -70,6 +74,8 @@ static inline unsigned int xfrm_src_hash(struct net *net, const xfrm_address_t *saddr, unsigned short family) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_src_hash(daddr, saddr, family, net->xfrm.state_hmask); } @@ -77,11 +83,15 @@ static inline unsigned int xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask); } static unsigned int xfrm_seq_hash(struct net *net, u32 seq) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_seq_hash(seq, net->xfrm.state_hmask); } @@ -1041,16 +1051,38 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, x->props.family = tmpl->encap_family; } -static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark, +struct xfrm_hash_state_ptrs { + const struct hlist_head *bydst; + const struct hlist_head *bysrc; + const struct hlist_head *byspi; + unsigned int hmask; +}; + +static void xfrm_hash_ptrs_get(const struct net *net, struct xfrm_hash_state_ptrs *ptrs) +{ + unsigned int sequence; + + do { + sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); + + ptrs->bydst = xfrm_state_deref_check(net->xfrm.state_bydst, net); + ptrs->bysrc = xfrm_state_deref_check(net->xfrm.state_bysrc, net); + ptrs->byspi = xfrm_state_deref_check(net->xfrm.state_byspi, net); + ptrs->hmask = net->xfrm.state_hmask; + } while (read_seqcount_retry(&net->xfrm.xfrm_state_hash_generation, sequence)); +} + +static struct xfrm_state *__xfrm_state_lookup_all(const struct xfrm_hash_state_ptrs *state_ptrs, + u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family, struct xfrm_dev_offload *xdo) { - unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); + unsigned int h = __xfrm_spi_hash(daddr, spi, proto, family, state_ptrs->hmask); struct xfrm_state *x; - hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { + hlist_for_each_entry_rcu(x, state_ptrs->byspi + h, byspi) { #ifdef CONFIG_XFRM_OFFLOAD if (xdo->type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) @@ -1084,15 +1116,16 @@ static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark, return NULL; } -static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, +static struct xfrm_state *__xfrm_state_lookup(const struct xfrm_hash_state_ptrs *state_ptrs, + u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { - unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); + unsigned int h = __xfrm_spi_hash(daddr, spi, proto, family, state_ptrs->hmask); struct xfrm_state *x; - hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { + hlist_for_each_entry_rcu(x, state_ptrs->byspi + h, byspi) { if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || @@ -1114,6 +1147,7 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, __be32 spi, u8 proto, unsigned short family) { + struct xfrm_hash_state_ptrs state_ptrs; struct hlist_head *state_cache_input; struct xfrm_state *x = NULL; int cpu = get_cpu(); @@ -1135,7 +1169,9 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, goto out; } - x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); + xfrm_hash_ptrs_get(net, &state_ptrs); + + x = __xfrm_state_lookup(&state_ptrs, mark, daddr, spi, proto, family); if (x && x->km.state == XFRM_STATE_VALID) { spin_lock_bh(&net->xfrm.xfrm_state_lock); @@ -1155,15 +1191,16 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, } EXPORT_SYMBOL(xfrm_input_state_lookup); -static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, +static struct xfrm_state *__xfrm_state_lookup_byaddr(const struct xfrm_hash_state_ptrs *state_ptrs, + u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family) { - unsigned int h = xfrm_src_hash(net, daddr, saddr, family); + unsigned int h = __xfrm_src_hash(daddr, saddr, family, state_ptrs->hmask); struct xfrm_state *x; - hlist_for_each_entry_rcu(x, net->xfrm.state_bysrc + h, bysrc) { + hlist_for_each_entry_rcu(x, state_ptrs->bysrc + h, bysrc) { if (x->props.family != family || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family) || @@ -1183,14 +1220,17 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, static inline struct xfrm_state * __xfrm_state_locate(struct xfrm_state *x, int use_spi, int family) { + struct xfrm_hash_state_ptrs state_ptrs; struct net *net = xs_net(x); u32 mark = x->mark.v & x->mark.m; + xfrm_hash_ptrs_get(net, &state_ptrs); + if (use_spi) - return __xfrm_state_lookup(net, mark, &x->id.daddr, + return __xfrm_state_lookup(&state_ptrs, mark, &x->id.daddr, x->id.spi, x->id.proto, family); else - return __xfrm_state_lookup_byaddr(net, mark, + return __xfrm_state_lookup_byaddr(&state_ptrs, mark, &x->id.daddr, &x->props.saddr, x->id.proto, family); @@ -1264,6 +1304,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, u32 if_id) { static xfrm_address_t saddr_wildcard = { }; + struct xfrm_hash_state_ptrs state_ptrs; struct net *net = xp_net(pol); unsigned int h, h_wildcard; struct xfrm_state *x, *x0, *to_put; @@ -1328,8 +1369,10 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, else if (acquire_in_progress) /* XXX: acquire_in_progress should not happen */ WARN_ON(1); - h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); - hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { + xfrm_hash_ptrs_get(net, &state_ptrs); + + h = __xfrm_dst_hash(daddr, saddr, tmpl->reqid, encap_family, state_ptrs.hmask); + hlist_for_each_entry_rcu(x, state_ptrs.bydst + h, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) @@ -1362,8 +1405,9 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, if (best || acquire_in_progress) goto found; - h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); - hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) { + h_wildcard = __xfrm_dst_hash(daddr, &saddr_wildcard, tmpl->reqid, + encap_family, state_ptrs.hmask); + hlist_for_each_entry_rcu(x, state_ptrs.bydst + h_wildcard, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) @@ -1401,7 +1445,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && - (x0 = __xfrm_state_lookup_all(net, mark, daddr, + (x0 = __xfrm_state_lookup_all(&state_ptrs, mark, daddr, tmpl->id.spi, tmpl->id.proto, encap_family, &pol->xdo)) != NULL) { @@ -2180,10 +2224,13 @@ struct xfrm_state * xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { + struct xfrm_hash_state_ptrs state_ptrs; struct xfrm_state *x; rcu_read_lock(); - x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); + xfrm_hash_ptrs_get(net, &state_ptrs); + + x = __xfrm_state_lookup(&state_ptrs, mark, daddr, spi, proto, family); rcu_read_unlock(); return x; } @@ -2194,10 +2241,14 @@ xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family) { + struct xfrm_hash_state_ptrs state_ptrs; struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); - x = __xfrm_state_lookup_byaddr(net, mark, daddr, saddr, proto, family); + + xfrm_hash_ptrs_get(net, &state_ptrs); + + x = __xfrm_state_lookup_byaddr(&state_ptrs, mark, daddr, saddr, proto, family); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } From 600258d555f0710b9c47fb78d2d80a4aecd608cc Mon Sep 17 00:00:00 2001 From: Alexandre Cassen Date: Thu, 2 Jan 2025 12:11:11 +0200 Subject: [PATCH 03/88] xfrm: delete intermediate secpath entry in packet offload mode Packets handled by hardware have added secpath as a way to inform XFRM core code that this path was already handled. That secpath is not needed at all after policy is checked and it is removed later in the stack. However, in the case of IP forwarding is enabled (/proc/sys/net/ipv4/ip_forward), that secpath is not removed and packets which already were handled are reentered to the driver TX path with xfrm_offload set. The following kernel panic is observed in mlx5 in such case: mlx5_core 0000:04:00.0 enp4s0f0np0: Link up mlx5_core 0000:04:00.1 enp4s0f1np1: Link up Initializing XFRM netlink socket IPsec XFRM device driver BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor instruction fetch in kernel mode #PF: error_code(0x0010) - not-present page PGD 0 P4D 0 Oops: Oops: 0010 [#1] PREEMPT SMP CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.13.0-rc1-alex #3 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014 RIP: 0010:0x0 Code: Unable to access opcode bytes at 0xffffffffffffffd6. RSP: 0018:ffffb87380003800 EFLAGS: 00010206 RAX: ffff8df004e02600 RBX: ffffb873800038d8 RCX: 00000000ffff98cf RDX: ffff8df00733e108 RSI: ffff8df00521fb80 RDI: ffff8df001661f00 RBP: ffffb87380003850 R08: ffff8df013980000 R09: 0000000000000010 R10: 0000000000000002 R11: 0000000000000002 R12: ffff8df001661f00 R13: ffff8df00521fb80 R14: ffff8df00733e108 R15: ffff8df011faf04e FS: 0000000000000000(0000) GS:ffff8df46b800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 0000000106384000 CR4: 0000000000350ef0 Call Trace: ? show_regs+0x63/0x70 ? __die_body+0x20/0x60 ? __die+0x2b/0x40 ? page_fault_oops+0x15c/0x550 ? do_user_addr_fault+0x3ed/0x870 ? exc_page_fault+0x7f/0x190 ? asm_exc_page_fault+0x27/0x30 mlx5e_ipsec_handle_tx_skb+0xe7/0x2f0 [mlx5_core] mlx5e_xmit+0x58e/0x1980 [mlx5_core] ? __fib_lookup+0x6a/0xb0 dev_hard_start_xmit+0x82/0x1d0 sch_direct_xmit+0xfe/0x390 __dev_queue_xmit+0x6d8/0xee0 ? __fib_lookup+0x6a/0xb0 ? internal_add_timer+0x48/0x70 ? mod_timer+0xe2/0x2b0 neigh_resolve_output+0x115/0x1b0 __neigh_update+0x26a/0xc50 neigh_update+0x14/0x20 arp_process+0x2cb/0x8e0 ? __napi_build_skb+0x5e/0x70 arp_rcv+0x11e/0x1c0 ? dev_gro_receive+0x574/0x820 __netif_receive_skb_list_core+0x1cf/0x1f0 netif_receive_skb_list_internal+0x183/0x2a0 napi_complete_done+0x76/0x1c0 mlx5e_napi_poll+0x234/0x7a0 [mlx5_core] __napi_poll+0x2d/0x1f0 net_rx_action+0x1a6/0x370 ? atomic_notifier_call_chain+0x3b/0x50 ? irq_int_handler+0x15/0x20 [mlx5_core] handle_softirqs+0xb9/0x2f0 ? handle_irq_event+0x44/0x60 irq_exit_rcu+0xdb/0x100 common_interrupt+0x98/0xc0 asm_common_interrupt+0x27/0x40 RIP: 0010:pv_native_safe_halt+0xb/0x10 Code: 09 c3 66 66 2e 0f 1f 84 00 00 00 00 00 66 90 0f 22 0f 1f 84 00 00 00 00 00 90 eb 07 0f 00 2d 7f e9 36 00 fb 40 00 83 ff 07 77 21 89 ff ff 24 fd 88 3d a1 bd 0f 21 f8 RSP: 0018:ffffffffbe603de8 EFLAGS: 00000202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000f92f46680 RDX: 0000000000000037 RSI: 00000000ffffffff RDI: 00000000000518d4 RBP: ffffffffbe603df0 R08: 000000cd42e4dffb R09: ffffffffbe603d70 R10: 0000004d80d62680 R11: 0000000000000001 R12: ffffffffbe60bf40 R13: 0000000000000000 R14: 0000000000000000 R15: ffffffffbe60aff8 ? default_idle+0x9/0x20 arch_cpu_idle+0x9/0x10 default_idle_call+0x29/0xf0 do_idle+0x1f2/0x240 cpu_startup_entry+0x2c/0x30 rest_init+0xe7/0x100 start_kernel+0x76b/0xb90 x86_64_start_reservations+0x18/0x30 x86_64_start_kernel+0xc0/0x110 ? setup_ghcb+0xe/0x130 common_startup_64+0x13e/0x141 Modules linked in: esp4_offload esp4 xfrm_interface xfrm6_tunnel tunnel4 tunnel6 xfrm_user xfrm_algo binfmt_misc intel_rapl_msr intel_rapl_common kvm_amd ccp kvm input_leds serio_raw qemu_fw_cfg sch_fq_codel dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua efi_pstore ip_tables x_tables autofs4 raid10 raid456 async_raid6_recov async_memcpy async_pq raid6_pq async_xor xor async_tx libcrc32c raid1 raid0 mlx5_core crct10dif_pclmul crc32_pclmul polyval_clmulni polyval_generic ghash_clmulni_intel sha256_ssse3 sha1_ssse3 ahci mlxfw i2c_i801 libahci i2c_mux i2c_smbus psample virtio_rng pci_hyperv_intf aesni_intel crypto_simd cryptd CR2: 0000000000000000 ---[ end trace 0000000000000000 ]--- RIP: 0010:0x0 Code: Unable to access opcode bytes at 0xffffffffffffffd6. RSP: 0018:ffffb87380003800 EFLAGS: 00010206 RAX: ffff8df004e02600 RBX: ffffb873800038d8 RCX: 00000000ffff98cf RDX: ffff8df00733e108 RSI: ffff8df00521fb80 RDI: ffff8df001661f00 RBP: ffffb87380003850 R08: ffff8df013980000 R09: 0000000000000010 R10: 0000000000000002 R11: 0000000000000002 R12: ffff8df001661f00 R13: ffff8df00521fb80 R14: ffff8df00733e108 R15: ffff8df011faf04e FS: 0000000000000000(0000) GS:ffff8df46b800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 0000000106384000 CR4: 0000000000350ef0 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: 0x3b800000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fixes: 5958372ddf62 ("xfrm: add RX datapath protection for IPsec packet offload mode") Signed-off-by: Alexandre Cassen Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 32c09e85a64ce..2c4eda6a85966 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1224,9 +1224,19 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, if (xo) { x = xfrm_input_state(skb); - if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) - return (xo->flags & CRYPTO_DONE) && - (xo->status & CRYPTO_SUCCESS); + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) { + bool check = (xo->flags & CRYPTO_DONE) && + (xo->status & CRYPTO_SUCCESS); + + /* The packets here are plain ones and secpath was + * needed to indicate that hardware already handled + * them and there is no need to do nothing in addition. + * + * Consume secpath which was set by drivers. + */ + secpath_reset(skb); + return check; + } } return __xfrm_check_nopolicy(net, skb, dir) || From 1620c88887b16940e00dbe57dd38c74eda9bad9e Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Thu, 16 Jan 2025 11:46:03 +0100 Subject: [PATCH 04/88] xfrm: Fix the usage of skb->sk xfrm assumed to always have a full socket at skb->sk. This is not always true, so fix it by converting to a full socket before it is used. Signed-off-by: Steffen Klassert Reviewed-by: Eric Dumazet --- net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- net/ipv6/xfrm6_output.c | 4 ++-- net/xfrm/xfrm_interface_core.c | 2 +- net/xfrm/xfrm_output.c | 7 ++++--- net/xfrm/xfrm_policy.c | 2 +- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index f3281312eb5eb..8cf5f6634775f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -279,7 +279,7 @@ static void esp_output_done(void *data, int err) x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) esp_output_tail_tcp(x, skb); else - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index b2400c226a325..fad4d7c9fa502 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -315,7 +315,7 @@ static void esp_output_done(void *data, int err) x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) esp_output_tail_tcp(x, skb); else - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } } diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 5f7b1fdbffe62..b3d5d1f266eee 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -82,14 +82,14 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) toobig = skb->len > mtu && !skb_is_gso(skb); - if (toobig && xfrm6_local_dontfrag(skb->sk)) { + if (toobig && xfrm6_local_dontfrag(sk)) { xfrm6_local_rxpmtu(skb, mtu); kfree_skb(skb); return -EMSGSIZE; } else if (toobig && xfrm6_noneed_fragment(skb)) { skb->ignore_df = 1; goto skip_frag; - } else if (!skb->ignore_df && toobig && skb->sk) { + } else if (!skb->ignore_df && toobig && sk) { xfrm_local_error(skb, mtu); kfree_skb(skb); return -EMSGSIZE; diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 98f1e2b67c76b..c397eb99d8679 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -506,7 +506,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) skb_dst_set(skb, dst); skb->dev = tdev; - err = dst_output(xi->net, skb->sk, skb); + err = dst_output(xi->net, skb_to_full_sk(skb), skb); if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index e5722c95b8bb3..1fb4dc0a76e13 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -796,7 +796,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) { skb->protocol = htons(ETH_P_IP); - if (skb->sk) + if (skb->sk && sk_fullsock(skb->sk)) xfrm_local_error(skb, mtu); else icmp_send(skb, ICMP_DEST_UNREACH, @@ -832,6 +832,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; struct dst_entry *dst = skb_dst(skb); + struct sock *sk = skb_to_full_sk(skb); if (skb->ignore_df) goto out; @@ -846,9 +847,9 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) skb->dev = dst->dev; skb->protocol = htons(ETH_P_IPV6); - if (xfrm6_local_dontfrag(skb->sk)) + if (xfrm6_local_dontfrag(sk)) ipv6_stub->xfrm6_local_rxpmtu(skb, mtu); - else if (skb->sk) + else if (sk) xfrm_local_error(skb, mtu); else icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4408c11c0835f..c27da1fd070ec 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2959,7 +2959,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) skb_dst_drop(skb); skb_dst_set(skb, dst); - dst_output(net, skb->sk, skb); + dst_output(net, skb_to_full_sk(skb), skb); } out: From d62b04fca4340a0d468d7853bd66e511935a18cb Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 11 Jan 2025 09:57:39 -0500 Subject: [PATCH 05/88] net: sched: fix ets qdisc OOB Indexing Haowei Yan found that ets_class_from_arg() can index an Out-Of-Bound class in ets_class_from_arg() when passed clid of 0. The overflow may cause local privilege escalation. [ 18.852298] ------------[ cut here ]------------ [ 18.853271] UBSAN: array-index-out-of-bounds in net/sched/sch_ets.c:93:20 [ 18.853743] index 18446744073709551615 is out of range for type 'ets_class [16]' [ 18.854254] CPU: 0 UID: 0 PID: 1275 Comm: poc Not tainted 6.12.6-dirty #17 [ 18.854821] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 18.856532] Call Trace: [ 18.857441] [ 18.858227] dump_stack_lvl+0xc2/0xf0 [ 18.859607] dump_stack+0x10/0x20 [ 18.860908] __ubsan_handle_out_of_bounds+0xa7/0xf0 [ 18.864022] ets_class_change+0x3d6/0x3f0 [ 18.864322] tc_ctl_tclass+0x251/0x910 [ 18.864587] ? lock_acquire+0x5e/0x140 [ 18.865113] ? __mutex_lock+0x9c/0xe70 [ 18.866009] ? __mutex_lock+0xa34/0xe70 [ 18.866401] rtnetlink_rcv_msg+0x170/0x6f0 [ 18.866806] ? __lock_acquire+0x578/0xc10 [ 18.867184] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 18.867503] netlink_rcv_skb+0x59/0x110 [ 18.867776] rtnetlink_rcv+0x15/0x30 [ 18.868159] netlink_unicast+0x1c3/0x2b0 [ 18.868440] netlink_sendmsg+0x239/0x4b0 [ 18.868721] ____sys_sendmsg+0x3e2/0x410 [ 18.869012] ___sys_sendmsg+0x88/0xe0 [ 18.869276] ? rseq_ip_fixup+0x198/0x260 [ 18.869563] ? rseq_update_cpu_node_id+0x10a/0x190 [ 18.869900] ? trace_hardirqs_off+0x5a/0xd0 [ 18.870196] ? syscall_exit_to_user_mode+0xcc/0x220 [ 18.870547] ? do_syscall_64+0x93/0x150 [ 18.870821] ? __memcg_slab_free_hook+0x69/0x290 [ 18.871157] __sys_sendmsg+0x69/0xd0 [ 18.871416] __x64_sys_sendmsg+0x1d/0x30 [ 18.871699] x64_sys_call+0x9e2/0x2670 [ 18.871979] do_syscall_64+0x87/0x150 [ 18.873280] ? do_syscall_64+0x93/0x150 [ 18.874742] ? lock_release+0x7b/0x160 [ 18.876157] ? do_user_addr_fault+0x5ce/0x8f0 [ 18.877833] ? irqentry_exit_to_user_mode+0xc2/0x210 [ 18.879608] ? irqentry_exit+0x77/0xb0 [ 18.879808] ? clear_bhb_loop+0x15/0x70 [ 18.880023] ? clear_bhb_loop+0x15/0x70 [ 18.880223] ? clear_bhb_loop+0x15/0x70 [ 18.880426] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 18.880683] RIP: 0033:0x44a957 [ 18.880851] Code: ff ff e8 fc 00 00 00 66 2e 0f 1f 84 00 00 00 00 00 66 90 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 8974 24 10 [ 18.881766] RSP: 002b:00007ffcdd00fad8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 18.882149] RAX: ffffffffffffffda RBX: 00007ffcdd010db8 RCX: 000000000044a957 [ 18.882507] RDX: 0000000000000000 RSI: 00007ffcdd00fb70 RDI: 0000000000000003 [ 18.885037] RBP: 00007ffcdd010bc0 R08: 000000000703c770 R09: 000000000703c7c0 [ 18.887203] R10: 0000000000000080 R11: 0000000000000246 R12: 0000000000000001 [ 18.888026] R13: 00007ffcdd010da8 R14: 00000000004ca7d0 R15: 0000000000000001 [ 18.888395] [ 18.888610] ---[ end trace ]--- Fixes: dcc68b4d8084 ("net: sch_ets: Add a new Qdisc") Reported-by: Haowei Yan Suggested-by: Haowei Yan Signed-off-by: Jamal Hadi Salim Reviewed-by: Eric Dumazet Reviewed-by: Petr Machata Link: https://patch.msgid.link/20250111145740.74755-1-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_ets.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index f80bc05d4c5a5..516038a441638 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -91,6 +91,8 @@ ets_class_from_arg(struct Qdisc *sch, unsigned long arg) { struct ets_sched *q = qdisc_priv(sch); + if (arg == 0 || arg > q->nbands) + return NULL; return &q->classes[arg - 1]; } From 110b43ef05342d5a11284cc8b21582b698b4ef1c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 17 Jan 2025 12:38:41 +0300 Subject: [PATCH 06/88] NFC: nci: Add bounds checking in nci_hci_create_pipe() The "pipe" variable is a u8 which comes from the network. If it's more than 127, then it results in memory corruption in the caller, nci_hci_connect_gate(). Cc: stable@vger.kernel.org Fixes: a1b0b9415817 ("NFC: nci: Create pipe on specific gate in nci_hci_connect_gate") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/bcf5453b-7204-4297-9c20-4d8c7dacf586@stanley.mountain Signed-off-by: Jakub Kicinski --- net/nfc/nci/hci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index de175318a3a0f..082ab66f120b7 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -542,6 +542,8 @@ static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, pr_debug("pipe created=%d\n", pipe); + if (pipe >= NCI_HCI_MAX_PIPES) + pipe = NCI_HCI_INVALID_PIPE; return pipe; } From d31a49d37cb132b31cc6683eef2122f8609d6229 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Mon, 20 Jan 2025 16:41:40 +0100 Subject: [PATCH 07/88] net: airoha: Fix wrong GDM4 register definition Fix wrong GDM4 register definition, in Airoha SDK GDM4 is defined at offset 0x2400 but this doesn't make sense as it does conflict with the CDM4 that is in the same location. Following the pattern where each GDM base is at the FWD_CFG, currently GDM4 base offset is set to 0x2500. This is correct but REG_GDM4_FWD_CFG and REG_GDM4_SRC_PORT_SET are still using the SDK reference with the 0x2400 offset. Fix these 2 define by subtracting 0x100 to each register to reflect the real address location. Fixes: 23020f049327 ("net: airoha: Introduce ethernet support for EN7581 SoC") Signed-off-by: Christian Marangi Acked-by: Lorenzo Bianconi Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250120154148.13424-1-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/airoha_eth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mediatek/airoha_eth.c b/drivers/net/ethernet/mediatek/airoha_eth.c index 415d784de741c..09f448f291240 100644 --- a/drivers/net/ethernet/mediatek/airoha_eth.c +++ b/drivers/net/ethernet/mediatek/airoha_eth.c @@ -266,11 +266,11 @@ #define REG_GDM3_FWD_CFG GDM3_BASE #define GDM3_PAD_EN_MASK BIT(28) -#define REG_GDM4_FWD_CFG (GDM4_BASE + 0x100) +#define REG_GDM4_FWD_CFG GDM4_BASE #define GDM4_PAD_EN_MASK BIT(28) #define GDM4_SPORT_OFFSET0_MASK GENMASK(11, 8) -#define REG_GDM4_SRC_PORT_SET (GDM4_BASE + 0x33c) +#define REG_GDM4_SRC_PORT_SET (GDM4_BASE + 0x23c) #define GDM4_SPORT_OFF2_MASK GENMASK(19, 16) #define GDM4_SPORT_OFF1_MASK GENMASK(15, 12) #define GDM4_SPORT_OFF0_MASK GENMASK(11, 8) From 6bb194d036c6e1b329dcdff459338cdd9a54802a Mon Sep 17 00:00:00 2001 From: Paul Fertser Date: Thu, 16 Jan 2025 18:29:00 +0300 Subject: [PATCH 08/88] net/ncsi: wait for the last response to Deselect Package before configuring channel The NCSI state machine as it's currently implemented assumes that transition to the next logical state is performed either explicitly by calling `schedule_work(&ndp->work)` to re-queue itself or implicitly after processing the predefined (ndp->pending_req_num) number of replies. Thus to avoid the configuration FSM from advancing prematurely and getting out of sync with the process it's essential to not skip waiting for a reply. This patch makes the code wait for reception of the Deselect Package response for the last package probed before proceeding to channel configuration. Thanks go to Potin Lai and Cosmo Chou for the initial investigation and testing. Fixes: 8e13f70be05e ("net/ncsi: Probe single packages to avoid conflict") Cc: stable@vger.kernel.org Signed-off-by: Paul Fertser Link: https://patch.msgid.link/20250116152900.8656-1-fercerpav@gmail.com Signed-off-by: Jakub Kicinski --- net/ncsi/ncsi-manage.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index bf276eaf93307..7891a537bddd1 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1385,6 +1385,12 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_package; break; case ncsi_dev_state_probe_package: + if (ndp->package_probe_id >= 8) { + /* Last package probed, finishing */ + ndp->flags |= NCSI_DEV_PROBED; + break; + } + ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_SP; @@ -1501,13 +1507,8 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) if (ret) goto error; - /* Probe next package */ + /* Probe next package after receiving response */ ndp->package_probe_id++; - if (ndp->package_probe_id >= 8) { - /* Probe finished */ - ndp->flags |= NCSI_DEV_PROBED; - break; - } nd->state = ncsi_dev_state_probe_package; ndp->active_package = NULL; break; From ba1af257a0575bfa86e69eaa85bddcc6cf346df3 Mon Sep 17 00:00:00 2001 From: Yijie Yang Date: Mon, 20 Jan 2025 15:08:28 +0800 Subject: [PATCH 09/88] dt-bindings: net: qcom,ethqos: Correct fallback compatible for qcom,qcs615-ethqos The qcs615-ride utilizes the same EMAC as the qcs404, rather than the sm8150. The current incorrect fallback could result in packet loss. The Ethernet on qcs615-ride is currently not utilized by anyone. Therefore, there is no need to worry about any ABI impact. Fixes: 32535b9410b8 ("dt-bindings: net: qcom,ethqos: add description for qcs615") Signed-off-by: Yijie Yang Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250120-schema_qcs615-v4-1-d9d122f89e64@quicinc.com Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/qcom,ethqos.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/net/qcom,ethqos.yaml b/Documentation/devicetree/bindings/net/qcom,ethqos.yaml index 0bcd593a7bd09..1cd028495e6c6 100644 --- a/Documentation/devicetree/bindings/net/qcom,ethqos.yaml +++ b/Documentation/devicetree/bindings/net/qcom,ethqos.yaml @@ -21,12 +21,12 @@ properties: oneOf: - items: - enum: - - qcom,qcs8300-ethqos - - const: qcom,sa8775p-ethqos + - qcom,qcs615-ethqos + - const: qcom,qcs404-ethqos - items: - enum: - - qcom,qcs615-ethqos - - const: qcom,sm8150-ethqos + - qcom,qcs8300-ethqos + - const: qcom,sa8775p-ethqos - enum: - qcom,qcs404-ethqos - qcom,sa8775p-ethqos From 92e5995773774a3e70257e9c95ea03518268bea5 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Sat, 18 Jan 2025 17:47:41 +0800 Subject: [PATCH 10/88] net: hns3: fix oops when unload drivers paralleling When unload hclge driver, it tries to disable sriov first for each ae_dev node from hnae3_ae_dev_list. If user unloads hns3 driver at the time, because it removes all the ae_dev nodes, and it may cause oops. But we can't simply use hnae3_common_lock for this. Because in the process flow of pci_disable_sriov(), it will trigger the remove flow of VF, which will also take hnae3_common_lock. To fixes it, introduce a new mutex to protect the unload process. Fixes: 0dd8a25f355b ("net: hns3: disable sriov before unload hclge layer") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Link: https://patch.msgid.link/20250118094741.3046663-1-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hnae3.c | 15 +++++++++++++++ drivers/net/ethernet/hisilicon/hns3/hnae3.h | 2 ++ drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 ++ .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 ++ .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 2 ++ 5 files changed, 23 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c index 9a63fbc694083..b25fb400f4767 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c @@ -40,6 +40,21 @@ EXPORT_SYMBOL(hnae3_unregister_ae_algo_prepare); */ static DEFINE_MUTEX(hnae3_common_lock); +/* ensure the drivers being unloaded one by one */ +static DEFINE_MUTEX(hnae3_unload_lock); + +void hnae3_acquire_unload_lock(void) +{ + mutex_lock(&hnae3_unload_lock); +} +EXPORT_SYMBOL(hnae3_acquire_unload_lock); + +void hnae3_release_unload_lock(void) +{ + mutex_unlock(&hnae3_unload_lock); +} +EXPORT_SYMBOL(hnae3_release_unload_lock); + static bool hnae3_client_match(enum hnae3_client_type client_type) { if (client_type == HNAE3_CLIENT_KNIC || diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 12ba380eb7019..4e44f28288f90 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -963,4 +963,6 @@ int hnae3_register_client(struct hnae3_client *client); void hnae3_set_client_init_flag(struct hnae3_client *client, struct hnae3_ae_dev *ae_dev, unsigned int inited); +void hnae3_acquire_unload_lock(void); +void hnae3_release_unload_lock(void); #endif diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index a7e3b22f641c8..9ff797fb36c45 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -6002,9 +6002,11 @@ module_init(hns3_init_module); */ static void __exit hns3_exit_module(void) { + hnae3_acquire_unload_lock(); pci_unregister_driver(&hns3_driver); hnae3_unregister_client(&client); hns3_dbg_unregister_debugfs(); + hnae3_release_unload_lock(); } module_exit(hns3_exit_module); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index db78450092526..3f17b3073e50f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -12919,9 +12919,11 @@ static int __init hclge_init(void) static void __exit hclge_exit(void) { + hnae3_acquire_unload_lock(); hnae3_unregister_ae_algo_prepare(&ae_algo); hnae3_unregister_ae_algo(&ae_algo); destroy_workqueue(hclge_wq); + hnae3_release_unload_lock(); } module_init(hclge_init); module_exit(hclge_exit); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 163c6e59ea4c1..9ba767740a043 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -3410,8 +3410,10 @@ static int __init hclgevf_init(void) static void __exit hclgevf_exit(void) { + hnae3_acquire_unload_lock(); hnae3_unregister_ae_algo(&ae_algovf); destroy_workqueue(hclgevf_wq); + hnae3_release_unload_lock(); } module_init(hclgevf_init); module_exit(hclgevf_exit); From a197004cf3c2e6c8cc0695c787a97e62e3229754 Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Sat, 18 Jan 2025 19:43:43 +0100 Subject: [PATCH 11/88] net: phy: marvell-88q2xxx: Fix temperature measurement with reset-gpios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using temperature measurement on Marvell 88Q2XXX devices and the reset-gpios property is set in DT, the device does a hardware reset when interface is brought down and up again. That means that the content of the register MDIO_MMD_PCS_MV_TEMP_SENSOR2 is reset to default and that leads to permanent deactivation of the temperature measurement, because activation is done in mv88q2xxx_probe. To fix this move activation of temperature measurement to mv88q222x_config_init. Fixes: a557a92e6881 ("net: phy: marvell-88q2xxx: add support for temperature sensor") Reviewed-by: Niklas Söderlund Signed-off-by: Dimitri Fedrau Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250118-marvell-88q2xxx-fix-hwmon-v2-1-402e62ba2dcb@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/phy/marvell-88q2xxx.c | 33 ++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/drivers/net/phy/marvell-88q2xxx.c b/drivers/net/phy/marvell-88q2xxx.c index 4494b3e39ce2b..a3996471a1c9a 100644 --- a/drivers/net/phy/marvell-88q2xxx.c +++ b/drivers/net/phy/marvell-88q2xxx.c @@ -95,6 +95,10 @@ #define MDIO_MMD_PCS_MV_TDR_OFF_CUTOFF 65246 +struct mv88q2xxx_priv { + bool enable_temp; +}; + struct mmd_val { int devad; u32 regnum; @@ -710,17 +714,12 @@ static const struct hwmon_chip_info mv88q2xxx_hwmon_chip_info = { static int mv88q2xxx_hwmon_probe(struct phy_device *phydev) { + struct mv88q2xxx_priv *priv = phydev->priv; struct device *dev = &phydev->mdio.dev; struct device *hwmon; char *hwmon_name; - int ret; - - /* Enable temperature sense */ - ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, MDIO_MMD_PCS_MV_TEMP_SENSOR2, - MDIO_MMD_PCS_MV_TEMP_SENSOR2_DIS_MASK, 0); - if (ret < 0) - return ret; + priv->enable_temp = true; hwmon_name = devm_hwmon_sanitize_name(dev, dev_name(dev)); if (IS_ERR(hwmon_name)) return PTR_ERR(hwmon_name); @@ -743,6 +742,14 @@ static int mv88q2xxx_hwmon_probe(struct phy_device *phydev) static int mv88q2xxx_probe(struct phy_device *phydev) { + struct mv88q2xxx_priv *priv; + + priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + phydev->priv = priv; + return mv88q2xxx_hwmon_probe(phydev); } @@ -810,6 +817,18 @@ static int mv88q222x_revb1_revb2_config_init(struct phy_device *phydev) static int mv88q222x_config_init(struct phy_device *phydev) { + struct mv88q2xxx_priv *priv = phydev->priv; + int ret; + + /* Enable temperature sense */ + if (priv->enable_temp) { + ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, + MDIO_MMD_PCS_MV_TEMP_SENSOR2, + MDIO_MMD_PCS_MV_TEMP_SENSOR2_DIS_MASK, 0); + if (ret < 0) + return ret; + } + if (phydev->c45_ids.device_ids[MDIO_MMD_PMAPMD] == PHY_ID_88Q2220_REVB0) return mv88q222x_revb0_config_init(phydev); else From 61dc1fd9205bc9d9918aa933a847b08e80b4dc20 Mon Sep 17 00:00:00 2001 From: Dheeraj Reddy Jonnalagadda Date: Mon, 20 Jan 2025 14:24:30 +0530 Subject: [PATCH 12/88] net: fec: implement TSO descriptor cleanup Implement cleanup of descriptors in the TSO error path of fec_enet_txq_submit_tso(). The cleanup - Unmaps DMA buffers for data descriptors skipping TSO header - Clears all buffer descriptors - Handles extended descriptors by clearing cbd_esc when enabled Fixes: 79f339125ea3 ("net: fec: Add software TSO support") Signed-off-by: Dheeraj Reddy Jonnalagadda Reviewed-by: Wei Fang Link: https://patch.msgid.link/20250120085430.99318-1-dheeraj.linuxdev@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/freescale/fec_main.c | 31 ++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 68725506a095d..f7c4ce8e9a265 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -840,6 +840,8 @@ static int fec_enet_txq_submit_tso(struct fec_enet_priv_tx_q *txq, struct fec_enet_private *fep = netdev_priv(ndev); int hdr_len, total_len, data_left; struct bufdesc *bdp = txq->bd.cur; + struct bufdesc *tmp_bdp; + struct bufdesc_ex *ebdp; struct tso_t tso; unsigned int index = 0; int ret; @@ -913,7 +915,34 @@ static int fec_enet_txq_submit_tso(struct fec_enet_priv_tx_q *txq, return 0; err_release: - /* TODO: Release all used data descriptors for TSO */ + /* Release all used data descriptors for TSO */ + tmp_bdp = txq->bd.cur; + + while (tmp_bdp != bdp) { + /* Unmap data buffers */ + if (tmp_bdp->cbd_bufaddr && + !IS_TSO_HEADER(txq, fec32_to_cpu(tmp_bdp->cbd_bufaddr))) + dma_unmap_single(&fep->pdev->dev, + fec32_to_cpu(tmp_bdp->cbd_bufaddr), + fec16_to_cpu(tmp_bdp->cbd_datlen), + DMA_TO_DEVICE); + + /* Clear standard buffer descriptor fields */ + tmp_bdp->cbd_sc = 0; + tmp_bdp->cbd_datlen = 0; + tmp_bdp->cbd_bufaddr = 0; + + /* Handle extended descriptor if enabled */ + if (fep->bufdesc_ex) { + ebdp = (struct bufdesc_ex *)tmp_bdp; + ebdp->cbd_esc = 0; + } + + tmp_bdp = fec_enet_get_nextdesc(tmp_bdp, &txq->bd); + } + + dev_kfree_skb_any(skb); + return ret; } From 59e00e8ca24220acea2d2d9f540fccf64e0f41ea Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 20 Jan 2025 16:50:02 -0800 Subject: [PATCH 13/88] net: mvneta: fix locking in mvneta_cpu_online() When port is stopped, unlock before returning Fixes: 413f0271f396 ("net: protect NAPI enablement with netdev_lock()") Signed-off-by: Harshit Mogalapalli Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250121005002.3938236-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvneta.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 82f4333fb426c..4fe121b9f94b6 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -4432,6 +4432,7 @@ static int mvneta_cpu_online(unsigned int cpu, struct hlist_node *node) */ if (pp->is_stopped) { spin_unlock(&pp->lock); + netdev_unlock(port->napi.dev); return 0; } netif_tx_stop_all_queues(pp->dev); From 965adae5a33a998e2b62ec1b37c5eb21f1875b15 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 21 Jan 2025 06:34:23 -0800 Subject: [PATCH 14/88] selftests/net: packetdrill: more xfail changes (and a correction) Recent change to add more cases to XFAIL has a broken regex, the matching needs a real regex not a glob pattern. While at it add the cases Willem pointed out during review. Fixes: 3030e3d57ba8 ("selftests/net: packetdrill: make tcp buf limited timing tests benign") Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250121143423.215261-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/packetdrill/ksft_runner.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh index e15c43b7359b2..ef8b25a606d81 100755 --- a/tools/testing/selftests/net/packetdrill/ksft_runner.sh +++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh @@ -39,11 +39,13 @@ if [[ -n "${KSFT_MACHINE_SLOW}" ]]; then # xfail tests that are known flaky with dbg config, not fixable. # still run them for coverage (and expect 100% pass without dbg). declare -ar xfail_list=( + "tcp_eor_no-coalesce-retrans.pkt" "tcp_fast_recovery_prr-ss.*.pkt" + "tcp_slow_start_slow-start-after-win-update.pkt" "tcp_timestamping.*.pkt" "tcp_user_timeout_user-timeout-probe.pkt" "tcp_zerocopy_epoll_.*.pkt" - "tcp_tcp_info_tcp-info-*-limited.pkt" + "tcp_tcp_info_tcp-info-.*-limited.pkt" ) readonly xfail_regex="^($(printf '%s|' "${xfail_list[@]}"))$" [[ "$script" =~ ${xfail_regex} ]] && failfunc=ktap_test_xfail From 15a901361ec3fb1c393f91880e1cbf24ec0a88bd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Jan 2025 18:12:41 +0000 Subject: [PATCH 15/88] ipmr: do not call mr_mfc_uses_dev() for unres entries syzbot found that calling mr_mfc_uses_dev() for unres entries would crash [1], because c->mfc_un.res.minvif / c->mfc_un.res.maxvif alias to "struct sk_buff_head unresolved", which contain two pointers. This code never worked, lets remove it. [1] Unable to handle kernel paging request at virtual address ffff5fff2d536613 KASAN: maybe wild-memory-access in range [0xfffefff96a9b3098-0xfffefff96a9b309f] Modules linked in: CPU: 1 UID: 0 PID: 7321 Comm: syz.0.16 Not tainted 6.13.0-rc7-syzkaller-g1950a0af2d55 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : mr_mfc_uses_dev net/ipv4/ipmr_base.c:290 [inline] pc : mr_table_dump+0x5a4/0x8b0 net/ipv4/ipmr_base.c:334 lr : mr_mfc_uses_dev net/ipv4/ipmr_base.c:289 [inline] lr : mr_table_dump+0x694/0x8b0 net/ipv4/ipmr_base.c:334 Call trace: mr_mfc_uses_dev net/ipv4/ipmr_base.c:290 [inline] (P) mr_table_dump+0x5a4/0x8b0 net/ipv4/ipmr_base.c:334 (P) mr_rtm_dumproute+0x254/0x454 net/ipv4/ipmr_base.c:382 ipmr_rtm_dumproute+0x248/0x4b4 net/ipv4/ipmr.c:2648 rtnl_dump_all+0x2e4/0x4e8 net/core/rtnetlink.c:4327 rtnl_dumpit+0x98/0x1d0 net/core/rtnetlink.c:6791 netlink_dump+0x4f0/0xbc0 net/netlink/af_netlink.c:2317 netlink_recvmsg+0x56c/0xe64 net/netlink/af_netlink.c:1973 sock_recvmsg_nosec net/socket.c:1033 [inline] sock_recvmsg net/socket.c:1055 [inline] sock_read_iter+0x2d8/0x40c net/socket.c:1125 new_sync_read fs/read_write.c:484 [inline] vfs_read+0x740/0x970 fs/read_write.c:565 ksys_read+0x15c/0x26c fs/read_write.c:708 Fixes: cb167893f41e ("net: Plumb support for filtering ipv4 and ipv6 multicast route dumps") Reported-by: syzbot+5cfae50c0e5f2c500013@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/678fe2d1.050a0220.15cac.00b3.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250121181241.841212-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr_base.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 03b6eee407a24..28d77d454d442 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -330,9 +330,6 @@ int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb, list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { if (e < s_e) goto next_entry2; - if (filter->dev && - !mr_mfc_uses_dev(mrt, mfc, filter->dev)) - goto next_entry2; err = fill(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags); From 6c9b7db96db62ee9ad8d359d90ff468d462518c4 Mon Sep 17 00:00:00 2001 From: Sebastian Sewior Date: Thu, 23 Jan 2025 17:20:45 +0100 Subject: [PATCH 16/88] xfrm: Don't disable preemption while looking up cache state. For the state cache lookup xfrm_input_state_lookup() first disables preemption, to remain on the CPU and then retrieves a per-CPU pointer. Within the preempt-disable section it also acquires netns_xfrm::xfrm_state_lock, a spinlock_t. This lock must not be acquired with explicit disabled preemption (such as by get_cpu()) because this lock becomes a sleeping lock on PREEMPT_RT. To remain on the same CPU is just an optimisation for the CPU local lookup. The actual modification of the per-CPU variable happens with netns_xfrm::xfrm_state_lock acquired. Remove get_cpu() and use the state_cache_input on the current CPU. Reported-by: Alexei Starovoitov Closes: https://lore.kernel.org/all/CAADnVQKkCLaj=roayH=Mjiiqz_svdf1tsC3OE4EC0E=mAD+L1A@mail.gmail.com/ Fixes: 81a331a0e72dd ("xfrm: Add an inbound percpu state cache.") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1781728ca4285..711e816fc4041 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1150,9 +1150,8 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, struct xfrm_hash_state_ptrs state_ptrs; struct hlist_head *state_cache_input; struct xfrm_state *x = NULL; - int cpu = get_cpu(); - state_cache_input = per_cpu_ptr(net->xfrm.state_cache_input, cpu); + state_cache_input = raw_cpu_ptr(net->xfrm.state_cache_input); rcu_read_lock(); hlist_for_each_entry_rcu(x, state_cache_input, state_cache_input) { @@ -1186,7 +1185,6 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, out: rcu_read_unlock(); - put_cpu(); return x; } EXPORT_SYMBOL(xfrm_input_state_lookup); From 396f0165672c6a74d7379027d344b83b5f05948c Mon Sep 17 00:00:00 2001 From: Emil Tantilov Date: Thu, 21 Nov 2024 20:40:59 -0800 Subject: [PATCH 17/88] idpf: add read memory barrier when checking descriptor done bit Add read memory barrier to ensure the order of operations when accessing control queue descriptors. Specifically, we want to avoid cases where loads can be reordered: 1. Load #1 is dispatched to read descriptor flags. 2. Load #2 is dispatched to read some other field from the descriptor. 3. Load #2 completes, accessing memory/cache at a point in time when the DD flag is zero. 4. NIC DMA overwrites the descriptor, now the DD flag is one. 5. Any fields loaded before step 4 are now inconsistent with the actual descriptor state. Add read memory barrier between steps 1 and 2, so that load #2 is not executed until load #1 has completed. Fixes: 8077c727561a ("idpf: add controlq init and reset checks") Reviewed-by: Przemek Kitszel Reviewed-by: Sridhar Samudrala Suggested-by: Lance Richardson Signed-off-by: Emil Tantilov Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_controlq.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/intel/idpf/idpf_controlq.c b/drivers/net/ethernet/intel/idpf/idpf_controlq.c index 4849590a5591f..b28991dd18703 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_controlq.c +++ b/drivers/net/ethernet/intel/idpf/idpf_controlq.c @@ -376,6 +376,9 @@ int idpf_ctlq_clean_sq(struct idpf_ctlq_info *cq, u16 *clean_count, if (!(le16_to_cpu(desc->flags) & IDPF_CTLQ_FLAG_DD)) break; + /* Ensure no other fields are read until DD flag is checked */ + dma_rmb(); + /* strip off FW internal code */ desc_err = le16_to_cpu(desc->ret_val) & 0xff; @@ -563,6 +566,9 @@ int idpf_ctlq_recv(struct idpf_ctlq_info *cq, u16 *num_q_msg, if (!(flags & IDPF_CTLQ_FLAG_DD)) break; + /* Ensure no other fields are read until DD flag is checked */ + dma_rmb(); + q_msg[i].vmvf_type = (flags & (IDPF_CTLQ_FLAG_FTYPE_VM | IDPF_CTLQ_FLAG_FTYPE_PF)) >> From 137da75ba72593598898a4e79da34f4b2da5d151 Mon Sep 17 00:00:00 2001 From: Emil Tantilov Date: Thu, 19 Dec 2024 18:09:32 -0800 Subject: [PATCH 18/88] idpf: fix transaction timeouts on reset Restore the call to idpf_vc_xn_shutdown() at the beginning of idpf_vc_core_deinit() provided the function is not called on remove. In the reset path the mailbox is destroyed, leading to all transactions timing out. Fixes: 09d0fb5cb30e ("idpf: deinit virtchnl transaction manager after vport and vectors") Reviewed-by: Larysa Zaremba Signed-off-by: Emil Tantilov Reviewed-by: Simon Horman Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_virtchnl.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c index d46c95f91b0d8..7639d520b8063 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c +++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c @@ -3077,12 +3077,21 @@ int idpf_vc_core_init(struct idpf_adapter *adapter) */ void idpf_vc_core_deinit(struct idpf_adapter *adapter) { + bool remove_in_prog; + if (!test_bit(IDPF_VC_CORE_INIT, adapter->flags)) return; + /* Avoid transaction timeouts when called during reset */ + remove_in_prog = test_bit(IDPF_REMOVE_IN_PROG, adapter->flags); + if (!remove_in_prog) + idpf_vc_xn_shutdown(adapter->vcxn_mngr); + idpf_deinit_task(adapter); idpf_intr_rel(adapter); - idpf_vc_xn_shutdown(adapter->vcxn_mngr); + + if (remove_in_prog) + idpf_vc_xn_shutdown(adapter->vcxn_mngr); cancel_delayed_work_sync(&adapter->serv_task); cancel_delayed_work_sync(&adapter->mbx_task); From d15fe4edd7decdf14d8ad2b78df100ea23302065 Mon Sep 17 00:00:00 2001 From: Manoj Vishwanathan Date: Mon, 16 Dec 2024 16:27:33 +0000 Subject: [PATCH 19/88] idpf: Acquire the lock before accessing the xn->salt The transaction salt was being accessed before acquiring the idpf_vc_xn_lock when idpf has to forward the virtchnl reply. Fixes: 34c21fa894a1 ("idpf: implement virtchnl transaction manager") Signed-off-by: Manoj Vishwanathan Signed-off-by: David Decotigny Signed-off-by: Brian Vazquez Reviewed-by: Jacob Keller Reviewed-by: Pavan Kumar Linga Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_virtchnl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c index 7639d520b8063..99bdb95bf2266 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c +++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c @@ -612,14 +612,15 @@ idpf_vc_xn_forward_reply(struct idpf_adapter *adapter, return -EINVAL; } xn = &adapter->vcxn_mngr->ring[xn_idx]; + idpf_vc_xn_lock(xn); salt = FIELD_GET(IDPF_VC_XN_SALT_M, msg_info); if (xn->salt != salt) { dev_err_ratelimited(&adapter->pdev->dev, "Transaction salt does not match (%02x != %02x)\n", xn->salt, salt); + idpf_vc_xn_unlock(xn); return -EINVAL; } - idpf_vc_xn_lock(xn); switch (xn->state) { case IDPF_VC_XN_WAITING: /* success */ From 9a5b021cb8186f1854bac2812bd4f396bb1e881c Mon Sep 17 00:00:00 2001 From: Marco Leogrande Date: Mon, 16 Dec 2024 16:27:34 +0000 Subject: [PATCH 20/88] idpf: convert workqueues to unbound When a workqueue is created with `WQ_UNBOUND`, its work items are served by special worker-pools, whose host workers are not bound to any specific CPU. In the default configuration (i.e. when `queue_delayed_work` and friends do not specify which CPU to run the work item on), `WQ_UNBOUND` allows the work item to be executed on any CPU in the same node of the CPU it was enqueued on. While this solution potentially sacrifices locality, it avoids contention with other processes that might dominate the CPU time of the processor the work item was scheduled on. This is not just a theoretical problem: in a particular scenario misconfigured process was hogging most of the time from CPU0, leaving less than 0.5% of its CPU time to the kworker. The IDPF workqueues that were using the kworker on CPU0 suffered large completion delays as a result, causing performance degradation, timeouts and eventual system crash. Tested: * I have also run a manual test to gauge the performance improvement. The test consists of an antagonist process (`./stress --cpu 2`) consuming as much of CPU 0 as possible. This process is run under `taskset 01` to bind it to CPU0, and its priority is changed with `chrt -pQ 9900 10000 ${pid}` and `renice -n -20 ${pid}` after start. Then, the IDPF driver is forced to prefer CPU0 by editing all calls to `queue_delayed_work`, `mod_delayed_work`, etc... to use CPU 0. Finally, `ktraces` for the workqueue events are collected. Without the current patch, the antagonist process can force arbitrary delays between `workqueue_queue_work` and `workqueue_execute_start`, that in my tests were as high as `30ms`. With the current patch applied, the workqueue can be migrated to another unloaded CPU in the same node, and, keeping everything else equal, the maximum delay I could see was `6us`. Fixes: 0fe45467a104 ("idpf: add create vport and netdev configuration") Signed-off-by: Marco Leogrande Signed-off-by: Manoj Vishwanathan Signed-off-by: Brian Vazquez Reviewed-by: Jacob Keller Reviewed-by: Pavan Kumar Linga Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_main.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_main.c b/drivers/net/ethernet/intel/idpf/idpf_main.c index f71d3182580b6..b6c515d14cbf0 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_main.c +++ b/drivers/net/ethernet/intel/idpf/idpf_main.c @@ -174,7 +174,8 @@ static int idpf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_master(pdev); pci_set_drvdata(pdev, adapter); - adapter->init_wq = alloc_workqueue("%s-%s-init", 0, 0, + adapter->init_wq = alloc_workqueue("%s-%s-init", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0, dev_driver_string(dev), dev_name(dev)); if (!adapter->init_wq) { @@ -183,7 +184,8 @@ static int idpf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_free; } - adapter->serv_wq = alloc_workqueue("%s-%s-service", 0, 0, + adapter->serv_wq = alloc_workqueue("%s-%s-service", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0, dev_driver_string(dev), dev_name(dev)); if (!adapter->serv_wq) { @@ -192,7 +194,8 @@ static int idpf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_serv_wq_alloc; } - adapter->mbx_wq = alloc_workqueue("%s-%s-mbx", 0, 0, + adapter->mbx_wq = alloc_workqueue("%s-%s-mbx", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0, dev_driver_string(dev), dev_name(dev)); if (!adapter->mbx_wq) { @@ -201,7 +204,8 @@ static int idpf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_mbx_wq_alloc; } - adapter->stats_wq = alloc_workqueue("%s-%s-stats", 0, 0, + adapter->stats_wq = alloc_workqueue("%s-%s-stats", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0, dev_driver_string(dev), dev_name(dev)); if (!adapter->stats_wq) { @@ -210,7 +214,8 @@ static int idpf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_stats_wq_alloc; } - adapter->vc_event_wq = alloc_workqueue("%s-%s-vc_event", 0, 0, + adapter->vc_event_wq = alloc_workqueue("%s-%s-vc_event", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0, dev_driver_string(dev), dev_name(dev)); if (!adapter->vc_event_wq) { From d0ea9ebac3e7a5b53bc259e51f54043aa98696ad Mon Sep 17 00:00:00 2001 From: Manoj Vishwanathan Date: Mon, 16 Dec 2024 16:27:35 +0000 Subject: [PATCH 21/88] idpf: add more info during virtchnl transaction timeout/salt mismatch Add more information related to the transaction like cookie, vc_op, salt when transaction times out and include similar information when transaction salt does not match. Info output for transaction timeout: ------------------- (op:5015 cookie:45fe vc_op:5015 salt:45 timeout:60000ms) ------------------- before it was: ------------------- (op 5015, 60000ms) ------------------- Signed-off-by: Manoj Vishwanathan Signed-off-by: Brian Vazquez Reviewed-by: Jacob Keller Reviewed-by: Pavan Kumar Linga Reviewed-by: Paul Menzel Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_virtchnl.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c index 99bdb95bf2266..3d2413b8684fc 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c +++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c @@ -517,8 +517,10 @@ static ssize_t idpf_vc_xn_exec(struct idpf_adapter *adapter, retval = -ENXIO; goto only_unlock; case IDPF_VC_XN_WAITING: - dev_notice_ratelimited(&adapter->pdev->dev, "Transaction timed-out (op %d, %dms)\n", - params->vc_op, params->timeout_ms); + dev_notice_ratelimited(&adapter->pdev->dev, + "Transaction timed-out (op:%d cookie:%04x vc_op:%d salt:%02x timeout:%dms)\n", + params->vc_op, cookie, xn->vc_op, + xn->salt, params->timeout_ms); retval = -ETIME; break; case IDPF_VC_XN_COMPLETED_SUCCESS: @@ -615,8 +617,9 @@ idpf_vc_xn_forward_reply(struct idpf_adapter *adapter, idpf_vc_xn_lock(xn); salt = FIELD_GET(IDPF_VC_XN_SALT_M, msg_info); if (xn->salt != salt) { - dev_err_ratelimited(&adapter->pdev->dev, "Transaction salt does not match (%02x != %02x)\n", - xn->salt, salt); + dev_err_ratelimited(&adapter->pdev->dev, "Transaction salt does not match (exp:%d@%02x(%d) != got:%d@%02x)\n", + xn->vc_op, xn->salt, xn->state, + ctlq_msg->cookie.mbx.chnl_opcode, salt); idpf_vc_xn_unlock(xn); return -EINVAL; } From 18625e26fefced78f2ae28b25e80a07079821e04 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Thu, 19 Dec 2024 12:55:16 +0100 Subject: [PATCH 22/88] ice: fix ice_parser_rt::bst_key array size Fix &ice_parser_rt::bst_key size. It was wrongly set to 10 instead of 20 in the initial impl commit (see Fixes tag). All usage code assumed it was of size 20. That was also the initial size present up to v2 of the intro series [2], but halved by v3 [3] refactor described as "Replace magic hardcoded values with macros." The introducing series was so big that some ugliness was unnoticed, same for bugs :/ ICE_BST_KEY_TCAM_SIZE and ICE_BST_TCAM_KEY_SIZE were differing by one. There was tmp variable @j in the scope of edited function, but was not used in all places. This ugliness is now gone. I'm moving ice_parser_rt::pg_prio a few positions up, to fill up one of the holes in order to compensate for the added 10 bytes to the ::bst_key, resulting in the same size of the whole as prior to the fix, and minimal changes in the offsets of the fields. Extend also the debug dump print of the key to cover all bytes. To not have string with 20 "%02x" and 20 params, switch to ice_debug_array_w_prefix(). This fix obsoletes Ahmed's attempt at [1]. [1] https://lore.kernel.org/intel-wired-lan/20240823230847.172295-1-ahmed.zaki@intel.com [2] https://lore.kernel.org/intel-wired-lan/20230605054641.2865142-13-junfeng.guo@intel.com [3] https://lore.kernel.org/intel-wired-lan/20230817093442.2576997-13-junfeng.guo@intel.com Reported-by: Dan Carpenter Closes: https://lore.kernel.org/intel-wired-lan/b1fb6ff9-b69e-4026-9988-3c783d86c2e0@stanley.mountain Fixes: 9a4c07aaa0f5 ("ice: add parser execution main loop") CC: Ahmed Zaki Reviewed-by: Larysa Zaremba Signed-off-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_parser.h | 6 ++---- drivers/net/ethernet/intel/ice/ice_parser_rt.c | 12 +++++------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_parser.h b/drivers/net/ethernet/intel/ice/ice_parser.h index 6509d807627ce..4f56d53d56b9a 100644 --- a/drivers/net/ethernet/intel/ice/ice_parser.h +++ b/drivers/net/ethernet/intel/ice/ice_parser.h @@ -257,7 +257,6 @@ ice_pg_nm_cam_match(struct ice_pg_nm_cam_item *table, int size, /*** ICE_SID_RXPARSER_BOOST_TCAM and ICE_SID_LBL_RXPARSER_TMEM sections ***/ #define ICE_BST_TCAM_TABLE_SIZE 256 #define ICE_BST_TCAM_KEY_SIZE 20 -#define ICE_BST_KEY_TCAM_SIZE 19 /* Boost TCAM item */ struct ice_bst_tcam_item { @@ -401,7 +400,6 @@ u16 ice_xlt_kb_flag_get(struct ice_xlt_kb *kb, u64 pkt_flag); #define ICE_PARSER_GPR_NUM 128 #define ICE_PARSER_FLG_NUM 64 #define ICE_PARSER_ERR_NUM 16 -#define ICE_BST_KEY_SIZE 10 #define ICE_MARKER_ID_SIZE 9 #define ICE_MARKER_MAX_SIZE \ (ICE_MARKER_ID_SIZE * BITS_PER_BYTE - 1) @@ -431,13 +429,13 @@ struct ice_parser_rt { u8 pkt_buf[ICE_PARSER_MAX_PKT_LEN + ICE_PARSER_PKT_REV]; u16 pkt_len; u16 po; - u8 bst_key[ICE_BST_KEY_SIZE]; + u8 bst_key[ICE_BST_TCAM_KEY_SIZE]; struct ice_pg_cam_key pg_key; + u8 pg_prio; struct ice_alu *alu0; struct ice_alu *alu1; struct ice_alu *alu2; struct ice_pg_cam_action *action; - u8 pg_prio; struct ice_gpr_pu pu; u8 markers[ICE_MARKER_ID_SIZE]; bool protocols[ICE_PO_PAIR_SIZE]; diff --git a/drivers/net/ethernet/intel/ice/ice_parser_rt.c b/drivers/net/ethernet/intel/ice/ice_parser_rt.c index dedf5e854e4b7..3995d662e0509 100644 --- a/drivers/net/ethernet/intel/ice/ice_parser_rt.c +++ b/drivers/net/ethernet/intel/ice/ice_parser_rt.c @@ -125,22 +125,20 @@ static void ice_bst_key_init(struct ice_parser_rt *rt, else key[idd] = imem->b_kb.prio; - idd = ICE_BST_KEY_TCAM_SIZE - 1; + idd = ICE_BST_TCAM_KEY_SIZE - 2; for (i = idd; i >= 0; i--) { int j; j = ho + idd - i; if (j < ICE_PARSER_MAX_PKT_LEN) - key[i] = rt->pkt_buf[ho + idd - i]; + key[i] = rt->pkt_buf[j]; else key[i] = 0; } - ice_debug(rt->psr->hw, ICE_DBG_PARSER, "Generated Boost TCAM Key:\n"); - ice_debug(rt->psr->hw, ICE_DBG_PARSER, "%02X %02X %02X %02X %02X %02X %02X %02X %02X %02X\n", - key[0], key[1], key[2], key[3], key[4], - key[5], key[6], key[7], key[8], key[9]); - ice_debug(rt->psr->hw, ICE_DBG_PARSER, "\n"); + ice_debug_array_w_prefix(rt->psr->hw, ICE_DBG_PARSER, + KBUILD_MODNAME ": Generated Boost TCAM Key", + key, ICE_BST_TCAM_KEY_SIZE); } static u16 ice_bit_rev_u16(u16 v, int len) From c5cc2a27e04f2fcd77c74ada9aef76a758a24697 Mon Sep 17 00:00:00 2001 From: Mateusz Polchlopek Date: Tue, 31 Dec 2024 10:50:44 +0100 Subject: [PATCH 23/88] ice: remove invalid parameter of equalizer It occurred that in the commit 70838938e89c ("ice: Implement driver functionality to dump serdes equalizer values") the invalid DRATE parameter for reading has been added. The output of the command: $ ethtool -d returns the garbage value in the place where DRATE value should be stored. Remove mentioned parameter to prevent return of corrupted data to userspace. Fixes: 70838938e89c ("ice: Implement driver functionality to dump serdes equalizer values") Signed-off-by: Mateusz Polchlopek Reviewed-by: Michal Swiatkowski Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 1 - drivers/net/ethernet/intel/ice/ice_ethtool.c | 1 - drivers/net/ethernet/intel/ice/ice_ethtool.h | 1 - 3 files changed, 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 01536a382e54e..bdee499f991a6 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -1498,7 +1498,6 @@ struct ice_aqc_dnl_equa_param { #define ICE_AQC_RX_EQU_POST1 (0x12 << ICE_AQC_RX_EQU_SHIFT) #define ICE_AQC_RX_EQU_BFLF (0x13 << ICE_AQC_RX_EQU_SHIFT) #define ICE_AQC_RX_EQU_BFHF (0x14 << ICE_AQC_RX_EQU_SHIFT) -#define ICE_AQC_RX_EQU_DRATE (0x15 << ICE_AQC_RX_EQU_SHIFT) #define ICE_AQC_RX_EQU_CTLE_GAINHF (0x20 << ICE_AQC_RX_EQU_SHIFT) #define ICE_AQC_RX_EQU_CTLE_GAINLF (0x21 << ICE_AQC_RX_EQU_SHIFT) #define ICE_AQC_RX_EQU_CTLE_GAINDC (0x22 << ICE_AQC_RX_EQU_SHIFT) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 3072634bf049c..f241493a6ac88 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -710,7 +710,6 @@ static int ice_get_tx_rx_equa(struct ice_hw *hw, u8 serdes_num, { ICE_AQC_RX_EQU_POST1, rx, &ptr->rx_equ_post1 }, { ICE_AQC_RX_EQU_BFLF, rx, &ptr->rx_equ_bflf }, { ICE_AQC_RX_EQU_BFHF, rx, &ptr->rx_equ_bfhf }, - { ICE_AQC_RX_EQU_DRATE, rx, &ptr->rx_equ_drate }, { ICE_AQC_RX_EQU_CTLE_GAINHF, rx, &ptr->rx_equ_ctle_gainhf }, { ICE_AQC_RX_EQU_CTLE_GAINLF, rx, &ptr->rx_equ_ctle_gainlf }, { ICE_AQC_RX_EQU_CTLE_GAINDC, rx, &ptr->rx_equ_ctle_gaindc }, diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.h b/drivers/net/ethernet/intel/ice/ice_ethtool.h index 8f2ad1c172c06..23b2cfbc9684c 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.h +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.h @@ -15,7 +15,6 @@ struct ice_serdes_equalization_to_ethtool { int rx_equ_post1; int rx_equ_bflf; int rx_equ_bfhf; - int rx_equ_drate; int rx_equ_ctle_gainhf; int rx_equ_ctle_gainlf; int rx_equ_ctle_gaindc; From ee7d79433d783346430ee32f28c9df44a88b3bb6 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Thu, 5 Sep 2024 11:14:10 +0200 Subject: [PATCH 24/88] iavf: allow changing VLAN state without calling PF First case: > ip l a l $VF name vlanx type vlan id 100 > ip l d vlanx > ip l a l $VF name vlanx type vlan id 100 As workqueue can be execute after sometime, there is a window to have call trace like that: - iavf_del_vlan - iavf_add_vlan - iavf_del_vlans (wq) It means that our VLAN 100 will change the state from IAVF_VLAN_ACTIVE to IAVF_VLAN_REMOVE (iavf_del_vlan). After that in iavf_add_vlan state won't be changed because VLAN 100 is on the filter list. The final result is that the VLAN 100 filter isn't added in hardware (no iavf_add_vlans call). To fix that change the state if the filter wasn't removed yet directly to active. It is save as IAVF_VLAN_REMOVE means that virtchnl message wasn't sent yet. Second case: > ip l a l $VF name vlanx type vlan id 100 Any type of VF reset ex. change trust > ip l s $PF vf $VF_NUM trust on > ip l d vlanx > ip l a l $VF name vlanx type vlan id 100 In case of reset iavf driver is responsible for readding all filters that are being used. To do that all VLAN filters state are changed to IAVF_VLAN_ADD. Here is even longer window for changing VLAN state from kernel side, as workqueue isn't called immediately. We can have call trace like that: - changing to IAVF_VLAN_ADD (after reset) - iavf_del_vlan (called from kernel ops) - iavf_del_vlans (wq) Not exsisitng VLAN filters will be removed from hardware. It isn't a bug, ice driver will handle it fine. However, we can have call trace like that: - changing to IAVF_VLAN_ADD (after reset) - iavf_del_vlan (called from kernel ops) - iavf_add_vlan (called from kernel ops) - iavf_del_vlans (wq) With fix for previous case we end up with no VLAN filters in hardware. We have to remove VLAN filters if the state is IAVF_VLAN_ADD and delete VLAN was called. It is save as IAVF_VLAN_ADD means that virtchnl message wasn't sent yet. Fixes: 0c0da0e95105 ("iavf: refactor VLAN filter states") Signed-off-by: Michal Swiatkowski Reviewed-by: Przemek Kitszel Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index cbfaaa5b7d02e..2d7a18fcc3be4 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -773,6 +773,11 @@ iavf_vlan_filter *iavf_add_vlan(struct iavf_adapter *adapter, f->state = IAVF_VLAN_ADD; adapter->num_vlan_filters++; iavf_schedule_aq_request(adapter, IAVF_FLAG_AQ_ADD_VLAN_FILTER); + } else if (f->state == IAVF_VLAN_REMOVE) { + /* IAVF_VLAN_REMOVE means that VLAN wasn't yet removed. + * We can safely only change the state here. + */ + f->state = IAVF_VLAN_ACTIVE; } clearout: @@ -793,8 +798,18 @@ static void iavf_del_vlan(struct iavf_adapter *adapter, struct iavf_vlan vlan) f = iavf_find_vlan(adapter, vlan); if (f) { - f->state = IAVF_VLAN_REMOVE; - iavf_schedule_aq_request(adapter, IAVF_FLAG_AQ_DEL_VLAN_FILTER); + /* IAVF_ADD_VLAN means that VLAN wasn't even added yet. + * Remove it from the list. + */ + if (f->state == IAVF_VLAN_ADD) { + list_del(&f->list); + kfree(f); + adapter->num_vlan_filters--; + } else { + f->state = IAVF_VLAN_REMOVE; + iavf_schedule_aq_request(adapter, + IAVF_FLAG_AQ_DEL_VLAN_FILTER); + } } spin_unlock_bh(&adapter->mac_vlan_list_lock); From 05d91cdb1f9108426b14975ef4eeddf15875ca05 Mon Sep 17 00:00:00 2001 From: Paul Fertser Date: Mon, 20 Jan 2025 16:35:36 +0300 Subject: [PATCH 25/88] net/ncsi: use dev_set_mac_address() for Get MC MAC Address handling Copy of the rationale from 790071347a0a1a89e618eedcd51c687ea783aeb3: Change ndo_set_mac_address to dev_set_mac_address because dev_set_mac_address provides a way to notify network layer about MAC change. In other case, services may not aware about MAC change and keep using old one which set from network adapter driver. As example, DHCP client from systemd do not update MAC address without notification from net subsystem which leads to the problem with acquiring the right address from DHCP server. Since dev_set_mac_address requires RTNL lock the operation can not be performed directly in the response handler, see 9e2bbab94b88295dcc57c7580393c9ee08d7314d. The way of selecting the first suitable MAC address from the list is changed, instead of having the driver check it this patch just assumes any valid MAC should be good. Fixes: b8291cf3d118 ("net/ncsi: Add NC-SI 1.2 Get MC MAC Address command") Signed-off-by: Paul Fertser Signed-off-by: David S. Miller --- net/ncsi/ncsi-rsp.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 14bd66909ca45..4a8ce2949faea 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -1089,14 +1089,12 @@ static int ncsi_rsp_handler_netlink(struct ncsi_request *nr) static int ncsi_rsp_handler_gmcma(struct ncsi_request *nr) { struct ncsi_dev_priv *ndp = nr->ndp; + struct sockaddr *saddr = &ndp->pending_mac; struct net_device *ndev = ndp->ndev.dev; struct ncsi_rsp_gmcma_pkt *rsp; - struct sockaddr saddr; - int ret = -1; int i; rsp = (struct ncsi_rsp_gmcma_pkt *)skb_network_header(nr->rsp); - saddr.sa_family = ndev->type; ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netdev_info(ndev, "NCSI: Received %d provisioned MAC addresses\n", @@ -1108,20 +1106,20 @@ static int ncsi_rsp_handler_gmcma(struct ncsi_request *nr) rsp->addresses[i][4], rsp->addresses[i][5]); } + saddr->sa_family = ndev->type; for (i = 0; i < rsp->address_count; i++) { - memcpy(saddr.sa_data, &rsp->addresses[i], ETH_ALEN); - ret = ndev->netdev_ops->ndo_set_mac_address(ndev, &saddr); - if (ret < 0) { + if (!is_valid_ether_addr(rsp->addresses[i])) { netdev_warn(ndev, "NCSI: Unable to assign %pM to device\n", - saddr.sa_data); + rsp->addresses[i]); continue; } - netdev_warn(ndev, "NCSI: Set MAC address to %pM\n", saddr.sa_data); + memcpy(saddr->sa_data, rsp->addresses[i], ETH_ALEN); + netdev_warn(ndev, "NCSI: Will set MAC address to %pM\n", saddr->sa_data); break; } - ndp->gma_flag = ret == 0; - return ret; + ndp->gma_flag = 1; + return 0; } static struct ncsi_rsp_handler { From 09ebd028d6d70c7dc4b1c69212a18134ad2e0020 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=A2=E8=87=B4=E9=82=A6=20=28XIE=20Zhibang=29?= Date: Thu, 23 Jan 2025 11:57:03 +0000 Subject: [PATCH 26/88] net: the appletalk subsystem no longer uses ndo_do_ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ndo_do_ioctl is no longer used by the appletalk subsystem after commit 45bd1c5ba758 ("net: appletalk: Drop aarp_send_probe_phase1()"). Signed-off-by: 谢致邦 (XIE Zhibang) Reviewed-by: Simon Horman Link: https://patch.msgid.link/tencent_4AC6ED413FEA8116B4253D3ED6947FDBCF08@qq.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8da4c61f97b97..2a59034a5fa2f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1085,8 +1085,8 @@ struct netdev_net_notifier { * * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); * Old-style ioctl entry point. This is used internally by the - * appletalk and ieee802154 subsystems but is no longer called by - * the device ioctl handler. + * ieee802154 subsystem but is no longer called by the device + * ioctl handler. * * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd); * Used by the bonding driver for its device specific ioctls: From 5de7665e0a0746b5ad7943554b34db8f8614a196 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Jan 2025 18:02:44 +0000 Subject: [PATCH 27/88] net: rose: fix timer races against user threads Rose timers only acquire the socket spinlock, without checking if the socket is owned by one user thread. Add a check and rearm the timers if needed. BUG: KASAN: slab-use-after-free in rose_timer_expiry+0x31d/0x360 net/rose/rose_timer.c:174 Read of size 2 at addr ffff88802f09b82a by task swapper/0/0 CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.13.0-rc5-syzkaller-00172-gd1bf27c4e176 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x169/0x550 mm/kasan/report.c:489 kasan_report+0x143/0x180 mm/kasan/report.c:602 rose_timer_expiry+0x31d/0x360 net/rose/rose_timer.c:174 call_timer_fn+0x187/0x650 kernel/time/timer.c:1793 expire_timers kernel/time/timer.c:1844 [inline] __run_timers kernel/time/timer.c:2418 [inline] __run_timer_base+0x66a/0x8e0 kernel/time/timer.c:2430 run_timer_base kernel/time/timer.c:2439 [inline] run_timer_softirq+0xb7/0x170 kernel/time/timer.c:2449 handle_softirqs+0x2d4/0x9b0 kernel/softirq.c:561 __do_softirq kernel/softirq.c:595 [inline] invoke_softirq kernel/softirq.c:435 [inline] __irq_exit_rcu+0xf7/0x220 kernel/softirq.c:662 irq_exit_rcu+0x9/0x30 kernel/softirq.c:678 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1049 [inline] sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1049 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250122180244.1861468-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/rose/rose_timer.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index f06ddbed3fed6..1525773e94aa1 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -122,6 +122,10 @@ static void rose_heartbeat_expiry(struct timer_list *t) struct rose_sock *rose = rose_sk(sk); bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ/20); + goto out; + } switch (rose->state) { case ROSE_STATE_0: /* Magic here: If we listen() and a new link dies before it @@ -152,6 +156,7 @@ static void rose_heartbeat_expiry(struct timer_list *t) } rose_start_heartbeat(sk); +out: bh_unlock_sock(sk); sock_put(sk); } @@ -162,6 +167,10 @@ static void rose_timer_expiry(struct timer_list *t) struct sock *sk = &rose->sock; bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &rose->timer, jiffies + HZ/20); + goto out; + } switch (rose->state) { case ROSE_STATE_1: /* T1 */ case ROSE_STATE_4: /* T2 */ @@ -182,6 +191,7 @@ static void rose_timer_expiry(struct timer_list *t) } break; } +out: bh_unlock_sock(sk); sock_put(sk); } @@ -192,6 +202,10 @@ static void rose_idletimer_expiry(struct timer_list *t) struct sock *sk = &rose->sock; bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &rose->idletimer, jiffies + HZ/20); + goto out; + } rose_clear_queues(sk); rose_write_internal(sk, ROSE_CLEAR_REQUEST); @@ -207,6 +221,7 @@ static void rose_idletimer_expiry(struct timer_list *t) sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); } +out: bh_unlock_sock(sk); sock_put(sk); } From 50bf398e1ceacb9a7f85bd3bdca065ebe5cb6159 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 22 Jan 2025 14:45:03 -0800 Subject: [PATCH 28/88] net: netdevsim: try to close UDP port harness races syzbot discovered that we remove the debugfs files after we free the netdev. Try to clean up the relevant dir while the device is still around. Reported-by: syzbot+2e5de9e3ab986b71d2bf@syzkaller.appspotmail.com Fixes: 424be63ad831 ("netdevsim: add UDP tunnel port offload support") Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250122224503.762705-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/netdevsim.h | 1 + drivers/net/netdevsim/udp_tunnels.c | 23 +++++++++++-------- .../drivers/net/netdevsim/udp_tunnel_nic.sh | 16 ++++++------- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index dcf073bc4802e..96d54c08043d3 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -134,6 +134,7 @@ struct netdevsim { u32 sleep; u32 __ports[2][NSIM_UDP_TUNNEL_N_PORTS]; u32 (*ports)[NSIM_UDP_TUNNEL_N_PORTS]; + struct dentry *ddir; struct debugfs_u32_array dfs_ports[2]; } udp_ports; diff --git a/drivers/net/netdevsim/udp_tunnels.c b/drivers/net/netdevsim/udp_tunnels.c index 02dc3123eb6c1..640b4983a9a0d 100644 --- a/drivers/net/netdevsim/udp_tunnels.c +++ b/drivers/net/netdevsim/udp_tunnels.c @@ -112,9 +112,11 @@ nsim_udp_tunnels_info_reset_write(struct file *file, const char __user *data, struct net_device *dev = file->private_data; struct netdevsim *ns = netdev_priv(dev); - memset(ns->udp_ports.ports, 0, sizeof(ns->udp_ports.__ports)); rtnl_lock(); - udp_tunnel_nic_reset_ntf(dev); + if (dev->reg_state == NETREG_REGISTERED) { + memset(ns->udp_ports.ports, 0, sizeof(ns->udp_ports.__ports)); + udp_tunnel_nic_reset_ntf(dev); + } rtnl_unlock(); return count; @@ -144,23 +146,23 @@ int nsim_udp_tunnels_info_create(struct nsim_dev *nsim_dev, else ns->udp_ports.ports = nsim_dev->udp_ports.__ports; - debugfs_create_u32("udp_ports_inject_error", 0600, - ns->nsim_dev_port->ddir, + ns->udp_ports.ddir = debugfs_create_dir("udp_ports", + ns->nsim_dev_port->ddir); + + debugfs_create_u32("inject_error", 0600, ns->udp_ports.ddir, &ns->udp_ports.inject_error); ns->udp_ports.dfs_ports[0].array = ns->udp_ports.ports[0]; ns->udp_ports.dfs_ports[0].n_elements = NSIM_UDP_TUNNEL_N_PORTS; - debugfs_create_u32_array("udp_ports_table0", 0400, - ns->nsim_dev_port->ddir, + debugfs_create_u32_array("table0", 0400, ns->udp_ports.ddir, &ns->udp_ports.dfs_ports[0]); ns->udp_ports.dfs_ports[1].array = ns->udp_ports.ports[1]; ns->udp_ports.dfs_ports[1].n_elements = NSIM_UDP_TUNNEL_N_PORTS; - debugfs_create_u32_array("udp_ports_table1", 0400, - ns->nsim_dev_port->ddir, + debugfs_create_u32_array("table1", 0400, ns->udp_ports.ddir, &ns->udp_ports.dfs_ports[1]); - debugfs_create_file("udp_ports_reset", 0200, ns->nsim_dev_port->ddir, + debugfs_create_file("reset", 0200, ns->udp_ports.ddir, dev, &nsim_udp_tunnels_info_reset_fops); /* Note: it's not normal to allocate the info struct like this! @@ -196,6 +198,9 @@ int nsim_udp_tunnels_info_create(struct nsim_dev *nsim_dev, void nsim_udp_tunnels_info_destroy(struct net_device *dev) { + struct netdevsim *ns = netdev_priv(dev); + + debugfs_remove_recursive(ns->udp_ports.ddir); kfree(dev->udp_tunnel_nic_info); dev->udp_tunnel_nic_info = NULL; } diff --git a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh index 384cfa3d38a6c..92c2f0376c081 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh @@ -142,7 +142,7 @@ function pre_ethtool { } function check_table { - local path=$NSIM_DEV_DFS/ports/$port/udp_ports_table$1 + local path=$NSIM_DEV_DFS/ports/$port/udp_ports/table$1 local -n expected=$2 local last=$3 @@ -212,7 +212,7 @@ function check_tables { } function print_table { - local path=$NSIM_DEV_DFS/ports/$port/udp_ports_table$1 + local path=$NSIM_DEV_DFS/ports/$port/udp_ports/table$1 read -a have < $path tree $NSIM_DEV_DFS/ @@ -641,7 +641,7 @@ for port in 0 1; do NSIM_NETDEV=`get_netdev_name old_netdevs` ip link set dev $NSIM_NETDEV up - echo 110 > $NSIM_DEV_DFS/ports/$port/udp_ports_inject_error + echo 110 > $NSIM_DEV_DFS/ports/$port/udp_ports/inject_error msg="1 - create VxLANs v6" exp0=( 0 0 0 0 ) @@ -663,7 +663,7 @@ for port in 0 1; do new_geneve gnv0 20000 msg="2 - destroy GENEVE" - echo 2 > $NSIM_DEV_DFS/ports/$port/udp_ports_inject_error + echo 2 > $NSIM_DEV_DFS/ports/$port/udp_ports/inject_error exp1=( `mke 20000 2` 0 0 0 ) del_dev gnv0 @@ -764,7 +764,7 @@ for port in 0 1; do msg="create VxLANs v4" new_vxlan vxlan0 10000 $NSIM_NETDEV - echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports_reset + echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports/reset check_tables msg="NIC device goes down" @@ -775,7 +775,7 @@ for port in 0 1; do fi check_tables - echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports_reset + echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports/reset check_tables msg="NIC device goes up again" @@ -789,7 +789,7 @@ for port in 0 1; do del_dev vxlan0 check_tables - echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports_reset + echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports/reset check_tables msg="destroy NIC" @@ -896,7 +896,7 @@ msg="vacate VxLAN in overflow table" exp0=( `mke 10000 1` `mke 10004 1` 0 `mke 10003 1` ) del_dev vxlan2 -echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports_reset +echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports/reset check_tables msg="tunnels destroyed 2" From 979284535aaf12a287a2f43d9d5dfcbdc1dc4cac Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 22 Jan 2025 16:04:07 -0800 Subject: [PATCH 29/88] net/mlx5e: add missing cpu_to_node to kvzalloc_node in mlx5e_open_xdpredirect_sq kvzalloc_node is not doing a runtime check on the node argument (__alloc_pages_node_noprof does have a VM_BUG_ON, but it expands to nothing on !CONFIG_DEBUG_VM builds), so doing any ethtool/netlink operation that calls mlx5e_open on a CPU that's larger that MAX_NUMNODES triggers OOB access and panic (see the trace below). Add missing cpu_to_node call to convert cpu id to node id. [ 165.427394] mlx5_core 0000:5c:00.0 beth1: Link up [ 166.479327] BUG: unable to handle page fault for address: 0000000800000010 [ 166.494592] #PF: supervisor read access in kernel mode [ 166.505995] #PF: error_code(0x0000) - not-present page ... [ 166.816958] Call Trace: [ 166.822380] [ 166.827034] ? __die_body+0x64/0xb0 [ 166.834774] ? page_fault_oops+0x2cd/0x3f0 [ 166.843862] ? exc_page_fault+0x63/0x130 [ 166.852564] ? asm_exc_page_fault+0x22/0x30 [ 166.861843] ? __kvmalloc_node_noprof+0x43/0xd0 [ 166.871897] ? get_partial_node+0x1c/0x320 [ 166.880983] ? deactivate_slab+0x269/0x2b0 [ 166.890069] ___slab_alloc+0x521/0xa90 [ 166.898389] ? __kvmalloc_node_noprof+0x43/0xd0 [ 166.908442] __kmalloc_node_noprof+0x216/0x3f0 [ 166.918302] ? __kvmalloc_node_noprof+0x43/0xd0 [ 166.928354] __kvmalloc_node_noprof+0x43/0xd0 [ 166.938021] mlx5e_open_channels+0x5e2/0xc00 [ 166.947496] mlx5e_open_locked+0x3e/0xf0 [ 166.956201] mlx5e_open+0x23/0x50 [ 166.963551] __dev_open+0x114/0x1c0 [ 166.971292] __dev_change_flags+0xa2/0x1b0 [ 166.980378] dev_change_flags+0x21/0x60 [ 166.988887] do_setlink+0x38d/0xf20 [ 166.996628] ? ep_poll_callback+0x1b9/0x240 [ 167.005910] ? __nla_validate_parse.llvm.10713395753544950386+0x80/0xd70 [ 167.020782] ? __wake_up_sync_key+0x52/0x80 [ 167.030066] ? __mutex_lock+0xff/0x550 [ 167.038382] ? security_capable+0x50/0x90 [ 167.047279] rtnl_setlink+0x1c9/0x210 [ 167.055403] ? ep_poll_callback+0x1b9/0x240 [ 167.064684] ? security_capable+0x50/0x90 [ 167.073579] rtnetlink_rcv_msg+0x2f9/0x310 [ 167.082667] ? rtnetlink_bind+0x30/0x30 [ 167.091173] netlink_rcv_skb+0xb1/0xe0 [ 167.099492] netlink_unicast+0x20f/0x2e0 [ 167.108191] netlink_sendmsg+0x389/0x420 [ 167.116896] __sys_sendto+0x158/0x1c0 [ 167.125024] __x64_sys_sendto+0x22/0x30 [ 167.133534] do_syscall_64+0x63/0x130 [ 167.141657] ? __irq_exit_rcu.llvm.17843942359718260576+0x52/0xd0 [ 167.155181] entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: bb135e40129d ("net/mlx5e: move XDP_REDIRECT sq to dynamic allocation") Signed-off-by: Stanislav Fomichev Reviewed-by: Joe Damato Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20250123000407.3464715-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index bd41b75d246e1..a814b63ed97e5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2087,7 +2087,7 @@ static struct mlx5e_xdpsq *mlx5e_open_xdpredirect_sq(struct mlx5e_channel *c, struct mlx5e_xdpsq *xdpsq; int err; - xdpsq = kvzalloc_node(sizeof(*xdpsq), GFP_KERNEL, c->cpu); + xdpsq = kvzalloc_node(sizeof(*xdpsq), GFP_KERNEL, cpu_to_node(c->cpu)); if (!xdpsq) return ERR_PTR(-ENOMEM); From cde5959913b713eecee44451c9bb3141e625a8d8 Mon Sep 17 00:00:00 2001 From: Khaled Elnaggar Date: Thu, 23 Jan 2025 10:25:20 +0200 Subject: [PATCH 30/88] documentation: networking: fix spelling mistakes Fix a couple of typos/spelling mistakes in the documentation. Signed-off-by: Khaled Elnaggar Acked-by: Marc Kleine-Budde Reviewed-by: Przemek Kitszel Acked-by: Randy Dunlap Reviewed-by: Bagas Sanjaya Link: https://patch.msgid.link/20250123082521.59997-1-khaledelnaggarlinux@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/networking/can.rst | 4 ++-- Documentation/networking/napi.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst index 62519d38c58ba..b018ce3463926 100644 --- a/Documentation/networking/can.rst +++ b/Documentation/networking/can.rst @@ -699,10 +699,10 @@ RAW socket option CAN_RAW_JOIN_FILTERS The CAN_RAW socket can set multiple CAN identifier specific filters that lead to multiple filters in the af_can.c filter processing. These filters -are indenpendent from each other which leads to logical OR'ed filters when +are independent from each other which leads to logical OR'ed filters when applied (see :ref:`socketcan-rawfilter`). -This socket option joines the given CAN filters in the way that only CAN +This socket option joins the given CAN filters in the way that only CAN frames are passed to user space that matched *all* given CAN filters. The semantic for the applied filters is therefore changed to a logical AND. diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst index 6083210ab2a42..f970a2be271a0 100644 --- a/Documentation/networking/napi.rst +++ b/Documentation/networking/napi.rst @@ -362,7 +362,7 @@ It is expected that ``irq-suspend-timeout`` will be set to a value much larger than ``gro_flush_timeout`` as ``irq-suspend-timeout`` should suspend IRQs for the duration of one userland processing cycle. -While it is not stricly necessary to use ``napi_defer_hard_irqs`` and +While it is not strictly necessary to use ``napi_defer_hard_irqs`` and ``gro_flush_timeout`` to use IRQ suspension, their use is strongly recommended. From fd53aa40e65f518453115b6f56183b0c201db26b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 23 Jan 2025 08:22:40 +0100 Subject: [PATCH 31/88] ptp: Ensure info->enable callback is always set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ioctl and sysfs handlers unconditionally call the ->enable callback. Not all drivers implement that callback, leading to NULL dereferences. Example of affected drivers: ptp_s390.c, ptp_vclock.c and ptp_mock.c. Instead use a dummy callback if no better was specified by the driver. Fixes: d94ba80ebbea ("ptp: Added a brand new class driver for ptp clocks.") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Acked-by: Richard Cochran Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250123-ptp-enable-v1-1-b015834d3a47@weissschuh.net Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_clock.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index b932425ddc6a3..35a5994bf64f6 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -217,6 +217,11 @@ static int ptp_getcycles64(struct ptp_clock_info *info, struct timespec64 *ts) return info->gettime64(info, ts); } +static int ptp_enable(struct ptp_clock_info *ptp, struct ptp_clock_request *request, int on) +{ + return -EOPNOTSUPP; +} + static void ptp_aux_kworker(struct kthread_work *work) { struct ptp_clock *ptp = container_of(work, struct ptp_clock, @@ -294,6 +299,9 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, ptp->info->getcrosscycles = ptp->info->getcrosststamp; } + if (!ptp->info->enable) + ptp->info->enable = ptp_enable; + if (ptp->info->do_aux_work) { kthread_init_delayed_work(&ptp->aux_work, ptp_aux_kworker); ptp->kworker = kthread_run_worker(0, "ptp%d", ptp->index); From 964417a5d4a06614ef7fb3ae69bb17c91a2dc016 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Jan 2025 17:21:30 -0800 Subject: [PATCH 32/88] tools: ynl: c: correct reverse decode of empty attrs netlink reports which attribute was incorrect by sending back an attribute offset. Offset points to the address of struct nlattr, but to interpret the type we also need the nesting path. Attribute IDs have different meaning in different nests of the same message. Correct the condition for "is the offset within current attribute". ynl_attr_data_len() does not include the attribute header, so the end offset was off by 4 bytes. This means that we'd always skip over flags and empty nests. The devmem tests, for example, issues an invalid request with empty queue nests, resulting in the following error: YNL failed: Kernel error: missing attribute: .queues.ifindex The message is incorrect, "queues" nest does not have an "ifindex" attribute defined. With this fix we decend correctly into the nest: YNL failed: Kernel error: missing attribute: .queues.id Fixes: 86878f14d71a ("tools: ynl: user space helpers") Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250124012130.1121227-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/net/ynl/lib/ynl.c b/tools/net/ynl/lib/ynl.c index e16cef160bc2c..ce32cb35007d6 100644 --- a/tools/net/ynl/lib/ynl.c +++ b/tools/net/ynl/lib/ynl.c @@ -95,7 +95,7 @@ ynl_err_walk(struct ynl_sock *ys, void *start, void *end, unsigned int off, ynl_attr_for_each_payload(start, data_len, attr) { astart_off = (char *)attr - (char *)start; - aend_off = astart_off + ynl_attr_data_len(attr); + aend_off = (char *)ynl_attr_data_end(attr) - (char *)start; if (aend_off <= off) continue; From 8ed47e4e0b428424e4c43704f12e9c1085f5425c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Jan 2025 19:18:35 -0800 Subject: [PATCH 33/88] eth: tg3: fix calling napi_enable() in atomic context tg3 has a spin lock protecting most of the config, switch to taking netdev_lock() explicitly on enable/start paths. Disable/stop paths seem to not be under the spin lock (since napi_disable() already needs to sleep), so leave that side as is. tg3_restart_hw() releases and re-takes the spin lock, we need to do the same because dev_close() needs to take netdev_lock(). Fixes: 413f0271f396 ("net: protect NAPI enablement with netdev_lock()") Reported-by: Dan Carpenter Link: https://lore.kernel.org/dcfd56bc-de32-4b11-9e19-d8bd1543745d@stanley.mountain Reviewed-by: Eric Dumazet Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250124031841.1179756-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/tg3.c | 35 +++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 9cc8db10a8d60..1c94bf1db7186 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -7424,7 +7424,7 @@ static void tg3_napi_enable(struct tg3 *tp) for (i = 0; i < tp->irq_cnt; i++) { tnapi = &tp->napi[i]; - napi_enable(&tnapi->napi); + napi_enable_locked(&tnapi->napi); if (tnapi->tx_buffers) { netif_queue_set_napi(tp->dev, txq_idx, NETDEV_QUEUE_TYPE_TX, @@ -7445,9 +7445,10 @@ static void tg3_napi_init(struct tg3 *tp) int i; for (i = 0; i < tp->irq_cnt; i++) { - netif_napi_add(tp->dev, &tp->napi[i].napi, - i ? tg3_poll_msix : tg3_poll); - netif_napi_set_irq(&tp->napi[i].napi, tp->napi[i].irq_vec); + netif_napi_add_locked(tp->dev, &tp->napi[i].napi, + i ? tg3_poll_msix : tg3_poll); + netif_napi_set_irq_locked(&tp->napi[i].napi, + tp->napi[i].irq_vec); } } @@ -11259,6 +11260,8 @@ static void tg3_timer_stop(struct tg3 *tp) static int tg3_restart_hw(struct tg3 *tp, bool reset_phy) __releases(tp->lock) __acquires(tp->lock) + __releases(tp->dev->lock) + __acquires(tp->dev->lock) { int err; @@ -11271,7 +11274,9 @@ static int tg3_restart_hw(struct tg3 *tp, bool reset_phy) tg3_timer_stop(tp); tp->irq_sync = 0; tg3_napi_enable(tp); + netdev_unlock(tp->dev); dev_close(tp->dev); + netdev_lock(tp->dev); tg3_full_lock(tp, 0); } return err; @@ -11299,6 +11304,7 @@ static void tg3_reset_task(struct work_struct *work) tg3_netif_stop(tp); + netdev_lock(tp->dev); tg3_full_lock(tp, 1); if (tg3_flag(tp, TX_RECOVERY_PENDING)) { @@ -11318,12 +11324,14 @@ static void tg3_reset_task(struct work_struct *work) * call cancel_work_sync() and wait forever. */ tg3_flag_clear(tp, RESET_TASK_PENDING); + netdev_unlock(tp->dev); dev_close(tp->dev); goto out; } tg3_netif_start(tp); tg3_full_unlock(tp); + netdev_unlock(tp->dev); tg3_phy_start(tp); tg3_flag_clear(tp, RESET_TASK_PENDING); out: @@ -11683,9 +11691,11 @@ static int tg3_start(struct tg3 *tp, bool reset_phy, bool test_irq, if (err) goto out_ints_fini; + netdev_lock(dev); tg3_napi_init(tp); tg3_napi_enable(tp); + netdev_unlock(dev); for (i = 0; i < tp->irq_cnt; i++) { err = tg3_request_irq(tp, i); @@ -12569,6 +12579,7 @@ static int tg3_set_ringparam(struct net_device *dev, irq_sync = 1; } + netdev_lock(dev); tg3_full_lock(tp, irq_sync); tp->rx_pending = ering->rx_pending; @@ -12597,6 +12608,7 @@ static int tg3_set_ringparam(struct net_device *dev, } tg3_full_unlock(tp); + netdev_unlock(dev); if (irq_sync && !err) tg3_phy_start(tp); @@ -12678,6 +12690,7 @@ static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam irq_sync = 1; } + netdev_lock(dev); tg3_full_lock(tp, irq_sync); if (epause->autoneg) @@ -12707,6 +12720,7 @@ static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam } tg3_full_unlock(tp); + netdev_unlock(dev); } tp->phy_flags |= TG3_PHYFLG_USER_CONFIGURED; @@ -13911,6 +13925,7 @@ static void tg3_self_test(struct net_device *dev, struct ethtool_test *etest, data[TG3_INTERRUPT_TEST] = 1; } + netdev_lock(dev); tg3_full_lock(tp, 0); tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); @@ -13922,6 +13937,7 @@ static void tg3_self_test(struct net_device *dev, struct ethtool_test *etest, } tg3_full_unlock(tp); + netdev_unlock(dev); if (irq_sync && !err2) tg3_phy_start(tp); @@ -14365,6 +14381,7 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu) tg3_set_mtu(dev, tp, new_mtu); + netdev_lock(dev); tg3_full_lock(tp, 1); tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); @@ -14384,6 +14401,7 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu) tg3_netif_start(tp); tg3_full_unlock(tp); + netdev_unlock(dev); if (!err) tg3_phy_start(tp); @@ -18164,6 +18182,7 @@ static int tg3_resume(struct device *device) netif_device_attach(dev); + netdev_lock(dev); tg3_full_lock(tp, 0); tg3_ape_driver_state_change(tp, RESET_KIND_INIT); @@ -18180,6 +18199,7 @@ static int tg3_resume(struct device *device) out: tg3_full_unlock(tp); + netdev_unlock(dev); if (!err) tg3_phy_start(tp); @@ -18260,7 +18280,9 @@ static pci_ers_result_t tg3_io_error_detected(struct pci_dev *pdev, done: if (state == pci_channel_io_perm_failure) { if (netdev) { + netdev_lock(netdev); tg3_napi_enable(tp); + netdev_unlock(netdev); dev_close(netdev); } err = PCI_ERS_RESULT_DISCONNECT; @@ -18314,7 +18336,9 @@ static pci_ers_result_t tg3_io_slot_reset(struct pci_dev *pdev) done: if (rc != PCI_ERS_RESULT_RECOVERED && netdev && netif_running(netdev)) { + netdev_lock(netdev); tg3_napi_enable(tp); + netdev_unlock(netdev); dev_close(netdev); } rtnl_unlock(); @@ -18340,12 +18364,14 @@ static void tg3_io_resume(struct pci_dev *pdev) if (!netdev || !netif_running(netdev)) goto done; + netdev_lock(netdev); tg3_full_lock(tp, 0); tg3_ape_driver_state_change(tp, RESET_KIND_INIT); tg3_flag_set(tp, INIT_COMPLETE); err = tg3_restart_hw(tp, true); if (err) { tg3_full_unlock(tp); + netdev_unlock(netdev); netdev_err(netdev, "Cannot restart hardware after reset.\n"); goto done; } @@ -18357,6 +18383,7 @@ static void tg3_io_resume(struct pci_dev *pdev) tg3_netif_start(tp); tg3_full_unlock(tp); + netdev_unlock(netdev); tg3_phy_start(tp); From a878f3e4ace7c042852747c2cd34c15872d59cd2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Jan 2025 19:18:36 -0800 Subject: [PATCH 34/88] eth: forcedeth: remove local wrappers for napi enable/disable The local helpers for calling napi_enable() and napi_disable() don't serve much purpose and they will complicate the fix in the subsequent patch. Remove them, call the core functions directly. Acked-by: Zhu Yanjun Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250124031841.1179756-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/nvidia/forcedeth.c | 30 +++++++------------------ 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index 720f577929dbf..b00df57f2ca39 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -1120,20 +1120,6 @@ static void nv_disable_hw_interrupts(struct net_device *dev, u32 mask) } } -static void nv_napi_enable(struct net_device *dev) -{ - struct fe_priv *np = get_nvpriv(dev); - - napi_enable(&np->napi); -} - -static void nv_napi_disable(struct net_device *dev) -{ - struct fe_priv *np = get_nvpriv(dev); - - napi_disable(&np->napi); -} - #define MII_READ (-1) /* mii_rw: read/write a register on the PHY. * @@ -3114,7 +3100,7 @@ static int nv_change_mtu(struct net_device *dev, int new_mtu) * Changing the MTU is a rare event, it shouldn't matter. */ nv_disable_irq(dev); - nv_napi_disable(dev); + napi_disable(&np->napi); netif_tx_lock_bh(dev); netif_addr_lock(dev); spin_lock(&np->lock); @@ -3143,7 +3129,7 @@ static int nv_change_mtu(struct net_device *dev, int new_mtu) spin_unlock(&np->lock); netif_addr_unlock(dev); netif_tx_unlock_bh(dev); - nv_napi_enable(dev); + napi_enable(&np->napi); nv_enable_irq(dev); } return 0; @@ -4731,7 +4717,7 @@ static int nv_set_ringparam(struct net_device *dev, if (netif_running(dev)) { nv_disable_irq(dev); - nv_napi_disable(dev); + napi_disable(&np->napi); netif_tx_lock_bh(dev); netif_addr_lock(dev); spin_lock(&np->lock); @@ -4784,7 +4770,7 @@ static int nv_set_ringparam(struct net_device *dev, spin_unlock(&np->lock); netif_addr_unlock(dev); netif_tx_unlock_bh(dev); - nv_napi_enable(dev); + napi_enable(&np->napi); nv_enable_irq(dev); } return 0; @@ -5277,7 +5263,7 @@ static void nv_self_test(struct net_device *dev, struct ethtool_test *test, u64 if (test->flags & ETH_TEST_FL_OFFLINE) { if (netif_running(dev)) { netif_stop_queue(dev); - nv_napi_disable(dev); + napi_disable(&np->napi); netif_tx_lock_bh(dev); netif_addr_lock(dev); spin_lock_irq(&np->lock); @@ -5334,7 +5320,7 @@ static void nv_self_test(struct net_device *dev, struct ethtool_test *test, u64 /* restart rx engine */ nv_start_rxtx(dev); netif_start_queue(dev); - nv_napi_enable(dev); + napi_enable(&np->napi); nv_enable_hw_interrupts(dev, np->irqmask); } } @@ -5594,7 +5580,7 @@ static int nv_open(struct net_device *dev) ret = nv_update_linkspeed(dev); nv_start_rxtx(dev); netif_start_queue(dev); - nv_napi_enable(dev); + napi_enable(&np->napi); if (ret) { netif_carrier_on(dev); @@ -5632,7 +5618,7 @@ static int nv_close(struct net_device *dev) spin_lock_irq(&np->lock); np->in_shutdown = 1; spin_unlock_irq(&np->lock); - nv_napi_disable(dev); + napi_disable(&np->napi); synchronize_irq(np->pci_dev->irq); del_timer_sync(&np->oom_kick); From 5c4470a1719c664d3b9aaa03619a49d74d6e6345 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Jan 2025 19:18:37 -0800 Subject: [PATCH 35/88] eth: forcedeth: fix calling napi_enable() in atomic context napi_enable() may sleep now, take netdev_lock() before np->lock. Fixes: 413f0271f396 ("net: protect NAPI enablement with netdev_lock()") Reported-by: Dan Carpenter Link: https://lore.kernel.org/dcfd56bc-de32-4b11-9e19-d8bd1543745d@stanley.mountain Acked-by: Zhu Yanjun Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250124031841.1179756-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/nvidia/forcedeth.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index b00df57f2ca39..499e5e39d5134 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -5562,6 +5562,7 @@ static int nv_open(struct net_device *dev) /* ask for interrupts */ nv_enable_hw_interrupts(dev, np->irqmask); + netdev_lock(dev); spin_lock_irq(&np->lock); writel(NVREG_MCASTADDRA_FORCE, base + NvRegMulticastAddrA); writel(0, base + NvRegMulticastAddrB); @@ -5580,7 +5581,7 @@ static int nv_open(struct net_device *dev) ret = nv_update_linkspeed(dev); nv_start_rxtx(dev); netif_start_queue(dev); - napi_enable(&np->napi); + napi_enable_locked(&np->napi); if (ret) { netif_carrier_on(dev); @@ -5597,6 +5598,7 @@ static int nv_open(struct net_device *dev) round_jiffies(jiffies + STATS_INTERVAL)); spin_unlock_irq(&np->lock); + netdev_unlock(dev); /* If the loopback feature was set while the device was down, make sure * that it's set correctly now. From d19e612c47f4cf9da07105d95ac7233fe27e5f48 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Jan 2025 19:18:38 -0800 Subject: [PATCH 36/88] eth: 8139too: fix calling napi_enable() in atomic context napi_enable() may sleep now, take netdev_lock() before tp->lock and tp->rx_lock. Fixes: 413f0271f396 ("net: protect NAPI enablement with netdev_lock()") Reported-by: Dan Carpenter Link: https://lore.kernel.org/dcfd56bc-de32-4b11-9e19-d8bd1543745d@stanley.mountain Reviewed-by: Eric Dumazet Acked-by: Francois Romieu Link: https://patch.msgid.link/20250124031841.1179756-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/8139too.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c index 9ce0e8a64ba83..a73dcaffa8c5d 100644 --- a/drivers/net/ethernet/realtek/8139too.c +++ b/drivers/net/ethernet/realtek/8139too.c @@ -1684,6 +1684,7 @@ static void rtl8139_tx_timeout_task (struct work_struct *work) if (tmp8 & CmdTxEnb) RTL_W8 (ChipCmd, CmdRxEnb); + netdev_lock(dev); spin_lock_bh(&tp->rx_lock); /* Disable interrupts by clearing the interrupt mask. */ RTL_W16 (IntrMask, 0x0000); @@ -1694,11 +1695,12 @@ static void rtl8139_tx_timeout_task (struct work_struct *work) spin_unlock_irq(&tp->lock); /* ...and finally, reset everything */ - napi_enable(&tp->napi); + napi_enable_locked(&tp->napi); rtl8139_hw_start(dev); netif_wake_queue(dev); spin_unlock_bh(&tp->rx_lock); + netdev_unlock(dev); } static void rtl8139_tx_timeout(struct net_device *dev, unsigned int txqueue) From f1d12bc7a596fce1df6a02cab1fdb797cc4d1642 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Jan 2025 19:18:39 -0800 Subject: [PATCH 37/88] eth: niu: fix calling napi_enable() in atomic context napi_enable() may sleep now, take netdev_lock() before np->lock. Fixes: 413f0271f396 ("net: protect NAPI enablement with netdev_lock()") Reported-by: Dan Carpenter Link: https://lore.kernel.org/dcfd56bc-de32-4b11-9e19-d8bd1543745d@stanley.mountain Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250124031841.1179756-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sun/niu.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index d7459866d24c1..72177fea1cfb3 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -6086,7 +6086,7 @@ static void niu_enable_napi(struct niu *np) int i; for (i = 0; i < np->num_ldg; i++) - napi_enable(&np->ldg[i].napi); + napi_enable_locked(&np->ldg[i].napi); } static void niu_disable_napi(struct niu *np) @@ -6116,7 +6116,9 @@ static int niu_open(struct net_device *dev) if (err) goto out_free_channels; + netdev_lock(dev); niu_enable_napi(np); + netdev_unlock(dev); spin_lock_irq(&np->lock); @@ -6521,6 +6523,7 @@ static void niu_reset_task(struct work_struct *work) niu_reset_buffers(np); + netdev_lock(np->dev); spin_lock_irqsave(&np->lock, flags); err = niu_init_hw(np); @@ -6531,6 +6534,7 @@ static void niu_reset_task(struct work_struct *work) } spin_unlock_irqrestore(&np->lock, flags); + netdev_unlock(np->dev); } static void niu_tx_timeout(struct net_device *dev, unsigned int txqueue) @@ -6761,7 +6765,9 @@ static int niu_change_mtu(struct net_device *dev, int new_mtu) niu_free_channels(np); + netdev_lock(dev); niu_enable_napi(np); + netdev_unlock(dev); err = niu_alloc_channels(np); if (err) @@ -9937,6 +9943,7 @@ static int __maybe_unused niu_resume(struct device *dev_d) spin_lock_irqsave(&np->lock, flags); + netdev_lock(dev); err = niu_init_hw(np); if (!err) { np->timer.expires = jiffies + HZ; @@ -9945,6 +9952,7 @@ static int __maybe_unused niu_resume(struct device *dev_d) } spin_unlock_irqrestore(&np->lock, flags); + netdev_unlock(dev); return err; } From 09a939487fc8b2a437a8f1df20fe95e2857523d9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Jan 2025 19:18:40 -0800 Subject: [PATCH 38/88] eth: via-rhine: fix calling napi_enable() in atomic context napi_enable() may sleep now, take netdev_lock() before rp->lock. napi_enable() is hidden inside init_registers(). Note that this patch orders netdev_lock after rp->task_lock, to avoid having to take the netdev_lock() around disable path. Fixes: 413f0271f396 ("net: protect NAPI enablement with netdev_lock()") Reported-by: Dan Carpenter Link: https://lore.kernel.org/dcfd56bc-de32-4b11-9e19-d8bd1543745d@stanley.mountain Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250124031841.1179756-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/via/via-rhine.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c index 894911f3d5603..e56ebbdd428d5 100644 --- a/drivers/net/ethernet/via/via-rhine.c +++ b/drivers/net/ethernet/via/via-rhine.c @@ -1568,7 +1568,7 @@ static void init_registers(struct net_device *dev) if (rp->quirks & rqMgmt) rhine_init_cam_filter(dev); - napi_enable(&rp->napi); + napi_enable_locked(&rp->napi); iowrite16(RHINE_EVENT & 0xffff, ioaddr + IntrEnable); @@ -1696,7 +1696,10 @@ static int rhine_open(struct net_device *dev) rhine_power_init(dev); rhine_chip_reset(dev); rhine_task_enable(rp); + + netdev_lock(dev); init_registers(dev); + netdev_unlock(dev); netif_dbg(rp, ifup, dev, "%s() Done - status %04x MII status: %04x\n", __func__, ioread16(ioaddr + ChipCmd), @@ -1727,6 +1730,8 @@ static void rhine_reset_task(struct work_struct *work) napi_disable(&rp->napi); netif_tx_disable(dev); + + netdev_lock(dev); spin_lock_bh(&rp->lock); /* clear all descriptors */ @@ -1740,6 +1745,7 @@ static void rhine_reset_task(struct work_struct *work) init_registers(dev); spin_unlock_bh(&rp->lock); + netdev_unlock(dev); netif_trans_update(dev); /* prevent tx timeout */ dev->stats.tx_errors++; @@ -2541,9 +2547,12 @@ static int rhine_resume(struct device *device) alloc_tbufs(dev); rhine_reset_rbufs(rp); rhine_task_enable(rp); + + netdev_lock(dev); spin_lock_bh(&rp->lock); init_registers(dev); spin_unlock_bh(&rp->lock); + netdev_unlock(dev); netif_device_attach(dev); From a60558644e20d036cae0e63d0e3a37f46b7789c0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Jan 2025 19:18:41 -0800 Subject: [PATCH 39/88] wifi: mt76: move napi_enable() from under BH mt76 does a lot of: local_bh_disable(); napi_enable(...napi); napi_schedule(...napi); local_bh_enable(); local_bh_disable() is not a real lock, its most likely taken because napi_schedule() requires that we invoke softirqs at some point. napi_enable() needs to take a mutex, so move it from under the BH protection. Fixes: 413f0271f396 ("net: protect NAPI enablement with netdev_lock()") Reported-by: Dan Carpenter Link: https://lore.kernel.org/dcfd56bc-de32-4b11-9e19-d8bd1543745d@stanley.mountain Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250124031841.1179756-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/wireless/mediatek/mt76/mt7603/mac.c | 9 ++++----- drivers/net/wireless/mediatek/mt76/mt7615/pci.c | 8 ++++++-- .../net/wireless/mediatek/mt76/mt7615/pci_mac.c | 8 +++++--- drivers/net/wireless/mediatek/mt76/mt76x0/pci.c | 8 +++++--- .../net/wireless/mediatek/mt76/mt76x02_mmio.c | 8 +++++--- drivers/net/wireless/mediatek/mt76/mt76x2/pci.c | 7 +++++-- drivers/net/wireless/mediatek/mt76/mt7915/mac.c | 17 +++++++++++++---- drivers/net/wireless/mediatek/mt76/mt7921/pci.c | 7 +++++-- .../net/wireless/mediatek/mt76/mt7921/pci_mac.c | 7 +++++-- drivers/net/wireless/mediatek/mt76/mt7925/pci.c | 7 +++++-- .../net/wireless/mediatek/mt76/mt7925/pci_mac.c | 7 +++++-- drivers/net/wireless/mediatek/mt76/mt7996/mac.c | 12 ++++++------ 12 files changed, 69 insertions(+), 36 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/mac.c b/drivers/net/wireless/mediatek/mt76/mt7603/mac.c index a259f4dd95401..413973d05b431 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/mac.c @@ -1479,14 +1479,13 @@ static void mt7603_mac_watchdog_reset(struct mt7603_dev *dev) tasklet_enable(&dev->mt76.pre_tbtt_tasklet); mt7603_beacon_set_timer(dev, -1, beacon_int); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); - napi_schedule(&dev->mt76.tx_napi); - napi_enable(&dev->mt76.napi[0]); - napi_schedule(&dev->mt76.napi[0]); - napi_enable(&dev->mt76.napi[1]); + + local_bh_disable(); + napi_schedule(&dev->mt76.tx_napi); + napi_schedule(&dev->mt76.napi[0]); napi_schedule(&dev->mt76.napi[1]); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/pci.c b/drivers/net/wireless/mediatek/mt76/mt7615/pci.c index 9a278589df4ef..68010e27f065e 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/pci.c @@ -164,12 +164,16 @@ static int mt7615_pci_resume(struct pci_dev *pdev) dev_err(mdev->dev, "PDMA engine must be reinitialized\n"); mt76_worker_enable(&mdev->tx_worker); - local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { napi_enable(&mdev->napi[i]); - napi_schedule(&mdev->napi[i]); } napi_enable(&mdev->tx_napi); + + local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { + napi_schedule(&mdev->napi[i]); + } napi_schedule(&mdev->tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/pci_mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/pci_mac.c index a0ca3bbdfcafb..c2e4e6aabd9fa 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/pci_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/pci_mac.c @@ -262,12 +262,14 @@ void mt7615_mac_reset_work(struct work_struct *work) mt76_worker_enable(&dev->mt76.tx_worker); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); - napi_schedule(&dev->mt76.tx_napi); - mt76_for_each_q_rx(&dev->mt76, i) { napi_enable(&dev->mt76.napi[i]); + } + + local_bh_disable(); + napi_schedule(&dev->mt76.tx_napi); + mt76_for_each_q_rx(&dev->mt76, i) { napi_schedule(&dev->mt76.napi[i]); } local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c b/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c index 1eb955f3ca130..b456ccd00d581 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c @@ -282,14 +282,16 @@ static int mt76x0e_resume(struct pci_dev *pdev) mt76_worker_enable(&mdev->tx_worker); - local_bh_disable(); mt76_for_each_q_rx(mdev, i) { mt76_queue_rx_reset(dev, i); napi_enable(&mdev->napi[i]); - napi_schedule(&mdev->napi[i]); } - napi_enable(&mdev->tx_napi); + + local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { + napi_schedule(&mdev->napi[i]); + } napi_schedule(&mdev->tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c index 7d840ad4ae65e..a82c75ba26e66 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_mmio.c @@ -504,12 +504,14 @@ static void mt76x02_watchdog_reset(struct mt76x02_dev *dev) mt76_worker_enable(&dev->mt76.tx_worker); tasklet_enable(&dev->mt76.pre_tbtt_tasklet); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); - napi_schedule(&dev->mt76.tx_napi); - mt76_for_each_q_rx(&dev->mt76, i) { napi_enable(&dev->mt76.napi[i]); + } + + local_bh_disable(); + napi_schedule(&dev->mt76.tx_napi); + mt76_for_each_q_rx(&dev->mt76, i) { napi_schedule(&dev->mt76.napi[i]); } local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c index 67c9d1caa0bd6..727bfdd00b400 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci.c @@ -151,12 +151,15 @@ mt76x2e_resume(struct pci_dev *pdev) mt76_worker_enable(&mdev->tx_worker); - local_bh_disable(); mt76_for_each_q_rx(mdev, i) { napi_enable(&mdev->napi[i]); - napi_schedule(&mdev->napi[i]); } napi_enable(&mdev->tx_napi); + + local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { + napi_schedule(&mdev->napi[i]); + } napi_schedule(&mdev->tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c index 13bdc0a7174c9..2ba6eb3038cec 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c @@ -1356,10 +1356,15 @@ mt7915_mac_restart(struct mt7915_dev *dev) mt7915_dma_reset(dev, true); - local_bh_disable(); mt76_for_each_q_rx(mdev, i) { if (mdev->q_rx[i].ndesc) { napi_enable(&dev->mt76.napi[i]); + } + } + + local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { + if (mdev->q_rx[i].ndesc) { napi_schedule(&dev->mt76.napi[i]); } } @@ -1419,8 +1424,9 @@ mt7915_mac_restart(struct mt7915_dev *dev) if (phy2) clear_bit(MT76_RESET, &phy2->mt76->state); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); + + local_bh_disable(); napi_schedule(&dev->mt76.tx_napi); local_bh_enable(); @@ -1570,9 +1576,12 @@ void mt7915_mac_reset_work(struct work_struct *work) if (phy2) clear_bit(MT76_RESET, &phy2->mt76->state); - local_bh_disable(); mt76_for_each_q_rx(&dev->mt76, i) { napi_enable(&dev->mt76.napi[i]); + } + + local_bh_disable(); + mt76_for_each_q_rx(&dev->mt76, i) { napi_schedule(&dev->mt76.napi[i]); } local_bh_enable(); @@ -1581,8 +1590,8 @@ void mt7915_mac_reset_work(struct work_struct *work) mt76_worker_enable(&dev->mt76.tx_worker); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); + local_bh_disable(); napi_schedule(&dev->mt76.tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c index ba870e1b05fb8..a0c9df3c2cc75 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c @@ -523,12 +523,15 @@ static int mt7921_pci_resume(struct device *device) mt76_worker_enable(&mdev->tx_worker); - local_bh_disable(); mt76_for_each_q_rx(mdev, i) { napi_enable(&mdev->napi[i]); - napi_schedule(&mdev->napi[i]); } napi_enable(&mdev->tx_napi); + + local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { + napi_schedule(&mdev->napi[i]); + } napi_schedule(&mdev->tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c index 2452b1a2d1187..881812ba03ffa 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci_mac.c @@ -81,9 +81,12 @@ int mt7921e_mac_reset(struct mt792x_dev *dev) mt792x_wpdma_reset(dev, true); - local_bh_disable(); mt76_for_each_q_rx(&dev->mt76, i) { napi_enable(&dev->mt76.napi[i]); + } + + local_bh_disable(); + mt76_for_each_q_rx(&dev->mt76, i) { napi_schedule(&dev->mt76.napi[i]); } local_bh_enable(); @@ -115,8 +118,8 @@ int mt7921e_mac_reset(struct mt792x_dev *dev) err = __mt7921_start(&dev->phy); out: - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); + local_bh_disable(); napi_schedule(&dev->mt76.tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/pci.c b/drivers/net/wireless/mediatek/mt76/mt7925/pci.c index f36893e20c617..c7b5dc1dbb349 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7925/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/pci.c @@ -556,12 +556,15 @@ static int mt7925_pci_resume(struct device *device) mt76_worker_enable(&mdev->tx_worker); - local_bh_disable(); mt76_for_each_q_rx(mdev, i) { napi_enable(&mdev->napi[i]); - napi_schedule(&mdev->napi[i]); } napi_enable(&mdev->tx_napi); + + local_bh_disable(); + mt76_for_each_q_rx(mdev, i) { + napi_schedule(&mdev->napi[i]); + } napi_schedule(&mdev->tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/pci_mac.c b/drivers/net/wireless/mediatek/mt76/mt7925/pci_mac.c index faedbf766d1a0..4578d16bf456a 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7925/pci_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/pci_mac.c @@ -101,12 +101,15 @@ int mt7925e_mac_reset(struct mt792x_dev *dev) mt792x_wpdma_reset(dev, true); - local_bh_disable(); mt76_for_each_q_rx(&dev->mt76, i) { napi_enable(&dev->mt76.napi[i]); - napi_schedule(&dev->mt76.napi[i]); } napi_enable(&dev->mt76.tx_napi); + + local_bh_disable(); + mt76_for_each_q_rx(&dev->mt76, i) { + napi_schedule(&dev->mt76.napi[i]); + } napi_schedule(&dev->mt76.tx_napi); local_bh_enable(); diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mac.c b/drivers/net/wireless/mediatek/mt76/mt7996/mac.c index bc8cba4dca47c..019c925ae600e 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/mac.c @@ -1695,7 +1695,6 @@ mt7996_mac_restart(struct mt7996_dev *dev) mt7996_dma_reset(dev, true); - local_bh_disable(); mt76_for_each_q_rx(mdev, i) { if (mtk_wed_device_active(&dev->mt76.mmio.wed) && mt76_queue_is_wed_rro(&mdev->q_rx[i])) @@ -1703,10 +1702,11 @@ mt7996_mac_restart(struct mt7996_dev *dev) if (mdev->q_rx[i].ndesc) { napi_enable(&dev->mt76.napi[i]); + local_bh_disable(); napi_schedule(&dev->mt76.napi[i]); + local_bh_enable(); } } - local_bh_enable(); clear_bit(MT76_MCU_RESET, &dev->mphy.state); clear_bit(MT76_STATE_MCU_RUNNING, &dev->mphy.state); @@ -1764,8 +1764,8 @@ mt7996_mac_restart(struct mt7996_dev *dev) if (phy3) clear_bit(MT76_RESET, &phy3->mt76->state); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); + local_bh_disable(); napi_schedule(&dev->mt76.tx_napi); local_bh_enable(); @@ -1958,23 +1958,23 @@ void mt7996_mac_reset_work(struct work_struct *work) if (phy3) clear_bit(MT76_RESET, &phy3->mt76->state); - local_bh_disable(); mt76_for_each_q_rx(&dev->mt76, i) { if (mtk_wed_device_active(&dev->mt76.mmio.wed) && mt76_queue_is_wed_rro(&dev->mt76.q_rx[i])) continue; napi_enable(&dev->mt76.napi[i]); + local_bh_disable(); napi_schedule(&dev->mt76.napi[i]); + local_bh_enable(); } - local_bh_enable(); tasklet_schedule(&dev->mt76.irq_tasklet); mt76_worker_enable(&dev->mt76.tx_worker); - local_bh_disable(); napi_enable(&dev->mt76.tx_napi); + local_bh_disable(); napi_schedule(&dev->mt76.tx_napi); local_bh_enable(); From 3b1af76604398c12cf6e54a5587cc61566680ca3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Jan 2025 07:55:40 -0800 Subject: [PATCH 40/88] MAINTAINERS: add Paul Fertser as a NC-SI reviewer Paul has been providing very solid reviews for NC-SI changes lately, so much so I started CCing him on all NC-SI patches. Make the designation official. Reviewed-by: Paul Fertser Link: https://patch.msgid.link/20250123155540.943243-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1899ef93e498e..5bcc78c0be70b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16189,6 +16189,7 @@ F: drivers/scsi/sun3_scsi_vme.c NCSI LIBRARY M: Samuel Mendoza-Jonas +R: Paul Fertser S: Maintained F: net/ncsi/ From 6db9d3a536cd638e2ee17cf880e14026cec19d26 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Jan 2025 14:14:10 -0800 Subject: [PATCH 41/88] netdevsim: don't assume core pre-populates HDS params on GET Syzbot reports: BUG: KMSAN: uninit-value in nsim_get_ringparam+0xa8/0xe0 drivers/net/netdevsim/ethtool.c:77 nsim_get_ringparam+0xa8/0xe0 drivers/net/netdevsim/ethtool.c:77 ethtool_set_ringparam+0x268/0x570 net/ethtool/ioctl.c:2072 __dev_ethtool net/ethtool/ioctl.c:3209 [inline] dev_ethtool+0x126d/0x2a40 net/ethtool/ioctl.c:3398 dev_ioctl+0xb0e/0x1280 net/core/dev_ioctl.c:759 This is the SET path, where we call GET to either check user request against max values, or check if any of the settings will change. The logic in netdevsim is trying to report the default (ENABLED) if user has not requested any specific setting. The user setting is recorded in dev->cfg, don't depend on kernel_ringparam being pre-populated with it. Fixes: 928459bbda19 ("net: ethtool: populate the default HDS params in the core") Reported-by: Eric Dumazet Reported-by: syzbot+b3bcd80232d00091e061@syzkaller.appspotmail.com Tested-by: syzbot+b3bcd80232d00091e061@syzkaller.appspotmail.com Link: https://patch.msgid.link/20250123221410.1067678-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c index 3b23f3d3ca2bd..5c80fbee79138 100644 --- a/drivers/net/netdevsim/ethtool.c +++ b/drivers/net/netdevsim/ethtool.c @@ -74,7 +74,7 @@ static void nsim_get_ringparam(struct net_device *dev, memcpy(ring, &ns->ethtool.ring, sizeof(ns->ethtool.ring)); kernel_ring->hds_thresh_max = NSIM_HDS_THRESHOLD_MAX; - if (kernel_ring->tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_UNKNOWN) + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_UNKNOWN) kernel_ring->tcp_data_split = ETHTOOL_TCP_DATA_SPLIT_ENABLED; } From 67e4bb2ced0f2d8fbd414f932daea94ba63ae4c4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Jan 2025 15:16:20 -0800 Subject: [PATCH 42/88] net: page_pool: don't try to stash the napi id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Page ppol tried to cache the NAPI ID in page pool info to avoid having a dependency on the life cycle of the NAPI instance. Since commit under Fixes the NAPI ID is not populated until napi_enable() and there's a good chance that page pool is created before NAPI gets enabled. Protect the NAPI pointer with the existing page pool mutex, the reading path already holds it. napi_id itself we need to READ_ONCE(), it's protected by netdev_lock() which are not holding in page pool. Before this patch napi IDs were missing for mlx5: # ./cli.py --spec netlink/specs/netdev.yaml --dump page-pool-get [{'id': 144, 'ifindex': 2, 'inflight': 3072, 'inflight-mem': 12582912}, {'id': 143, 'ifindex': 2, 'inflight': 5568, 'inflight-mem': 22806528}, {'id': 142, 'ifindex': 2, 'inflight': 5120, 'inflight-mem': 20971520}, {'id': 141, 'ifindex': 2, 'inflight': 4992, 'inflight-mem': 20447232}, ... After: [{'id': 144, 'ifindex': 2, 'inflight': 3072, 'inflight-mem': 12582912, 'napi-id': 565}, {'id': 143, 'ifindex': 2, 'inflight': 4224, 'inflight-mem': 17301504, 'napi-id': 525}, {'id': 142, 'ifindex': 2, 'inflight': 4288, 'inflight-mem': 17563648, 'napi-id': 524}, ... Fixes: 86e25f40aa1e ("net: napi: Add napi_config") Reviewed-by: Mina Almasry Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250123231620.1086401-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 1 - net/core/dev.c | 2 +- net/core/page_pool.c | 2 ++ net/core/page_pool_priv.h | 2 ++ net/core/page_pool_user.c | 15 +++++++++------ 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index ed4cd114180ae..7f405672b089d 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -237,7 +237,6 @@ struct page_pool { struct { struct hlist_node list; u64 detach_time; - u32 napi_id; u32 id; } user; }; diff --git a/net/core/dev.c b/net/core/dev.c index afa2282f26043..07b2bb1ce64ff 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6708,7 +6708,7 @@ void napi_resume_irqs(unsigned int napi_id) static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { - napi->napi_id = napi_id; + WRITE_ONCE(napi->napi_id, napi_id); hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); } diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a3de752c5178d..ed0f89373259c 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -1147,7 +1147,9 @@ void page_pool_disable_direct_recycling(struct page_pool *pool) WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state)); WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1); + mutex_lock(&page_pools_lock); WRITE_ONCE(pool->p.napi, NULL); + mutex_unlock(&page_pools_lock); } EXPORT_SYMBOL(page_pool_disable_direct_recycling); diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h index 57439787b9c2b..2fb06d5f6d559 100644 --- a/net/core/page_pool_priv.h +++ b/net/core/page_pool_priv.h @@ -7,6 +7,8 @@ #include "netmem_priv.h" +extern struct mutex page_pools_lock; + s32 page_pool_inflight(const struct page_pool *pool, bool strict); int page_pool_list(struct page_pool *pool); diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 48335766c1bfd..6677e0c2e2565 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -14,10 +15,11 @@ #include "netdev-genl-gen.h" static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); -/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user. +/* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev, + * pool->user. * Ordering: inside rtnl_lock */ -static DEFINE_MUTEX(page_pools_lock); +DEFINE_MUTEX(page_pools_lock); /* Page pools are only reachable from user space (via netlink) if they are * linked to a netdev at creation time. Following page pool "visibility" @@ -216,6 +218,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, { struct net_devmem_dmabuf_binding *binding = pool->mp_priv; size_t inflight, refsz; + unsigned int napi_id; void *hdr; hdr = genlmsg_iput(rsp, info); @@ -229,8 +232,10 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, pool->slow.netdev->ifindex)) goto err_cancel; - if (pool->user.napi_id && - nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id)) + + napi_id = pool->p.napi ? READ_ONCE(pool->p.napi->napi_id) : 0; + if (napi_id >= MIN_NAPI_ID && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id)) goto err_cancel; inflight = page_pool_inflight(pool, false); @@ -319,8 +324,6 @@ int page_pool_list(struct page_pool *pool) if (pool->slow.netdev) { hlist_add_head(&pool->user.list, &pool->slow.netdev->page_pools); - pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0; - netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF); } From 23b3a7c4a7583eac9e3976355928a832c87caa0f Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Thu, 23 Jan 2025 09:35:42 +0100 Subject: [PATCH 43/88] selftests: mptcp: extend CFLAGS to keep options from environment Package build environments like Fedora rpmbuild introduced hardening options (e.g. -pie -Wl,-z,now) by passing a -spec option to CFLAGS and LDFLAGS. mptcp Makefile currently overrides CFLAGS but not LDFLAGS, which leads to a mismatch and build failure, for example: make[1]: *** [../../lib.mk:222: tools/testing/selftests/net/mptcp/mptcp_sockopt] Error 1 /usr/bin/ld: /tmp/ccqyMVdb.o: relocation R_X86_64_32 against `.rodata.str1.8' can not be used when making a PIE object; recompile with -fPIE /usr/bin/ld: failed to set dynamic section sizes: bad value collect2: error: ld returned 1 exit status Fixes: cc937dad85ae ("selftests: centralize -D_GNU_SOURCE= to CFLAGS in lib.mk") Signed-off-by: Jan Stancek Reviewed-by: Hangbin Liu Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/7abc701da9df39c2d6cd15bc3cf9e6cee445cb96.1737621162.git.jstancek@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index 8e3fc05a53979..c76525fe2b84d 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -2,7 +2,7 @@ top_srcdir = ../../../../.. -CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) +CFLAGS += -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \ simult_flows.sh mptcp_sockopt.sh userspace_pm.sh From 9b06d5b956131bde535f5c045cf8c1ff6bfba76c Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Thu, 23 Jan 2025 13:38:51 +0100 Subject: [PATCH 44/88] selftests: net/{lib,openvswitch}: extend CFLAGS to keep options from environment Package build environments like Fedora rpmbuild introduced hardening options (e.g. -pie -Wl,-z,now) by passing a -spec option to CFLAGS and LDFLAGS. Some Makefiles currently override CFLAGS but not LDFLAGS, which leads to a mismatch and build failure, for example: /usr/bin/ld: /tmp/ccd2apay.o: relocation R_X86_64_32 against `.rodata.str1.1' can not be used when making a PIE object; recompile with -fPIE /usr/bin/ld: failed to set dynamic section sizes: bad value collect2: error: ld returned 1 exit status make[1]: *** [../../lib.mk:222: tools/testing/selftests/net/lib/csum] Error 1 openvswitch/Makefile CFLAGS currently do not appear to be used, but fix it anyway for the case when new tests are introduced in future. Fixes: 1d0dc857b5d8 ("selftests: drv-net: add checksum tests") Signed-off-by: Jan Stancek Acked-by: Matthieu Baerts (NGI0) Reviewed-by: Hangbin Liu Link: https://patch.msgid.link/3d173603ee258f419d0403363765c9f9494ff79a.1737635092.git.jstancek@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/Makefile | 2 +- tools/testing/selftests/net/openvswitch/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile index 18b9443454a9e..bc6b6762baf3e 100644 --- a/tools/testing/selftests/net/lib/Makefile +++ b/tools/testing/selftests/net/lib/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS = -Wall -Wl,--no-as-needed -O2 -g +CFLAGS += -Wall -Wl,--no-as-needed -O2 -g CFLAGS += -I../../../../../usr/include/ $(KHDR_INCLUDES) # Additional include paths needed by kselftest.h CFLAGS += -I../../ diff --git a/tools/testing/selftests/net/openvswitch/Makefile b/tools/testing/selftests/net/openvswitch/Makefile index 2f1508abc826b..3fd1da2ec07d5 100644 --- a/tools/testing/selftests/net/openvswitch/Makefile +++ b/tools/testing/selftests/net/openvswitch/Makefile @@ -2,7 +2,7 @@ top_srcdir = ../../../../.. -CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) +CFLAGS += -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) TEST_PROGS := openvswitch.sh From 79d458c13056559d49b5e41fbc4b6890e68cf65b Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 23 Jan 2025 08:59:12 +0000 Subject: [PATCH 45/88] rxrpc, afs: Fix peer hash locking vs RCU callback In its address list, afs now retains pointers to and refs on one or more rxrpc_peer objects. The address list is freed under RCU and at this time, it puts the refs on those peers. Now, when an rxrpc_peer object runs out of refs, it gets removed from the peer hash table and, for that, rxrpc has to take a spinlock. However, it is now being called from afs's RCU cleanup, which takes place in BH context - but it is just taking an ordinary spinlock. The put may also be called from non-BH context, and so there exists the possibility of deadlock if the BH-based RCU cleanup happens whilst the hash spinlock is held. This led to the attached lockdep complaint. Fix this by changing spinlocks of rxnet->peer_hash_lock back to BH-disabling locks. ================================ WARNING: inconsistent lock state 6.13.0-rc5-build2+ #1223 Tainted: G E -------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. swapper/1/0 [HC0[0]:SC1[1]:HE1:SE0] takes: ffff88810babe228 (&rxnet->peer_hash_lock){+.?.}-{3:3}, at: rxrpc_put_peer+0xcb/0x180 {SOFTIRQ-ON-W} state was registered at: mark_usage+0x164/0x180 __lock_acquire+0x544/0x990 lock_acquire.part.0+0x103/0x280 _raw_spin_lock+0x2f/0x40 rxrpc_peer_keepalive_worker+0x144/0x440 process_one_work+0x486/0x7c0 process_scheduled_works+0x73/0x90 worker_thread+0x1c8/0x2a0 kthread+0x19b/0x1b0 ret_from_fork+0x24/0x40 ret_from_fork_asm+0x1a/0x30 irq event stamp: 972402 hardirqs last enabled at (972402): [] _raw_spin_unlock_irqrestore+0x2e/0x50 hardirqs last disabled at (972401): [] _raw_spin_lock_irqsave+0x18/0x60 softirqs last enabled at (972300): [] handle_softirqs+0x3ee/0x430 softirqs last disabled at (972313): [] __irq_exit_rcu+0x44/0x110 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&rxnet->peer_hash_lock); lock(&rxnet->peer_hash_lock); *** DEADLOCK *** 1 lock held by swapper/1/0: #0: ffffffff83576be0 (rcu_callback){....}-{0:0}, at: rcu_lock_acquire+0x7/0x30 stack backtrace: CPU: 1 UID: 0 PID: 0 Comm: swapper/1 Tainted: G E 6.13.0-rc5-build2+ #1223 Tainted: [E]=UNSIGNED_MODULE Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 Call Trace: dump_stack_lvl+0x57/0x80 print_usage_bug.part.0+0x227/0x240 valid_state+0x53/0x70 mark_lock_irq+0xa5/0x2f0 mark_lock+0xf7/0x170 mark_usage+0xe1/0x180 __lock_acquire+0x544/0x990 lock_acquire.part.0+0x103/0x280 _raw_spin_lock+0x2f/0x40 rxrpc_put_peer+0xcb/0x180 afs_free_addrlist+0x46/0x90 [kafs] rcu_do_batch+0x2d2/0x640 rcu_core+0x2f7/0x350 handle_softirqs+0x1ee/0x430 __irq_exit_rcu+0x44/0x110 irq_exit_rcu+0xa/0x30 sysvec_apic_timer_interrupt+0x7f/0xa0 Fixes: 72904d7b9bfb ("rxrpc, afs: Allow afs to pin rxrpc_peer objects") Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/2095618.1737622752@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/peer_event.c | 16 ++++++++-------- net/rxrpc/peer_object.c | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index d82e44a3901b0..e874c31fa9012 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -246,7 +246,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, bool use; int slot; - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); while (!list_empty(collector)) { peer = list_entry(collector->next, @@ -257,7 +257,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, continue; use = __rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); if (use) { keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; @@ -277,17 +277,17 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, */ slot += cursor; slot &= mask; - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive[slot & mask]); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive); } rxrpc_put_peer(peer, rxrpc_peer_put_keepalive); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); } - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); } /* @@ -317,7 +317,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) * second; the bucket at cursor + 1 goes at now + 1s and so * on... */ - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); list_splice_init(&rxnet->peer_keepalive_new, &collector); stop = cursor + ARRAY_SIZE(rxnet->peer_keepalive); @@ -329,7 +329,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) } base = now; - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxnet->peer_keepalive_base = base; rxnet->peer_keepalive_cursor = cursor; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index e1c63129586b1..0fcc87f0409f9 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -325,10 +325,10 @@ void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) hash_key = rxrpc_peer_hash_key(local, &peer->srx); rxrpc_init_peer(local, peer, hash_key); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); } /* @@ -360,7 +360,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, return NULL; } - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); @@ -373,7 +373,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, &rxnet->peer_keepalive_new); } - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); if (peer) rxrpc_free_peer(candidate); @@ -423,10 +423,10 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) ASSERT(hlist_empty(&peer->error_targets)); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_free_peer(peer); } From 5066293b9b7046a906eff60e3949a887ae185a43 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Thu, 23 Jan 2025 23:57:46 +0900 Subject: [PATCH 46/88] vxlan: Fix uninit-value in vxlan_vnifilter_dump() KMSAN reported an uninit-value access in vxlan_vnifilter_dump() [1]. If the length of the netlink message payload is less than sizeof(struct tunnel_msg), vxlan_vnifilter_dump() accesses bytes beyond the message. This can lead to uninit-value access. Fix this by returning an error in such situations. [1] BUG: KMSAN: uninit-value in vxlan_vnifilter_dump+0x328/0x920 drivers/net/vxlan/vxlan_vnifilter.c:422 vxlan_vnifilter_dump+0x328/0x920 drivers/net/vxlan/vxlan_vnifilter.c:422 rtnl_dumpit+0xd5/0x2f0 net/core/rtnetlink.c:6786 netlink_dump+0x93e/0x15f0 net/netlink/af_netlink.c:2317 __netlink_dump_start+0x716/0xd60 net/netlink/af_netlink.c:2432 netlink_dump_start include/linux/netlink.h:340 [inline] rtnetlink_dump_start net/core/rtnetlink.c:6815 [inline] rtnetlink_rcv_msg+0x1256/0x14a0 net/core/rtnetlink.c:6882 netlink_rcv_skb+0x467/0x660 net/netlink/af_netlink.c:2542 rtnetlink_rcv+0x35/0x40 net/core/rtnetlink.c:6944 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0xed6/0x1290 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x1092/0x1230 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg+0x330/0x3d0 net/socket.c:726 ____sys_sendmsg+0x7f4/0xb50 net/socket.c:2583 ___sys_sendmsg+0x271/0x3b0 net/socket.c:2637 __sys_sendmsg net/socket.c:2669 [inline] __do_sys_sendmsg net/socket.c:2674 [inline] __se_sys_sendmsg net/socket.c:2672 [inline] __x64_sys_sendmsg+0x211/0x3e0 net/socket.c:2672 x64_sys_call+0x3878/0x3d90 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xd9/0x1d0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Uninit was created at: slab_post_alloc_hook mm/slub.c:4110 [inline] slab_alloc_node mm/slub.c:4153 [inline] kmem_cache_alloc_node_noprof+0x800/0xe80 mm/slub.c:4205 kmalloc_reserve+0x13b/0x4b0 net/core/skbuff.c:587 __alloc_skb+0x347/0x7d0 net/core/skbuff.c:678 alloc_skb include/linux/skbuff.h:1323 [inline] netlink_alloc_large_skb+0xa5/0x280 net/netlink/af_netlink.c:1196 netlink_sendmsg+0xac9/0x1230 net/netlink/af_netlink.c:1866 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg+0x330/0x3d0 net/socket.c:726 ____sys_sendmsg+0x7f4/0xb50 net/socket.c:2583 ___sys_sendmsg+0x271/0x3b0 net/socket.c:2637 __sys_sendmsg net/socket.c:2669 [inline] __do_sys_sendmsg net/socket.c:2674 [inline] __se_sys_sendmsg net/socket.c:2672 [inline] __x64_sys_sendmsg+0x211/0x3e0 net/socket.c:2672 x64_sys_call+0x3878/0x3d90 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xd9/0x1d0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f CPU: 0 UID: 0 PID: 30991 Comm: syz.4.10630 Not tainted 6.12.0-10694-gc44daa7e3c73 #29 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-3.fc41 04/01/2014 Fixes: f9c4bb0b245c ("vxlan: vni filtering support on collect metadata device") Reported-by: syzkaller Signed-off-by: Shigeru Yoshida Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250123145746.785768-1-syoshida@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_vnifilter.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index d2023e7131bd4..6e6e9f05509ab 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -411,6 +411,11 @@ static int vxlan_vnifilter_dump(struct sk_buff *skb, struct netlink_callback *cb struct tunnel_msg *tmsg; struct net_device *dev; + if (cb->nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct tunnel_msg))) { + NL_SET_ERR_MSG(cb->extack, "Invalid msg length"); + return -EINVAL; + } + tmsg = nlmsg_data(cb->nlh); if (tmsg->flags & ~TUNNEL_MSG_VALID_USER_FLAGS) { From bd1bbab717608757cccbbe08b0d46e6c3ed0ced5 Mon Sep 17 00:00:00 2001 From: Milos Reljin Date: Fri, 24 Jan 2025 10:41:02 +0000 Subject: [PATCH 47/88] net: phy: c45-tjaxx: add delay between MDIO write and read in soft_reset In application note (AN13663) for TJA1120, on page 30, there's a figure with average PHY startup timing values following software reset. The time it takes for SMI to become operational after software reset ranges roughly from 500 us to 1500 us. This commit adds 2000 us delay after MDIO write which triggers software reset. Without this delay, soft_reset function returns an error and prevents successful PHY init. Cc: stable@vger.kernel.org Fixes: b050f2f15e04 ("phy: nxp-c45: add driver for tja1103") Signed-off-by: Milos Reljin Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/AM8P250MB0124D258E5A71041AF2CC322E1E32@AM8P250MB0124.EURP250.PROD.OUTLOOK.COM Signed-off-by: Jakub Kicinski --- drivers/net/phy/nxp-c45-tja11xx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c index 323717a4821fc..34231b5b91754 100644 --- a/drivers/net/phy/nxp-c45-tja11xx.c +++ b/drivers/net/phy/nxp-c45-tja11xx.c @@ -1297,6 +1297,8 @@ static int nxp_c45_soft_reset(struct phy_device *phydev) if (ret) return ret; + usleep_range(2000, 2050); + return phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND1, VEND1_DEVICE_CONTROL, ret, !(ret & DEVICE_CONTROL_RESET), 20000, From 19e65c45a1507a1a2926649d2db3583ed9d55fd9 Mon Sep 17 00:00:00 2001 From: Chenyuan Yang Date: Thu, 23 Jan 2025 15:42:13 -0600 Subject: [PATCH 48/88] net: davicom: fix UAF in dm9000_drv_remove MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dm is netdev private data and it cannot be used after free_netdev() call. Using dm after free_netdev() can cause UAF bug. Fix it by moving free_netdev() at the end of the function. This is similar to the issue fixed in commit ad297cd2db89 ("net: qcom/emac: fix UAF in emac_remove"). This bug is detected by our static analysis tool. Fixes: cf9e60aa69ae ("net: davicom: Fix regulator not turned off on driver removal") Signed-off-by: Chenyuan Yang CC: Uwe Kleine-König Link: https://patch.msgid.link/20250123214213.623518-1-chenyuan0y@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/davicom/dm9000.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index 8735e333034cf..b87eaf0c250ce 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -1777,10 +1777,11 @@ static void dm9000_drv_remove(struct platform_device *pdev) unregister_netdev(ndev); dm9000_release_board(pdev, dm); - free_netdev(ndev); /* free device structure */ if (dm->power_supply) regulator_disable(dm->power_supply); + free_netdev(ndev); /* free device structure */ + dev_dbg(&pdev->dev, "released and freed device\n"); } From c86b000782daba926c627d2fa00c3f60a75e7472 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 23 Jan 2025 19:05:54 +0100 Subject: [PATCH 49/88] mptcp: consolidate suboption status MPTCP maintains the received sub-options status is the bitmask carrying the received suboptions and in several bitfields carrying per suboption additional info. Zeroing the bitmask before parsing is not enough to ensure a consistent status, and the MPTCP code has to additionally clear some bitfiled depending on the actually parsed suboption. The above schema is fragile, and syzbot managed to trigger a path where a relevant bitfield is not cleared/initialized: BUG: KMSAN: uninit-value in __mptcp_expand_seq net/mptcp/options.c:1030 [inline] BUG: KMSAN: uninit-value in mptcp_expand_seq net/mptcp/protocol.h:864 [inline] BUG: KMSAN: uninit-value in ack_update_msk net/mptcp/options.c:1060 [inline] BUG: KMSAN: uninit-value in mptcp_incoming_options+0x2036/0x3d30 net/mptcp/options.c:1209 __mptcp_expand_seq net/mptcp/options.c:1030 [inline] mptcp_expand_seq net/mptcp/protocol.h:864 [inline] ack_update_msk net/mptcp/options.c:1060 [inline] mptcp_incoming_options+0x2036/0x3d30 net/mptcp/options.c:1209 tcp_data_queue+0xb4/0x7be0 net/ipv4/tcp_input.c:5233 tcp_rcv_established+0x1061/0x2510 net/ipv4/tcp_input.c:6264 tcp_v4_do_rcv+0x7f3/0x11a0 net/ipv4/tcp_ipv4.c:1916 tcp_v4_rcv+0x51df/0x5750 net/ipv4/tcp_ipv4.c:2351 ip_protocol_deliver_rcu+0x2a3/0x13d0 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x336/0x500 net/ipv4/ip_input.c:233 NF_HOOK include/linux/netfilter.h:314 [inline] ip_local_deliver+0x21f/0x490 net/ipv4/ip_input.c:254 dst_input include/net/dst.h:460 [inline] ip_rcv_finish+0x4a2/0x520 net/ipv4/ip_input.c:447 NF_HOOK include/linux/netfilter.h:314 [inline] ip_rcv+0xcd/0x380 net/ipv4/ip_input.c:567 __netif_receive_skb_one_core net/core/dev.c:5704 [inline] __netif_receive_skb+0x319/0xa00 net/core/dev.c:5817 process_backlog+0x4ad/0xa50 net/core/dev.c:6149 __napi_poll+0xe7/0x980 net/core/dev.c:6902 napi_poll net/core/dev.c:6971 [inline] net_rx_action+0xa5a/0x19b0 net/core/dev.c:7093 handle_softirqs+0x1a0/0x7c0 kernel/softirq.c:561 __do_softirq+0x14/0x1a kernel/softirq.c:595 do_softirq+0x9a/0x100 kernel/softirq.c:462 __local_bh_enable_ip+0x9f/0xb0 kernel/softirq.c:389 local_bh_enable include/linux/bottom_half.h:33 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:919 [inline] __dev_queue_xmit+0x2758/0x57d0 net/core/dev.c:4493 dev_queue_xmit include/linux/netdevice.h:3168 [inline] neigh_hh_output include/net/neighbour.h:523 [inline] neigh_output include/net/neighbour.h:537 [inline] ip_finish_output2+0x187c/0x1b70 net/ipv4/ip_output.c:236 __ip_finish_output+0x287/0x810 ip_finish_output+0x4b/0x600 net/ipv4/ip_output.c:324 NF_HOOK_COND include/linux/netfilter.h:303 [inline] ip_output+0x15f/0x3f0 net/ipv4/ip_output.c:434 dst_output include/net/dst.h:450 [inline] ip_local_out net/ipv4/ip_output.c:130 [inline] __ip_queue_xmit+0x1f2a/0x20d0 net/ipv4/ip_output.c:536 ip_queue_xmit+0x60/0x80 net/ipv4/ip_output.c:550 __tcp_transmit_skb+0x3cea/0x4900 net/ipv4/tcp_output.c:1468 tcp_transmit_skb net/ipv4/tcp_output.c:1486 [inline] tcp_write_xmit+0x3b90/0x9070 net/ipv4/tcp_output.c:2829 __tcp_push_pending_frames+0xc4/0x380 net/ipv4/tcp_output.c:3012 tcp_send_fin+0x9f6/0xf50 net/ipv4/tcp_output.c:3618 __tcp_close+0x140c/0x1550 net/ipv4/tcp.c:3130 __mptcp_close_ssk+0x74e/0x16f0 net/mptcp/protocol.c:2496 mptcp_close_ssk+0x26b/0x2c0 net/mptcp/protocol.c:2550 mptcp_pm_nl_rm_addr_or_subflow+0x635/0xd10 net/mptcp/pm_netlink.c:889 mptcp_pm_nl_rm_subflow_received net/mptcp/pm_netlink.c:924 [inline] mptcp_pm_flush_addrs_and_subflows net/mptcp/pm_netlink.c:1688 [inline] mptcp_nl_flush_addrs_list net/mptcp/pm_netlink.c:1709 [inline] mptcp_pm_nl_flush_addrs_doit+0xe10/0x1630 net/mptcp/pm_netlink.c:1750 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x1214/0x12c0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x375/0x650 net/netlink/af_netlink.c:2542 genl_rcv+0x40/0x60 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0xf52/0x1260 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x10da/0x11e0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:726 ____sys_sendmsg+0x877/0xb60 net/socket.c:2583 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2637 __sys_sendmsg net/socket.c:2669 [inline] __do_sys_sendmsg net/socket.c:2674 [inline] __se_sys_sendmsg net/socket.c:2672 [inline] __x64_sys_sendmsg+0x212/0x3c0 net/socket.c:2672 x64_sys_call+0x2ed6/0x3c30 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Uninit was stored to memory at: mptcp_get_options+0x2c0f/0x2f20 net/mptcp/options.c:397 mptcp_incoming_options+0x19a/0x3d30 net/mptcp/options.c:1150 tcp_data_queue+0xb4/0x7be0 net/ipv4/tcp_input.c:5233 tcp_rcv_established+0x1061/0x2510 net/ipv4/tcp_input.c:6264 tcp_v4_do_rcv+0x7f3/0x11a0 net/ipv4/tcp_ipv4.c:1916 tcp_v4_rcv+0x51df/0x5750 net/ipv4/tcp_ipv4.c:2351 ip_protocol_deliver_rcu+0x2a3/0x13d0 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x336/0x500 net/ipv4/ip_input.c:233 NF_HOOK include/linux/netfilter.h:314 [inline] ip_local_deliver+0x21f/0x490 net/ipv4/ip_input.c:254 dst_input include/net/dst.h:460 [inline] ip_rcv_finish+0x4a2/0x520 net/ipv4/ip_input.c:447 NF_HOOK include/linux/netfilter.h:314 [inline] ip_rcv+0xcd/0x380 net/ipv4/ip_input.c:567 __netif_receive_skb_one_core net/core/dev.c:5704 [inline] __netif_receive_skb+0x319/0xa00 net/core/dev.c:5817 process_backlog+0x4ad/0xa50 net/core/dev.c:6149 __napi_poll+0xe7/0x980 net/core/dev.c:6902 napi_poll net/core/dev.c:6971 [inline] net_rx_action+0xa5a/0x19b0 net/core/dev.c:7093 handle_softirqs+0x1a0/0x7c0 kernel/softirq.c:561 __do_softirq+0x14/0x1a kernel/softirq.c:595 Uninit was stored to memory at: put_unaligned_be32 include/linux/unaligned.h:68 [inline] mptcp_write_options+0x17f9/0x3100 net/mptcp/options.c:1417 mptcp_options_write net/ipv4/tcp_output.c:465 [inline] tcp_options_write+0x6d9/0xe90 net/ipv4/tcp_output.c:759 __tcp_transmit_skb+0x294b/0x4900 net/ipv4/tcp_output.c:1414 tcp_transmit_skb net/ipv4/tcp_output.c:1486 [inline] tcp_write_xmit+0x3b90/0x9070 net/ipv4/tcp_output.c:2829 __tcp_push_pending_frames+0xc4/0x380 net/ipv4/tcp_output.c:3012 tcp_send_fin+0x9f6/0xf50 net/ipv4/tcp_output.c:3618 __tcp_close+0x140c/0x1550 net/ipv4/tcp.c:3130 __mptcp_close_ssk+0x74e/0x16f0 net/mptcp/protocol.c:2496 mptcp_close_ssk+0x26b/0x2c0 net/mptcp/protocol.c:2550 mptcp_pm_nl_rm_addr_or_subflow+0x635/0xd10 net/mptcp/pm_netlink.c:889 mptcp_pm_nl_rm_subflow_received net/mptcp/pm_netlink.c:924 [inline] mptcp_pm_flush_addrs_and_subflows net/mptcp/pm_netlink.c:1688 [inline] mptcp_nl_flush_addrs_list net/mptcp/pm_netlink.c:1709 [inline] mptcp_pm_nl_flush_addrs_doit+0xe10/0x1630 net/mptcp/pm_netlink.c:1750 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x1214/0x12c0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x375/0x650 net/netlink/af_netlink.c:2542 genl_rcv+0x40/0x60 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0xf52/0x1260 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x10da/0x11e0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:726 ____sys_sendmsg+0x877/0xb60 net/socket.c:2583 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2637 __sys_sendmsg net/socket.c:2669 [inline] __do_sys_sendmsg net/socket.c:2674 [inline] __se_sys_sendmsg net/socket.c:2672 [inline] __x64_sys_sendmsg+0x212/0x3c0 net/socket.c:2672 x64_sys_call+0x2ed6/0x3c30 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Uninit was stored to memory at: mptcp_pm_add_addr_signal+0x3d7/0x4c0 mptcp_established_options_add_addr net/mptcp/options.c:666 [inline] mptcp_established_options+0x1b9b/0x3a00 net/mptcp/options.c:884 tcp_established_options+0x2c4/0x7d0 net/ipv4/tcp_output.c:1012 __tcp_transmit_skb+0x5b7/0x4900 net/ipv4/tcp_output.c:1333 tcp_transmit_skb net/ipv4/tcp_output.c:1486 [inline] tcp_write_xmit+0x3b90/0x9070 net/ipv4/tcp_output.c:2829 __tcp_push_pending_frames+0xc4/0x380 net/ipv4/tcp_output.c:3012 tcp_send_fin+0x9f6/0xf50 net/ipv4/tcp_output.c:3618 __tcp_close+0x140c/0x1550 net/ipv4/tcp.c:3130 __mptcp_close_ssk+0x74e/0x16f0 net/mptcp/protocol.c:2496 mptcp_close_ssk+0x26b/0x2c0 net/mptcp/protocol.c:2550 mptcp_pm_nl_rm_addr_or_subflow+0x635/0xd10 net/mptcp/pm_netlink.c:889 mptcp_pm_nl_rm_subflow_received net/mptcp/pm_netlink.c:924 [inline] mptcp_pm_flush_addrs_and_subflows net/mptcp/pm_netlink.c:1688 [inline] mptcp_nl_flush_addrs_list net/mptcp/pm_netlink.c:1709 [inline] mptcp_pm_nl_flush_addrs_doit+0xe10/0x1630 net/mptcp/pm_netlink.c:1750 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x1214/0x12c0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x375/0x650 net/netlink/af_netlink.c:2542 genl_rcv+0x40/0x60 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0xf52/0x1260 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x10da/0x11e0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:726 ____sys_sendmsg+0x877/0xb60 net/socket.c:2583 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2637 __sys_sendmsg net/socket.c:2669 [inline] __do_sys_sendmsg net/socket.c:2674 [inline] __se_sys_sendmsg net/socket.c:2672 [inline] __x64_sys_sendmsg+0x212/0x3c0 net/socket.c:2672 x64_sys_call+0x2ed6/0x3c30 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Uninit was stored to memory at: mptcp_pm_add_addr_received+0x95f/0xdd0 net/mptcp/pm.c:235 mptcp_incoming_options+0x2983/0x3d30 net/mptcp/options.c:1169 tcp_data_queue+0xb4/0x7be0 net/ipv4/tcp_input.c:5233 tcp_rcv_state_process+0x2a38/0x49d0 net/ipv4/tcp_input.c:6972 tcp_v4_do_rcv+0xbf9/0x11a0 net/ipv4/tcp_ipv4.c:1939 tcp_v4_rcv+0x51df/0x5750 net/ipv4/tcp_ipv4.c:2351 ip_protocol_deliver_rcu+0x2a3/0x13d0 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x336/0x500 net/ipv4/ip_input.c:233 NF_HOOK include/linux/netfilter.h:314 [inline] ip_local_deliver+0x21f/0x490 net/ipv4/ip_input.c:254 dst_input include/net/dst.h:460 [inline] ip_rcv_finish+0x4a2/0x520 net/ipv4/ip_input.c:447 NF_HOOK include/linux/netfilter.h:314 [inline] ip_rcv+0xcd/0x380 net/ipv4/ip_input.c:567 __netif_receive_skb_one_core net/core/dev.c:5704 [inline] __netif_receive_skb+0x319/0xa00 net/core/dev.c:5817 process_backlog+0x4ad/0xa50 net/core/dev.c:6149 __napi_poll+0xe7/0x980 net/core/dev.c:6902 napi_poll net/core/dev.c:6971 [inline] net_rx_action+0xa5a/0x19b0 net/core/dev.c:7093 handle_softirqs+0x1a0/0x7c0 kernel/softirq.c:561 __do_softirq+0x14/0x1a kernel/softirq.c:595 Local variable mp_opt created at: mptcp_incoming_options+0x119/0x3d30 net/mptcp/options.c:1127 tcp_data_queue+0xb4/0x7be0 net/ipv4/tcp_input.c:5233 The current schema is too fragile; address the issue grouping all the state-related data together and clearing the whole group instead of just the bitmask. This also cleans-up the code a bit, as there is no need to individually clear "random" bitfield in a couple of places any more. Fixes: 84dfe3677a6f ("mptcp: send out dedicated ADD_ADDR packet") Cc: stable@vger.kernel.org Reported-by: syzbot+23728c2df58b3bd175ad@syzkaller.appspotmail.com Closes: https://lore.kernel.org/6786ac51.050a0220.216c54.00a7.GAE@google.com Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/541 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250123-net-mptcp-syzbot-issues-v1-1-af73258a726f@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 13 +++++-------- net/mptcp/protocol.h | 30 ++++++++++++++++-------------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 123f3f2972841..fd2de185bc939 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -108,7 +108,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->suboptions |= OPTION_MPTCP_DSS; mp_opt->use_map = 1; mp_opt->mpc_map = 1; - mp_opt->use_ack = 0; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } @@ -157,11 +156,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, pr_debug("DSS\n"); ptr++; - /* we must clear 'mpc_map' be able to detect MP_CAPABLE - * map vs DSS map in mptcp_incoming_options(), and reconstruct - * map info accordingly - */ - mp_opt->mpc_map = 0; flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; @@ -369,8 +363,11 @@ void mptcp_get_options(const struct sk_buff *skb, const unsigned char *ptr; int length; - /* initialize option status */ - mp_opt->suboptions = 0; + /* Ensure that casting the whole status to u32 is efficient and safe */ + BUILD_BUG_ON(sizeof_field(struct mptcp_options_received, status) != sizeof(u32)); + BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_options_received, status), + sizeof(u32))); + *(u32 *)&mp_opt->status = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0174a5aad2796..f6a207958459d 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -149,22 +149,24 @@ struct mptcp_options_received { u32 subflow_seq; u16 data_len; __sum16 csum; - u16 suboptions; + struct_group(status, + u16 suboptions; + u16 use_map:1, + dsn64:1, + data_fin:1, + use_ack:1, + ack64:1, + mpc_map:1, + reset_reason:4, + reset_transient:1, + echo:1, + backup:1, + deny_join_id0:1, + __unused:2; + ); + u8 join_id; u32 token; u32 nonce; - u16 use_map:1, - dsn64:1, - data_fin:1, - use_ack:1, - ack64:1, - mpc_map:1, - reset_reason:4, - reset_transient:1, - echo:1, - backup:1, - deny_join_id0:1, - __unused:2; - u8 join_id; u64 thmac; u8 hmac[MPTCPOPT_HMAC_LEN]; struct mptcp_addr_info addr; From 1bb0d1348546ad059f55c93def34e67cb2a034a6 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 23 Jan 2025 19:05:55 +0100 Subject: [PATCH 50/88] mptcp: pm: only set fullmesh for subflow endp With the in-kernel path-manager, it is possible to change the 'fullmesh' flag. The code in mptcp_pm_nl_fullmesh() expects to change it only on 'subflow' endpoints, to recreate more or less subflows using the linked address. Unfortunately, the set_flags() hook was a bit more permissive, and allowed 'implicit' endpoints to get the 'fullmesh' flag while it is not allowed before. That's what syzbot found, triggering the following warning: WARNING: CPU: 0 PID: 6499 at net/mptcp/pm_netlink.c:1496 __mark_subflow_endp_available net/mptcp/pm_netlink.c:1496 [inline] WARNING: CPU: 0 PID: 6499 at net/mptcp/pm_netlink.c:1496 mptcp_pm_nl_fullmesh net/mptcp/pm_netlink.c:1980 [inline] WARNING: CPU: 0 PID: 6499 at net/mptcp/pm_netlink.c:1496 mptcp_nl_set_flags net/mptcp/pm_netlink.c:2003 [inline] WARNING: CPU: 0 PID: 6499 at net/mptcp/pm_netlink.c:1496 mptcp_pm_nl_set_flags+0x974/0xdc0 net/mptcp/pm_netlink.c:2064 Modules linked in: CPU: 0 UID: 0 PID: 6499 Comm: syz.1.413 Not tainted 6.13.0-rc5-syzkaller-00172-gd1bf27c4e176 #0 Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:__mark_subflow_endp_available net/mptcp/pm_netlink.c:1496 [inline] RIP: 0010:mptcp_pm_nl_fullmesh net/mptcp/pm_netlink.c:1980 [inline] RIP: 0010:mptcp_nl_set_flags net/mptcp/pm_netlink.c:2003 [inline] RIP: 0010:mptcp_pm_nl_set_flags+0x974/0xdc0 net/mptcp/pm_netlink.c:2064 Code: 01 00 00 49 89 c5 e8 fb 45 e8 f5 e9 b8 fc ff ff e8 f1 45 e8 f5 4c 89 f7 be 03 00 00 00 e8 44 1d 0b f9 eb a0 e8 dd 45 e8 f5 90 <0f> 0b 90 e9 17 ff ff ff 89 d9 80 e1 07 38 c1 0f 8c c9 fc ff ff 48 RSP: 0018:ffffc9000d307240 EFLAGS: 00010293 RAX: ffffffff8bb72e03 RBX: 0000000000000000 RCX: ffff88807da88000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffc9000d307430 R08: ffffffff8bb72cf0 R09: 1ffff1100b842a5e R10: dffffc0000000000 R11: ffffed100b842a5f R12: ffff88801e2e5ac0 R13: ffff88805c214800 R14: ffff88805c2152e8 R15: 1ffff1100b842a5d FS: 00005555619f6500(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020002840 CR3: 00000000247e6000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0xb14/0xec0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2542 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x8e4/0xcb0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:726 ____sys_sendmsg+0x52a/0x7e0 net/socket.c:2583 ___sys_sendmsg net/socket.c:2637 [inline] __sys_sendmsg+0x269/0x350 net/socket.c:2669 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5fe8785d29 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff571f5558 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f5fe8975fa0 RCX: 00007f5fe8785d29 RDX: 0000000000000000 RSI: 0000000020000480 RDI: 0000000000000007 RBP: 00007f5fe8801b08 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f5fe8975fa0 R14: 00007f5fe8975fa0 R15: 00000000000011f4 Here, syzbot managed to set the 'fullmesh' flag on an 'implicit' and used -- according to 'id_avail_bitmap' -- endpoint, causing the PM to try decrement the local_addr_used counter which is only incremented for the 'subflow' endpoint. Note that 'no type' endpoints -- not 'subflow', 'signal', 'implicit' -- are fine, because their ID will not be marked as used in the 'id_avail' bitmap, and setting 'fullmesh' can help forcing the creation of subflow when receiving an ADD_ADDR. Fixes: 73c762c1f07d ("mptcp: set fullmesh flag in pm_netlink") Cc: stable@vger.kernel.org Reported-by: syzbot+cd16e79c1e45f3fe0377@syzkaller.appspotmail.com Closes: https://lore.kernel.org/6786ac51.050a0220.216c54.00a6.GAE@google.com Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/540 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250123-net-mptcp-syzbot-issues-v1-2-af73258a726f@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 98ac73938bd81..572d160edca33 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -2020,7 +2020,8 @@ int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && - (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | + MPTCP_PM_ADDR_FLAG_IMPLICIT))) { spin_unlock_bh(&pernet->lock); GENL_SET_ERR_MSG(info, "invalid addr flags"); return -EINVAL; From 619af16b3b57a3a4ee50b9a30add9ff155541e71 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 23 Jan 2025 19:05:56 +0100 Subject: [PATCH 51/88] mptcp: handle fastopen disconnect correctly Syzbot was able to trigger a data stream corruption: WARNING: CPU: 0 PID: 9846 at net/mptcp/protocol.c:1024 __mptcp_clean_una+0xddb/0xff0 net/mptcp/protocol.c:1024 Modules linked in: CPU: 0 UID: 0 PID: 9846 Comm: syz-executor351 Not tainted 6.13.0-rc2-syzkaller-00059-g00a5acdbf398 #0 Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 11/25/2024 RIP: 0010:__mptcp_clean_una+0xddb/0xff0 net/mptcp/protocol.c:1024 Code: fa ff ff 48 8b 4c 24 18 80 e1 07 fe c1 38 c1 0f 8c 8e fa ff ff 48 8b 7c 24 18 e8 e0 db 54 f6 e9 7f fa ff ff e8 e6 80 ee f5 90 <0f> 0b 90 4c 8b 6c 24 40 4d 89 f4 e9 04 f5 ff ff 44 89 f1 80 e1 07 RSP: 0018:ffffc9000c0cf400 EFLAGS: 00010293 RAX: ffffffff8bb0dd5a RBX: ffff888033f5d230 RCX: ffff888059ce8000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffc9000c0cf518 R08: ffffffff8bb0d1dd R09: 1ffff110170c8928 R10: dffffc0000000000 R11: ffffed10170c8929 R12: 0000000000000000 R13: ffff888033f5d220 R14: dffffc0000000000 R15: ffff8880592b8000 FS: 00007f6e866496c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f6e86f491a0 CR3: 00000000310e6000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __mptcp_clean_una_wakeup+0x7f/0x2d0 net/mptcp/protocol.c:1074 mptcp_release_cb+0x7cb/0xb30 net/mptcp/protocol.c:3493 release_sock+0x1aa/0x1f0 net/core/sock.c:3640 inet_wait_for_connect net/ipv4/af_inet.c:609 [inline] __inet_stream_connect+0x8bd/0xf30 net/ipv4/af_inet.c:703 mptcp_sendmsg_fastopen+0x2a2/0x530 net/mptcp/protocol.c:1755 mptcp_sendmsg+0x1884/0x1b10 net/mptcp/protocol.c:1830 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg+0x1a6/0x270 net/socket.c:726 ____sys_sendmsg+0x52a/0x7e0 net/socket.c:2583 ___sys_sendmsg net/socket.c:2637 [inline] __sys_sendmsg+0x269/0x350 net/socket.c:2669 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f6e86ebfe69 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 b1 1f 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f6e86649168 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f6e86f491b8 RCX: 00007f6e86ebfe69 RDX: 0000000030004001 RSI: 0000000020000080 RDI: 0000000000000003 RBP: 00007f6e86f491b0 R08: 00007f6e866496c0 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f6e86f491bc R13: 000000000000006e R14: 00007ffe445d9420 R15: 00007ffe445d9508 The root cause is the bad handling of disconnect() generated internally by the MPTCP protocol in case of connect FASTOPEN errors. Address the issue increasing the socket disconnect counter even on such a case, to allow other threads waiting on the same socket lock to properly error out. Fixes: c2b2ae3925b6 ("mptcp: handle correctly disconnect() failures") Cc: stable@vger.kernel.org Reported-by: syzbot+ebc0b8ae5d3590b2c074@syzkaller.appspotmail.com Closes: https://lore.kernel.org/67605870.050a0220.37aaf.0137.GAE@google.com Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/537 Tested-by: syzbot+ebc0b8ae5d3590b2c074@syzkaller.appspotmail.com Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250123-net-mptcp-syzbot-issues-v1-3-af73258a726f@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c44c89ecaca65..6bd8190474706 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1767,8 +1767,10 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, * see mptcp_disconnect(). * Attempt it again outside the problematic scope. */ - if (!mptcp_disconnect(sk, 0)) + if (!mptcp_disconnect(sk, 0)) { + sk->sk_disconnects++; sk->sk_socket->state = SS_UNCONNECTED; + } } inet_clear_bit(DEFER_CONNECT, sk); From 90b7f2961798793275b4844348619b622f983907 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Fri, 24 Jan 2025 01:30:20 -0800 Subject: [PATCH 52/88] net: usb: rtl8150: enable basic endpoint checking Syzkaller reports [1] encountering a common issue of utilizing a wrong usb endpoint type during URB submitting stage. This, in turn, triggers a warning shown below. For now, enable simple endpoint checking (specifically, bulk and interrupt eps, testing control one is not essential) to mitigate the issue with a view to do other related cosmetic changes later, if they are necessary. [1] Syzkaller report: usb 1-1: BOGUS urb xfer, pipe 3 != type 1 WARNING: CPU: 1 PID: 2586 at drivers/usb/core/urb.c:503 usb_submit_urb+0xe4b/0x1730 driv> Modules linked in: CPU: 1 UID: 0 PID: 2586 Comm: dhcpcd Not tainted 6.11.0-rc4-syzkaller-00069-gfc88bb11617> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 RIP: 0010:usb_submit_urb+0xe4b/0x1730 drivers/usb/core/urb.c:503 Code: 84 3c 02 00 00 e8 05 e4 fc fc 4c 89 ef e8 fd 25 d7 fe 45 89 e0 89 e9 4c 89 f2 48 8> RSP: 0018:ffffc9000441f740 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff888112487a00 RCX: ffffffff811a99a9 RDX: ffff88810df6ba80 RSI: ffffffff811a99b6 RDI: 0000000000000001 RBP: 0000000000000003 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000001 R13: ffff8881023bf0a8 R14: ffff888112452a20 R15: ffff888112487a7c FS: 00007fc04eea5740(0000) GS:ffff8881f6300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f0a1de9f870 CR3: 000000010dbd0000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rtl8150_open+0x300/0xe30 drivers/net/usb/rtl8150.c:733 __dev_open+0x2d4/0x4e0 net/core/dev.c:1474 __dev_change_flags+0x561/0x720 net/core/dev.c:8838 dev_change_flags+0x8f/0x160 net/core/dev.c:8910 devinet_ioctl+0x127a/0x1f10 net/ipv4/devinet.c:1177 inet_ioctl+0x3aa/0x3f0 net/ipv4/af_inet.c:1003 sock_do_ioctl+0x116/0x280 net/socket.c:1222 sock_ioctl+0x22e/0x6c0 net/socket.c:1341 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl fs/ioctl.c:893 [inline] __x64_sys_ioctl+0x193/0x220 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fc04ef73d49 ... This change has not been tested on real hardware. Reported-and-tested-by: syzbot+d7e968426f644b567e31@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d7e968426f644b567e31 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Nikita Zhandarovich Link: https://patch.msgid.link/20250124093020.234642-1-n.zhandarovich@fintech.ru Signed-off-by: Paolo Abeni --- drivers/net/usb/rtl8150.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/net/usb/rtl8150.c b/drivers/net/usb/rtl8150.c index 01a3b2417a540..ddff6f19ff98e 100644 --- a/drivers/net/usb/rtl8150.c +++ b/drivers/net/usb/rtl8150.c @@ -71,6 +71,14 @@ #define MSR_SPEED (1<<3) #define MSR_LINK (1<<2) +/* USB endpoints */ +enum rtl8150_usb_ep { + RTL8150_USB_EP_CONTROL = 0, + RTL8150_USB_EP_BULK_IN = 1, + RTL8150_USB_EP_BULK_OUT = 2, + RTL8150_USB_EP_INT_IN = 3, +}; + /* Interrupt pipe data */ #define INT_TSR 0x00 #define INT_RSR 0x01 @@ -867,6 +875,13 @@ static int rtl8150_probe(struct usb_interface *intf, struct usb_device *udev = interface_to_usbdev(intf); rtl8150_t *dev; struct net_device *netdev; + static const u8 bulk_ep_addr[] = { + RTL8150_USB_EP_BULK_IN | USB_DIR_IN, + RTL8150_USB_EP_BULK_OUT | USB_DIR_OUT, + 0}; + static const u8 int_ep_addr[] = { + RTL8150_USB_EP_INT_IN | USB_DIR_IN, + 0}; netdev = alloc_etherdev(sizeof(rtl8150_t)); if (!netdev) @@ -880,6 +895,13 @@ static int rtl8150_probe(struct usb_interface *intf, return -ENOMEM; } + /* Verify that all required endpoints are present */ + if (!usb_check_bulk_endpoints(intf, bulk_ep_addr) || + !usb_check_int_endpoints(intf, int_ep_addr)) { + dev_err(&intf->dev, "couldn't find required endpoints\n"); + goto out; + } + tasklet_setup(&dev->tl, rx_fixup); spin_lock_init(&dev->rx_pool_lock); From 19ae40f572a9ce1ade9954990af709a03fd37010 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 25 Jan 2025 10:28:38 +0100 Subject: [PATCH 53/88] ptp: Properly handle compat ioctls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pointer arguments passed to ioctls need to pass through compat_ptr() to work correctly on s390; as explained in Documentation/driver-api/ioctl.rst. Detect compat mode at runtime and call compat_ptr() for those commands which do take pointer arguments. Suggested-by: Arnd Bergmann Link: https://lore.kernel.org/lkml/1ba5d3a4-7931-455b-a3ce-85a968a7cb10@app.fastmail.com/ Fixes: d94ba80ebbea ("ptp: Added a brand new class driver for ptp clocks.") Signed-off-by: Thomas Weißschuh Reviewed-by: Cyrill Gorcunov Reviewed-by: Arnd Bergmann Acked-by: Richard Cochran Link: https://patch.msgid.link/20250125-posix-clock-compat_ioctl-v2-1-11c865c500eb@weissschuh.net Signed-off-by: Paolo Abeni --- drivers/ptp/ptp_chardev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index ea96a14d72d14..bf6468c56419c 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -4,6 +4,7 @@ * * Copyright (C) 2010 OMICRON electronics GmbH */ +#include #include #include #include @@ -176,6 +177,9 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, struct timespec64 ts; int enable, err = 0; + if (in_compat_syscall() && cmd != PTP_ENABLE_PPS && cmd != PTP_ENABLE_PPS2) + arg = (unsigned long)compat_ptr(arg); + tsevq = pccontext->private_clkdata; switch (cmd) { From c219427ed296f94bb4b91d08626776dc7719ee27 Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Sun, 26 Jan 2025 00:54:03 +0100 Subject: [PATCH 54/88] usbnet: ipheth: fix possible overflow in DPE length check Originally, it was possible for the DPE length check to overflow if wDatagramIndex + wDatagramLength > U16_MAX. This could lead to an OoB read. Move the wDatagramIndex term to the other side of the inequality. An existing condition ensures that wDatagramIndex < urb->actual_length. Fixes: a2d274c62e44 ("usbnet: ipheth: add CDC NCM support") Cc: stable@vger.kernel.org Signed-off-by: Foster Snowhill Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- drivers/net/usb/ipheth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 46afb95ffabe3..45daae234cb85 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -243,8 +243,8 @@ static int ipheth_rcvbulk_callback_ncm(struct urb *urb) while (le16_to_cpu(dpe->wDatagramIndex) != 0 && le16_to_cpu(dpe->wDatagramLength) != 0) { if (le16_to_cpu(dpe->wDatagramIndex) >= urb->actual_length || - le16_to_cpu(dpe->wDatagramIndex) + - le16_to_cpu(dpe->wDatagramLength) > urb->actual_length) { + le16_to_cpu(dpe->wDatagramLength) > urb->actual_length - + le16_to_cpu(dpe->wDatagramIndex)) { dev->net->stats.rx_length_errors++; return retval; } From 429fa68b58cefb9aa9de27e4089637298b46b757 Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Sun, 26 Jan 2025 00:54:04 +0100 Subject: [PATCH 55/88] usbnet: ipheth: check that DPE points past NCM header By definition, a DPE points at the start of a network frame/datagram. Thus it makes no sense for it to point at anything that's part of the NCM header. It is not a security issue, but merely an indication of a malformed DPE. Enforce that all DPEs point at the data portion of the URB, past the NCM header. Fixes: a2d274c62e44 ("usbnet: ipheth: add CDC NCM support") Cc: stable@vger.kernel.org Signed-off-by: Foster Snowhill Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- drivers/net/usb/ipheth.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 45daae234cb85..1ff5f7076ad53 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -242,7 +242,8 @@ static int ipheth_rcvbulk_callback_ncm(struct urb *urb) dpe = ncm0->dpe16; while (le16_to_cpu(dpe->wDatagramIndex) != 0 && le16_to_cpu(dpe->wDatagramLength) != 0) { - if (le16_to_cpu(dpe->wDatagramIndex) >= urb->actual_length || + if (le16_to_cpu(dpe->wDatagramIndex) < IPHETH_NCM_HEADER_SIZE || + le16_to_cpu(dpe->wDatagramIndex) >= urb->actual_length || le16_to_cpu(dpe->wDatagramLength) > urb->actual_length - le16_to_cpu(dpe->wDatagramIndex)) { dev->net->stats.rx_length_errors++; From 86586dcb75cb8fd062a518aca8ee667938b91efb Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Sun, 26 Jan 2025 00:54:05 +0100 Subject: [PATCH 56/88] usbnet: ipheth: use static NDP16 location in URB Original code allowed for the start of NDP16 to be anywhere within the URB based on the `wNdpIndex` value in NTH16. Only the start position of NDP16 was checked, so it was possible for even the fixed-length part of NDP16 to extend past the end of URB, leading to an out-of-bounds read. On iOS devices, the NDP16 header always directly follows NTH16. Rely on and check for this specific format. This, along with NCM-specific minimal URB length check that already exists, will ensure that the fixed-length part of NDP16 plus a set amount of DPEs fit within the URB. Note that this commit alone does not fully address the OoB read. The limit on the amount of DPEs needs to be enforced separately. Fixes: a2d274c62e44 ("usbnet: ipheth: add CDC NCM support") Cc: stable@vger.kernel.org Signed-off-by: Foster Snowhill Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- drivers/net/usb/ipheth.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 1ff5f7076ad53..c385623596d2f 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -226,15 +226,14 @@ static int ipheth_rcvbulk_callback_ncm(struct urb *urb) ncmh = urb->transfer_buffer; if (ncmh->dwSignature != cpu_to_le32(USB_CDC_NCM_NTH16_SIGN) || - le16_to_cpu(ncmh->wNdpIndex) >= urb->actual_length) { + /* On iOS, NDP16 directly follows NTH16 */ + ncmh->wNdpIndex != cpu_to_le16(sizeof(struct usb_cdc_ncm_nth16))) { dev->net->stats.rx_errors++; return retval; } - ncm0 = urb->transfer_buffer + le16_to_cpu(ncmh->wNdpIndex); - if (ncm0->dwSignature != cpu_to_le32(USB_CDC_NCM_NDP16_NOCRC_SIGN) || - le16_to_cpu(ncmh->wHeaderLength) + le16_to_cpu(ncm0->wLength) >= - urb->actual_length) { + ncm0 = urb->transfer_buffer + sizeof(struct usb_cdc_ncm_nth16); + if (ncm0->dwSignature != cpu_to_le32(USB_CDC_NCM_NDP16_NOCRC_SIGN)) { dev->net->stats.rx_errors++; return retval; } From 2a9a196429e98fcc64078366c2679bc40aba5466 Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Sun, 26 Jan 2025 00:54:06 +0100 Subject: [PATCH 57/88] usbnet: ipheth: refactor NCM datagram loop Introduce an rx_error label to reduce repetitions in the header signature checks. Store wDatagramIndex and wDatagramLength after endianness conversion to avoid repeated le16_to_cpu() calls. Rewrite the loop to return on a null trailing DPE, which is required by the CDC NCM spec. In case it is missing, fall through to rx_error. This change does not fix any particular issue. Its purpose is to simplify a subsequent commit that fixes a potential OoB read by limiting the maximum amount of processed DPEs. Cc: stable@vger.kernel.org # 6.5.x Signed-off-by: Foster Snowhill Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- drivers/net/usb/ipheth.c | 42 ++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index c385623596d2f..069979e2bb6ed 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -213,9 +213,9 @@ static int ipheth_rcvbulk_callback_ncm(struct urb *urb) struct usb_cdc_ncm_ndp16 *ncm0; struct usb_cdc_ncm_dpe16 *dpe; struct ipheth_device *dev; + u16 dg_idx, dg_len; int retval = -EINVAL; char *buf; - int len; dev = urb->context; @@ -227,39 +227,43 @@ static int ipheth_rcvbulk_callback_ncm(struct urb *urb) ncmh = urb->transfer_buffer; if (ncmh->dwSignature != cpu_to_le32(USB_CDC_NCM_NTH16_SIGN) || /* On iOS, NDP16 directly follows NTH16 */ - ncmh->wNdpIndex != cpu_to_le16(sizeof(struct usb_cdc_ncm_nth16))) { - dev->net->stats.rx_errors++; - return retval; - } + ncmh->wNdpIndex != cpu_to_le16(sizeof(struct usb_cdc_ncm_nth16))) + goto rx_error; ncm0 = urb->transfer_buffer + sizeof(struct usb_cdc_ncm_nth16); - if (ncm0->dwSignature != cpu_to_le32(USB_CDC_NCM_NDP16_NOCRC_SIGN)) { - dev->net->stats.rx_errors++; - return retval; - } + if (ncm0->dwSignature != cpu_to_le32(USB_CDC_NCM_NDP16_NOCRC_SIGN)) + goto rx_error; dpe = ncm0->dpe16; - while (le16_to_cpu(dpe->wDatagramIndex) != 0 && - le16_to_cpu(dpe->wDatagramLength) != 0) { - if (le16_to_cpu(dpe->wDatagramIndex) < IPHETH_NCM_HEADER_SIZE || - le16_to_cpu(dpe->wDatagramIndex) >= urb->actual_length || - le16_to_cpu(dpe->wDatagramLength) > urb->actual_length - - le16_to_cpu(dpe->wDatagramIndex)) { + while (true) { + dg_idx = le16_to_cpu(dpe->wDatagramIndex); + dg_len = le16_to_cpu(dpe->wDatagramLength); + + /* Null DPE must be present after last datagram pointer entry + * (3.3.1 USB CDC NCM spec v1.0) + */ + if (dg_idx == 0 && dg_len == 0) + return 0; + + if (dg_idx < IPHETH_NCM_HEADER_SIZE || + dg_idx >= urb->actual_length || + dg_len > urb->actual_length - dg_idx) { dev->net->stats.rx_length_errors++; return retval; } - buf = urb->transfer_buffer + le16_to_cpu(dpe->wDatagramIndex); - len = le16_to_cpu(dpe->wDatagramLength); + buf = urb->transfer_buffer + dg_idx; - retval = ipheth_consume_skb(buf, len, dev); + retval = ipheth_consume_skb(buf, dg_len, dev); if (retval != 0) return retval; dpe++; } - return 0; +rx_error: + dev->net->stats.rx_errors++; + return retval; } static void ipheth_rcvbulk_callback(struct urb *urb) From efcbc678a14be268040ffc1fa33c98faf2d55141 Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Sun, 26 Jan 2025 00:54:07 +0100 Subject: [PATCH 58/88] usbnet: ipheth: break up NCM header size computation Originally, the total NCM header size was computed as the sum of two vaguely labelled constants. While accurate, it wasn't particularly clear where they were coming from. Use sizes of existing NCM structs where available. Define the total NDP16 size based on the maximum amount of DPEs that can fit into the iOS-specific fixed-size header. This change does not fix any particular issue. Rather, it introduces intermediate constants that will simplify subsequent commits. It should also make it clearer for the reader where the constant values come from. Cc: stable@vger.kernel.org # 6.5.x Signed-off-by: Foster Snowhill Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- drivers/net/usb/ipheth.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 069979e2bb6ed..03249208612e7 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -61,7 +61,18 @@ #define IPHETH_USBINTF_PROTO 1 #define IPHETH_IP_ALIGN 2 /* padding at front of URB */ -#define IPHETH_NCM_HEADER_SIZE (12 + 96) /* NCMH + NCM0 */ +/* On iOS devices, NCM headers in RX have a fixed size regardless of DPE count: + * - NTH16 (NCMH): 12 bytes, as per CDC NCM 1.0 spec + * - NDP16 (NCM0): 96 bytes, of which + * - NDP16 fixed header: 8 bytes + * - maximum of 22 DPEs (21 datagrams + trailer), 4 bytes each + */ +#define IPHETH_NDP16_MAX_DPE 22 +#define IPHETH_NDP16_HEADER_SIZE (sizeof(struct usb_cdc_ncm_ndp16) + \ + IPHETH_NDP16_MAX_DPE * \ + sizeof(struct usb_cdc_ncm_dpe16)) +#define IPHETH_NCM_HEADER_SIZE (sizeof(struct usb_cdc_ncm_nth16) + \ + IPHETH_NDP16_HEADER_SIZE) #define IPHETH_TX_BUF_SIZE ETH_FRAME_LEN #define IPHETH_RX_BUF_SIZE_LEGACY (IPHETH_IP_ALIGN + ETH_FRAME_LEN) #define IPHETH_RX_BUF_SIZE_NCM 65536 From ee591f2b281721171896117f9946fced31441418 Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Sun, 26 Jan 2025 00:54:08 +0100 Subject: [PATCH 59/88] usbnet: ipheth: fix DPE OoB read Fix an out-of-bounds DPE read, limit the number of processed DPEs to the amount that fits into the fixed-size NDP16 header. Fixes: a2d274c62e44 ("usbnet: ipheth: add CDC NCM support") Cc: stable@vger.kernel.org Signed-off-by: Foster Snowhill Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- drivers/net/usb/ipheth.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 03249208612e7..5347cd7e295b3 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -246,7 +246,7 @@ static int ipheth_rcvbulk_callback_ncm(struct urb *urb) goto rx_error; dpe = ncm0->dpe16; - while (true) { + for (int dpe_i = 0; dpe_i < IPHETH_NDP16_MAX_DPE; ++dpe_i, ++dpe) { dg_idx = le16_to_cpu(dpe->wDatagramIndex); dg_len = le16_to_cpu(dpe->wDatagramLength); @@ -268,8 +268,6 @@ static int ipheth_rcvbulk_callback_ncm(struct urb *urb) retval = ipheth_consume_skb(buf, dg_len, dev); if (retval != 0) return retval; - - dpe++; } rx_error: From be154b598fa54136e2be17d6dd13c8a8bc0078ce Mon Sep 17 00:00:00 2001 From: Foster Snowhill Date: Sun, 26 Jan 2025 00:54:09 +0100 Subject: [PATCH 60/88] usbnet: ipheth: document scope of NCM implementation Clarify that the "NCM" implementation in `ipheth` is very limited, as iOS devices aren't compatible with the CDC NCM specification in regular tethering mode. For a standards-compliant implementation, one shall turn to the `cdc_ncm` module. Cc: stable@vger.kernel.org # 6.5.x Signed-off-by: Foster Snowhill Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- drivers/net/usb/ipheth.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 5347cd7e295b3..a19789b571905 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -218,6 +218,14 @@ static int ipheth_rcvbulk_callback_legacy(struct urb *urb) return ipheth_consume_skb(buf, len, dev); } +/* In "NCM mode", the iOS device encapsulates RX (phone->computer) traffic + * in NCM Transfer Blocks (similarly to CDC NCM). However, unlike reverse + * tethering (handled by the `cdc_ncm` driver), regular tethering is not + * compliant with the CDC NCM spec, as the device is missing the necessary + * descriptors, and TX (computer->phone) traffic is not encapsulated + * at all. Thus `ipheth` implements a very limited subset of the spec with + * the sole purpose of parsing RX URBs. + */ static int ipheth_rcvbulk_callback_ncm(struct urb *urb) { struct usb_cdc_ncm_nth16 *ncmh; From 4f5a52adeb1ad675ca33f1e1eacd9c0bbaf393d4 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 26 Jan 2025 21:18:45 +0200 Subject: [PATCH 61/88] ethtool: Fix set RXNFC command with symmetric RSS hash The sanity check that both source and destination are set when symmetric RSS hash is requested is only relevant for ETHTOOL_SRXFH (rx-flow-hash), it should not be performed on any other commands (e.g. ETHTOOL_SRXCLSRLINS/ETHTOOL_SRXCLSRLDEL). This resolves accessing uninitialized 'info.data' field, and fixes false errors in rule insertion: # ethtool --config-ntuple eth2 flow-type ip4 dst-ip 255.255.255.255 action -1 loc 0 rmgr: Cannot insert RX class rule: Invalid argument Cannot insert classification rule Fixes: 13e59344fb9d ("net: ethtool: add support for symmetric-xor RSS hash") Cc: Ahmed Zaki Reviewed-by: Tariq Toukan Signed-off-by: Gal Pressman Reviewed-by: Michal Swiatkowski Reviewed-by: Edward Cree Reviewed-by: Ahmed Zaki Link: https://patch.msgid.link/20250126191845.316589-1-gal@nvidia.com Signed-off-by: Paolo Abeni --- net/ethtool/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 7bb94875a7ec8..34bee42e12470 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -998,7 +998,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, ethtool_get_flow_spec_ring(info.fs.ring_cookie)) return -EINVAL; - if (ops->get_rxfh) { + if (cmd == ETHTOOL_SRXFH && ops->get_rxfh) { struct ethtool_rxfh_param rxfh = {}; rc = ops->get_rxfh(dev, &rxfh); From f5fb35a3d6b36d378b2e2ecbfb9caa337d5428e6 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Mon, 27 Jan 2025 10:38:18 +0900 Subject: [PATCH 62/88] net: stmmac: Limit the number of MTL queues to hardware capability The number of MTL queues to use is specified by the parameter "snps,{tx,rx}-queues-to-use" from stmmac_platform layer. However, the maximum numbers of queues are constrained by upper limits determined by the capability of each hardware feature. It's appropriate to limit the values not to exceed the upper limit values and display a warning message. This only works if the hardware capability has the upper limit values. Fixes: d976a525c371 ("net: stmmac: multiple queues dt configuration") Signed-off-by: Kunihiko Hayashi Reviewed-by: Yanteng Si Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index edbf8994455d3..f347e6390d291 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7221,6 +7221,21 @@ static int stmmac_hw_init(struct stmmac_priv *priv) if (priv->dma_cap.tsoen) dev_info(priv->device, "TSO supported\n"); + if (priv->dma_cap.number_rx_queues && + priv->plat->rx_queues_to_use > priv->dma_cap.number_rx_queues) { + dev_warn(priv->device, + "Number of Rx queues (%u) exceeds dma capability\n", + priv->plat->rx_queues_to_use); + priv->plat->rx_queues_to_use = priv->dma_cap.number_rx_queues; + } + if (priv->dma_cap.number_tx_queues && + priv->plat->tx_queues_to_use > priv->dma_cap.number_tx_queues) { + dev_warn(priv->device, + "Number of Tx queues (%u) exceeds dma capability\n", + priv->plat->tx_queues_to_use); + priv->plat->tx_queues_to_use = priv->dma_cap.number_tx_queues; + } + priv->hw->vlan_fail_q_en = (priv->plat->flags & STMMAC_FLAG_VLAN_FAIL_Q_EN); priv->hw->vlan_fail_q = priv->plat->vlan_fail_q; From 044f2fbaa2725696ecbf1f02ba7ab0a8ccb7e1ae Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Mon, 27 Jan 2025 10:38:19 +0900 Subject: [PATCH 63/88] net: stmmac: Limit FIFO size by hardware capability Tx/Rx FIFO size is specified by the parameter "{tx,rx}-fifo-depth" from stmmac_platform layer. However, these values are constrained by upper limits determined by the capabilities of each hardware feature. There is a risk that the upper bits will be truncated due to the calculation, so it's appropriate to limit them to the upper limit values and display a warning message. This only works if the hardware capability has the upper limit values. Fixes: e7877f52fd4a ("stmmac: Read tx-fifo-depth and rx-fifo-depth from the devicetree") Signed-off-by: Kunihiko Hayashi Reviewed-by: Yanteng Si Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f347e6390d291..43750651afb15 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7236,6 +7236,21 @@ static int stmmac_hw_init(struct stmmac_priv *priv) priv->plat->tx_queues_to_use = priv->dma_cap.number_tx_queues; } + if (priv->dma_cap.rx_fifo_size && + priv->plat->rx_fifo_size > priv->dma_cap.rx_fifo_size) { + dev_warn(priv->device, + "Rx FIFO size (%u) exceeds dma capability\n", + priv->plat->rx_fifo_size); + priv->plat->rx_fifo_size = priv->dma_cap.rx_fifo_size; + } + if (priv->dma_cap.tx_fifo_size && + priv->plat->tx_fifo_size > priv->dma_cap.tx_fifo_size) { + dev_warn(priv->device, + "Tx FIFO size (%u) exceeds dma capability\n", + priv->plat->tx_fifo_size); + priv->plat->tx_fifo_size = priv->dma_cap.tx_fifo_size; + } + priv->hw->vlan_fail_q_en = (priv->plat->flags & STMMAC_FLAG_VLAN_FAIL_Q_EN); priv->hw->vlan_fail_q = priv->plat->vlan_fail_q; From 8865d22656b442b8d0fb019e6acb2292b99a9c3c Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Mon, 27 Jan 2025 10:38:20 +0900 Subject: [PATCH 64/88] net: stmmac: Specify hardware capability value when FIFO size isn't specified When Tx/Rx FIFO size is not specified in advance, the driver checks if the value is zero and sets the hardware capability value in functions where that value is used. Consolidate the check and settings into function stmmac_hw_init() and remove redundant other statements. If FIFO size is zero and the hardware capability also doesn't have upper limit values, return with an error message. Signed-off-by: Kunihiko Hayashi Reviewed-by: Yanteng Si Signed-off-by: Paolo Abeni --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 43750651afb15..821350d11533c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2424,11 +2424,6 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) u32 chan = 0; u8 qmode = 0; - if (rxfifosz == 0) - rxfifosz = priv->dma_cap.rx_fifo_size; - if (txfifosz == 0) - txfifosz = priv->dma_cap.tx_fifo_size; - /* Split up the shared Tx/Rx FIFO memory on DW QoS Eth and DW XGMAC */ if (priv->plat->has_gmac4 || priv->plat->has_xgmac) { rxfifosz /= rx_channels_count; @@ -2897,11 +2892,6 @@ static void stmmac_set_dma_operation_mode(struct stmmac_priv *priv, u32 txmode, int rxfifosz = priv->plat->rx_fifo_size; int txfifosz = priv->plat->tx_fifo_size; - if (rxfifosz == 0) - rxfifosz = priv->dma_cap.rx_fifo_size; - if (txfifosz == 0) - txfifosz = priv->dma_cap.tx_fifo_size; - /* Adjust for real per queue fifo size */ rxfifosz /= rx_channels_count; txfifosz /= tx_channels_count; @@ -5878,9 +5868,6 @@ static int stmmac_change_mtu(struct net_device *dev, int new_mtu) const int mtu = new_mtu; int ret; - if (txfifosz == 0) - txfifosz = priv->dma_cap.tx_fifo_size; - txfifosz /= priv->plat->tx_queues_to_use; if (stmmac_xdp_is_enabled(priv) && new_mtu > ETH_DATA_LEN) { @@ -7236,15 +7223,29 @@ static int stmmac_hw_init(struct stmmac_priv *priv) priv->plat->tx_queues_to_use = priv->dma_cap.number_tx_queues; } - if (priv->dma_cap.rx_fifo_size && - priv->plat->rx_fifo_size > priv->dma_cap.rx_fifo_size) { + if (!priv->plat->rx_fifo_size) { + if (priv->dma_cap.rx_fifo_size) { + priv->plat->rx_fifo_size = priv->dma_cap.rx_fifo_size; + } else { + dev_err(priv->device, "Can't specify Rx FIFO size\n"); + return -ENODEV; + } + } else if (priv->dma_cap.rx_fifo_size && + priv->plat->rx_fifo_size > priv->dma_cap.rx_fifo_size) { dev_warn(priv->device, "Rx FIFO size (%u) exceeds dma capability\n", priv->plat->rx_fifo_size); priv->plat->rx_fifo_size = priv->dma_cap.rx_fifo_size; } - if (priv->dma_cap.tx_fifo_size && - priv->plat->tx_fifo_size > priv->dma_cap.tx_fifo_size) { + if (!priv->plat->tx_fifo_size) { + if (priv->dma_cap.tx_fifo_size) { + priv->plat->tx_fifo_size = priv->dma_cap.tx_fifo_size; + } else { + dev_err(priv->device, "Can't specify Tx FIFO size\n"); + return -ENODEV; + } + } else if (priv->dma_cap.tx_fifo_size && + priv->plat->tx_fifo_size > priv->dma_cap.tx_fifo_size) { dev_warn(priv->device, "Tx FIFO size (%u) exceeds dma capability\n", priv->plat->tx_fifo_size); From 9e6c4e6b605c1fa3e24f74ee0b641e95f090188a Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Mon, 27 Jan 2025 12:41:47 +0200 Subject: [PATCH 65/88] bonding: Correctly support GSO ESP offload The referenced fix is incomplete. It correctly computes bond_dev->gso_partial_features across slaves, but unfortunately netdev_fix_features discards gso_partial_features from the feature set if NETIF_F_GSO_PARTIAL isn't set in bond_dev->features. This is visible with ethtool -k bond0 | grep esp: tx-esp-segmentation: off [requested on] esp-hw-offload: on esp-tx-csum-hw-offload: on This patch reworks the bonding GSO offload support by: - making aggregating gso_partial_features across slaves similar to the other feature sets (this part is a no-op). - advertising the default partial gso features on empty bond devs, same as with other feature sets (also a no-op). - adding NETIF_F_GSO_PARTIAL to hw_enc_features filtered across slaves. - adding NETIF_F_GSO_PARTIAL to features in bond_setup() With all of these, 'ethtool -k bond0 | grep esp' now reports: tx-esp-segmentation: on esp-hw-offload: on esp-tx-csum-hw-offload: on Fixes: 4861333b4217 ("bonding: add ESP offload features when slaves support") Signed-off-by: Hangbin Liu Signed-off-by: Cosmin Ratiu Acked-by: Jay Vosburgh Link: https://patch.msgid.link/20250127104147.759658-1-cratiu@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 7b78c2bada814..e45bba240cbcd 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1538,17 +1538,20 @@ static netdev_features_t bond_fix_features(struct net_device *dev, NETIF_F_HIGHDMA | NETIF_F_LRO) #define BOND_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ - NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE) + NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE | \ + NETIF_F_GSO_PARTIAL) #define BOND_MPLS_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_GSO_SOFTWARE) +#define BOND_GSO_PARTIAL_FEATURES (NETIF_F_GSO_ESP) + static void bond_compute_features(struct bonding *bond) { + netdev_features_t gso_partial_features = BOND_GSO_PARTIAL_FEATURES; unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; - netdev_features_t gso_partial_features = NETIF_F_GSO_ESP; netdev_features_t vlan_features = BOND_VLAN_FEATURES; netdev_features_t enc_features = BOND_ENC_FEATURES; #ifdef CONFIG_XFRM_OFFLOAD @@ -1582,8 +1585,9 @@ static void bond_compute_features(struct bonding *bond) BOND_XFRM_FEATURES); #endif /* CONFIG_XFRM_OFFLOAD */ - if (slave->dev->hw_enc_features & NETIF_F_GSO_PARTIAL) - gso_partial_features &= slave->dev->gso_partial_features; + gso_partial_features = netdev_increment_features(gso_partial_features, + slave->dev->gso_partial_features, + BOND_GSO_PARTIAL_FEATURES); mpls_features = netdev_increment_features(mpls_features, slave->dev->mpls_features, @@ -1598,12 +1602,8 @@ static void bond_compute_features(struct bonding *bond) } bond_dev->hard_header_len = max_hard_header_len; - if (gso_partial_features & NETIF_F_GSO_ESP) - bond_dev->gso_partial_features |= NETIF_F_GSO_ESP; - else - bond_dev->gso_partial_features &= ~NETIF_F_GSO_ESP; - done: + bond_dev->gso_partial_features = gso_partial_features; bond_dev->vlan_features = vlan_features; bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HW_VLAN_CTAG_TX | @@ -6046,6 +6046,7 @@ void bond_setup(struct net_device *bond_dev) bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL; bond_dev->features |= bond_dev->hw_features; bond_dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; + bond_dev->features |= NETIF_F_GSO_PARTIAL; #ifdef CONFIG_XFRM_OFFLOAD bond_dev->hw_features |= BOND_XFRM_FEATURES; /* Only enable XFRM features if this is an active-backup config */ From e9087e828827e5a5c85e124ce77503f2b81c3491 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 15 Jan 2025 19:36:36 -0800 Subject: [PATCH 66/88] Bluetooth: btusb: mediatek: Add locks for usb_driver_claim_interface() The documentation for usb_driver_claim_interface() says that "the device lock" is needed when the function is called from places other than probe(). This appears to be the lock for the USB interface device. The Mediatek btusb code gets called via this path: Workqueue: hci0 hci_power_on [bluetooth] Call trace: usb_driver_claim_interface btusb_mtk_claim_iso_intf btusb_mtk_setup hci_dev_open_sync hci_power_on process_scheduled_works worker_thread kthread With the above call trace the device lock hasn't been claimed. Claim it. Without this fix, we'd sometimes see the error "Failed to claim iso interface". Sometimes we'd even see worse errors, like a NULL pointer dereference (where `intf->dev.driver` was NULL) with a trace like: Call trace: usb_suspend_both usb_runtime_suspend __rpm_callback rpm_suspend pm_runtime_work process_scheduled_works Both errors appear to be fixed with the proper locking. Fixes: ceac1cb0259d ("Bluetooth: btusb: mediatek: add ISO data transmission functions") Signed-off-by: Douglas Anderson Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 9aa018d4f6f50..145b3fbfff550 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -2639,8 +2639,15 @@ static void btusb_mtk_claim_iso_intf(struct btusb_data *data) struct btmtk_data *btmtk_data = hci_get_priv(data->hdev); int err; + /* + * The function usb_driver_claim_interface() is documented to need + * locks held if it's not called from a probe routine. The code here + * is called from the hci_power_on workqueue, so grab the lock. + */ + device_lock(&btmtk_data->isopkt_intf->dev); err = usb_driver_claim_interface(&btusb_driver, btmtk_data->isopkt_intf, data); + device_unlock(&btmtk_data->isopkt_intf->dev); if (err < 0) { btmtk_data->isopkt_intf = NULL; bt_dev_err(data->hdev, "Failed to claim iso interface"); From 0983fb4799e78b1783eb5ed0dfe324649c491bb7 Mon Sep 17 00:00:00 2001 From: Hsin-chen Chuang Date: Mon, 20 Jan 2025 18:39:39 +0800 Subject: [PATCH 67/88] Bluetooth: Fix possible infinite recursion of btusb_reset The function enters infinite recursion if the HCI device doesn't support GPIO reset: btusb_reset -> hdev->reset -> vendor_reset -> btusb_reset... btusb_reset shouldn't call hdev->reset after commit f07d478090b0 ("Bluetooth: Get rid of cmd_timeout and use the reset callback") Fixes: f07d478090b0 ("Bluetooth: Get rid of cmd_timeout and use the reset callback") Signed-off-by: Hsin-chen Chuang Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 145b3fbfff550..90966dfbd2781 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -899,11 +899,6 @@ static void btusb_reset(struct hci_dev *hdev) struct btusb_data *data; int err; - if (hdev->reset) { - hdev->reset(hdev); - return; - } - data = hci_get_drvdata(hdev); /* This is not an unbalanced PM reference since the device will reset */ err = usb_autopm_get_interface(data->intf); From 514a8e6d6152a418da95db942827d5e3ce58613a Mon Sep 17 00:00:00 2001 From: Hsin-chen Chuang Date: Mon, 20 Jan 2025 16:47:36 +0800 Subject: [PATCH 68/88] Bluetooth: Add ABI doc for sysfs reset The functionality was implemented in commit 0f8a00137411 ("Bluetooth: Allow reset via sysfs") Fixes: 0f8a00137411 ("Bluetooth: Allow reset via sysfs") Signed-off-by: Hsin-chen Chuang Signed-off-by: Luiz Augusto von Dentz --- Documentation/ABI/stable/sysfs-class-bluetooth | 9 +++++++++ MAINTAINERS | 1 + 2 files changed, 10 insertions(+) create mode 100644 Documentation/ABI/stable/sysfs-class-bluetooth diff --git a/Documentation/ABI/stable/sysfs-class-bluetooth b/Documentation/ABI/stable/sysfs-class-bluetooth new file mode 100644 index 0000000000000..36be024711747 --- /dev/null +++ b/Documentation/ABI/stable/sysfs-class-bluetooth @@ -0,0 +1,9 @@ +What: /sys/class/bluetooth/hci/reset +Date: 14-Jan-2025 +KernelVersion: 6.13 +Contact: linux-bluetooth@vger.kernel.org +Description: This write-only attribute allows users to trigger the vendor reset + method on the Bluetooth device when arbitrary data is written. + The reset may or may not be done through the device transport + (e.g., UART/USB), and can also be done through an out-of-band + approach such as GPIO. diff --git a/MAINTAINERS b/MAINTAINERS index 5bcc78c0be70b..d996f393fcf42 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4078,6 +4078,7 @@ S: Supported W: http://www.bluez.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git +F: Documentation/ABI/stable/sysfs-class-bluetooth F: include/net/bluetooth/ F: net/bluetooth/ From 7de119bb79a63f6a1959b83117a98734914fb0b0 Mon Sep 17 00:00:00 2001 From: Neeraj Sanjay Kale Date: Mon, 20 Jan 2025 19:49:46 +0530 Subject: [PATCH 69/88] Bluetooth: btnxpuart: Fix glitches seen in dual A2DP streaming This fixes a regression caused by previous commit for fixing truncated ACL data, which is causing some intermittent glitches when running two A2DP streams. serdev_device_write_buf() is the root cause of the glitch, which is reverted, and the TX work will continue to write until the queue is empty. This change fixes both issues. No A2DP streaming glitches or truncated ACL data issue observed. Fixes: 8023dd220425 ("Bluetooth: btnxpuart: Fix driver sending truncated data") Fixes: 689ca16e5232 ("Bluetooth: NXP: Add protocol support for NXP Bluetooth chipsets") Signed-off-by: Neeraj Sanjay Kale Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btnxpuart.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c index 1230045d78a5f..aa5ec1d444a9d 100644 --- a/drivers/bluetooth/btnxpuart.c +++ b/drivers/bluetooth/btnxpuart.c @@ -1381,13 +1381,12 @@ static void btnxpuart_tx_work(struct work_struct *work) while ((skb = nxp_dequeue(nxpdev))) { len = serdev_device_write_buf(serdev, skb->data, skb->len); - serdev_device_wait_until_sent(serdev, 0); hdev->stat.byte_tx += len; skb_pull(skb, len); if (skb->len > 0) { skb_queue_head(&nxpdev->txq, skb); - break; + continue; } switch (hci_skb_pkt_type(skb)) { From 5c61419e02033eaf01733d66e2fcd4044808f482 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Wed, 29 Jan 2025 00:08:14 +0300 Subject: [PATCH 70/88] Bluetooth: L2CAP: accept zero as a special value for MTU auto-selection One of the possible ways to enable the input MTU auto-selection for L2CAP connections is supposed to be through passing a special "0" value for it as a socket option. Commit [1] added one of those into avdtp. However, it simply wouldn't work because the kernel still treats the specified value as invalid and denies the setting attempt. Recorded BlueZ logs include the following: bluetoothd[496]: profiles/audio/avdtp.c:l2cap_connect() setsockopt(L2CAP_OPTIONS): Invalid argument (22) [1]: https://github.com/bluez/bluez/commit/ae5be371a9f53fed33d2b34748a95a5498fd4b77 Found by Linux Verification Center (linuxtesting.org). Fixes: 4b6e228e297b ("Bluetooth: Auto tune if input MTU is set to 0") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 49f97d4138ea7..46ea0bee2259f 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -710,12 +710,12 @@ static bool l2cap_valid_mtu(struct l2cap_chan *chan, u16 mtu) { switch (chan->scid) { case L2CAP_CID_ATT: - if (mtu < L2CAP_LE_MIN_MTU) + if (mtu && mtu < L2CAP_LE_MIN_MTU) return false; break; default: - if (mtu < L2CAP_DEFAULT_MIN_MTU) + if (mtu && mtu < L2CAP_DEFAULT_MIN_MTU) return false; } From fcdd2242c0231032fc84e1404315c245ae56322a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 28 Jan 2025 14:15:27 +0100 Subject: [PATCH 71/88] vsock: Keep the binding until socket destruction Preserve sockets bindings; this includes both resulting from an explicit bind() and those implicitly bound through autobind during connect(). Prevents socket unbinding during a transport reassignment, which fixes a use-after-free: 1. vsock_create() (refcnt=1) calls vsock_insert_unbound() (refcnt=2) 2. transport->release() calls vsock_remove_bound() without checking if sk was bound and moved to bound list (refcnt=1) 3. vsock_bind() assumes sk is in unbound list and before __vsock_insert_bound(vsock_bound_sockets()) calls __vsock_remove_bound() which does: list_del_init(&vsk->bound_table); // nop sock_put(&vsk->sk); // refcnt=0 BUG: KASAN: slab-use-after-free in __vsock_bind+0x62e/0x730 Read of size 4 at addr ffff88816b46a74c by task a.out/2057 dump_stack_lvl+0x68/0x90 print_report+0x174/0x4f6 kasan_report+0xb9/0x190 __vsock_bind+0x62e/0x730 vsock_bind+0x97/0xe0 __sys_bind+0x154/0x1f0 __x64_sys_bind+0x6e/0xb0 do_syscall_64+0x93/0x1b0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Allocated by task 2057: kasan_save_stack+0x1e/0x40 kasan_save_track+0x10/0x30 __kasan_slab_alloc+0x85/0x90 kmem_cache_alloc_noprof+0x131/0x450 sk_prot_alloc+0x5b/0x220 sk_alloc+0x2c/0x870 __vsock_create.constprop.0+0x2e/0xb60 vsock_create+0xe4/0x420 __sock_create+0x241/0x650 __sys_socket+0xf2/0x1a0 __x64_sys_socket+0x6e/0xb0 do_syscall_64+0x93/0x1b0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Freed by task 2057: kasan_save_stack+0x1e/0x40 kasan_save_track+0x10/0x30 kasan_save_free_info+0x37/0x60 __kasan_slab_free+0x4b/0x70 kmem_cache_free+0x1a1/0x590 __sk_destruct+0x388/0x5a0 __vsock_bind+0x5e1/0x730 vsock_bind+0x97/0xe0 __sys_bind+0x154/0x1f0 __x64_sys_bind+0x6e/0xb0 do_syscall_64+0x93/0x1b0 entry_SYSCALL_64_after_hwframe+0x76/0x7e refcount_t: addition on 0; use-after-free. WARNING: CPU: 7 PID: 2057 at lib/refcount.c:25 refcount_warn_saturate+0xce/0x150 RIP: 0010:refcount_warn_saturate+0xce/0x150 __vsock_bind+0x66d/0x730 vsock_bind+0x97/0xe0 __sys_bind+0x154/0x1f0 __x64_sys_bind+0x6e/0xb0 do_syscall_64+0x93/0x1b0 entry_SYSCALL_64_after_hwframe+0x76/0x7e refcount_t: underflow; use-after-free. WARNING: CPU: 7 PID: 2057 at lib/refcount.c:28 refcount_warn_saturate+0xee/0x150 RIP: 0010:refcount_warn_saturate+0xee/0x150 vsock_remove_bound+0x187/0x1e0 __vsock_release+0x383/0x4a0 vsock_release+0x90/0x120 __sock_release+0xa3/0x250 sock_close+0x14/0x20 __fput+0x359/0xa80 task_work_run+0x107/0x1d0 do_exit+0x847/0x2560 do_group_exit+0xb8/0x250 __x64_sys_exit_group+0x3a/0x50 x64_sys_call+0xfec/0x14f0 do_syscall_64+0x93/0x1b0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250128-vsock-transport-vs-autobind-v3-1-1cf57065b770@rbox.co Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index fa9d1b49599bf..cfe18bc8fdbe7 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -337,7 +337,10 @@ EXPORT_SYMBOL_GPL(vsock_find_connected_socket); void vsock_remove_sock(struct vsock_sock *vsk) { - vsock_remove_bound(vsk); + /* Transport reassignment must not remove the binding. */ + if (sock_flag(sk_vsock(vsk), SOCK_DEAD)) + vsock_remove_bound(vsk); + vsock_remove_connected(vsk); } EXPORT_SYMBOL_GPL(vsock_remove_sock); @@ -821,12 +824,13 @@ static void __vsock_release(struct sock *sk, int level) */ lock_sock_nested(sk, level); + sock_orphan(sk); + if (vsk->transport) vsk->transport->release(vsk); else if (sock_type_connectible(sk->sk_type)) vsock_remove_sock(vsk); - sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; skb_queue_purge(&sk->sk_receive_queue); From aa388c72113b7458127b709bdd7d3628af26e9b4 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 28 Jan 2025 14:15:28 +0100 Subject: [PATCH 72/88] vsock: Allow retrying on connect() failure sk_err is set when a (connectible) connect() fails. Effectively, this makes an otherwise still healthy SS_UNCONNECTED socket impossible to use for any subsequent connection attempts. Clear sk_err upon trying to establish a connection. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Reviewed-by: Stefano Garzarella Reviewed-by: Luigi Leonardi Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250128-vsock-transport-vs-autobind-v3-2-1cf57065b770@rbox.co Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index cfe18bc8fdbe7..075695173648d 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1523,6 +1523,11 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, if (err < 0) goto out; + /* sk_err might have been set as a result of an earlier + * (failed) connect attempt. + */ + sk->sk_err = 0; + /* Mark sock as connecting and set the error code to in * progress in case this is a non-blocking connect. */ From 852a00c4281d3c4cf82020421cc9b5b05d53e93f Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 28 Jan 2025 14:15:29 +0100 Subject: [PATCH 73/88] vsock/test: Introduce vsock_bind() Add a helper for socket()+bind(). Adapt callers. Reviewed-by: Stefano Garzarella Reviewed-by: Luigi Leonardi Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250128-vsock-transport-vs-autobind-v3-3-1cf57065b770@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/util.c | 57 ++++++++++++++------------------ tools/testing/vsock/util.h | 1 + tools/testing/vsock/vsock_test.c | 17 +--------- 3 files changed, 26 insertions(+), 49 deletions(-) diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 7058dc614c25f..6e36f9371532c 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -96,33 +96,43 @@ void vsock_wait_remote_close(int fd) close(epollfd); } -/* Bind to , connect to and return the file descriptor. */ -int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type) +/* Create socket , bind to and return the file descriptor. */ +int vsock_bind(unsigned int cid, unsigned int port, int type) { - struct sockaddr_vm sa_client = { - .svm_family = AF_VSOCK, - .svm_cid = VMADDR_CID_ANY, - .svm_port = bind_port, - }; - struct sockaddr_vm sa_server = { + struct sockaddr_vm sa = { .svm_family = AF_VSOCK, .svm_cid = cid, .svm_port = port, }; + int fd; - int client_fd, ret; - - client_fd = socket(AF_VSOCK, type, 0); - if (client_fd < 0) { + fd = socket(AF_VSOCK, type, 0); + if (fd < 0) { perror("socket"); exit(EXIT_FAILURE); } - if (bind(client_fd, (struct sockaddr *)&sa_client, sizeof(sa_client))) { + if (bind(fd, (struct sockaddr *)&sa, sizeof(sa))) { perror("bind"); exit(EXIT_FAILURE); } + return fd; +} + +/* Bind to , connect to and return the file descriptor. */ +int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type) +{ + struct sockaddr_vm sa_server = { + .svm_family = AF_VSOCK, + .svm_cid = cid, + .svm_port = port, + }; + + int client_fd, ret; + + client_fd = vsock_bind(VMADDR_CID_ANY, bind_port, type); + timeout_begin(TIMEOUT); do { ret = connect(client_fd, (struct sockaddr *)&sa_server, sizeof(sa_server)); @@ -192,28 +202,9 @@ int vsock_seqpacket_connect(unsigned int cid, unsigned int port) /* Listen on and return the file descriptor. */ static int vsock_listen(unsigned int cid, unsigned int port, int type) { - union { - struct sockaddr sa; - struct sockaddr_vm svm; - } addr = { - .svm = { - .svm_family = AF_VSOCK, - .svm_port = port, - .svm_cid = cid, - }, - }; int fd; - fd = socket(AF_VSOCK, type, 0); - if (fd < 0) { - perror("socket"); - exit(EXIT_FAILURE); - } - - if (bind(fd, &addr.sa, sizeof(addr.svm)) < 0) { - perror("bind"); - exit(EXIT_FAILURE); - } + fd = vsock_bind(cid, port, type); if (listen(fd, 1) < 0) { perror("listen"); diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h index e62f46b2b92a7..077842905bc3e 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -43,6 +43,7 @@ int vsock_connect(unsigned int cid, unsigned int port, int type); int vsock_accept(unsigned int cid, unsigned int port, struct sockaddr_vm *clientaddrp, int type); int vsock_stream_connect(unsigned int cid, unsigned int port); +int vsock_bind(unsigned int cid, unsigned int port, int type); int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type); int vsock_seqpacket_connect(unsigned int cid, unsigned int port); diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 1eebbc0d5f616..daa4f3ca9b6e7 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -113,24 +113,9 @@ static void test_stream_bind_only_client(const struct test_opts *opts) static void test_stream_bind_only_server(const struct test_opts *opts) { - union { - struct sockaddr sa; - struct sockaddr_vm svm; - } addr = { - .svm = { - .svm_family = AF_VSOCK, - .svm_port = opts->peer_port, - .svm_cid = VMADDR_CID_ANY, - }, - }; int fd; - fd = socket(AF_VSOCK, SOCK_STREAM, 0); - - if (bind(fd, &addr.sa, sizeof(addr.svm)) < 0) { - perror("bind"); - exit(EXIT_FAILURE); - } + fd = vsock_bind(VMADDR_CID_ANY, opts->peer_port, SOCK_STREAM); /* Notify the client that the server is ready */ control_writeln("BIND"); From ac12b7e2912d60fceefaacc78ee8b5aec8b51c04 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 28 Jan 2025 14:15:30 +0100 Subject: [PATCH 74/88] vsock/test: Introduce vsock_connect_fd() Distill timeout-guarded vsock_connect_fd(). Adapt callers. Suggested-by: Stefano Garzarella Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250128-vsock-transport-vs-autobind-v3-4-1cf57065b770@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/util.c | 45 ++++++++++++++------------------------ tools/testing/vsock/util.h | 1 + 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 6e36f9371532c..de25892f865f0 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -120,27 +120,33 @@ int vsock_bind(unsigned int cid, unsigned int port, int type) return fd; } -/* Bind to , connect to and return the file descriptor. */ -int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type) +int vsock_connect_fd(int fd, unsigned int cid, unsigned int port) { - struct sockaddr_vm sa_server = { + struct sockaddr_vm sa = { .svm_family = AF_VSOCK, .svm_cid = cid, .svm_port = port, }; - - int client_fd, ret; - - client_fd = vsock_bind(VMADDR_CID_ANY, bind_port, type); + int ret; timeout_begin(TIMEOUT); do { - ret = connect(client_fd, (struct sockaddr *)&sa_server, sizeof(sa_server)); + ret = connect(fd, (struct sockaddr *)&sa, sizeof(sa)); timeout_check("connect"); } while (ret < 0 && errno == EINTR); timeout_end(); - if (ret < 0) { + return ret; +} + +/* Bind to , connect to and return the file descriptor. */ +int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type) +{ + int client_fd; + + client_fd = vsock_bind(VMADDR_CID_ANY, bind_port, type); + + if (vsock_connect_fd(client_fd, cid, port)) { perror("connect"); exit(EXIT_FAILURE); } @@ -151,17 +157,6 @@ int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_po /* Connect to and return the file descriptor. */ int vsock_connect(unsigned int cid, unsigned int port, int type) { - union { - struct sockaddr sa; - struct sockaddr_vm svm; - } addr = { - .svm = { - .svm_family = AF_VSOCK, - .svm_port = port, - .svm_cid = cid, - }, - }; - int ret; int fd; control_expectln("LISTENING"); @@ -172,20 +167,14 @@ int vsock_connect(unsigned int cid, unsigned int port, int type) exit(EXIT_FAILURE); } - timeout_begin(TIMEOUT); - do { - ret = connect(fd, &addr.sa, sizeof(addr.svm)); - timeout_check("connect"); - } while (ret < 0 && errno == EINTR); - timeout_end(); - - if (ret < 0) { + if (vsock_connect_fd(fd, cid, port)) { int old_errno = errno; close(fd); fd = -1; errno = old_errno; } + return fd; } diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h index 077842905bc3e..d1f765ce3eeee 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -39,6 +39,7 @@ struct test_case { void init_signals(void); unsigned int parse_cid(const char *str); unsigned int parse_port(const char *str); +int vsock_connect_fd(int fd, unsigned int cid, unsigned int port); int vsock_connect(unsigned int cid, unsigned int port, int type); int vsock_accept(unsigned int cid, unsigned int port, struct sockaddr_vm *clientaddrp, int type); From 301a62dfb0d0dff79460ed3a54e1f8dbd8db52e9 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 28 Jan 2025 14:15:31 +0100 Subject: [PATCH 75/88] vsock/test: Add test for UAF due to socket unbinding Fail the autobind, then trigger a transport reassign. Socket might get unbound from unbound_sockets, which then leads to a reference count underflow. Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250128-vsock-transport-vs-autobind-v3-5-1cf57065b770@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 58 ++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index daa4f3ca9b6e7..92cfd92bbfdc2 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1693,6 +1693,59 @@ static void test_stream_msgzcopy_leak_zcskb_server(const struct test_opts *opts) close(fd); } +#define MAX_PORT_RETRIES 24 /* net/vmw_vsock/af_vsock.c */ + +/* Test attempts to trigger a transport release for an unbound socket. This can + * lead to a reference count mishandling. + */ +static void test_stream_transport_uaf_client(const struct test_opts *opts) +{ + int sockets[MAX_PORT_RETRIES]; + struct sockaddr_vm addr; + int fd, i, alen; + + fd = vsock_bind(VMADDR_CID_ANY, VMADDR_PORT_ANY, SOCK_STREAM); + + alen = sizeof(addr); + if (getsockname(fd, (struct sockaddr *)&addr, &alen)) { + perror("getsockname"); + exit(EXIT_FAILURE); + } + + for (i = 0; i < MAX_PORT_RETRIES; ++i) + sockets[i] = vsock_bind(VMADDR_CID_ANY, ++addr.svm_port, + SOCK_STREAM); + + close(fd); + fd = socket(AF_VSOCK, SOCK_STREAM, 0); + if (fd < 0) { + perror("socket"); + exit(EXIT_FAILURE); + } + + if (!vsock_connect_fd(fd, addr.svm_cid, addr.svm_port)) { + perror("Unexpected connect() #1 success"); + exit(EXIT_FAILURE); + } + + /* Vulnerable system may crash now. */ + if (!vsock_connect_fd(fd, VMADDR_CID_HOST, VMADDR_PORT_ANY)) { + perror("Unexpected connect() #2 success"); + exit(EXIT_FAILURE); + } + + close(fd); + while (i--) + close(sockets[i]); + + control_writeln("DONE"); +} + +static void test_stream_transport_uaf_server(const struct test_opts *opts) +{ + control_expectln("DONE"); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -1838,6 +1891,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_msgzcopy_leak_zcskb_client, .run_server = test_stream_msgzcopy_leak_zcskb_server, }, + { + .name = "SOCK_STREAM transport release use-after-free", + .run_client = test_stream_transport_uaf_client, + .run_server = test_stream_transport_uaf_server, + }, {}, }; From 4695f64e028dd1407d077565fbdb30ddb364180a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 28 Jan 2025 14:15:32 +0100 Subject: [PATCH 76/88] vsock/test: Add test for connect() retries Deliberately fail a connect() attempt; expect error. Then verify that subsequent attempt (using the same socket) can still succeed, rather than fail outright. Reviewed-by: Stefano Garzarella Reviewed-by: Luigi Leonardi Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250128-vsock-transport-vs-autobind-v3-6-1cf57065b770@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 47 ++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 92cfd92bbfdc2..dfff8b288265f 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1746,6 +1746,48 @@ static void test_stream_transport_uaf_server(const struct test_opts *opts) control_expectln("DONE"); } +static void test_stream_connect_retry_client(const struct test_opts *opts) +{ + int fd; + + fd = socket(AF_VSOCK, SOCK_STREAM, 0); + if (fd < 0) { + perror("socket"); + exit(EXIT_FAILURE); + } + + if (!vsock_connect_fd(fd, opts->peer_cid, opts->peer_port)) { + fprintf(stderr, "Unexpected connect() #1 success\n"); + exit(EXIT_FAILURE); + } + + control_writeln("LISTEN"); + control_expectln("LISTENING"); + + if (vsock_connect_fd(fd, opts->peer_cid, opts->peer_port)) { + perror("connect() #2"); + exit(EXIT_FAILURE); + } + + close(fd); +} + +static void test_stream_connect_retry_server(const struct test_opts *opts) +{ + int fd; + + control_expectln("LISTEN"); + + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + vsock_wait_remote_close(fd); + close(fd); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -1896,6 +1938,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_transport_uaf_client, .run_server = test_stream_transport_uaf_server, }, + { + .name = "SOCK_STREAM retry failed connect()", + .run_client = test_stream_connect_retry_client, + .run_server = test_stream_connect_retry_server, + }, {}, }; From 752e5fcc2e77358936d36ef8e522d6439372e201 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Mon, 27 Jan 2025 09:51:59 -0800 Subject: [PATCH 77/88] bgmac: reduce max frame size to support just MTU 1500 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bgmac allocates new replacement buffer before handling each received frame. Allocating & DMA-preparing 9724 B each time consumes a lot of CPU time. Ideally bgmac should just respect currently set MTU but it isn't the case right now. For now just revert back to the old limited frame size. This change bumps NAT masquerade speed by ~95%. Since commit 8218f62c9c9b ("mm: page_frag: use initial zero offset for page_frag_alloc_align()"), the bgmac driver fails to open its network interface successfully and runs out of memory in the following call stack: bgmac_open -> bgmac_dma_init -> bgmac_dma_rx_skb_for_slot -> netdev_alloc_frag BGMAC_RX_ALLOC_SIZE = 10048 and PAGE_FRAG_CACHE_MAX_SIZE = 32768. Eventually we land into __page_frag_alloc_align() with the following parameters across multiple successive calls: __page_frag_alloc_align: fragsz=10048, align_mask=-1, size=32768, offset=0 __page_frag_alloc_align: fragsz=10048, align_mask=-1, size=32768, offset=10048 __page_frag_alloc_align: fragsz=10048, align_mask=-1, size=32768, offset=20096 __page_frag_alloc_align: fragsz=10048, align_mask=-1, size=32768, offset=30144 So in that case we do indeed have offset + fragsz (40192) > size (32768) and so we would eventually return NULL. Reverting to the older 1500 bytes MTU allows the network driver to be usable again. Fixes: 8c7da63978f1 ("bgmac: configure MTU and add support for frames beyond 8192 byte size") Signed-off-by: Rafał Miłecki [florian: expand commit message about recent commits] Reviewed-by: Simon Horman Signed-off-by: Florian Fainelli Link: https://patch.msgid.link/20250127175159.1788246-1-florian.fainelli@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bgmac.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bgmac.h b/drivers/net/ethernet/broadcom/bgmac.h index d73ef262991d6..6fee9a41839c0 100644 --- a/drivers/net/ethernet/broadcom/bgmac.h +++ b/drivers/net/ethernet/broadcom/bgmac.h @@ -328,8 +328,7 @@ #define BGMAC_RX_FRAME_OFFSET 30 /* There are 2 unused bytes between header and real data */ #define BGMAC_RX_BUF_OFFSET (NET_SKB_PAD + NET_IP_ALIGN - \ BGMAC_RX_FRAME_OFFSET) -/* Jumbo frame size with FCS */ -#define BGMAC_RX_MAX_FRAME_SIZE 9724 +#define BGMAC_RX_MAX_FRAME_SIZE 1536 #define BGMAC_RX_BUF_SIZE (BGMAC_RX_FRAME_OFFSET + BGMAC_RX_MAX_FRAME_SIZE) #define BGMAC_RX_ALLOC_SIZE (SKB_DATA_ALIGN(BGMAC_RX_BUF_SIZE + BGMAC_RX_BUF_OFFSET) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) From 8c670bdfa58e48abad1d5b6ca1ee843ca91f7303 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 27 Jan 2025 18:13:04 -0500 Subject: [PATCH 78/88] tcp: correct handling of extreme memory squeeze MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Testing with iperf3 using the "pasta" protocol splicer has revealed a problem in the way tcp handles window advertising in extreme memory squeeze situations. Under memory pressure, a socket endpoint may temporarily advertise a zero-sized window, but this is not stored as part of the socket data. The reasoning behind this is that it is considered a temporary setting which shouldn't influence any further calculations. However, if we happen to stall at an unfortunate value of the current window size, the algorithm selecting a new value will consistently fail to advertise a non-zero window once we have freed up enough memory. This means that this side's notion of the current window size is different from the one last advertised to the peer, causing the latter to not send any data to resolve the sitution. The problem occurs on the iperf3 server side, and the socket in question is a completely regular socket with the default settings for the fedora40 kernel. We do not use SO_PEEK or SO_RCVBUF on the socket. The following excerpt of a logging session, with own comments added, shows more in detail what is happening: // tcp_v4_rcv(->) // tcp_rcv_established(->) [5201<->39222]: ==== Activating log @ net/ipv4/tcp_input.c/tcp_data_queue()/5257 ==== [5201<->39222]: tcp_data_queue(->) [5201<->39222]: DROPPING skb [265600160..265665640], reason: SKB_DROP_REASON_PROTO_MEM [rcv_nxt 265600160, rcv_wnd 262144, snt_ack 265469200, win_now 131184] [copied_seq 259909392->260034360 (124968), unread 5565800, qlen 85, ofoq 0] [OFO queue: gap: 65480, len: 0] [5201<->39222]: tcp_data_queue(<-) [5201<->39222]: __tcp_transmit_skb(->) [tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160] [5201<->39222]: tcp_select_window(->) [5201<->39222]: (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM) ? --> TRUE [tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160] returning 0 [5201<->39222]: tcp_select_window(<-) [5201<->39222]: ADVERTISING WIN 0, ACK_SEQ: 265600160 [5201<->39222]: [__tcp_transmit_skb(<-) [5201<->39222]: tcp_rcv_established(<-) [5201<->39222]: tcp_v4_rcv(<-) // Receive queue is at 85 buffers and we are out of memory. // We drop the incoming buffer, although it is in sequence, and decide // to send an advertisement with a window of zero. // We don't update tp->rcv_wnd and tp->rcv_wup accordingly, which means // we unconditionally shrink the window. [5201<->39222]: tcp_recvmsg_locked(->) [5201<->39222]: __tcp_cleanup_rbuf(->) tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160 [5201<->39222]: [new_win = 0, win_now = 131184, 2 * win_now = 262368] [5201<->39222]: [new_win >= (2 * win_now) ? --> time_to_ack = 0] [5201<->39222]: NOT calling tcp_send_ack() [tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160] [5201<->39222]: __tcp_cleanup_rbuf(<-) [rcv_nxt 265600160, rcv_wnd 262144, snt_ack 265469200, win_now 131184] [copied_seq 260040464->260040464 (0), unread 5559696, qlen 85, ofoq 0] returning 6104 bytes [5201<->39222]: tcp_recvmsg_locked(<-) // After each read, the algorithm for calculating the new receive // window in __tcp_cleanup_rbuf() finds it is too small to advertise // or to update tp->rcv_wnd. // Meanwhile, the peer thinks the window is zero, and will not send // any more data to trigger an update from the interrupt mode side. [5201<->39222]: tcp_recvmsg_locked(->) [5201<->39222]: __tcp_cleanup_rbuf(->) tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160 [5201<->39222]: [new_win = 262144, win_now = 131184, 2 * win_now = 262368] [5201<->39222]: [new_win >= (2 * win_now) ? --> time_to_ack = 0] [5201<->39222]: NOT calling tcp_send_ack() [tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160] [5201<->39222]: __tcp_cleanup_rbuf(<-) [rcv_nxt 265600160, rcv_wnd 262144, snt_ack 265469200, win_now 131184] [copied_seq 260099840->260171536 (71696), unread 5428624, qlen 83, ofoq 0] returning 131072 bytes [5201<->39222]: tcp_recvmsg_locked(<-) // The above pattern repeats again and again, since nothing changes // between the reads. [...] [5201<->39222]: tcp_recvmsg_locked(->) [5201<->39222]: __tcp_cleanup_rbuf(->) tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160 [5201<->39222]: [new_win = 262144, win_now = 131184, 2 * win_now = 262368] [5201<->39222]: [new_win >= (2 * win_now) ? --> time_to_ack = 0] [5201<->39222]: NOT calling tcp_send_ack() [tp->rcv_wup: 265469200, tp->rcv_wnd: 262144, tp->rcv_nxt 265600160] [5201<->39222]: __tcp_cleanup_rbuf(<-) [rcv_nxt 265600160, rcv_wnd 262144, snt_ack 265469200, win_now 131184] [copied_seq 265600160->265600160 (0), unread 0, qlen 0, ofoq 0] returning 54672 bytes [5201<->39222]: tcp_recvmsg_locked(<-) // The receive queue is empty, but no new advertisement has been sent. // The peer still thinks the receive window is zero, and sends nothing. // We have ended up in a deadlock situation. Note that well behaved endpoints will send win0 probes, so the problem will not occur. Furthermore, we have observed that in these situations this side may send out an updated 'th->ack_seq´ which is not stored in tp->rcv_wup as it should be. Backing ack_seq seems to be harmless, but is of course still wrong from a protocol viewpoint. We fix this by updating the socket state correctly when a packet has been dropped because of memory exhaustion and we have to advertize a zero window. Further testing shows that the connection recovers neatly from the squeeze situation, and traffic can continue indefinitely. Fixes: e2142825c120 ("net: tcp: send zero-window ACK when no memory") Cc: Menglong Dong Reviewed-by: Stefano Brivio Signed-off-by: Jon Maloy Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250127231304.1465565-1-jmaloy@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0e5b9a654254b..bc95d2a5924fd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -265,11 +265,14 @@ static u16 tcp_select_window(struct sock *sk) u32 cur_win, new_win; /* Make the window 0 if we failed to queue the data because we - * are out of memory. The window is temporary, so we don't store - * it on the socket. + * are out of memory. */ - if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) + if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) { + tp->pred_flags = 0; + tp->rcv_wnd = 0; + tp->rcv_wup = tp->rcv_nxt; return 0; + } cur_win = tcp_receive_window(tp); new_win = __tcp_select_window(sk); From 3595599fa8360bb3c7afa7ee50c810b4a64106ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 27 Jan 2025 14:13:42 +0100 Subject: [PATCH 79/88] net: xdp: Disallow attaching device-bound programs in generic mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Device-bound programs are used to support RX metadata kfuncs. These kfuncs are driver-specific and rely on the driver context to read the metadata. This means they can't work in generic XDP mode. However, there is no check to disallow such programs from being attached in generic mode, in which case the metadata kfuncs will be called in an invalid context, leading to crashes. Fix this by adding a check to disallow attaching device-bound programs in generic mode. Fixes: 2b3486bc2d23 ("bpf: Introduce device-bound XDP programs") Reported-by: Marcus Wichelmann Closes: https://lore.kernel.org/r/dae862ec-43b5-41a0-8edf-46c59071cdda@hetzner-cloud.de Tested-by: Marcus Wichelmann Acked-by: Stanislav Fomichev Signed-off-by: Toke Høiland-Jørgensen Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20250127131344.238147-1-toke@redhat.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 07b2bb1ce64ff..02cd75b512f92 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9924,6 +9924,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } + if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { + NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); + return -EINVAL; + } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); return -EINVAL; From f7bf624b1fedf232195804ac0f6584cb3e4b86bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 27 Jan 2025 14:13:43 +0100 Subject: [PATCH 80/88] selftests/net: Add test for loading devbound XDP program in generic mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a test to bpf_offload.py for loading a devbound XDP program in generic mode, checking that it fails correctly. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250127131344.238147-2-toke@redhat.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bpf_offload.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/bpf_offload.py b/tools/testing/selftests/net/bpf_offload.py index d10f420e4ef65..fd0d959914e47 100755 --- a/tools/testing/selftests/net/bpf_offload.py +++ b/tools/testing/selftests/net/bpf_offload.py @@ -215,12 +215,14 @@ def bpftool_map_list_wait(expected=0, n_retry=20, ns=""): raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps)) def bpftool_prog_load(sample, file_name, maps=[], prog_type="xdp", dev=None, - fail=True, include_stderr=False): + fail=True, include_stderr=False, dev_bind=None): args = "prog load %s %s" % (os.path.join(bpf_test_dir, sample), file_name) if prog_type is not None: args += " type " + prog_type if dev is not None: args += " dev " + dev + elif dev_bind is not None: + args += " xdpmeta_dev " + dev_bind if len(maps): args += " map " + " map ".join(maps) @@ -980,6 +982,16 @@ def test_multi_prog(simdev, sim, obj, modename, modeid): rm("/sys/fs/bpf/offload") sim.wait_for_flush() + bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/devbound", + dev_bind=sim['ifname']) + devbound = bpf_pinned("/sys/fs/bpf/devbound") + start_test("Test dev-bound program in generic mode...") + ret, _, err = sim.set_xdp(devbound, "generic", fail=False, include_stderr=True) + fail(ret == 0, "devbound program in generic mode allowed") + check_extack(err, "Can't attach device-bound programs in generic mode.", args) + rm("/sys/fs/bpf/devbound") + sim.wait_for_flush() + start_test("Test XDP load failure...") sim.dfs["dev/bpf_bind_verifier_accept"] = 0 ret, _, err = bpftool_prog_load("sample_ret0.bpf.o", "/sys/fs/bpf/offload", From 2c2ebb2b49573e5f8726112ad06b1dffc3c9ea03 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 29 Jan 2025 10:50:46 +0100 Subject: [PATCH 81/88] net: ravb: Fix missing rtnl lock in suspend/resume path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the suspend/resume path by ensuring the rtnl lock is held where required. Calls to ravb_open, ravb_close and wol operations must be performed under the rtnl lock to prevent conflicts with ongoing ndo operations. Without this fix, the following warning is triggered: [ 39.032969] ============================= [ 39.032983] WARNING: suspicious RCU usage [ 39.033019] ----------------------------- [ 39.033033] drivers/net/phy/phy_device.c:2004 suspicious rcu_dereference_protected() usage! ... [ 39.033597] stack backtrace: [ 39.033613] CPU: 0 UID: 0 PID: 174 Comm: python3 Not tainted 6.13.0-rc7-next-20250116-arm64-renesas-00002-g35245dfdc62c #7 [ 39.033623] Hardware name: Renesas SMARC EVK version 2 based on r9a08g045s33 (DT) [ 39.033628] Call trace: [ 39.033633] show_stack+0x14/0x1c (C) [ 39.033652] dump_stack_lvl+0xb4/0xc4 [ 39.033664] dump_stack+0x14/0x1c [ 39.033671] lockdep_rcu_suspicious+0x16c/0x22c [ 39.033682] phy_detach+0x160/0x190 [ 39.033694] phy_disconnect+0x40/0x54 [ 39.033703] ravb_close+0x6c/0x1cc [ 39.033714] ravb_suspend+0x48/0x120 [ 39.033721] dpm_run_callback+0x4c/0x14c [ 39.033731] device_suspend+0x11c/0x4dc [ 39.033740] dpm_suspend+0xdc/0x214 [ 39.033748] dpm_suspend_start+0x48/0x60 [ 39.033758] suspend_devices_and_enter+0x124/0x574 [ 39.033769] pm_suspend+0x1ac/0x274 [ 39.033778] state_store+0x88/0x124 [ 39.033788] kobj_attr_store+0x14/0x24 [ 39.033798] sysfs_kf_write+0x48/0x6c [ 39.033808] kernfs_fop_write_iter+0x118/0x1a8 [ 39.033817] vfs_write+0x27c/0x378 [ 39.033825] ksys_write+0x64/0xf4 [ 39.033833] __arm64_sys_write+0x18/0x20 [ 39.033841] invoke_syscall+0x44/0x104 [ 39.033852] el0_svc_common.constprop.0+0xb4/0xd4 [ 39.033862] do_el0_svc+0x18/0x20 [ 39.033870] el0_svc+0x3c/0xf0 [ 39.033880] el0t_64_sync_handler+0xc0/0xc4 [ 39.033888] el0t_64_sync+0x154/0x158 [ 39.041274] ravb 11c30000.ethernet eth0: Link is Down Reported-by: Claudiu Beznea Closes: https://lore.kernel.org/netdev/4c6419d8-c06b-495c-b987-d66c2e1ff848@tuxon.dev/ Fixes: 0184165b2f42 ("ravb: add sleep PM suspend/resume support") Signed-off-by: Kory Maincent Tested-by: Niklas Söderlund Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/ravb_main.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index bc395294a32df..c9f4976a35275 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -3217,10 +3217,15 @@ static int ravb_suspend(struct device *dev) netif_device_detach(ndev); - if (priv->wol_enabled) - return ravb_wol_setup(ndev); + rtnl_lock(); + if (priv->wol_enabled) { + ret = ravb_wol_setup(ndev); + rtnl_unlock(); + return ret; + } ret = ravb_close(ndev); + rtnl_unlock(); if (ret) return ret; @@ -3245,19 +3250,20 @@ static int ravb_resume(struct device *dev) if (!netif_running(ndev)) return 0; + rtnl_lock(); /* If WoL is enabled restore the interface. */ - if (priv->wol_enabled) { + if (priv->wol_enabled) ret = ravb_wol_restore(ndev); - if (ret) - return ret; - } else { + else ret = pm_runtime_force_resume(dev); - if (ret) - return ret; + if (ret) { + rtnl_unlock(); + return ret; } /* Reopening the interface will restore the device to the working state. */ ret = ravb_open(ndev); + rtnl_unlock(); if (ret < 0) goto out_rpm_put; From b95102215a8d0987789715ce11c0d4ec031cbfbe Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 29 Jan 2025 10:50:47 +0100 Subject: [PATCH 82/88] net: sh_eth: Fix missing rtnl lock in suspend/resume path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the suspend/resume path by ensuring the rtnl lock is held where required. Calls to sh_eth_close, sh_eth_open and wol operations must be performed under the rtnl lock to prevent conflicts with ongoing ndo operations. Fixes: b71af04676e9 ("sh_eth: add more PM methods") Tested-by: Niklas Söderlund Reviewed-by: Sergey Shtylyov Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- drivers/net/ethernet/renesas/sh_eth.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 8887b89210093..5fc8027c92c7c 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -3494,10 +3494,12 @@ static int sh_eth_suspend(struct device *dev) netif_device_detach(ndev); + rtnl_lock(); if (mdp->wol_enabled) ret = sh_eth_wol_setup(ndev); else ret = sh_eth_close(ndev); + rtnl_unlock(); return ret; } @@ -3511,10 +3513,12 @@ static int sh_eth_resume(struct device *dev) if (!netif_running(ndev)) return 0; + rtnl_lock(); if (mdp->wol_enabled) ret = sh_eth_wol_restore(ndev); else ret = sh_eth_open(ndev); + rtnl_unlock(); if (ret < 0) return ret; From 1b9335a8000fb70742f7db10af314104b6ace220 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 28 Jan 2025 12:26:33 +0100 Subject: [PATCH 83/88] netfilter: nf_tables: reject mismatching sum of field_len with set key length The field length description provides the length of each separated key field in the concatenation, each field gets rounded up to 32-bits to calculate the pipapo rule width from pipapo_init(). The set key length provides the total size of the key aligned to 32-bits. Register-based arithmetics still allows for combining mismatching set key length and field length description, eg. set key length 10 and field description [ 5, 4 ] leading to pipapo width of 12. Cc: stable@vger.kernel.org Fixes: 3ce67e3793f4 ("netfilter: nf_tables: do not allow mismatch field size and set key length") Reported-by: Noam Rathaus Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c4af283356e74..e5662dc087c8c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5065,7 +5065,7 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr, static int nft_set_desc_concat(struct nft_set_desc *desc, const struct nlattr *nla) { - u32 num_regs = 0, key_num_regs = 0; + u32 len = 0, num_regs; struct nlattr *attr; int rem, err, i; @@ -5079,12 +5079,12 @@ static int nft_set_desc_concat(struct nft_set_desc *desc, } for (i = 0; i < desc->field_count; i++) - num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32)); + len += round_up(desc->field_len[i], sizeof(u32)); - key_num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); - if (key_num_regs != num_regs) + if (len != desc->klen) return -EINVAL; + num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); if (num_regs > NFT_REG32_COUNT) return -E2BIG; From e598d8981fd34470b78a1ae777dbf131b15d5bf2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 29 Jan 2025 13:24:32 +0100 Subject: [PATCH 84/88] mptcp: blackhole only if 1st SYN retrans w/o MPC is accepted The Fixes commit mentioned this: > An MPTCP firewall blackhole can be detected if the following SYN > retransmission after a fallback to "plain" TCP is accepted. But in fact, this blackhole was detected if any following SYN retransmissions after a fallback to TCP was accepted. That's because 'mptcp_subflow_early_fallback()' will set 'request_mptcp' to 0, and 'mpc_drop' will never be reset to 0 after. This is an issue, because some not so unusual situations might cause the kernel to detect a false-positive blackhole, e.g. a client trying to connect to a server while the network is not ready yet, causing a few SYN retransmissions, before reaching the end server. Fixes: 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/ctrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 3999e0ba2c35b..2dd81e6c26bdb 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -418,9 +418,9 @@ void mptcp_active_detect_blackhole(struct sock *ssk, bool expired) MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP); subflow->mpc_drop = 1; mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow); - } else { - subflow->mpc_drop = 0; } + } else if (ssk->sk_state == TCP_SYN_SENT) { + subflow->mpc_drop = 0; } } From 18da4b5d123285dea470b15ff51c7fbe61dc37fd Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 29 Jan 2025 13:24:33 +0100 Subject: [PATCH 85/88] doc: mptcp: sysctl: blackhole_timeout is per-netns All other sysctl entries mention it, and it is a per-namespace sysctl. So mention it as well. Fixes: 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- Documentation/networking/mptcp-sysctl.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index dc45c02113537..03e1d3610333e 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -41,7 +41,7 @@ blackhole_timeout - INTEGER (seconds) MPTCP is re-enabled and will reset to the initial value when the blackhole issue goes away. - 0 to disable the blackhole detection. + 0 to disable the blackhole detection. This is a per-namespace sysctl. Default: 3600 From 0f5697f1a3f99bc2b674b8aa3c5da822c5673c11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Jan 2025 13:00:07 +0000 Subject: [PATCH 86/88] net: hsr: fix fill_frame_info() regression vs VLAN packets Stephan Wurm reported that my recent patch broke VLAN support. Apparently skb->mac_len is not correct for VLAN traffic as shown by debug traces [1]. Use instead pskb_may_pull() to make sure the expected header is present in skb->head. Many thanks to Stephan for his help. [1] kernel: skb len=170 headroom=2 headlen=170 tailroom=20 mac=(2,14) mac_len=14 net=(16,-1) trans=-1 shinfo(txflags=0 nr_frags=0 gso(size=0 type=0 segs=0)) csum(0x0 start=0 offset=0 ip_summed=0 complete_sw=0 valid=0 level=0) hash(0x0 sw=0 l4=0) proto=0x0000 pkttype=0 iif=0 priority=0x0 mark=0x0 alloc_cpu=0 vlan_all=0x0 encapsulation=0 inner(proto=0x0000, mac=0, net=0, trans=0) kernel: dev name=prp0 feat=0x0000000000007000 kernel: sk family=17 type=3 proto=0 kernel: skb headroom: 00000000: 74 00 kernel: skb linear: 00000000: 01 0c cd 01 00 01 00 d0 93 53 9c cb 81 00 80 00 kernel: skb linear: 00000010: 88 b8 00 01 00 98 00 00 00 00 61 81 8d 80 16 52 kernel: skb linear: 00000020: 45 47 44 4e 43 54 52 4c 2f 4c 4c 4e 30 24 47 4f kernel: skb linear: 00000030: 24 47 6f 43 62 81 01 14 82 16 52 45 47 44 4e 43 kernel: skb linear: 00000040: 54 52 4c 2f 4c 4c 4e 30 24 44 73 47 6f 6f 73 65 kernel: skb linear: 00000050: 83 07 47 6f 49 64 65 6e 74 84 08 67 8d f5 93 7e kernel: skb linear: 00000060: 76 c8 00 85 01 01 86 01 00 87 01 00 88 01 01 89 kernel: skb linear: 00000070: 01 00 8a 01 02 ab 33 a2 15 83 01 00 84 03 03 00 kernel: skb linear: 00000080: 00 91 08 67 8d f5 92 77 4b c6 1f 83 01 00 a2 1a kernel: skb linear: 00000090: a2 06 85 01 00 83 01 00 84 03 03 00 00 91 08 67 kernel: skb linear: 000000a0: 8d f5 92 77 4b c6 1f 83 01 00 kernel: skb tailroom: 00000000: 80 18 02 00 fe 4e 00 00 01 01 08 0a 4f fd 5e d1 kernel: skb tailroom: 00000010: 4f fd 5e cd Fixes: b9653d19e556 ("net: hsr: avoid potential out-of-bound access in fill_frame_info()") Reported-by: Stephan Wurm Tested-by: Stephan Wurm Closes: https://lore.kernel.org/netdev/Z4o_UC0HweBHJ_cw@PC-LX-SteWu/ Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250129130007.644084-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/hsr/hsr_forward.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 87bb3a91598ee..a4bacf1985558 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -700,9 +700,12 @@ static int fill_frame_info(struct hsr_frame_info *frame, frame->is_vlan = true; if (frame->is_vlan) { - if (skb->mac_len < offsetofend(struct hsr_vlan_ethhdr, vlanhdr)) + /* Note: skb->mac_len might be wrong here. */ + if (!pskb_may_pull(skb, + skb_mac_offset(skb) + + offsetofend(struct hsr_vlan_ethhdr, vlanhdr))) return -EINVAL; - vlan_hdr = (struct hsr_vlan_ethhdr *)ethhdr; + vlan_hdr = (struct hsr_vlan_ethhdr *)skb_mac_header(skb); proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; } From e759e1e4a4bd2926d082afe56046a90224433a31 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Jan 2025 14:27:26 +0000 Subject: [PATCH 87/88] net: revert RTNL changes in unregister_netdevice_many_notify() This patch reverts following changes: 83419b61d187 net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 2) ae646f1a0bb9 net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 1) cfa579f66656 net: no longer hold RTNL while calling flush_all_backlogs() This caused issues in layers holding a private mutex: cleanup_net() rtnl_lock(); mutex_lock(subsystem_mutex); unregister_netdevice(); rtnl_unlock(); // LOCKDEP violation rtnl_lock(); I will revisit this in next cycle, opt-in for the new behavior from safe contexts only. Fixes: cfa579f66656 ("net: no longer hold RTNL while calling flush_all_backlogs()") Fixes: ae646f1a0bb9 ("net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 1)") Fixes: 83419b61d187 ("net: reduce RTNL hold duration in unregister_netdevice_many_notify() (part 2)") Reported-by: syzbot+5b9196ecf74447172a9a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6789d55f.050a0220.20d369.004e.GAE@google.com/ Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250129142726.747726-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 33 +++------------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 02cd75b512f92..c0021cbd28fc1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10264,37 +10264,14 @@ static bool from_cleanup_net(void) #endif } -static void rtnl_drop_if_cleanup_net(void) -{ - if (from_cleanup_net()) - __rtnl_unlock(); -} - -static void rtnl_acquire_if_cleanup_net(void) -{ - if (from_cleanup_net()) - rtnl_lock(); -} - /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); -static LIST_HEAD(net_todo_list_for_cleanup_net); - -/* TODO: net_todo_list/net_todo_list_for_cleanup_net should probably - * be provided by callers, instead of being static, rtnl protected. - */ -static struct list_head *todo_list(void) -{ - return from_cleanup_net() ? &net_todo_list_for_cleanup_net : - &net_todo_list; -} - DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { - list_add_tail(&dev->todo_list, todo_list()); + list_add_tail(&dev->todo_list, &net_todo_list); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, @@ -11144,7 +11121,7 @@ void netdev_run_todo(void) #endif /* Snapshot list, allow later requests */ - list_replace_init(todo_list(), &list); + list_replace_init(&net_todo_list, &list); __rtnl_unlock(); @@ -11789,11 +11766,9 @@ void unregister_netdevice_many_notify(struct list_head *head, WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); netdev_unlock(dev); } - - rtnl_drop_if_cleanup_net(); flush_all_backlogs(); + synchronize_net(); - rtnl_acquire_if_cleanup_net(); list_for_each_entry(dev, head, unreg_list) { struct sk_buff *skb = NULL; @@ -11853,9 +11828,7 @@ void unregister_netdevice_many_notify(struct list_head *head, #endif } - rtnl_drop_if_cleanup_net(); synchronize_net(); - rtnl_acquire_if_cleanup_net(); list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); From d7dda216ca49f1ac214cb577cbeeee760a2b425b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Jan 2025 11:13:32 -0800 Subject: [PATCH 88/88] MAINTAINERS: add Neal to TCP maintainers Neal Cardwell has been indispensable in TCP reviews and investigations, especially protocol-related. Neal is also the author of packetdrill. Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250129191332.2526140-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index d996f393fcf42..720c692b9dfba 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16525,6 +16525,7 @@ F: tools/testing/selftests/net/mptcp/ NETWORKING [TCP] M: Eric Dumazet +M: Neal Cardwell L: netdev@vger.kernel.org S: Maintained F: Documentation/networking/net_cachelines/tcp_sock.rst