From dddeeaa16ce9d163ccf3b681715512d338afa541 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 5 Mar 2025 18:09:00 +0000 Subject: [PATCH 01/56] igc: Fix XSK queue NAPI ID mapping In commit b65969856d4f ("igc: Link queues to NAPI instances"), the XSK queues were incorrectly unmapped from their NAPI instances. After discussion on the mailing list and the introduction of a test to codify the expected behavior, we can see that the unmapping causes the check_xsk test to fail: NETIF=enp86s0 ./tools/testing/selftests/drivers/net/queues.py [...] # Check| ksft_eq(q.get('xsk', None), {}, # Check failed None != {} xsk attr on queue we configured not ok 4 queues.check_xsk After this commit, the test passes: ok 4 queues.check_xsk Note that the test itself is only in net-next, so I tested this change by applying it to my local net-next tree, booting, and running the test. Cc: stable@vger.kernel.org Fixes: b65969856d4f ("igc: Link queues to NAPI instances") Signed-off-by: Joe Damato Reviewed-by: Gerhard Engleder Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 2 -- drivers/net/ethernet/intel/igc/igc_main.c | 4 ++-- drivers/net/ethernet/intel/igc/igc_xdp.c | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index cd1d7b6c1782..c35cc5cb1185 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -337,8 +337,6 @@ struct igc_adapter { struct igc_led_classdev *leds; }; -void igc_set_queue_napi(struct igc_adapter *adapter, int q_idx, - struct napi_struct *napi); void igc_up(struct igc_adapter *adapter); void igc_down(struct igc_adapter *adapter); int igc_open(struct net_device *netdev); diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 491d942cefca..27c99ff59ed4 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -5022,8 +5022,8 @@ static int igc_sw_init(struct igc_adapter *adapter) return 0; } -void igc_set_queue_napi(struct igc_adapter *adapter, int vector, - struct napi_struct *napi) +static void igc_set_queue_napi(struct igc_adapter *adapter, int vector, + struct napi_struct *napi) { struct igc_q_vector *q_vector = adapter->q_vector[vector]; diff --git a/drivers/net/ethernet/intel/igc/igc_xdp.c b/drivers/net/ethernet/intel/igc/igc_xdp.c index c538e6b18aad..9eb47b4beb06 100644 --- a/drivers/net/ethernet/intel/igc/igc_xdp.c +++ b/drivers/net/ethernet/intel/igc/igc_xdp.c @@ -97,7 +97,6 @@ static int igc_xdp_enable_pool(struct igc_adapter *adapter, napi_disable(napi); } - igc_set_queue_napi(adapter, queue_id, NULL); set_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags); set_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags); @@ -147,7 +146,6 @@ static int igc_xdp_disable_pool(struct igc_adapter *adapter, u16 queue_id) xsk_pool_dma_unmap(pool, IGC_RX_DMA_ATTR); clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags); clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags); - igc_set_queue_napi(adapter, queue_id, napi); if (needs_reset) { napi_enable(napi); From d931cf9b38da0f533cacfe51c863a9912e67822f Mon Sep 17 00:00:00 2001 From: Zdenek Bouska Date: Wed, 19 Mar 2025 15:18:48 +0100 Subject: [PATCH 02/56] igc: Fix TX drops in XDP ZC Fixes TX frame drops in AF_XDP zero copy mode when budget < 4. xsk_tx_peek_desc() consumed TX frame and it was ignored because of low budget. Not even AF_XDP completion was done for dropped frames. It can be reproduced on i226 by sending 100000x 60 B frames with launch time set to minimal IPG (672 ns between starts of frames) on 1Gbit/s. Always 1026 frames are not sent and are missing a completion. Fixes: 9acf59a752d4c ("igc: Enable TX via AF_XDP zero-copy") Signed-off-by: Zdenek Bouska Reviewed-by: Song Yoong Siang Reviewed-by: Florian Bezdeka Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 27c99ff59ed4..156d123c0e21 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -3042,7 +3042,7 @@ static void igc_xdp_xmit_zc(struct igc_ring *ring) * descriptors. Therefore, to be safe, we always ensure we have at least * 4 descriptors available. */ - while (xsk_tx_peek_desc(pool, &xdp_desc) && budget >= 4) { + while (budget >= 4 && xsk_tx_peek_desc(pool, &xdp_desc)) { struct igc_metadata_request meta_req; struct xsk_tx_metadata *meta = NULL; struct igc_tx_buffer *bi; From efaaf344bc2917cbfa5997633bc18a05d3aed27f Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Thu, 13 Mar 2025 16:05:56 +0200 Subject: [PATCH 03/56] e1000e: change k1 configuration on MTP and later platforms Starting from Meteor Lake, the Kumeran interface between the integrated MAC and the I219 PHY works at a different frequency. This causes sporadic MDI errors when accessing the PHY, and in rare circumstances could lead to packet corruption. To overcome this, introduce minor changes to the Kumeran idle state (K1) parameters during device initialization. Hardware reset reverts this configuration, therefore it needs to be applied in a few places. Fixes: cc23f4f0b6b9 ("e1000e: Add support for Meteor Lake") Signed-off-by: Vitaly Lifshits Tested-by: Avigail Dahan Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/defines.h | 3 + drivers/net/ethernet/intel/e1000e/ich8lan.c | 80 +++++++++++++++++++-- drivers/net/ethernet/intel/e1000e/ich8lan.h | 4 ++ 3 files changed, 82 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h index 5e2cfa73f889..8294a7c4f122 100644 --- a/drivers/net/ethernet/intel/e1000e/defines.h +++ b/drivers/net/ethernet/intel/e1000e/defines.h @@ -803,4 +803,7 @@ /* SerDes Control */ #define E1000_GEN_POLL_TIMEOUT 640 +#define E1000_FEXTNVM12_PHYPD_CTRL_MASK 0x00C00000 +#define E1000_FEXTNVM12_PHYPD_CTRL_P1 0x00800000 + #endif /* _E1000_DEFINES_H_ */ diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 2f9655cf5dd9..364378133526 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -285,6 +285,45 @@ static void e1000_toggle_lanphypc_pch_lpt(struct e1000_hw *hw) } } +/** + * e1000_reconfigure_k1_exit_timeout - reconfigure K1 exit timeout to + * align to MTP and later platform requirements. + * @hw: pointer to the HW structure + * + * Context: PHY semaphore must be held by caller. + * Return: 0 on success, negative on failure + */ +static s32 e1000_reconfigure_k1_exit_timeout(struct e1000_hw *hw) +{ + u16 phy_timeout; + u32 fextnvm12; + s32 ret_val; + + if (hw->mac.type < e1000_pch_mtp) + return 0; + + /* Change Kumeran K1 power down state from P0s to P1 */ + fextnvm12 = er32(FEXTNVM12); + fextnvm12 &= ~E1000_FEXTNVM12_PHYPD_CTRL_MASK; + fextnvm12 |= E1000_FEXTNVM12_PHYPD_CTRL_P1; + ew32(FEXTNVM12, fextnvm12); + + /* Wait for the interface the settle */ + usleep_range(1000, 1100); + + /* Change K1 exit timeout */ + ret_val = e1e_rphy_locked(hw, I217_PHY_TIMEOUTS_REG, + &phy_timeout); + if (ret_val) + return ret_val; + + phy_timeout &= ~I217_PHY_TIMEOUTS_K1_EXIT_TO_MASK; + phy_timeout |= 0xF00; + + return e1e_wphy_locked(hw, I217_PHY_TIMEOUTS_REG, + phy_timeout); +} + /** * e1000_init_phy_workarounds_pchlan - PHY initialization workarounds * @hw: pointer to the HW structure @@ -327,15 +366,22 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw) * LANPHYPC Value bit to force the interconnect to PCIe mode. */ switch (hw->mac.type) { + case e1000_pch_mtp: + case e1000_pch_lnp: + case e1000_pch_ptp: + case e1000_pch_nvp: + /* At this point the PHY might be inaccessible so don't + * propagate the failure + */ + if (e1000_reconfigure_k1_exit_timeout(hw)) + e_dbg("Failed to reconfigure K1 exit timeout\n"); + + fallthrough; case e1000_pch_lpt: case e1000_pch_spt: case e1000_pch_cnp: case e1000_pch_tgp: case e1000_pch_adp: - case e1000_pch_mtp: - case e1000_pch_lnp: - case e1000_pch_ptp: - case e1000_pch_nvp: if (e1000_phy_is_accessible_pchlan(hw)) break; @@ -419,8 +465,20 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw) * the PHY is in. */ ret_val = hw->phy.ops.check_reset_block(hw); - if (ret_val) + if (ret_val) { e_err("ME blocked access to PHY after reset\n"); + goto out; + } + + if (hw->mac.type >= e1000_pch_mtp) { + ret_val = hw->phy.ops.acquire(hw); + if (ret_val) { + e_err("Failed to reconfigure K1 exit timeout\n"); + goto out; + } + ret_val = e1000_reconfigure_k1_exit_timeout(hw); + hw->phy.ops.release(hw); + } } out: @@ -4888,6 +4946,18 @@ static s32 e1000_init_hw_ich8lan(struct e1000_hw *hw) u16 i; e1000_initialize_hw_bits_ich8lan(hw); + if (hw->mac.type >= e1000_pch_mtp) { + ret_val = hw->phy.ops.acquire(hw); + if (ret_val) + return ret_val; + + ret_val = e1000_reconfigure_k1_exit_timeout(hw); + hw->phy.ops.release(hw); + if (ret_val) { + e_dbg("Error failed to reconfigure K1 exit timeout\n"); + return ret_val; + } + } /* Initialize identification LED */ ret_val = mac->ops.id_led_init(hw); diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.h b/drivers/net/ethernet/intel/e1000e/ich8lan.h index 2504b11c3169..5feb589a9b5f 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.h +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.h @@ -219,6 +219,10 @@ #define I217_PLL_CLOCK_GATE_REG PHY_REG(772, 28) #define I217_PLL_CLOCK_GATE_MASK 0x07FF +/* PHY Timeouts */ +#define I217_PHY_TIMEOUTS_REG PHY_REG(770, 21) +#define I217_PHY_TIMEOUTS_K1_EXIT_TO_MASK 0x0FC0 + #define SW_FLAG_TIMEOUT 1000 /* SW Semaphore flag timeout in ms */ /* Inband Control */ From 40206599beec98cfeb01913ee417f015e3f6190c Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Fri, 21 Feb 2025 16:49:17 +0100 Subject: [PATCH 04/56] ixgbe: fix media type detection for E610 device The commit 23c0e5a16bcc ("ixgbe: Add link management support for E610 device") introduced incorrect media type detection for E610 device. It reproduces when advertised speed is modified after driver reload. Clear the previous outdated PHY type high value. Reproduction steps: modprobe ixgbe ethtool -s eth0 advertise 0x1000000000000 modprobe -r ixgbe modprobe ixgbe ethtool -s eth0 advertise 0x1000000000000 Result before the fix: netlink error: link settings update failed netlink error: Invalid argument Result after the fix: No output error Fixes: 23c0e5a16bcc ("ixgbe: Add link management support for E610 device") Reviewed-by: Przemek Kitszel Reviewed-by: Paul Menzel Signed-off-by: Piotr Kwapulinski Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index cb07ecd8937d..00935747c8c5 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -1453,9 +1453,11 @@ enum ixgbe_media_type ixgbe_get_media_type_e610(struct ixgbe_hw *hw) hw->link.link_info.phy_type_low = 0; } else { highest_bit = fls64(le64_to_cpu(pcaps.phy_type_low)); - if (highest_bit) + if (highest_bit) { hw->link.link_info.phy_type_low = BIT_ULL(highest_bit - 1); + hw->link.link_info.phy_type_high = 0; + } } } From 4c9106f4906a85f6b13542d862e423bcdc118cc3 Mon Sep 17 00:00:00 2001 From: Emil Tantilov Date: Mon, 17 Mar 2025 22:42:02 -0700 Subject: [PATCH 05/56] idpf: fix adapter NULL pointer dereference on reboot With SRIOV enabled, idpf ends up calling into idpf_remove() twice. First via idpf_shutdown() and then again when idpf_remove() calls into sriov_disable(), because the VF devices use the idpf driver, hence the same remove routine. When that happens, it is possible for the adapter to be NULL from the first call to idpf_remove(), leading to a NULL pointer dereference. echo 1 > /sys/class/net//device/sriov_numvfs reboot BUG: kernel NULL pointer dereference, address: 0000000000000020 ... RIP: 0010:idpf_remove+0x22/0x1f0 [idpf] ... ? idpf_remove+0x22/0x1f0 [idpf] ? idpf_remove+0x1e4/0x1f0 [idpf] pci_device_remove+0x3f/0xb0 device_release_driver_internal+0x19f/0x200 pci_stop_bus_device+0x6d/0x90 pci_stop_and_remove_bus_device+0x12/0x20 pci_iov_remove_virtfn+0xbe/0x120 sriov_disable+0x34/0xe0 idpf_sriov_configure+0x58/0x140 [idpf] idpf_remove+0x1b9/0x1f0 [idpf] idpf_shutdown+0x12/0x30 [idpf] pci_device_shutdown+0x35/0x60 device_shutdown+0x156/0x200 ... Replace the direct idpf_remove() call in idpf_shutdown() with idpf_vc_core_deinit() and idpf_deinit_dflt_mbx(), which perform the bulk of the cleanup, such as stopping the init task, freeing IRQs, destroying the vports and freeing the mailbox. This avoids the calls to sriov_disable() in addition to a small netdev cleanup, and destroying workqueues, which don't seem to be required on shutdown. Reported-by: Yuying Ma Fixes: e850efed5e15 ("idpf: add module register and probe functionality") Reviewed-by: Madhu Chittim Signed-off-by: Emil Tantilov Reviewed-by: Simon Horman Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_main.c b/drivers/net/ethernet/intel/idpf/idpf_main.c index b6c515d14cbf..bec4a02c5373 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_main.c +++ b/drivers/net/ethernet/intel/idpf/idpf_main.c @@ -87,7 +87,11 @@ static void idpf_remove(struct pci_dev *pdev) */ static void idpf_shutdown(struct pci_dev *pdev) { - idpf_remove(pdev); + struct idpf_adapter *adapter = pci_get_drvdata(pdev); + + cancel_delayed_work_sync(&adapter->vc_event_task); + idpf_vc_core_deinit(adapter); + idpf_deinit_dflt_mbx(adapter); if (system_state == SYSTEM_POWER_OFF) pci_set_power_state(pdev, PCI_D3hot); From 9d74da1177c800eb3d51c13f9821b7b0683845a5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 21 Mar 2025 23:24:20 +0100 Subject: [PATCH 06/56] netfilter: nft_set_hash: GC reaps elements with conncount for dynamic sets only conncount has its own GC handler which determines when to reap stale elements, this is convenient for dynamic sets. However, this also reaps non-dynamic sets with static configurations coming from control plane. Always run connlimit gc handler but honor feedback to reap element if this set is dynamic. Fixes: 290180e2448c ("netfilter: nf_tables: add connlimit support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_hash.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 8bfac4185ac7..abb0c8ec6371 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -309,7 +309,8 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, nft_setelem_expr_foreach(expr, elem_expr, size) { if (expr->ops->gc && - expr->ops->gc(read_pnet(&set->net), expr)) + expr->ops->gc(read_pnet(&set->net), expr) && + set->flags & NFT_SET_EVAL) return true; } From 688c15017d5cd5aac882400782e7213d40dc3556 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 1 Apr 2025 14:36:47 +0200 Subject: [PATCH 07/56] netfilter: nf_tables: don't unregister hook when table is dormant When nf_tables_updchain encounters an error, hook registration needs to be rolled back. This should only be done if the hook has been registered, which won't happen when the table is flagged as dormant (inactive). Just move the assignment into the registration block. Reported-by: syzbot+53ed3a6440173ddbf499@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=53ed3a6440173ddbf499 Fixes: b9703ed44ffb ("netfilter: nf_tables: support for adding new devices to an existing netdev chain") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c2df81b7e950..a133e1c175ce 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2839,11 +2839,11 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, err = nft_netdev_register_hooks(ctx->net, &hook.list); if (err < 0) goto err_hooks; + + unregister = true; } } - unregister = true; - if (nla[NFTA_CHAIN_COUNTERS]) { if (!nft_is_base_chain(chain)) { err = -EOPNOTSUPP; From 078aabd567de3d63d37d7673f714e309d369e6e2 Mon Sep 17 00:00:00 2001 From: Debin Zhu Date: Tue, 1 Apr 2025 20:40:18 +0800 Subject: [PATCH 08/56] netlabel: Fix NULL pointer exception caused by CALIPSO on IPv4 sockets When calling netlbl_conn_setattr(), addr->sa_family is used to determine the function behavior. If sk is an IPv4 socket, but the connect function is called with an IPv6 address, the function calipso_sock_setattr() is triggered. Inside this function, the following code is executed: sk_fullsock(__sk) ? inet_sk(__sk)->pinet6 : NULL; Since sk is an IPv4 socket, pinet6 is NULL, leading to a null pointer dereference. This patch fixes the issue by checking if inet6_sk(sk) returns a NULL pointer before accessing pinet6. Signed-off-by: Debin Zhu Signed-off-by: Bitao Ouyang <1985755126@qq.com> Acked-by: Paul Moore Fixes: ceba1832b1b2 ("calipso: Set the calipso socket label to match the secattr.") Link: https://patch.msgid.link/20250401124018.4763-1-mowenroot@163.com Signed-off-by: Jakub Kicinski --- net/ipv6/calipso.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index dbcea9fee626..62618a058b8f 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -1072,8 +1072,13 @@ static int calipso_sock_getattr(struct sock *sk, struct ipv6_opt_hdr *hop; int opt_len, len, ret_val = -ENOMSG, offset; unsigned char *opt; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return -EAFNOSUPPORT; + txopts = txopt_get(pinfo); if (!txopts || !txopts->hopopt) goto done; @@ -1125,8 +1130,13 @@ static int calipso_sock_setattr(struct sock *sk, { int ret_val; struct ipv6_opt_hdr *old, *new; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return -EAFNOSUPPORT; + txopts = txopt_get(pinfo); old = NULL; if (txopts) old = txopts->hopopt; @@ -1153,8 +1163,13 @@ static int calipso_sock_setattr(struct sock *sk, static void calipso_sock_delattr(struct sock *sk) { struct ipv6_opt_hdr *new_hop; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return; + txopts = txopt_get(pinfo); if (!txopts || !txopts->hopopt) goto done; From ce8fe975fd99b49c29c42e50f2441ba53112b2e8 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 29 Mar 2025 15:25:35 -0700 Subject: [PATCH 09/56] net_sched: skbprio: Remove overly strict queue assertions In the current implementation, skbprio enqueue/dequeue contains an assertion that fails under certain conditions when SKBPRIO is used as a child qdisc under TBF with specific parameters. The failure occurs because TBF sometimes peeks at packets in the child qdisc without actually dequeuing them when tokens are unavailable. This peek operation creates a discrepancy between the parent and child qdisc queue length counters. When TBF later receives a high-priority packet, SKBPRIO's queue length may show a different value than what's reflected in its internal priority queue tracking, triggering the assertion. The fix removes this overly strict assertions in SKBPRIO, they are not necessary at all. Reported-by: syzbot+a3422a19b05ea96bee18@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=a3422a19b05ea96bee18 Fixes: aea5f654e6b7 ("net/sched: add skbprio scheduler") Cc: Nishanth Devarajan Signed-off-by: Cong Wang Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250329222536.696204-2-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_skbprio.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c index 20ff7386b74b..f485f62ab721 100644 --- a/net/sched/sch_skbprio.c +++ b/net/sched/sch_skbprio.c @@ -123,8 +123,6 @@ static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Check to update highest and lowest priorities. */ if (skb_queue_empty(lp_qdisc)) { if (q->lowest_prio == q->highest_prio) { - /* The incoming packet is the only packet in queue. */ - BUG_ON(sch->q.qlen != 1); q->lowest_prio = prio; q->highest_prio = prio; } else { @@ -156,7 +154,6 @@ static struct sk_buff *skbprio_dequeue(struct Qdisc *sch) /* Update highest priority field. */ if (skb_queue_empty(hpq)) { if (q->lowest_prio == q->highest_prio) { - BUG_ON(sch->q.qlen); q->highest_prio = 0; q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; } else { From 076c700988938e02d51c018095d33b339cdb7ffd Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 29 Mar 2025 15:25:36 -0700 Subject: [PATCH 10/56] selftests: tc-testing: Add TBF with SKBPRIO queue length corner case test Add a test case to validate the interaction between TBF and SKBPRIO queueing disciplines, specifically targeting queue length accounting corner cases. This test complements the fix for the queue length accounting issue in the SKBPRIO qdisc. This is still best-effort, as timing and manipulating enqueue and dequeue from user-space is very hard. Cc: Pedro Tammela Signed-off-by: Cong Wang Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250329222536.696204-3-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/infra/qdiscs.json | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 9044ac054167..25454fd95537 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -126,5 +126,37 @@ "$TC qdisc del dev $DUMMY root handle 1: drr", "$IP addr del 10.10.10.10/24 dev $DUMMY" ] - } + }, + { + "id": "c024", + "name": "Test TBF with SKBPRIO - catch qlen corner cases", + "category": [ + "qdisc", + "tbf", + "skbprio" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY handle 1: root tbf rate 100bit burst 2000 limit 1000", + "$TC qdisc add dev $DUMMY parent 1: handle 10: skbprio limit 1", + "ping -c 1 -W 0.1 -Q 0x00 -s 1400 -I $DUMMY 10.10.10.1 > /dev/null || true", + "ping -c 1 -W 0.1 -Q 0x1c -s 1400 -I $DUMMY 10.10.10.1 > /dev/null || true", + "ping -c 1 -W 0.1 -Q 0x00 -s 1400 -I $DUMMY 10.10.10.1 > /dev/null || true", + "ping -c 1 -W 0.1 -Q 0x1c -s 1400 -I $DUMMY 10.10.10.1 > /dev/null || true", + "sleep 0.5" + ], + "cmdUnderTest": "$TC -s qdisc show dev $DUMMY", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY | grep -A 5 'qdisc skbprio'", + "matchPattern": "dropped [1-9][0-9]*", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] + } ] From 10206302af856791fbcc27a33ed3c3eb09b2793d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Mar 2025 09:15:32 +0000 Subject: [PATCH 11/56] sctp: add mutual exclusion in proc_sctp_do_udp_port() We must serialize calls to sctp_udp_sock_stop() and sctp_udp_sock_start() or risk a crash as syzbot reported: Oops: general protection fault, probably for non-canonical address 0xdffffc000000000d: 0000 [#1] SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000068-0x000000000000006f] CPU: 1 UID: 0 PID: 6551 Comm: syz.1.44 Not tainted 6.14.0-syzkaller-g7f2ff7b62617 #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025 RIP: 0010:kernel_sock_shutdown+0x47/0x70 net/socket.c:3653 Call Trace: udp_tunnel_sock_release+0x68/0x80 net/ipv4/udp_tunnel_core.c:181 sctp_udp_sock_stop+0x71/0x160 net/sctp/protocol.c:930 proc_sctp_do_udp_port+0x264/0x450 net/sctp/sysctl.c:553 proc_sys_call_handler+0x3d0/0x5b0 fs/proc/proc_sysctl.c:601 iter_file_splice_write+0x91c/0x1150 fs/splice.c:738 do_splice_from fs/splice.c:935 [inline] direct_splice_actor+0x18f/0x6c0 fs/splice.c:1158 splice_direct_to_actor+0x342/0xa30 fs/splice.c:1102 do_splice_direct_actor fs/splice.c:1201 [inline] do_splice_direct+0x174/0x240 fs/splice.c:1227 do_sendfile+0xafd/0xe50 fs/read_write.c:1368 __do_sys_sendfile64 fs/read_write.c:1429 [inline] __se_sys_sendfile64 fs/read_write.c:1415 [inline] __x64_sys_sendfile64+0x1d8/0x220 fs/read_write.c:1415 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] Fixes: 046c052b475e ("sctp: enable udp tunneling socks") Reported-by: syzbot+fae49d997eb56fa7c74d@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/67ea5c01.050a0220.1547ec.012b.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Marcelo Ricardo Leitner Acked-by: Xin Long Link: https://patch.msgid.link/20250331091532.224982-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 8e1e97be4df7..ee3eac338a9d 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -525,6 +525,8 @@ static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, return ret; } +static DEFINE_MUTEX(sctp_sysctl_mutex); + static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -549,6 +551,7 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, if (new_value > max || new_value < min) return -EINVAL; + mutex_lock(&sctp_sysctl_mutex); net->sctp.udp_port = new_value; sctp_udp_sock_stop(net); if (new_value) { @@ -561,6 +564,7 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, lock_sock(sk); sctp_sk(sk)->udp_port = htons(net->sctp.udp_port); release_sock(sk); + mutex_unlock(&sctp_sysctl_mutex); } return ret; From 57b290d97c6150774bf929117ca737a26d8fc33d Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 31 Mar 2025 08:52:53 +0200 Subject: [PATCH 12/56] net: airoha: Fix qid report in airoha_tc_get_htb_get_leaf_queue() Fix the following kernel warning deleting HTB offloaded leafs and/or root HTB qdisc in airoha_eth driver properly reporting qid in airoha_tc_get_htb_get_leaf_queue routine. $tc qdisc replace dev eth1 root handle 10: htb offload $tc class add dev eth1 arent 10: classid 10:4 htb rate 100mbit ceil 100mbit $tc qdisc replace dev eth1 parent 10:4 handle 4: ets bands 8 \ quanta 1514 3028 4542 6056 7570 9084 10598 12112 $tc qdisc del dev eth1 root [ 55.827864] ------------[ cut here ]------------ [ 55.832493] WARNING: CPU: 3 PID: 2678 at 0xffffffc0798695a4 [ 55.956510] CPU: 3 PID: 2678 Comm: tc Tainted: G O 6.6.71 #0 [ 55.963557] Hardware name: Airoha AN7581 Evaluation Board (DT) [ 55.969383] pstate: 20400005 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 55.976344] pc : 0xffffffc0798695a4 [ 55.979851] lr : 0xffffffc079869a20 [ 55.983358] sp : ffffffc0850536a0 [ 55.986665] x29: ffffffc0850536a0 x28: 0000000000000024 x27: 0000000000000001 [ 55.993800] x26: 0000000000000000 x25: ffffff8008b19000 x24: ffffff800222e800 [ 56.000935] x23: 0000000000000001 x22: 0000000000000000 x21: ffffff8008b19000 [ 56.008071] x20: ffffff8002225800 x19: ffffff800379d000 x18: 0000000000000000 [ 56.015206] x17: ffffffbf9ea59000 x16: ffffffc080018000 x15: 0000000000000000 [ 56.022342] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000001 [ 56.029478] x11: ffffffc081471008 x10: ffffffc081575a98 x9 : 0000000000000000 [ 56.036614] x8 : ffffffc08167fd40 x7 : ffffffc08069e104 x6 : ffffff8007f86000 [ 56.043748] x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000001 [ 56.050884] x2 : 0000000000000000 x1 : 0000000000000250 x0 : ffffff800222c000 [ 56.058020] Call trace: [ 56.060459] 0xffffffc0798695a4 [ 56.063618] 0xffffffc079869a20 [ 56.066777] __qdisc_destroy+0x40/0xa0 [ 56.070528] qdisc_put+0x54/0x6c [ 56.073748] qdisc_graft+0x41c/0x648 [ 56.077324] tc_get_qdisc+0x168/0x2f8 [ 56.080978] rtnetlink_rcv_msg+0x230/0x330 [ 56.085076] netlink_rcv_skb+0x5c/0x128 [ 56.088913] rtnetlink_rcv+0x14/0x1c [ 56.092490] netlink_unicast+0x1e0/0x2c8 [ 56.096413] netlink_sendmsg+0x198/0x3c8 [ 56.100337] ____sys_sendmsg+0x1c4/0x274 [ 56.104261] ___sys_sendmsg+0x7c/0xc0 [ 56.107924] __sys_sendmsg+0x44/0x98 [ 56.111492] __arm64_sys_sendmsg+0x20/0x28 [ 56.115580] invoke_syscall.constprop.0+0x58/0xfc [ 56.120285] do_el0_svc+0x3c/0xbc [ 56.123592] el0_svc+0x18/0x4c [ 56.126647] el0t_64_sync_handler+0x118/0x124 [ 56.131005] el0t_64_sync+0x150/0x154 [ 56.134660] ---[ end trace 0000000000000000 ]--- Fixes: ef1ca9271313b ("net: airoha: Add sched HTB offload support") Signed-off-by: Lorenzo Bianconi Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250331-airoha-htb-qdisc-offload-del-fix-v1-1-4ea429c2c968@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index c0a642568ac1..20a96cafc748 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -2358,7 +2358,7 @@ static int airoha_tc_get_htb_get_leaf_queue(struct airoha_gdm_port *port, return -EINVAL; } - opt->qid = channel; + opt->qid = AIROHA_NUM_TX_RING + channel; return 0; } From 367579274f60cb23c570ae5348966ab51e1509a4 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 31 Mar 2025 18:17:31 +0200 Subject: [PATCH 13/56] net: airoha: Fix ETS priomap validation ETS Qdisc schedules SP bands in a priority order assigning band-0 the highest priority (band-0 > band-1 > .. > band-n) while EN7581 arranges SP bands in a priority order assigning band-7 the highest priority (band-7 > band-6, .. > band-n). Fix priomap check in airoha_qdma_set_tx_ets_sched routine in order to align ETS Qdisc and airoha_eth driver SP priority ordering. Fixes: b56e4d660a96 ("net: airoha: Enforce ETS Qdisc priomap") Signed-off-by: Lorenzo Bianconi Reviewed-by: Simon Horman Reviewed-by: Davide Caratti Link: https://patch.msgid.link/20250331-airoha-ets-validate-priomap-v1-1-60a524488672@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index 20a96cafc748..69e523dd4186 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -2028,7 +2028,7 @@ static int airoha_qdma_set_tx_ets_sched(struct airoha_gdm_port *port, struct tc_ets_qopt_offload_replace_params *p = &opt->replace_params; enum tx_sched_mode mode = TC_SCH_SP; u16 w[AIROHA_NUM_QOS_QUEUES] = {}; - int i, nstrict = 0, nwrr, qidx; + int i, nstrict = 0; if (p->bands > AIROHA_NUM_QOS_QUEUES) return -EINVAL; @@ -2046,17 +2046,17 @@ static int airoha_qdma_set_tx_ets_sched(struct airoha_gdm_port *port, * lowest priorities with respect to SP ones. * e.g: WRR0, WRR1, .., WRRm, SP0, SP1, .., SPn */ - nwrr = p->bands - nstrict; - qidx = nstrict && nwrr ? nstrict : 0; - for (i = 1; i <= p->bands; i++) { - if (p->priomap[i % AIROHA_NUM_QOS_QUEUES] != qidx) + for (i = 0; i < nstrict; i++) { + if (p->priomap[p->bands - i - 1] != i) return -EINVAL; - - qidx = i == nwrr ? 0 : qidx + 1; } - for (i = 0; i < nwrr; i++) + for (i = 0; i < p->bands - nstrict; i++) { + if (p->priomap[i] != nstrict + i) + return -EINVAL; + w[i] = p->weights[nstrict + i]; + } if (!nstrict) mode = TC_SCH_WRR8; From d996e412b2dfc079bd44bff5b3bc743fdb6d7c90 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 31 Mar 2025 07:28:14 -0700 Subject: [PATCH 14/56] bpf: add missing ops lock around dev_xdp_attach_link Syzkaller points out that create_link path doesn't grab ops lock, add it. Reported-by: syzbot+08936936fe8132f91f1a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/67e6b3e8.050a0220.2f068f.0079.GAE@google.com/ Fixes: 97246d6d21c2 ("net: hold netdev instance lock during ndo_bpf") Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250331142814.1887506-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index be17e0660144..5d20ff226d5e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10284,7 +10284,9 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } + netdev_lock_ops(dev); err = dev_xdp_attach_link(dev, &extack, link); + netdev_unlock_ops(dev); rtnl_unlock(); if (err) { From 5bbcb5902e1c7193b5ffee13251ec92878aae0e5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 31 Mar 2025 17:15:20 -0700 Subject: [PATCH 15/56] MAINTAINERS: update Open vSwitch maintainers Pravin has not been active for a while, missingmaints reports: Subsystem OPENVSWITCH Changes 138 / 253 (54%) (No activity) Top reviewers: [41]: aconole@redhat.com [31]: horms@kernel.org [23]: echaudro@redhat.com [8]: fw@strlen.de [6]: i.maximets@ovn.org INACTIVE MAINTAINER Pravin B Shelar Let's elevate Aaron, Eelco and Ilya to the status of maintainers. Acked-by: Aaron Conole Acked-by: Ilya Maximets Acked-by: Eelco Chaudron Acked-by: Simon Horman Link: https://patch.msgid.link/20250401001520.2080231-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index 660d5e67af7d..0dabce0f03f0 100644 --- a/CREDITS +++ b/CREDITS @@ -3666,6 +3666,10 @@ S: 149 Union St. S: Kingston, Ontario S: Canada K7L 2P4 +N: Pravin B Shelar +E: pshelar@ovn.org +D: Open vSwitch maintenance and contributions + N: John Shifflett E: john@geolog.com E: jshiffle@netcom.com diff --git a/MAINTAINERS b/MAINTAINERS index e0045ac4327b..ff882416c445 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18097,7 +18097,9 @@ F: drivers/irqchip/irq-ompic.c F: drivers/irqchip/irq-or1k-* OPENVSWITCH -M: Pravin B Shelar +M: Aaron Conole +M: Eelco Chaudron +M: Ilya Maximets L: netdev@vger.kernel.org L: dev@openvswitch.org S: Maintained From d3210dabda8dd0477f3a6301dcaf9ed44aeccd3c Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Mon, 31 Mar 2025 18:53:15 -0700 Subject: [PATCH 16/56] eth: mlx4: select PAGE_POOL With commit 8533b14b3d65 ("eth: mlx4: create a page pool for Rx") mlx4 started using functions guarded by PAGE_POOL. This change introduced build errors when CONFIG_MLX4_EN is set but CONFIG_PAGE_POOL is not: ld: vmlinux.o: in function `mlx4_en_alloc_frags': en_rx.c:(.text+0xa5eaf9): undefined reference to `page_pool_alloc_pages' ld: vmlinux.o: in function `mlx4_en_create_rx_ring': (.text+0xa5ee91): undefined reference to `page_pool_create' Make MLX4_EN select PAGE_POOL to fix the ml;x4 build errors. Fixes: 8533b14b3d65 ("eth: mlx4: create a page pool for Rx") Signed-off-by: Greg Thelen Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250401015315.2306092-1-gthelen@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx4/Kconfig b/drivers/net/ethernet/mellanox/mlx4/Kconfig index 825e05fb8607..0b1cb340206f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx4/Kconfig @@ -7,6 +7,7 @@ config MLX4_EN tristate "Mellanox Technologies 1/10/40Gbit Ethernet support" depends on PCI && NETDEVICES && ETHERNET && INET depends on PTP_1588_CLOCK_OPTIONAL + select PAGE_POOL select MLX4_CORE help This driver supports Mellanox Technologies ConnectX Ethernet From 96844075226b49af25a69a1d084b648ec2d9b08d Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Tue, 1 Apr 2025 08:58:04 +0200 Subject: [PATCH 17/56] net: mvpp2: Prevent parser TCAM memory corruption Protect the parser TCAM/SRAM memory, and the cached (shadow) SRAM information, from concurrent modifications. Both the TCAM and SRAM tables are indirectly accessed by configuring an index register that selects the row to read or write to. This means that operations must be atomic in order to, e.g., avoid spreading writes across multiple rows. Since the shadow SRAM array is used to find free rows in the hardware table, it must also be protected in order to avoid TOCTOU errors where multiple cores allocate the same row. This issue was detected in a situation where `mvpp2_set_rx_mode()` ran concurrently on two CPUs. In this particular case the MVPP2_PE_MAC_UC_PROMISCUOUS entry was corrupted, causing the classifier unit to drop all incoming unicast - indicated by the `rx_classifier_drops` counter. Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Signed-off-by: Tobias Waldekranz Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/20250401065855.3113635-1-tobias@waldekranz.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2.h | 3 + .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 3 +- .../net/ethernet/marvell/mvpp2/mvpp2_prs.c | 201 ++++++++++++------ 3 files changed, 140 insertions(+), 67 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h index 44fe9b68d1c2..061fcd444d50 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h @@ -1113,6 +1113,9 @@ struct mvpp2 { /* Spinlocks for CM3 shared memory configuration */ spinlock_t mss_spinlock; + + /* Spinlock for shared PRS parser memory and shadow table */ + spinlock_t prs_spinlock; }; struct mvpp2_pcpu_stats { diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 566c12c89520..416a926a8281 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -7723,8 +7723,9 @@ static int mvpp2_probe(struct platform_device *pdev) if (mvpp2_read(priv, MVPP2_VER_ID_REG) == MVPP2_VER_PP23) priv->hw_version = MVPP23; - /* Init mss lock */ + /* Init locks for shared packet processor resources */ spin_lock_init(&priv->mss_spinlock); + spin_lock_init(&priv->prs_spinlock); /* Initialize network controller */ err = mvpp2_init(pdev, priv); diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c index 9af22f497a40..93e978bdf303 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c @@ -23,6 +23,8 @@ static int mvpp2_prs_hw_write(struct mvpp2 *priv, struct mvpp2_prs_entry *pe) { int i; + lockdep_assert_held(&priv->prs_spinlock); + if (pe->index > MVPP2_PRS_TCAM_SRAM_SIZE - 1) return -EINVAL; @@ -43,11 +45,13 @@ static int mvpp2_prs_hw_write(struct mvpp2 *priv, struct mvpp2_prs_entry *pe) } /* Initialize tcam entry from hw */ -int mvpp2_prs_init_from_hw(struct mvpp2 *priv, struct mvpp2_prs_entry *pe, - int tid) +static int __mvpp2_prs_init_from_hw(struct mvpp2 *priv, + struct mvpp2_prs_entry *pe, int tid) { int i; + lockdep_assert_held(&priv->prs_spinlock); + if (tid > MVPP2_PRS_TCAM_SRAM_SIZE - 1) return -EINVAL; @@ -73,6 +77,18 @@ int mvpp2_prs_init_from_hw(struct mvpp2 *priv, struct mvpp2_prs_entry *pe, return 0; } +int mvpp2_prs_init_from_hw(struct mvpp2 *priv, struct mvpp2_prs_entry *pe, + int tid) +{ + int err; + + spin_lock_bh(&priv->prs_spinlock); + err = __mvpp2_prs_init_from_hw(priv, pe, tid); + spin_unlock_bh(&priv->prs_spinlock); + + return err; +} + /* Invalidate tcam hw entry */ static void mvpp2_prs_hw_inv(struct mvpp2 *priv, int index) { @@ -374,7 +390,7 @@ static int mvpp2_prs_flow_find(struct mvpp2 *priv, int flow) priv->prs_shadow[tid].lu != MVPP2_PRS_LU_FLOWS) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); bits = mvpp2_prs_sram_ai_get(&pe); /* Sram store classification lookup ID in AI bits [5:0] */ @@ -441,7 +457,7 @@ static void mvpp2_prs_mac_drop_all_set(struct mvpp2 *priv, int port, bool add) if (priv->prs_shadow[MVPP2_PE_DROP_ALL].valid) { /* Entry exist - update port only */ - mvpp2_prs_init_from_hw(priv, &pe, MVPP2_PE_DROP_ALL); + __mvpp2_prs_init_from_hw(priv, &pe, MVPP2_PE_DROP_ALL); } else { /* Entry doesn't exist - create new */ memset(&pe, 0, sizeof(pe)); @@ -469,14 +485,17 @@ static void mvpp2_prs_mac_drop_all_set(struct mvpp2 *priv, int port, bool add) } /* Set port to unicast or multicast promiscuous mode */ -void mvpp2_prs_mac_promisc_set(struct mvpp2 *priv, int port, - enum mvpp2_prs_l2_cast l2_cast, bool add) +static void __mvpp2_prs_mac_promisc_set(struct mvpp2 *priv, int port, + enum mvpp2_prs_l2_cast l2_cast, + bool add) { struct mvpp2_prs_entry pe; unsigned char cast_match; unsigned int ri; int tid; + lockdep_assert_held(&priv->prs_spinlock); + if (l2_cast == MVPP2_PRS_L2_UNI_CAST) { cast_match = MVPP2_PRS_UCAST_VAL; tid = MVPP2_PE_MAC_UC_PROMISCUOUS; @@ -489,7 +508,7 @@ void mvpp2_prs_mac_promisc_set(struct mvpp2 *priv, int port, /* promiscuous mode - Accept unknown unicast or multicast packets */ if (priv->prs_shadow[tid].valid) { - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } else { memset(&pe, 0, sizeof(pe)); mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MAC); @@ -522,6 +541,14 @@ void mvpp2_prs_mac_promisc_set(struct mvpp2 *priv, int port, mvpp2_prs_hw_write(priv, &pe); } +void mvpp2_prs_mac_promisc_set(struct mvpp2 *priv, int port, + enum mvpp2_prs_l2_cast l2_cast, bool add) +{ + spin_lock_bh(&priv->prs_spinlock); + __mvpp2_prs_mac_promisc_set(priv, port, l2_cast, add); + spin_unlock_bh(&priv->prs_spinlock); +} + /* Set entry for dsa packets */ static void mvpp2_prs_dsa_tag_set(struct mvpp2 *priv, int port, bool add, bool tagged, bool extend) @@ -539,7 +566,7 @@ static void mvpp2_prs_dsa_tag_set(struct mvpp2 *priv, int port, bool add, if (priv->prs_shadow[tid].valid) { /* Entry exist - update port only */ - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } else { /* Entry doesn't exist - create new */ memset(&pe, 0, sizeof(pe)); @@ -610,7 +637,7 @@ static void mvpp2_prs_dsa_tag_ethertype_set(struct mvpp2 *priv, int port, if (priv->prs_shadow[tid].valid) { /* Entry exist - update port only */ - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } else { /* Entry doesn't exist - create new */ memset(&pe, 0, sizeof(pe)); @@ -673,7 +700,7 @@ static int mvpp2_prs_vlan_find(struct mvpp2 *priv, unsigned short tpid, int ai) priv->prs_shadow[tid].lu != MVPP2_PRS_LU_VLAN) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); match = mvpp2_prs_tcam_data_cmp(&pe, 0, tpid); if (!match) continue; @@ -726,7 +753,7 @@ static int mvpp2_prs_vlan_add(struct mvpp2 *priv, unsigned short tpid, int ai, priv->prs_shadow[tid_aux].lu != MVPP2_PRS_LU_VLAN) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid_aux); + __mvpp2_prs_init_from_hw(priv, &pe, tid_aux); ri_bits = mvpp2_prs_sram_ri_get(&pe); if ((ri_bits & MVPP2_PRS_RI_VLAN_MASK) == MVPP2_PRS_RI_VLAN_DOUBLE) @@ -760,7 +787,7 @@ static int mvpp2_prs_vlan_add(struct mvpp2 *priv, unsigned short tpid, int ai, mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VLAN); } else { - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } /* Update ports' mask */ mvpp2_prs_tcam_port_map_set(&pe, port_map); @@ -800,7 +827,7 @@ static int mvpp2_prs_double_vlan_find(struct mvpp2 *priv, unsigned short tpid1, priv->prs_shadow[tid].lu != MVPP2_PRS_LU_VLAN) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); match = mvpp2_prs_tcam_data_cmp(&pe, 0, tpid1) && mvpp2_prs_tcam_data_cmp(&pe, 4, tpid2); @@ -849,7 +876,7 @@ static int mvpp2_prs_double_vlan_add(struct mvpp2 *priv, unsigned short tpid1, priv->prs_shadow[tid_aux].lu != MVPP2_PRS_LU_VLAN) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid_aux); + __mvpp2_prs_init_from_hw(priv, &pe, tid_aux); ri_bits = mvpp2_prs_sram_ri_get(&pe); ri_bits &= MVPP2_PRS_RI_VLAN_MASK; if (ri_bits == MVPP2_PRS_RI_VLAN_SINGLE || @@ -880,7 +907,7 @@ static int mvpp2_prs_double_vlan_add(struct mvpp2 *priv, unsigned short tpid1, mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VLAN); } else { - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } /* Update ports' mask */ @@ -1213,8 +1240,8 @@ static void mvpp2_prs_mac_init(struct mvpp2 *priv) /* Create dummy entries for drop all and promiscuous modes */ mvpp2_prs_drop_fc(priv); mvpp2_prs_mac_drop_all_set(priv, 0, false); - mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_UNI_CAST, false); - mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_MULTI_CAST, false); + __mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_UNI_CAST, false); + __mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_MULTI_CAST, false); } /* Set default entries for various types of dsa packets */ @@ -1533,12 +1560,6 @@ static int mvpp2_prs_vlan_init(struct platform_device *pdev, struct mvpp2 *priv) struct mvpp2_prs_entry pe; int err; - priv->prs_double_vlans = devm_kcalloc(&pdev->dev, sizeof(bool), - MVPP2_PRS_DBL_VLANS_MAX, - GFP_KERNEL); - if (!priv->prs_double_vlans) - return -ENOMEM; - /* Double VLAN: 0x88A8, 0x8100 */ err = mvpp2_prs_double_vlan_add(priv, ETH_P_8021AD, ETH_P_8021Q, MVPP2_PRS_PORT_MASK); @@ -1941,7 +1962,7 @@ static int mvpp2_prs_vid_range_find(struct mvpp2_port *port, u16 vid, u16 mask) port->priv->prs_shadow[tid].lu != MVPP2_PRS_LU_VID) continue; - mvpp2_prs_init_from_hw(port->priv, &pe, tid); + __mvpp2_prs_init_from_hw(port->priv, &pe, tid); mvpp2_prs_tcam_data_byte_get(&pe, 2, &byte[0], &enable[0]); mvpp2_prs_tcam_data_byte_get(&pe, 3, &byte[1], &enable[1]); @@ -1970,6 +1991,8 @@ int mvpp2_prs_vid_entry_add(struct mvpp2_port *port, u16 vid) memset(&pe, 0, sizeof(pe)); + spin_lock_bh(&priv->prs_spinlock); + /* Scan TCAM and see if entry with this already exist */ tid = mvpp2_prs_vid_range_find(port, vid, mask); @@ -1988,8 +2011,10 @@ int mvpp2_prs_vid_entry_add(struct mvpp2_port *port, u16 vid) MVPP2_PRS_VLAN_FILT_MAX_ENTRY); /* There isn't room for a new VID filter */ - if (tid < 0) + if (tid < 0) { + spin_unlock_bh(&priv->prs_spinlock); return tid; + } mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VID); pe.index = tid; @@ -1997,7 +2022,7 @@ int mvpp2_prs_vid_entry_add(struct mvpp2_port *port, u16 vid) /* Mask all ports */ mvpp2_prs_tcam_port_map_set(&pe, 0); } else { - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } /* Enable the current port */ @@ -2019,6 +2044,7 @@ int mvpp2_prs_vid_entry_add(struct mvpp2_port *port, u16 vid) mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VID); mvpp2_prs_hw_write(priv, &pe); + spin_unlock_bh(&priv->prs_spinlock); return 0; } @@ -2028,15 +2054,16 @@ void mvpp2_prs_vid_entry_remove(struct mvpp2_port *port, u16 vid) struct mvpp2 *priv = port->priv; int tid; - /* Scan TCAM and see if entry with this already exist */ - tid = mvpp2_prs_vid_range_find(port, vid, 0xfff); + spin_lock_bh(&priv->prs_spinlock); - /* No such entry */ - if (tid < 0) - return; + /* Invalidate TCAM entry with this , if it exists */ + tid = mvpp2_prs_vid_range_find(port, vid, 0xfff); + if (tid >= 0) { + mvpp2_prs_hw_inv(priv, tid); + priv->prs_shadow[tid].valid = false; + } - mvpp2_prs_hw_inv(priv, tid); - priv->prs_shadow[tid].valid = false; + spin_unlock_bh(&priv->prs_spinlock); } /* Remove all existing VID filters on this port */ @@ -2045,6 +2072,8 @@ void mvpp2_prs_vid_remove_all(struct mvpp2_port *port) struct mvpp2 *priv = port->priv; int tid; + spin_lock_bh(&priv->prs_spinlock); + for (tid = MVPP2_PRS_VID_PORT_FIRST(port->id); tid <= MVPP2_PRS_VID_PORT_LAST(port->id); tid++) { if (priv->prs_shadow[tid].valid) { @@ -2052,6 +2081,8 @@ void mvpp2_prs_vid_remove_all(struct mvpp2_port *port) priv->prs_shadow[tid].valid = false; } } + + spin_unlock_bh(&priv->prs_spinlock); } /* Remove VID filering entry for this port */ @@ -2060,10 +2091,14 @@ void mvpp2_prs_vid_disable_filtering(struct mvpp2_port *port) unsigned int tid = MVPP2_PRS_VID_PORT_DFLT(port->id); struct mvpp2 *priv = port->priv; + spin_lock_bh(&priv->prs_spinlock); + /* Invalidate the guard entry */ mvpp2_prs_hw_inv(priv, tid); priv->prs_shadow[tid].valid = false; + + spin_unlock_bh(&priv->prs_spinlock); } /* Add guard entry that drops packets when no VID is matched on this port */ @@ -2079,6 +2114,8 @@ void mvpp2_prs_vid_enable_filtering(struct mvpp2_port *port) memset(&pe, 0, sizeof(pe)); + spin_lock_bh(&priv->prs_spinlock); + pe.index = tid; reg_val = mvpp2_read(priv, MVPP2_MH_REG(port->id)); @@ -2111,6 +2148,8 @@ void mvpp2_prs_vid_enable_filtering(struct mvpp2_port *port) /* Update shadow table */ mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VID); mvpp2_prs_hw_write(priv, &pe); + + spin_unlock_bh(&priv->prs_spinlock); } /* Parser default initialization */ @@ -2118,6 +2157,20 @@ int mvpp2_prs_default_init(struct platform_device *pdev, struct mvpp2 *priv) { int err, index, i; + priv->prs_shadow = devm_kcalloc(&pdev->dev, MVPP2_PRS_TCAM_SRAM_SIZE, + sizeof(*priv->prs_shadow), + GFP_KERNEL); + if (!priv->prs_shadow) + return -ENOMEM; + + priv->prs_double_vlans = devm_kcalloc(&pdev->dev, sizeof(bool), + MVPP2_PRS_DBL_VLANS_MAX, + GFP_KERNEL); + if (!priv->prs_double_vlans) + return -ENOMEM; + + spin_lock_bh(&priv->prs_spinlock); + /* Enable tcam table */ mvpp2_write(priv, MVPP2_PRS_TCAM_CTRL_REG, MVPP2_PRS_TCAM_EN_MASK); @@ -2136,12 +2189,6 @@ int mvpp2_prs_default_init(struct platform_device *pdev, struct mvpp2 *priv) for (index = 0; index < MVPP2_PRS_TCAM_SRAM_SIZE; index++) mvpp2_prs_hw_inv(priv, index); - priv->prs_shadow = devm_kcalloc(&pdev->dev, MVPP2_PRS_TCAM_SRAM_SIZE, - sizeof(*priv->prs_shadow), - GFP_KERNEL); - if (!priv->prs_shadow) - return -ENOMEM; - /* Always start from lookup = 0 */ for (index = 0; index < MVPP2_MAX_PORTS; index++) mvpp2_prs_hw_port_init(priv, index, MVPP2_PRS_LU_MH, @@ -2158,26 +2205,13 @@ int mvpp2_prs_default_init(struct platform_device *pdev, struct mvpp2 *priv) mvpp2_prs_vid_init(priv); err = mvpp2_prs_etype_init(priv); - if (err) - return err; - - err = mvpp2_prs_vlan_init(pdev, priv); - if (err) - return err; - - err = mvpp2_prs_pppoe_init(priv); - if (err) - return err; - - err = mvpp2_prs_ip6_init(priv); - if (err) - return err; - - err = mvpp2_prs_ip4_init(priv); - if (err) - return err; + err = err ? : mvpp2_prs_vlan_init(pdev, priv); + err = err ? : mvpp2_prs_pppoe_init(priv); + err = err ? : mvpp2_prs_ip6_init(priv); + err = err ? : mvpp2_prs_ip4_init(priv); - return 0; + spin_unlock_bh(&priv->prs_spinlock); + return err; } /* Compare MAC DA with tcam entry data */ @@ -2217,7 +2251,7 @@ mvpp2_prs_mac_da_range_find(struct mvpp2 *priv, int pmap, const u8 *da, (priv->prs_shadow[tid].udf != udf_type)) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); entry_pmap = mvpp2_prs_tcam_port_map_get(&pe); if (mvpp2_prs_mac_range_equals(&pe, da, mask) && @@ -2229,7 +2263,8 @@ mvpp2_prs_mac_da_range_find(struct mvpp2 *priv, int pmap, const u8 *da, } /* Update parser's mac da entry */ -int mvpp2_prs_mac_da_accept(struct mvpp2_port *port, const u8 *da, bool add) +static int __mvpp2_prs_mac_da_accept(struct mvpp2_port *port, + const u8 *da, bool add) { unsigned char mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; struct mvpp2 *priv = port->priv; @@ -2261,7 +2296,7 @@ int mvpp2_prs_mac_da_accept(struct mvpp2_port *port, const u8 *da, bool add) /* Mask all ports */ mvpp2_prs_tcam_port_map_set(&pe, 0); } else { - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); } mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MAC); @@ -2317,6 +2352,17 @@ int mvpp2_prs_mac_da_accept(struct mvpp2_port *port, const u8 *da, bool add) return 0; } +int mvpp2_prs_mac_da_accept(struct mvpp2_port *port, const u8 *da, bool add) +{ + int err; + + spin_lock_bh(&port->priv->prs_spinlock); + err = __mvpp2_prs_mac_da_accept(port, da, add); + spin_unlock_bh(&port->priv->prs_spinlock); + + return err; +} + int mvpp2_prs_update_mac_da(struct net_device *dev, const u8 *da) { struct mvpp2_port *port = netdev_priv(dev); @@ -2345,6 +2391,8 @@ void mvpp2_prs_mac_del_all(struct mvpp2_port *port) unsigned long pmap; int index, tid; + spin_lock_bh(&priv->prs_spinlock); + for (tid = MVPP2_PE_MAC_RANGE_START; tid <= MVPP2_PE_MAC_RANGE_END; tid++) { unsigned char da[ETH_ALEN], da_mask[ETH_ALEN]; @@ -2354,7 +2402,7 @@ void mvpp2_prs_mac_del_all(struct mvpp2_port *port) (priv->prs_shadow[tid].udf != MVPP2_PRS_UDF_MAC_DEF)) continue; - mvpp2_prs_init_from_hw(priv, &pe, tid); + __mvpp2_prs_init_from_hw(priv, &pe, tid); pmap = mvpp2_prs_tcam_port_map_get(&pe); @@ -2375,14 +2423,17 @@ void mvpp2_prs_mac_del_all(struct mvpp2_port *port) continue; /* Remove entry from TCAM */ - mvpp2_prs_mac_da_accept(port, da, false); + __mvpp2_prs_mac_da_accept(port, da, false); } + + spin_unlock_bh(&priv->prs_spinlock); } int mvpp2_prs_tag_mode_set(struct mvpp2 *priv, int port, int type) { switch (type) { case MVPP2_TAG_TYPE_EDSA: + spin_lock_bh(&priv->prs_spinlock); /* Add port to EDSA entries */ mvpp2_prs_dsa_tag_set(priv, port, true, MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA); @@ -2393,9 +2444,11 @@ int mvpp2_prs_tag_mode_set(struct mvpp2 *priv, int port, int type) MVPP2_PRS_TAGGED, MVPP2_PRS_DSA); mvpp2_prs_dsa_tag_set(priv, port, false, MVPP2_PRS_UNTAGGED, MVPP2_PRS_DSA); + spin_unlock_bh(&priv->prs_spinlock); break; case MVPP2_TAG_TYPE_DSA: + spin_lock_bh(&priv->prs_spinlock); /* Add port to DSA entries */ mvpp2_prs_dsa_tag_set(priv, port, true, MVPP2_PRS_TAGGED, MVPP2_PRS_DSA); @@ -2406,10 +2459,12 @@ int mvpp2_prs_tag_mode_set(struct mvpp2 *priv, int port, int type) MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA); mvpp2_prs_dsa_tag_set(priv, port, false, MVPP2_PRS_UNTAGGED, MVPP2_PRS_EDSA); + spin_unlock_bh(&priv->prs_spinlock); break; case MVPP2_TAG_TYPE_MH: case MVPP2_TAG_TYPE_NONE: + spin_lock_bh(&priv->prs_spinlock); /* Remove port form EDSA and DSA entries */ mvpp2_prs_dsa_tag_set(priv, port, false, MVPP2_PRS_TAGGED, MVPP2_PRS_DSA); @@ -2419,6 +2474,7 @@ int mvpp2_prs_tag_mode_set(struct mvpp2 *priv, int port, int type) MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA); mvpp2_prs_dsa_tag_set(priv, port, false, MVPP2_PRS_UNTAGGED, MVPP2_PRS_EDSA); + spin_unlock_bh(&priv->prs_spinlock); break; default: @@ -2437,11 +2493,15 @@ int mvpp2_prs_add_flow(struct mvpp2 *priv, int flow, u32 ri, u32 ri_mask) memset(&pe, 0, sizeof(pe)); + spin_lock_bh(&priv->prs_spinlock); + tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_LAST_FREE_TID, MVPP2_PE_FIRST_FREE_TID); - if (tid < 0) + if (tid < 0) { + spin_unlock_bh(&priv->prs_spinlock); return tid; + } pe.index = tid; @@ -2461,6 +2521,7 @@ int mvpp2_prs_add_flow(struct mvpp2 *priv, int flow, u32 ri, u32 ri_mask) mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK); mvpp2_prs_hw_write(priv, &pe); + spin_unlock_bh(&priv->prs_spinlock); return 0; } @@ -2472,6 +2533,8 @@ int mvpp2_prs_def_flow(struct mvpp2_port *port) memset(&pe, 0, sizeof(pe)); + spin_lock_bh(&port->priv->prs_spinlock); + tid = mvpp2_prs_flow_find(port->priv, port->id); /* Such entry not exist */ @@ -2480,8 +2543,10 @@ int mvpp2_prs_def_flow(struct mvpp2_port *port) tid = mvpp2_prs_tcam_first_free(port->priv, MVPP2_PE_LAST_FREE_TID, MVPP2_PE_FIRST_FREE_TID); - if (tid < 0) + if (tid < 0) { + spin_unlock_bh(&port->priv->prs_spinlock); return tid; + } pe.index = tid; @@ -2492,13 +2557,14 @@ int mvpp2_prs_def_flow(struct mvpp2_port *port) /* Update shadow table */ mvpp2_prs_shadow_set(port->priv, pe.index, MVPP2_PRS_LU_FLOWS); } else { - mvpp2_prs_init_from_hw(port->priv, &pe, tid); + __mvpp2_prs_init_from_hw(port->priv, &pe, tid); } mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_FLOWS); mvpp2_prs_tcam_port_map_set(&pe, (1 << port->id)); mvpp2_prs_hw_write(port->priv, &pe); + spin_unlock_bh(&port->priv->prs_spinlock); return 0; } @@ -2509,11 +2575,14 @@ int mvpp2_prs_hits(struct mvpp2 *priv, int index) if (index > MVPP2_PRS_TCAM_SRAM_SIZE) return -EINVAL; + spin_lock_bh(&priv->prs_spinlock); + mvpp2_write(priv, MVPP2_PRS_TCAM_HIT_IDX_REG, index); val = mvpp2_read(priv, MVPP2_PRS_TCAM_HIT_CNT_REG); val &= MVPP2_PRS_TCAM_HIT_CNT_MASK; + spin_unlock_bh(&priv->prs_spinlock); return val; } From ca9e5d3d9a4d7d30355aa68bb30dc6f850f067ab Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 1 Apr 2025 11:49:08 -0300 Subject: [PATCH 18/56] selftests: tc-testing: fix nat regex matching In iproute 6.14, the nat ip mask logic was fixed to remove an undefined behaviour[1]. So now instead of reporting '0.0.0.0/32' on x86 and potentially '0.0.0.0/0' in other platforms, it reports '0.0.0.0/0' in all platforms. [1] https://lore.kernel.org/netdev/20250306112520.188728-1-torben.nielsen@prevas.dk/ Reviewed-by: Simon Horman Signed-off-by: Pedro Tammela Link: https://patch.msgid.link/20250401144908.568140-1-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- .../selftests/tc-testing/tc-tests/actions/nat.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json b/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json index ee2792998c89..4f21aeb8a3fb 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json @@ -305,7 +305,7 @@ "cmdUnderTest": "$TC actions add action nat ingress default 10.10.10.1 index 12", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 12", - "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/32 10.10.10.1 pass.*index 12 ref", + "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/0 10.10.10.1 pass.*index 12 ref", "matchCount": "1", "teardown": [ "$TC actions flush action nat" @@ -332,7 +332,7 @@ "cmdUnderTest": "$TC actions add action nat ingress any 10.10.10.1 index 12", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 12", - "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/32 10.10.10.1 pass.*index 12 ref", + "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/0 10.10.10.1 pass.*index 12 ref", "matchCount": "1", "teardown": [ "$TC actions flush action nat" @@ -359,7 +359,7 @@ "cmdUnderTest": "$TC actions add action nat ingress all 10.10.10.1 index 12", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 12", - "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/32 10.10.10.1 pass.*index 12 ref", + "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/0 10.10.10.1 pass.*index 12 ref", "matchCount": "1", "teardown": [ "$TC actions flush action nat" @@ -548,7 +548,7 @@ "cmdUnderTest": "$TC actions add action nat egress default 20.20.20.1 pipe index 10", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 10", - "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/32 20.20.20.1 pipe.*index 10 ref", + "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/0 20.20.20.1 pipe.*index 10 ref", "matchCount": "1", "teardown": [ "$TC actions flush action nat" @@ -575,7 +575,7 @@ "cmdUnderTest": "$TC actions add action nat egress any 20.20.20.1 pipe index 10", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 10", - "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/32 20.20.20.1 pipe.*index 10 ref", + "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/0 20.20.20.1 pipe.*index 10 ref", "matchCount": "1", "teardown": [ "$TC actions flush action nat" @@ -602,7 +602,7 @@ "cmdUnderTest": "$TC actions add action nat egress all 20.20.20.1 pipe index 10", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 10", - "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/32 20.20.20.1 pipe.*index 10 ref", + "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/0 20.20.20.1 pipe.*index 10 ref", "matchCount": "1", "teardown": [ "$TC actions flush action nat" @@ -629,7 +629,7 @@ "cmdUnderTest": "$TC actions add action nat egress all 20.20.20.1 pipe index 10 cookie aa1bc2d3eeff112233445566778800a1", "expExitCode": "0", "verifyCmd": "$TC actions get action nat index 10", - "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/32 20.20.20.1 pipe.*index 10 ref.*cookie aa1bc2d3eeff112233445566778800a1", + "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/0 20.20.20.1 pipe.*index 10 ref.*cookie aa1bc2d3eeff112233445566778800a1", "matchCount": "1", "teardown": [ "$TC actions flush action nat" From 1b7fdc702c031134c619b74c4f311c590e50ca3d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 1 Apr 2025 12:07:08 -0700 Subject: [PATCH 19/56] rtnetlink: Use register_pernet_subsys() in rtnl_net_debug_init(). rtnl_net_debug_init() registers rtnl_net_debug_net_ops by register_pernet_device() but calls unregister_pernet_subsys() in case register_netdevice_notifier() fails. It corrupts pernet_list because first_device is updated in register_pernet_device() but not unregister_pernet_subsys(). Let's fix it by calling register_pernet_subsys() instead. The _subsys() one fits better for the use case because it keeps the notifier alive until default_device_exit_net(), giving it more chance to test NETDEV_UNREGISTER. Fixes: 03fa53485659 ("rtnetlink: Add ASSERT_RTNL_NET() placeholder for netdev notifier.") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250401190716.70437-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnl_net_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/rtnl_net_debug.c b/net/core/rtnl_net_debug.c index 7ecd28cc1c22..f3272b09c255 100644 --- a/net/core/rtnl_net_debug.c +++ b/net/core/rtnl_net_debug.c @@ -102,7 +102,7 @@ static int __init rtnl_net_debug_init(void) { int ret; - ret = register_pernet_device(&rtnl_net_debug_net_ops); + ret = register_pernet_subsys(&rtnl_net_debug_net_ops); if (ret) return ret; From 5a465a0da13ee9fbd7d3cd0b2893309b0fe4b7e3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 1 Apr 2025 11:44:42 -0700 Subject: [PATCH 20/56] udp: Fix multiple wraparounds of sk->sk_rmem_alloc. __udp_enqueue_schedule_skb() has the following condition: if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) goto drop; sk->sk_rcvbuf is initialised by net.core.rmem_default and later can be configured by SO_RCVBUF, which is limited by net.core.rmem_max, or SO_RCVBUFFORCE. If we set INT_MAX to sk->sk_rcvbuf, the condition is always false as sk->sk_rmem_alloc is also signed int. Then, the size of the incoming skb is added to sk->sk_rmem_alloc unconditionally. This results in integer overflow (possibly multiple times) on sk->sk_rmem_alloc and allows a single socket to have skb up to net.core.udp_mem[1]. For example, if we set a large value to udp_mem[1] and INT_MAX to sk->sk_rcvbuf and flood packets to the socket, we can see multiple overflows: # cat /proc/net/sockstat | grep UDP: UDP: inuse 3 mem 7956736 <-- (7956736 << 12) bytes > INT_MAX * 15 ^- PAGE_SHIFT # ss -uam State Recv-Q ... UNCONN -1757018048 ... <-- flipping the sign repeatedly skmem:(r2537949248,rb2147483646,t0,tb212992,f1984,w0,o0,bl0,d0) Previously, we had a boundary check for INT_MAX, which was removed by commit 6a1f12dd85a8 ("udp: relax atomic operation on sk->sk_rmem_alloc"). A complete fix would be to revert it and cap the right operand by INT_MAX: rmem = atomic_add_return(size, &sk->sk_rmem_alloc); if (rmem > min(size + (unsigned int)sk->sk_rcvbuf, INT_MAX)) goto uncharge_drop; but we do not want to add the expensive atomic_add_return() back just for the corner case. Casting rmem to unsigned int prevents multiple wraparounds, but we still allow a single wraparound. # cat /proc/net/sockstat | grep UDP: UDP: inuse 3 mem 524288 <-- (INT_MAX + 1) >> 12 # ss -uam State Recv-Q ... UNCONN -2147482816 ... <-- INT_MAX + 831 bytes skmem:(r2147484480,rb2147483646,t0,tb212992,f3264,w0,o0,bl0,d14468947) So, let's define rmem and rcvbuf as unsigned int and check skb->truesize only when rcvbuf is large enough to lower the overflow possibility. Note that we still have a small chance to see overflow if multiple skbs to the same socket are processed on different core at the same time and each size does not exceed the limit but the total size does. Note also that we must ignore skb->truesize for a small buffer as explained in commit 363dc73acacb ("udp: be less conservative with sock rmem accounting"). Fixes: 6a1f12dd85a8 ("udp: relax atomic operation on sk->sk_rmem_alloc") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250401184501.67377-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d0bffcfa56d8..354779b22faf 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1725,17 +1725,25 @@ static int udp_rmem_schedule(struct sock *sk, int size) int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; - int rmem, err = -ENOMEM; + unsigned int rmem, rcvbuf; spinlock_t *busy = NULL; - int size, rcvbuf; + int size, err = -ENOMEM; - /* Immediately drop when the receive queue is full. - * Always allow at least one packet. - */ rmem = atomic_read(&sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); - if (rmem > rcvbuf) - goto drop; + size = skb->truesize; + + /* Immediately drop when the receive queue is full. + * Cast to unsigned int performs the boundary check for INT_MAX. + */ + if (rmem + size > rcvbuf) { + if (rcvbuf > INT_MAX >> 1) + goto drop; + + /* Always allow at least one packet for small buffer. */ + if (rmem > rcvbuf) + goto drop; + } /* Under mem pressure, it might be helpful to help udp_recvmsg() * having linear skbs : @@ -1745,10 +1753,10 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) */ if (rmem > (rcvbuf >> 1)) { skb_condense(skb); - + size = skb->truesize; busy = busylock_acquire(sk); } - size = skb->truesize; + udp_set_dev_scratch(skb); atomic_add(size, &sk->sk_rmem_alloc); From df207de9d9e7a4d92f8567e2c539d9c8c12fd99d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 1 Apr 2025 11:44:43 -0700 Subject: [PATCH 21/56] udp: Fix memory accounting leak. Matt Dowling reported a weird UDP memory usage issue. Under normal operation, the UDP memory usage reported in /proc/net/sockstat remains close to zero. However, it occasionally spiked to 524,288 pages and never dropped. Moreover, the value doubled when the application was terminated. Finally, it caused intermittent packet drops. We can reproduce the issue with the script below [0]: 1. /proc/net/sockstat reports 0 pages # cat /proc/net/sockstat | grep UDP: UDP: inuse 1 mem 0 2. Run the script till the report reaches 524,288 # python3 test.py & sleep 5 # cat /proc/net/sockstat | grep UDP: UDP: inuse 3 mem 524288 <-- (INT_MAX + 1) >> PAGE_SHIFT 3. Kill the socket and confirm the number never drops # pkill python3 && sleep 5 # cat /proc/net/sockstat | grep UDP: UDP: inuse 1 mem 524288 4. (necessary since v6.0) Trigger proto_memory_pcpu_drain() # python3 test.py & sleep 1 && pkill python3 5. The number doubles # cat /proc/net/sockstat | grep UDP: UDP: inuse 1 mem 1048577 The application set INT_MAX to SO_RCVBUF, which triggered an integer overflow in udp_rmem_release(). When a socket is close()d, udp_destruct_common() purges its receive queue and sums up skb->truesize in the queue. This total is calculated and stored in a local unsigned integer variable. The total size is then passed to udp_rmem_release() to adjust memory accounting. However, because the function takes a signed integer argument, the total size can wrap around, causing an overflow. Then, the released amount is calculated as follows: 1) Add size to sk->sk_forward_alloc. 2) Round down sk->sk_forward_alloc to the nearest lower multiple of PAGE_SIZE and assign it to amount. 3) Subtract amount from sk->sk_forward_alloc. 4) Pass amount >> PAGE_SHIFT to __sk_mem_reduce_allocated(). When the issue occurred, the total in udp_destruct_common() was 2147484480 (INT_MAX + 833), which was cast to -2147482816 in udp_rmem_release(). At 1) sk->sk_forward_alloc is changed from 3264 to -2147479552, and 2) sets -2147479552 to amount. 3) reverts the wraparound, so we don't see a warning in inet_sock_destruct(). However, udp_memory_allocated ends up doubling at 4). Since commit 3cd3399dd7a8 ("net: implement per-cpu reserves for memory_allocated"), memory usage no longer doubles immediately after a socket is close()d because __sk_mem_reduce_allocated() caches the amount in udp_memory_per_cpu_fw_alloc. However, the next time a UDP socket receives a packet, the subtraction takes effect, causing UDP memory usage to double. This issue makes further memory allocation fail once the socket's sk->sk_rmem_alloc exceeds net.ipv4.udp_rmem_min, resulting in packet drops. To prevent this issue, let's use unsigned int for the calculation and call sk_forward_alloc_add() only once for the small delta. Note that first_packet_length() also potentially has the same problem. [0]: from socket import * SO_RCVBUFFORCE = 33 INT_MAX = (2 ** 31) - 1 s = socket(AF_INET, SOCK_DGRAM) s.bind(('', 0)) s.setsockopt(SOL_SOCKET, SO_RCVBUFFORCE, INT_MAX) c = socket(AF_INET, SOCK_DGRAM) c.connect(s.getsockname()) data = b'a' * 100 while True: c.send(data) Fixes: f970bd9e3a06 ("udp: implement memory accounting helpers") Reported-by: Matt Dowling Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250401184501.67377-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 354779b22faf..2742cc7602bb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1625,12 +1625,12 @@ static bool udp_skb_has_head_state(struct sk_buff *skb) } /* fully reclaim rmem/fwd memory allocated for skb */ -static void udp_rmem_release(struct sock *sk, int size, int partial, - bool rx_queue_lock_held) +static void udp_rmem_release(struct sock *sk, unsigned int size, + int partial, bool rx_queue_lock_held) { struct udp_sock *up = udp_sk(sk); struct sk_buff_head *sk_queue; - int amt; + unsigned int amt; if (likely(partial)) { up->forward_deficit += size; @@ -1650,10 +1650,8 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (!rx_queue_lock_held) spin_lock(&sk_queue->lock); - - sk_forward_alloc_add(sk, size); - amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); - sk_forward_alloc_add(sk, -amt); + amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); + sk_forward_alloc_add(sk, size - amt); if (amt) __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT); @@ -1843,7 +1841,7 @@ EXPORT_IPV6_MOD_GPL(skb_consume_udp); static struct sk_buff *__first_packet_length(struct sock *sk, struct sk_buff_head *rcvq, - int *total) + unsigned int *total) { struct sk_buff *skb; @@ -1876,8 +1874,8 @@ static int first_packet_length(struct sock *sk) { struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue; struct sk_buff_head *sk_queue = &sk->sk_receive_queue; + unsigned int total = 0; struct sk_buff *skb; - int total = 0; int res; spin_lock_bh(&rcvq->lock); From fccd2b711d9628c7ce0111d5e4938652101ee30a Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 28 Mar 2025 15:15:28 +0100 Subject: [PATCH 22/56] vsock: avoid timeout during connect() if the socket is closing When a peer attempts to establish a connection, vsock_connect() contains a loop that waits for the state to be TCP_ESTABLISHED. However, the other peer can be fast enough to accept the connection and close it immediately, thus moving the state to TCP_CLOSING. When this happens, the peer in the vsock_connect() is properly woken up, but since the state is not TCP_ESTABLISHED, it goes back to sleep until the timeout expires, returning -ETIMEDOUT. If the socket state is TCP_CLOSING, waiting for the timeout is pointless. vsock_connect() can return immediately without errors or delay since the connection actually happened. The socket will be in a closing state, but this is not an issue, and subsequent calls will fail as expected. We discovered this issue while developing a test that accepts and immediately closes connections to stress the transport switch between two connect() calls, where the first one was interrupted by a signal (see Closes link). Reported-by: Luigi Leonardi Closes: https://lore.kernel.org/virtualization/bq6hxrolno2vmtqwcvb5bljfpb7mvwb3kohrvaed6auz5vxrfv@ijmd2f3grobn/ Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: Stefano Garzarella Acked-by: Paolo Abeni Tested-by: Luigi Leonardi Reviewed-by: Luigi Leonardi Link: https://patch.msgid.link/20250328141528.420719-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 7e3db87ae433..fc6afbc8d680 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1551,7 +1551,11 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, timeout = vsk->connect_timeout; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) { + /* If the socket is already closing or it is in an error state, there + * is no point in waiting. + */ + while (sk->sk_state != TCP_ESTABLISHED && + sk->sk_state != TCP_CLOSING && sk->sk_err == 0) { if (flags & O_NONBLOCK) { /* If we're not going to block, we schedule a timeout * function to generate a timeout on the connection From 8930424777e43257f5bf6f0f0f53defd0d30415c Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 29 Mar 2025 01:33:44 +0100 Subject: [PATCH 23/56] tunnels: Accept PACKET_HOST in skb_tunnel_check_pmtu(). Because skb_tunnel_check_pmtu() doesn't handle PACKET_HOST packets, commit 30a92c9e3d6b ("openvswitch: Set the skbuff pkt_type for proper pmtud support.") forced skb->pkt_type to PACKET_OUTGOING for openvswitch packets that are sent using the OVS_ACTION_ATTR_OUTPUT action. This allowed such packets to invoke the iptunnel_pmtud_check_icmp() or iptunnel_pmtud_check_icmpv6() helpers and thus trigger PMTU update on the input device. However, this also broke other parts of PMTU discovery. Since these packets don't have the PACKET_HOST type anymore, they won't trigger the sending of ICMP Fragmentation Needed or Packet Too Big messages to remote hosts when oversized (see the skb_in->pkt_type condition in __icmp_send() for example). These two skb->pkt_type checks are therefore incompatible as one requires skb->pkt_type to be PACKET_HOST, while the other requires it to be anything but PACKET_HOST. It makes sense to not trigger ICMP messages for non-PACKET_HOST packets as these messages should be generated only for incoming l2-unicast packets. However there doesn't seem to be any reason for skb_tunnel_check_pmtu() to ignore PACKET_HOST packets. Allow both cases to work by allowing skb_tunnel_check_pmtu() to work on PACKET_HOST packets and not overriding skb->pkt_type in openvswitch anymore. Fixes: 30a92c9e3d6b ("openvswitch: Set the skbuff pkt_type for proper pmtud support.") Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Signed-off-by: Guillaume Nault Reviewed-by: Stefano Brivio Reviewed-by: Aaron Conole Tested-by: Aaron Conole Link: https://patch.msgid.link/eac941652b86fddf8909df9b3bf0d97bc9444793.1743208264.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_tunnel_core.c | 2 +- net/openvswitch/actions.c | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index a3676155be78..364ea798511e 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -416,7 +416,7 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, skb_dst_update_pmtu_no_confirm(skb, mtu); - if (!reply || skb->pkt_type == PACKET_HOST) + if (!reply) return 0; if (skb->protocol == htons(ETH_P_IP)) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 704c858cf209..61fea7baae5d 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -947,12 +947,6 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, pskb_trim(skb, ovs_mac_header_len(key)); } - /* Need to set the pkt_type to involve the routing layer. The - * packet movement through the OVS datapath doesn't generally - * use routing, but this is needed for tunnel cases. - */ - skb->pkt_type = PACKET_OUTGOING; - if (likely(!mru || (skb->len <= mru + vport->dev->hard_header_len))) { ovs_vport_send(vport, skb, ovs_key_mac_proto(key)); From 3a0a3ff6593d670af2451ec363ccb7b18aec0c0a Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 26 Mar 2025 18:36:32 +0100 Subject: [PATCH 24/56] net: decrease cached dst counters in dst_release Upstream fix ac888d58869b ("net: do not delay dst_entries_add() in dst_release()") moved decrementing the dst count from dst_destroy to dst_release to avoid accessing already freed data in case of netns dismantle. However in case CONFIG_DST_CACHE is enabled and OvS+tunnels are used, this fix is incomplete as the same issue will be seen for cached dsts: Unable to handle kernel paging request at virtual address ffff5aabf6b5c000 Call trace: percpu_counter_add_batch+0x3c/0x160 (P) dst_release+0xec/0x108 dst_cache_destroy+0x68/0xd8 dst_destroy+0x13c/0x168 dst_destroy_rcu+0x1c/0xb0 rcu_do_batch+0x18c/0x7d0 rcu_core+0x174/0x378 rcu_core_si+0x18/0x30 Fix this by invalidating the cache, and thus decrementing cached dst counters, in dst_release too. Fixes: d71785ffc7e7 ("net: add dst_cache to ovs vxlan lwtunnel") Signed-off-by: Antoine Tenart Link: https://patch.msgid.link/20250326173634.31096-1-atenart@kernel.org Signed-off-by: Paolo Abeni --- net/core/dst.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/core/dst.c b/net/core/dst.c index c99b95cf9cbb..795ca07e28a4 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -165,6 +165,14 @@ static void dst_count_dec(struct dst_entry *dst) void dst_release(struct dst_entry *dst) { if (dst && rcuref_put(&dst->__rcuref)) { +#ifdef CONFIG_DST_CACHE + if (dst->flags & DST_METADATA) { + struct metadata_dst *md_dst = (struct metadata_dst *)dst; + + if (md_dst->type == METADATA_IP_TUNNEL) + dst_cache_reset_now(&md_dst->u.tun_info.dst_cache); + } +#endif dst_count_dec(dst); call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } From 1b755d8eb1ace3870789d48fbd94f386ad6e30be Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Thu, 3 Apr 2025 01:00:26 +0800 Subject: [PATCH 25/56] netfilter: nft_tunnel: fix geneve_opt type confusion addition When handling multiple NFTA_TUNNEL_KEY_OPTS_GENEVE attributes, the parsing logic should place every geneve_opt structure one by one compactly. Hence, when deciding the next geneve_opt position, the pointer addition should be in units of char *. However, the current implementation erroneously does type conversion before the addition, which will lead to heap out-of-bounds write. [ 6.989857] ================================================================== [ 6.990293] BUG: KASAN: slab-out-of-bounds in nft_tunnel_obj_init+0x977/0xa70 [ 6.990725] Write of size 124 at addr ffff888005f18974 by task poc/178 [ 6.991162] [ 6.991259] CPU: 0 PID: 178 Comm: poc-oob-write Not tainted 6.1.132 #1 [ 6.991655] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 [ 6.992281] Call Trace: [ 6.992423] [ 6.992586] dump_stack_lvl+0x44/0x5c [ 6.992801] print_report+0x184/0x4be [ 6.993790] kasan_report+0xc5/0x100 [ 6.994252] kasan_check_range+0xf3/0x1a0 [ 6.994486] memcpy+0x38/0x60 [ 6.994692] nft_tunnel_obj_init+0x977/0xa70 [ 6.995677] nft_obj_init+0x10c/0x1b0 [ 6.995891] nf_tables_newobj+0x585/0x950 [ 6.996922] nfnetlink_rcv_batch+0xdf9/0x1020 [ 6.998997] nfnetlink_rcv+0x1df/0x220 [ 6.999537] netlink_unicast+0x395/0x530 [ 7.000771] netlink_sendmsg+0x3d0/0x6d0 [ 7.001462] __sock_sendmsg+0x99/0xa0 [ 7.001707] ____sys_sendmsg+0x409/0x450 [ 7.002391] ___sys_sendmsg+0xfd/0x170 [ 7.003145] __sys_sendmsg+0xea/0x170 [ 7.004359] do_syscall_64+0x5e/0x90 [ 7.005817] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 [ 7.006127] RIP: 0033:0x7ec756d4e407 [ 7.006339] Code: 48 89 fa 4c 89 df e8 38 aa 00 00 8b 93 08 03 00 00 59 5e 48 83 f8 fc 74 1a 5b c3 0f 1f 84 00 00 00 00 00 48 8b 44 24 10 0f 05 <5b> c3 0f 1f 80 00 00 00 00 83 e2 39 83 faf [ 7.007364] RSP: 002b:00007ffed5d46760 EFLAGS: 00000202 ORIG_RAX: 000000000000002e [ 7.007827] RAX: ffffffffffffffda RBX: 00007ec756cc4740 RCX: 00007ec756d4e407 [ 7.008223] RDX: 0000000000000000 RSI: 00007ffed5d467f0 RDI: 0000000000000003 [ 7.008620] RBP: 00007ffed5d468a0 R08: 0000000000000000 R09: 0000000000000000 [ 7.009039] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000 [ 7.009429] R13: 00007ffed5d478b0 R14: 00007ec756ee5000 R15: 00005cbd4e655cb8 Fix this bug with correct pointer addition and conversion in parse and dump code. Fixes: 925d844696d9 ("netfilter: nft_tunnel: add support for geneve opts") Signed-off-by: Lin Ma Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 681301b46aa4..2e40f575aed9 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -341,7 +341,7 @@ static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GEN static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, struct nft_tunnel_opts *opts) { - struct geneve_opt *opt = (struct geneve_opt *)opts->u.data + opts->len; + struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len); struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1]; int err, data_len; @@ -625,7 +625,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, if (!inner) goto failure; while (opts->len > offset) { - opt = (struct geneve_opt *)opts->u.data + offset; + opt = (struct geneve_opt *)(opts->u.data + offset); if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS, opt->opt_class) || nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE, From 15970e1b23f5c25db88c613fddf9131de086f28e Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Wed, 2 Apr 2025 00:10:37 +0000 Subject: [PATCH 26/56] gve: handle overflow when reporting TX consumed descriptors When the tx tail is less than the head (in cases of wraparound), the TX consumed descriptor statistic in DQ will be reported as UINT32_MAX - head + tail, which is incorrect. Mask the difference of head and tail according to the ring size when reporting the statistic. Cc: stable@vger.kernel.org Fixes: 2c9198356d56 ("gve: Add consumed counts to ethtool stats") Signed-off-by: Joshua Washington Signed-off-by: Harshitha Ramamurthy Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250402001037.2717315-1-hramamurthy@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_ethtool.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index 31a21ccf4863..4dea1fdce748 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -392,7 +392,9 @@ gve_get_ethtool_stats(struct net_device *netdev, */ data[i++] = 0; data[i++] = 0; - data[i++] = tx->dqo_tx.tail - tx->dqo_tx.head; + data[i++] = + (tx->dqo_tx.tail - tx->dqo_tx.head) & + tx->mask; } do { start = From 8241ecec1cdc6699ae197d52d58e76bddd995fa5 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 1 Apr 2025 23:54:39 +0100 Subject: [PATCH 27/56] sfc: fix NULL dereferences in ef100_process_design_param() Since cited commit, ef100_probe_main() and hence also ef100_check_design_params() run before efx->net_dev is created; consequently, we cannot netif_set_tso_max_size() or _segs() at this point. Move those netif calls to ef100_probe_netdev(), and also replace netif_err within the design params code with pci_err. Reported-by: Kyungwook Boo Fixes: 98ff4c7c8ac7 ("sfc: Separate netdev probe/remove from PCI probe/remove") Signed-off-by: Edward Cree Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250401225439.2401047-1-edward.cree@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/ef100_netdev.c | 6 ++-- drivers/net/ethernet/sfc/ef100_nic.c | 47 +++++++++++-------------- 2 files changed, 24 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef100_netdev.c b/drivers/net/ethernet/sfc/ef100_netdev.c index d941f073f1eb..3a06e3b1bd6b 100644 --- a/drivers/net/ethernet/sfc/ef100_netdev.c +++ b/drivers/net/ethernet/sfc/ef100_netdev.c @@ -450,8 +450,9 @@ int ef100_probe_netdev(struct efx_probe_data *probe_data) net_dev->hw_enc_features |= efx->type->offload_features; net_dev->vlan_features |= NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_ALL_TSO; - netif_set_tso_max_segs(net_dev, - ESE_EF100_DP_GZ_TSO_MAX_HDR_NUM_SEGS_DEFAULT); + nic_data = efx->nic_data; + netif_set_tso_max_size(efx->net_dev, nic_data->tso_max_payload_len); + netif_set_tso_max_segs(efx->net_dev, nic_data->tso_max_payload_num_segs); rc = efx_ef100_init_datapath_caps(efx); if (rc < 0) @@ -477,7 +478,6 @@ int ef100_probe_netdev(struct efx_probe_data *probe_data) /* Don't fail init if RSS setup doesn't work. */ efx_mcdi_push_default_indir_table(efx, efx->n_rx_channels); - nic_data = efx->nic_data; rc = ef100_get_mac_address(efx, net_dev->perm_addr, CLIENT_HANDLE_SELF, efx->type->is_vf); if (rc) diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c index 62e674d6ff60..3ad95a4c8af2 100644 --- a/drivers/net/ethernet/sfc/ef100_nic.c +++ b/drivers/net/ethernet/sfc/ef100_nic.c @@ -887,8 +887,7 @@ static int ef100_process_design_param(struct efx_nic *efx, case ESE_EF100_DP_GZ_TSO_MAX_HDR_NUM_SEGS: /* We always put HDR_NUM_SEGS=1 in our TSO descriptors */ if (!reader->value) { - netif_err(efx, probe, efx->net_dev, - "TSO_MAX_HDR_NUM_SEGS < 1\n"); + pci_err(efx->pci_dev, "TSO_MAX_HDR_NUM_SEGS < 1\n"); return -EOPNOTSUPP; } return 0; @@ -901,32 +900,28 @@ static int ef100_process_design_param(struct efx_nic *efx, */ if (!reader->value || reader->value > EFX_MIN_DMAQ_SIZE || EFX_MIN_DMAQ_SIZE % (u32)reader->value) { - netif_err(efx, probe, efx->net_dev, - "%s size granularity is %llu, can't guarantee safety\n", - reader->type == ESE_EF100_DP_GZ_RXQ_SIZE_GRANULARITY ? "RXQ" : "TXQ", - reader->value); + pci_err(efx->pci_dev, + "%s size granularity is %llu, can't guarantee safety\n", + reader->type == ESE_EF100_DP_GZ_RXQ_SIZE_GRANULARITY ? "RXQ" : "TXQ", + reader->value); return -EOPNOTSUPP; } return 0; case ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_LEN: nic_data->tso_max_payload_len = min_t(u64, reader->value, GSO_LEGACY_MAX_SIZE); - netif_set_tso_max_size(efx->net_dev, - nic_data->tso_max_payload_len); return 0; case ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_NUM_SEGS: nic_data->tso_max_payload_num_segs = min_t(u64, reader->value, 0xffff); - netif_set_tso_max_segs(efx->net_dev, - nic_data->tso_max_payload_num_segs); return 0; case ESE_EF100_DP_GZ_TSO_MAX_NUM_FRAMES: nic_data->tso_max_frames = min_t(u64, reader->value, 0xffff); return 0; case ESE_EF100_DP_GZ_COMPAT: if (reader->value) { - netif_err(efx, probe, efx->net_dev, - "DP_COMPAT has unknown bits %#llx, driver not compatible with this hw\n", - reader->value); + pci_err(efx->pci_dev, + "DP_COMPAT has unknown bits %#llx, driver not compatible with this hw\n", + reader->value); return -EOPNOTSUPP; } return 0; @@ -946,10 +941,10 @@ static int ef100_process_design_param(struct efx_nic *efx, * So the value of this shouldn't matter. */ if (reader->value != ESE_EF100_DP_GZ_VI_STRIDES_DEFAULT) - netif_dbg(efx, probe, efx->net_dev, - "NIC has other than default VI_STRIDES (mask " - "%#llx), early probing might use wrong one\n", - reader->value); + pci_dbg(efx->pci_dev, + "NIC has other than default VI_STRIDES (mask " + "%#llx), early probing might use wrong one\n", + reader->value); return 0; case ESE_EF100_DP_GZ_RX_MAX_RUNT: /* Driver doesn't look at L2_STATUS:LEN_ERR bit, so we don't @@ -961,9 +956,9 @@ static int ef100_process_design_param(struct efx_nic *efx, /* Host interface says "Drivers should ignore design parameters * that they do not recognise." */ - netif_dbg(efx, probe, efx->net_dev, - "Ignoring unrecognised design parameter %u\n", - reader->type); + pci_dbg(efx->pci_dev, + "Ignoring unrecognised design parameter %u\n", + reader->type); return 0; } } @@ -999,13 +994,13 @@ static int ef100_check_design_params(struct efx_nic *efx) */ if (reader.state != EF100_TLV_TYPE) { if (reader.state == EF100_TLV_TYPE_CONT) - netif_err(efx, probe, efx->net_dev, - "truncated design parameter (incomplete type %u)\n", - reader.type); + pci_err(efx->pci_dev, + "truncated design parameter (incomplete type %u)\n", + reader.type); else - netif_err(efx, probe, efx->net_dev, - "truncated design parameter %u\n", - reader.type); + pci_err(efx->pci_dev, + "truncated design parameter %u\n", + reader.type); rc = -EIO; } out: From e5ddf19dbc3e24cce508de0dab0e8df5b7417de8 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Wed, 2 Apr 2025 01:59:31 +0100 Subject: [PATCH 28/56] net/selftests: Add loopback link local route for self-connect self-connect-ipv6 got slightly flaky on netdev: > # timeout set to 120 > # selftests: net/tcp_ao: self-connect_ipv6 > # 1..5 > # # 708[lib/setup.c:250] rand seed 1742872572 > # TAP version 13 > # # 708[lib/proc.c:213] Snmp6 Ip6OutNoRoutes: 0 => 1 > # not ok 1 # error 708[self-connect.c:70] failed to connect() > # ok 2 No unexpected trace events during the test run > # # Planned tests != run tests (5 != 2) > # # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:1 > ok 1 selftests: net/tcp_ao: self-connect_ipv6 I can not reproduce it on my machines, but judging by "Ip6OutNoRoutes" there is no route to the local_addr (::1). Looking at the kernel code, I see that kernel does add link-local address automatically in init_loopback(), but that is called from ipv6 notifier block. So, in turn the userspace that brought up the loopback interface may see rtnetlink ACK earlier than addrconf_notify() does it's job (at least, on a slow VM such as netdev). Probably, for ipv4 it's the same, judging by inetdev_event(). The fix is quite simple: set the link-local route straight after bringing the loopback interface. That will make it synchronous. Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20250402-tcp-ao-selfconnect-flake-v1-1-8388d629ef3d@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tcp_ao/self-connect.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/net/tcp_ao/self-connect.c b/tools/testing/selftests/net/tcp_ao/self-connect.c index 73b2f2276f3f..2c73bea698a6 100644 --- a/tools/testing/selftests/net/tcp_ao/self-connect.c +++ b/tools/testing/selftests/net/tcp_ao/self-connect.c @@ -16,6 +16,9 @@ static void __setup_lo_intf(const char *lo_intf, if (link_set_up(lo_intf)) test_error("Failed to bring %s up", lo_intf); + + if (ip_route_add(lo_intf, TEST_FAMILY, local_addr, local_addr)) + test_error("Failed to add a local route %s", lo_intf); } static void setup_lo_intf(const char *lo_intf) From e4546c6498c68ba65929fcbcf54ff1947fe53f48 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 2 Apr 2025 13:31:23 +0000 Subject: [PATCH 29/56] eth: bnxt: fix deadlock in the mgmt_ops When queue is being reset, callbacks of mgmt_ops are called by netdev_nl_bind_rx_doit(). The netdev_nl_bind_rx_doit() first acquires netdev_lock() and then calls callbacks. So, mgmt_ops callbacks should not acquire netdev_lock() internaly. The bnxt_queue_{start | stop}() calls napi_{enable | disable}() but they internally acquire netdev_lock(). So, deadlock occurs. To avoid deadlock, napi_{enable | disable}_locked() should be used instead. Signed-off-by: Taehee Yoo Acked-by: Stanislav Fomichev Reviewed-by: Michael Chan Fixes: cae03e5bdd9e ("net: hold netdev instance lock during queue operations") Link: https://patch.msgid.link/20250402133123.840173-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 1a70605fad38..28ee12186c37 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -15909,7 +15909,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx) goto err_reset; } - napi_enable(&bnapi->napi); + napi_enable_locked(&bnapi->napi); bnxt_db_nq_arm(bp, &cpr->cp_db, cpr->cp_raw_cons); for (i = 0; i < bp->nr_vnics; i++) { @@ -15931,7 +15931,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx) err_reset: netdev_err(bp->dev, "Unexpected HWRM error during queue start rc: %d\n", rc); - napi_enable(&bnapi->napi); + napi_enable_locked(&bnapi->napi); bnxt_db_nq_arm(bp, &cpr->cp_db, cpr->cp_raw_cons); bnxt_reset_task(bp, true); return rc; @@ -15971,7 +15971,7 @@ static int bnxt_queue_stop(struct net_device *dev, void *qmem, int idx) * completion is handled in NAPI to guarantee no more DMA on that ring * after seeing the completion. */ - napi_disable(&bnapi->napi); + napi_disable_locked(&bnapi->napi); if (bp->tph_mode) { bnxt_hwrm_cp_ring_free(bp, rxr->rx_cpr); From 7ac6ea4a3e0898db76aecccd68fb2c403eb7d24e Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 2 Apr 2025 14:17:51 +0200 Subject: [PATCH 30/56] ipv6: fix omitted netlink attributes when using RTEXT_FILTER_SKIP_STATS Using RTEXT_FILTER_SKIP_STATS is incorrectly skipping non-stats IPv6 netlink attributes on link dump. This causes issues on userspace tools, e.g iproute2 is not rendering address generation mode as it should due to missing netlink attribute. Move the filling of IFLA_INET6_STATS and IFLA_INET6_ICMP6STATS to a helper function guarded by a flag check to avoid hitting the same situation in the future. Fixes: d5566fd72ec1 ("rtnetlink: RTEXT_FILTER_SKIP_STATS support to avoid dumping inet/inet6 stats") Signed-off-by: Fernando Fernandez Mancera Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250402121751.3108-1-ffmancera@riseup.net Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ac8cc1076536..54a8ea004da2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5784,6 +5784,27 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, } } +static int inet6_fill_ifla6_stats_attrs(struct sk_buff *skb, + struct inet6_dev *idev) +{ + struct nlattr *nla; + + nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); + if (!nla) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); + + nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); + if (!nla) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, u32 ext_filter_mask) { @@ -5806,18 +5827,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, /* XXX - MC not implemented */ - if (ext_filter_mask & RTEXT_FILTER_SKIP_STATS) - return 0; - - nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); - if (!nla) - goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); - - nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); - if (!nla) - goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) { + if (inet6_fill_ifla6_stats_attrs(skb, idev) < 0) + goto nla_put_failure; + } nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); if (!nla) From 40eb4a0434cd90668fe376a92f18982b913c283b Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Tue, 1 Apr 2025 16:53:44 +0200 Subject: [PATCH 31/56] MAINTAINERS: Update Loic Poulain's email address Update Loic Poulain's email address to @oss.qualcomm.com. Signed-off-by: Loic Poulain Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250401145344.10669-1-loic.poulain@oss.qualcomm.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ff882416c445..a2830693af7a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19871,7 +19871,7 @@ F: Documentation/devicetree/bindings/i2c/qcom,i2c-geni-qcom.yaml F: drivers/i2c/busses/i2c-qcom-geni.c QUALCOMM I2C CCI DRIVER -M: Loic Poulain +M: Loic Poulain M: Robert Foss L: linux-i2c@vger.kernel.org L: linux-arm-msm@vger.kernel.org @@ -20004,7 +20004,7 @@ F: Documentation/devicetree/bindings/media/*venus* F: drivers/media/platform/qcom/venus/ QUALCOMM WCN36XX WIRELESS DRIVER -M: Loic Poulain +M: Loic Poulain L: wcn36xx@lists.infradead.org S: Supported W: https://wireless.wiki.kernel.org/en/users/Drivers/wcn36xx @@ -26043,7 +26043,7 @@ F: kernel/workqueue.c F: kernel/workqueue_internal.h WWAN DRIVERS -M: Loic Poulain +M: Loic Poulain M: Sergey Ryazanov R: Johannes Berg L: netdev@vger.kernel.org From a58d882841a0750da3c482cd3d82432b1c7edb77 Mon Sep 17 00:00:00 2001 From: David Oberhollenzer Date: Tue, 1 Apr 2025 15:56:37 +0200 Subject: [PATCH 32/56] net: dsa: mv88e6xxx: propperly shutdown PPU re-enable timer on destroy The mv88e6xxx has an internal PPU that polls PHY state. If we want to access the internal PHYs, we need to disable the PPU first. Because that is a slow operation, a 10ms timer is used to re-enable it, canceled with every access, so bulk operations effectively only disable it once and re-enable it some 10ms after the last access. If a PHY is accessed and then the mv88e6xxx module is removed before the 10ms are up, the PPU re-enable ends up accessing a dangling pointer. This especially affects probing during bootup. The MDIO bus and PHY registration may succeed, but registration with the DSA framework may fail later on (e.g. because the CPU port depends on another, very slow device that isn't done probing yet, returning -EPROBE_DEFER). In this case, probe() fails, but the MDIO subsystem may already have accessed the MIDO bus or PHYs, arming the timer. This is fixed as follows: - If probe fails after mv88e6xxx_phy_init(), make sure we also call mv88e6xxx_phy_destroy() before returning - In mv88e6xxx_remove(), make sure we do the teardown in the correct order, calling mv88e6xxx_phy_destroy() after unregistering the switch device. - In mv88e6xxx_phy_destroy(), destroy both the timer and the work item that the timer might schedule, synchronously waiting in case one of the callbacks already fired and destroying the timer first, before waiting for the work item. - Access to the PPU is guarded by a mutex, the worker acquires it with a mutex_trylock(), not proceeding with the expensive shutdown if that fails. We grab the mutex in mv88e6xxx_phy_destroy() to make sure the slow PPU shutdown is already done or won't even enter, when we wait for the work item. Fixes: 2e5f032095ff ("dsa: add support for the Marvell 88E6131 switch chip") Signed-off-by: David Oberhollenzer Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250401135705.92760-1-david.oberhollenzer@sigma-star.at Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 11 +++++++---- drivers/net/dsa/mv88e6xxx/phy.c | 3 +++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 901929f96b38..29a89ab4b789 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -7350,13 +7350,13 @@ static int mv88e6xxx_probe(struct mdio_device *mdiodev) err = mv88e6xxx_switch_reset(chip); mv88e6xxx_reg_unlock(chip); if (err) - goto out; + goto out_phy; if (np) { chip->irq = of_irq_get(np, 0); if (chip->irq == -EPROBE_DEFER) { err = chip->irq; - goto out; + goto out_phy; } } @@ -7375,7 +7375,7 @@ static int mv88e6xxx_probe(struct mdio_device *mdiodev) mv88e6xxx_reg_unlock(chip); if (err) - goto out; + goto out_phy; if (chip->info->g2_irqs > 0) { err = mv88e6xxx_g2_irq_setup(chip); @@ -7409,6 +7409,8 @@ static int mv88e6xxx_probe(struct mdio_device *mdiodev) mv88e6xxx_g1_irq_free(chip); else mv88e6xxx_irq_poll_free(chip); +out_phy: + mv88e6xxx_phy_destroy(chip); out: if (pdata) dev_put(pdata->netdev); @@ -7431,7 +7433,6 @@ static void mv88e6xxx_remove(struct mdio_device *mdiodev) mv88e6xxx_ptp_free(chip); } - mv88e6xxx_phy_destroy(chip); mv88e6xxx_unregister_switch(chip); mv88e6xxx_g1_vtu_prob_irq_free(chip); @@ -7444,6 +7445,8 @@ static void mv88e6xxx_remove(struct mdio_device *mdiodev) mv88e6xxx_g1_irq_free(chip); else mv88e6xxx_irq_poll_free(chip); + + mv88e6xxx_phy_destroy(chip); } static void mv88e6xxx_shutdown(struct mdio_device *mdiodev) diff --git a/drivers/net/dsa/mv88e6xxx/phy.c b/drivers/net/dsa/mv88e6xxx/phy.c index 8bb88b3d900d..ee9e5d7e5277 100644 --- a/drivers/net/dsa/mv88e6xxx/phy.c +++ b/drivers/net/dsa/mv88e6xxx/phy.c @@ -229,7 +229,10 @@ static void mv88e6xxx_phy_ppu_state_init(struct mv88e6xxx_chip *chip) static void mv88e6xxx_phy_ppu_state_destroy(struct mv88e6xxx_chip *chip) { + mutex_lock(&chip->ppu_mutex); del_timer_sync(&chip->ppu_timer); + cancel_work_sync(&chip->ppu_work); + mutex_unlock(&chip->ppu_mutex); } int mv88e6185_phy_ppu_read(struct mv88e6xxx_chip *chip, struct mii_bus *bus, From 09bccf56db36501ccb1935d921dc24451e9f57dd Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 1 Apr 2025 11:42:30 +0200 Subject: [PATCH 33/56] net: airoha: Validate egress gdm port in airoha_ppe_foe_entry_prepare() Dev pointer in airoha_ppe_foe_entry_prepare routine is not strictly a device allocated by airoha_eth driver since it is an egress device and the flowtable can contain even wlan, pppoe or vlan devices. E.g: flowtable ft { hook ingress priority filter devices = { eth1, lan1, lan2, lan3, lan4, wlan0 } flags offload ^ | "not allocated by airoha_eth" -- } In this case airoha_get_dsa_port() will just return the original device pointer and we can't assume netdev priv pointer points to an airoha_gdm_port struct. Fix the issue validating egress gdm port in airoha_ppe_foe_entry_prepare routine before accessing net_device priv pointer. Fixes: 00a7678310fe ("net: airoha: Introduce flowtable offload support") Signed-off-by: Lorenzo Bianconi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250401-airoha-validate-egress-gdm-port-v4-1-c7315d33ce10@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 13 +++++++++++++ drivers/net/ethernet/airoha/airoha_eth.h | 3 +++ drivers/net/ethernet/airoha/airoha_ppe.c | 8 ++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index 69e523dd4186..d748dc6de923 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -2454,6 +2454,19 @@ static void airoha_metadata_dst_free(struct airoha_gdm_port *port) } } +bool airoha_is_valid_gdm_port(struct airoha_eth *eth, + struct airoha_gdm_port *port) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(eth->ports); i++) { + if (eth->ports[i] == port) + return true; + } + + return false; +} + static int airoha_alloc_gdm_port(struct airoha_eth *eth, struct device_node *np, int index) { diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h index 60690b685710..ec8908f904c6 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.h +++ b/drivers/net/ethernet/airoha/airoha_eth.h @@ -532,6 +532,9 @@ u32 airoha_rmw(void __iomem *base, u32 offset, u32 mask, u32 val); #define airoha_qdma_clear(qdma, offset, val) \ airoha_rmw((qdma)->regs, (offset), (val), 0) +bool airoha_is_valid_gdm_port(struct airoha_eth *eth, + struct airoha_gdm_port *port); + void airoha_ppe_check_skb(struct airoha_ppe *ppe, u16 hash); int airoha_ppe_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv); diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 8b55e871352d..f10dab935cab 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -197,7 +197,8 @@ static int airoha_get_dsa_port(struct net_device **dev) #endif } -static int airoha_ppe_foe_entry_prepare(struct airoha_foe_entry *hwe, +static int airoha_ppe_foe_entry_prepare(struct airoha_eth *eth, + struct airoha_foe_entry *hwe, struct net_device *dev, int type, struct airoha_flow_data *data, int l4proto) @@ -225,6 +226,9 @@ static int airoha_ppe_foe_entry_prepare(struct airoha_foe_entry *hwe, struct airoha_gdm_port *port = netdev_priv(dev); u8 pse_port; + if (!airoha_is_valid_gdm_port(eth, port)) + return -EINVAL; + if (dsa_port >= 0) pse_port = port->id == 4 ? FE_PSE_PORT_GDM4 : port->id; else @@ -633,7 +637,7 @@ static int airoha_ppe_flow_offload_replace(struct airoha_gdm_port *port, !is_valid_ether_addr(data.eth.h_dest)) return -EINVAL; - err = airoha_ppe_foe_entry_prepare(&hwe, odev, offload_type, + err = airoha_ppe_foe_entry_prepare(eth, &hwe, odev, offload_type, &data, l4proto); if (err) return err; From d2ccd0560d9648a98ae0c6534588144044cff0a0 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:42 -0700 Subject: [PATCH 34/56] net: switch to netif_disable_lro in inetdev_init Cosmin reports the following deadlock: dump_stack_lvl+0x62/0x90 print_deadlock_bug+0x274/0x3b0 __lock_acquire+0x1229/0x2470 lock_acquire+0xb7/0x2b0 __mutex_lock+0xa6/0xd20 dev_disable_lro+0x20/0x80 inetdev_init+0x12f/0x1f0 inetdev_event+0x48b/0x870 notifier_call_chain+0x38/0xf0 netif_change_net_namespace+0x72e/0x9f0 do_setlink.isra.0+0xd5/0x1220 rtnl_newlink+0x7ea/0xb50 rtnetlink_rcv_msg+0x459/0x5e0 netlink_rcv_skb+0x54/0x100 netlink_unicast+0x193/0x270 netlink_sendmsg+0x204/0x450 Switch to netif_disable_lro which assumes the caller holds the instance lock. inetdev_init is called for blackhole device (which sw device and doesn't grab instance lock) and from REGISTER/UNREGISTER notifiers. We already hold the instance lock for REGISTER notifier during netns change and we'll soon hold the lock during other paths. Reviewed-by: Jakub Kicinski Reported-by: Cosmin Ratiu Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations") Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-2-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- net/ipv4/devinet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 754f60fb6e25..77e5705ac799 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -281,7 +281,7 @@ static struct in_device *inetdev_init(struct net_device *dev) if (!in_dev->arp_parms) goto out_kfree; if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) - dev_disable_lro(dev); + netif_disable_lro(dev); /* Reference in_dev->dev */ netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL); /* Account for reference dev->ip_ptr (below) */ From 4c975fd700022c90e61a46326e3444e08317876e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:43 -0700 Subject: [PATCH 35/56] net: hold instance lock during NETDEV_REGISTER/UP Callers of inetdev_init can come from several places with inconsistent expectation about netdev instance lock. Grab instance lock during REGISTER (plus UP). Also solve the inconsistency with UNREGISTER where it was locked only during move netns path. WARNING: CPU: 10 PID: 1479 at ./include/net/netdev_lock.h:54 __netdev_update_features+0x65f/0xca0 __warn+0x81/0x180 __netdev_update_features+0x65f/0xca0 report_bug+0x156/0x180 handle_bug+0x4f/0x90 exc_invalid_op+0x13/0x60 asm_exc_invalid_op+0x16/0x20 __netdev_update_features+0x65f/0xca0 netif_disable_lro+0x30/0x1d0 inetdev_init+0x12f/0x1f0 inetdev_event+0x48b/0x870 notifier_call_chain+0x38/0xf0 register_netdevice+0x741/0x8b0 register_netdev+0x1f/0x40 mlx5e_probe+0x4e3/0x8e0 [mlx5_core] auxiliary_bus_probe+0x3f/0x90 really_probe+0xc3/0x3a0 __driver_probe_device+0x80/0x150 driver_probe_device+0x1f/0x90 __device_attach_driver+0x7d/0x100 bus_for_each_drv+0x80/0xd0 __device_attach+0xb4/0x1c0 bus_probe_device+0x91/0xa0 device_add+0x657/0x870 Reviewed-by: Jakub Kicinski Reported-by: Cosmin Ratiu Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations") Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-3-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- net/core/dev.c | 12 +++++++++--- net/core/dev_api.c | 8 +------- net/core/rtnetlink.c | 8 ++++---- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fa79145518d1..cf3b6445817b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4192,7 +4192,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, int netif_set_alias(struct net_device *dev, const char *alias, size_t len); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); -int netif_change_net_namespace(struct net_device *dev, struct net *net, +int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, int new_ifindex, struct netlink_ext_ack *extack); int dev_change_net_namespace(struct net_device *dev, struct net *net, diff --git a/net/core/dev.c b/net/core/dev.c index 5d20ff226d5e..ce6612106e60 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1858,7 +1858,9 @@ static int call_netdevice_register_net_notifiers(struct notifier_block *nb, int err; for_each_netdev(net, dev) { + netdev_lock_ops(dev); err = call_netdevice_register_notifiers(nb, dev); + netdev_unlock_ops(dev); if (err) goto rollback; } @@ -11047,7 +11049,9 @@ int register_netdevice(struct net_device *dev) memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. */ + netdev_lock_ops(dev); ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); + netdev_unlock_ops(dev); ret = notifier_to_errno(ret); if (ret) { /* Expect explicit free_netdev() on failure */ @@ -12059,7 +12063,7 @@ void unregister_netdev(struct net_device *dev) } EXPORT_SYMBOL(unregister_netdev); -int netif_change_net_namespace(struct net_device *dev, struct net *net, +int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, int new_ifindex, struct netlink_ext_ack *extack) { @@ -12144,11 +12148,12 @@ int netif_change_net_namespace(struct net_device *dev, struct net *net, * And now a mini version of register_netdevice unregister_netdevice. */ + netdev_lock_ops(dev); /* If device is running close it first. */ netif_close(dev); - /* And unlink it from device chain */ unlist_netdevice(dev); + netdev_unlock_ops(dev); synchronize_net(); @@ -12210,11 +12215,12 @@ int netif_change_net_namespace(struct net_device *dev, struct net *net, err = netdev_change_owner(dev, net_old, net); WARN_ON(err); + netdev_lock_ops(dev); /* Add the device back in the hashes */ list_netdevice(dev); - /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); + netdev_unlock_ops(dev); /* * Prevent userspace races by waiting until the network diff --git a/net/core/dev_api.c b/net/core/dev_api.c index 8dbc60612100..90bafb0b1b8c 100644 --- a/net/core/dev_api.c +++ b/net/core/dev_api.c @@ -117,13 +117,7 @@ EXPORT_SYMBOL(dev_set_mac_address_user); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - int ret; - - netdev_lock_ops(dev); - ret = netif_change_net_namespace(dev, net, pat, 0, NULL); - netdev_unlock_ops(dev); - - return ret; + return __dev_change_net_namespace(dev, net, pat, 0, NULL); } EXPORT_SYMBOL_GPL(dev_change_net_namespace); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 334db17be37d..c23852835050 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3025,8 +3025,6 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, char ifname[IFNAMSIZ]; int err; - netdev_lock_ops(dev); - err = validate_linkmsg(dev, tb, extack); if (err < 0) goto errout; @@ -3042,14 +3040,16 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0); - err = netif_change_net_namespace(dev, tgt_net, pat, + err = __dev_change_net_namespace(dev, tgt_net, pat, new_ifindex, extack); if (err) - goto errout; + return err; status |= DO_SETLINK_MODIFIED; } + netdev_lock_ops(dev); + if (tb[IFLA_MAP]) { struct rtnl_link_ifmap *u_map; struct ifmap k_map; From 8965c160b8f7333df895321c8aa6bad4a7175f2b Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:44 -0700 Subject: [PATCH 36/56] net: use netif_disable_lro in ipv6_add_dev ipv6_add_dev might call dev_disable_lro which unconditionally grabs instance lock, so it will deadlock during NETDEV_REGISTER. Switch to netif_disable_lro. Make sure all callers hold the instance lock as well. Cc: Cosmin Ratiu Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations") Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-4-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/net/ip.h | 16 ++++++++-------- net/core/dev.c | 1 + net/ipv6/addrconf.c | 15 +++++++++++++-- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 8a48ade24620..47ed6d23853d 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -667,14 +667,6 @@ static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, memcpy(buf, &naddr, sizeof(naddr)); } -#if IS_MODULE(CONFIG_IPV6) -#define EXPORT_IPV6_MOD(X) EXPORT_SYMBOL(X) -#define EXPORT_IPV6_MOD_GPL(X) EXPORT_SYMBOL_GPL(X) -#else -#define EXPORT_IPV6_MOD(X) -#define EXPORT_IPV6_MOD_GPL(X) -#endif - #if IS_ENABLED(CONFIG_IPV6) #include #endif @@ -694,6 +686,14 @@ static __inline__ void inet_reset_saddr(struct sock *sk) #endif +#if IS_MODULE(CONFIG_IPV6) +#define EXPORT_IPV6_MOD(X) EXPORT_SYMBOL(X) +#define EXPORT_IPV6_MOD_GPL(X) EXPORT_SYMBOL_GPL(X) +#else +#define EXPORT_IPV6_MOD(X) +#define EXPORT_IPV6_MOD_GPL(X) +#endif + static inline unsigned int ipv4_addr_hash(__be32 ip) { return (__force unsigned int) ip; diff --git a/net/core/dev.c b/net/core/dev.c index ce6612106e60..0608605cfc24 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1771,6 +1771,7 @@ void netif_disable_lro(struct net_device *dev) netdev_unlock_ops(lower_dev); } } +EXPORT_IPV6_MOD(netif_disable_lro); /** * dev_disable_gro_hw - disable HW Generic Receive Offload on a device diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 54a8ea004da2..c3b908fccbc1 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include #include @@ -377,6 +378,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) int err = -ENOMEM; ASSERT_RTNL(); + netdev_ops_assert_locked(dev); if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev) return ERR_PTR(-EINVAL); @@ -402,7 +404,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) return ERR_PTR(err); } if (ndev->cnf.forwarding) - dev_disable_lro(dev); + netif_disable_lro(dev); /* We refer to the device */ netdev_hold(dev, &ndev->dev_tracker, GFP_KERNEL); @@ -3152,10 +3154,12 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) rtnl_net_lock(net); dev = __dev_get_by_index(net, ireq.ifr6_ifindex); + netdev_lock_ops(dev); if (dev) err = inet6_addr_add(net, dev, &cfg, 0, 0, NULL); else err = -ENODEV; + netdev_unlock_ops(dev); rtnl_net_unlock(net); return err; } @@ -5026,9 +5030,10 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (!dev) { NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface"); err = -ENODEV; - goto unlock; + goto unlock_rtnl; } + netdev_lock_ops(dev); idev = ipv6_find_idev(dev); if (IS_ERR(idev)) { err = PTR_ERR(idev); @@ -5065,6 +5070,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, in6_ifa_put(ifa); unlock: + netdev_unlock_ops(dev); +unlock_rtnl: rtnl_net_unlock(net); return err; @@ -6516,7 +6523,9 @@ static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write, if (idev->cnf.addr_gen_mode != new_val) { WRITE_ONCE(idev->cnf.addr_gen_mode, new_val); + netdev_lock_ops(idev->dev); addrconf_init_auto_addrs(idev->dev); + netdev_unlock_ops(idev->dev); } } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) { struct net_device *dev; @@ -6528,7 +6537,9 @@ static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write, idev->cnf.addr_gen_mode != new_val) { WRITE_ONCE(idev->cnf.addr_gen_mode, new_val); + netdev_lock_ops(idev->dev); addrconf_init_auto_addrs(idev->dev); + netdev_unlock_ops(idev->dev); } } } From b912d599d3d83ff9a2db58c17b5c76429a206db5 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:45 -0700 Subject: [PATCH 37/56] net: rename rtnl_net_debug to lock_debug And make it selected by CONFIG_DEBUG_NET. Don't rename any of the structs/functions. Next patch will use rtnl_net_debug_event in netdevsim. Reviewed-by: Jakub Kicinski Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-5-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- net/core/Makefile | 2 +- net/core/{rtnl_net_debug.c => lock_debug.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename net/core/{rtnl_net_debug.c => lock_debug.c} (100%) diff --git a/net/core/Makefile b/net/core/Makefile index a10c3bd96798..b2a76ce33932 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -45,5 +45,5 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o obj-$(CONFIG_NET_TEST) += net_test.o obj-$(CONFIG_NET_DEVMEM) += devmem.o -obj-$(CONFIG_DEBUG_NET_SMALL_RTNL) += rtnl_net_debug.o +obj-$(CONFIG_DEBUG_NET) += lock_debug.o obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o diff --git a/net/core/rtnl_net_debug.c b/net/core/lock_debug.c similarity index 100% rename from net/core/rtnl_net_debug.c rename to net/core/lock_debug.c From 1901066aab7654f4a225ac29354a564d891d0c1a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:46 -0700 Subject: [PATCH 38/56] netdevsim: add dummy device notifiers In order to exercise and verify notifiers' locking assumptions, register dummy notifiers (via register_netdevice_notifier_dev_net). Share notifier event handler that enforces the assumptions with lock_debug.c (rename and export rtnl_net_debug_event as netdev_debug_event). Add ops lock asserts to netdev_debug_event. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-6-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/netdev.c | 13 +++++++++++++ drivers/net/netdevsim/netdevsim.h | 3 +++ include/net/netdev_lock.h | 3 +++ net/core/lock_debug.c | 14 +++++++++----- 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index b67af4651185..ddda0c1e7a6d 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -939,6 +939,7 @@ static int nsim_init_netdevsim(struct netdevsim *ns) ns->netdev->netdev_ops = &nsim_netdev_ops; ns->netdev->stat_ops = &nsim_stat_ops; ns->netdev->queue_mgmt_ops = &nsim_queue_mgmt_ops; + netdev_lockdep_set_classes(ns->netdev); err = nsim_udp_tunnels_info_create(ns->nsim_dev, ns->netdev); if (err) @@ -960,6 +961,14 @@ static int nsim_init_netdevsim(struct netdevsim *ns) if (err) goto err_ipsec_teardown; rtnl_unlock(); + + if (IS_ENABLED(CONFIG_DEBUG_NET)) { + ns->nb.notifier_call = netdev_debug_event; + if (register_netdevice_notifier_dev_net(ns->netdev, &ns->nb, + &ns->nn)) + ns->nb.notifier_call = NULL; + } + return 0; err_ipsec_teardown: @@ -1043,6 +1052,10 @@ void nsim_destroy(struct netdevsim *ns) debugfs_remove(ns->qr_dfs); debugfs_remove(ns->pp_dfs); + if (ns->nb.notifier_call) + unregister_netdevice_notifier_dev_net(ns->netdev, &ns->nb, + &ns->nn); + rtnl_lock(); peer = rtnl_dereference(ns->peer); if (peer) diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index 665020d18f29..d04401f0bdf7 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -144,6 +144,9 @@ struct netdevsim { struct nsim_ethtool ethtool; struct netdevsim __rcu *peer; + + struct notifier_block nb; + struct netdev_net_notifier nn; }; struct netdevsim * diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index 1c0c9a94cc22..c316b551df8d 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -98,4 +98,7 @@ static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, &qdisc_xmit_lock_key); \ } +int netdev_debug_event(struct notifier_block *nb, unsigned long event, + void *ptr); + #endif diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c index f3272b09c255..b7f22dc92a6f 100644 --- a/net/core/lock_debug.c +++ b/net/core/lock_debug.c @@ -6,10 +6,11 @@ #include #include #include +#include #include -static int rtnl_net_debug_event(struct notifier_block *nb, - unsigned long event, void *ptr) +int netdev_debug_event(struct notifier_block *nb, unsigned long event, + void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); @@ -17,11 +18,13 @@ static int rtnl_net_debug_event(struct notifier_block *nb, /* Keep enum and don't add default to trigger -Werror=switch */ switch (cmd) { + case NETDEV_REGISTER: case NETDEV_UP: + netdev_ops_assert_locked(dev); + fallthrough; case NETDEV_DOWN: case NETDEV_REBOOT: case NETDEV_CHANGE: - case NETDEV_REGISTER: case NETDEV_UNREGISTER: case NETDEV_CHANGEMTU: case NETDEV_CHANGEADDR: @@ -66,6 +69,7 @@ static int rtnl_net_debug_event(struct notifier_block *nb, return NOTIFY_DONE; } +EXPORT_SYMBOL_NS_GPL(netdev_debug_event, "NETDEV_INTERNAL"); static int rtnl_net_debug_net_id; @@ -74,7 +78,7 @@ static int __net_init rtnl_net_debug_net_init(struct net *net) struct notifier_block *nb; nb = net_generic(net, rtnl_net_debug_net_id); - nb->notifier_call = rtnl_net_debug_event; + nb->notifier_call = netdev_debug_event; return register_netdevice_notifier_net(net, nb); } @@ -95,7 +99,7 @@ static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = { }; static struct notifier_block rtnl_net_debug_block = { - .notifier_call = rtnl_net_debug_event, + .notifier_call = netdev_debug_event, }; static int __init rtnl_net_debug_init(void) From dbfc99495d960134bfe1a4f13849fb0d5373b42c Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:47 -0700 Subject: [PATCH 39/56] net: dummy: request ops lock Even though dummy device doesn't really need an instance lock, a lot of selftests use dummy so it's useful to have extra expose to the instance lock on NIPA. Request the instance/ops locking. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-7-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- drivers/net/dummy.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index a4938c6a5ebb..d6bdad4baadd 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -105,6 +105,7 @@ static void dummy_setup(struct net_device *dev) dev->netdev_ops = &dummy_netdev_ops; dev->ethtool_ops = &dummy_ethtool_ops; dev->needs_free_netdev = true; + dev->request_ops_lock = true; /* Fill in device structure with ethernet-generic values. */ dev->flags |= IFF_NOARP; From ee705fa21fdc0c7da98309e5c3c8ab72b99ef087 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:48 -0700 Subject: [PATCH 40/56] docs: net: document netdev notifier expectations We don't have a consistent state yet, but document where we think we are and where we wanna be. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-8-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- Documentation/networking/netdevices.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index ebb868f50ac2..6c2d8945f597 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -343,6 +343,29 @@ there are two sets of interfaces: ``dev_xxx`` and ``netif_xxx`` (e.g., acquiring the instance lock themselves, while the ``netif_xxx`` functions assume that the driver has already acquired the instance lock. +Notifiers and netdev instance lock +================================== + +For device drivers that implement shaping or queue management APIs, +some of the notifiers (``enum netdev_cmd``) are running under the netdev +instance lock. + +For devices with locked ops, currently only the following notifiers are +running under the lock: +* ``NETDEV_REGISTER`` +* ``NETDEV_UP`` + +The following notifiers are running without the lock: +* ``NETDEV_UNREGISTER`` + +There are no clear expectations for the remaining notifiers. Notifiers not on +the list may run with or without the instance lock, potentially even invoking +the same notifier type with and without the lock from different code paths. +The goal is to eventually ensure that all (or most, with a few documented +exceptions) notifiers run under the instance lock. Please extend this +documentation whenever you make explicit assumption about lock being held +from a notifier. + NETDEV_INTERNAL symbol namespace ================================ From 56c8a23f8a0f3c735f3b8f9d323927904090049b Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 1 Apr 2025 09:34:49 -0700 Subject: [PATCH 41/56] selftests: net: use netdevsim in netns test Netdevsim has extra register_netdevice_notifier_dev_net notifiers, use netdevim instead of dummy device to test them out. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250401163452.622454-9-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib.sh | 25 +++++++++++++++++++++++ tools/testing/selftests/net/netns-name.sh | 13 ++++++++---- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 975be4fdbcdb..701905eeff66 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -222,6 +222,31 @@ setup_ns() NS_LIST+=("${ns_list[@]}") } +# Create netdevsim with given id and net namespace. +create_netdevsim() { + local id="$1" + local ns="$2" + + modprobe netdevsim &> /dev/null + udevadm settle + + echo "$id 1" | ip netns exec $ns tee /sys/bus/netdevsim/new_device >/dev/null + local dev=$(ip netns exec $ns ls /sys/bus/netdevsim/devices/netdevsim$id/net) + ip -netns $ns link set dev $dev name nsim$id + ip -netns $ns link set dev nsim$id up + + echo nsim$id +} + +# Remove netdevsim with given id. +cleanup_netdevsim() { + local id="$1" + + if [ -d "/sys/bus/netdevsim/devices/netdevsim$id/net" ]; then + echo "$id" > /sys/bus/netdevsim/del_device + fi +} + tc_rule_stats_get() { local dev=$1; shift diff --git a/tools/testing/selftests/net/netns-name.sh b/tools/testing/selftests/net/netns-name.sh index 0be1905d1f2f..38871bdef67f 100755 --- a/tools/testing/selftests/net/netns-name.sh +++ b/tools/testing/selftests/net/netns-name.sh @@ -7,10 +7,12 @@ set -o pipefail DEV=dummy-dev0 DEV2=dummy-dev1 ALT_NAME=some-alt-name +NSIM_ADDR=2025 RET_CODE=0 cleanup() { + cleanup_netdevsim $NSIM_ADDR cleanup_ns $NS $test_ns } @@ -25,12 +27,15 @@ setup_ns NS test_ns # # Test basic move without a rename +# Use netdevsim because it has extra asserts for notifiers. # -ip -netns $NS link add name $DEV type dummy || fail -ip -netns $NS link set dev $DEV netns $test_ns || + +nsim=$(create_netdevsim $NSIM_ADDR $NS) +ip -netns $NS link set dev $nsim netns $test_ns || fail "Can't perform a netns move" -ip -netns $test_ns link show dev $DEV >> /dev/null || fail "Device not found after move" -ip -netns $test_ns link del $DEV || fail +ip -netns $test_ns link show dev $nsim >> /dev/null || + fail "Device not found after move" +cleanup_netdevsim $NSIM_ADDR # # Test move with a conflict From c0f21784bca558d03d48b5ba68fac2f7070f516b Mon Sep 17 00:00:00 2001 From: David Wei Date: Wed, 2 Apr 2025 10:24:14 -0700 Subject: [PATCH 42/56] io_uring/zcrx: fix selftests w/ updated netdev Python helpers Fix io_uring zero copy rx selftest with updated netdev Python helpers. Signed-off-by: David Wei Link: https://patch.msgid.link/20250402172414.895276-1-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/iou-zcrx.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index d301d9b356f7..9f271ab6ec04 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -27,7 +27,7 @@ def _set_flow_rule(cfg, chan): def test_zcrx(cfg) -> None: - cfg.require_v6() + cfg.require_ipver('6') combined_chans = _get_combined_channels(cfg) if combined_chans < 2: @@ -40,7 +40,7 @@ def test_zcrx(cfg) -> None: flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1}" - tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_v6} -p 9999 -l 12840" + tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_addr_v['6']} -p 9999 -l 12840" with bkg(rx_cmd, host=cfg.remote, exit_wait=True): wait_port_listen(9999, proto="tcp", host=cfg.remote) cmd(tx_cmd) @@ -51,7 +51,7 @@ def test_zcrx(cfg) -> None: def test_zcrx_oneshot(cfg) -> None: - cfg.require_v6() + cfg.require_ipver('6') combined_chans = _get_combined_channels(cfg) if combined_chans < 2: @@ -64,7 +64,7 @@ def test_zcrx_oneshot(cfg) -> None: flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1} -o 4" - tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_v6} -p 9999 -l 4096 -z 16384" + tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_addr_v['6']} -p 9999 -l 4096 -z 16384" with bkg(rx_cmd, host=cfg.remote, exit_wait=True): wait_port_listen(9999, proto="tcp", host=cfg.remote) cmd(tx_cmd) From b27055a08ad4b415dcf15b63034f9cb236f7fb40 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Thu, 3 Apr 2025 00:56:32 +0800 Subject: [PATCH 43/56] net: fix geneve_opt length integer overflow struct geneve_opt uses 5 bit length for each single option, which means every vary size option should be smaller than 128 bytes. However, all current related Netlink policies cannot promise this length condition and the attacker can exploit a exact 128-byte size option to *fake* a zero length option and confuse the parsing logic, further achieve heap out-of-bounds read. One example crash log is like below: [ 3.905425] ================================================================== [ 3.905925] BUG: KASAN: slab-out-of-bounds in nla_put+0xa9/0xe0 [ 3.906255] Read of size 124 at addr ffff888005f291cc by task poc/177 [ 3.906646] [ 3.906775] CPU: 0 PID: 177 Comm: poc-oob-read Not tainted 6.1.132 #1 [ 3.907131] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 [ 3.907784] Call Trace: [ 3.907925] [ 3.908048] dump_stack_lvl+0x44/0x5c [ 3.908258] print_report+0x184/0x4be [ 3.909151] kasan_report+0xc5/0x100 [ 3.909539] kasan_check_range+0xf3/0x1a0 [ 3.909794] memcpy+0x1f/0x60 [ 3.909968] nla_put+0xa9/0xe0 [ 3.910147] tunnel_key_dump+0x945/0xba0 [ 3.911536] tcf_action_dump_1+0x1c1/0x340 [ 3.912436] tcf_action_dump+0x101/0x180 [ 3.912689] tcf_exts_dump+0x164/0x1e0 [ 3.912905] fw_dump+0x18b/0x2d0 [ 3.913483] tcf_fill_node+0x2ee/0x460 [ 3.914778] tfilter_notify+0xf4/0x180 [ 3.915208] tc_new_tfilter+0xd51/0x10d0 [ 3.918615] rtnetlink_rcv_msg+0x4a2/0x560 [ 3.919118] netlink_rcv_skb+0xcd/0x200 [ 3.919787] netlink_unicast+0x395/0x530 [ 3.921032] netlink_sendmsg+0x3d0/0x6d0 [ 3.921987] __sock_sendmsg+0x99/0xa0 [ 3.922220] __sys_sendto+0x1b7/0x240 [ 3.922682] __x64_sys_sendto+0x72/0x90 [ 3.922906] do_syscall_64+0x5e/0x90 [ 3.923814] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 [ 3.924122] RIP: 0033:0x7e83eab84407 [ 3.924331] Code: 48 89 fa 4c 89 df e8 38 aa 00 00 8b 93 08 03 00 00 59 5e 48 83 f8 fc 74 1a 5b c3 0f 1f 84 00 00 00 00 00 48 8b 44 24 10 0f 05 <5b> c3 0f 1f 80 00 00 00 00 83 e2 39 83 faf [ 3.925330] RSP: 002b:00007ffff505e370 EFLAGS: 00000202 ORIG_RAX: 000000000000002c [ 3.925752] RAX: ffffffffffffffda RBX: 00007e83eaafa740 RCX: 00007e83eab84407 [ 3.926173] RDX: 00000000000001a8 RSI: 00007ffff505e3c0 RDI: 0000000000000003 [ 3.926587] RBP: 00007ffff505f460 R08: 00007e83eace1000 R09: 000000000000000c [ 3.926977] R10: 0000000000000000 R11: 0000000000000202 R12: 00007ffff505f3c0 [ 3.927367] R13: 00007ffff505f5c8 R14: 00007e83ead1b000 R15: 00005d4fbbe6dcb8 Fix these issues by enforing correct length condition in related policies. Fixes: 925d844696d9 ("netfilter: nft_tunnel: add support for geneve opts") Fixes: 4ece47787077 ("lwtunnel: add options setting and dumping for geneve") Fixes: 0ed5269f9e41 ("net/sched: add tunnel option support to act_tunnel_key") Fixes: 0a6e77784f49 ("net/sched: allow flower to match tunnel options") Signed-off-by: Lin Ma Reviewed-by: Xin Long Acked-by: Cong Wang Link: https://patch.msgid.link/20250402165632.6958-1-linma@zju.edu.cn Signed-off-by: Jakub Kicinski --- net/ipv4/ip_tunnel_core.c | 2 +- net/netfilter/nft_tunnel.c | 2 +- net/sched/act_tunnel_key.c | 2 +- net/sched/cls_flower.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 364ea798511e..f65d2f727381 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -451,7 +451,7 @@ static const struct nla_policy geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = { [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, - [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, + [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 }, }; static const struct nla_policy diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 681301b46aa4..ec7089ab752c 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -335,7 +335,7 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = { [NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 }, [NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 }, - [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, + [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 }, }; static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index ae5dea7c48a8..2cef4b08befb 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -68,7 +68,7 @@ geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, - .len = 128 }, + .len = 127 }, }; static const struct nla_policy diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 03505673d523..099ff6a3e1f5 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -766,7 +766,7 @@ geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, - .len = 128 }, + .len = 127 }, }; static const struct nla_policy From 2a8377720a0a30fa133f7773e901598f7d563f36 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 1 Apr 2025 11:02:12 +0200 Subject: [PATCH 44/56] net: octeontx2: Handle XDP_ABORTED and XDP invalid as XDP_DROP In the current implementation octeontx2 manages XDP_ABORTED and XDP invalid as XDP_PASS forwarding the skb to the networking stack. Align the behaviour to other XDP drivers handling XDP_ABORTED and XDP invalid as XDP_DROP. Please note this patch has just compile tested. Fixes: 06059a1a9a4a5 ("octeontx2-pf: Add XDP support to netdev PF") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250401-octeontx2-xdp-abort-fix-v1-1-f0587c35a0b9@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index af8cabe828d0..0a6bb346ba45 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -1559,12 +1559,11 @@ static bool otx2_xdp_rcv_pkt_handler(struct otx2_nic *pfvf, break; default: bpf_warn_invalid_xdp_action(pfvf->netdev, prog, act); - break; + fallthrough; case XDP_ABORTED: - if (xsk_buff) - xsk_buff_free(xsk_buff); - trace_xdp_exception(pfvf->netdev, prog, act); - break; + if (act == XDP_ABORTED) + trace_xdp_exception(pfvf->netdev, prog, act); + fallthrough; case XDP_DROP: cq->pool_ptrs++; if (xsk_buff) { From 51de3600093429e3b712e5f091d767babc5dd6df Mon Sep 17 00:00:00 2001 From: Ying Lu Date: Wed, 2 Apr 2025 16:58:59 +0800 Subject: [PATCH 45/56] usbnet:fix NPE during rx_complete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Missing usbnet_going_away Check in Critical Path. The usb_submit_urb function lacks a usbnet_going_away validation, whereas __usbnet_queue_skb includes this check. This inconsistency creates a race condition where: A URB request may succeed, but the corresponding SKB data fails to be queued. Subsequent processes: (e.g., rx_complete → defer_bh → __skb_unlink(skb, list)) attempt to access skb->next, triggering a NULL pointer dereference (Kernel Panic). Fixes: 04e906839a05 ("usbnet: fix cyclical race on disconnect with work queue") Cc: stable@vger.kernel.org Signed-off-by: Ying Lu Link: https://patch.msgid.link/4c9ef2efaa07eb7f9a5042b74348a67e5a3a7aea.1743584159.git.luying1@xiaomi.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/usbnet.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index aeab2308b150..724b93aa4f7e 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -530,7 +530,8 @@ static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags) netif_device_present (dev->net) && test_bit(EVENT_DEV_OPEN, &dev->flags) && !test_bit (EVENT_RX_HALT, &dev->flags) && - !test_bit (EVENT_DEV_ASLEEP, &dev->flags)) { + !test_bit (EVENT_DEV_ASLEEP, &dev->flags) && + !usbnet_going_away(dev)) { switch (retval = usb_submit_urb (urb, GFP_ATOMIC)) { case -EPIPE: usbnet_defer_kevent (dev, EVENT_RX_HALT); @@ -551,8 +552,7 @@ static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags) tasklet_schedule (&dev->bh); break; case 0: - if (!usbnet_going_away(dev)) - __usbnet_queue_skb(&dev->rxq, skb, rx_start); + __usbnet_queue_skb(&dev->rxq, skb, rx_start); } } else { netif_dbg(dev, ifdown, dev->net, "rx: stopped\n"); From 4d0ab3a6885e3e9040310a8d8f54503366083626 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 2 Apr 2025 14:42:23 +0300 Subject: [PATCH 46/56] ipv6: Start path selection from the first nexthop Cited commit transitioned IPv6 path selection to use hash-threshold instead of modulo-N. With hash-threshold, each nexthop is assigned a region boundary in the multipath hash function's output space and a nexthop is chosen if the calculated hash is smaller than the nexthop's region boundary. Hash-threshold does not work correctly if path selection does not start with the first nexthop. For example, if fib6_select_path() is always passed the last nexthop in the group, then it will always be chosen because its region boundary covers the entire hash function's output space. Fix this by starting the selection process from the first nexthop and do not consider nexthops for which rt6_score_route() provided a negative score. Fixes: 3d709f69a3e7 ("ipv6: Use hash-threshold instead of modulo-N") Reported-by: Stanislav Fomichev Closes: https://lore.kernel.org/netdev/Z9RIyKZDNoka53EO@mini-arch/ Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250402114224.293392-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c3406a0d45bd..864f0002034b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -412,11 +412,35 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } +static struct fib6_info * +rt6_multipath_first_sibling_rcu(const struct fib6_info *rt) +{ + struct fib6_info *iter; + struct fib6_node *fn; + + fn = rcu_dereference(rt->fib6_node); + if (!fn) + goto out; + iter = rcu_dereference(fn->leaf); + if (!iter) + goto out; + + while (iter) { + if (iter->fib6_metric == rt->fib6_metric && + rt6_qualify_for_ecmp(iter)) + return iter; + iter = rcu_dereference(iter->fib6_next); + } + +out: + return NULL; +} + void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { - struct fib6_info *match = res->f6i; + struct fib6_info *first, *match = res->f6i; struct fib6_info *sibling; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) @@ -440,10 +464,18 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, return; } - if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) + first = rt6_multipath_first_sibling_rcu(match); + if (!first) goto out; - list_for_each_entry_rcu(sibling, &match->fib6_siblings, + if (fl6->mp_hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound) && + rt6_score_route(first->fib6_nh, first->fib6_flags, oif, + strict) >= 0) { + match = first; + goto out; + } + + list_for_each_entry_rcu(sibling, &first->fib6_siblings, fib6_siblings) { const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; From 8b8e0dd357165e0258d9f9cdab5366720ed2f619 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 2 Apr 2025 14:42:24 +0300 Subject: [PATCH 47/56] ipv6: Do not consider link down nexthops in path selection Nexthops whose link is down are not supposed to be considered during path selection when the "ignore_routes_with_linkdown" sysctl is set. This is done by assigning them a negative region boundary. However, when comparing the computed hash (unsigned) with the region boundary (signed), the negative region boundary is treated as unsigned, resulting in incorrect nexthop selection. Fix by treating the computed hash as signed. Note that the computed hash is always in range of [0, 2^31 - 1]. Fixes: 3d709f69a3e7 ("ipv6: Use hash-threshold instead of modulo-N") Signed-off-by: Ido Schimmel Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250402114224.293392-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 864f0002034b..ab12b816ab94 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -442,6 +442,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, { struct fib6_info *first, *match = res->f6i; struct fib6_info *sibling; + int hash; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out; @@ -468,7 +469,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, if (!first) goto out; - if (fl6->mp_hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound) && + hash = fl6->mp_hash; + if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound) && rt6_score_route(first->fib6_nh, first->fib6_flags, oif, strict) >= 0) { match = first; @@ -481,7 +483,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); - if (fl6->mp_hash > nh_upper_bound) + if (hash > nh_upper_bound) continue; if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; From fda8c491db2a90ff3e6fbbae58e495b4ddddeca3 Mon Sep 17 00:00:00 2001 From: Henry Martin Date: Wed, 2 Apr 2025 21:50:36 +0800 Subject: [PATCH 48/56] arcnet: Add NULL check in com20020pci_probe() devm_kasprintf() returns NULL when memory allocation fails. Currently, com20020pci_probe() does not check for this case, which results in a NULL pointer dereference. Add NULL check after devm_kasprintf() to prevent this issue and ensure no resources are left allocated. Fixes: 6b17a597fc2f ("arcnet: restoring support for multiple Sohard Arcnet cards") Signed-off-by: Henry Martin Link: https://patch.msgid.link/20250402135036.44697-1-bsdhenrymartin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/arcnet/com20020-pci.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index c5e571ec94c9..0472bcdff130 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -251,18 +251,33 @@ static int com20020pci_probe(struct pci_dev *pdev, card->tx_led.default_trigger = devm_kasprintf(&pdev->dev, GFP_KERNEL, "arc%d-%d-tx", dev->dev_id, i); + if (!card->tx_led.default_trigger) { + ret = -ENOMEM; + goto err_free_arcdev; + } card->tx_led.name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "pci:green:tx:%d-%d", dev->dev_id, i); - + if (!card->tx_led.name) { + ret = -ENOMEM; + goto err_free_arcdev; + } card->tx_led.dev = &dev->dev; card->recon_led.brightness_set = led_recon_set; card->recon_led.default_trigger = devm_kasprintf(&pdev->dev, GFP_KERNEL, "arc%d-%d-recon", dev->dev_id, i); + if (!card->recon_led.default_trigger) { + ret = -ENOMEM; + goto err_free_arcdev; + } card->recon_led.name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "pci:red:recon:%d-%d", dev->dev_id, i); + if (!card->recon_led.name) { + ret = -ENOMEM; + goto err_free_arcdev; + } card->recon_led.dev = &dev->dev; ret = devm_led_classdev_register(&pdev->dev, &card->tx_led); From 053f3ff67d7feefc75797863f3d84b47ad47086f Mon Sep 17 00:00:00 2001 From: Dave Marquardt Date: Wed, 2 Apr 2025 10:44:03 -0500 Subject: [PATCH 49/56] net: ibmveth: make veth_pool_store stop hanging v2: - Created a single error handling unlock and exit in veth_pool_store - Greatly expanded commit message with previous explanatory-only text Summary: Use rtnl_mutex to synchronize veth_pool_store with itself, ibmveth_close and ibmveth_open, preventing multiple calls in a row to napi_disable. Background: Two (or more) threads could call veth_pool_store through writing to /sys/devices/vio/30000002/pool*/*. You can do this easily with a little shell script. This causes a hang. I configured LOCKDEP, compiled ibmveth.c with DEBUG, and built a new kernel. I ran this test again and saw: Setting pool0/active to 0 Setting pool1/active to 1 [ 73.911067][ T4365] ibmveth 30000002 eth0: close starting Setting pool1/active to 1 Setting pool1/active to 0 [ 73.911367][ T4366] ibmveth 30000002 eth0: close starting [ 73.916056][ T4365] ibmveth 30000002 eth0: close complete [ 73.916064][ T4365] ibmveth 30000002 eth0: open starting [ 110.808564][ T712] systemd-journald[712]: Sent WATCHDOG=1 notification. [ 230.808495][ T712] systemd-journald[712]: Sent WATCHDOG=1 notification. [ 243.683786][ T123] INFO: task stress.sh:4365 blocked for more than 122 seconds. [ 243.683827][ T123] Not tainted 6.14.0-01103-g2df0c02dab82-dirty #8 [ 243.683833][ T123] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 243.683838][ T123] task:stress.sh state:D stack:28096 pid:4365 tgid:4365 ppid:4364 task_flags:0x400040 flags:0x00042000 [ 243.683852][ T123] Call Trace: [ 243.683857][ T123] [c00000000c38f690] [0000000000000001] 0x1 (unreliable) [ 243.683868][ T123] [c00000000c38f840] [c00000000001f908] __switch_to+0x318/0x4e0 [ 243.683878][ T123] [c00000000c38f8a0] [c000000001549a70] __schedule+0x500/0x12a0 [ 243.683888][ T123] [c00000000c38f9a0] [c00000000154a878] schedule+0x68/0x210 [ 243.683896][ T123] [c00000000c38f9d0] [c00000000154ac80] schedule_preempt_disabled+0x30/0x50 [ 243.683904][ T123] [c00000000c38fa00] [c00000000154dbb0] __mutex_lock+0x730/0x10f0 [ 243.683913][ T123] [c00000000c38fb10] [c000000001154d40] napi_enable+0x30/0x60 [ 243.683921][ T123] [c00000000c38fb40] [c000000000f4ae94] ibmveth_open+0x68/0x5dc [ 243.683928][ T123] [c00000000c38fbe0] [c000000000f4aa20] veth_pool_store+0x220/0x270 [ 243.683936][ T123] [c00000000c38fc70] [c000000000826278] sysfs_kf_write+0x68/0xb0 [ 243.683944][ T123] [c00000000c38fcb0] [c0000000008240b8] kernfs_fop_write_iter+0x198/0x2d0 [ 243.683951][ T123] [c00000000c38fd00] [c00000000071b9ac] vfs_write+0x34c/0x650 [ 243.683958][ T123] [c00000000c38fdc0] [c00000000071bea8] ksys_write+0x88/0x150 [ 243.683966][ T123] [c00000000c38fe10] [c0000000000317f4] system_call_exception+0x124/0x340 [ 243.683973][ T123] [c00000000c38fe50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec ... [ 243.684087][ T123] Showing all locks held in the system: [ 243.684095][ T123] 1 lock held by khungtaskd/123: [ 243.684099][ T123] #0: c00000000278e370 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x50/0x248 [ 243.684114][ T123] 4 locks held by stress.sh/4365: [ 243.684119][ T123] #0: c00000003a4cd3f8 (sb_writers#3){.+.+}-{0:0}, at: ksys_write+0x88/0x150 [ 243.684132][ T123] #1: c000000041aea888 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x154/0x2d0 [ 243.684143][ T123] #2: c0000000366fb9a8 (kn->active#64){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x160/0x2d0 [ 243.684155][ T123] #3: c000000035ff4cb8 (&dev->lock){+.+.}-{3:3}, at: napi_enable+0x30/0x60 [ 243.684166][ T123] 5 locks held by stress.sh/4366: [ 243.684170][ T123] #0: c00000003a4cd3f8 (sb_writers#3){.+.+}-{0:0}, at: ksys_write+0x88/0x150 [ 243.684183][ T123] #1: c00000000aee2288 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x154/0x2d0 [ 243.684194][ T123] #2: c0000000366f4ba8 (kn->active#64){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x160/0x2d0 [ 243.684205][ T123] #3: c000000035ff4cb8 (&dev->lock){+.+.}-{3:3}, at: napi_disable+0x30/0x60 [ 243.684216][ T123] #4: c0000003ff9bbf18 (&rq->__lock){-.-.}-{2:2}, at: __schedule+0x138/0x12a0 From the ibmveth debug, two threads are calling veth_pool_store, which calls ibmveth_close and ibmveth_open. Here's the sequence: T4365 T4366 ----------------- ----------------- --------- veth_pool_store veth_pool_store ibmveth_close ibmveth_close napi_disable napi_disable ibmveth_open napi_enable <- HANG ibmveth_close calls napi_disable at the top and ibmveth_open calls napi_enable at the top. https://docs.kernel.org/networking/napi.html]] says The control APIs are not idempotent. Control API calls are safe against concurrent use of datapath APIs but an incorrect sequence of control API calls may result in crashes, deadlocks, or race conditions. For example, calling napi_disable() multiple times in a row will deadlock. In the normal open and close paths, rtnl_mutex is acquired to prevent other callers. This is missing from veth_pool_store. Use rtnl_mutex in veth_pool_store fixes these hangs. Signed-off-by: Dave Marquardt Fixes: 860f242eb534 ("[PATCH] ibmveth change buffer pools dynamically") Reviewed-by: Nick Child Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250402154403.386744-1-davemarq@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmveth.c | 39 +++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index b619a3ec245b..04192190beba 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -1802,18 +1802,22 @@ static ssize_t veth_pool_store(struct kobject *kobj, struct attribute *attr, long value = simple_strtol(buf, NULL, 10); long rc; + rtnl_lock(); + if (attr == &veth_active_attr) { if (value && !pool->active) { if (netif_running(netdev)) { if (ibmveth_alloc_buffer_pool(pool)) { netdev_err(netdev, "unable to alloc pool\n"); - return -ENOMEM; + rc = -ENOMEM; + goto unlock_err; } pool->active = 1; ibmveth_close(netdev); - if ((rc = ibmveth_open(netdev))) - return rc; + rc = ibmveth_open(netdev); + if (rc) + goto unlock_err; } else { pool->active = 1; } @@ -1833,48 +1837,59 @@ static ssize_t veth_pool_store(struct kobject *kobj, struct attribute *attr, if (i == IBMVETH_NUM_BUFF_POOLS) { netdev_err(netdev, "no active pool >= MTU\n"); - return -EPERM; + rc = -EPERM; + goto unlock_err; } if (netif_running(netdev)) { ibmveth_close(netdev); pool->active = 0; - if ((rc = ibmveth_open(netdev))) - return rc; + rc = ibmveth_open(netdev); + if (rc) + goto unlock_err; } pool->active = 0; } } else if (attr == &veth_num_attr) { if (value <= 0 || value > IBMVETH_MAX_POOL_COUNT) { - return -EINVAL; + rc = -EINVAL; + goto unlock_err; } else { if (netif_running(netdev)) { ibmveth_close(netdev); pool->size = value; - if ((rc = ibmveth_open(netdev))) - return rc; + rc = ibmveth_open(netdev); + if (rc) + goto unlock_err; } else { pool->size = value; } } } else if (attr == &veth_size_attr) { if (value <= IBMVETH_BUFF_OH || value > IBMVETH_MAX_BUF_SIZE) { - return -EINVAL; + rc = -EINVAL; + goto unlock_err; } else { if (netif_running(netdev)) { ibmveth_close(netdev); pool->buff_size = value; - if ((rc = ibmveth_open(netdev))) - return rc; + rc = ibmveth_open(netdev); + if (rc) + goto unlock_err; } else { pool->buff_size = value; } } } + rtnl_unlock(); /* kick the interrupt handler to allocate/deallocate pools */ ibmveth_interrupt(netdev->irq, netdev); return count; + +unlock_err: + rtnl_unlock(); + return rc; } From ec304b70d46bd2ed66541c5b57b63276529e05b1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:34:04 -0700 Subject: [PATCH 50/56] net: move mp dev config validation to __net_mp_open_rxq() devmem code performs a number of safety checks to avoid having to reimplement all of them in the drivers. Move those to __net_mp_open_rxq() and reuse that function for binding to make sure that io_uring ZC also benefits from them. While at it rename the queue ID variable to rxq_idx in __net_mp_open_rxq(), we touch most of the relevant lines. The XArray insertion is reordered after the netdev_rx_queue_restart() call, otherwise we'd need to duplicate the queue index check or risk inserting an invalid pointer. The XArray allocation failures should be extremely rare. Reviewed-by: Mina Almasry Acked-by: Stanislav Fomichev Fixes: 6e18ed929d3b ("net: add helpers for setting a memory provider on an rx queue") Link: https://patch.msgid.link/20250403013405.2827250-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 6 +++ net/core/devmem.c | 50 +++++-------------------- net/core/netdev-genl.c | 6 --- net/core/netdev_rx_queue.c | 49 ++++++++++++++++++------ 4 files changed, 54 insertions(+), 57 deletions(-) diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index b3e665897767..ada4f968960a 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -6,6 +6,7 @@ #include struct netdev_rx_queue; +struct netlink_ext_ack; struct sk_buff; struct memory_provider_ops { @@ -24,8 +25,13 @@ void net_mp_niov_clear_page_pool(struct net_iov *niov); int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *p); +int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack); void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, struct pp_memory_provider_params *old_p); +void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *old_p); /** * net_mp_netmem_place_in_cache() - give a netmem to a page pool diff --git a/net/core/devmem.c b/net/core/devmem.c index ee145a2aa41c..f2ce3c2ebc97 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -8,7 +8,6 @@ */ #include -#include #include #include #include @@ -143,57 +142,28 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack) { + struct pp_memory_provider_params mp_params = { + .mp_priv = binding, + .mp_ops = &dmabuf_devmem_ops, + }; struct netdev_rx_queue *rxq; u32 xa_idx; int err; - if (rxq_idx >= dev->real_num_rx_queues) { - NL_SET_ERR_MSG(extack, "rx queue index out of range"); - return -ERANGE; - } - - if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { - NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); - return -EINVAL; - } - - if (dev->cfg->hds_thresh) { - NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); - return -EINVAL; - } + err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack); + if (err) + return err; rxq = __netif_get_rx_queue(dev, rxq_idx); - if (rxq->mp_params.mp_ops) { - NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); - return -EEXIST; - } - -#ifdef CONFIG_XDP_SOCKETS - if (rxq->pool) { - NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); - return -EBUSY; - } -#endif - err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b, GFP_KERNEL); if (err) - return err; - - rxq->mp_params.mp_priv = binding; - rxq->mp_params.mp_ops = &dmabuf_devmem_ops; - - err = netdev_rx_queue_restart(dev, rxq_idx); - if (err) - goto err_xa_erase; + goto err_close_rxq; return 0; -err_xa_erase: - rxq->mp_params.mp_priv = NULL; - rxq->mp_params.mp_ops = NULL; - xa_erase(&binding->bound_rxqs, xa_idx); - +err_close_rxq: + __net_mp_close_rxq(dev, rxq_idx, &mp_params); return err; } diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 3afeaa8c5dc5..5d7af50fe702 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -874,12 +874,6 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock; } - if (dev_xdp_prog_count(netdev)) { - NL_SET_ERR_MSG(info->extack, "unable to bind dmabuf to device with XDP program attached"); - err = -EEXIST; - goto err_unlock; - } - binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 3af716f77a13..556b5393ec9f 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include #include #include #include @@ -86,8 +87,9 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) } EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); -static int __net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, - struct pp_memory_provider_params *p) +int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack) { struct netdev_rx_queue *rxq; int ret; @@ -95,16 +97,41 @@ static int __net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, if (!netdev_need_ops_lock(dev)) return -EOPNOTSUPP; - if (ifq_idx >= dev->real_num_rx_queues) + if (rxq_idx >= dev->real_num_rx_queues) return -EINVAL; - ifq_idx = array_index_nospec(ifq_idx, dev->real_num_rx_queues); + rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); - rxq = __netif_get_rx_queue(dev, ifq_idx); - if (rxq->mp_params.mp_ops) + if (rxq_idx >= dev->real_num_rx_queues) { + NL_SET_ERR_MSG(extack, "rx queue index out of range"); + return -ERANGE; + } + if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { + NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); + return -EINVAL; + } + if (dev->cfg->hds_thresh) { + NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); + return -EINVAL; + } + if (dev_xdp_prog_count(dev)) { + NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached"); return -EEXIST; + } + + rxq = __netif_get_rx_queue(dev, rxq_idx); + if (rxq->mp_params.mp_ops) { + NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); + return -EEXIST; + } +#ifdef CONFIG_XDP_SOCKETS + if (rxq->pool) { + NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); + return -EBUSY; + } +#endif rxq->mp_params = *p; - ret = netdev_rx_queue_restart(dev, ifq_idx); + ret = netdev_rx_queue_restart(dev, rxq_idx); if (ret) { rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; @@ -112,19 +139,19 @@ static int __net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, return ret; } -int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, +int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, struct pp_memory_provider_params *p) { int ret; netdev_lock(dev); - ret = __net_mp_open_rxq(dev, ifq_idx, p); + ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL); netdev_unlock(dev); return ret; } -static void __net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, - struct pp_memory_provider_params *old_p) +void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, + const struct pp_memory_provider_params *old_p) { struct netdev_rx_queue *rxq; From 34f71de3f548eba0604c9cbabc1eb68b2f81fa0f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:34:05 -0700 Subject: [PATCH 51/56] net: avoid false positive warnings in __net_mp_close_rxq() Commit under Fixes solved the problem of spurious warnings when we uninstall an MP from a device while its down. The __net_mp_close_rxq() which is used by io_uring was not fixed. Move the fix over and reuse __net_mp_close_rxq() in the devmem path. Acked-by: Stanislav Fomichev Fixes: a70f891e0fa0 ("net: devmem: do not WARN conditionally after netdev_rx_queue_restart()") Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250403013405.2827250-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/devmem.c | 12 +++++------- net/core/netdev_rx_queue.c | 4 +++- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/net/core/devmem.c b/net/core/devmem.c index f2ce3c2ebc97..6e27a47d0493 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -116,21 +116,19 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) struct netdev_rx_queue *rxq; unsigned long xa_idx; unsigned int rxq_idx; - int err; if (binding->list.next) list_del(&binding->list); xa_for_each(&binding->bound_rxqs, xa_idx, rxq) { - WARN_ON(rxq->mp_params.mp_priv != binding); - - rxq->mp_params.mp_priv = NULL; - rxq->mp_params.mp_ops = NULL; + const struct pp_memory_provider_params mp_params = { + .mp_priv = binding, + .mp_ops = &dmabuf_devmem_ops, + }; rxq_idx = get_netdev_rx_queue_index(rxq); - err = netdev_rx_queue_restart(binding->dev, rxq_idx); - WARN_ON(err && err != -ENETDOWN); + __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); } xa_erase(&net_devmem_dmabuf_bindings, binding->id); diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 556b5393ec9f..d126f10197bf 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -154,6 +154,7 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, const struct pp_memory_provider_params *old_p) { struct netdev_rx_queue *rxq; + int err; if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) return; @@ -173,7 +174,8 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; - WARN_ON(netdev_rx_queue_restart(dev, ifq_idx)); + err = netdev_rx_queue_restart(dev, ifq_idx); + WARN_ON(err && err != -ENETDOWN); } void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, From 0802c32d4b03a26604c2db2c8a63b34a80361305 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:37:03 -0700 Subject: [PATCH 52/56] netlink: specs: rt_addr: fix the spec format / schema failures The spec is mis-formatted, schema validation says: Failed validating 'type' in schema['properties']['operations']['properties']['list']['items']['properties']['dump']['properties']['request']['properties']['value']: {'minimum': 0, 'type': 'integer'} On instance['operations']['list'][3]['dump']['request']['value']: '58 - ifa-family' The ifa-family clearly wants to be part of an attribute list. Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Reviewed-by: Yuyang Huang Fixes: 4f280376e531 ("selftests/net: Add selftest for IPv4 RTM_GETMULTICAST support") Link: https://patch.msgid.link/20250403013706.2828322-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_addr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/netlink/specs/rt_addr.yaml b/Documentation/netlink/specs/rt_addr.yaml index 5dd5469044c7..3bc9b6f9087e 100644 --- a/Documentation/netlink/specs/rt_addr.yaml +++ b/Documentation/netlink/specs/rt_addr.yaml @@ -187,6 +187,7 @@ operations: dump: request: value: 58 + attributes: - ifa-family reply: value: 58 From 524c03585fda36584cc7ada49a1827666d37eb4e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:37:04 -0700 Subject: [PATCH 53/56] netlink: specs: rt_addr: fix get multi command name Command names should match C defines, codegens may depend on it. Reviewed-by: Jacob Keller Fixes: 4f280376e531 ("selftests/net: Add selftest for IPv4 RTM_GETMULTICAST support") Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250403013706.2828322-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_addr.yaml | 2 +- tools/testing/selftests/net/rtnetlink.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/rt_addr.yaml b/Documentation/netlink/specs/rt_addr.yaml index 3bc9b6f9087e..1650dc3f091a 100644 --- a/Documentation/netlink/specs/rt_addr.yaml +++ b/Documentation/netlink/specs/rt_addr.yaml @@ -169,7 +169,7 @@ operations: value: 20 attributes: *ifaddr-all - - name: getmaddrs + name: getmulticast doc: Get / dump IPv4/IPv6 multicast addresses. attribute-set: addr-attrs fixed-header: ifaddrmsg diff --git a/tools/testing/selftests/net/rtnetlink.py b/tools/testing/selftests/net/rtnetlink.py index 80950888800b..69436415d56e 100755 --- a/tools/testing/selftests/net/rtnetlink.py +++ b/tools/testing/selftests/net/rtnetlink.py @@ -12,7 +12,7 @@ def dump_mcaddr_check(rtnl: RtnlAddrFamily) -> None: At least the loopback interface should have this address. """ - addresses = rtnl.getmaddrs({"ifa-family": socket.AF_INET}, dump=True) + addresses = rtnl.getmulticast({"ifa-family": socket.AF_INET}, dump=True) all_host_multicasts = [ addr for addr in addresses if addr['ifa-multicast'] == IPV4_ALL_HOSTS_MULTICAST From 0c8e30252d9fe8127f90b7a0a293872b368ebf3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:37:05 -0700 Subject: [PATCH 54/56] netlink: specs: rt_addr: pull the ifa- prefix out of the names YAML specs don't normally include the C prefix name in the name of the YAML attr. Remove the ifa- prefix from all attributes in addr-attrs and specify name-prefix instead. This is a bit risky, hopefully there aren't many users out there. Fixes: dfb0f7d9d979 ("doc/netlink: Add spec for rt addr messages") Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250403013706.2828322-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_addr.yaml | 39 ++++++++++++------------ tools/testing/selftests/net/rtnetlink.py | 2 +- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/Documentation/netlink/specs/rt_addr.yaml b/Documentation/netlink/specs/rt_addr.yaml index 1650dc3f091a..df6b23f06a22 100644 --- a/Documentation/netlink/specs/rt_addr.yaml +++ b/Documentation/netlink/specs/rt_addr.yaml @@ -78,45 +78,46 @@ definitions: attribute-sets: - name: addr-attrs + name-prefix: ifa- attributes: - - name: ifa-address + name: address type: binary display-hint: ipv4 - - name: ifa-local + name: local type: binary display-hint: ipv4 - - name: ifa-label + name: label type: string - - name: ifa-broadcast + name: broadcast type: binary display-hint: ipv4 - - name: ifa-anycast + name: anycast type: binary - - name: ifa-cacheinfo + name: cacheinfo type: binary struct: ifa-cacheinfo - - name: ifa-multicast + name: multicast type: binary - - name: ifa-flags + name: flags type: u32 enum: ifa-flags enum-as-flags: true - - name: ifa-rt-priority + name: rt-priority type: u32 - - name: ifa-target-netnsid + name: target-netnsid type: binary - - name: ifa-proto + name: proto type: u8 @@ -137,10 +138,10 @@ operations: - ifa-prefixlen - ifa-scope - ifa-index - - ifa-address - - ifa-label - - ifa-local - - ifa-cacheinfo + - address + - label + - local + - cacheinfo - name: deladdr doc: Remove address @@ -154,8 +155,8 @@ operations: - ifa-prefixlen - ifa-scope - ifa-index - - ifa-address - - ifa-local + - address + - local - name: getaddr doc: Dump address information. @@ -182,8 +183,8 @@ operations: reply: value: 58 attributes: &mcaddr-attrs - - ifa-multicast - - ifa-cacheinfo + - multicast + - cacheinfo dump: request: value: 58 diff --git a/tools/testing/selftests/net/rtnetlink.py b/tools/testing/selftests/net/rtnetlink.py index 69436415d56e..e9ad5e88da97 100755 --- a/tools/testing/selftests/net/rtnetlink.py +++ b/tools/testing/selftests/net/rtnetlink.py @@ -15,7 +15,7 @@ def dump_mcaddr_check(rtnl: RtnlAddrFamily) -> None: addresses = rtnl.getmulticast({"ifa-family": socket.AF_INET}, dump=True) all_host_multicasts = [ - addr for addr in addresses if addr['ifa-multicast'] == IPV4_ALL_HOSTS_MULTICAST + addr for addr in addresses if addr['multicast'] == IPV4_ALL_HOSTS_MULTICAST ] ksft_ge(len(all_host_multicasts), 1, From 1a1eba0e9899c286914032c78708c614b016704b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 18:37:06 -0700 Subject: [PATCH 55/56] netlink: specs: rt_route: pull the ifa- prefix out of the names YAML specs don't normally include the C prefix name in the name of the YAML attr. Remove the ifa- prefix from all attributes in route-attrs and metrics and specify name-prefix instead. This is a bit risky, hopefully there aren't many users out there. Fixes: 023289b4f582 ("doc/netlink: Add spec for rt route messages") Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250403013706.2828322-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_route.yaml | 180 +++++++++++----------- 1 file changed, 91 insertions(+), 89 deletions(-) diff --git a/Documentation/netlink/specs/rt_route.yaml b/Documentation/netlink/specs/rt_route.yaml index a674103e5bc4..292469c7d4b9 100644 --- a/Documentation/netlink/specs/rt_route.yaml +++ b/Documentation/netlink/specs/rt_route.yaml @@ -80,165 +80,167 @@ definitions: attribute-sets: - name: route-attrs + name-prefix: rta- attributes: - - name: rta-dst + name: dst type: binary display-hint: ipv4 - - name: rta-src + name: src type: binary display-hint: ipv4 - - name: rta-iif + name: iif type: u32 - - name: rta-oif + name: oif type: u32 - - name: rta-gateway + name: gateway type: binary display-hint: ipv4 - - name: rta-priority + name: priority type: u32 - - name: rta-prefsrc + name: prefsrc type: binary display-hint: ipv4 - - name: rta-metrics + name: metrics type: nest - nested-attributes: rta-metrics + nested-attributes: metrics - - name: rta-multipath + name: multipath type: binary - - name: rta-protoinfo # not used + name: protoinfo # not used type: binary - - name: rta-flow + name: flow type: u32 - - name: rta-cacheinfo + name: cacheinfo type: binary struct: rta-cacheinfo - - name: rta-session # not used + name: session # not used type: binary - - name: rta-mp-algo # not used + name: mp-algo # not used type: binary - - name: rta-table + name: table type: u32 - - name: rta-mark + name: mark type: u32 - - name: rta-mfc-stats + name: mfc-stats type: binary - - name: rta-via + name: via type: binary - - name: rta-newdst + name: newdst type: binary - - name: rta-pref + name: pref type: u8 - - name: rta-encap-type + name: encap-type type: u16 - - name: rta-encap + name: encap type: binary # tunnel specific nest - - name: rta-expires + name: expires type: u32 - - name: rta-pad + name: pad type: binary - - name: rta-uid + name: uid type: u32 - - name: rta-ttl-propagate + name: ttl-propagate type: u8 - - name: rta-ip-proto + name: ip-proto type: u8 - - name: rta-sport + name: sport type: u16 - - name: rta-dport + name: dport type: u16 - - name: rta-nh-id + name: nh-id type: u32 - - name: rta-flowlabel + name: flowlabel type: u32 byte-order: big-endian display-hint: hex - - name: rta-metrics + name: metrics + name-prefix: rtax- attributes: - - name: rtax-unspec + name: unspec type: unused value: 0 - - name: rtax-lock + name: lock type: u32 - - name: rtax-mtu + name: mtu type: u32 - - name: rtax-window + name: window type: u32 - - name: rtax-rtt + name: rtt type: u32 - - name: rtax-rttvar + name: rttvar type: u32 - - name: rtax-ssthresh + name: ssthresh type: u32 - - name: rtax-cwnd + name: cwnd type: u32 - - name: rtax-advmss + name: advmss type: u32 - - name: rtax-reordering + name: reordering type: u32 - - name: rtax-hoplimit + name: hoplimit type: u32 - - name: rtax-initcwnd + name: initcwnd type: u32 - - name: rtax-features + name: features type: u32 - - name: rtax-rto-min + name: rto-min type: u32 - - name: rtax-initrwnd + name: initrwnd type: u32 - - name: rtax-quickack + name: quickack type: u32 - - name: rtax-cc-algo + name: cc-algo type: string - - name: rtax-fastopen-no-cookie + name: fastopen-no-cookie type: u32 operations: @@ -254,18 +256,18 @@ operations: value: 26 attributes: - rtm-family - - rta-src + - src - rtm-src-len - - rta-dst + - dst - rtm-dst-len - - rta-iif - - rta-oif - - rta-ip-proto - - rta-sport - - rta-dport - - rta-mark - - rta-uid - - rta-flowlabel + - iif + - oif + - ip-proto + - sport + - dport + - mark + - uid + - flowlabel reply: value: 24 attributes: &all-route-attrs @@ -278,34 +280,34 @@ operations: - rtm-scope - rtm-type - rtm-flags - - rta-dst - - rta-src - - rta-iif - - rta-oif - - rta-gateway - - rta-priority - - rta-prefsrc - - rta-metrics - - rta-multipath - - rta-flow - - rta-cacheinfo - - rta-table - - rta-mark - - rta-mfc-stats - - rta-via - - rta-newdst - - rta-pref - - rta-encap-type - - rta-encap - - rta-expires - - rta-pad - - rta-uid - - rta-ttl-propagate - - rta-ip-proto - - rta-sport - - rta-dport - - rta-nh-id - - rta-flowlabel + - dst + - src + - iif + - oif + - gateway + - priority + - prefsrc + - metrics + - multipath + - flow + - cacheinfo + - table + - mark + - mfc-stats + - via + - newdst + - pref + - encap-type + - encap + - expires + - pad + - uid + - ttl-propagate + - ip-proto + - sport + - dport + - nh-id + - flowlabel dump: request: value: 26 From 94f68c0f99a548d33a102672690100bf76a7c460 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Apr 2025 07:56:36 -0700 Subject: [PATCH 56/56] selftests: net: amt: indicate progress in the stress test Our CI expects output from the test at least once every 10 minutes. The AMT test when running on debug kernel is just on the edge of that time for the stress test. Improve the output: - print the name of the test first, before starting it, - output a dot every 10% of the way. Output after: TEST: amt discovery [ OK ] TEST: IPv4 amt multicast forwarding [ OK ] TEST: IPv6 amt multicast forwarding [ OK ] TEST: IPv4 amt traffic forwarding torture .......... [ OK ] TEST: IPv6 amt traffic forwarding torture .......... [ OK ] Reviewed-by: Taehee Yoo Link: https://patch.msgid.link/20250403145636.2891166-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/amt.sh | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/amt.sh b/tools/testing/selftests/net/amt.sh index d458b45c775b..3ef209cacb8e 100755 --- a/tools/testing/selftests/net/amt.sh +++ b/tools/testing/selftests/net/amt.sh @@ -194,15 +194,21 @@ test_remote_ip() send_mcast_torture4() { - ip netns exec "${SOURCE}" bash -c \ - 'cat /dev/urandom | head -c 1G | nc -w 1 -u 239.0.0.1 4001' + for i in `seq 10`; do + ip netns exec "${SOURCE}" bash -c \ + 'cat /dev/urandom | head -c 100M | nc -w 1 -u 239.0.0.1 4001' + echo -n "." + done } send_mcast_torture6() { - ip netns exec "${SOURCE}" bash -c \ - 'cat /dev/urandom | head -c 1G | nc -w 1 -u ff0e::5:6 6001' + for i in `seq 10`; do + ip netns exec "${SOURCE}" bash -c \ + 'cat /dev/urandom | head -c 100M | nc -w 1 -u ff0e::5:6 6001' + echo -n "." + done } check_features() @@ -278,10 +284,12 @@ wait $pid || err=$? if [ $err -eq 1 ]; then ERR=1 fi +printf "TEST: %-50s" "IPv4 amt traffic forwarding torture" send_mcast_torture4 -printf "TEST: %-60s [ OK ]\n" "IPv4 amt traffic forwarding torture" +printf " [ OK ]\n" +printf "TEST: %-50s" "IPv6 amt traffic forwarding torture" send_mcast_torture6 -printf "TEST: %-60s [ OK ]\n" "IPv6 amt traffic forwarding torture" +printf " [ OK ]\n" sleep 5 if [ "${ERR}" -eq 1 ]; then echo "Some tests failed." >&2