From 34d5a86ff7bbe225fba3ad91f9b4dc85fb408e18 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Wed, 15 Jan 2025 14:43:35 +0000 Subject: [PATCH 01/14] net: phy: realtek: clear 1000Base-T lpa if link is down Only read 1000Base-T link partner advertisement if autonegotiation has completed and otherwise clear 1000Base-T link partner advertisement bits. This fixes bogus 1000Base-T link partner advertisement after link goes down (e.g. by disconnecting the wire). Fixes: 5cb409b3960e ("net: phy: realtek: clear 1000Base-T link partner advertisement") Signed-off-by: Daniel Golle Reviewed-by: Michal Swiatkowski Signed-off-by: David S. Miller --- drivers/net/phy/realtek.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index f65d7f1f348e7..26b324ab0f90f 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -1023,23 +1023,20 @@ static int rtl822x_c45_read_status(struct phy_device *phydev) { int ret, val; - ret = genphy_c45_read_status(phydev); - if (ret < 0) - return ret; - - if (phydev->autoneg == AUTONEG_DISABLE || - !genphy_c45_aneg_done(phydev)) - mii_stat1000_mod_linkmode_lpa_t(phydev->lp_advertising, 0); - /* Vendor register as C45 has no standardized support for 1000BaseT */ - if (phydev->autoneg == AUTONEG_ENABLE) { + if (phydev->autoneg == AUTONEG_ENABLE && genphy_c45_aneg_done(phydev)) { val = phy_read_mmd(phydev, MDIO_MMD_VEND2, RTL822X_VND2_GANLPAR); if (val < 0) return val; - - mii_stat1000_mod_linkmode_lpa_t(phydev->lp_advertising, val); + } else { + val = 0; } + mii_stat1000_mod_linkmode_lpa_t(phydev->lp_advertising, val); + + ret = genphy_c45_read_status(phydev); + if (ret < 0) + return ret; if (!phydev->link) return 0; From ea8318cb33e593bbfc59d637eae45a69732c5387 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Wed, 15 Jan 2025 14:43:43 +0000 Subject: [PATCH 02/14] net: phy: realtek: clear master_slave_state if link is down rtlgen_decode_physr() which 
sets master_slave_state isn't called in case the link is down and, unlike rtlgen_read_status(), rtl822x_c45_read_status() doesn't implicitly clear master_slave_state. Avoid stale master_slave_state by always setting it to MASTER_SLAVE_STATE_UNKNOWN in rtl822x_c45_read_status() in case the link is down. Fixes: 081c9c0265c9 ("net: phy: realtek: read duplex and gbit master from PHYSR register") Signed-off-by: Daniel Golle Reviewed-by: Michal Swiatkowski Signed-off-by: David S. Miller --- drivers/net/phy/realtek.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index 26b324ab0f90f..93704abb67878 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -1038,8 +1038,10 @@ static int rtl822x_c45_read_status(struct phy_device *phydev) if (ret < 0) return ret; - if (!phydev->link) + if (!phydev->link) { + phydev->master_slave_state = MASTER_SLAVE_STATE_UNKNOWN; return 0; + } /* Read actual speed from vendor register. */ val = phy_read_mmd(phydev, MDIO_MMD_VEND2, RTL_VND2_PHYSR); From d3eb58549842c60ed46f37da7f4da969e3d6ecd3 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Wed, 15 Jan 2025 14:45:00 +0000 Subject: [PATCH 03/14] net: phy: realtek: always clear NBase-T lpa Clear NBase-T link partner advertisement before calling rtlgen_read_status() to avoid phy_resolve_aneg_linkmode() wrongly setting speed and duplex. This fixes bogus 2.5G/5G/10G link partner advertisement and thus speed and duplex being set by phy_resolve_aneg_linkmode() due to stale NBase-T lpa. Fixes: 68d5cd09e891 ("net: phy: realtek: change order of calls in C22 read_status()") Signed-off-by: Daniel Golle Reviewed-by: Michal Swiatkowski Signed-off-by: David S. 
Miller --- drivers/net/phy/realtek.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index 93704abb67878..9cefca1aefa1b 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -952,15 +952,15 @@ static int rtl822x_read_status(struct phy_device *phydev) { int lpadv, ret; + mii_10gbt_stat_mod_linkmode_lpa_t(phydev->lp_advertising, 0); + ret = rtlgen_read_status(phydev); if (ret < 0) return ret; if (phydev->autoneg == AUTONEG_DISABLE || - !phydev->autoneg_complete) { - mii_10gbt_stat_mod_linkmode_lpa_t(phydev->lp_advertising, 0); + !phydev->autoneg_complete) return 0; - } lpadv = phy_read_paged(phydev, 0xa5d, 0x13); if (lpadv < 0) From d1f9f79fa2af8e3b45cffdeef66e05833480148a Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Thu, 16 Jan 2025 22:30:53 +0800 Subject: [PATCH 04/14] net: let net.core.dev_weight always be non-zero The following problem was encountered during stability test: (NULL net_device): NAPI poll function process_backlog+0x0/0x530 \ returned 1, exceeding its budget of 0. ------------[ cut here ]------------ list_add double add: new=ffff88905f746f48, prev=ffff88905f746f48, \ next=ffff88905f746e40. WARNING: CPU: 18 PID: 5462 at lib/list_debug.c:35 \ __list_add_valid_or_report+0xf3/0x130 CPU: 18 UID: 0 PID: 5462 Comm: ping Kdump: loaded Not tainted 6.13.0-rc7+ RIP: 0010:__list_add_valid_or_report+0xf3/0x130 Call Trace: ? __warn+0xcd/0x250 ? 
__list_add_valid_or_report+0xf3/0x130 enqueue_to_backlog+0x923/0x1070 netif_rx_internal+0x92/0x2b0 __netif_rx+0x15/0x170 loopback_xmit+0x2ef/0x450 dev_hard_start_xmit+0x103/0x490 __dev_queue_xmit+0xeac/0x1950 ip_finish_output2+0x6cc/0x1620 ip_output+0x161/0x270 ip_push_pending_frames+0x155/0x1a0 raw_sendmsg+0xe13/0x1550 __sys_sendto+0x3bf/0x4e0 __x64_sys_sendto+0xdc/0x1b0 do_syscall_64+0x5b/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e The reproduction command is as follows: sysctl -w net.core.dev_weight=0 ping 127.0.0.1 This is because when the napi's weight is set to 0, process_backlog() may return 0 and clear the NAPI_STATE_SCHED bit of napi->state, causing this napi to be re-polled in net_rx_action() until __do_softirq() times out. Since the NAPI_STATE_SCHED bit has been cleared, napi_schedule_rps() can be retriggered in enqueue_to_backlog(), causing this issue. Making the napi's weight always non-zero solves this problem. Triggering this issue requires system-wide admin (setting is not namespaced). 
Fixes: e38766054509 ("[NET]: Fix sysctl net.core.dev_weight") Fixes: 3d48b53fb2ae ("net: dev_weight: TX/RX orthogonality") Signed-off-by: Liu Jian Link: https://patch.msgid.link/20250116143053.4146855-1-liujian56@huawei.com Signed-off-by: Jakub Kicinski --- net/core/sysctl_net_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cb8d32e5c14e6..ad2741f1346af 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -319,7 +319,7 @@ static int proc_do_dev_weight(const struct ctl_table *table, int write, int ret, weight; mutex_lock(&dev_weight_mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); @@ -412,6 +412,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_rx_bias", @@ -419,6 +420,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_tx_bias", @@ -426,6 +428,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "netdev_max_backlog", From 70d81f25cc92cc4e914516c9935ae752f27d78ad Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Thu, 16 Jan 2025 14:33:16 +0200 Subject: [PATCH 05/14] net/mlxfw: Drop hard coded max FW flash image size Currently, mlxfw kernel module limits FW flash image size to be 10MB at most, preventing the ability to burn recent BlueField-3 FW that exceeds the said size limit. Thus, drop the hard coded limit. 
Instead, rely on FW's max_component_size threshold that is reported in MCQI register as the size limit for FW image. Fixes: 410ed13cae39 ("Add the mlxfw module for Mellanox firmware flash process") Signed-off-by: Maher Sanalla Signed-off-by: Moshe Shemesh Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/1737030796-1441634-1-git-send-email-moshe@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c index 46245e0b24623..43c84900369a3 100644 --- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c @@ -14,7 +14,6 @@ #define MLXFW_FSM_STATE_WAIT_TIMEOUT_MS 30000 #define MLXFW_FSM_STATE_WAIT_ROUNDS \ (MLXFW_FSM_STATE_WAIT_TIMEOUT_MS / MLXFW_FSM_STATE_WAIT_CYCLE_MS) -#define MLXFW_FSM_MAX_COMPONENT_SIZE (10 * (1 << 20)) static const int mlxfw_fsm_state_errno[] = { [MLXFW_FSM_STATE_ERR_ERROR] = -EIO, @@ -229,7 +228,6 @@ static int mlxfw_flash_component(struct mlxfw_dev *mlxfw_dev, return err; } - comp_max_size = min_t(u32, comp_max_size, MLXFW_FSM_MAX_COMPONENT_SIZE); if (comp->data_size > comp_max_size) { MLXFW_ERR_MSG(mlxfw_dev, extack, "Component size is bigger than limit", -EINVAL); From 1f64255bb76c11d0c41a7d81d7cec68e49d5362d Mon Sep 17 00:00:00 2001 From: Shinas Rasheed Date: Fri, 17 Jan 2025 01:46:50 -0800 Subject: [PATCH 06/14] octeon_ep: remove firmware stats fetch in ndo_get_stats64 The firmware stats fetch call that happens in ndo_get_stats64() is currently not required, and causes a warning to issue. The warn log is given below: [ 123.316837] ------------[ cut here ]------------ [ 123.316840] Voluntary context switch within RCU read-side critical section! 
[ 123.316917] pc : rcu_note_context_switch+0x2e4/0x300 [ 123.316919] lr : rcu_note_context_switch+0x2e4/0x300 [ 123.316947] Call trace: [ 123.316949] rcu_note_context_switch+0x2e4/0x300 [ 123.316952] __schedule+0x84/0x584 [ 123.316955] schedule+0x38/0x90 [ 123.316956] schedule_timeout+0xa0/0x1d4 [ 123.316959] octep_send_mbox_req+0x190/0x230 [octeon_ep] [ 123.316966] octep_ctrl_net_get_if_stats+0x78/0x100 [octeon_ep] [ 123.316970] octep_get_stats64+0xd4/0xf0 [octeon_ep] [ 123.316975] dev_get_stats+0x4c/0x114 [ 123.316977] dev_seq_printf_stats+0x3c/0x11c [ 123.316980] dev_seq_show+0x1c/0x40 [ 123.316982] seq_read_iter+0x3cc/0x4e0 [ 123.316985] seq_read+0xc8/0x110 [ 123.316987] proc_reg_read+0x9c/0xec [ 123.316990] vfs_read+0xc8/0x2ec [ 123.316993] ksys_read+0x70/0x100 [ 123.316995] __arm64_sys_read+0x20/0x30 [ 123.316997] invoke_syscall.constprop.0+0x7c/0xd0 [ 123.317000] do_el0_svc+0xb4/0xd0 [ 123.317002] el0_svc+0xe8/0x1f4 [ 123.317005] el0t_64_sync_handler+0x134/0x150 [ 123.317006] el0t_64_sync+0x17c/0x180 [ 123.317008] ---[ end trace 63399811432ab69b ]--- Fixes: 6a610a46bad1 ("octeon_ep: add support for ndo ops") Signed-off-by: Shinas Rasheed Link: https://patch.msgid.link/20250117094653.2588578-2-srasheed@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeon_ep/octep_main.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index 549436efc2048..730aa5632ccee 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -995,12 +995,6 @@ static void octep_get_stats64(struct net_device *netdev, struct octep_device *oct = netdev_priv(netdev); int q; - if (netif_running(netdev)) - octep_ctrl_net_get_if_stats(oct, - OCTEP_CTRL_NET_INVALID_VFID, - &oct->iface_rx_stats, - &oct->iface_tx_stats); - tx_packets = 0; tx_bytes = 0; rx_packets = 0; @@ -1018,10 +1012,6 @@ 
static void octep_get_stats64(struct net_device *netdev, stats->tx_bytes = tx_bytes; stats->rx_packets = rx_packets; stats->rx_bytes = rx_bytes; - stats->multicast = oct->iface_rx_stats.mcast_pkts; - stats->rx_errors = oct->iface_rx_stats.err_pkts; - stats->collisions = oct->iface_tx_stats.xscol; - stats->tx_fifo_errors = oct->iface_tx_stats.undflw; } /** From 10fad79846e49f67ad1a0a05910837125c6ca9ad Mon Sep 17 00:00:00 2001 From: Shinas Rasheed Date: Fri, 17 Jan 2025 01:46:51 -0800 Subject: [PATCH 07/14] octeon_ep: update tx/rx stats locally for persistence Update tx/rx stats locally, so that ndo_get_stats64() can use that and not rely on per queue resources to obtain statistics. The latter used to cause race conditions when the device stopped. Signed-off-by: Shinas Rasheed Link: https://patch.msgid.link/20250117094653.2588578-3-srasheed@marvell.com Signed-off-by: Jakub Kicinski --- .../marvell/octeon_ep/octep_ethtool.c | 41 ++++++++----------- .../ethernet/marvell/octeon_ep/octep_main.c | 19 ++++----- .../ethernet/marvell/octeon_ep/octep_main.h | 6 +++ .../net/ethernet/marvell/octeon_ep/octep_rx.c | 11 ++--- .../net/ethernet/marvell/octeon_ep/octep_rx.h | 4 +- .../net/ethernet/marvell/octeon_ep/octep_tx.c | 7 ++-- .../net/ethernet/marvell/octeon_ep/octep_tx.h | 4 +- 7 files changed, 45 insertions(+), 47 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_ethtool.c b/drivers/net/ethernet/marvell/octeon_ep/octep_ethtool.c index 4f4d581891188..a88c006ea65b7 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_ethtool.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_ethtool.c @@ -150,17 +150,14 @@ octep_get_ethtool_stats(struct net_device *netdev, iface_rx_stats, iface_tx_stats); - for (q = 0; q < oct->num_oqs; q++) { - struct octep_iq *iq = oct->iq[q]; - struct octep_oq *oq = oct->oq[q]; - - tx_packets += iq->stats.instr_completed; - tx_bytes += iq->stats.bytes_sent; - tx_busy_errors += iq->stats.tx_busy; - - rx_packets += 
oq->stats.packets; - rx_bytes += oq->stats.bytes; - rx_alloc_errors += oq->stats.alloc_failures; + for (q = 0; q < OCTEP_MAX_QUEUES; q++) { + tx_packets += oct->stats_iq[q].instr_completed; + tx_bytes += oct->stats_iq[q].bytes_sent; + tx_busy_errors += oct->stats_iq[q].tx_busy; + + rx_packets += oct->stats_oq[q].packets; + rx_bytes += oct->stats_oq[q].bytes; + rx_alloc_errors += oct->stats_oq[q].alloc_failures; } i = 0; data[i++] = rx_packets; @@ -198,22 +195,18 @@ octep_get_ethtool_stats(struct net_device *netdev, data[i++] = iface_rx_stats->err_pkts; /* Per Tx Queue stats */ - for (q = 0; q < oct->num_iqs; q++) { - struct octep_iq *iq = oct->iq[q]; - - data[i++] = iq->stats.instr_posted; - data[i++] = iq->stats.instr_completed; - data[i++] = iq->stats.bytes_sent; - data[i++] = iq->stats.tx_busy; + for (q = 0; q < OCTEP_MAX_QUEUES; q++) { + data[i++] = oct->stats_iq[q].instr_posted; + data[i++] = oct->stats_iq[q].instr_completed; + data[i++] = oct->stats_iq[q].bytes_sent; + data[i++] = oct->stats_iq[q].tx_busy; } /* Per Rx Queue stats */ - for (q = 0; q < oct->num_oqs; q++) { - struct octep_oq *oq = oct->oq[q]; - - data[i++] = oq->stats.packets; - data[i++] = oq->stats.bytes; - data[i++] = oq->stats.alloc_failures; + for (q = 0; q < OCTEP_MAX_QUEUES; q++) { + data[i++] = oct->stats_oq[q].packets; + data[i++] = oct->stats_oq[q].bytes; + data[i++] = oct->stats_oq[q].alloc_failures; } } diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index 730aa5632ccee..a89f80bac39b8 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -822,7 +822,7 @@ static inline int octep_iq_full_check(struct octep_iq *iq) if (unlikely(IQ_INSTR_SPACE(iq) > OCTEP_WAKE_QUEUE_THRESHOLD)) { netif_start_subqueue(iq->netdev, iq->q_no); - iq->stats.restart_cnt++; + iq->stats->restart_cnt++; return 0; } @@ -960,7 +960,7 @@ static netdev_tx_t 
octep_start_xmit(struct sk_buff *skb, wmb(); /* Ring Doorbell to notify the NIC of new packets */ writel(iq->fill_cnt, iq->doorbell_reg); - iq->stats.instr_posted += iq->fill_cnt; + iq->stats->instr_posted += iq->fill_cnt; iq->fill_cnt = 0; return NETDEV_TX_OK; @@ -991,22 +991,19 @@ static netdev_tx_t octep_start_xmit(struct sk_buff *skb, static void octep_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) { - u64 tx_packets, tx_bytes, rx_packets, rx_bytes; struct octep_device *oct = netdev_priv(netdev); + u64 tx_packets, tx_bytes, rx_packets, rx_bytes; int q; tx_packets = 0; tx_bytes = 0; rx_packets = 0; rx_bytes = 0; - for (q = 0; q < oct->num_oqs; q++) { - struct octep_iq *iq = oct->iq[q]; - struct octep_oq *oq = oct->oq[q]; - - tx_packets += iq->stats.instr_completed; - tx_bytes += iq->stats.bytes_sent; - rx_packets += oq->stats.packets; - rx_bytes += oq->stats.bytes; + for (q = 0; q < OCTEP_MAX_QUEUES; q++) { + tx_packets += oct->stats_iq[q].instr_completed; + tx_bytes += oct->stats_iq[q].bytes_sent; + rx_packets += oct->stats_oq[q].packets; + rx_bytes += oct->stats_oq[q].bytes; } stats->tx_packets = tx_packets; stats->tx_bytes = tx_bytes; diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h index fee59e0e0138f..936b786f42816 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h @@ -257,11 +257,17 @@ struct octep_device { /* Pointers to Octeon Tx queues */ struct octep_iq *iq[OCTEP_MAX_IQ]; + /* Per iq stats */ + struct octep_iq_stats stats_iq[OCTEP_MAX_IQ]; + /* Rx queues (OQ: Output Queue) */ u16 num_oqs; /* Pointers to Octeon Rx queues */ struct octep_oq *oq[OCTEP_MAX_OQ]; + /* Per oq stats */ + struct octep_oq_stats stats_oq[OCTEP_MAX_OQ]; + /* Hardware port number of the PCIe interface */ u16 pcie_port; diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c 
b/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c index 8af75cb37c3ee..82b6b19e76b47 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c @@ -87,7 +87,7 @@ static int octep_oq_refill(struct octep_device *oct, struct octep_oq *oq) page = dev_alloc_page(); if (unlikely(!page)) { dev_err(oq->dev, "refill: rx buffer alloc failed\n"); - oq->stats.alloc_failures++; + oq->stats->alloc_failures++; break; } @@ -98,7 +98,7 @@ static int octep_oq_refill(struct octep_device *oct, struct octep_oq *oq) "OQ-%d buffer refill: DMA mapping error!\n", oq->q_no); put_page(page); - oq->stats.alloc_failures++; + oq->stats->alloc_failures++; break; } oq->buff_info[refill_idx].page = page; @@ -134,6 +134,7 @@ static int octep_setup_oq(struct octep_device *oct, int q_no) oq->netdev = oct->netdev; oq->dev = &oct->pdev->dev; oq->q_no = q_no; + oq->stats = &oct->stats_oq[q_no]; oq->max_count = CFG_GET_OQ_NUM_DESC(oct->conf); oq->ring_size_mask = oq->max_count - 1; oq->buffer_size = CFG_GET_OQ_BUF_SIZE(oct->conf); @@ -443,7 +444,7 @@ static int __octep_oq_process_rx(struct octep_device *oct, if (!skb) { octep_oq_drop_rx(oq, buff_info, &read_idx, &desc_used); - oq->stats.alloc_failures++; + oq->stats->alloc_failures++; continue; } skb_reserve(skb, data_offset); @@ -494,8 +495,8 @@ static int __octep_oq_process_rx(struct octep_device *oct, oq->host_read_idx = read_idx; oq->refill_count += desc_used; - oq->stats.packets += pkt; - oq->stats.bytes += rx_bytes; + oq->stats->packets += pkt; + oq->stats->bytes += rx_bytes; return pkt; } diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_rx.h b/drivers/net/ethernet/marvell/octeon_ep/octep_rx.h index 3b08e2d560dc3..b4696c93d0e6a 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_rx.h +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_rx.h @@ -186,8 +186,8 @@ struct octep_oq { */ u8 __iomem *pkts_sent_reg; - /* Statistics for this OQ. 
*/ - struct octep_oq_stats stats; + /* Pointer to statistics for this OQ. */ + struct octep_oq_stats *stats; /* Packets pending to be processed */ u32 pkts_pending; diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_tx.c b/drivers/net/ethernet/marvell/octeon_ep/octep_tx.c index 06851b78aa28c..08ee90013fef3 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_tx.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_tx.c @@ -81,9 +81,9 @@ int octep_iq_process_completions(struct octep_iq *iq, u16 budget) } iq->pkts_processed += compl_pkts; - iq->stats.instr_completed += compl_pkts; - iq->stats.bytes_sent += compl_bytes; - iq->stats.sgentry_sent += compl_sg; + iq->stats->instr_completed += compl_pkts; + iq->stats->bytes_sent += compl_bytes; + iq->stats->sgentry_sent += compl_sg; iq->flush_index = fi; netdev_tx_completed_queue(iq->netdev_q, compl_pkts, compl_bytes); @@ -187,6 +187,7 @@ static int octep_setup_iq(struct octep_device *oct, int q_no) iq->netdev = oct->netdev; iq->dev = &oct->pdev->dev; iq->q_no = q_no; + iq->stats = &oct->stats_iq[q_no]; iq->max_count = CFG_GET_IQ_NUM_DESC(oct->conf); iq->ring_size_mask = iq->max_count - 1; iq->fill_threshold = CFG_GET_IQ_DB_MIN(oct->conf); diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_tx.h b/drivers/net/ethernet/marvell/octeon_ep/octep_tx.h index 875a2c34091ff..58fb39dda977c 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_tx.h +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_tx.h @@ -170,8 +170,8 @@ struct octep_iq { */ u16 flush_index; - /* Statistics for this input queue. */ - struct octep_iq_stats stats; + /* Pointer to statistics for this input queue. */ + struct octep_iq_stats *stats; /* Pointer to the Virtual Base addr of the input ring. 
*/ struct octep_tx_desc_hw *desc_ring; From cc0e510cc89fe0a6479203bc20cd964962dc6a43 Mon Sep 17 00:00:00 2001 From: Shinas Rasheed Date: Fri, 17 Jan 2025 01:46:52 -0800 Subject: [PATCH 08/14] octeon_ep_vf: remove firmware stats fetch in ndo_get_stats64 The firmware stats fetch call that happens in ndo_get_stats64() is currently not required, and causes a warning to issue. The corresponding warn log for the PF is given below: [ 123.316837] ------------[ cut here ]------------ [ 123.316840] Voluntary context switch within RCU read-side critical section! [ 123.316917] pc : rcu_note_context_switch+0x2e4/0x300 [ 123.316919] lr : rcu_note_context_switch+0x2e4/0x300 [ 123.316947] Call trace: [ 123.316949] rcu_note_context_switch+0x2e4/0x300 [ 123.316952] __schedule+0x84/0x584 [ 123.316955] schedule+0x38/0x90 [ 123.316956] schedule_timeout+0xa0/0x1d4 [ 123.316959] octep_send_mbox_req+0x190/0x230 [octeon_ep] [ 123.316966] octep_ctrl_net_get_if_stats+0x78/0x100 [octeon_ep] [ 123.316970] octep_get_stats64+0xd4/0xf0 [octeon_ep] [ 123.316975] dev_get_stats+0x4c/0x114 [ 123.316977] dev_seq_printf_stats+0x3c/0x11c [ 123.316980] dev_seq_show+0x1c/0x40 [ 123.316982] seq_read_iter+0x3cc/0x4e0 [ 123.316985] seq_read+0xc8/0x110 [ 123.316987] proc_reg_read+0x9c/0xec [ 123.316990] vfs_read+0xc8/0x2ec [ 123.316993] ksys_read+0x70/0x100 [ 123.316995] __arm64_sys_read+0x20/0x30 [ 123.316997] invoke_syscall.constprop.0+0x7c/0xd0 [ 123.317000] do_el0_svc+0xb4/0xd0 [ 123.317002] el0_svc+0xe8/0x1f4 [ 123.317005] el0t_64_sync_handler+0x134/0x150 [ 123.317006] el0t_64_sync+0x17c/0x180 [ 123.317008] ---[ end trace 63399811432ab69b ]--- Fixes: c3fad23cdc06 ("octeon_ep_vf: add support for ndo ops") Signed-off-by: Shinas Rasheed Link: https://patch.msgid.link/20250117094653.2588578-4-srasheed@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c | 8 -------- 1 file changed, 8 deletions(-) diff --git 
a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c index 7e6771c9cdbba..4c699514fd57a 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c @@ -799,14 +799,6 @@ static void octep_vf_get_stats64(struct net_device *netdev, stats->tx_bytes = tx_bytes; stats->rx_packets = rx_packets; stats->rx_bytes = rx_bytes; - if (!octep_vf_get_if_stats(oct)) { - stats->multicast = oct->iface_rx_stats.mcast_pkts; - stats->rx_errors = oct->iface_rx_stats.err_pkts; - stats->rx_dropped = oct->iface_rx_stats.dropped_pkts_fifo_full + - oct->iface_rx_stats.err_pkts; - stats->rx_missed_errors = oct->iface_rx_stats.dropped_pkts_fifo_full; - stats->tx_dropped = oct->iface_tx_stats.dropped; - } } /** From f84039939512e6d88b0f2f353695530f123be789 Mon Sep 17 00:00:00 2001 From: Shinas Rasheed Date: Fri, 17 Jan 2025 01:46:53 -0800 Subject: [PATCH 09/14] octeon_ep_vf: update tx/rx stats locally for persistence Update tx/rx stats locally, so that ndo_get_stats64() can use that and not rely on per queue resources to obtain statistics. The latter used to cause race conditions when the device stopped. 
Signed-off-by: Shinas Rasheed Link: https://patch.msgid.link/20250117094653.2588578-5-srasheed@marvell.com Signed-off-by: Jakub Kicinski --- .../marvell/octeon_ep_vf/octep_vf_ethtool.c | 29 +++++++------------ .../marvell/octeon_ep_vf/octep_vf_main.c | 17 +++++------ .../marvell/octeon_ep_vf/octep_vf_main.h | 6 ++++ .../marvell/octeon_ep_vf/octep_vf_rx.c | 9 +++--- .../marvell/octeon_ep_vf/octep_vf_rx.h | 2 +- .../marvell/octeon_ep_vf/octep_vf_tx.c | 7 +++-- .../marvell/octeon_ep_vf/octep_vf_tx.h | 2 +- 7 files changed, 35 insertions(+), 37 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_ethtool.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_ethtool.c index 7b21439a315f2..d60441928ba96 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_ethtool.c +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_ethtool.c @@ -114,12 +114,9 @@ static void octep_vf_get_ethtool_stats(struct net_device *netdev, iface_tx_stats = &oct->iface_tx_stats; iface_rx_stats = &oct->iface_rx_stats; - for (q = 0; q < oct->num_oqs; q++) { - struct octep_vf_iq *iq = oct->iq[q]; - struct octep_vf_oq *oq = oct->oq[q]; - - tx_busy_errors += iq->stats.tx_busy; - rx_alloc_errors += oq->stats.alloc_failures; + for (q = 0; q < OCTEP_VF_MAX_QUEUES; q++) { + tx_busy_errors += oct->stats_iq[q].tx_busy; + rx_alloc_errors += oct->stats_oq[q].alloc_failures; } i = 0; data[i++] = rx_alloc_errors; @@ -134,22 +131,18 @@ static void octep_vf_get_ethtool_stats(struct net_device *netdev, data[i++] = iface_rx_stats->dropped_octets_fifo_full; /* Per Tx Queue stats */ - for (q = 0; q < oct->num_iqs; q++) { - struct octep_vf_iq *iq = oct->iq[q]; - - data[i++] = iq->stats.instr_posted; - data[i++] = iq->stats.instr_completed; - data[i++] = iq->stats.bytes_sent; - data[i++] = iq->stats.tx_busy; + for (q = 0; q < OCTEP_VF_MAX_QUEUES; q++) { + data[i++] = oct->stats_iq[q].instr_posted; + data[i++] = oct->stats_iq[q].instr_completed; + data[i++] = 
oct->stats_iq[q].bytes_sent; + data[i++] = oct->stats_iq[q].tx_busy; } /* Per Rx Queue stats */ for (q = 0; q < oct->num_oqs; q++) { - struct octep_vf_oq *oq = oct->oq[q]; - - data[i++] = oq->stats.packets; - data[i++] = oq->stats.bytes; - data[i++] = oq->stats.alloc_failures; + data[i++] = oct->stats_oq[q].packets; + data[i++] = oct->stats_oq[q].bytes; + data[i++] = oct->stats_oq[q].alloc_failures; } } diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c index 4c699514fd57a..18c922dd5fc64 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c @@ -574,7 +574,7 @@ static int octep_vf_iq_full_check(struct octep_vf_iq *iq) * caused queues to get re-enabled after * being stopped */ - iq->stats.restart_cnt++; + iq->stats->restart_cnt++; fallthrough; case 1: /* Queue left enabled, since IQ is not yet full*/ return 0; @@ -731,7 +731,7 @@ static netdev_tx_t octep_vf_start_xmit(struct sk_buff *skb, /* Flush the hw descriptors before writing to doorbell */ smp_wmb(); writel(iq->fill_cnt, iq->doorbell_reg); - iq->stats.instr_posted += iq->fill_cnt; + iq->stats->instr_posted += iq->fill_cnt; iq->fill_cnt = 0; return NETDEV_TX_OK; } @@ -786,14 +786,11 @@ static void octep_vf_get_stats64(struct net_device *netdev, tx_bytes = 0; rx_packets = 0; rx_bytes = 0; - for (q = 0; q < oct->num_oqs; q++) { - struct octep_vf_iq *iq = oct->iq[q]; - struct octep_vf_oq *oq = oct->oq[q]; - - tx_packets += iq->stats.instr_completed; - tx_bytes += iq->stats.bytes_sent; - rx_packets += oq->stats.packets; - rx_bytes += oq->stats.bytes; + for (q = 0; q < OCTEP_VF_MAX_QUEUES; q++) { + tx_packets += oct->stats_iq[q].instr_completed; + tx_bytes += oct->stats_iq[q].bytes_sent; + rx_packets += oct->stats_oq[q].packets; + rx_bytes += oct->stats_oq[q].bytes; } stats->tx_packets = tx_packets; stats->tx_bytes = tx_bytes; diff --git 
a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h index 5769f62545cd4..1a352f41f823c 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h @@ -246,11 +246,17 @@ struct octep_vf_device { /* Pointers to Octeon Tx queues */ struct octep_vf_iq *iq[OCTEP_VF_MAX_IQ]; + /* Per iq stats */ + struct octep_vf_iq_stats stats_iq[OCTEP_VF_MAX_IQ]; + /* Rx queues (OQ: Output Queue) */ u16 num_oqs; /* Pointers to Octeon Rx queues */ struct octep_vf_oq *oq[OCTEP_VF_MAX_OQ]; + /* Per oq stats */ + struct octep_vf_oq_stats stats_oq[OCTEP_VF_MAX_OQ]; + /* Hardware port number of the PCIe interface */ u16 pcie_port; diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.c index 82821bc28634b..d70c8be3cfc40 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.c +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.c @@ -87,7 +87,7 @@ static int octep_vf_oq_refill(struct octep_vf_device *oct, struct octep_vf_oq *o page = dev_alloc_page(); if (unlikely(!page)) { dev_err(oq->dev, "refill: rx buffer alloc failed\n"); - oq->stats.alloc_failures++; + oq->stats->alloc_failures++; break; } @@ -98,7 +98,7 @@ static int octep_vf_oq_refill(struct octep_vf_device *oct, struct octep_vf_oq *o "OQ-%d buffer refill: DMA mapping error!\n", oq->q_no); put_page(page); - oq->stats.alloc_failures++; + oq->stats->alloc_failures++; break; } oq->buff_info[refill_idx].page = page; @@ -134,6 +134,7 @@ static int octep_vf_setup_oq(struct octep_vf_device *oct, int q_no) oq->netdev = oct->netdev; oq->dev = &oct->pdev->dev; oq->q_no = q_no; + oq->stats = &oct->stats_oq[q_no]; oq->max_count = CFG_GET_OQ_NUM_DESC(oct->conf); oq->ring_size_mask = oq->max_count - 1; oq->buffer_size = CFG_GET_OQ_BUF_SIZE(oct->conf); @@ -458,8 +459,8 @@ static int __octep_vf_oq_process_rx(struct 
octep_vf_device *oct, oq->host_read_idx = read_idx; oq->refill_count += desc_used; - oq->stats.packets += pkt; - oq->stats.bytes += rx_bytes; + oq->stats->packets += pkt; + oq->stats->bytes += rx_bytes; return pkt; } diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.h b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.h index fe46838b5200f..9e296b7d7e349 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.h +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_rx.h @@ -187,7 +187,7 @@ struct octep_vf_oq { u8 __iomem *pkts_sent_reg; /* Statistics for this OQ. */ - struct octep_vf_oq_stats stats; + struct octep_vf_oq_stats *stats; /* Packets pending to be processed */ u32 pkts_pending; diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_tx.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_tx.c index 47a5c054fdb63..8180e5ce3d7ef 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_tx.c +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_tx.c @@ -82,9 +82,9 @@ int octep_vf_iq_process_completions(struct octep_vf_iq *iq, u16 budget) } iq->pkts_processed += compl_pkts; - iq->stats.instr_completed += compl_pkts; - iq->stats.bytes_sent += compl_bytes; - iq->stats.sgentry_sent += compl_sg; + iq->stats->instr_completed += compl_pkts; + iq->stats->bytes_sent += compl_bytes; + iq->stats->sgentry_sent += compl_sg; iq->flush_index = fi; netif_subqueue_completed_wake(iq->netdev, iq->q_no, compl_pkts, @@ -186,6 +186,7 @@ static int octep_vf_setup_iq(struct octep_vf_device *oct, int q_no) iq->netdev = oct->netdev; iq->dev = &oct->pdev->dev; iq->q_no = q_no; + iq->stats = &oct->stats_iq[q_no]; iq->max_count = CFG_GET_IQ_NUM_DESC(oct->conf); iq->ring_size_mask = iq->max_count - 1; iq->fill_threshold = CFG_GET_IQ_DB_MIN(oct->conf); diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_tx.h b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_tx.h index f338b975103c3..1cede90e3a5fa 100644 --- 
a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_tx.h +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_tx.h @@ -129,7 +129,7 @@ struct octep_vf_iq { u16 flush_index; /* Statistics for this input queue. */ - struct octep_vf_iq_stats stats; + struct octep_vf_iq_stats *stats; /* Pointer to the Virtual Base addr of the input ring. */ struct octep_vf_tx_desc_hw *desc_ring; From 12e070eb6964b341b41677fd260af5a305316a1f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 16 Jan 2025 10:21:57 +0100 Subject: [PATCH 10/14] net: avoid race between device unregistration and ethnl ops The following trace can be seen if a device is being unregistered while its number of channels are being modified. DEBUG_LOCKS_WARN_ON(lock->magic != lock) WARNING: CPU: 3 PID: 3754 at kernel/locking/mutex.c:564 __mutex_lock+0xc8a/0x1120 CPU: 3 UID: 0 PID: 3754 Comm: ethtool Not tainted 6.13.0-rc6+ #771 RIP: 0010:__mutex_lock+0xc8a/0x1120 Call Trace: ethtool_check_max_channel+0x1ea/0x880 ethnl_set_channels+0x3c3/0xb10 ethnl_default_set_doit+0x306/0x650 genl_family_rcv_msg_doit+0x1e3/0x2c0 genl_rcv_msg+0x432/0x6f0 netlink_rcv_skb+0x13d/0x3b0 genl_rcv+0x28/0x40 netlink_unicast+0x42e/0x720 netlink_sendmsg+0x765/0xc20 __sys_sendto+0x3ac/0x420 __x64_sys_sendto+0xe0/0x1c0 do_syscall_64+0x95/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e This is because unregister_netdevice_many_notify might run before the rtnl lock section of ethnl operations, eg. set_channels in the above example. In this example the rss lock would be destroyed by the device unregistration path before being used again, but in general running ethnl operations while dismantle has started is not a good idea. Fix this by denying any operation on devices being unregistered. A check was already there in ethnl_ops_begin, but not wide enough. Note that the same issue cannot be seen on the ioctl version (__dev_ethtool) because the device reference is retrieved from within the rtnl lock section there. 
Once dismantle started, the net device is unlisted and no reference will be found. Fixes: dde91ccfa25f ("ethtool: do not perform operations on net devices being unregistered") Signed-off-by: Antoine Tenart Reviewed-by: Przemek Kitszel Reviewed-by: Edward Cree Link: https://patch.msgid.link/20250116092159.50890-1-atenart@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index e3f0ef6b851bb..4d18dc29b3043 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -90,7 +90,7 @@ int ethnl_ops_begin(struct net_device *dev) pm_runtime_get_sync(dev->dev.parent); if (!netif_device_present(dev) || - dev->reg_state == NETREG_UNREGISTERING) { + dev->reg_state >= NETREG_UNREGISTERING) { ret = -ENODEV; goto err; } From bc50835e83f60f56e9bec2b392fb5544f250fb6f Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Wed, 15 Jan 2025 17:37:13 -0800 Subject: [PATCH 11/14] net: sched: Disallow replacing of child qdisc from one parent to another Lion Ackermann was able to create a UAF which can be abused for privilege escalation with the following script Step 1. create root qdisc tc qdisc add dev lo root handle 1:0 drr step2. a class for packet aggregation to demonstrate uaf tc class add dev lo classid 1:1 drr step3. a class for nesting tc class add dev lo classid 1:2 drr step4. a class to graft qdisc to tc class add dev lo classid 1:3 drr step5. tc qdisc add dev lo parent 1:1 handle 2:0 plug limit 1024 step6. tc qdisc add dev lo parent 1:2 handle 3:0 drr step7. tc class add dev lo classid 3:1 drr step 8. tc qdisc add dev lo parent 3:1 handle 4:0 pfifo step 9.
Display the class/qdisc layout tc class ls dev lo class drr 1:1 root leaf 2: quantum 64Kb class drr 1:2 root leaf 3: quantum 64Kb class drr 3:1 root leaf 4: quantum 64Kb tc qdisc ls qdisc drr 1: dev lo root refcnt 2 qdisc plug 2: dev lo parent 1:1 qdisc pfifo 4: dev lo parent 3:1 limit 1000p qdisc drr 3: dev lo parent 1:2 step10. trigger the bug <=== prevented by this patch tc qdisc replace dev lo parent 1:3 handle 4:0 step 11. Redisplay again the qdiscs/classes tc class ls dev lo class drr 1:1 root leaf 2: quantum 64Kb class drr 1:2 root leaf 3: quantum 64Kb class drr 1:3 root leaf 4: quantum 64Kb class drr 3:1 root leaf 4: quantum 64Kb tc qdisc ls qdisc drr 1: dev lo root refcnt 2 qdisc plug 2: dev lo parent 1:1 qdisc pfifo 4: dev lo parent 3:1 refcnt 2 limit 1000p qdisc drr 3: dev lo parent 1:2 Observe that a) parent for 4:0 does not change despite the replace request. There can only be one parent. b) refcount has gone up by two for 4:0 and c) both class 1:3 and 3:1 are pointing to it. Step 12. send one packet to plug echo "" | socat -u STDIN UDP4-DATAGRAM:127.0.0.1:8888,priority=$((0x10001)) step13. send one packet to the grafted fifo echo "" | socat -u STDIN UDP4-DATAGRAM:127.0.0.1:8888,priority=$((0x10003)) step14. lets trigger the uaf tc class delete dev lo classid 1:3 tc class delete dev lo classid 1:1 The semantics of "replace" is for a del/add _on the same node_ and not a delete from one node(3:1) and add to another node (1:3) as in step10. While we could "fix" with a more complex approach there could be consequences to expectations so the patch takes the preventive approach of "disallow such config". 
Joint work with Lion Ackermann Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Jamal Hadi Salim Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250116013713.900000-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 300430b8c4d22..fac9c946a4c75 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1664,6 +1664,10 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, q = qdisc_lookup(dev, tcm->tcm_handle); if (!q) goto create_n_graft; + if (q->parent != tcm->tcm_parent) { + NL_SET_ERR_MSG(extack, "Cannot move an existing qdisc to a different parent"); + return -EINVAL; + } if (n->nlmsg_flags & NLM_F_EXCL) { NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override"); return -EEXIST; From 4395a44acb15850e492dd1de9ec4b6479d96bc80 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Thu, 16 Jan 2025 15:54:49 +0200 Subject: [PATCH 12/14] net: ethernet: ti: am65-cpsw: fix freeing IRQ in am65_cpsw_nuss_remove_tx_chns() When getting the IRQ we use k3_udma_glue_tx_get_irq() which returns a negative error value on error. So a non-zero check is not sufficient to determine if the IRQ is valid. Check that the IRQ is greater than zero to ensure it is valid. There is no issue at probe time but at runtime user can invoke .set_channels which results in the following call chain. am65_cpsw_set_channels() am65_cpsw_nuss_update_tx_rx_chns() am65_cpsw_nuss_remove_tx_chns() am65_cpsw_nuss_init_tx_chns() At this point if am65_cpsw_nuss_init_tx_chns() fails due to k3_udma_glue_tx_get_irq() then tx_chn->irq will be set to a negative value. Then, at subsequent .set_channels with higher channel count we will attempt to free an invalid IRQ in am65_cpsw_nuss_remove_tx_chns() leading to a kernel warning.
The issue is present in the original commit that introduced this driver, although there, am65_cpsw_nuss_update_tx_rx_chns() existed as am65_cpsw_nuss_update_tx_chns(). Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Signed-off-by: Roger Quadros Reviewed-by: Simon Horman Reviewed-by: Siddharth Vadapalli Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 5465bf872734a..e1de45fb18aee 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2248,7 +2248,7 @@ static void am65_cpsw_nuss_remove_tx_chns(struct am65_cpsw_common *common) for (i = 0; i < common->tx_ch_num; i++) { struct am65_cpsw_tx_chn *tx_chn = &common->tx_chns[i]; - if (tx_chn->irq) + if (tx_chn->irq > 0) devm_free_irq(dev, tx_chn->irq, tx_chn); netif_napi_del(&tx_chn->napi_tx); From 25c1a9ca53db5780757e7f53e688b8f916821baa Mon Sep 17 00:00:00 2001 From: Mahdi Arghavani Date: Fri, 17 Jan 2025 21:37:51 +0000 Subject: [PATCH 13/14] tcp_cubic: fix incorrect HyStart round start detection I noticed that HyStart incorrectly marks the start of rounds, leading to inaccurate measurements of ACK train lengths and resetting the `ca->sample_cnt` variable. This inaccuracy can impact HyStart's functionality in terminating exponential cwnd growth during Slow-Start, potentially degrading TCP performance. The issue arises because the changes introduced in commit 4e1fddc98d25 ("tcp_cubic: fix spurious Hystart ACK train detections for not-cwnd-limited flows") moved the caller of the `bictcp_hystart_reset` function inside the `hystart_update` function. 
This modification added an additional condition for triggering the caller, requiring that (tcp_snd_cwnd(tp) >= hystart_low_window) must also be satisfied before invoking `bictcp_hystart_reset`. This fix ensures that `bictcp_hystart_reset` is correctly called at the start of a new round, regardless of the congestion window size. This is achieved by moving the condition (tcp_snd_cwnd(tp) >= hystart_low_window) from before calling `bictcp_hystart_reset` to after it. I tested with a client and a server connected through two Linux software routers. In this setup, the minimum RTT was 150 ms, the bottleneck bandwidth was 50 Mbps, and the bottleneck buffer size was 1 BDP, calculated as (50M / 1514 / 8) * 0.150 = 619 packets. I conducted the test twice, transferring data from the server to the client for 1.5 seconds. Before the patch was applied, HYSTART-DELAY stopped the exponential growth of cwnd when cwnd = 516, and the bottleneck link was not yet saturated (516 < 619). After the patch was applied, HYSTART-ACK-TRAIN stopped the exponential growth of cwnd when cwnd = 632, and the bottleneck link was saturated (632 > 619). In this test, applying the patch resulted in 300 KB more data delivered. Fixes: 4e1fddc98d25 ("tcp_cubic: fix spurious Hystart ACK train detections for not-cwnd-limited flows") Signed-off-by: Mahdi Arghavani Reviewed-by: Jason Xing Cc: Neal Cardwell Cc: Eric Dumazet Cc: Haibo Zhang Cc: David Eyers Cc: Abbas Arghavani Reviewed-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. 
Miller --- net/ipv4/tcp_cubic.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 5dbed91c61782..76c23675ae50a 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -392,6 +392,10 @@ static void hystart_update(struct sock *sk, u32 delay) if (after(tp->snd_una, ca->end_seq)) bictcp_hystart_reset(sk); + /* hystart triggers when cwnd is larger than some threshold */ + if (tcp_snd_cwnd(tp) < hystart_low_window) + return; + if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock_us(sk); @@ -467,9 +471,7 @@ __bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample if (ca->delay_min == 0 || ca->delay_min > delay) ca->delay_min = delay; - /* hystart triggers when cwnd is larger than some threshold */ - if (!ca->found && tcp_in_slow_start(tp) && hystart && - tcp_snd_cwnd(tp) >= hystart_low_window) + if (!ca->found && tcp_in_slow_start(tp) && hystart) hystart_update(sk, delay); } From d640627663bfe7d8963c7615316d7d4ef60f3b0b Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Wed, 15 Jan 2025 08:42:20 -0800 Subject: [PATCH 14/14] net/rose: prevent integer overflows in rose_setsockopt() In case of possible unpredictably large arguments passed to rose_setsockopt() and multiplied by extra values on top of that, integer overflows may occur. Do the safest minimum and fix these issues by checking the contents of 'opt' and returning -EINVAL if they are too large. Also, switch to unsigned int and remove useless check for negative 'opt' in ROSE_IDLE case. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Nikita Zhandarovich Link: https://patch.msgid.link/20250115164220.19954-1-n.zhandarovich@fintech.ru Signed-off-by: Jakub Kicinski --- net/rose/af_rose.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 59050caab65c8..72c65d938a150 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -397,15 +397,15 @@ static int rose_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); - int opt; + unsigned int opt; if (level != SOL_ROSE) return -ENOPROTOOPT; - if (optlen < sizeof(int)) + if (optlen < sizeof(unsigned int)) return -EINVAL; - if (copy_from_sockptr(&opt, optval, sizeof(int))) + if (copy_from_sockptr(&opt, optval, sizeof(unsigned int))) return -EFAULT; switch (optname) { @@ -414,31 +414,31 @@ static int rose_setsockopt(struct socket *sock, int level, int optname, return 0; case ROSE_T1: - if (opt < 1) + if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; rose->t1 = opt * HZ; return 0; case ROSE_T2: - if (opt < 1) + if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; rose->t2 = opt * HZ; return 0; case ROSE_T3: - if (opt < 1) + if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; rose->t3 = opt * HZ; return 0; case ROSE_HOLDBACK: - if (opt < 1) + if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; rose->hb = opt * HZ; return 0; case ROSE_IDLE: - if (opt < 0) + if (opt > UINT_MAX / (60 * HZ)) return -EINVAL; rose->idle = opt * 60 * HZ; return 0;