From fa8eda19015ca9ae625f46d4ecb13df651bb54cc Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Thu, 6 Feb 2025 23:30:23 +0100 Subject: [PATCH 01/41] ice: health.c: fix compilation on gcc 7.5 GCC 7 is not as good as GCC 8+ in telling what is a compile-time const, and thus could be used for static storage. Fortunately keeping strings as const arrays is enough to make old gcc happy. Excerpt from the report: My GCC is: gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0. CC [M] drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.o drivers/net/ethernet/intel/ice/devlink/health.c:35:3: error: initializer element is not constant ice_common_port_solutions, {ice_port_number_label}}, ^~~~~~~~~~~~~~~~~~~~~~~~~ drivers/net/ethernet/intel/ice/devlink/health.c:35:3: note: (near initialization for 'ice_health_status_lookup[0].solution') drivers/net/ethernet/intel/ice/devlink/health.c:35:31: error: initializer element is not constant ice_common_port_solutions, {ice_port_number_label}}, ^~~~~~~~~~~~~~~~~~~~~ drivers/net/ethernet/intel/ice/devlink/health.c:35:31: note: (near initialization for 'ice_health_status_lookup[0].data_label[0]') drivers/net/ethernet/intel/ice/devlink/health.c:37:46: error: initializer element is not constant "Change or replace the module or cable.", {ice_port_number_label}}, ^~~~~~~~~~~~~~~~~~~~~ drivers/net/ethernet/intel/ice/devlink/health.c:37:46: note: (near initialization for 'ice_health_status_lookup[1].data_label[0]') drivers/net/ethernet/intel/ice/devlink/health.c:39:3: error: initializer element is not constant ice_common_port_solutions, {ice_port_number_label}}, ^~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: 85d6164ec56d ("ice: add fw and port health reporters") Reported-by: Qiuxu Zhuo Closes: https://lore.kernel.org/netdev/CY8PR11MB7134BF7A46D71E50D25FA7A989F72@CY8PR11MB7134.namprd11.prod.outlook.com Reviewed-by: Michal Swiatkowski Suggested-by: Simon Horman Signed-off-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Sunitha Mekala (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/devlink/health.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/devlink/health.c b/drivers/net/ethernet/intel/ice/devlink/health.c index ea40f79412590..19c3d37aa768b 100644 --- a/drivers/net/ethernet/intel/ice/devlink/health.c +++ b/drivers/net/ethernet/intel/ice/devlink/health.c @@ -25,10 +25,10 @@ struct ice_health_status { * The below lookup requires to be sorted by code. */ -static const char *const ice_common_port_solutions = +static const char ice_common_port_solutions[] = "Check your cable connection. Change or replace the module or cable. Manually set speed and duplex."; -static const char *const ice_port_number_label = "Port Number"; -static const char *const ice_update_nvm_solution = "Update to the latest NVM image."; +static const char ice_port_number_label[] = "Port Number"; +static const char ice_update_nvm_solution[] = "Update to the latest NVM image."; static const struct ice_health_status ice_health_status_lookup[] = { {ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_STRICT, "An unsupported module was detected.", From 53ce7166cbffd2b8f3bd821fd3918be665afd4c6 Mon Sep 17 00:00:00 2001 From: Karol Kolacinski Date: Wed, 19 Feb 2025 14:13:27 -0800 Subject: [PATCH 02/41] ice: ensure periodic output start time is in the future On E800 series hardware, if the start time for a periodic output signal is programmed into GLTSYN_TGT_H and GLTSYN_TGT_L registers, the hardware logic locks up and the periodic output signal never starts. Any future attempt to reprogram the clock function is futile as the hardware will not reset until a power on. The ice_ptp_cfg_perout function has logic to prevent this, as it checks if the requested start time is in the past. If so, a new start time is calculated by rounding up. Since commit d755a7e129a5 ("ice: Cache perout/extts requests and check flags"), the rounding is done to the nearest multiple of the clock period, rather than to a full second. This is more accurate, since it ensures the signal matches the user request precisely. Unfortunately, there is a race condition with this rounding logic. If the current time is close to the multiple of the period, we could calculate a target time that is extremely soon. It takes time for the software to program the registers, during which time this requested start time could become a start time in the past. If that happens, the periodic output signal will lock up. For large enough periods, or for the logic prior to the mentioned commit, this is unlikely. However, with the new logic rounding to the period and with a small enough period, this becomes inevitable. For example, attempting to enable a 10MHz signal requires a period of 100 nanoseconds. This means in the *best* case, we have 99 nanoseconds to program the clock output. This is essentially impossible, and thus such a small period practically guarantees that the clock output function will lock up. To fix this, add some slop to the clock time used to check if the start time is in the past. Because it is not critical that output signals start immediately, but it *is* critical that we do not brick the function, 0.5 seconds is selected. This does mean that any requested output will be delayed by at least 0.5 seconds. This slop is applied before rounding, so that we always round up to the nearest multiple of the period that is at least 0.5 seconds in the future, ensuring a minimum of 0.5 seconds to program the clock output registers. Finally, to ensure that the hardware registers programming the clock output complete in a timely manner, add a write flush to the end of ice_ptp_write_perout. This ensures we don't risk any issue with PCIe transaction batching. Strictly speaking, this fixes a race condition all the way back at the initial implementation of periodic output programming, as it is theoretically possible to trigger this bug even on the old logic when always rounding to a full second. However, the window is narrow, and the code has been refactored heavily since then, making a direct backport not apply cleanly. Fixes: d755a7e129a5 ("ice: Cache perout/extts requests and check flags") Signed-off-by: Karol Kolacinski Signed-off-by: Jacob Keller Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ptp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index e26320ce52ca1..a99e0fbd0b8b5 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -1783,6 +1783,7 @@ static int ice_ptp_write_perout(struct ice_hw *hw, unsigned int chan, 8 + chan + (tmr_idx * 4)); wr32(hw, GLGEN_GPIO_CTL(gpio_pin), val); + ice_flush(hw); return 0; } @@ -1843,9 +1844,10 @@ static int ice_ptp_cfg_perout(struct ice_pf *pf, struct ptp_perout_request *rq, div64_u64_rem(start, period, &phase); /* If we have only phase or start time is in the past, start the timer - * at the next multiple of period, maintaining phase. + * at the next multiple of period, maintaining phase at least 0.5 second + * from now, so we have time to write it to HW. */ - clk = ice_ptp_read_src_clk_reg(pf, NULL); + clk = ice_ptp_read_src_clk_reg(pf, NULL) + NSEC_PER_MSEC * 500; if (rq->flags & PTP_PEROUT_PHASE || start <= clk - prop_delay_ns) start = div64_u64(clk + period - 1, period) * period + phase; From 7fd71f317288d5150d353ce9d65b1e2abf99a8e2 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Thu, 6 Mar 2025 09:56:34 -0800 Subject: [PATCH 03/41] ice: fix reservation of resources for RDMA when disabled If the CONFIG_INFINIBAND_IRDMA symbol is not enabled as a module or a built-in, then don't let the driver reserve resources for RDMA. The result of this change is a large savings in resources for older kernels, and a cleaner driver configuration for the IRDMA=n case for old and new kernels. Implement this by avoiding enabling the RDMA capability when scanning hardware capabilities. Note: Loading the out-of-tree irdma driver in connection to the in-kernel ice driver, is not supported, and should not be attempted, especially when disabling IRDMA in the kernel config. Fixes: d25a0fc41c1f ("ice: Initialize RDMA support") Signed-off-by: Jesse Brandeburg Acked-by: Dave Ertman Reviewed-by: Aleksandr Loktionov Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 7a2a2e8da8fab..1e801300310e9 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -2271,7 +2271,8 @@ ice_parse_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps, caps->nvm_unified_update); break; case ICE_AQC_CAPS_RDMA: - caps->rdma = (number == 1); + if (IS_ENABLED(CONFIG_INFINIBAND_IRDMA)) + caps->rdma = (number == 1); ice_debug(hw, ICE_DBG_INIT, "%s: rdma = %d\n", prefix, caps->rdma); break; case ICE_AQC_CAPS_MAX_MTU: From db5e8ea155fc1d89c87cb81f0e4a681a77b9b03f Mon Sep 17 00:00:00 2001 From: Jan Glaza Date: Tue, 4 Mar 2025 12:08:31 +0100 Subject: [PATCH 04/41] virtchnl: make proto and filter action count unsigned The count field in virtchnl_proto_hdrs and virtchnl_filter_action_set should never be negative while still being valid. Changing it from int to u32 ensures proper handling of values in virtchnl messages in driverrs and prevents unintended behavior. In its current signed form, a negative count does not trigger an error in ice driver but instead results in it being treated as 0. This can lead to unexpected outcomes when processing messages. By using u32, any invalid values will correctly trigger -EINVAL, making error detection more robust. Fixes: 1f7ea1cd6a374 ("ice: Enable FDIR Configure for AVF") Reviewed-by: Jedrzej Jagielski Reviewed-by: Simon Horman Signed-off-by: Jan Glaza Signed-off-by: Martyna Szapar-Mudlaw Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 13a11f3c09b87..aca06f300f833 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -1283,7 +1283,7 @@ struct virtchnl_proto_hdrs { * 2 - from the second inner layer * .... **/ - int count; /* the proto layers must < VIRTCHNL_MAX_NUM_PROTO_HDRS */ + u32 count; /* the proto layers must < VIRTCHNL_MAX_NUM_PROTO_HDRS */ union { struct virtchnl_proto_hdr proto_hdr[VIRTCHNL_MAX_NUM_PROTO_HDRS]; @@ -1335,7 +1335,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(36, virtchnl_filter_action); struct virtchnl_filter_action_set { /* action number must be less then VIRTCHNL_MAX_NUM_ACTIONS */ - int count; + u32 count; struct virtchnl_filter_action actions[VIRTCHNL_MAX_NUM_ACTIONS]; }; From f91d0efcc3dd7b341bb370499b99dc02d4fb792f Mon Sep 17 00:00:00 2001 From: Jan Glaza Date: Tue, 4 Mar 2025 12:08:32 +0100 Subject: [PATCH 05/41] ice: stop truncating queue ids when checking Queue IDs can be up to 4096, fix invalid check to stop truncating IDs to 8 bits. Fixes: bf93bf791cec8 ("ice: introduce ice_virtchnl.c and ice_virtchnl.h") Reviewed-by: Aleksandr Loktionov Reviewed-by: Jedrzej Jagielski Reviewed-by: Simon Horman Signed-off-by: Jan Glaza Signed-off-by: Martyna Szapar-Mudlaw Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_virtchnl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c index ff4ad788d96ac..346aee373ccd4 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c @@ -562,7 +562,7 @@ bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id) * * check for the valid queue ID */ -static bool ice_vc_isvalid_q_id(struct ice_vsi *vsi, u8 qid) +static bool ice_vc_isvalid_q_id(struct ice_vsi *vsi, u16 qid) { /* allocated Tx and Rx queues should be always equal for VF VSI */ return qid < vsi->alloc_txq; From e2f7d3f7331b92cb820da23e8c45133305da1e63 Mon Sep 17 00:00:00 2001 From: Jan Glaza Date: Tue, 4 Mar 2025 12:08:33 +0100 Subject: [PATCH 06/41] ice: validate queue quanta parameters to prevent OOB access Add queue wraparound prevention in quanta configuration. Ensure end_qid does not overflow by validating start_qid and num_queues. Fixes: 015307754a19 ("ice: Support VF queue rate limit and quanta size configuration") Reviewed-by: Jedrzej Jagielski Reviewed-by: Simon Horman Signed-off-by: Jan Glaza Signed-off-by: Martyna Szapar-Mudlaw Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_virtchnl.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c index 346aee373ccd4..df13f5110168d 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c @@ -1900,13 +1900,21 @@ static int ice_vc_cfg_q_bw(struct ice_vf *vf, u8 *msg) */ static int ice_vc_cfg_q_quanta(struct ice_vf *vf, u8 *msg) { + u16 quanta_prof_id, quanta_size, start_qid, num_queues, end_qid, i; enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; - u16 quanta_prof_id, quanta_size, start_qid, end_qid, i; struct virtchnl_quanta_cfg *qquanta = (struct virtchnl_quanta_cfg *)msg; struct ice_vsi *vsi; int ret; + start_qid = qquanta->queue_select.start_queue_id; + num_queues = qquanta->queue_select.num_queues; + + if (check_add_overflow(start_qid, num_queues, &end_qid)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto err; @@ -1918,8 +1926,6 @@ static int ice_vc_cfg_q_quanta(struct ice_vf *vf, u8 *msg) goto err; } - end_qid = qquanta->queue_select.start_queue_id + - qquanta->queue_select.num_queues; if (end_qid > ICE_MAX_RSS_QS_PER_VF || end_qid > min_t(u16, vsi->alloc_txq, vsi->alloc_rxq)) { dev_err(ice_pf_to_dev(vf->pf), "VF-%d trying to configure more than allocated number of queues: %d\n", @@ -1948,7 +1954,6 @@ static int ice_vc_cfg_q_quanta(struct ice_vf *vf, u8 *msg) goto err; } - start_qid = qquanta->queue_select.start_queue_id; for (i = start_qid; i < end_qid; i++) vsi->tx_rings[i]->quanta_prof_id = quanta_prof_id; From c5be6562de5a19c44605b1c1edba8e339f2022c8 Mon Sep 17 00:00:00 2001 From: Lukasz Czapnik Date: Tue, 4 Mar 2025 12:08:34 +0100 Subject: [PATCH 07/41] ice: fix input validation for virtchnl BW Add missing validation of tc and queue id values sent by a VF in ice_vc_cfg_q_bw(). Additionally fixed logged value in the warning message, where max_tx_rate was incorrectly referenced instead of min_tx_rate. Also correct error handling in this function by properly exiting when invalid configuration is detected. Fixes: 015307754a19 ("ice: Support VF queue rate limit and quanta size configuration") Reviewed-by: Jedrzej Jagielski Reviewed-by: Simon Horman Signed-off-by: Lukasz Czapnik Co-developed-by: Martyna Szapar-Mudlaw Signed-off-by: Martyna Szapar-Mudlaw Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_virtchnl.c | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c index df13f5110168d..1af51469f070b 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c @@ -1862,15 +1862,33 @@ static int ice_vc_cfg_q_bw(struct ice_vf *vf, u8 *msg) for (i = 0; i < qbw->num_queues; i++) { if (qbw->cfg[i].shaper.peak != 0 && vf->max_tx_rate != 0 && - qbw->cfg[i].shaper.peak > vf->max_tx_rate) + qbw->cfg[i].shaper.peak > vf->max_tx_rate) { dev_warn(ice_pf_to_dev(vf->pf), "The maximum queue %d rate limit configuration may not take effect because the maximum TX rate for VF-%d is %d\n", qbw->cfg[i].queue_id, vf->vf_id, vf->max_tx_rate); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } if (qbw->cfg[i].shaper.committed != 0 && vf->min_tx_rate != 0 && - qbw->cfg[i].shaper.committed < vf->min_tx_rate) + qbw->cfg[i].shaper.committed < vf->min_tx_rate) { dev_warn(ice_pf_to_dev(vf->pf), "The minimum queue %d rate limit configuration may not take effect because the minimum TX rate for VF-%d is %d\n", qbw->cfg[i].queue_id, vf->vf_id, - vf->max_tx_rate); + vf->min_tx_rate); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + if (qbw->cfg[i].queue_id > vf->num_vf_qs) { + dev_warn(ice_pf_to_dev(vf->pf), "VF-%d trying to configure invalid queue_id\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } + if (qbw->cfg[i].tc >= ICE_MAX_TRAFFIC_CLASS) { + dev_warn(ice_pf_to_dev(vf->pf), "VF-%d trying to configure a traffic class higher than allowed\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err; + } } for (i = 0; i < qbw->num_queues; i++) { From 1388dd564183a5a18ec4a966748037736b5653c5 Mon Sep 17 00:00:00 2001 From: Mateusz Polchlopek Date: Tue, 4 Mar 2025 12:08:36 +0100 Subject: [PATCH 08/41] ice: fix using untrusted value of pkt_len in ice_vc_fdir_parse_raw() Fix using the untrusted value of proto->raw.pkt_len in function ice_vc_fdir_parse_raw() by verifying if it does not exceed the VIRTCHNL_MAX_SIZE_RAW_PACKET value. Fixes: 99f419df8a5c ("ice: enable FDIR filters from raw binary patterns for VFs") Reviewed-by: Przemek Kitszel Signed-off-by: Mateusz Polchlopek Signed-off-by: Martyna Szapar-Mudlaw Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- .../ethernet/intel/ice/ice_virtchnl_fdir.c | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c index 14e3f0f89c78d..9be4bd717512d 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c @@ -832,21 +832,27 @@ ice_vc_fdir_parse_raw(struct ice_vf *vf, struct virtchnl_proto_hdrs *proto, struct virtchnl_fdir_fltr_conf *conf) { - u8 *pkt_buf, *msk_buf __free(kfree); + u8 *pkt_buf, *msk_buf __free(kfree) = NULL; struct ice_parser_result rslt; struct ice_pf *pf = vf->pf; + u16 pkt_len, udp_port = 0; struct ice_parser *psr; int status = -ENOMEM; struct ice_hw *hw; - u16 udp_port = 0; - pkt_buf = kzalloc(proto->raw.pkt_len, GFP_KERNEL); - msk_buf = kzalloc(proto->raw.pkt_len, GFP_KERNEL); + pkt_len = proto->raw.pkt_len; + + if (!pkt_len || pkt_len > VIRTCHNL_MAX_SIZE_RAW_PACKET) + return -EINVAL; + + pkt_buf = kzalloc(pkt_len, GFP_KERNEL); + msk_buf = kzalloc(pkt_len, GFP_KERNEL); + if (!pkt_buf || !msk_buf) goto err_mem_alloc; - memcpy(pkt_buf, proto->raw.spec, proto->raw.pkt_len); - memcpy(msk_buf, proto->raw.mask, proto->raw.pkt_len); + memcpy(pkt_buf, proto->raw.spec, pkt_len); + memcpy(msk_buf, proto->raw.mask, pkt_len); hw = &pf->hw; @@ -862,7 +868,7 @@ ice_vc_fdir_parse_raw(struct ice_vf *vf, if (ice_get_open_tunnel_port(hw, &udp_port, TNL_VXLAN)) ice_parser_vxlan_tunnel_set(psr, udp_port, true); - status = ice_parser_run(psr, pkt_buf, proto->raw.pkt_len, &rslt); + status = ice_parser_run(psr, pkt_buf, pkt_len, &rslt); if (status) goto err_parser_destroy; @@ -876,7 +882,7 @@ ice_vc_fdir_parse_raw(struct ice_vf *vf, } status = ice_parser_profile_init(&rslt, pkt_buf, msk_buf, - proto->raw.pkt_len, ICE_BLK_FD, + pkt_len, ICE_BLK_FD, conf->prof); if (status) goto err_parser_profile_init; @@ -885,7 +891,7 @@ ice_vc_fdir_parse_raw(struct ice_vf *vf, ice_parser_profile_dump(hw, conf->prof); /* Store raw flow info into @conf */ - conf->pkt_len = proto->raw.pkt_len; + conf->pkt_len = pkt_len; conf->pkt_buf = pkt_buf; conf->parser_ena = true; From 680811c67906191b237bbafe7dabbbad64649b39 Mon Sep 17 00:00:00 2001 From: Emil Tantilov Date: Fri, 14 Feb 2025 09:18:16 -0800 Subject: [PATCH 09/41] idpf: check error for register_netdev() on init Current init logic ignores the error code from register_netdev(), which will cause WARN_ON() on attempt to unregister it, if there was one, and there is no info for the user that the creation of the netdev failed. WARNING: CPU: 89 PID: 6902 at net/core/dev.c:11512 unregister_netdevice_many_notify+0x211/0x1a10 ... [ 3707.563641] unregister_netdev+0x1c/0x30 [ 3707.563656] idpf_vport_dealloc+0x5cf/0xce0 [idpf] [ 3707.563684] idpf_deinit_task+0xef/0x160 [idpf] [ 3707.563712] idpf_vc_core_deinit+0x84/0x320 [idpf] [ 3707.563739] idpf_remove+0xbf/0x780 [idpf] [ 3707.563769] pci_device_remove+0xab/0x1e0 [ 3707.563786] device_release_driver_internal+0x371/0x530 [ 3707.563803] driver_detach+0xbf/0x180 [ 3707.563816] bus_remove_driver+0x11b/0x2a0 [ 3707.563829] pci_unregister_driver+0x2a/0x250 Introduce an error check and log the vport number and error code. On removal make sure to check VPORT_REG_NETDEV flag prior to calling unregister and free on the netdev. Add local variables for idx, vport_config and netdev for readability. Fixes: 0fe45467a104 ("idpf: add create vport and netdev configuration") Suggested-by: Tony Nguyen Signed-off-by: Emil Tantilov Reviewed-by: Simon Horman Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_lib.c | 31 +++++++++++++++------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index a3d6b8f198a86..a055a47449f12 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -927,15 +927,19 @@ static int idpf_stop(struct net_device *netdev) static void idpf_decfg_netdev(struct idpf_vport *vport) { struct idpf_adapter *adapter = vport->adapter; + u16 idx = vport->idx; kfree(vport->rx_ptype_lkup); vport->rx_ptype_lkup = NULL; - unregister_netdev(vport->netdev); - free_netdev(vport->netdev); + if (test_and_clear_bit(IDPF_VPORT_REG_NETDEV, + adapter->vport_config[idx]->flags)) { + unregister_netdev(vport->netdev); + free_netdev(vport->netdev); + } vport->netdev = NULL; - adapter->netdevs[vport->idx] = NULL; + adapter->netdevs[idx] = NULL; } /** @@ -1536,13 +1540,22 @@ void idpf_init_task(struct work_struct *work) } for (index = 0; index < adapter->max_vports; index++) { - if (adapter->netdevs[index] && - !test_bit(IDPF_VPORT_REG_NETDEV, - adapter->vport_config[index]->flags)) { - register_netdev(adapter->netdevs[index]); - set_bit(IDPF_VPORT_REG_NETDEV, - adapter->vport_config[index]->flags); + struct net_device *netdev = adapter->netdevs[index]; + struct idpf_vport_config *vport_config; + + vport_config = adapter->vport_config[index]; + + if (!netdev || + test_bit(IDPF_VPORT_REG_NETDEV, vport_config->flags)) + continue; + + err = register_netdev(netdev); + if (err) { + dev_err(&pdev->dev, "failed to register netdev for vport %d: %pe\n", + index, ERR_PTR(err)); + continue; } + set_bit(IDPF_VPORT_REG_NETDEV, vport_config->flags); } /* As all the required vports are created, clear the reset flag From f653b608f78363c26dc4ffd6c2465eb743ebdc13 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 14 Mar 2025 12:06:31 +0200 Subject: [PATCH 10/41] MAINTAINERS: update bridge entry Roopa has decided to withdraw as a bridge maintainer and Ido has agreed to step up and co-maintain the bridge with me. He has been very helpful in bridge patch reviews and has contributed a lot to the bridge over the years. Add an entry for Roopa to CREDITS and also add bridge's headers to its MAINTAINERS entry. Signed-off-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250314100631.40999-1-razor@blackwall.org Signed-off-by: Paolo Abeni --- CREDITS | 4 ++++ MAINTAINERS | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index 53d11a46fd698..d71d42c30044b 100644 --- a/CREDITS +++ b/CREDITS @@ -3233,6 +3233,10 @@ N: Rui Prior E: rprior@inescn.pt D: ATM device driver for NICStAR based cards +N: Roopa Prabhu +E: roopa@nvidia.com +D: Bridge co-maintainer, vxlan and networking contributor + N: Stefan Probst E: sp@caldera.de D: The Linux Support Team Erlangen, 1993-97 diff --git a/MAINTAINERS b/MAINTAINERS index 00e94bec401e1..a0e48e8c7870e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8593,12 +8593,14 @@ F: Documentation/networking/devlink/etas_es58x.rst F: drivers/net/can/usb/etas_es58x/ ETHERNET BRIDGE -M: Roopa Prabhu M: Nikolay Aleksandrov +M: Ido Schimmel L: bridge@lists.linux.dev L: netdev@vger.kernel.org S: Maintained W: http://www.linuxfoundation.org/en/Net:Bridge +F: include/linux/if_bridge.h +F: include/uapi/linux/if_bridge.h F: include/linux/netfilter_bridge/ F: net/bridge/ From 919f9f497dbcee75d487400e8f9815b74a6a37df Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 16 Mar 2025 02:58:37 +0000 Subject: [PATCH 11/41] eth: bnxt: fix out-of-range access of vnic_info array The bnxt_queue_{start | stop}() access vnic_info as much as allocated, which indicates bp->nr_vnics. So, it should not reach bp->vnic_info[bp->nr_vnics]. Fixes: 661958552eda ("eth: bnxt: do not use BNXT_VNIC_NTUPLE unconditionally in queue restart logic") Signed-off-by: Taehee Yoo Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250316025837.939527-1-ap420073@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 55f553debd3b2..0ddc3d41e2d81 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -15651,7 +15651,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx) cpr = &rxr->bnapi->cp_ring; cpr->sw_stats->rx.rx_resets++; - for (i = 0; i <= bp->nr_vnics; i++) { + for (i = 0; i < bp->nr_vnics; i++) { vnic = &bp->vnic_info[i]; rc = bnxt_hwrm_vnic_set_rss_p5(bp, vnic, true); @@ -15679,7 +15679,7 @@ static int bnxt_queue_stop(struct net_device *dev, void *qmem, int idx) struct bnxt_vnic_info *vnic; int i; - for (i = 0; i <= bp->nr_vnics; i++) { + for (i = 0; i < bp->nr_vnics; i++) { vnic = &bp->vnic_info[i]; vnic->mru = 0; bnxt_hwrm_vnic_update(bp, vnic, From ed3ba9b6e280e14cc3148c1b226ba453f02fa76c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sun, 16 Mar 2025 12:28:37 -0700 Subject: [PATCH 12/41] net: Remove RTNL dance for SIOCBRADDIF and SIOCBRDELIF. SIOCBRDELIF is passed to dev_ioctl() first and later forwarded to br_ioctl_call(), which causes unnecessary RTNL dance and the splat below [0] under RTNL pressure. Let's say Thread A is trying to detach a device from a bridge and Thread B is trying to remove the bridge. In dev_ioctl(), Thread A bumps the bridge device's refcnt by netdev_hold() and releases RTNL because the following br_ioctl_call() also re-acquires RTNL. In the race window, Thread B could acquire RTNL and try to remove the bridge device. Then, rtnl_unlock() by Thread B will release RTNL and wait for netdev_put() by Thread A. Thread A, however, must hold RTNL after the unlock in dev_ifsioc(), which may take long under RTNL pressure, resulting in the splat by Thread B. Thread A (SIOCBRDELIF) Thread B (SIOCBRDELBR) ---------------------- ---------------------- sock_ioctl sock_ioctl `- sock_do_ioctl `- br_ioctl_call `- dev_ioctl `- br_ioctl_stub |- rtnl_lock | |- dev_ifsioc ' ' |- dev = __dev_get_by_name(...) |- netdev_hold(dev, ...) . / |- rtnl_unlock ------. | | |- br_ioctl_call `---> |- rtnl_lock Race | | `- br_ioctl_stub |- br_del_bridge Window | | | |- dev = __dev_get_by_name(...) | | | May take long | `- br_dev_delete(dev, ...) | | | under RTNL pressure | `- unregister_netdevice_queue(dev, ...) | | | | `- rtnl_unlock \ | |- rtnl_lock <-' `- netdev_run_todo | |- ... `- netdev_run_todo | `- rtnl_unlock |- __rtnl_unlock | |- netdev_wait_allrefs_any |- netdev_put(dev, ...) <----------------' Wait refcnt decrement and log splat below To avoid blocking SIOCBRDELBR unnecessarily, let's not call dev_ioctl() for SIOCBRADDIF and SIOCBRDELIF. In the dev_ioctl() path, we do the following: 1. Copy struct ifreq by get_user_ifreq in sock_do_ioctl() 2. Check CAP_NET_ADMIN in dev_ioctl() 3. Call dev_load() in dev_ioctl() 4. Fetch the master dev from ifr.ifr_name in dev_ifsioc() 3. can be done by request_module() in br_ioctl_call(), so we move 1., 2., and 4. to br_ioctl_stub(). Note that 2. is also checked later in add_del_if(), but it's better performed before RTNL. SIOCBRADDIF and SIOCBRDELIF have been processed in dev_ioctl() since the pre-git era, and there seems to be no specific reason to process them there. [0]: unregister_netdevice: waiting for wpan3 to become free. Usage count = 2 ref_tracker: wpan3@ffff8880662d8608 has 1/1 users at __netdev_tracker_alloc include/linux/netdevice.h:4282 [inline] netdev_hold include/linux/netdevice.h:4311 [inline] dev_ifsioc+0xc6a/0x1160 net/core/dev_ioctl.c:624 dev_ioctl+0x255/0x10c0 net/core/dev_ioctl.c:826 sock_do_ioctl+0x1ca/0x260 net/socket.c:1213 sock_ioctl+0x23a/0x6c0 net/socket.c:1318 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl fs/ioctl.c:892 [inline] __x64_sys_ioctl+0x1a4/0x210 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcb/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 893b19587534 ("net: bridge: fix ioctl locking") Reported-by: syzkaller Reported-by: yan kang Reported-by: yue sun Closes: https://lore.kernel.org/netdev/SY8P300MB0421225D54EB92762AE8F0F2A1D32@SY8P300MB0421.AUSP300.PROD.OUTLOOK.COM/ Signed-off-by: Kuniyuki Iwashima Acked-by: Stanislav Fomichev Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250316192851.19781-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/linux/if_bridge.h | 6 ++---- net/bridge/br_ioctl.c | 36 +++++++++++++++++++++++++++++++++--- net/bridge/br_private.h | 3 +-- net/core/dev_ioctl.c | 19 ------------------- net/socket.c | 19 +++++++++---------- 5 files changed, 45 insertions(+), 38 deletions(-) diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 3ff96ae31bf6d..c5fe3b2a53e82 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -65,11 +65,9 @@ struct br_ip_list { #define BR_DEFAULT_AGEING_TIME (300 * HZ) struct net_bridge; -void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, - unsigned int cmd, struct ifreq *ifr, +void brioctl_set(int (*hook)(struct net *net, unsigned int cmd, void __user *uarg)); -int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, - struct ifreq *ifr, void __user *uarg); +int br_ioctl_call(struct net *net, unsigned int cmd, void __user *uarg); #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING) int br_multicast_list_adjacent(struct net_device *dev, diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index f213ed1083618..6bc0a11f2ed3e 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -394,10 +394,26 @@ static int old_deviceless(struct net *net, void __user *data) return -EOPNOTSUPP; } -int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, - struct ifreq *ifr, void __user *uarg) +int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg) { int ret = -EOPNOTSUPP; + struct ifreq ifr; + + if (cmd == SIOCBRADDIF || cmd == SIOCBRDELIF) { + void __user *data; + char *colon; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (get_user_ifreq(&ifr, &data, uarg)) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ - 1] = 0; + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + } rtnl_lock(); @@ -430,7 +446,21 @@ int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, break; case SIOCBRADDIF: case SIOCBRDELIF: - ret = add_del_if(br, ifr->ifr_ifindex, cmd == SIOCBRADDIF); + { + struct net_device *dev; + + dev = __dev_get_by_name(net, ifr.ifr_name); + if (!dev || !netif_device_present(dev)) { + ret = -ENODEV; + break; + } + if (!netif_is_bridge_master(dev)) { + ret = -EOPNOTSUPP; + break; + } + + ret = add_del_if(netdev_priv(dev), ifr.ifr_ifindex, cmd == SIOCBRADDIF); + } break; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1054b8a88edc4..d5b3c5936a79e 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -949,8 +949,7 @@ br_port_get_check_rtnl(const struct net_device *dev) /* br_ioctl.c */ int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user *data, int cmd); -int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, - struct ifreq *ifr, void __user *uarg); +int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg); /* br_multicast.c */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 4c2098ac9d724..57f79f8e84665 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -551,7 +551,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); const struct net_device_ops *ops; - netdevice_tracker dev_tracker; if (!dev) return -ENODEV; @@ -614,22 +613,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, case SIOCWANDEV: return dev_siocwandev(dev, &ifr->ifr_settings); - case SIOCBRADDIF: - case SIOCBRDELIF: - if (!netif_device_present(dev)) - return -ENODEV; - if (!netif_is_bridge_master(dev)) - return -EOPNOTSUPP; - - netdev_hold(dev, &dev_tracker, GFP_KERNEL); - rtnl_net_unlock(net); - - err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL); - - netdev_put(dev, &dev_tracker); - rtnl_net_lock(net); - return err; - case SIOCDEVPRIVATE ... SIOCDEVPRIVATE + 15: return dev_siocdevprivate(dev, ifr, data, cmd); @@ -812,8 +795,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: - case SIOCBRADDIF: - case SIOCBRDELIF: case SIOCSHWTSTAMP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; diff --git a/net/socket.c b/net/socket.c index 28bae5a942341..38227d00d1987 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1145,12 +1145,10 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) */ static DEFINE_MUTEX(br_ioctl_mutex); -static int (*br_ioctl_hook)(struct net *net, struct net_bridge *br, - unsigned int cmd, struct ifreq *ifr, +static int (*br_ioctl_hook)(struct net *net, unsigned int cmd, void __user *uarg); -void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, - unsigned int cmd, struct ifreq *ifr, +void brioctl_set(int (*hook)(struct net *net, unsigned int cmd, void __user *uarg)) { mutex_lock(&br_ioctl_mutex); @@ -1159,8 +1157,7 @@ void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, } EXPORT_SYMBOL(brioctl_set); -int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, - struct ifreq *ifr, void __user *uarg) +int br_ioctl_call(struct net *net, unsigned int cmd, void __user *uarg) { int err = -ENOPKG; @@ -1169,7 +1166,7 @@ int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, mutex_lock(&br_ioctl_mutex); if (br_ioctl_hook) - err = br_ioctl_hook(net, br, cmd, ifr, uarg); + err = br_ioctl_hook(net, cmd, uarg); mutex_unlock(&br_ioctl_mutex); return err; @@ -1269,7 +1266,9 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) case SIOCSIFBR: case SIOCBRADDBR: case SIOCBRDELBR: - err = br_ioctl_call(net, NULL, cmd, NULL, argp); + case SIOCBRADDIF: + case SIOCBRDELIF: + err = br_ioctl_call(net, cmd, argp); break; case SIOCGIFVLAN: case SIOCSIFVLAN: @@ -3429,6 +3428,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGPGRP: case SIOCBRADDBR: case SIOCBRDELBR: + case SIOCBRADDIF: + case SIOCBRDELIF: case SIOCGIFVLAN: case SIOCSIFVLAN: case SIOCGSKNS: @@ -3468,8 +3469,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGIFPFLAGS: case SIOCGIFTXQLEN: case SIOCSIFTXQLEN: - case SIOCBRADDIF: - case SIOCBRDELIF: case SIOCGIFNAME: case SIOCSIFNAME: case SIOCGMIIPHY: From 2f6efbabceb6b2914ee9bafb86d9a51feae9cce8 Mon Sep 17 00:00:00 2001 From: Murad Masimov Date: Mon, 17 Mar 2025 13:53:52 +0300 Subject: [PATCH 13/41] ax25: Remove broken autobind Binding AX25 socket by using the autobind feature leads to memory leaks in ax25_connect() and also refcount leaks in ax25_release(). Memory leak was detected with kmemleak: ================================================================ unreferenced object 0xffff8880253cd680 (size 96): backtrace: __kmalloc_node_track_caller_noprof (./include/linux/kmemleak.h:43) kmemdup_noprof (mm/util.c:136) ax25_rt_autobind (net/ax25/ax25_route.c:428) ax25_connect (net/ax25/af_ax25.c:1282) __sys_connect_file (net/socket.c:2045) __sys_connect (net/socket.c:2064) __x64_sys_connect (net/socket.c:2067) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) ================================================================ When socket is bound, refcounts must be incremented the way it is done in ax25_bind() and ax25_setsockopt() (SO_BINDTODEVICE). In case of autobind, the refcounts are not incremented. This bug leads to the following issue reported by Syzkaller: ================================================================ ax25_connect(): syz-executor318 uses autobind, please contact jreuter@yaina.de ------------[ cut here ]------------ refcount_t: decrement hit 0; leaking memory. WARNING: CPU: 0 PID: 5317 at lib/refcount.c:31 refcount_warn_saturate+0xfa/0x1d0 lib/refcount.c:31 Modules linked in: CPU: 0 UID: 0 PID: 5317 Comm: syz-executor318 Not tainted 6.14.0-rc4-syzkaller-00278-gece144f151ac #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:refcount_warn_saturate+0xfa/0x1d0 lib/refcount.c:31 ... Call Trace: __refcount_dec include/linux/refcount.h:336 [inline] refcount_dec include/linux/refcount.h:351 [inline] ref_tracker_free+0x6af/0x7e0 lib/ref_tracker.c:236 netdev_tracker_free include/linux/netdevice.h:4302 [inline] netdev_put include/linux/netdevice.h:4319 [inline] ax25_release+0x368/0x960 net/ax25/af_ax25.c:1080 __sock_release net/socket.c:647 [inline] sock_close+0xbc/0x240 net/socket.c:1398 __fput+0x3e9/0x9f0 fs/file_table.c:464 __do_sys_close fs/open.c:1580 [inline] __se_sys_close fs/open.c:1565 [inline] __x64_sys_close+0x7f/0x110 fs/open.c:1565 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f ... ================================================================ Considering the issues above and the comments left in the code that say: "check if we can remove this feature. It is broken."; "autobinding in this may or may not work"; - it is better to completely remove this feature than to fix it because it is broken and leads to various kinds of memory bugs. Now calling connect() without first binding socket will result in an error (-EINVAL). Userspace software that relies on the autobind feature might get broken. However, this feature does not seem widely used with this specific driver as it was not reliable at any point of time, and it is already broken anyway. E.g. ax25-tools and ax25-apps packages for popular distributions do not use the autobind feature for AF_AX25. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+33841dc6aa3e1d86b78a@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=33841dc6aa3e1d86b78a Signed-off-by: Murad Masimov Signed-off-by: David S. Miller --- include/net/ax25.h | 1 - net/ax25/af_ax25.c | 30 ++++++------------ net/ax25/ax25_route.c | 74 ------------------------------------------- 3 files changed, 10 insertions(+), 95 deletions(-) diff --git a/include/net/ax25.h b/include/net/ax25.h index 4ee141aae0a29..a7bba42dde153 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -418,7 +418,6 @@ void ax25_rt_device_down(struct net_device *); int ax25_rt_ioctl(unsigned int, void __user *); extern const struct seq_operations ax25_rt_seqops; ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev); -int ax25_rt_autobind(ax25_cb *, ax25_address *); struct sk_buff *ax25_rt_build_path(struct sk_buff *, ax25_address *, ax25_address *, ax25_digi *); void ax25_rt_free(void); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 9f3b8b682adb2..3ee7dba343100 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1270,28 +1270,18 @@ static int __must_check ax25_connect(struct socket *sock, } } - /* - * Must bind first - autobinding in this may or may not work. If - * the socket is already bound, check to see if the device has - * been filled in, error if it hasn't. - */ + /* Must bind first - autobinding does not work. */ if (sock_flag(sk, SOCK_ZAPPED)) { - /* check if we can remove this feature. It is broken. */ - printk(KERN_WARNING "ax25_connect(): %s uses autobind, please contact jreuter@yaina.de\n", - current->comm); - if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) { - kfree(digi); - goto out_release; - } + kfree(digi); + err = -EINVAL; + goto out_release; + } - ax25_fillin_cb(ax25, ax25->ax25_dev); - ax25_cb_add(ax25); - } else { - if (ax25->ax25_dev == NULL) { - kfree(digi); - err = -EHOSTUNREACH; - goto out_release; - } + /* Check to see if the device has been filled in, error if it hasn't. */ + if (ax25->ax25_dev == NULL) { + kfree(digi); + err = -EHOSTUNREACH; + goto out_release; } if (sk->sk_type == SOCK_SEQPACKET && diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 69de75db0c9c2..10577434f40bf 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -373,80 +373,6 @@ ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev) return ax25_rt; } -/* - * Adjust path: If you specify a default route and want to connect - * a target on the digipeater path but w/o having a special route - * set before, the path has to be truncated from your target on. - */ -static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat) -{ - int k; - - for (k = 0; k < digipeat->ndigi; k++) { - if (ax25cmp(addr, &digipeat->calls[k]) == 0) - break; - } - - digipeat->ndigi = k; -} - - -/* - * Find which interface to use. - */ -int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) -{ - ax25_uid_assoc *user; - ax25_route *ax25_rt; - int err = 0; - - ax25_route_lock_use(); - ax25_rt = ax25_get_route(addr, NULL); - if (!ax25_rt) { - ax25_route_lock_unuse(); - return -EHOSTUNREACH; - } - rcu_read_lock(); - if ((ax25->ax25_dev = ax25_dev_ax25dev(ax25_rt->dev)) == NULL) { - err = -EHOSTUNREACH; - goto put; - } - - user = ax25_findbyuid(current_euid()); - if (user) { - ax25->source_addr = user->call; - ax25_uid_put(user); - } else { - if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { - err = -EPERM; - goto put; - } - ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr; - } - - if (ax25_rt->digipeat != NULL) { - ax25->digipeat = kmemdup(ax25_rt->digipeat, sizeof(ax25_digi), - GFP_ATOMIC); - if (ax25->digipeat == NULL) { - err = -ENOMEM; - goto put; - } - ax25_adjust_path(addr, ax25->digipeat); - } - - if (ax25->sk != NULL) { - local_bh_disable(); - bh_lock_sock(ax25->sk); - sock_reset_flag(ax25->sk, SOCK_ZAPPED); - bh_unlock_sock(ax25->sk); - local_bh_enable(); - } - -put: - rcu_read_unlock(); - ax25_route_lock_unuse(); - return err; -} struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src, ax25_address *dest, ax25_digi *digi) From c60d101a226f18e9a8f01bb4c6ca2b47dfcb15ef Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Tue, 18 Mar 2025 11:24:23 +0800 Subject: [PATCH 14/41] net: stmmac: Fix accessing freed irq affinity_hint The cpumask should not be a local variable, since its pointer is saved to irq_desc and may be accessed from procfs. To fix it, use the persistent mask cpumask_of(cpu#). Cc: stable@vger.kernel.org Fixes: 8deec94c6040 ("net: stmmac: set IRQ affinity hint for multi MSI vectors") Signed-off-by: Qingfang Deng Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250318032424.112067-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index c0ae7db96f46f..b7c3bfdaa1802 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3640,7 +3640,6 @@ static int stmmac_request_irq_multi_msi(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); enum request_irq_err irq_err; - cpumask_t cpu_mask; int irq_idx = 0; char *int_name; int ret; @@ -3769,9 +3768,8 @@ static int stmmac_request_irq_multi_msi(struct net_device *dev) irq_idx = i; goto irq_error; } - cpumask_clear(&cpu_mask); - cpumask_set_cpu(i % num_online_cpus(), &cpu_mask); - irq_set_affinity_hint(priv->rx_irq[i], &cpu_mask); + irq_set_affinity_hint(priv->rx_irq[i], + cpumask_of(i % num_online_cpus())); } /* Request Tx MSI irq */ @@ -3794,9 +3792,8 @@ static int stmmac_request_irq_multi_msi(struct net_device *dev) irq_idx = i; goto irq_error; } - cpumask_clear(&cpu_mask); - cpumask_set_cpu(i % num_online_cpus(), &cpu_mask); - irq_set_affinity_hint(priv->tx_irq[i], &cpu_mask); + irq_set_affinity_hint(priv->tx_irq[i], + cpumask_of(i % num_online_cpus())); } return 0; From 81273eb87af86d4a43244b553762348e364b2df7 Mon Sep 17 00:00:00 2001 From: Harshitha Ramamurthy Date: Mon, 17 Mar 2025 21:41:41 +0000 Subject: [PATCH 15/41] gve: unlink old napi only if page pool exists Commit de70981f295e ("gve: unlink old napi when stopping a queue using queue API") unlinks the old napi when stopping a queue. But this breaks QPL mode of the driver which does not use page pool. Fix this by checking that there's a page pool associated with the ring. Cc: stable@vger.kernel.org Fixes: de70981f295e ("gve: unlink old napi when stopping a queue using queue API") Reviewed-by: Joshua Washington Signed-off-by: Harshitha Ramamurthy Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250317214141.286854-1-hramamurthy@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_rx_dqo.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index f0674a4435670..b5be2c18858a0 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -114,7 +114,8 @@ void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx) if (!gve_rx_was_added_to_block(priv, idx)) return; - page_pool_disable_direct_recycling(rx->dqo.page_pool); + if (rx->dqo.page_pool) + page_pool_disable_direct_recycling(rx->dqo.page_pool); gve_remove_napi(priv, ntfy_idx); gve_rx_remove_from_block(priv, idx); gve_rx_reset_ring_dqo(priv, idx); From 3865bec60683b86d39a5d8d6c34a1d269adaa84c Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 19 Mar 2025 14:45:08 +0200 Subject: [PATCH 16/41] net/mlx5e: Fix ethtool -N flow-type ip4 to RSS context There commands can be used to add an RSS context and steer some traffic into it: # ethtool -X eth0 context new New RSS context is 1 # ethtool -N eth0 flow-type ip4 dst-ip 1.1.1.1 context 1 Added rule with ID 1023 However, the second command fails with EINVAL on mlx5e: # ethtool -N eth0 flow-type ip4 dst-ip 1.1.1.1 context 1 rmgr: Cannot insert RX class rule: Invalid argument Cannot insert classification rule It happens when flow_get_tirn calls flow_type_to_traffic_type with flow_type = IP_USER_FLOW or IPV6_USER_FLOW. That function only handles IPV4_FLOW and IPV6_FLOW cases, but unlike all other cases which are common for hash and spec, IPv4 and IPv6 defines different contants for hash and for spec: #define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ #define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ ... #define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ #define IP_USER_FLOW IPV4_USER_FLOW #define IPV6_USER_FLOW 0x0e /* spec only (usr_ip6_spec; nfc only) */ #define IPV4_FLOW 0x10 /* hash only */ #define IPV6_FLOW 0x11 /* hash only */ Extend the switch in flow_type_to_traffic_type to support both, which fixes the failing ethtool -N command with flow-type ip4 or ip6. Fixes: 248d3b4c9a39 ("net/mlx5e: Support flow classification into RSS contexts") Signed-off-by: Maxim Mikityanskiy Tested-by: Daniel Borkmann Reviewed-by: Joe Damato Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20250319124508.3979818-1-maxim@isovalent.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index 773624bb2c5d5..d68230a7b9f46 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -884,8 +884,10 @@ static int flow_type_to_traffic_type(u32 flow_type) case ESP_V6_FLOW: return MLX5_TT_IPV6_IPSEC_ESP; case IPV4_FLOW: + case IP_USER_FLOW: return MLX5_TT_IPV4; case IPV6_FLOW: + case IPV6_USER_FLOW: return MLX5_TT_IPV6; default: return -EINVAL; From 107b25db61122d8f990987895c2912927b8b6e3f Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 21 Mar 2025 14:16:38 -0700 Subject: [PATCH 17/41] bnxt_en: Mask the bd_cnt field in the TX BD properly The bd_cnt field in the TX BD specifies the total number of BDs for the TX packet. The bd_cnt field has 5 bits and the maximum number supported is 32 with the value 0. CONFIG_MAX_SKB_FRAGS can be modified and the total number of SKB fragments can approach or exceed the maximum supported by the chip. Add a macro to properly mask the bd_cnt field so that the value 32 will be properly masked and set to 0 in the bd_cnd field. Without this patch, the out-of-range bd_cnt value will corrupt the TX BD and may cause TX timeout. The next patch will check for values exceeding 32. Fixes: 3948b05950fd ("net: introduce a config option to tweak MAX_SKB_FRAGS") Reviewed-by: Kalesh AP Reviewed-by: Somnath Kotur Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250321211639.3812992-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++-- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 3 +-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 0ddc3d41e2d81..158e9789c1f46 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -564,7 +564,7 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) TX_BD_FLAGS_LHINT_512_AND_SMALLER | TX_BD_FLAGS_COAL_NOW | TX_BD_FLAGS_PACKET_END | - (2 << TX_BD_FLAGS_BD_CNT_SHIFT)); + TX_BD_CNT(2)); if (skb->ip_summed == CHECKSUM_PARTIAL) tx_push1->tx_bd_hsize_lflags = @@ -639,7 +639,7 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) dma_unmap_addr_set(tx_buf, mapping, mapping); flags = (len << TX_BD_LEN_SHIFT) | TX_BD_TYPE_LONG_TX_BD | - ((last_frag + 2) << TX_BD_FLAGS_BD_CNT_SHIFT); + TX_BD_CNT(last_frag + 2); txbd->tx_bd_haddr = cpu_to_le64(mapping); txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod, 2 + last_frag); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 2373f423a523e..3b4a044db73e3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -82,6 +82,8 @@ struct tx_bd { #define TX_OPAQUE_PROD(bp, opq) ((TX_OPAQUE_IDX(opq) + TX_OPAQUE_BDS(opq)) &\ (bp)->tx_ring_mask) +#define TX_BD_CNT(n) (((n) << TX_BD_FLAGS_BD_CNT_SHIFT) & TX_BD_FLAGS_BD_CNT) + struct tx_bd_ext { __le32 tx_bd_hsize_lflags; #define TX_BD_FLAGS_TCP_UDP_CHKSUM (1 << 0) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 299822cacca48..d71bad3cfd6bd 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -48,8 +48,7 @@ struct bnxt_sw_tx_bd *bnxt_xmit_bd(struct bnxt *bp, tx_buf->page = virt_to_head_page(xdp->data); txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; - flags = (len << TX_BD_LEN_SHIFT) | - ((num_frags + 1) << TX_BD_FLAGS_BD_CNT_SHIFT) | + flags = (len << TX_BD_LEN_SHIFT) | TX_BD_CNT(num_frags + 1) | bnxt_lhint_arr[len >> 9]; txbd->tx_bd_len_flags_type = cpu_to_le32(flags); txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod, 1 + num_frags); From b91e82129400bdc40ee1232aa7e32ae6027f9b4f Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 21 Mar 2025 14:16:39 -0700 Subject: [PATCH 18/41] bnxt_en: Linearize TX SKB if the fragments exceed the max If skb_shinfo(skb)->nr_frags excceds what the chip can support, linearize the SKB and warn once to let the user know. net.core.max_skb_frags can be lowered, for example, to avoid the issue. Fixes: 3948b05950fd ("net: introduce a config option to tweak MAX_SKB_FRAGS") Reviewed-by: Somnath Kotur Reviewed-by: Kalesh AP Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250321211639.3812992-3-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 11 +++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 158e9789c1f46..2cd79b59cf002 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -485,6 +485,17 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) txr = &bp->tx_ring[bp->tx_ring_map[i]]; prod = txr->tx_prod; +#if (MAX_SKB_FRAGS > TX_MAX_FRAGS) + if (skb_shinfo(skb)->nr_frags > TX_MAX_FRAGS) { + netdev_warn_once(dev, "SKB has too many (%d) fragments, max supported is %d. SKB will be linearized.\n", + skb_shinfo(skb)->nr_frags, TX_MAX_FRAGS); + if (skb_linearize(skb)) { + dev_kfree_skb_any(skb); + dev_core_stats_tx_dropped_inc(dev); + return NETDEV_TX_OK; + } + } +#endif free_size = bnxt_tx_avail(bp, txr); if (unlikely(free_size < skb_shinfo(skb)->nr_frags + 2)) { /* We must have raced with NAPI cleanup */ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 3b4a044db73e3..d621fb621f30c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -84,6 +84,10 @@ struct tx_bd { #define TX_BD_CNT(n) (((n) << TX_BD_FLAGS_BD_CNT_SHIFT) & TX_BD_FLAGS_BD_CNT) +#define TX_MAX_BD_CNT 32 + +#define TX_MAX_FRAGS (TX_MAX_BD_CNT - 2) + struct tx_bd_ext { __le32 tx_bd_hsize_lflags; #define TX_BD_FLAGS_TCP_UDP_CHKSUM (1 << 0) From f9a457722cf5e3534be5ffab549d6b49737fca72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Mon, 17 Mar 2025 18:32:44 +0100 Subject: [PATCH 19/41] net: dsa: mv88e6xxx: fix VTU methods for 6320 family MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VTU registers of the 6320 family use the 6352 semantics, not 6185. Fix it. Fixes: b8fee9571063 ("net: dsa: mv88e6xxx: add VLAN Get Next support") Signed-off-by: Marek Behún Cc: # 5.15.x Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250317173250.28780-2-kabel@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 5db96ca52505a..06b17c3b2205c 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -5169,8 +5169,8 @@ static const struct mv88e6xxx_ops mv88e6320_ops = { .hardware_reset_pre = mv88e6xxx_g2_eeprom_wait, .hardware_reset_post = mv88e6xxx_g2_eeprom_wait, .reset = mv88e6352_g1_reset, - .vtu_getnext = mv88e6185_g1_vtu_getnext, - .vtu_loadpurge = mv88e6185_g1_vtu_loadpurge, + .vtu_getnext = mv88e6352_g1_vtu_getnext, + .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge, .gpio_ops = &mv88e6352_gpio_ops, .avb_ops = &mv88e6352_avb_ops, .ptp_ops = &mv88e6352_ptp_ops, @@ -5217,8 +5217,8 @@ static const struct mv88e6xxx_ops mv88e6321_ops = { .hardware_reset_pre = mv88e6xxx_g2_eeprom_wait, .hardware_reset_post = mv88e6xxx_g2_eeprom_wait, .reset = mv88e6352_g1_reset, - .vtu_getnext = mv88e6185_g1_vtu_getnext, - .vtu_loadpurge = mv88e6185_g1_vtu_loadpurge, + .vtu_getnext = mv88e6352_g1_vtu_getnext, + .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge, .gpio_ops = &mv88e6352_gpio_ops, .avb_ops = &mv88e6352_avb_ops, .ptp_ops = &mv88e6352_ptp_ops, From 4ae01ec007716986e1a20f1285eb013cbf188830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Mon, 17 Mar 2025 18:32:45 +0100 Subject: [PATCH 20/41] net: dsa: mv88e6xxx: fix atu_move_port_mask for 6341 family MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The atu_move_port_mask for 6341 family (Topaz) is 0xf, not 0x1f. The PortVec field is 8 bits wide, not 11 as in 6390 family. Fix this. Fixes: e606ca36bbf2 ("net: dsa: mv88e6xxx: rework ATU Remove") Signed-off-by: Marek Behún Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250317173250.28780-3-kabel@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 06b17c3b2205c..b989123cdbeb0 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -5818,7 +5818,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global1_addr = 0x1b, .global2_addr = 0x1c, .age_time_coeff = 3750, - .atu_move_port_mask = 0x1f, + .atu_move_port_mask = 0xf, .g1_irqs = 9, .g2_irqs = 10, .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, @@ -6296,7 +6296,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .global1_addr = 0x1b, .global2_addr = 0x1c, .age_time_coeff = 3750, - .atu_move_port_mask = 0x1f, + .atu_move_port_mask = 0xf, .g1_irqs = 9, .g2_irqs = 10, .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, From f85c69369854a43af2c5d3b3896da0908d713133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Mon, 17 Mar 2025 18:32:46 +0100 Subject: [PATCH 21/41] net: dsa: mv88e6xxx: enable PVT for 6321 switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit f36456522168 ("net: dsa: mv88e6xxx: move PVT description in info") did not enable PVT for 6321 switch. Fix it. Fixes: f36456522168 ("net: dsa: mv88e6xxx: move PVT description in info") Signed-off-by: Marek Behún Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250317173250.28780-4-kabel@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index b989123cdbeb0..3cddc8f084b37 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -6274,6 +6274,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .g2_irqs = 10, .stats_type = STATS_TYPE_BANK0 | STATS_TYPE_BANK1, .atu_move_port_mask = 0xf, + .pvt = true, .multi_chip = true, .edsa_support = MV88E6XXX_EDSA_SUPPORTED, .ptp_support = true, From a2ef58e2c4aea4de166fc9832eb2b621e88c98d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Mon, 17 Mar 2025 18:32:47 +0100 Subject: [PATCH 22/41] net: dsa: mv88e6xxx: enable .port_set_policy() for 6320 family MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit f3a2cd326e44 ("net: dsa: mv88e6xxx: introduce .port_set_policy") did not add the .port_set_policy() method for the 6320 family. Fix it. Fixes: f3a2cd326e44 ("net: dsa: mv88e6xxx: introduce .port_set_policy") Signed-off-by: Marek Behún Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250317173250.28780-5-kabel@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 3cddc8f084b37..f886a69d7a3ce 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -5145,6 +5145,7 @@ static const struct mv88e6xxx_ops mv88e6320_ops = { .port_set_rgmii_delay = mv88e6320_port_set_rgmii_delay, .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_tag_remap = mv88e6095_port_tag_remap, + .port_set_policy = mv88e6352_port_set_policy, .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, @@ -5194,6 +5195,7 @@ static const struct mv88e6xxx_ops mv88e6321_ops = { .port_set_rgmii_delay = mv88e6320_port_set_rgmii_delay, .port_set_speed_duplex = mv88e6185_port_set_speed_duplex, .port_tag_remap = mv88e6095_port_tag_remap, + .port_set_policy = mv88e6352_port_set_policy, .port_set_frame_mode = mv88e6351_port_set_frame_mode, .port_set_ucast_flood = mv88e6352_port_set_ucast_flood, .port_set_mcast_flood = mv88e6352_port_set_mcast_flood, From 1428a6109b20e356188c3fb027bdb7998cc2fb98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Mon, 17 Mar 2025 18:32:48 +0100 Subject: [PATCH 23/41] net: dsa: mv88e6xxx: enable STU methods for 6320 family MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c050f5e91b47 ("net: dsa: mv88e6xxx: Fill in STU support for all supported chips") introduced STU methods, but did not add them to the 6320 family. Fix it. Fixes: c050f5e91b47 ("net: dsa: mv88e6xxx: Fill in STU support for all supported chips") Signed-off-by: Marek Behún Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250317173250.28780-6-kabel@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index f886a69d7a3ce..74b8bae226e4b 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -5172,6 +5172,8 @@ static const struct mv88e6xxx_ops mv88e6320_ops = { .reset = mv88e6352_g1_reset, .vtu_getnext = mv88e6352_g1_vtu_getnext, .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge, + .stu_getnext = mv88e6352_g1_stu_getnext, + .stu_loadpurge = mv88e6352_g1_stu_loadpurge, .gpio_ops = &mv88e6352_gpio_ops, .avb_ops = &mv88e6352_avb_ops, .ptp_ops = &mv88e6352_ptp_ops, @@ -5221,6 +5223,8 @@ static const struct mv88e6xxx_ops mv88e6321_ops = { .reset = mv88e6352_g1_reset, .vtu_getnext = mv88e6352_g1_vtu_getnext, .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge, + .stu_getnext = mv88e6352_g1_stu_getnext, + .stu_loadpurge = mv88e6352_g1_stu_loadpurge, .gpio_ops = &mv88e6352_gpio_ops, .avb_ops = &mv88e6352_avb_ops, .ptp_ops = &mv88e6352_ptp_ops, @@ -6241,6 +6245,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .num_internal_phys = 5, .num_gpio = 15, .max_vid = 4095, + .max_sid = 63, .port_base_addr = 0x10, .phy_base_addr = 0x0, .global1_addr = 0x1b, @@ -6267,6 +6272,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .num_internal_phys = 5, .num_gpio = 15, .max_vid = 4095, + .max_sid = 63, .port_base_addr = 0x10, .phy_base_addr = 0x0, .global1_addr = 0x1b, From 52fdc41c3278c981066a461d03d5477ebfcf270c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Mon, 17 Mar 2025 18:32:49 +0100 Subject: [PATCH 24/41] net: dsa: mv88e6xxx: fix internal PHYs for 6320 family MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix internal PHYs definition for the 6320 family, which has only 2 internal PHYs (on ports 3 and 4). Fixes: bc3931557d1d ("net: dsa: mv88e6xxx: Add number of internal PHYs") Signed-off-by: Marek Behún Cc: # 6.6.x Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250317173250.28780-7-kabel@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 74b8bae226e4b..88f479dc328ce 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -6242,7 +6242,8 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .num_databases = 4096, .num_macs = 8192, .num_ports = 7, - .num_internal_phys = 5, + .num_internal_phys = 2, + .internal_phys_offset = 3, .num_gpio = 15, .max_vid = 4095, .max_sid = 63, @@ -6269,7 +6270,8 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .num_databases = 4096, .num_macs = 8192, .num_ports = 7, - .num_internal_phys = 5, + .num_internal_phys = 2, + .internal_phys_offset = 3, .num_gpio = 15, .max_vid = 4095, .max_sid = 63, From 1ebc8e1ef906db9c08e9abe9776d85ddec837725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Mon, 17 Mar 2025 18:32:50 +0100 Subject: [PATCH 25/41] net: dsa: mv88e6xxx: workaround RGMII transmit delay erratum for 6320 family MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the workaround for erratum 3.3 RGMII timing may be out of spec when transmit delay is enabled for the 6320 family, which says: When transmit delay is enabled via Port register 1 bit 14 = 1, duty cycle may be out of spec. Under very rare conditions this may cause the attached device receive CRC errors. Signed-off-by: Marek Behún Cc: # 5.4.x Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250317173250.28780-8-kabel@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 88f479dc328ce..901929f96b380 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3674,6 +3674,21 @@ static int mv88e6xxx_stats_setup(struct mv88e6xxx_chip *chip) return mv88e6xxx_g1_stats_clear(chip); } +static int mv88e6320_setup_errata(struct mv88e6xxx_chip *chip) +{ + u16 dummy; + int err; + + /* Workaround for erratum + * 3.3 RGMII timing may be out of spec when transmit delay is enabled + */ + err = mv88e6xxx_port_hidden_write(chip, 0, 0xf, 0x7, 0xe000); + if (err) + return err; + + return mv88e6xxx_port_hidden_read(chip, 0, 0xf, 0x7, &dummy); +} + /* Check if the errata has already been applied. */ static bool mv88e6390_setup_errata_applied(struct mv88e6xxx_chip *chip) { @@ -5130,6 +5145,7 @@ static const struct mv88e6xxx_ops mv88e6290_ops = { static const struct mv88e6xxx_ops mv88e6320_ops = { /* MV88E6XXX_FAMILY_6320 */ + .setup_errata = mv88e6320_setup_errata, .ieee_pri_map = mv88e6085_g1_ieee_pri_map, .ip_pri_map = mv88e6085_g1_ip_pri_map, .irl_init_all = mv88e6352_g2_irl_init_all, @@ -5182,6 +5198,7 @@ static const struct mv88e6xxx_ops mv88e6320_ops = { static const struct mv88e6xxx_ops mv88e6321_ops = { /* MV88E6XXX_FAMILY_6320 */ + .setup_errata = mv88e6320_setup_errata, .ieee_pri_map = mv88e6085_g1_ieee_pri_map, .ip_pri_map = mv88e6085_g1_ip_pri_map, .irl_init_all = mv88e6352_g2_irl_init_all, From 4af9939a4977e05ccaaa645848f6208e82e06c61 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Tue, 18 Mar 2025 18:36:54 +0800 Subject: [PATCH 26/41] mlxsw: spectrum_acl_bloom_filter: Workaround for some LLVM versions This is a workaround to mitigate a compiler anomaly. During LLVM toolchain compilation of this driver on s390x architecture, an unreasonable __write_overflow_field warning occurs. Contextually, chunk_index is restricted to 0, 1 or 2. By expanding these possibilities, the compile warning is suppressed. Fix follow error with clang-19 when -Werror: In file included from drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_bloom_filter.c:5: In file included from ./include/linux/gfp.h:7: In file included from ./include/linux/mmzone.h:8: In file included from ./include/linux/spinlock.h:63: In file included from ./include/linux/lockdep.h:14: In file included from ./include/linux/smp.h:13: In file included from ./include/linux/cpumask.h:12: In file included from ./include/linux/bitmap.h:13: In file included from ./include/linux/string.h:392: ./include/linux/fortify-string.h:571:4: error: call to '__write_overflow_field' declared with 'warning' attribute: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror,-Wattribute-warning] 571 | __write_overflow_field(p_size_field, size); | ^ 1 error generated. According to the testing, we can be fairly certain that this is a clang compiler bug, impacting only clang-19 and below. Clang versions 20 and 21 do not exhibit this behavior. Link: https://lore.kernel.org/all/484364B641C901CD+20250311141025.1624528-1-wangyuli@uniontech.com/ Fixes: 7585cacdb978 ("mlxsw: spectrum_acl: Add Bloom filter handling") Co-developed-by: Zijian Chen Signed-off-by: Zijian Chen Co-developed-by: Wentao Guan Signed-off-by: Wentao Guan Suggested-by: Paolo Abeni Co-developed-by: Ido Schimmel Signed-off-by: Ido Schimmel Tested-by: Ido Schimmel Tested-by: WangYuli Signed-off-by: WangYuli Link: https://patch.msgid.link/A1858F1D36E653E0+20250318103654.708077-1-wangyuli@uniontech.com Signed-off-by: Jakub Kicinski --- .../mlxsw/spectrum_acl_bloom_filter.c | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_bloom_filter.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_bloom_filter.c index a54eedb69a3f5..067f0055a55af 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_bloom_filter.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_bloom_filter.c @@ -212,7 +212,22 @@ static const u8 mlxsw_sp4_acl_bf_crc6_tab[256] = { * This array defines key offsets for easy access when copying key blocks from * entry key to Bloom filter chunk. */ -static const u8 chunk_key_offsets[MLXSW_BLOOM_KEY_CHUNKS] = {2, 20, 38}; +static char * +mlxsw_sp_acl_bf_enc_key_get(struct mlxsw_sp_acl_atcam_entry *aentry, + u8 chunk_index) +{ + switch (chunk_index) { + case 0: + return &aentry->ht_key.enc_key[2]; + case 1: + return &aentry->ht_key.enc_key[20]; + case 2: + return &aentry->ht_key.enc_key[38]; + default: + WARN_ON_ONCE(1); + return &aentry->ht_key.enc_key[0]; + } +} static u16 mlxsw_sp2_acl_bf_crc16_byte(u16 crc, u8 c) { @@ -235,9 +250,10 @@ __mlxsw_sp_acl_bf_key_encode(struct mlxsw_sp_acl_atcam_region *aregion, u8 key_offset, u8 chunk_key_len, u8 chunk_len) { struct mlxsw_afk_key_info *key_info = aregion->region->key_info; - u8 chunk_index, chunk_count, block_count; + u8 chunk_index, chunk_count; char *chunk = output; __be16 erp_region_id; + u32 block_count; block_count = mlxsw_afk_key_info_blocks_count_get(key_info); chunk_count = 1 + ((block_count - 1) >> 2); @@ -245,12 +261,13 @@ __mlxsw_sp_acl_bf_key_encode(struct mlxsw_sp_acl_atcam_region *aregion, (aregion->region->id << 4)); for (chunk_index = max_chunks - chunk_count; chunk_index < max_chunks; chunk_index++) { + char *enc_key; + memset(chunk, 0, pad_bytes); memcpy(chunk + pad_bytes, &erp_region_id, sizeof(erp_region_id)); - memcpy(chunk + key_offset, - &aentry->ht_key.enc_key[chunk_key_offsets[chunk_index]], - chunk_key_len); + enc_key = mlxsw_sp_acl_bf_enc_key_get(aentry, chunk_index); + memcpy(chunk + key_offset, enc_key, chunk_key_len); chunk += chunk_len; } *len = chunk_count * chunk_len; From 00eb88752f48615ae7b4c1df6f9271cdd62c1d95 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 18 Mar 2025 13:57:14 +0200 Subject: [PATCH 27/41] net: dsa: sja1105: fix displaced ethtool statistics counters Port counters with no name (aka sja1105_port_counters[__SJA1105_COUNTER_UNUSED]) are skipped when reporting sja1105_get_sset_count(), but are not skipped during sja1105_get_strings() and sja1105_get_ethtool_stats(). As a consequence, the first reported counter has an empty name and a bogus value (reads from area 0, aka MAC, from offset 0, bits start:end 0:0). Also, the last counter (N_NOT_REACH on E/T, N_RX_BCAST on P/Q/R/S) gets pushed out of the statistics counters that get shown. Skip __SJA1105_COUNTER_UNUSED consistently, so that the bogus counter with an empty name disappears, and in its place appears a valid counter. Fixes: 039b167d68a3 ("net: dsa: sja1105: don't use burst SPI reads for port statistics") Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250318115716.2124395-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/sja1105_ethtool.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_ethtool.c b/drivers/net/dsa/sja1105/sja1105_ethtool.c index 2ea64b1d026d7..84d7d3f66bd03 100644 --- a/drivers/net/dsa/sja1105/sja1105_ethtool.c +++ b/drivers/net/dsa/sja1105/sja1105_ethtool.c @@ -571,6 +571,9 @@ void sja1105_get_ethtool_stats(struct dsa_switch *ds, int port, u64 *data) max_ctr = __MAX_SJA1105PQRS_PORT_COUNTER; for (i = 0; i < max_ctr; i++) { + if (!strlen(sja1105_port_counters[i].name)) + continue; + rc = sja1105_port_counter_read(priv, port, i, &data[k++]); if (rc) { dev_err(ds->dev, @@ -596,8 +599,12 @@ void sja1105_get_strings(struct dsa_switch *ds, int port, else max_ctr = __MAX_SJA1105PQRS_PORT_COUNTER; - for (i = 0; i < max_ctr; i++) + for (i = 0; i < max_ctr; i++) { + if (!strlen(sja1105_port_counters[i].name)) + continue; + ethtool_puts(&data, sja1105_port_counters[i].name); + } } int sja1105_get_sset_count(struct dsa_switch *ds, int port, int sset) From b6a177b559717b707087114e08537fd47a4d1aca Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 18 Mar 2025 13:57:15 +0200 Subject: [PATCH 28/41] net: dsa: sja1105: reject other RX filters than HWTSTAMP_FILTER_PTP_V2_L2_EVENT This is all that we can support timestamping, so we shouldn't accept anything else. Also see sja1105_hwtstamp_get(). To avoid erroring out in an inconsistent state, operate on copies of priv->hwts_rx_en and priv->hwts_tx_en, and write them back when nothing else can fail anymore. Fixes: a602afd200f5 ("net: dsa: sja1105: Expose PTP timestamping ioctls to userspace") Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250318115716.2124395-3-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/sja1105_ptp.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index a1f4ca6ad888f..08b45fdd1d248 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -61,17 +61,21 @@ enum sja1105_ptp_clk_mode { int sja1105_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr) { struct sja1105_private *priv = ds->priv; + unsigned long hwts_tx_en, hwts_rx_en; struct hwtstamp_config config; if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) return -EFAULT; + hwts_tx_en = priv->hwts_tx_en; + hwts_rx_en = priv->hwts_rx_en; + switch (config.tx_type) { case HWTSTAMP_TX_OFF: - priv->hwts_tx_en &= ~BIT(port); + hwts_tx_en &= ~BIT(port); break; case HWTSTAMP_TX_ON: - priv->hwts_tx_en |= BIT(port); + hwts_tx_en |= BIT(port); break; default: return -ERANGE; @@ -79,15 +83,21 @@ int sja1105_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr) switch (config.rx_filter) { case HWTSTAMP_FILTER_NONE: - priv->hwts_rx_en &= ~BIT(port); + hwts_rx_en &= ~BIT(port); break; - default: - priv->hwts_rx_en |= BIT(port); + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + hwts_rx_en |= BIT(port); break; + default: + return -ERANGE; } if (copy_to_user(ifr->ifr_data, &config, sizeof(config))) return -EFAULT; + + priv->hwts_tx_en = hwts_tx_en; + priv->hwts_rx_en = hwts_rx_en; + return 0; } From 5f2b28b79d2d1946ee36ad8b3dc0066f73c90481 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 18 Mar 2025 13:57:16 +0200 Subject: [PATCH 29/41] net: dsa: sja1105: fix kasan out-of-bounds warning in sja1105_table_delete_entry() There are actually 2 problems: - deleting the last element doesn't require the memmove of elements [i + 1, end) over it. Actually, element i+1 is out of bounds. - The memmove itself should move size - i - 1 elements, because the last element is out of bounds. The out-of-bounds element still remains out of bounds after being accessed, so the problem is only that we touch it, not that it becomes in active use. But I suppose it can lead to issues if the out-of-bounds element is part of an unmapped page. Fixes: 6666cebc5e30 ("net: dsa: sja1105: Add support for VLAN operations") Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250318115716.2124395-4-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/sja1105_static_config.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_static_config.c b/drivers/net/dsa/sja1105/sja1105_static_config.c index 3d790f8c6f4da..ffece8a400a66 100644 --- a/drivers/net/dsa/sja1105/sja1105_static_config.c +++ b/drivers/net/dsa/sja1105/sja1105_static_config.c @@ -1917,8 +1917,10 @@ int sja1105_table_delete_entry(struct sja1105_table *table, int i) if (i > table->entry_count) return -ERANGE; - memmove(entries + i * entry_size, entries + (i + 1) * entry_size, - (table->entry_count - i) * entry_size); + if (i + 1 < table->entry_count) { + memmove(entries + i * entry_size, entries + (i + 1) * entry_size, + (table->entry_count - i - 1) * entry_size); + } table->entry_count--; From bdf549a7a4d738838d50833168881a0b6247446a Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 18 Mar 2025 22:51:16 +0200 Subject: [PATCH 30/41] net/mlx5: LAG, reload representors on LAG creation failure When LAG creation fails, the driver reloads the RDMA devices. If RDMA representors are present, they should also be reloaded. This step was missed in the cited commit. Fixes: 598fe77df855 ("net/mlx5: Lag, Create shared FDB when in switchdev mode") Signed-off-by: Mark Bloch Reviewed-by: Shay Drori Signed-off-by: Tariq Toukan Reviewed-by: Michal Swiatkowski Reviewed-by: Kalesh AP Link: https://patch.msgid.link/1742331077-102038-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index ed2ba272946b9..6c9737c537348 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -1052,6 +1052,10 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) if (err) { if (shared_fdb || roce_lag) mlx5_lag_add_devices(ldev); + if (shared_fdb) { + mlx5_ldev_for_each(i, 0, ldev) + mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch); + } return; } else if (roce_lag) { From 1726ad035cb0c93cc5c3f2227ec71322ccd7c2f8 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 18 Mar 2025 22:51:17 +0200 Subject: [PATCH 31/41] net/mlx5: Start health poll after enable hca The health poll mechanism performs periodic checks to detect firmware errors. One of the checks verifies the function is still enabled on firmware side, but the function is enabled only after enable_hca command completed. Start health poll after enable_hca command to avoid a race between function enabled and first health polling. Fixes: 9b98d395b85d ("net/mlx5: Start health poll at earlier stage of driver load") Signed-off-by: Moshe Shemesh Reviewed-by: Shay Drori Signed-off-by: Tariq Toukan Reviewed-by: Michal Swiatkowski Reviewed-by: Kalesh AP Link: https://patch.msgid.link/1742331077-102038-3-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index ec956c4bcebdb..7c3312d6aed9b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1205,24 +1205,24 @@ static int mlx5_function_enable(struct mlx5_core_dev *dev, bool boot, u64 timeou dev->caps.embedded_cpu = mlx5_read_embedded_cpu(dev); mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_UP); - mlx5_start_health_poll(dev); - err = mlx5_core_enable_hca(dev, 0); if (err) { mlx5_core_err(dev, "enable hca failed\n"); - goto stop_health_poll; + goto err_cmd_cleanup; } + mlx5_start_health_poll(dev); + err = mlx5_core_set_issi(dev); if (err) { mlx5_core_err(dev, "failed to set issi\n"); - goto err_disable_hca; + goto stop_health_poll; } err = mlx5_satisfy_startup_pages(dev, 1); if (err) { mlx5_core_err(dev, "failed to allocate boot pages\n"); - goto err_disable_hca; + goto stop_health_poll; } err = mlx5_tout_query_dtor(dev); @@ -1235,10 +1235,9 @@ static int mlx5_function_enable(struct mlx5_core_dev *dev, bool boot, u64 timeou reclaim_boot_pages: mlx5_reclaim_startup_pages(dev); -err_disable_hca: - mlx5_core_disable_hca(dev, 0); stop_health_poll: mlx5_stop_health_poll(dev, boot); + mlx5_core_disable_hca(dev, 0); err_cmd_cleanup: mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN); mlx5_cmd_disable(dev); @@ -1249,8 +1248,8 @@ static int mlx5_function_enable(struct mlx5_core_dev *dev, bool boot, u64 timeou static void mlx5_function_disable(struct mlx5_core_dev *dev, bool boot) { mlx5_reclaim_startup_pages(dev); - mlx5_core_disable_hca(dev, 0); mlx5_stop_health_poll(dev, boot); + mlx5_core_disable_hca(dev, 0); mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN); mlx5_cmd_disable(dev); } From 0dd765fae295832934bf28e45dd5a355e0891ed4 Mon Sep 17 00:00:00 2001 From: Sankararaman Jayaraman Date: Thu, 20 Mar 2025 10:25:22 +0530 Subject: [PATCH 32/41] vmxnet3: unregister xdp rxq info in the reset path vmxnet3 does not unregister xdp rxq info in the vmxnet3_reset_work() code path as vmxnet3_rq_destroy() is not invoked in this code path. So, we get below message with a backtrace. Missing unregister, handled but fix driver WARNING: CPU:48 PID: 500 at net/core/xdp.c:182 __xdp_rxq_info_reg+0x93/0xf0 This patch fixes the problem by moving the unregister code of XDP from vmxnet3_rq_destroy() to vmxnet3_rq_cleanup(). Fixes: 54f00cce1178 ("vmxnet3: Add XDP support.") Signed-off-by: Sankararaman Jayaraman Signed-off-by: Ronak Doshi Link: https://patch.msgid.link/20250320045522.57892-1-sankararaman.jayaraman@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/vmxnet3/vmxnet3_drv.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 6793fa09f9d1a..3df6aabc7e339 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -2033,6 +2033,11 @@ vmxnet3_rq_cleanup(struct vmxnet3_rx_queue *rq, rq->comp_ring.gen = VMXNET3_INIT_GEN; rq->comp_ring.next2proc = 0; + + if (xdp_rxq_info_is_reg(&rq->xdp_rxq)) + xdp_rxq_info_unreg(&rq->xdp_rxq); + page_pool_destroy(rq->page_pool); + rq->page_pool = NULL; } @@ -2073,11 +2078,6 @@ static void vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq, } } - if (xdp_rxq_info_is_reg(&rq->xdp_rxq)) - xdp_rxq_info_unreg(&rq->xdp_rxq); - page_pool_destroy(rq->page_pool); - rq->page_pool = NULL; - if (rq->data_ring.base) { dma_free_coherent(&adapter->pdev->dev, rq->rx_ring[0].size * rq->data_ring.desc_size, From 094ee6017ea09c11d6af187935a949df32803ce0 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Fri, 21 Mar 2025 12:48:52 +0800 Subject: [PATCH 33/41] bonding: check xdp prog when set bond mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following operations can trigger a warning[1]: ip netns add ns1 ip netns exec ns1 ip link add bond0 type bond mode balance-rr ip netns exec ns1 ip link set dev bond0 xdp obj af_xdp_kern.o sec xdp ip netns exec ns1 ip link set bond0 type bond mode broadcast ip netns del ns1 When delete the namespace, dev_xdp_uninstall() is called to remove xdp program on bond dev, and bond_xdp_set() will check the bond mode. If bond mode is changed after attaching xdp program, the warning may occur. Some bond modes (broadcast, etc.) do not support native xdp. Set bond mode with xdp program attached is not good. Add check for xdp program when set bond mode. [1] ------------[ cut here ]------------ WARNING: CPU: 0 PID: 11 at net/core/dev.c:9912 unregister_netdevice_many_notify+0x8d9/0x930 Modules linked in: CPU: 0 UID: 0 PID: 11 Comm: kworker/u4:0 Not tainted 6.14.0-rc4 #107 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 Workqueue: netns cleanup_net RIP: 0010:unregister_netdevice_many_notify+0x8d9/0x930 Code: 00 00 48 c7 c6 6f e3 a2 82 48 c7 c7 d0 b3 96 82 e8 9c 10 3e ... RSP: 0018:ffffc90000063d80 EFLAGS: 00000282 RAX: 00000000ffffffa1 RBX: ffff888004959000 RCX: 00000000ffffdfff RDX: 0000000000000000 RSI: 00000000ffffffea RDI: ffffc90000063b48 RBP: ffffc90000063e28 R08: ffffffff82d39b28 R09: 0000000000009ffb R10: 0000000000000175 R11: ffffffff82d09b40 R12: ffff8880049598e8 R13: 0000000000000001 R14: dead000000000100 R15: ffffc90000045000 FS: 0000000000000000(0000) GS:ffff888007a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000d406b60 CR3: 000000000483e000 CR4: 00000000000006f0 Call Trace: ? __warn+0x83/0x130 ? unregister_netdevice_many_notify+0x8d9/0x930 ? report_bug+0x18e/0x1a0 ? handle_bug+0x54/0x90 ? exc_invalid_op+0x18/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? unregister_netdevice_many_notify+0x8d9/0x930 ? bond_net_exit_batch_rtnl+0x5c/0x90 cleanup_net+0x237/0x3d0 process_one_work+0x163/0x390 worker_thread+0x293/0x3b0 ? __pfx_worker_thread+0x10/0x10 kthread+0xec/0x1e0 ? __pfx_kthread+0x10/0x10 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2f/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 ---[ end trace 0000000000000000 ]--- Fixes: 9e2ee5c7e7c3 ("net, bonding: Add XDP support to the bonding driver") Signed-off-by: Wang Liang Acked-by: Jussi Maki Reviewed-by: Nikolay Aleksandrov Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250321044852.1086551-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 8 ++++---- drivers/net/bonding/bond_options.c | 3 +++ include/net/bonding.h | 1 + 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e45bba240cbcd..4da5fcb7def47 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -322,9 +322,9 @@ static bool bond_sk_check(struct bonding *bond) } } -static bool bond_xdp_check(struct bonding *bond) +bool bond_xdp_check(struct bonding *bond, int mode) { - switch (BOND_MODE(bond)) { + switch (mode) { case BOND_MODE_ROUNDROBIN: case BOND_MODE_ACTIVEBACKUP: return true; @@ -1937,7 +1937,7 @@ void bond_xdp_set_features(struct net_device *bond_dev) ASSERT_RTNL(); - if (!bond_xdp_check(bond) || !bond_has_slaves(bond)) { + if (!bond_xdp_check(bond, BOND_MODE(bond)) || !bond_has_slaves(bond)) { xdp_clear_features_flag(bond_dev); return; } @@ -5699,7 +5699,7 @@ static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog, ASSERT_RTNL(); - if (!bond_xdp_check(bond)) { + if (!bond_xdp_check(bond, BOND_MODE(bond))) { BOND_NL_ERR(dev, extack, "No native XDP support for the current bonding mode"); return -EOPNOTSUPP; diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index d1b095af253bd..91893c29b8995 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -868,6 +868,9 @@ static bool bond_set_xfrm_features(struct bonding *bond) static int bond_option_mode_set(struct bonding *bond, const struct bond_opt_value *newval) { + if (bond->xdp_prog && !bond_xdp_check(bond, newval->value)) + return -EOPNOTSUPP; + if (!bond_mode_uses_arp(newval->value)) { if (bond->params.arp_interval) { netdev_dbg(bond->dev, "%s mode is incompatible with arp monitoring, start mii monitoring\n", diff --git a/include/net/bonding.h b/include/net/bonding.h index 8bb5f016969f1..95f67b308c19a 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -695,6 +695,7 @@ void bond_debug_register(struct bonding *bond); void bond_debug_unregister(struct bonding *bond); void bond_debug_reregister(struct bonding *bond); const char *bond_mode_name(int mode); +bool bond_xdp_check(struct bonding *bond, int mode); void bond_setup(struct net_device *bond_dev); unsigned int bond_get_num_tx_queues(void); int bond_netlink_init(void); From d93a6caab5d7d9b5ce034d75b1e1e993338e3852 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 20 Mar 2025 16:29:51 -0500 Subject: [PATCH 34/41] ibmvnic: Use kernel helpers for hex dumps Previously, when the driver was printing hex dumps, the buffer was cast to an 8 byte long and printed using string formatters. If the buffer size was not a multiple of 8 then a read buffer overflow was possible. Therefore, create a new ibmvnic function that loops over a buffer and calls hex_dump_to_buffer instead. This patch address KASAN reports like the one below: ibmvnic 30000003 env3: Login Buffer: ibmvnic 30000003 env3: 01000000af000000 <...> ibmvnic 30000003 env3: 2e6d62692e736261 ibmvnic 30000003 env3: 65050003006d6f63 ================================================================== BUG: KASAN: slab-out-of-bounds in ibmvnic_login+0xacc/0xffc [ibmvnic] Read of size 8 at addr c0000001331a9aa8 by task ip/17681 <...> Allocated by task 17681: <...> ibmvnic_login+0x2f0/0xffc [ibmvnic] ibmvnic_open+0x148/0x308 [ibmvnic] __dev_open+0x1ac/0x304 <...> The buggy address is located 168 bytes inside of allocated 175-byte region [c0000001331a9a00, c0000001331a9aaf) <...> ================================================================= ibmvnic 30000003 env3: 000000000033766e Fixes: 032c5e82847a ("Driver for IBM System i/p VNIC protocol") Signed-off-by: Nick Child Reviewed-by: Dave Marquardt Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250320212951.11142-1-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 0676fc547b6f4..480606d1245ea 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4829,6 +4829,18 @@ static void vnic_add_client_data(struct ibmvnic_adapter *adapter, strscpy(vlcd->name, adapter->netdev->name, len); } +static void ibmvnic_print_hex_dump(struct net_device *dev, void *buf, + size_t len) +{ + unsigned char hex_str[16 * 3]; + + for (size_t i = 0; i < len; i += 16) { + hex_dump_to_buffer((unsigned char *)buf + i, len - i, 16, 8, + hex_str, sizeof(hex_str), false); + netdev_dbg(dev, "%s\n", hex_str); + } +} + static int send_login(struct ibmvnic_adapter *adapter) { struct ibmvnic_login_rsp_buffer *login_rsp_buffer; @@ -4939,10 +4951,8 @@ static int send_login(struct ibmvnic_adapter *adapter) vnic_add_client_data(adapter, vlcd); netdev_dbg(adapter->netdev, "Login Buffer:\n"); - for (i = 0; i < (adapter->login_buf_sz - 1) / 8 + 1; i++) { - netdev_dbg(adapter->netdev, "%016lx\n", - ((unsigned long *)(adapter->login_buf))[i]); - } + ibmvnic_print_hex_dump(adapter->netdev, adapter->login_buf, + adapter->login_buf_sz); memset(&crq, 0, sizeof(crq)); crq.login.first = IBMVNIC_CRQ_CMD; @@ -5319,15 +5329,13 @@ static void handle_query_ip_offload_rsp(struct ibmvnic_adapter *adapter) { struct device *dev = &adapter->vdev->dev; struct ibmvnic_query_ip_offload_buffer *buf = &adapter->ip_offload_buf; - int i; dma_unmap_single(dev, adapter->ip_offload_tok, sizeof(adapter->ip_offload_buf), DMA_FROM_DEVICE); netdev_dbg(adapter->netdev, "Query IP Offload Buffer:\n"); - for (i = 0; i < (sizeof(adapter->ip_offload_buf) - 1) / 8 + 1; i++) - netdev_dbg(adapter->netdev, "%016lx\n", - ((unsigned long *)(buf))[i]); + ibmvnic_print_hex_dump(adapter->netdev, buf, + sizeof(adapter->ip_offload_buf)); netdev_dbg(adapter->netdev, "ipv4_chksum = %d\n", buf->ipv4_chksum); netdev_dbg(adapter->netdev, "ipv6_chksum = %d\n", buf->ipv6_chksum); @@ -5558,10 +5566,8 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq, netdev->mtu = adapter->req_mtu - ETH_HLEN; netdev_dbg(adapter->netdev, "Login Response Buffer:\n"); - for (i = 0; i < (adapter->login_rsp_buf_sz - 1) / 8 + 1; i++) { - netdev_dbg(adapter->netdev, "%016lx\n", - ((unsigned long *)(adapter->login_rsp_buf))[i]); - } + ibmvnic_print_hex_dump(netdev, adapter->login_rsp_buf, + adapter->login_rsp_buf_sz); /* Sanity checks */ if (login->num_txcomp_subcrqs != login_rsp->num_txsubm_subcrqs || From 0032c99e83b9ce6d5995d65900aa4b6ffb501cce Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Fri, 21 Mar 2025 17:03:53 +0800 Subject: [PATCH 35/41] net: fix NULL pointer dereference in l3mdev_l3_rcv When delete l3s ipvlan: ip link del link eth0 ipvlan1 type ipvlan mode l3s This may cause a null pointer dereference: Call trace: ip_rcv_finish+0x48/0xd0 ip_rcv+0x5c/0x100 __netif_receive_skb_one_core+0x64/0xb0 __netif_receive_skb+0x20/0x80 process_backlog+0xb4/0x204 napi_poll+0xe8/0x294 net_rx_action+0xd8/0x22c __do_softirq+0x12c/0x354 This is because l3mdev_l3_rcv() visit dev->l3mdev_ops after ipvlan_l3s_unregister() assign the dev->l3mdev_ops to NULL. The process like this: (CPU1) | (CPU2) l3mdev_l3_rcv() | check dev->priv_flags: | master = skb->dev; | | | ipvlan_l3s_unregister() | set dev->priv_flags | dev->l3mdev_ops = NULL; | visit master->l3mdev_ops | To avoid this by do not set dev->l3mdev_ops when unregister l3s ipvlan. Suggested-by: David Ahern Fixes: c675e06a98a4 ("ipvlan: decouple l3s mode dependencies from other modes") Signed-off-by: Wang Liang Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250321090353.1170545-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ipvlan/ipvlan_l3s.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ipvlan/ipvlan_l3s.c b/drivers/net/ipvlan/ipvlan_l3s.c index b4ef386bdb1ba..7c017fe35522a 100644 --- a/drivers/net/ipvlan/ipvlan_l3s.c +++ b/drivers/net/ipvlan/ipvlan_l3s.c @@ -226,5 +226,4 @@ void ipvlan_l3s_unregister(struct ipvl_port *port) dev->priv_flags &= ~IFF_L3MDEV_RX_HANDLER; ipvlan_unregister_nf_hook(read_pnet(&port->pnet)); - dev->l3mdev_ops = NULL; } From 1ae1d705a1120e8e0ca41698c5a0fff6f5290bc1 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 21 Mar 2025 15:10:44 +0100 Subject: [PATCH 36/41] net: dsa: microchip: fix DCB apptrust configuration on KSZ88x3 Remove KSZ88x3-specific priority and apptrust configuration logic that was based on incorrect register access assumptions. Also fix the register offset for KSZ8_REG_PORT_1_CTRL_0 to align with get_port_addr() logic. The KSZ88x3 switch family uses a different register layout compared to KSZ9477-compatible variants. Specifically, port control registers need offset adjustment through get_port_addr(), and do not match the datasheet values directly. Commit a1ea57710c9d ("net: dsa: microchip: dcb: add special handling for KSZ88X3 family") introduced quirks based on datasheet offsets, which do not work with the driver's internal addressing model. As a result, these quirks addressed the wrong ports and caused unstable behavior. This patch removes all KSZ88x3-specific DCB quirks and corrects the port control register offset, effectively restoring working and predictable apptrust configuration. Fixes: a1ea57710c9d ("net: dsa: microchip: dcb: add special handling for KSZ88X3 family") Signed-off-by: Oleksij Rempel Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250321141044.2128973-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz8.c | 11 +- drivers/net/dsa/microchip/ksz_dcb.c | 231 +--------------------------- 2 files changed, 9 insertions(+), 233 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz8.c b/drivers/net/dsa/microchip/ksz8.c index da7110d675583..be433b4e2b1ca 100644 --- a/drivers/net/dsa/microchip/ksz8.c +++ b/drivers/net/dsa/microchip/ksz8.c @@ -1625,7 +1625,6 @@ void ksz8_port_setup(struct ksz_device *dev, int port, bool cpu_port) const u16 *regs = dev->info->regs; struct dsa_switch *ds = dev->ds; const u32 *masks; - int queues; u8 member; masks = dev->info->masks; @@ -1633,15 +1632,7 @@ void ksz8_port_setup(struct ksz_device *dev, int port, bool cpu_port) /* enable broadcast storm limit */ ksz_port_cfg(dev, port, P_BCAST_STORM_CTRL, PORT_BROADCAST_STORM, true); - /* For KSZ88x3 enable only one queue by default, otherwise we won't - * be able to get rid of PCP prios on Port 2. - */ - if (ksz_is_ksz88x3(dev)) - queues = 1; - else - queues = dev->info->num_tx_queues; - - ksz8_port_queue_split(dev, port, queues); + ksz8_port_queue_split(dev, port, dev->info->num_tx_queues); /* replace priority */ ksz_port_cfg(dev, port, P_802_1P_CTRL, diff --git a/drivers/net/dsa/microchip/ksz_dcb.c b/drivers/net/dsa/microchip/ksz_dcb.c index 30b4a6186e38f..c3b501997ac94 100644 --- a/drivers/net/dsa/microchip/ksz_dcb.c +++ b/drivers/net/dsa/microchip/ksz_dcb.c @@ -10,7 +10,12 @@ #include "ksz_dcb.h" #include "ksz8.h" -#define KSZ8_REG_PORT_1_CTRL_0 0x10 +/* Port X Control 0 register. + * The datasheet specifies: Port 1 - 0x10, Port 2 - 0x20, Port 3 - 0x30. + * However, the driver uses get_port_addr(), which maps Port 1 to offset 0. + * Therefore, we define the base offset as 0x00 here to align with that logic. + */ +#define KSZ8_REG_PORT_1_CTRL_0 0x00 #define KSZ8_PORT_DIFFSERV_ENABLE BIT(6) #define KSZ8_PORT_802_1P_ENABLE BIT(5) #define KSZ8_PORT_BASED_PRIO_M GENMASK(4, 3) @@ -181,49 +186,6 @@ int ksz_port_get_default_prio(struct dsa_switch *ds, int port) return (data & mask) >> shift; } -/** - * ksz88x3_port_set_default_prio_quirks - Quirks for default priority - * @dev: Pointer to the KSZ switch device structure - * @port: Port number for which to set the default priority - * @prio: Priority value to set - * - * This function implements quirks for setting the default priority on KSZ88x3 - * devices. On Port 2, no other priority providers are working - * except of PCP. So, configuring default priority on Port 2 is not possible. - * On Port 1, it is not possible to configure port priority if PCP - * apptrust on Port 2 is disabled. Since we disable multiple queues on the - * switch to disable PCP on Port 2, we need to ensure that the default priority - * configuration on Port 1 is in agreement with the configuration on Port 2. - * - * Return: 0 on success, or a negative error code on failure - */ -static int ksz88x3_port_set_default_prio_quirks(struct ksz_device *dev, int port, - u8 prio) -{ - if (!prio) - return 0; - - if (port == KSZ_PORT_2) { - dev_err(dev->dev, "Port priority configuration is not working on Port 2\n"); - return -EINVAL; - } else if (port == KSZ_PORT_1) { - u8 port2_data; - int ret; - - ret = ksz_pread8(dev, KSZ_PORT_2, KSZ8_REG_PORT_1_CTRL_0, - &port2_data); - if (ret) - return ret; - - if (!(port2_data & KSZ8_PORT_802_1P_ENABLE)) { - dev_err(dev->dev, "Not possible to configure port priority on Port 1 if PCP apptrust on Port 2 is disabled\n"); - return -EINVAL; - } - } - - return 0; -} - /** * ksz_port_set_default_prio - Sets the default priority for a port on a KSZ * switch @@ -239,18 +201,12 @@ static int ksz88x3_port_set_default_prio_quirks(struct ksz_device *dev, int port int ksz_port_set_default_prio(struct dsa_switch *ds, int port, u8 prio) { struct ksz_device *dev = ds->priv; - int reg, shift, ret; + int reg, shift; u8 mask; if (prio >= dev->info->num_ipms) return -EINVAL; - if (ksz_is_ksz88x3(dev)) { - ret = ksz88x3_port_set_default_prio_quirks(dev, port, prio); - if (ret) - return ret; - } - ksz_get_default_port_prio_reg(dev, ®, &mask, &shift); return ksz_prmw8(dev, port, reg, mask, (prio << shift) & mask); @@ -518,155 +474,6 @@ static int ksz_port_set_apptrust_validate(struct ksz_device *dev, int port, return -EINVAL; } -/** - * ksz88x3_port1_apptrust_quirk - Quirk for apptrust configuration on Port 1 - * of KSZ88x3 devices - * @dev: Pointer to the KSZ switch device structure - * @port: Port number for which to set the apptrust selectors - * @reg: Register address for the apptrust configuration - * @port1_data: Data to set for the apptrust configuration - * - * This function implements a quirk for apptrust configuration on Port 1 of - * KSZ88x3 devices. It ensures that apptrust configuration on Port 1 is not - * possible if PCP apptrust on Port 2 is disabled. This is because the Port 2 - * seems to be permanently hardwired to PCP classification, so we need to - * do Port 1 configuration always in agreement with Port 2 configuration. - * - * Return: 0 on success, or a negative error code on failure - */ -static int ksz88x3_port1_apptrust_quirk(struct ksz_device *dev, int port, - int reg, u8 port1_data) -{ - u8 port2_data; - int ret; - - /* If no apptrust is requested for Port 1, no need to care about Port 2 - * configuration. - */ - if (!(port1_data & (KSZ8_PORT_802_1P_ENABLE | KSZ8_PORT_DIFFSERV_ENABLE))) - return 0; - - /* We got request to enable any apptrust on Port 1. To make it possible, - * we need to enable multiple queues on the switch. If we enable - * multiqueue support, PCP classification on Port 2 will be - * automatically activated by HW. - */ - ret = ksz_pread8(dev, KSZ_PORT_2, reg, &port2_data); - if (ret) - return ret; - - /* If KSZ8_PORT_802_1P_ENABLE bit is set on Port 2, the driver showed - * the interest in PCP classification on Port 2. In this case, - * multiqueue support is enabled and we can enable any apptrust on - * Port 1. - * If KSZ8_PORT_802_1P_ENABLE bit is not set on Port 2, the PCP - * classification on Port 2 is still active, but the driver disabled - * multiqueue support and made frame prioritization inactive for - * all ports. In this case, we can't enable any apptrust on Port 1. - */ - if (!(port2_data & KSZ8_PORT_802_1P_ENABLE)) { - dev_err(dev->dev, "Not possible to enable any apptrust on Port 1 if PCP apptrust on Port 2 is disabled\n"); - return -EINVAL; - } - - return 0; -} - -/** - * ksz88x3_port2_apptrust_quirk - Quirk for apptrust configuration on Port 2 - * of KSZ88x3 devices - * @dev: Pointer to the KSZ switch device structure - * @port: Port number for which to set the apptrust selectors - * @reg: Register address for the apptrust configuration - * @port2_data: Data to set for the apptrust configuration - * - * This function implements a quirk for apptrust configuration on Port 2 of - * KSZ88x3 devices. It ensures that DSCP apptrust is not working on Port 2 and - * that it is not possible to disable PCP on Port 2. The only way to disable PCP - * on Port 2 is to disable multiple queues on the switch. - * - * Return: 0 on success, or a negative error code on failure - */ -static int ksz88x3_port2_apptrust_quirk(struct ksz_device *dev, int port, - int reg, u8 port2_data) -{ - struct dsa_switch *ds = dev->ds; - u8 port1_data; - int ret; - - /* First validate Port 2 configuration. DiffServ/DSCP is not working - * on this port. - */ - if (port2_data & KSZ8_PORT_DIFFSERV_ENABLE) { - dev_err(dev->dev, "DSCP apptrust is not working on Port 2\n"); - return -EINVAL; - } - - /* If PCP support is requested, we need to enable all queues on the - * switch to make PCP priority working on Port 2. - */ - if (port2_data & KSZ8_PORT_802_1P_ENABLE) - return ksz8_all_queues_split(dev, dev->info->num_tx_queues); - - /* We got request to disable PCP priority on Port 2. - * Now, we need to compare Port 2 configuration with Port 1 - * configuration. - */ - ret = ksz_pread8(dev, KSZ_PORT_1, reg, &port1_data); - if (ret) - return ret; - - /* If Port 1 has any apptrust enabled, we can't disable multiple queues - * on the switch, so we can't disable PCP on Port 2. - */ - if (port1_data & (KSZ8_PORT_802_1P_ENABLE | KSZ8_PORT_DIFFSERV_ENABLE)) { - dev_err(dev->dev, "Not possible to disable PCP on Port 2 if any apptrust is enabled on Port 1\n"); - return -EINVAL; - } - - /* Now we need to ensure that default priority on Port 1 is set to 0 - * otherwise we can't disable multiqueue support on the switch. - */ - ret = ksz_port_get_default_prio(ds, KSZ_PORT_1); - if (ret < 0) { - return ret; - } else if (ret) { - dev_err(dev->dev, "Not possible to disable PCP on Port 2 if non zero default priority is set on Port 1\n"); - return -EINVAL; - } - - /* Port 1 has no apptrust or default priority set and we got request to - * disable PCP on Port 2. We can disable multiqueue support to disable - * PCP on Port 2. - */ - return ksz8_all_queues_split(dev, 1); -} - -/** - * ksz88x3_port_apptrust_quirk - Quirk for apptrust configuration on KSZ88x3 - * devices - * @dev: Pointer to the KSZ switch device structure - * @port: Port number for which to set the apptrust selectors - * @reg: Register address for the apptrust configuration - * @data: Data to set for the apptrust configuration - * - * This function implements a quirk for apptrust configuration on KSZ88x3 - * devices. It ensures that apptrust configuration on Port 1 and - * Port 2 is done in agreement with each other. - * - * Return: 0 on success, or a negative error code on failure - */ -static int ksz88x3_port_apptrust_quirk(struct ksz_device *dev, int port, - int reg, u8 data) -{ - if (port == KSZ_PORT_1) - return ksz88x3_port1_apptrust_quirk(dev, port, reg, data); - else if (port == KSZ_PORT_2) - return ksz88x3_port2_apptrust_quirk(dev, port, reg, data); - - return 0; -} - /** * ksz_port_set_apptrust - Sets the apptrust selectors for a port on a KSZ * switch @@ -707,12 +514,6 @@ int ksz_port_set_apptrust(struct dsa_switch *ds, int port, } } - if (ksz_is_ksz88x3(dev)) { - ret = ksz88x3_port_apptrust_quirk(dev, port, reg, data); - if (ret) - return ret; - } - return ksz_prmw8(dev, port, reg, mask, data); } @@ -799,21 +600,5 @@ int ksz_dcb_init_port(struct ksz_device *dev, int port) */ int ksz_dcb_init(struct ksz_device *dev) { - int ret; - - ret = ksz_init_global_dscp_map(dev); - if (ret) - return ret; - - /* Enable 802.1p priority control on Port 2 during switch initialization. - * This setup is critical for the apptrust functionality on Port 1, which - * relies on the priority settings of Port 2. Note: Port 1 is naturally - * configured before Port 2, necessitating this configuration order. - */ - if (ksz_is_ksz88x3(dev)) - return ksz_prmw8(dev, KSZ_PORT_2, KSZ8_REG_PORT_1_CTRL_0, - KSZ8_PORT_802_1P_ENABLE, - KSZ8_PORT_802_1P_ENABLE); - - return 0; + return ksz_init_global_dscp_map(dev); } From fd87b7783802b45cdd261b273e6b2b792823064d Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Mon, 24 Mar 2025 07:42:27 +0000 Subject: [PATCH 37/41] net: Fix the devmem sock opts and msgs for parisc The devmem socket options and socket control message definitions introduced in the TCP devmem series[1] incorrectly continued the socket definitions for arch/parisc. The UAPI change seems safe as there are currently no drivers that declare support for devmem TCP RX via PP_FLAG_ALLOW_UNREADABLE_NETMEM. Hence, fixing this UAPI should be safe. Fix the devmem socket options and socket control message definitions to reflect the series followed by arch/parisc. [1] https://lore.kernel.org/lkml/20240910171458.219195-10-almasrymina@google.com/ Fixes: 8f0b3cc9a4c10 ("tcp: RX path for devmem TCP") Signed-off-by: Pranjal Shrivastava Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250324074228.3139088-1-praan@google.com Signed-off-by: Jakub Kicinski --- arch/parisc/include/uapi/asm/socket.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index aa9cd4b951fe5..96831c9886065 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -132,16 +132,16 @@ #define SO_PASSPIDFD 0x404A #define SO_PEERPIDFD 0x404B -#define SO_DEVMEM_LINEAR 78 -#define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR -#define SO_DEVMEM_DMABUF 79 -#define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF -#define SO_DEVMEM_DONTNEED 80 - #define SCM_TS_OPT_ID 0x404C #define SO_RCVPRIORITY 0x404D +#define SO_DEVMEM_LINEAR 0x404E +#define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR +#define SO_DEVMEM_DMABUF 0x404F +#define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF +#define SO_DEVMEM_DONTNEED 0x4050 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 From bf2986fcf82a449441f9ee4335df19be19e83970 Mon Sep 17 00:00:00 2001 From: Minjoong Kim Date: Sat, 22 Mar 2025 10:52:00 +0000 Subject: [PATCH 38/41] atm: Fix NULL pointer dereference When MPOA_cache_impos_rcvd() receives the msg, it can trigger Null Pointer Dereference Vulnerability if both entry and holding_time are NULL. Because there is only for the situation where entry is NULL and holding_time exists, it can be passed when both entry and holding_time are NULL. If these are NULL, the entry will be passd to eg_cache_put() as parameter and it is referenced by entry->use code in it. kasan log: [ 3.316691] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000006:I [ 3.317568] KASAN: null-ptr-deref in range [0x0000000000000030-0x0000000000000037] [ 3.318188] CPU: 3 UID: 0 PID: 79 Comm: ex Not tainted 6.14.0-rc2 #102 [ 3.318601] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 3.319298] RIP: 0010:eg_cache_remove_entry+0xa5/0x470 [ 3.319677] Code: c1 f7 6e fd 48 c7 c7 00 7e 38 b2 e8 95 64 54 fd 48 c7 c7 40 7e 38 b2 48 89 ee e80 [ 3.321220] RSP: 0018:ffff88800583f8a8 EFLAGS: 00010006 [ 3.321596] RAX: 0000000000000006 RBX: ffff888005989000 RCX: ffffffffaecc2d8e [ 3.322112] RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000030 [ 3.322643] RBP: 0000000000000000 R08: 0000000000000000 R09: fffffbfff6558b88 [ 3.323181] R10: 0000000000000003 R11: 203a207972746e65 R12: 1ffff11000b07f15 [ 3.323707] R13: dffffc0000000000 R14: ffff888005989000 R15: ffff888005989068 [ 3.324185] FS: 000000001b6313c0(0000) GS:ffff88806d380000(0000) knlGS:0000000000000000 [ 3.325042] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3.325545] CR2: 00000000004b4b40 CR3: 000000000248e000 CR4: 00000000000006f0 [ 3.326430] Call Trace: [ 3.326725] [ 3.326927] ? die_addr+0x3c/0xa0 [ 3.327330] ? exc_general_protection+0x161/0x2a0 [ 3.327662] ? asm_exc_general_protection+0x26/0x30 [ 3.328214] ? vprintk_emit+0x15e/0x420 [ 3.328543] ? eg_cache_remove_entry+0xa5/0x470 [ 3.328910] ? eg_cache_remove_entry+0x9a/0x470 [ 3.329294] ? __pfx_eg_cache_remove_entry+0x10/0x10 [ 3.329664] ? console_unlock+0x107/0x1d0 [ 3.329946] ? __pfx_console_unlock+0x10/0x10 [ 3.330283] ? do_syscall_64+0xa6/0x1a0 [ 3.330584] ? entry_SYSCALL_64_after_hwframe+0x47/0x7f [ 3.331090] ? __pfx_prb_read_valid+0x10/0x10 [ 3.331395] ? down_trylock+0x52/0x80 [ 3.331703] ? vprintk_emit+0x15e/0x420 [ 3.331986] ? __pfx_vprintk_emit+0x10/0x10 [ 3.332279] ? down_trylock+0x52/0x80 [ 3.332527] ? _printk+0xbf/0x100 [ 3.332762] ? __pfx__printk+0x10/0x10 [ 3.333007] ? _raw_write_lock_irq+0x81/0xe0 [ 3.333284] ? __pfx__raw_write_lock_irq+0x10/0x10 [ 3.333614] msg_from_mpoad+0x1185/0x2750 [ 3.333893] ? __build_skb_around+0x27b/0x3a0 [ 3.334183] ? __pfx_msg_from_mpoad+0x10/0x10 [ 3.334501] ? __alloc_skb+0x1c0/0x310 [ 3.334809] ? __pfx___alloc_skb+0x10/0x10 [ 3.335283] ? _raw_spin_lock+0xe0/0xe0 [ 3.335632] ? finish_wait+0x8d/0x1e0 [ 3.335975] vcc_sendmsg+0x684/0xba0 [ 3.336250] ? __pfx_vcc_sendmsg+0x10/0x10 [ 3.336587] ? __pfx_autoremove_wake_function+0x10/0x10 [ 3.337056] ? fdget+0x176/0x3e0 [ 3.337348] __sys_sendto+0x4a2/0x510 [ 3.337663] ? __pfx___sys_sendto+0x10/0x10 [ 3.337969] ? ioctl_has_perm.constprop.0.isra.0+0x284/0x400 [ 3.338364] ? sock_ioctl+0x1bb/0x5a0 [ 3.338653] ? __rseq_handle_notify_resume+0x825/0xd20 [ 3.339017] ? __pfx_sock_ioctl+0x10/0x10 [ 3.339316] ? __pfx___rseq_handle_notify_resume+0x10/0x10 [ 3.339727] ? selinux_file_ioctl+0xa4/0x260 [ 3.340166] __x64_sys_sendto+0xe0/0x1c0 [ 3.340526] ? syscall_exit_to_user_mode+0x123/0x140 [ 3.340898] do_syscall_64+0xa6/0x1a0 [ 3.341170] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 3.341533] RIP: 0033:0x44a380 [ 3.341757] Code: 0f 1f 84 00 00 00 00 00 66 90 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c00 [ 3.343078] RSP: 002b:00007ffc1d404098 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 3.343631] RAX: ffffffffffffffda RBX: 00007ffc1d404458 RCX: 000000000044a380 [ 3.344306] RDX: 000000000000019c RSI: 00007ffc1d4040b0 RDI: 0000000000000003 [ 3.344833] RBP: 00007ffc1d404260 R08: 0000000000000000 R09: 0000000000000000 [ 3.345381] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000001 [ 3.346015] R13: 00007ffc1d404448 R14: 00000000004c17d0 R15: 0000000000000001 [ 3.346503] [ 3.346679] Modules linked in: [ 3.346956] ---[ end trace 0000000000000000 ]--- [ 3.347315] RIP: 0010:eg_cache_remove_entry+0xa5/0x470 [ 3.347737] Code: c1 f7 6e fd 48 c7 c7 00 7e 38 b2 e8 95 64 54 fd 48 c7 c7 40 7e 38 b2 48 89 ee e80 [ 3.349157] RSP: 0018:ffff88800583f8a8 EFLAGS: 00010006 [ 3.349517] RAX: 0000000000000006 RBX: ffff888005989000 RCX: ffffffffaecc2d8e [ 3.350103] RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000030 [ 3.350610] RBP: 0000000000000000 R08: 0000000000000000 R09: fffffbfff6558b88 [ 3.351246] R10: 0000000000000003 R11: 203a207972746e65 R12: 1ffff11000b07f15 [ 3.351785] R13: dffffc0000000000 R14: ffff888005989000 R15: ffff888005989068 [ 3.352404] FS: 000000001b6313c0(0000) GS:ffff88806d380000(0000) knlGS:0000000000000000 [ 3.353099] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3.353544] CR2: 00000000004b4b40 CR3: 000000000248e000 CR4: 00000000000006f0 [ 3.354072] note: ex[79] exited with irqs disabled [ 3.354458] note: ex[79] exited with preempt_count 1 Signed-off-by: Minjoong Kim Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250322105200.14981-1-pwn9uin@gmail.com Signed-off-by: Jakub Kicinski --- net/atm/mpc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 324e3ab96bb39..12da0269275c5 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -1314,6 +1314,8 @@ static void MPOA_cache_impos_rcvd(struct k_message *msg, holding_time = msg->content.eg_info.holding_time; dprintk("(%s) entry = %p, holding_time = %u\n", mpc->dev->name, entry, holding_time); + if (entry == NULL && !holding_time) + return; if (entry == NULL && holding_time) { entry = mpc->eg_ops->add_entry(msg, mpc); mpc->eg_ops->put(entry); From a44940d094afa2e04b5b164b1e136fc18bcb4a2d Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Mon, 24 Mar 2025 18:32:34 +0800 Subject: [PATCH 39/41] net: libwx: fix Tx descriptor content for some tunnel packets The length of skb header was incorrectly calculated when transmit a tunnel packet with outer IPv6 extension header, or a IP over IP packet which has inner IPv6 header. Thus the correct Tx context descriptor cannot be composed, resulting in Tx ring hang. Fixes: 3403960cdf86 ("net: wangxun: libwx add tx offload functions") Signed-off-by: Jiawen Wu Link: https://patch.msgid.link/20250324103235.823096-1-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 61 +++++++++++++-------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 2b3d6586f44a5..9294a9d8c5541 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -1082,26 +1082,6 @@ static void wx_tx_ctxtdesc(struct wx_ring *tx_ring, u32 vlan_macip_lens, context_desc->mss_l4len_idx = cpu_to_le32(mss_l4len_idx); } -static void wx_get_ipv6_proto(struct sk_buff *skb, int offset, u8 *nexthdr) -{ - struct ipv6hdr *hdr = (struct ipv6hdr *)(skb->data + offset); - - *nexthdr = hdr->nexthdr; - offset += sizeof(struct ipv6hdr); - while (ipv6_ext_hdr(*nexthdr)) { - struct ipv6_opt_hdr _hdr, *hp; - - if (*nexthdr == NEXTHDR_NONE) - return; - hp = skb_header_pointer(skb, offset, sizeof(_hdr), &_hdr); - if (!hp) - return; - if (*nexthdr == NEXTHDR_FRAGMENT) - break; - *nexthdr = hp->nexthdr; - } -} - union network_header { struct iphdr *ipv4; struct ipv6hdr *ipv6; @@ -1112,6 +1092,8 @@ static u8 wx_encode_tx_desc_ptype(const struct wx_tx_buffer *first) { u8 tun_prot = 0, l4_prot = 0, ptype = 0; struct sk_buff *skb = first->skb; + unsigned char *exthdr, *l4_hdr; + __be16 frag_off; if (skb->encapsulation) { union network_header hdr; @@ -1122,14 +1104,18 @@ static u8 wx_encode_tx_desc_ptype(const struct wx_tx_buffer *first) ptype = WX_PTYPE_TUN_IPV4; break; case htons(ETH_P_IPV6): - wx_get_ipv6_proto(skb, skb_network_offset(skb), &tun_prot); + l4_hdr = skb_transport_header(skb); + exthdr = skb_network_header(skb) + sizeof(struct ipv6hdr); + tun_prot = ipv6_hdr(skb)->nexthdr; + if (l4_hdr != exthdr) + ipv6_skip_exthdr(skb, exthdr - skb->data, &tun_prot, &frag_off); ptype = WX_PTYPE_TUN_IPV6; break; default: return ptype; } - if (tun_prot == IPPROTO_IPIP) { + if (tun_prot == IPPROTO_IPIP || tun_prot == IPPROTO_IPV6) { hdr.raw = (void *)inner_ip_hdr(skb); ptype |= WX_PTYPE_PKT_IPIP; } else if (tun_prot == IPPROTO_UDP) { @@ -1166,7 +1152,11 @@ static u8 wx_encode_tx_desc_ptype(const struct wx_tx_buffer *first) l4_prot = hdr.ipv4->protocol; break; case 6: - wx_get_ipv6_proto(skb, skb_inner_network_offset(skb), &l4_prot); + l4_hdr = skb_inner_transport_header(skb); + exthdr = skb_inner_network_header(skb) + sizeof(struct ipv6hdr); + l4_prot = inner_ipv6_hdr(skb)->nexthdr; + if (l4_hdr != exthdr) + ipv6_skip_exthdr(skb, exthdr - skb->data, &l4_prot, &frag_off); ptype |= WX_PTYPE_PKT_IPV6; break; default: @@ -1179,7 +1169,11 @@ static u8 wx_encode_tx_desc_ptype(const struct wx_tx_buffer *first) ptype = WX_PTYPE_PKT_IP; break; case htons(ETH_P_IPV6): - wx_get_ipv6_proto(skb, skb_network_offset(skb), &l4_prot); + l4_hdr = skb_transport_header(skb); + exthdr = skb_network_header(skb) + sizeof(struct ipv6hdr); + l4_prot = ipv6_hdr(skb)->nexthdr; + if (l4_hdr != exthdr) + ipv6_skip_exthdr(skb, exthdr - skb->data, &l4_prot, &frag_off); ptype = WX_PTYPE_PKT_IP | WX_PTYPE_PKT_IPV6; break; default: @@ -1269,13 +1263,20 @@ static int wx_tso(struct wx_ring *tx_ring, struct wx_tx_buffer *first, /* vlan_macip_lens: HEADLEN, MACLEN, VLAN tag */ if (enc) { + unsigned char *exthdr, *l4_hdr; + __be16 frag_off; + switch (first->protocol) { case htons(ETH_P_IP): tun_prot = ip_hdr(skb)->protocol; first->tx_flags |= WX_TX_FLAGS_OUTER_IPV4; break; case htons(ETH_P_IPV6): + l4_hdr = skb_transport_header(skb); + exthdr = skb_network_header(skb) + sizeof(struct ipv6hdr); tun_prot = ipv6_hdr(skb)->nexthdr; + if (l4_hdr != exthdr) + ipv6_skip_exthdr(skb, exthdr - skb->data, &tun_prot, &frag_off); break; default: break; @@ -1298,6 +1299,7 @@ static int wx_tso(struct wx_ring *tx_ring, struct wx_tx_buffer *first, WX_TXD_TUNNEL_LEN_SHIFT); break; case IPPROTO_IPIP: + case IPPROTO_IPV6: tunhdr_eiplen_tunlen = (((char *)inner_ip_hdr(skb) - (char *)ip_hdr(skb)) >> 2) << WX_TXD_OUTER_IPLEN_SHIFT; @@ -1341,6 +1343,8 @@ static void wx_tx_csum(struct wx_ring *tx_ring, struct wx_tx_buffer *first, vlan_macip_lens = skb_network_offset(skb) << WX_TXD_MACLEN_SHIFT; } else { + unsigned char *exthdr, *l4_hdr; + __be16 frag_off; u8 l4_prot = 0; union { struct iphdr *ipv4; @@ -1362,7 +1366,12 @@ static void wx_tx_csum(struct wx_ring *tx_ring, struct wx_tx_buffer *first, tun_prot = ip_hdr(skb)->protocol; break; case htons(ETH_P_IPV6): + l4_hdr = skb_transport_header(skb); + exthdr = skb_network_header(skb) + sizeof(struct ipv6hdr); tun_prot = ipv6_hdr(skb)->nexthdr; + if (l4_hdr != exthdr) + ipv6_skip_exthdr(skb, exthdr - skb->data, + &tun_prot, &frag_off); break; default: return; @@ -1386,6 +1395,7 @@ static void wx_tx_csum(struct wx_ring *tx_ring, struct wx_tx_buffer *first, WX_TXD_TUNNEL_LEN_SHIFT); break; case IPPROTO_IPIP: + case IPPROTO_IPV6: tunhdr_eiplen_tunlen = (((char *)inner_ip_hdr(skb) - (char *)ip_hdr(skb)) >> 2) << WX_TXD_OUTER_IPLEN_SHIFT; @@ -1408,7 +1418,10 @@ static void wx_tx_csum(struct wx_ring *tx_ring, struct wx_tx_buffer *first, break; case 6: vlan_macip_lens |= (transport_hdr.raw - network_hdr.raw) >> 1; + exthdr = network_hdr.raw + sizeof(struct ipv6hdr); l4_prot = network_hdr.ipv6->nexthdr; + if (transport_hdr.raw != exthdr) + ipv6_skip_exthdr(skb, exthdr - skb->data, &l4_prot, &frag_off); break; default: break; From c7d82913d5f9e97860772ee4051eaa66b56a6273 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Mon, 24 Mar 2025 18:32:35 +0800 Subject: [PATCH 40/41] net: libwx: fix Tx L4 checksum The hardware only supports L4 checksum offload for TCP/UDP/SCTP protocol. There was a bug to set Tx checksum flag for the other protocol that results in Tx ring hang. Fix to compute software checksum for these packets. Fixes: 3403960cdf86 ("net: wangxun: libwx add tx offload functions") Signed-off-by: Jiawen Wu Link: https://patch.msgid.link/20250324103235.823096-2-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 9294a9d8c5541..497abf2723a5e 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -1337,6 +1337,7 @@ static void wx_tx_csum(struct wx_ring *tx_ring, struct wx_tx_buffer *first, u8 tun_prot = 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { +csum_failed: if (!(first->tx_flags & WX_TX_FLAGS_HW_VLAN) && !(first->tx_flags & WX_TX_FLAGS_CC)) return; @@ -1441,7 +1442,8 @@ static void wx_tx_csum(struct wx_ring *tx_ring, struct wx_tx_buffer *first, WX_TXD_L4LEN_SHIFT; break; default: - break; + skb_checksum_help(skb); + goto csum_failed; } /* update TX checksum flag */ From 70facbf978ac90c6da17a3de2a8dd111b06f1bac Mon Sep 17 00:00:00 2001 From: Daniel Hsu Date: Tue, 25 Mar 2025 16:10:08 +0800 Subject: [PATCH 41/41] mctp: Fix incorrect tx flow invalidation condition in mctp-i2c Previously, the condition for invalidating the tx flow in mctp_i2c_invalidate_tx_flow() checked if `rc` was nonzero. However, this could incorrectly trigger the invalidation even when `rc > 0` was returned as a success status. This patch updates the condition to explicitly check for `rc < 0`, ensuring that only error cases trigger the invalidation. Signed-off-by: Daniel Hsu Reviewed-by: Jeremy Kerr Signed-off-by: David S. Miller --- drivers/net/mctp/mctp-i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/mctp/mctp-i2c.c b/drivers/net/mctp/mctp-i2c.c index d74d47dd6e04d..f782d93f826ef 100644 --- a/drivers/net/mctp/mctp-i2c.c +++ b/drivers/net/mctp/mctp-i2c.c @@ -537,7 +537,7 @@ static void mctp_i2c_xmit(struct mctp_i2c_dev *midev, struct sk_buff *skb) rc = __i2c_transfer(midev->adapter, &msg, 1); /* on tx errors, the flow can no longer be considered valid */ - if (rc) + if (rc < 0) mctp_i2c_invalidate_tx_flow(midev, skb); break;