From 546d98393abcf2f841e61163d95ed21fde346cc1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 3 Feb 2025 14:59:23 +0200 Subject: [PATCH 01/63] bonding: delete always true device check XFRM API makes sure that xs->xso.dev is valid in all XFRM offload callbacks. There is no need to check it again. Signed-off-by: Leon Romanovsky Acked-by: Paolo Abeni Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/0b2f8f5f09701bb43bbd83b94bfe5cb506b57adc.1738587150.git.leon@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e45bba240cbc..f6d0628a36d9 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -432,9 +432,6 @@ static struct net_device *bond_ipsec_dev(struct xfrm_state *xs) struct bonding *bond; struct slave *slave; - if (!bond_dev) - return NULL; - bond = netdev_priv(bond_dev); if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) return NULL; From a064068bb6be51ed54f435fe7314c057f9eeb020 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 3 Feb 2025 15:11:52 +0000 Subject: [PATCH 02/63] neighbour: remove neigh_parms_destroy() neigh_parms_destroy() is a simple kfree(), no need for a forward declaration. neigh_parms_put() can instead call kfree() directly. 
Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250203151152.3163876-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 89656d180bc6..73260ca0fc22 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -832,12 +832,10 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, return -ENOENT; } -static void neigh_parms_destroy(struct neigh_parms *parms); - static inline void neigh_parms_put(struct neigh_parms *parms) { if (refcount_dec_and_test(&parms->refcnt)) - neigh_parms_destroy(parms); + kfree(parms); } /* @@ -1713,11 +1711,6 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) } EXPORT_SYMBOL(neigh_parms_release); -static void neigh_parms_destroy(struct neigh_parms *parms) -{ - kfree(parms); -} - static struct lock_class_key neigh_table_proxy_queue_class; static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; From ac335826115dbbe10e536f43cf6090957c21bdc8 Mon Sep 17 00:00:00 2001 From: Ninad Palsule Date: Mon, 3 Feb 2025 09:12:55 -0600 Subject: [PATCH 03/63] dt-bindings: net: faraday,ftgmac100: Add phys mode Aspeed device supports rgmii, rgmii-id, rgmii-rxid, rgmii-txid so document them. 
Acked-by: Rob Herring (Arm) Signed-off-by: Ninad Palsule Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250203151306.276358-2-ninad@linux.ibm.com Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/faraday,ftgmac100.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/net/faraday,ftgmac100.yaml b/Documentation/devicetree/bindings/net/faraday,ftgmac100.yaml index 9bcbacb6640d..55d6a8379025 100644 --- a/Documentation/devicetree/bindings/net/faraday,ftgmac100.yaml +++ b/Documentation/devicetree/bindings/net/faraday,ftgmac100.yaml @@ -44,6 +44,9 @@ properties: phy-mode: enum: - rgmii + - rgmii-id + - rgmii-rxid + - rgmii-txid - rmii phy-handle: true From 185b1d53ea544a348dca679daefa4abd54d1322b Mon Sep 17 00:00:00 2001 From: Andrew Kreimer Date: Mon, 3 Feb 2025 19:53:24 +0200 Subject: [PATCH 04/63] net: qed: fix typos There are some typos in comments/messages: - Valiate -> Validate - acceptible -> acceptable - acces -> access - relased -> released Fix them via codespell. 
Signed-off-by: Andrew Kreimer Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250203175419.4146-1-algonell@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_sriov.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c index fa167b1aa019..5222a035fd19 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -3033,7 +3033,7 @@ static void qed_iov_vf_mbx_vport_update(struct qed_hwfn *p_hwfn, u16 length; int rc; - /* Valiate PF can send such a request */ + /* Validate PF can send such a request */ if (!vf->vport_instance) { DP_VERBOSE(p_hwfn, QED_MSG_IOV, @@ -3312,7 +3312,7 @@ static void qed_iov_vf_mbx_ucast_filter(struct qed_hwfn *p_hwfn, goto out; } - /* Determine if the unicast filtering is acceptible by PF */ + /* Determine if the unicast filtering is acceptable by PF */ if ((p_bulletin->valid_bitmap & BIT(VLAN_ADDR_FORCED)) && (params.type == QED_FILTER_VLAN || params.type == QED_FILTER_MAC_VLAN)) { @@ -3729,7 +3729,7 @@ qed_iov_execute_vf_flr_cleanup(struct qed_hwfn *p_hwfn, rc = qed_iov_enable_vf_access(p_hwfn, p_ptt, p_vf); if (rc) { - DP_ERR(p_hwfn, "Failed to re-enable VF[%d] acces\n", + DP_ERR(p_hwfn, "Failed to re-enable VF[%d] access\n", vfid); return rc; } @@ -4480,7 +4480,7 @@ int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled) struct qed_ptt *ptt = qed_ptt_acquire(hwfn); /* Failure to acquire the ptt in 100g creates an odd error - * where the first engine has already relased IOV. + * where the first engine has already released IOV. */ if (!ptt) { DP_ERR(hwfn, "Failed to acquire ptt\n"); From 2cf424f5ac01682c93e3decfddee6282b7552f50 Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Mon, 3 Feb 2025 18:52:29 +0000 Subject: [PATCH 05/63] mlx4: Remove unused functions The last use of mlx4_find_cached_mac() was removed in 2014 by commit 2f5bb473681b ("mlx4: Add ref counting to port MAC table for RoCE") mlx4_zone_free_entries() was added in 2014 by commit 7a89399ffad7 ("net/mlx4: Add mlx4_bitmap zone allocator") but hasn't been used. (The _unique version is used) Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Simon Horman Reviewed-by: Tariq Toukan Reviewed-by: Kalesh AP Link: https://patch.msgid.link/20250203185229.204279-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/alloc.c | 22 ---------------------- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 6 ------ drivers/net/ethernet/mellanox/mlx4/port.c | 20 -------------------- include/linux/mlx4/device.h | 1 - 4 files changed, 49 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c index b330020dc0d6..598df63518c5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c @@ -526,28 +526,6 @@ u32 mlx4_zone_alloc_entries(struct mlx4_zone_allocator *zones, u32 uid, int coun return res; } -u32 mlx4_zone_free_entries(struct mlx4_zone_allocator *zones, u32 uid, u32 obj, u32 count) -{ - struct mlx4_zone_entry *zone; - int res = 0; - - spin_lock(&zones->lock); - - zone = __mlx4_find_zone_by_uid(zones, uid); - - if (NULL == zone) { - res = -1; - goto out; - } - - __mlx4_free_from_zone(zone, obj, count); - -out: - spin_unlock(&zones->lock); - - return res; -} - u32 mlx4_zone_free_entries_unique(struct mlx4_zone_allocator *zones, u32 obj, u32 count) { struct mlx4_zone_entry *zone; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index d7d856d1758a..b213094ea30f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -1478,12 
+1478,6 @@ void mlx4_zone_allocator_destroy(struct mlx4_zone_allocator *zone_alloc); u32 mlx4_zone_alloc_entries(struct mlx4_zone_allocator *zones, u32 uid, int count, int align, u32 skip_mask, u32 *puid); -/* Free objects, start from of the uid from zone_allocator - * . - */ -u32 mlx4_zone_free_entries(struct mlx4_zone_allocator *zones, - u32 uid, u32 obj, u32 count); - /* If was allocated with MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP, instead of * specifying the uid when freeing an object, zone allocator could figure it by * itself. Other parameters are similar to mlx4_zone_free. diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index 4e43f4a7d246..e3d0b13c1610 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -147,26 +147,6 @@ static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port, return err; } -int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx) -{ - struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; - struct mlx4_mac_table *table = &info->mac_table; - int i; - - for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { - if (!table->refs[i]) - continue; - - if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) { - *idx = i; - return 0; - } - } - - return -ENOENT; -} -EXPORT_SYMBOL_GPL(mlx4_find_cached_mac); - static bool mlx4_need_mf_bond(struct mlx4_dev *dev) { int i, num_eth_ports = 0; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 27f42f713c89..87edb7a8173b 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1415,7 +1415,6 @@ int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port, bool *vlan_offload_disabled); void mlx4_handle_eth_header_mcast_prio(struct mlx4_net_trans_rule_hw_ctrl *ctrl, struct _rule_hw *eth_header); -int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx); int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, 
int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); From 15c51f17bdc46418b2e1b2b7a21a9a3036da6bae Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 3 Feb 2025 18:59:58 +0000 Subject: [PATCH 06/63] net/mlx5: Remove unused mlx5dr_domain_sync mlx5dr_domain_sync() was added in 2019 by commit 70605ea545e8 ("net/mlx5: DR, Expose APIs for direct rule managing") but hasn't been used. Remove it. mlx5dr_domain_sync() was the only user of mlx5dr_send_ring_force_drain(). Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Tariq Toukan Reviewed-by: Kalesh AP Link: https://patch.msgid.link/20250203185958.204794-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- .../mlx5/core/steering/sws/dr_domain.c | 24 -------------- .../mellanox/mlx5/core/steering/sws/dr_send.c | 33 ------------------- .../mlx5/core/steering/sws/dr_types.h | 1 - .../mellanox/mlx5/core/steering/sws/mlx5dr.h | 2 -- 4 files changed, 60 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c index 60cb4527588a..65740bb68b09 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c @@ -516,30 +516,6 @@ mlx5dr_domain_create(struct mlx5_core_dev *mdev, enum mlx5dr_domain_type type) return NULL; } -/* Assure synchronization of the device steering tables with updates made by SW - * insertion. 
- */ -int mlx5dr_domain_sync(struct mlx5dr_domain *dmn, u32 flags) -{ - int ret = 0; - - if (flags & MLX5DR_DOMAIN_SYNC_FLAGS_SW) { - mlx5dr_domain_lock(dmn); - ret = mlx5dr_send_ring_force_drain(dmn); - mlx5dr_domain_unlock(dmn); - if (ret) { - mlx5dr_err(dmn, "Force drain failed flags: %d, ret: %d\n", - flags, ret); - return ret; - } - } - - if (flags & MLX5DR_DOMAIN_SYNC_FLAGS_HW) - ret = mlx5dr_cmd_sync_steering(dmn->mdev); - - return ret; -} - int mlx5dr_domain_destroy(struct mlx5dr_domain *dmn) { if (WARN_ON_ONCE(refcount_read(&dmn->refcount) > 1)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c index f57c84e5128b..4fd4e8483382 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c @@ -1331,36 +1331,3 @@ void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn, kfree(send_ring->sync_buff); kfree(send_ring); } - -int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn) -{ - struct mlx5dr_send_ring *send_ring = dmn->send_ring; - struct postsend_info send_info = {}; - u8 data[DR_STE_SIZE]; - int num_of_sends_req; - int ret; - int i; - - /* Sending this amount of requests makes sure we will get drain */ - num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2; - - /* Send fake requests forcing the last to be signaled */ - send_info.write.addr = (uintptr_t)data; - send_info.write.length = DR_STE_SIZE; - send_info.write.lkey = 0; - /* Using the sync_mr in order to write/read */ - send_info.remote_addr = (uintptr_t)send_ring->sync_mr->addr; - send_info.rkey = send_ring->sync_mr->mkey; - - for (i = 0; i < num_of_sends_req; i++) { - ret = dr_postsend_icm_data(dmn, &send_info); - if (ret) - return ret; - } - - spin_lock(&send_ring->lock); - ret = dr_handle_pending_wc(dmn, send_ring); - spin_unlock(&send_ring->lock); - - return ret; -} diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_types.h index 7618c6147f86..cc328292bf84 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_types.h @@ -1473,7 +1473,6 @@ struct mlx5dr_send_ring { int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn); void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn, struct mlx5dr_send_ring *send_ring); -int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn); int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, struct mlx5dr_ste *ste, u8 *data, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5dr.h index 0bb3724c10c2..fc8a2169d1a1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/mlx5dr.h @@ -45,8 +45,6 @@ mlx5dr_domain_create(struct mlx5_core_dev *mdev, enum mlx5dr_domain_type type); int mlx5dr_domain_destroy(struct mlx5dr_domain *domain); -int mlx5dr_domain_sync(struct mlx5dr_domain *domain, u32 flags); - void mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn, struct mlx5dr_domain *peer_dmn, u16 peer_vhca_id); From 626b36727609e453fb3c9fd172e44cb67f39279e Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 3 Feb 2025 19:01:41 +0000 Subject: [PATCH 07/63] mlxsw: spectrum_router: Remove unused functions mlxsw_sp_ipip_lb_ul_vr_id() has been unused since 2020's commit acde33bf7319 ("mlxsw: spectrum_router: Reduce mlxsw_sp_ipip_fib_entry_op_gre4()") mlxsw_sp_rif_exists() has been unused since 2023's commit 49c3a615d382 ("mlxsw: spectrum_router: Replay MACVLANs when RIF is made") mlxsw_sp_rif_vid() has been unused since 2023's commit a5b52692e693 ("mlxsw: spectrum_switchdev: Manage RIFs on PVID change") Remove them. Signed-off-by: Dr. 
David Alan Gilbert Reviewed-by: Ido Schimmel Reviewed-by: Petr Machata Link: https://patch.msgid.link/20250203190141.204951-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlxsw/spectrum.h | 3 -- .../ethernet/mellanox/mlxsw/spectrum_router.c | 48 ------------------- .../ethernet/mellanox/mlxsw/spectrum_router.h | 1 - 3 files changed, 52 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index b10f80fc651b..fa7082ee5183 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -754,9 +754,6 @@ void mlxsw_sp_port_vlan_router_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan); void mlxsw_sp_rif_destroy_by_dev(struct mlxsw_sp *mlxsw_sp, struct net_device *dev); -bool mlxsw_sp_rif_exists(struct mlxsw_sp *mlxsw_sp, - const struct net_device *dev); -u16 mlxsw_sp_rif_vid(struct mlxsw_sp *mlxsw_sp, const struct net_device *dev); u16 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp); int mlxsw_sp_router_nve_promote_decap(struct mlxsw_sp *mlxsw_sp, u32 ul_tb_id, enum mlxsw_sp_l3proto ul_proto, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 7d6d859cef3f..464821dd492d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -8184,41 +8184,6 @@ mlxsw_sp_rif_find_by_dev(const struct mlxsw_sp *mlxsw_sp, return NULL; } -bool mlxsw_sp_rif_exists(struct mlxsw_sp *mlxsw_sp, - const struct net_device *dev) -{ - struct mlxsw_sp_rif *rif; - - mutex_lock(&mlxsw_sp->router->lock); - rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); - mutex_unlock(&mlxsw_sp->router->lock); - - return rif; -} - -u16 mlxsw_sp_rif_vid(struct mlxsw_sp *mlxsw_sp, const struct net_device *dev) -{ - struct mlxsw_sp_rif *rif; - u16 vid = 0; - - mutex_lock(&mlxsw_sp->router->lock); - rif = 
mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); - if (!rif) - goto out; - - /* We only return the VID for VLAN RIFs. Otherwise we return an - * invalid value (0). - */ - if (rif->ops->type != MLXSW_SP_RIF_TYPE_VLAN) - goto out; - - vid = mlxsw_sp_fid_8021q_vid(rif->fid); - -out: - mutex_unlock(&mlxsw_sp->router->lock); - return vid; -} - static int mlxsw_sp_router_rif_disable(struct mlxsw_sp *mlxsw_sp, u16 rif) { char ritr_pl[MLXSW_REG_RITR_LEN]; @@ -8417,19 +8382,6 @@ u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *lb_rif) return lb_rif->common.rif_index; } -u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *lb_rif) -{ - struct net_device *dev = mlxsw_sp_rif_dev(&lb_rif->common); - u32 ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(dev); - struct mlxsw_sp_vr *ul_vr; - - ul_vr = mlxsw_sp_vr_get(lb_rif->common.mlxsw_sp, ul_tb_id, NULL); - if (WARN_ON(IS_ERR(ul_vr))) - return 0; - - return ul_vr->id; -} - u16 mlxsw_sp_ipip_lb_ul_rif_id(const struct mlxsw_sp_rif_ipip_lb *lb_rif) { return lb_rif->ul_rif_id; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h index 0432c7cc6b07..313efab5c324 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h @@ -90,7 +90,6 @@ struct mlxsw_sp_ipip_entry; struct mlxsw_sp_rif *mlxsw_sp_rif_by_index(const struct mlxsw_sp *mlxsw_sp, u16 rif_index); u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *rif); -u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *rif); u16 mlxsw_sp_ipip_lb_ul_rif_id(const struct mlxsw_sp_rif_ipip_lb *lb_rif); u32 mlxsw_sp_ipip_dev_ul_tb_id(const struct net_device *ol_dev); int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif); From b565a8c750ef9d9d9e10d0fee17f4bf297be0e5a Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Mon, 3 Feb 2025 18:33:43 +0000 Subject: [PATCH 08/63] cavium/liquidio: Remove unused lio_get_device_id lio_get_device_id() has been unused since 2018's commit 64fecd3ec512 ("liquidio: remove obsolete functions and data structures") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250203183343.193691-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/cavium/liquidio/octeon_device.c | 16 ---------------- .../net/ethernet/cavium/liquidio/octeon_device.h | 7 ------- 2 files changed, 23 deletions(-) diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.c b/drivers/net/ethernet/cavium/liquidio/octeon_device.c index 6b6cb73482d7..1753bb87dfbd 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_device.c +++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.c @@ -1433,22 +1433,6 @@ int octeon_wait_for_ddr_init(struct octeon_device *oct, u32 *timeout) } EXPORT_SYMBOL_GPL(octeon_wait_for_ddr_init); -/* Get the octeon id assigned to the octeon device passed as argument. - * This function is exported to other modules. - * @param dev - octeon device pointer passed as a void *. 
- * @return octeon device id - */ -int lio_get_device_id(void *dev) -{ - struct octeon_device *octeon_dev = (struct octeon_device *)dev; - u32 i; - - for (i = 0; i < MAX_OCTEON_DEVICES; i++) - if (octeon_device[i] == octeon_dev) - return octeon_dev->octeon_id; - return -1; -} - void lio_enable_irq(struct octeon_droq *droq, struct octeon_instr_queue *iq) { u64 instr_cnt; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h b/drivers/net/ethernet/cavium/liquidio/octeon_device.h index d26364c2ac81..19344b21f8fb 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h @@ -705,13 +705,6 @@ octeon_get_dispatch(struct octeon_device *octeon_dev, u16 opcode, */ struct octeon_device *lio_get_device(u32 octeon_id); -/** Get the octeon id assigned to the octeon device passed as argument. - * This function is exported to other modules. - * @param dev - octeon device pointer passed as a void *. - * @return octeon device id - */ -int lio_get_device_id(void *dev); - /** Read windowed register. * @param oct - pointer to the Octeon device. * @param addr - Address of the register to read. From 9dd05df8403bda5b68178b795c554b3940628bb6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 3 Feb 2025 13:58:16 -0800 Subject: [PATCH 09/63] net: warn if NAPI instance wasn't shut down Drivers should always disable a NAPI instance before removing it. If they don't the instance may be queued for polling. Since commit 86e25f40aa1e ("net: napi: Add napi_config") we also remove the NAPI from the busy polling hash table in napi_disable(), so not disabling would leave a stale entry there. Use of busy polling is relatively uncommon so bugs may be lurking in the drivers. Add an explicit warning. 
Reviewed-by: Joe Damato Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250203215816.1294081-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index c0021cbd28fc..2b141f20b13b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7071,6 +7071,9 @@ void __netif_napi_del_locked(struct napi_struct *napi) if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; + /* Make sure NAPI is disabled (or was never enabled). */ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + if (napi->config) { napi->index = -1; napi->config = NULL; From 33b565fa2bc0af2d5b23b0fd954460b0b25b9280 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 4 Feb 2025 12:40:49 +1030 Subject: [PATCH 10/63] net: atlantic: Avoid -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Remove unused flexible-array member `buf` and, with this, fix the following warnings: drivers/net/ethernet/aquantia/atlantic/aq_hw.h:197:36: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] drivers/net/ethernet/aquantia/atlantic/hw_atl/../aq_hw.h:197:36: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Suggested-by: Igor Russkikh Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Reviewed-by: Igor Russkikh Link: https://patch.msgid.link/Z6F3KZVfnAZ2FoJm@kspp Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h index f5901f8e3907..f6b990b7f5b4 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h @@ -226,7 +226,6 @@ struct __packed offload_info { struct offload_port_info ports; struct offload_ka_info kas; struct offload_rr_info rrs; - u8 buf[]; }; struct __packed hw_atl_utils_fw_rpc { From d5fdfe480c7926960cb926964546c8704fe09626 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 3 Feb 2025 11:04:15 -0800 Subject: [PATCH 11/63] netconsole: selftest: Add test for fragmented messages Add a new selftest to verify netconsole's handling of messages that exceed the packet size limit and require fragmentation. The test sends messages with varying sizes and userdata, validating that: 1. Large messages are correctly fragmented and reassembled 2. Userdata fields are properly preserved across fragments 3. Messages work correctly with and without kernel release version appending The test creates a networking environment using netdevsim, sends messages through /dev/kmsg, and verifies the received fragments maintain message integrity. 
Signed-off-by: Breno Leitao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250203-netcons_frag_msgs-v1-1-5bc6bedf2ac0@debian.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/Makefile | 1 + .../drivers/net/lib/sh/lib_netcons.sh | 7 + .../drivers/net/netcons_fragmented_msg.sh | 122 ++++++++++++++++++ 3 files changed, 130 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 137470bdee0c..c7f1c443f2af 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -7,6 +7,7 @@ TEST_INCLUDES := $(wildcard lib/py/*.py) \ TEST_PROGS := \ netcons_basic.sh \ + netcons_fragmented_msg.sh \ netcons_overflow.sh \ ping.py \ queues.py \ diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index 3acaba41ac7b..0c262b123fdd 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -110,6 +110,13 @@ function create_dynamic_target() { echo 1 > "${NETCONS_PATH}"/enabled } +# Do not append the release to the header of the message +function disable_release_append() { + echo 0 > "${NETCONS_PATH}"/enabled + echo 0 > "${NETCONS_PATH}"/release + echo 1 > "${NETCONS_PATH}"/enabled +} + function cleanup() { local NSIM_DEV_SYS_DEL="/sys/bus/netdevsim/del_device" diff --git a/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh b/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh new file mode 100755 index 000000000000..4a71e01a230c --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# Test netconsole's message fragmentation functionality. 
+# +# When a message exceeds the maximum packet size, netconsole splits it into +# multiple fragments for transmission. This test verifies: +# - Correct fragmentation of large messages +# - Proper reassembly of fragments at the receiver +# - Preservation of userdata across fragments +# - Behavior with and without kernel release version appending +# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# The content of kmsg will be saved to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# set userdata to a long value. In this case, it is "1-2-3-4...60-" +USERDATA_VALUE=$(printf -- '%.2s-' {1..60}) + +# Convert the header string into a regexp, so, we can remove +# the second header as well. +# A header looks like "13,468,514729715,-,ncfrag=0/1135;". If +# release is appended, you might find something like: +# "6.13.0-04048-g4f561a87745a,13,468,514729715,-,ncfrag=0/1135;" +function header_to_regex() { + # header is everything before ; + local HEADER="${1}" + REGEX=$(echo "${HEADER}" | cut -d'=' -f1) + echo "${REGEX}=[0-9]*\/[0-9]*;" +} + +# We have two headers in the message. Remove both to get the full message, +# and extract the full message. +function extract_msg() { + local MSGFILE="${1}" + # Extract the header, which is the very first thing that arrives in the + # first line. + HEADER=$(sed -n '1p' "${MSGFILE}" | cut -d';' -f1) + HEADER_REGEX=$(header_to_regex "${HEADER}") + + # Remove the two headers from the received message + # This will return the message without any header, similarly to what + # was sent. + sed "s/""${HEADER_REGEX}""//g" "${MSGFILE}" +} + +# Validate the message, which has two messages glued together. +# unwrap them to make sure all the characters were transmitted.
+# File will look like the following: +# 13,468,514729715,-,ncfrag=0/1135; +# key=-13,468,514729715,-,ncfrag=967/1135; +function validate_fragmented_result() { + # Discard the netconsole headers, and assemble the full message + RCVMSG=$(extract_msg "${1}") + + # check for the main message + if ! echo "${RCVMSG}" | grep -q "${MSG}"; then + echo "Message body doesn't match." >&2 + echo "msg received=" "${RCVMSG}" >&2 + exit "${ksft_fail}" + fi + + # check userdata + if ! echo "${RCVMSG}" | grep -q "${USERDATA_VALUE}"; then + echo "message userdata doesn't match" >&2 + echo "msg received=" "${RCVMSG}" >&2 + exit "${ksft_fail}" + fi + # test passed. hooray +} + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target +# Set userdata "key" with the "value" value +set_user_data + + +# TEST 1: Send message and userdata. They will fragment +# ======= +MSG=$(printf -- 'MSG%.3s=' {1..150}) + +# Listen for netconsole port inside the namespace and destination interface +listen_port_and_save_to "${OUTPUT_FILE}" & +# Wait for socat to start and listen to the port. +wait_local_port_listen "${NAMESPACE}" "${PORT}" udp +# Send the message +echo "${MSG}: ${TARGET}" > /dev/kmsg +# Wait until socat saves the file to disk +busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" +# Check if the message was not corrupted +validate_fragmented_result "${OUTPUT_FILE}" + +# TEST 2: Test with smaller message, and without release appended +# ======= +MSG=$(printf -- 'FOOBAR%.3s=' {1..100}) +# Let's disable release and test again. 
+disable_release_append + +listen_port_and_save_to "${OUTPUT_FILE}" & +wait_local_port_listen "${NAMESPACE}" "${PORT}" udp +echo "${MSG}: ${TARGET}" > /dev/kmsg +busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" +validate_fragmented_result "${OUTPUT_FILE}" +exit "${ksft_pass}" From 51773846fab24a353bed4ebb660997ced4bc32d7 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 3 Feb 2025 21:33:39 +0100 Subject: [PATCH 12/63] net: phy: realtek: make HWMON support a user-visible Kconfig symbol Make config symbol REALTEK_PHY_HWMON user-visible, so that users can remove support if not needed. Suggested-by: Geert Uytterhoeven Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/3466ee92-166a-4b0f-9ae7-42b9e046f333@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/realtek/Kconfig | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/realtek/Kconfig b/drivers/net/phy/realtek/Kconfig index 31935f147d87..b05c2a1e9024 100644 --- a/drivers/net/phy/realtek/Kconfig +++ b/drivers/net/phy/realtek/Kconfig @@ -4,8 +4,12 @@ config REALTEK_PHY help Currently supports RTL821x/RTL822x and fast ethernet PHYs +if REALTEK_PHY + config REALTEK_PHY_HWMON - def_bool REALTEK_PHY && HWMON - depends on !(REALTEK_PHY=y && HWMON=m) + bool "HWMON support for Realtek PHYs" + depends on HWMON && !(REALTEK_PHY=y && HWMON=m) help Optional hwmon support for the temperature sensor + +endif # REALTEK_PHY From 135c3c86a7cef4ba3d368da15b16c275b74582d3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 3 Feb 2025 21:35:24 +0100 Subject: [PATCH 13/63] r8169: make Kconfig option for LED support user-visible Make config option R8169_LEDS user-visible, so that users can remove support if not needed. 
Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/d29f0cdb-32bf-435f-b59d-dc96bca1e3ab@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/Kconfig b/drivers/net/ethernet/realtek/Kconfig index 8a8ea51c639e..fe136f61586f 100644 --- a/drivers/net/ethernet/realtek/Kconfig +++ b/drivers/net/ethernet/realtek/Kconfig @@ -114,7 +114,8 @@ config R8169 will be called r8169. This is recommended. config R8169_LEDS - def_bool R8169 && LEDS_TRIGGER_NETDEV + bool "Support for controlling the NIC LEDs" + depends on R8169 && LEDS_TRIGGER_NETDEV depends on !(R8169=y && LEDS_CLASS=m) help Optional support for controlling the NIC LED's with the netdev From 0bea93fdbaf8675b7e8124bdcaf51497dcc8bcfa Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 3 Feb 2025 21:41:36 +0100 Subject: [PATCH 14/63] net: phy: realtek: use string choices helpers Use string choices helpers to simplify the code. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501190707.qQS8PGHW-lkp@intel.com/ Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/phy/realtek/realtek_main.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/realtek/realtek_main.c b/drivers/net/phy/realtek/realtek_main.c index 572a933636b0..210fefac44d4 100644 --- a/drivers/net/phy/realtek/realtek_main.c +++ b/drivers/net/phy/realtek/realtek_main.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "realtek.h" @@ -422,11 +423,11 @@ static int rtl8211f_config_init(struct phy_device *phydev) } else if (ret) { dev_dbg(dev, "%s 2ns TX delay (and changing the value from pin-strapping RXD1 or the bootloader)\n", - val_txdly ? 
"Enabling" : "Disabling"); + str_enable_disable(val_txdly)); } else { dev_dbg(dev, "2ns TX delay was already %s (by pin-strapping RXD1 or bootloader configuration)\n", - val_txdly ? "enabled" : "disabled"); + str_enabled_disabled(val_txdly)); } ret = phy_modify_paged_changed(phydev, 0xd08, 0x15, RTL8211F_RX_DELAY, @@ -437,11 +438,11 @@ static int rtl8211f_config_init(struct phy_device *phydev) } else if (ret) { dev_dbg(dev, "%s 2ns RX delay (and changing the value from pin-strapping RXD0 or the bootloader)\n", - val_rxdly ? "Enabling" : "Disabling"); + str_enable_disable(val_rxdly)); } else { dev_dbg(dev, "2ns RX delay was already %s (by pin-strapping RXD0 or bootloader configuration)\n", - val_rxdly ? "enabled" : "disabled"); + str_enabled_disabled(val_rxdly)); } if (priv->has_phycr2) { From 79c61899b5eee317907efd1b0d06a1ada0cc00d8 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 4 Feb 2025 18:03:10 +0100 Subject: [PATCH 15/63] net-sysfs: remove rtnl_trylock from device attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is an ABBA deadlock between net device unregistration and sysfs files being accessed[1][2]. To prevent this from happening all paths taking the rtnl lock after the sysfs one (actually kn->active refcount) use rtnl_trylock and return early (using restart_syscall)[3], which can make syscalls to spin for a long time when there is contention on the rtnl lock[4]. There are not many possibilities to improve the above: - Rework the entire net/ locking logic. - Invert two locks in one of the paths — not possible. But here it's actually possible to drop one of the locks safely: the kernfs_node refcount. More details in the code itself, which comes with lots of comments. Note that we check the device is alive in the added sysfs_rtnl_lock helper to disallow sysfs operations to run after device dismantle has started. This also help keeping the same behavior as before. 
Because of this calls to dev_isalive in sysfs ops were removed. [1] https://lore.kernel.org/netdev/49A4D5D5.5090602@trash.net/ [2] https://lore.kernel.org/netdev/m14oyhis31.fsf@fess.ebiederm.org/ [3] https://lore.kernel.org/netdev/20090226084924.16cb3e08@nehalam/ [4] https://lore.kernel.org/all/20210928125500.167943-1-atenart@kernel.org/T/ Signed-off-by: Antoine Tenart Link: https://patch.msgid.link/20250204170314.146022-2-atenart@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 1 + net/core/net-sysfs.c | 186 +++++++++++++++++++++++++++----------- net/core/rtnetlink.c | 5 + 3 files changed, 139 insertions(+), 53 deletions(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 4bc2ee0b10b0..ccaaf4c7d5f6 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -43,6 +43,7 @@ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); +extern int rtnl_lock_interruptible(void); extern int rtnl_lock_killable(void); extern bool refcount_dec_and_rtnl_lock(refcount_t *r); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 07cb99b114bd..e012234c739a 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -42,6 +42,87 @@ static inline int dev_isalive(const struct net_device *dev) return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; } +/* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active, + * when unregistering a net device and accessing associated sysfs files. The + * potential deadlock is as follow: + * + * CPU 0 CPU 1 + * + * rtnl_lock vfs_read + * unregister_netdevice_many kernfs_seq_start + * device_del / kobject_put kernfs_get_active (kn->active++) + * kernfs_drain sysfs_kf_seq_show + * wait_event( rtnl_lock + * kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release + * -> waits on CPU 1 to decrease kn->active the rtnl lock. 
+ * + * The historical fix was to use rtnl_trylock with restart_syscall to bail out + * of sysfs operations when the lock couldn't be taken. This fixed the above + * issue as it allowed CPU 1 to bail out of the ABBA situation. + * + * But it came with performances issues, as syscalls are being restarted in + * loops when there was contention on the rtnl lock, with huge slow downs in + * specific scenarios (e.g. lots of virtual interfaces created and userspace + * daemons querying their attributes). + * + * The idea below is to bail out of the active kernfs_node protection + * (kn->active) while trying to take the rtnl lock. + * + * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The + * net device is guaranteed to be alive if this returns successfully. + */ +static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr, + struct net_device *ndev) +{ + struct kernfs_node *kn; + int ret = 0; + + /* First, we hold a reference to the net device as the unregistration + * path might run in parallel. This will ensure the net device and the + * associated sysfs objects won't be freed while we try to take the rtnl + * lock. + */ + dev_hold(ndev); + /* sysfs_break_active_protection was introduced to allow self-removal of + * devices and their associated sysfs files by bailing out of the + * sysfs/kernfs protection. We do this here to allow the unregistration + * path to complete in parallel. The following takes a reference on the + * kobject and the kernfs_node being accessed. + * + * This works because we hold a reference onto the net device and the + * unregistration path will wait for us eventually in netdev_run_todo + * (outside an rtnl lock section). + */ + kn = sysfs_break_active_protection(kobj, attr); + /* We can now try to take the rtnl lock. This can't deadlock us as the + * unregistration path is able to drain sysfs files (kernfs_node) thanks + * to the above dance. 
+ */ + if (rtnl_lock_interruptible()) { + ret = -ERESTARTSYS; + goto unbreak; + } + /* Check dismantle on the device hasn't started, otherwise deny the + * operation. + */ + if (!dev_isalive(ndev)) { + rtnl_unlock(); + ret = -ENODEV; + goto unbreak; + } + /* We are now sure the device dismantle hasn't started nor that it can + * start before we exit the locking section as we hold the rtnl lock. + * There's no need to keep unbreaking the sysfs protection nor to hold + * a net device reference from that point; that was only needed to take + * the rtnl lock. + */ +unbreak: + sysfs_unbreak_active_protection(kn); + dev_put(ndev); + + return ret; +} + /* use same locking rules as GIF* ioctl's */ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, @@ -95,14 +176,14 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, if (ret) goto err; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + goto err; + + ret = (*set)(netdev, new); + if (ret == 0) + ret = len; - if (dev_isalive(netdev)) { - ret = (*set)(netdev, new); - if (ret == 0) - ret = len; - } rtnl_unlock(); err: return ret; @@ -220,7 +301,7 @@ static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); /* The check is also done in change_carrier; this helps returning early - * without hitting the trylock/restart in netdev_store. + * without hitting the locking section in netdev_store. 
*/ if (!netdev->netdev_ops->ndo_change_carrier) return -EOPNOTSUPP; @@ -234,8 +315,9 @@ static ssize_t carrier_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; if (netif_running(netdev)) { /* Synchronize carrier state with link watch, @@ -245,8 +327,8 @@ static ssize_t carrier_show(struct device *dev, ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); } - rtnl_unlock(); + rtnl_unlock(); return ret; } static DEVICE_ATTR_RW(carrier); @@ -258,13 +340,14 @@ static ssize_t speed_show(struct device *dev, int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. */ if (!netdev->ethtool_ops->get_link_ksettings) return ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; @@ -284,13 +367,14 @@ static ssize_t duplex_show(struct device *dev, int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. 
*/ if (!netdev->ethtool_ops->get_link_ksettings) return ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; @@ -490,16 +574,15 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - ret = dev_set_alias(netdev, buf, count); - if (ret < 0) - goto err; - ret = len; - netdev_state_change(netdev); - } + ret = dev_set_alias(netdev, buf, count); + if (ret < 0) + goto err; + ret = len; + netdev_state_change(netdev); err: rtnl_unlock(); @@ -551,24 +634,23 @@ static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); + struct netdev_phys_item_id ppid; ssize_t ret = -EINVAL; /* The check is also done in dev_get_phys_port_id; this helps returning - * early without hitting the trylock/restart below. + * early without hitting the locking section below. 
*/ if (!netdev->netdev_ops->ndo_get_phys_port_id) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid; + ret = dev_get_phys_port_id(netdev, &ppid); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - ret = dev_get_phys_port_id(netdev, &ppid); - if (!ret) - ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - } rtnl_unlock(); return ret; @@ -580,24 +662,23 @@ static ssize_t phys_port_name_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; + char name[IFNAMSIZ]; /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. */ if (!netdev->netdev_ops->ndo_get_phys_port_name && !netdev->devlink_port) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - char name[IFNAMSIZ]; + ret = dev_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sysfs_emit(buf, "%s\n", name); - ret = dev_get_phys_port_name(netdev, name, sizeof(name)); - if (!ret) - ret = sysfs_emit(buf, "%s\n", name); - } rtnl_unlock(); return ret; @@ -608,26 +689,25 @@ static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); + struct netdev_phys_item_id ppid = { }; ssize_t ret = -EINVAL; /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the trylock/restart below. This works + * returning early without hitting the locking section below. This works * because recurse is false when calling dev_get_port_parent_id. 
*/ if (!netdev->netdev_ops->ndo_get_port_parent_id && !netdev->devlink_port) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid = { }; + ret = dev_get_port_parent_id(netdev, &ppid, false); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - ret = dev_get_port_parent_id(netdev, &ppid, false); - if (!ret) - ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - } rtnl_unlock(); return ret; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1f4d4b5570ab..cb7fad8d1f95 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -80,6 +80,11 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +int rtnl_lock_interruptible(void) +{ + return mutex_lock_interruptible(&rtnl_mutex); +} + int rtnl_lock_killable(void) { return mutex_lock_killable(&rtnl_mutex); From b7ecc1de51ca7d0a9fa8dbc3f756ab87b99a1838 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 4 Feb 2025 18:03:11 +0100 Subject: [PATCH 16/63] net-sysfs: move queue attribute groups outside the default groups Rx/tx queues embed their own kobject for registering their per-queue sysfs files. The issue is they're using the kobject default groups for this and entirely rely on the kobject refcounting for releasing their sysfs paths. In order to remove rtnl_trylock calls we need sysfs files not to rely on their associated kobject refcounting for their release. Thus we here move queues sysfs files from the kobject default groups to their own groups which can be removed separately. 
Signed-off-by: Antoine Tenart Link: https://patch.msgid.link/20250204170314.146022-3-atenart@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + include/net/netdev_rx_queue.h | 1 + net/core/net-sysfs.c | 27 +++++++++++++++++++++------ 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2a59034a5fa2..1dcc76af7520 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -658,6 +658,7 @@ struct netdev_queue { struct Qdisc __rcu *qdisc_sleeping; #ifdef CONFIG_SYSFS struct kobject kobj; + const struct attribute_group **groups; #endif unsigned long tx_maxrate; /* diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index 596836abf7bf..af40842f229d 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -16,6 +16,7 @@ struct netdev_rx_queue { struct rps_dev_flow_table __rcu *rps_flow_table; #endif struct kobject kobj; + const struct attribute_group **groups; struct net_device *dev; netdevice_tracker dev_tracker; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index e012234c739a..0b7ee260613d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1188,7 +1188,6 @@ static void rx_queue_get_ownership(const struct kobject *kobj, static const struct kobj_type rx_queue_ktype = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, - .default_groups = rx_queue_default_groups, .namespace = rx_queue_namespace, .get_ownership = rx_queue_get_ownership, }; @@ -1222,20 +1221,27 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; + queue->groups = rx_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) - goto err; + goto err_default_groups; } error = rx_queue_default_mask(dev, queue); if (error) - goto err; + 
goto err_default_groups; kobject_uevent(kobj, KOBJ_ADD); return error; +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; @@ -1280,12 +1286,14 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) } while (--i >= new_num) { - struct kobject *kobj = &dev->_rx[i].kobj; + struct netdev_rx_queue *queue = &dev->_rx[i]; + struct kobject *kobj = &queue->kobj; if (!refcount_read(&dev_net(dev)->ns.count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); + sysfs_remove_groups(kobj, queue->groups); kobject_put(kobj); } @@ -1872,7 +1880,6 @@ static void netdev_queue_get_ownership(const struct kobject *kobj, static const struct kobj_type netdev_queue_ktype = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, - .default_groups = netdev_queue_default_groups, .namespace = netdev_queue_namespace, .get_ownership = netdev_queue_get_ownership, }; @@ -1902,15 +1909,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; + queue->groups = netdev_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + if (netdev_uses_bql(dev)) { error = sysfs_create_group(kobj, &dql_group); if (error) - goto err; + goto err_default_groups; } kobject_uevent(kobj, KOBJ_ADD); return 0; +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; @@ -1965,6 +1979,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) if (netdev_uses_bql(dev)) sysfs_remove_group(&queue->kobj, &dql_group); + sysfs_remove_groups(&queue->kobj, queue->groups); kobject_put(&queue->kobj); } From 7e54f85c60828842be27e0149f3533357225090e Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 4 Feb 2025 18:03:12 +0100 Subject: [PATCH 17/63] net-sysfs: prevent uncleared queues from being re-added With the (upcoming) 
removal of the rtnl_trylock/restart_syscall logic and because of how Tx/Rx queues are implemented (and their requirements), it might happen that a queue is re-added before having the chance to be cleared. In such a rare case, do not complete the queue addition operation. Signed-off-by: Antoine Tenart Link: https://patch.msgid.link/20250204170314.146022-4-atenart@kernel.org Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 0b7ee260613d..027af27517fa 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1210,6 +1210,22 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) struct kobject *kobj = &queue->kobj; int error = 0; + /* Rx queues are cleared in rx_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped. + * + * If a queue is removed while both a read (or write) operation and a + * the re-addition of the same queue are pending (waiting on rntl_lock) + * it might happen that the re-addition will execute before the read, + * making the initial removal to never happen (queue's kobj refcount + * won't drop enough because of the pending read). In such rare case, + * return to allow the removal operation to complete. + */ + if (unlikely(kobj->state_initialized)) { + netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed"); + return -EAGAIN; + } + /* Kobject_put later will trigger rx_queue_release call which * decreases dev refcount: Take that reference here */ @@ -1898,6 +1914,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) struct kobject *kobj = &queue->kobj; int error = 0; + /* Tx queues are cleared in netdev_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped.
+ * + * If a queue is removed while both a read (or write) operation and a + * the re-addition of the same queue are pending (waiting on rntl_lock) + * it might happen that the re-addition will execute before the read, + * making the initial removal to never happen (queue's kobj refcount + * won't drop enough because of the pending read). In such rare case, + * return to allow the removal operation to complete. + */ + if (unlikely(kobj->state_initialized)) { + netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed"); + return -EAGAIN; + } + /* Kobject_put later will trigger netdev_queue_release call * which decreases dev refcount: Take that reference here */ From b0b6fcfa6ad8433e22b050c72cfbeec2548744b9 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 4 Feb 2025 18:03:13 +0100 Subject: [PATCH 18/63] net-sysfs: remove rtnl_trylock from queue attributes Similar to the commit removing rtnl_trylock from device attributes, we here apply the same technique to networking queues.
Signed-off-by: Antoine Tenart Link: https://patch.msgid.link/20250204170314.146022-5-atenart@kernel.org Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 147 ++++++++++++++++++++++++++----------------- 1 file changed, 89 insertions(+), 58 deletions(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 027af27517fa..3fe2c521e574 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1348,9 +1348,11 @@ static int net_rx_queue_change_owner(struct net_device *dev, int num, */ struct netdev_queue_attribute { struct attribute attr; - ssize_t (*show)(struct netdev_queue *queue, char *buf); - ssize_t (*store)(struct netdev_queue *queue, - const char *buf, size_t len); + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len); }; #define to_netdev_queue_attr(_attr) \ container_of(_attr, struct netdev_queue_attribute, attr) @@ -1367,7 +1369,7 @@ static ssize_t netdev_queue_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - return attribute->show(queue, buf); + return attribute->show(kobj, attr, queue, buf); } static ssize_t netdev_queue_attr_store(struct kobject *kobj, @@ -1381,7 +1383,7 @@ static ssize_t netdev_queue_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - return attribute->store(queue, buf, count); + return attribute->store(kobj, attr, queue, buf, count); } static const struct sysfs_ops netdev_queue_sysfs_ops = { @@ -1389,7 +1391,8 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { .store = netdev_queue_attr_store, }; -static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) +static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); @@ -1407,18 +1410,18 @@ static unsigned 
int get_netdev_queue_index(struct netdev_queue *queue) return i; } -static ssize_t traffic_class_show(struct netdev_queue *queue, - char *buf) +static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; - int num_tc, tc; - int index; + int num_tc, tc, index, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; index = get_netdev_queue_index(queue); @@ -1445,24 +1448,25 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, } #ifdef CONFIG_XPS -static ssize_t tx_maxrate_show(struct netdev_queue *queue, - char *buf) +static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%lu\n", queue->tx_maxrate); } -static ssize_t tx_maxrate_store(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { - struct net_device *dev = queue->dev; int err, index = get_netdev_queue_index(queue); + struct net_device *dev = queue->dev; u32 rate = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; /* The check is also done later; this helps returning early without - * hitting the trylock/restart below. + * hitting the locking section below. 
*/ if (!dev->netdev_ops->ndo_set_tx_maxrate) return -EOPNOTSUPP; @@ -1471,18 +1475,21 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue, if (err < 0) return err; - if (!rtnl_trylock()) - return restart_syscall(); + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) + return err; err = -EOPNOTSUPP; if (dev->netdev_ops->ndo_set_tx_maxrate) err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); - rtnl_unlock(); if (!err) { queue->tx_maxrate = rate; + rtnl_unlock(); return len; } + + rtnl_unlock(); return err; } @@ -1526,16 +1533,17 @@ static ssize_t bql_set(const char *buf, const size_t count, return count; } -static ssize_t bql_show_hold_time(struct netdev_queue *queue, - char *buf) +static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); } -static ssize_t bql_set_hold_time(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct dql *dql = &queue->dql; unsigned int value; @@ -1554,15 +1562,17 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init = __ATTR(hold_time, 0644, bql_show_hold_time, bql_set_hold_time); -static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); } -static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct dql *dql = &queue->dql; unsigned int value; @@ -1588,13 
+1598,15 @@ static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init = __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs); -static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); } -static ssize_t bql_set_stall_max(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { WRITE_ONCE(queue->dql.stall_max, 0); return len; @@ -1603,7 +1615,8 @@ static ssize_t bql_set_stall_max(struct netdev_queue *queue, static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init = __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max); -static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; @@ -1613,8 +1626,8 @@ static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf) static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL); -static ssize_t bql_show_inflight(struct netdev_queue *queue, - char *buf) +static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; @@ -1625,13 +1638,16 @@ static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = __ATTR(inflight, 0444, bql_show_inflight, NULL); #define BQL_ATTR(NAME, FIELD) \ -static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ - char *buf) \ +static ssize_t bql_show_ ## NAME(struct kobject *kobj, \ 
+ struct attribute *attr, \ + struct netdev_queue *queue, char *buf) \ { \ return bql_show(buf, queue->dql.FIELD); \ } \ \ -static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ +static ssize_t bql_set_ ## NAME(struct kobject *kobj, \ + struct attribute *attr, \ + struct netdev_queue *queue, \ const char *buf, size_t len) \ { \ return bql_set(buf, len, &queue->dql.FIELD); \ @@ -1717,19 +1733,21 @@ static ssize_t xps_queue_show(struct net_device *dev, unsigned int index, return len < PAGE_SIZE ? len : -EINVAL; } -static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) +static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; - int len, tc; + int len, tc, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; index = get_netdev_queue_index(queue); - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; @@ -1740,18 +1758,21 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) return -EINVAL; } - /* Make sure the subordinate device can't be freed */ - get_device(&dev->dev); + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. 
+ */ + dev_hold(dev); rtnl_unlock(); len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); - put_device(&dev->dev); + dev_put(dev); return len; } -static ssize_t xps_cpus_store(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct net_device *dev = queue->dev; unsigned int index; @@ -1775,9 +1796,10 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, return err; } - if (!rtnl_trylock()) { + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { free_cpumask_var(mask); - return restart_syscall(); + return err; } err = netif_set_xps_queue(dev, mask, index); @@ -1791,26 +1813,34 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); -static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) +static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; - int tc; + int tc, ret; index = get_netdev_queue_index(queue); - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, dev); + if (ret) + return ret; tc = netdev_txq_to_tc(dev, index); + + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. + */ + dev_hold(dev); rtnl_unlock(); - if (tc < 0) - return -EINVAL; - return xps_queue_show(dev, index, tc, buf, XPS_RXQS); + ret = tc >= 0 ? 
xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL; + dev_put(dev); + return ret; } -static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, +static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; @@ -1834,9 +1864,10 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } - if (!rtnl_trylock()) { + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { bitmap_free(mask); - return restart_syscall(); + return err; } cpus_read_lock(); From cbecd06a224962941f116e53f5673476ef6cd3f3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 3 Feb 2025 13:48:50 -0800 Subject: [PATCH 19/63] selftests: net: suppress ReST file generation when building selftests Some selftests need libynl.a. When building it try to skip generating the ReST documentation, libynl.a does not depend on them. Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250203214850.1282291-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/ynl.mk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/ynl.mk b/tools/testing/selftests/net/ynl.mk index 12e7cae251be..e907c2751956 100644 --- a/tools/testing/selftests/net/ynl.mk +++ b/tools/testing/selftests/net/ynl.mk @@ -27,7 +27,8 @@ $(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig: $(OUTPUT)/libynl.a: $(YNL_SPECS) $(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig $(Q)rm -f $(top_srcdir)/tools/net/ynl/libynl.a - $(Q)$(MAKE) -C $(top_srcdir)/tools/net/ynl GENS="$(YNL_GENS)" libynl.a + $(Q)$(MAKE) -C $(top_srcdir)/tools/net/ynl \ + GENS="$(YNL_GENS)" RSTS="" libynl.a $(Q)cp $(top_srcdir)/tools/net/ynl/libynl.a $(OUTPUT)/libynl.a EXTRA_CLEAN += \ From d9e9f6d7b7d0c520bb87f19d2cbc57aeeb2091d5 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 4 Feb 2025 18:37:15 +0100 Subject: [PATCH 20/63] bridge: mdb: Allow replace of a 
host-joined group Attempts to replace an MDB group membership of the host itself are currently bounced: # ip link add name br up type bridge vlan_filtering 1 # bridge mdb replace dev br port br grp 239.0.0.1 vid 2 # bridge mdb replace dev br port br grp 239.0.0.1 vid 2 Error: bridge: Group is already joined by host. A similar operation done on a member port would succeed. Ignore the check for replacement of host group memberships as well. The bit of code that this enables is br_multicast_host_join(), which, for already-joined groups only refreshes the MC group expiration timer, which is desirable; and a userspace notification, also desirable. Change a selftest that exercises this code path from expecting a rejection to expecting a pass. The rest of MDB selftests pass without modification. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/e5c5188b9787ae806609e7ca3aa2a0a501b9b5c4.1738685648.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 2 +- tools/testing/selftests/net/forwarding/bridge_mdb.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 1a52a0bca086..7e1ad229e133 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -1040,7 +1040,7 @@ static int br_mdb_add_group(const struct br_mdb_config *cfg, /* host join */ if (!port) { - if (mp->host_joined) { + if (mp->host_joined && !(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host"); return -EEXIST; } diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb.sh b/tools/testing/selftests/net/forwarding/bridge_mdb.sh index d9d587454d20..8c1597ebc2d3 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mdb.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mdb.sh @@ -149,7 +149,7 @@ cfg_test_host_common() check_err $? 
"Failed to add $name host entry" bridge mdb replace dev br0 port br0 grp $grp $state vid 10 &> /dev/null - check_fail $? "Managed to replace $name host entry" + check_err $? "Failed to replace $name host entry" bridge mdb del dev br0 port br0 grp $grp $state vid 10 bridge mdb get dev br0 grp $grp vid 10 &> /dev/null From 863257c29fe9c882b21fe5d1596081ef55c4875a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 4 Feb 2025 13:24:31 +1030 Subject: [PATCH 21/63] cxgb4: Avoid a -Wflex-array-member-not-at-end warning -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Move the conflicting declaration to the end of the structure. Notice that `struct ethtool_dump` is a flexible structure --a structure that contains a flexible-array member. Fix the following warning: ./drivers/net/ethernet/chelsio/cxgb4/cxgb4.h:1215:29: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Reviewed-by: Simon Horman Signed-off-by: Gustavo A. R. Silva Link: https://patch.msgid.link/Z6GBZ4brXYffLkt_@kspp Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index c7c2c15a1815..95e6f015a6af 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -1211,9 +1211,6 @@ struct adapter { struct timer_list flower_stats_timer; struct work_struct flower_stats_work; - /* Ethtool Dump */ - struct ethtool_dump eth_dump; - /* HMA */ struct hma_data hma; @@ -1233,6 +1230,10 @@ struct adapter { /* Ethtool n-tuple */ struct cxgb4_ethtool_filter *ethtool_filters; + + /* Ethtool Dump */ + /* Must be last - ends in a flex-array member. 
*/ + struct ethtool_dump eth_dump; }; /* Support for "sched-class" command to allow a TX Scheduling Class to be From 6cccb3bb0561812539d7f0ab35382e8d8998076a Mon Sep 17 00:00:00 2001 From: Aswin Karuvally Date: Tue, 4 Feb 2025 11:31:35 +0100 Subject: [PATCH 22/63] s390/net: Remove LCS driver The original Open Systems Adapter (OSA) was introduced by IBM in the mid-90s. These were then superseded by OSA-Express in 1999 which used Queued Direct IO to greatly improve throughput. The newer cards retained the older, slower non-QDIO (OSE) modes for compatibility with older systems. In Linux, the lcs driver was responsible for cards operating in the older OSE mode and the qeth driver was introduced to allow the OSA-Express cards to operate in the newer QDIO (OSD) mode. For an S390 machine from 1998 or later, there is no reason to use the OSE mode and lcs driver as all OSA cards since 1999 provide the faster OSD mode. As a result, it's been years since we have heard of a customer configuration involving the lcs driver. This patch removes the lcs driver. The technology it supports has been obsolete for the past 25+ years and is irrelevant for current use cases.
Reviewed-by: Alexandra Winter Acked-by: Heiko Carstens Acked-by: Peter Oberparleiter Signed-off-by: Aswin Karuvally Signed-off-by: Alexandra Winter Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250204103135.1619097-1-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski --- Documentation/arch/s390/driver-model.rst | 2 +- arch/s390/include/asm/irq.h | 1 - arch/s390/kernel/irq.c | 1 - drivers/s390/net/Kconfig | 11 +- drivers/s390/net/Makefile | 1 - drivers/s390/net/lcs.c | 2385 ---------------------- drivers/s390/net/lcs.h | 342 ---- 7 files changed, 2 insertions(+), 2741 deletions(-) delete mode 100644 drivers/s390/net/lcs.c delete mode 100644 drivers/s390/net/lcs.h diff --git a/Documentation/arch/s390/driver-model.rst b/Documentation/arch/s390/driver-model.rst index ad4bc2dbea43..ad18f129fb0b 100644 --- a/Documentation/arch/s390/driver-model.rst +++ b/Documentation/arch/s390/driver-model.rst @@ -244,7 +244,7 @@ information about the interrupt from the irb parameter. -------------------- The ccwgroup mechanism is designed to handle devices consisting of multiple ccw -devices, like lcs or ctc. +devices, like qeth or ctc. The ccw driver provides a 'group' attribute. 
Piping bus ids of ccw devices to this attributes creates a ccwgroup device consisting of these ccw devices (if diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index d9e705f4a697..bde6a496df5f 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -54,7 +54,6 @@ enum interruption_class { IRQIO_C70, IRQIO_TAP, IRQIO_VMR, - IRQIO_LCS, IRQIO_CTC, IRQIO_ADM, IRQIO_CSC, diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index ef7be599e1f7..7ca157ffab30 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -84,7 +84,6 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQIO_C70, .name = "C70", .desc = "[I/O] 3270"}, {.irq = IRQIO_TAP, .name = "TAP", .desc = "[I/O] Tape"}, {.irq = IRQIO_VMR, .name = "VMR", .desc = "[I/O] Unit Record Devices"}, - {.irq = IRQIO_LCS, .name = "LCS", .desc = "[I/O] LCS"}, {.irq = IRQIO_CTC, .name = "CTC", .desc = "[I/O] CTC"}, {.irq = IRQIO_ADM, .name = "ADM", .desc = "[I/O] EADM Subchannel"}, {.irq = IRQIO_CSC, .name = "CSC", .desc = "[I/O] CHSC Subchannel"}, diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig index c61e6427384c..9eb9e3c49f81 100644 --- a/drivers/s390/net/Kconfig +++ b/drivers/s390/net/Kconfig @@ -2,15 +2,6 @@ menu "S/390 network device drivers" depends on NETDEVICES && S390 -config LCS - def_tristate m - prompt "Lan Channel Station Interface" - depends on CCW && NETDEVICES && ETHERNET - help - Select this option if you want to use LCS networking on IBM System z. - To compile as a module, choose M. The module name is lcs. - If you do not use LCS, choose N. 
- config CTCM def_tristate m prompt "CTC and MPC SNA device support" @@ -98,7 +89,7 @@ config QETH_OSX config CCWGROUP tristate - default (LCS || CTCM || QETH || SMC) + default (CTCM || QETH || SMC) config ISM tristate "Support for ISM vPCI Adapter" diff --git a/drivers/s390/net/Makefile b/drivers/s390/net/Makefile index bc55ec316adb..b5aaba290127 100644 --- a/drivers/s390/net/Makefile +++ b/drivers/s390/net/Makefile @@ -8,7 +8,6 @@ obj-$(CONFIG_CTCM) += ctcm.o fsm.o obj-$(CONFIG_NETIUCV) += netiucv.o fsm.o obj-$(CONFIG_SMSGIUCV) += smsgiucv.o obj-$(CONFIG_SMSGIUCV_EVENT) += smsgiucv_app.o -obj-$(CONFIG_LCS) += lcs.o qeth-y += qeth_core_sys.o qeth_core_main.o qeth_core_mpc.o qeth_ethtool.o obj-$(CONFIG_QETH) += qeth.o qeth_l2-y += qeth_l2_main.o qeth_l2_sys.o diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c deleted file mode 100644 index 88db8378325a..000000000000 --- a/drivers/s390/net/lcs.c +++ /dev/null @@ -1,2385 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Linux for S/390 LAN channel station device driver - * - * Copyright IBM Corp. 1999, 2009 - * Author(s): Original Code written by - * DJ Barrow - * Rewritten by - * Frank Pavlic and - * Martin Schwidefsky - */ - -#define KMSG_COMPONENT "lcs" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "lcs.h" - - -/* - * initialization string for output - */ - -static char version[] __initdata = "LCS driver"; - -/* - * the root device for lcs group devices - */ -static struct device *lcs_root_dev; - -/* - * Some prototypes. 
- */ -static void lcs_tasklet(unsigned long); -static void lcs_start_kernel_thread(struct work_struct *); -static void lcs_get_frames_cb(struct lcs_channel *, struct lcs_buffer *); -#ifdef CONFIG_IP_MULTICAST -static int lcs_send_delipm(struct lcs_card *, struct lcs_ipm_list *); -#endif /* CONFIG_IP_MULTICAST */ -static int lcs_recovery(void *ptr); - -/* - * Debug Facility Stuff - */ -static char debug_buffer[255]; -static debug_info_t *lcs_dbf_setup; -static debug_info_t *lcs_dbf_trace; - -/* - * LCS Debug Facility functions - */ -static void -lcs_unregister_debug_facility(void) -{ - debug_unregister(lcs_dbf_setup); - debug_unregister(lcs_dbf_trace); -} - -static int -lcs_register_debug_facility(void) -{ - lcs_dbf_setup = debug_register("lcs_setup", 2, 1, 8); - lcs_dbf_trace = debug_register("lcs_trace", 4, 1, 8); - if (lcs_dbf_setup == NULL || lcs_dbf_trace == NULL) { - pr_err("Not enough memory for debug facility.\n"); - lcs_unregister_debug_facility(); - return -ENOMEM; - } - debug_register_view(lcs_dbf_setup, &debug_hex_ascii_view); - debug_set_level(lcs_dbf_setup, 2); - debug_register_view(lcs_dbf_trace, &debug_hex_ascii_view); - debug_set_level(lcs_dbf_trace, 2); - return 0; -} - -/* - * Allocate io buffers. - */ -static int -lcs_alloc_channel(struct lcs_channel *channel) -{ - int cnt; - - LCS_DBF_TEXT(2, setup, "ichalloc"); - for (cnt = 0; cnt < LCS_NUM_BUFFS; cnt++) { - /* alloc memory fo iobuffer */ - channel->iob[cnt].data = - kzalloc(LCS_IOBUFFERSIZE, GFP_DMA | GFP_KERNEL); - if (channel->iob[cnt].data == NULL) - break; - channel->iob[cnt].state = LCS_BUF_STATE_EMPTY; - } - if (cnt < LCS_NUM_BUFFS) { - /* Not all io buffers could be allocated. */ - LCS_DBF_TEXT(2, setup, "echalloc"); - while (cnt-- > 0) - kfree(channel->iob[cnt].data); - return -ENOMEM; - } - return 0; -} - -/* - * Free io buffers. 
- */ -static void -lcs_free_channel(struct lcs_channel *channel) -{ - int cnt; - - LCS_DBF_TEXT(2, setup, "ichfree"); - for (cnt = 0; cnt < LCS_NUM_BUFFS; cnt++) { - kfree(channel->iob[cnt].data); - channel->iob[cnt].data = NULL; - } -} - -/* - * Cleanup channel. - */ -static void -lcs_cleanup_channel(struct lcs_channel *channel) -{ - LCS_DBF_TEXT(3, setup, "cleanch"); - /* Kill write channel tasklets. */ - tasklet_kill(&channel->irq_tasklet); - /* Free channel buffers. */ - lcs_free_channel(channel); -} - -/* - * LCS free memory for card and channels. - */ -static void -lcs_free_card(struct lcs_card *card) -{ - LCS_DBF_TEXT(2, setup, "remcard"); - LCS_DBF_HEX(2, setup, &card, sizeof(void*)); - kfree(card); -} - -/* - * LCS alloc memory for card and channels - */ -static struct lcs_card * -lcs_alloc_card(void) -{ - struct lcs_card *card; - int rc; - - LCS_DBF_TEXT(2, setup, "alloclcs"); - - card = kzalloc(sizeof(struct lcs_card), GFP_KERNEL | GFP_DMA); - if (card == NULL) - return NULL; - card->lan_type = LCS_FRAME_TYPE_AUTO; - card->pkt_seq = 0; - card->lancmd_timeout = LCS_LANCMD_TIMEOUT_DEFAULT; - /* Allocate io buffers for the read channel. */ - rc = lcs_alloc_channel(&card->read); - if (rc){ - LCS_DBF_TEXT(2, setup, "iccwerr"); - lcs_free_card(card); - return NULL; - } - /* Allocate io buffers for the write channel. */ - rc = lcs_alloc_channel(&card->write); - if (rc) { - LCS_DBF_TEXT(2, setup, "iccwerr"); - lcs_cleanup_channel(&card->read); - lcs_free_card(card); - return NULL; - } - -#ifdef CONFIG_IP_MULTICAST - INIT_LIST_HEAD(&card->ipm_list); -#endif - LCS_DBF_HEX(2, setup, &card, sizeof(void*)); - return card; -} - -/* - * Setup read channel. - */ -static void -lcs_setup_read_ccws(struct lcs_card *card) -{ - int cnt; - - LCS_DBF_TEXT(2, setup, "ireadccw"); - /* Setup read ccws. 
*/ - memset(card->read.ccws, 0, sizeof (struct ccw1) * (LCS_NUM_BUFFS + 1)); - for (cnt = 0; cnt < LCS_NUM_BUFFS; cnt++) { - card->read.ccws[cnt].cmd_code = LCS_CCW_READ; - card->read.ccws[cnt].count = LCS_IOBUFFERSIZE; - card->read.ccws[cnt].flags = - CCW_FLAG_CC | CCW_FLAG_SLI | CCW_FLAG_PCI; - /* - * Note: we have allocated the buffer with GFP_DMA, so - * we do not need to do set_normalized_cda. - */ - card->read.ccws[cnt].cda = - virt_to_dma32(card->read.iob[cnt].data); - ((struct lcs_header *) - card->read.iob[cnt].data)->offset = LCS_ILLEGAL_OFFSET; - card->read.iob[cnt].callback = lcs_get_frames_cb; - card->read.iob[cnt].state = LCS_BUF_STATE_READY; - card->read.iob[cnt].count = LCS_IOBUFFERSIZE; - } - card->read.ccws[0].flags &= ~CCW_FLAG_PCI; - card->read.ccws[LCS_NUM_BUFFS - 1].flags &= ~CCW_FLAG_PCI; - card->read.ccws[LCS_NUM_BUFFS - 1].flags |= CCW_FLAG_SUSPEND; - /* Last ccw is a tic (transfer in channel). */ - card->read.ccws[LCS_NUM_BUFFS].cmd_code = LCS_CCW_TRANSFER; - card->read.ccws[LCS_NUM_BUFFS].cda = virt_to_dma32(card->read.ccws); - /* Setg initial state of the read channel. */ - card->read.state = LCS_CH_STATE_INIT; - - card->read.io_idx = 0; - card->read.buf_idx = 0; -} - -static void -lcs_setup_read(struct lcs_card *card) -{ - LCS_DBF_TEXT(3, setup, "initread"); - - lcs_setup_read_ccws(card); - /* Initialize read channel tasklet. */ - card->read.irq_tasklet.data = (unsigned long) &card->read; - card->read.irq_tasklet.func = lcs_tasklet; - /* Initialize waitqueue. */ - init_waitqueue_head(&card->read.wait_q); -} - -/* - * Setup write channel. - */ -static void -lcs_setup_write_ccws(struct lcs_card *card) -{ - int cnt; - - LCS_DBF_TEXT(3, setup, "iwritccw"); - /* Setup write ccws. 
*/ - memset(card->write.ccws, 0, sizeof(struct ccw1) * (LCS_NUM_BUFFS + 1)); - for (cnt = 0; cnt < LCS_NUM_BUFFS; cnt++) { - card->write.ccws[cnt].cmd_code = LCS_CCW_WRITE; - card->write.ccws[cnt].count = 0; - card->write.ccws[cnt].flags = - CCW_FLAG_SUSPEND | CCW_FLAG_CC | CCW_FLAG_SLI; - /* - * Note: we have allocated the buffer with GFP_DMA, so - * we do not need to do set_normalized_cda. - */ - card->write.ccws[cnt].cda = - virt_to_dma32(card->write.iob[cnt].data); - } - /* Last ccw is a tic (transfer in channel). */ - card->write.ccws[LCS_NUM_BUFFS].cmd_code = LCS_CCW_TRANSFER; - card->write.ccws[LCS_NUM_BUFFS].cda = virt_to_dma32(card->write.ccws); - /* Set initial state of the write channel. */ - card->read.state = LCS_CH_STATE_INIT; - - card->write.io_idx = 0; - card->write.buf_idx = 0; -} - -static void -lcs_setup_write(struct lcs_card *card) -{ - LCS_DBF_TEXT(3, setup, "initwrit"); - - lcs_setup_write_ccws(card); - /* Initialize write channel tasklet. */ - card->write.irq_tasklet.data = (unsigned long) &card->write; - card->write.irq_tasklet.func = lcs_tasklet; - /* Initialize waitqueue. 
*/ - init_waitqueue_head(&card->write.wait_q); -} - -static void -lcs_set_allowed_threads(struct lcs_card *card, unsigned long threads) -{ - unsigned long flags; - - spin_lock_irqsave(&card->mask_lock, flags); - card->thread_allowed_mask = threads; - spin_unlock_irqrestore(&card->mask_lock, flags); - wake_up(&card->wait_q); -} -static int lcs_threads_running(struct lcs_card *card, unsigned long threads) -{ - unsigned long flags; - int rc = 0; - - spin_lock_irqsave(&card->mask_lock, flags); - rc = (card->thread_running_mask & threads); - spin_unlock_irqrestore(&card->mask_lock, flags); - return rc; -} - -static int -lcs_wait_for_threads(struct lcs_card *card, unsigned long threads) -{ - return wait_event_interruptible(card->wait_q, - lcs_threads_running(card, threads) == 0); -} - -static int lcs_set_thread_start_bit(struct lcs_card *card, unsigned long thread) -{ - unsigned long flags; - - spin_lock_irqsave(&card->mask_lock, flags); - if ( !(card->thread_allowed_mask & thread) || - (card->thread_start_mask & thread) ) { - spin_unlock_irqrestore(&card->mask_lock, flags); - return -EPERM; - } - card->thread_start_mask |= thread; - spin_unlock_irqrestore(&card->mask_lock, flags); - return 0; -} - -static void -lcs_clear_thread_running_bit(struct lcs_card *card, unsigned long thread) -{ - unsigned long flags; - - spin_lock_irqsave(&card->mask_lock, flags); - card->thread_running_mask &= ~thread; - spin_unlock_irqrestore(&card->mask_lock, flags); - wake_up(&card->wait_q); -} - -static int __lcs_do_run_thread(struct lcs_card *card, unsigned long thread) -{ - unsigned long flags; - int rc = 0; - - spin_lock_irqsave(&card->mask_lock, flags); - if (card->thread_start_mask & thread){ - if ((card->thread_allowed_mask & thread) && - !(card->thread_running_mask & thread)){ - rc = 1; - card->thread_start_mask &= ~thread; - card->thread_running_mask |= thread; - } else - rc = -EPERM; - } - spin_unlock_irqrestore(&card->mask_lock, flags); - return rc; -} - -static int 
-lcs_do_run_thread(struct lcs_card *card, unsigned long thread) -{ - int rc = 0; - wait_event(card->wait_q, - (rc = __lcs_do_run_thread(card, thread)) >= 0); - return rc; -} - -static int -lcs_do_start_thread(struct lcs_card *card, unsigned long thread) -{ - unsigned long flags; - int rc = 0; - - spin_lock_irqsave(&card->mask_lock, flags); - LCS_DBF_TEXT_(4, trace, " %02x%02x%02x", - (u8) card->thread_start_mask, - (u8) card->thread_allowed_mask, - (u8) card->thread_running_mask); - rc = (card->thread_start_mask & thread); - spin_unlock_irqrestore(&card->mask_lock, flags); - return rc; -} - -/* - * Initialize channels,card and state machines. - */ -static void -lcs_setup_card(struct lcs_card *card) -{ - LCS_DBF_TEXT(2, setup, "initcard"); - LCS_DBF_HEX(2, setup, &card, sizeof(void*)); - - lcs_setup_read(card); - lcs_setup_write(card); - /* Set cards initial state. */ - card->state = DEV_STATE_DOWN; - card->tx_buffer = NULL; - card->tx_emitted = 0; - - init_waitqueue_head(&card->wait_q); - spin_lock_init(&card->lock); - spin_lock_init(&card->ipm_lock); - spin_lock_init(&card->mask_lock); -#ifdef CONFIG_IP_MULTICAST - INIT_LIST_HEAD(&card->ipm_list); -#endif - INIT_LIST_HEAD(&card->lancmd_waiters); -} - -static void lcs_clear_multicast_list(struct lcs_card *card) -{ -#ifdef CONFIG_IP_MULTICAST - struct lcs_ipm_list *ipm; - unsigned long flags; - - /* Free multicast list. */ - LCS_DBF_TEXT(3, setup, "clmclist"); - spin_lock_irqsave(&card->ipm_lock, flags); - while (!list_empty(&card->ipm_list)){ - ipm = list_entry(card->ipm_list.next, - struct lcs_ipm_list, list); - list_del(&ipm->list); - if (ipm->ipm_state != LCS_IPM_STATE_SET_REQUIRED){ - spin_unlock_irqrestore(&card->ipm_lock, flags); - lcs_send_delipm(card, ipm); - spin_lock_irqsave(&card->ipm_lock, flags); - } - kfree(ipm); - } - spin_unlock_irqrestore(&card->ipm_lock, flags); -#endif -} - -/* - * Cleanup channels,card and state machines. 
- */ -static void -lcs_cleanup_card(struct lcs_card *card) -{ - - LCS_DBF_TEXT(3, setup, "cleancrd"); - LCS_DBF_HEX(2,setup,&card,sizeof(void*)); - - if (card->dev != NULL) - free_netdev(card->dev); - /* Cleanup channels. */ - lcs_cleanup_channel(&card->write); - lcs_cleanup_channel(&card->read); -} - -/* - * Start channel. - */ -static int -lcs_start_channel(struct lcs_channel *channel) -{ - unsigned long flags; - int rc; - - LCS_DBF_TEXT_(4, trace,"ssch%s", dev_name(&channel->ccwdev->dev)); - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - rc = ccw_device_start(channel->ccwdev, - channel->ccws + channel->io_idx, 0, 0, - DOIO_DENY_PREFETCH | DOIO_ALLOW_SUSPEND); - if (rc == 0) - channel->state = LCS_CH_STATE_RUNNING; - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); - if (rc) { - LCS_DBF_TEXT_(4,trace,"essh%s", - dev_name(&channel->ccwdev->dev)); - dev_err(&channel->ccwdev->dev, - "Starting an LCS device resulted in an error," - " rc=%d!\n", rc); - } - return rc; -} - -static int -lcs_clear_channel(struct lcs_channel *channel) -{ - unsigned long flags; - int rc; - - LCS_DBF_TEXT(4,trace,"clearch"); - LCS_DBF_TEXT_(4, trace, "%s", dev_name(&channel->ccwdev->dev)); - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - rc = ccw_device_clear(channel->ccwdev, 0); - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); - if (rc) { - LCS_DBF_TEXT_(4, trace, "ecsc%s", - dev_name(&channel->ccwdev->dev)); - return rc; - } - wait_event(channel->wait_q, (channel->state == LCS_CH_STATE_CLEARED)); - channel->state = LCS_CH_STATE_STOPPED; - return rc; -} - - -/* - * Stop channel. 
- */ -static int -lcs_stop_channel(struct lcs_channel *channel) -{ - unsigned long flags; - int rc; - - if (channel->state == LCS_CH_STATE_STOPPED) - return 0; - LCS_DBF_TEXT(4,trace,"haltsch"); - LCS_DBF_TEXT_(4, trace, "%s", dev_name(&channel->ccwdev->dev)); - channel->state = LCS_CH_STATE_INIT; - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - rc = ccw_device_halt(channel->ccwdev, 0); - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); - if (rc) { - LCS_DBF_TEXT_(4, trace, "ehsc%s", - dev_name(&channel->ccwdev->dev)); - return rc; - } - /* Asynchronous halt initialted. Wait for its completion. */ - wait_event(channel->wait_q, (channel->state == LCS_CH_STATE_HALTED)); - lcs_clear_channel(channel); - return 0; -} - -/* - * start read and write channel - */ -static int -lcs_start_channels(struct lcs_card *card) -{ - int rc; - - LCS_DBF_TEXT(2, trace, "chstart"); - /* start read channel */ - rc = lcs_start_channel(&card->read); - if (rc) - return rc; - /* start write channel */ - rc = lcs_start_channel(&card->write); - if (rc) - lcs_stop_channel(&card->read); - return rc; -} - -/* - * stop read and write channel - */ -static int -lcs_stop_channels(struct lcs_card *card) -{ - LCS_DBF_TEXT(2, trace, "chhalt"); - lcs_stop_channel(&card->read); - lcs_stop_channel(&card->write); - return 0; -} - -/* - * Get empty buffer. 
- */ -static struct lcs_buffer * -__lcs_get_buffer(struct lcs_channel *channel) -{ - int index; - - LCS_DBF_TEXT(5, trace, "_getbuff"); - index = channel->io_idx; - do { - if (channel->iob[index].state == LCS_BUF_STATE_EMPTY) { - channel->iob[index].state = LCS_BUF_STATE_LOCKED; - return channel->iob + index; - } - index = (index + 1) & (LCS_NUM_BUFFS - 1); - } while (index != channel->io_idx); - return NULL; -} - -static struct lcs_buffer * -lcs_get_buffer(struct lcs_channel *channel) -{ - struct lcs_buffer *buffer; - unsigned long flags; - - LCS_DBF_TEXT(5, trace, "getbuff"); - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - buffer = __lcs_get_buffer(channel); - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); - return buffer; -} - -/* - * Resume channel program if the channel is suspended. - */ -static int -__lcs_resume_channel(struct lcs_channel *channel) -{ - int rc; - - if (channel->state != LCS_CH_STATE_SUSPENDED) - return 0; - if (channel->ccws[channel->io_idx].flags & CCW_FLAG_SUSPEND) - return 0; - LCS_DBF_TEXT_(5, trace, "rsch%s", dev_name(&channel->ccwdev->dev)); - rc = ccw_device_resume(channel->ccwdev); - if (rc) { - LCS_DBF_TEXT_(4, trace, "ersc%s", - dev_name(&channel->ccwdev->dev)); - dev_err(&channel->ccwdev->dev, - "Sending data from the LCS device to the LAN failed" - " with rc=%d\n",rc); - } else - channel->state = LCS_CH_STATE_RUNNING; - return rc; - -} - -/* - * Make a buffer ready for processing. - */ -static void __lcs_ready_buffer_bits(struct lcs_channel *channel, int index) -{ - int prev, next; - - LCS_DBF_TEXT(5, trace, "rdybits"); - prev = (index - 1) & (LCS_NUM_BUFFS - 1); - next = (index + 1) & (LCS_NUM_BUFFS - 1); - /* Check if we may clear the suspend bit of this buffer. */ - if (channel->ccws[next].flags & CCW_FLAG_SUSPEND) { - /* Check if we have to set the PCI bit. */ - if (!(channel->ccws[prev].flags & CCW_FLAG_SUSPEND)) - /* Suspend bit of the previous buffer is not set. 
*/ - channel->ccws[index].flags |= CCW_FLAG_PCI; - /* Suspend bit of the next buffer is set. */ - channel->ccws[index].flags &= ~CCW_FLAG_SUSPEND; - } -} - -static int -lcs_ready_buffer(struct lcs_channel *channel, struct lcs_buffer *buffer) -{ - unsigned long flags; - int index, rc; - - LCS_DBF_TEXT(5, trace, "rdybuff"); - BUG_ON(buffer->state != LCS_BUF_STATE_LOCKED && - buffer->state != LCS_BUF_STATE_PROCESSED); - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - buffer->state = LCS_BUF_STATE_READY; - index = buffer - channel->iob; - /* Set length. */ - channel->ccws[index].count = buffer->count; - /* Check relevant PCI/suspend bits. */ - __lcs_ready_buffer_bits(channel, index); - rc = __lcs_resume_channel(channel); - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); - return rc; -} - -/* - * Mark the buffer as processed. Take care of the suspend bit - * of the previous buffer. This function is called from - * interrupt context, so the lock must not be taken. - */ -static int -__lcs_processed_buffer(struct lcs_channel *channel, struct lcs_buffer *buffer) -{ - int index, prev, next; - - LCS_DBF_TEXT(5, trace, "prcsbuff"); - BUG_ON(buffer->state != LCS_BUF_STATE_READY); - buffer->state = LCS_BUF_STATE_PROCESSED; - index = buffer - channel->iob; - prev = (index - 1) & (LCS_NUM_BUFFS - 1); - next = (index + 1) & (LCS_NUM_BUFFS - 1); - /* Set the suspend bit and clear the PCI bit of this buffer. */ - channel->ccws[index].flags |= CCW_FLAG_SUSPEND; - channel->ccws[index].flags &= ~CCW_FLAG_PCI; - /* Check the suspend bit of the previous buffer. */ - if (channel->iob[prev].state == LCS_BUF_STATE_READY) { - /* - * Previous buffer is in state ready. It might have - * happened in lcs_ready_buffer that the suspend bit - * has not been cleared to avoid an endless loop. - * Do it now. - */ - __lcs_ready_buffer_bits(channel, prev); - } - /* Clear PCI bit of next buffer. 
*/ - channel->ccws[next].flags &= ~CCW_FLAG_PCI; - return __lcs_resume_channel(channel); -} - -/* - * Put a processed buffer back to state empty. - */ -static void -lcs_release_buffer(struct lcs_channel *channel, struct lcs_buffer *buffer) -{ - unsigned long flags; - - LCS_DBF_TEXT(5, trace, "relbuff"); - BUG_ON(buffer->state != LCS_BUF_STATE_LOCKED && - buffer->state != LCS_BUF_STATE_PROCESSED); - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - buffer->state = LCS_BUF_STATE_EMPTY; - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); -} - -/* - * Get buffer for a lan command. - */ -static struct lcs_buffer * -lcs_get_lancmd(struct lcs_card *card, int count) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(4, trace, "getlncmd"); - /* Get buffer and wait if none is available. */ - wait_event(card->write.wait_q, - ((buffer = lcs_get_buffer(&card->write)) != NULL)); - count += sizeof(struct lcs_header); - *(__u16 *)(buffer->data + count) = 0; - buffer->count = count + sizeof(__u16); - buffer->callback = lcs_release_buffer; - cmd = (struct lcs_cmd *) buffer->data; - cmd->offset = count; - cmd->type = LCS_FRAME_TYPE_CONTROL; - cmd->slot = 0; - return buffer; -} - - -static void -lcs_get_reply(struct lcs_reply *reply) -{ - refcount_inc(&reply->refcnt); -} - -static void -lcs_put_reply(struct lcs_reply *reply) -{ - if (refcount_dec_and_test(&reply->refcnt)) - kfree(reply); -} - -static struct lcs_reply * -lcs_alloc_reply(struct lcs_cmd *cmd) -{ - struct lcs_reply *reply; - - LCS_DBF_TEXT(4, trace, "getreply"); - - reply = kzalloc(sizeof(struct lcs_reply), GFP_ATOMIC); - if (!reply) - return NULL; - refcount_set(&reply->refcnt, 1); - reply->sequence_no = cmd->sequence_no; - reply->received = 0; - reply->rc = 0; - init_waitqueue_head(&reply->wait_q); - - return reply; -} - -/* - * Notifier function for lancmd replies. Called from read irq. 
- */ -static void -lcs_notify_lancmd_waiters(struct lcs_card *card, struct lcs_cmd *cmd) -{ - struct list_head *l, *n; - struct lcs_reply *reply; - - LCS_DBF_TEXT(4, trace, "notiwait"); - spin_lock(&card->lock); - list_for_each_safe(l, n, &card->lancmd_waiters) { - reply = list_entry(l, struct lcs_reply, list); - if (reply->sequence_no == cmd->sequence_no) { - lcs_get_reply(reply); - list_del_init(&reply->list); - if (reply->callback != NULL) - reply->callback(card, cmd); - reply->received = 1; - reply->rc = cmd->return_code; - wake_up(&reply->wait_q); - lcs_put_reply(reply); - break; - } - } - spin_unlock(&card->lock); -} - -/* - * Emit buffer of a lan command. - */ -static void -lcs_lancmd_timeout(struct timer_list *t) -{ - struct lcs_reply *reply = from_timer(reply, t, timer); - struct lcs_reply *list_reply, *r; - unsigned long flags; - - LCS_DBF_TEXT(4, trace, "timeout"); - spin_lock_irqsave(&reply->card->lock, flags); - list_for_each_entry_safe(list_reply, r, - &reply->card->lancmd_waiters,list) { - if (reply == list_reply) { - lcs_get_reply(reply); - list_del_init(&reply->list); - spin_unlock_irqrestore(&reply->card->lock, flags); - reply->received = 1; - reply->rc = -ETIME; - wake_up(&reply->wait_q); - lcs_put_reply(reply); - return; - } - } - spin_unlock_irqrestore(&reply->card->lock, flags); -} - -static int -lcs_send_lancmd(struct lcs_card *card, struct lcs_buffer *buffer, - void (*reply_callback)(struct lcs_card *, struct lcs_cmd *)) -{ - struct lcs_reply *reply; - struct lcs_cmd *cmd; - unsigned long flags; - int rc; - - LCS_DBF_TEXT(4, trace, "sendcmd"); - cmd = (struct lcs_cmd *) buffer->data; - cmd->return_code = 0; - cmd->sequence_no = card->sequence_no++; - reply = lcs_alloc_reply(cmd); - if (!reply) - return -ENOMEM; - reply->callback = reply_callback; - reply->card = card; - spin_lock_irqsave(&card->lock, flags); - list_add_tail(&reply->list, &card->lancmd_waiters); - spin_unlock_irqrestore(&card->lock, flags); - - buffer->callback = 
lcs_release_buffer; - rc = lcs_ready_buffer(&card->write, buffer); - if (rc) - return rc; - timer_setup(&reply->timer, lcs_lancmd_timeout, 0); - mod_timer(&reply->timer, jiffies + HZ * card->lancmd_timeout); - wait_event(reply->wait_q, reply->received); - del_timer_sync(&reply->timer); - LCS_DBF_TEXT_(4, trace, "rc:%d",reply->rc); - rc = reply->rc; - lcs_put_reply(reply); - return rc ? -EIO : 0; -} - -/* - * LCS startup command - */ -static int -lcs_send_startup(struct lcs_card *card, __u8 initiator) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2, trace, "startup"); - buffer = lcs_get_lancmd(card, LCS_STD_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_STARTUP; - cmd->initiator = initiator; - cmd->cmd.lcs_startup.buff_size = LCS_IOBUFFERSIZE; - return lcs_send_lancmd(card, buffer, NULL); -} - -/* - * LCS shutdown command - */ -static int -lcs_send_shutdown(struct lcs_card *card) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2, trace, "shutdown"); - buffer = lcs_get_lancmd(card, LCS_STD_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_SHUTDOWN; - cmd->initiator = LCS_INITIATOR_TCPIP; - return lcs_send_lancmd(card, buffer, NULL); -} - -/* - * LCS lanstat command - */ -static void -__lcs_lanstat_cb(struct lcs_card *card, struct lcs_cmd *cmd) -{ - LCS_DBF_TEXT(2, trace, "statcb"); - memcpy(card->mac, cmd->cmd.lcs_lanstat_cmd.mac_addr, LCS_MAC_LENGTH); -} - -static int -lcs_send_lanstat(struct lcs_card *card) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2,trace, "cmdstat"); - buffer = lcs_get_lancmd(card, LCS_STD_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - /* Setup lanstat command. 
*/ - cmd->cmd_code = LCS_CMD_LANSTAT; - cmd->initiator = LCS_INITIATOR_TCPIP; - cmd->cmd.lcs_std_cmd.lan_type = card->lan_type; - cmd->cmd.lcs_std_cmd.portno = card->portno; - return lcs_send_lancmd(card, buffer, __lcs_lanstat_cb); -} - -/* - * send stoplan command - */ -static int -lcs_send_stoplan(struct lcs_card *card, __u8 initiator) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2, trace, "cmdstpln"); - buffer = lcs_get_lancmd(card, LCS_STD_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_STOPLAN; - cmd->initiator = initiator; - cmd->cmd.lcs_std_cmd.lan_type = card->lan_type; - cmd->cmd.lcs_std_cmd.portno = card->portno; - return lcs_send_lancmd(card, buffer, NULL); -} - -/* - * send startlan command - */ -static void -__lcs_send_startlan_cb(struct lcs_card *card, struct lcs_cmd *cmd) -{ - LCS_DBF_TEXT(2, trace, "srtlancb"); - card->lan_type = cmd->cmd.lcs_std_cmd.lan_type; - card->portno = cmd->cmd.lcs_std_cmd.portno; -} - -static int -lcs_send_startlan(struct lcs_card *card, __u8 initiator) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2, trace, "cmdstaln"); - buffer = lcs_get_lancmd(card, LCS_STD_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_STARTLAN; - cmd->initiator = initiator; - cmd->cmd.lcs_std_cmd.lan_type = card->lan_type; - cmd->cmd.lcs_std_cmd.portno = card->portno; - return lcs_send_lancmd(card, buffer, __lcs_send_startlan_cb); -} - -#ifdef CONFIG_IP_MULTICAST -/* - * send setipm command (Multicast) - */ -static int -lcs_send_setipm(struct lcs_card *card,struct lcs_ipm_list *ipm_list) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2, trace, "cmdsetim"); - buffer = lcs_get_lancmd(card, LCS_MULTICAST_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_SETIPM; - cmd->initiator = LCS_INITIATOR_TCPIP; - cmd->cmd.lcs_qipassist.lan_type = card->lan_type; - cmd->cmd.lcs_qipassist.portno = 
card->portno; - cmd->cmd.lcs_qipassist.version = 4; - cmd->cmd.lcs_qipassist.num_ip_pairs = 1; - memcpy(cmd->cmd.lcs_qipassist.lcs_ipass_ctlmsg.ip_mac_pair, - &ipm_list->ipm, sizeof (struct lcs_ip_mac_pair)); - LCS_DBF_TEXT_(2, trace, "%x",ipm_list->ipm.ip_addr); - return lcs_send_lancmd(card, buffer, NULL); -} - -/* - * send delipm command (Multicast) - */ -static int -lcs_send_delipm(struct lcs_card *card,struct lcs_ipm_list *ipm_list) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - - LCS_DBF_TEXT(2, trace, "cmddelim"); - buffer = lcs_get_lancmd(card, LCS_MULTICAST_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_DELIPM; - cmd->initiator = LCS_INITIATOR_TCPIP; - cmd->cmd.lcs_qipassist.lan_type = card->lan_type; - cmd->cmd.lcs_qipassist.portno = card->portno; - cmd->cmd.lcs_qipassist.version = 4; - cmd->cmd.lcs_qipassist.num_ip_pairs = 1; - memcpy(cmd->cmd.lcs_qipassist.lcs_ipass_ctlmsg.ip_mac_pair, - &ipm_list->ipm, sizeof (struct lcs_ip_mac_pair)); - LCS_DBF_TEXT_(2, trace, "%x",ipm_list->ipm.ip_addr); - return lcs_send_lancmd(card, buffer, NULL); -} - -/* - * check if multicast is supported by LCS - */ -static void -__lcs_check_multicast_cb(struct lcs_card *card, struct lcs_cmd *cmd) -{ - LCS_DBF_TEXT(2, trace, "chkmccb"); - card->ip_assists_supported = - cmd->cmd.lcs_qipassist.ip_assists_supported; - card->ip_assists_enabled = - cmd->cmd.lcs_qipassist.ip_assists_enabled; -} - -static int -lcs_check_multicast_support(struct lcs_card *card) -{ - struct lcs_buffer *buffer; - struct lcs_cmd *cmd; - int rc; - - LCS_DBF_TEXT(2, trace, "cmdqipa"); - /* Send query ipassist. 
*/ - buffer = lcs_get_lancmd(card, LCS_STD_CMD_SIZE); - cmd = (struct lcs_cmd *) buffer->data; - cmd->cmd_code = LCS_CMD_QIPASSIST; - cmd->initiator = LCS_INITIATOR_TCPIP; - cmd->cmd.lcs_qipassist.lan_type = card->lan_type; - cmd->cmd.lcs_qipassist.portno = card->portno; - cmd->cmd.lcs_qipassist.version = 4; - cmd->cmd.lcs_qipassist.num_ip_pairs = 1; - rc = lcs_send_lancmd(card, buffer, __lcs_check_multicast_cb); - if (rc != 0) { - pr_err("Query IPAssist failed. Assuming unsupported!\n"); - return -EOPNOTSUPP; - } - if (card->ip_assists_supported & LCS_IPASS_MULTICAST_SUPPORT) - return 0; - return -EOPNOTSUPP; -} - -/* - * set or del multicast address on LCS card - */ -static void -lcs_fix_multicast_list(struct lcs_card *card) -{ - struct list_head failed_list; - struct lcs_ipm_list *ipm, *tmp; - unsigned long flags; - int rc; - - LCS_DBF_TEXT(4,trace, "fixipm"); - INIT_LIST_HEAD(&failed_list); - spin_lock_irqsave(&card->ipm_lock, flags); -list_modified: - list_for_each_entry_safe(ipm, tmp, &card->ipm_list, list){ - switch (ipm->ipm_state) { - case LCS_IPM_STATE_SET_REQUIRED: - /* del from ipm_list so no one else can tamper with - * this entry */ - list_del_init(&ipm->list); - spin_unlock_irqrestore(&card->ipm_lock, flags); - rc = lcs_send_setipm(card, ipm); - spin_lock_irqsave(&card->ipm_lock, flags); - if (rc) { - pr_info("Adding multicast address failed." 
- " Table possibly full!\n"); - /* store ipm in failed list -> will be added - * to ipm_list again, so a retry will be done - * during the next call of this function */ - list_add_tail(&ipm->list, &failed_list); - } else { - ipm->ipm_state = LCS_IPM_STATE_ON_CARD; - /* re-insert into ipm_list */ - list_add_tail(&ipm->list, &card->ipm_list); - } - goto list_modified; - case LCS_IPM_STATE_DEL_REQUIRED: - list_del(&ipm->list); - spin_unlock_irqrestore(&card->ipm_lock, flags); - lcs_send_delipm(card, ipm); - spin_lock_irqsave(&card->ipm_lock, flags); - kfree(ipm); - goto list_modified; - case LCS_IPM_STATE_ON_CARD: - break; - } - } - /* re-insert all entries from the failed_list into ipm_list */ - list_for_each_entry_safe(ipm, tmp, &failed_list, list) - list_move_tail(&ipm->list, &card->ipm_list); - - spin_unlock_irqrestore(&card->ipm_lock, flags); -} - -/* - * get mac address for the relevant Multicast address - */ -static void -lcs_get_mac_for_ipm(__be32 ipm, char *mac, struct net_device *dev) -{ - LCS_DBF_TEXT(4,trace, "getmac"); - ip_eth_mc_map(ipm, mac); -} - -/* - * function called by net device to handle multicast address relevant things - */ -static void lcs_remove_mc_addresses(struct lcs_card *card, - struct in_device *in4_dev) -{ - struct ip_mc_list *im4; - struct list_head *l; - struct lcs_ipm_list *ipm; - unsigned long flags; - char buf[MAX_ADDR_LEN]; - - LCS_DBF_TEXT(4, trace, "remmclst"); - spin_lock_irqsave(&card->ipm_lock, flags); - list_for_each(l, &card->ipm_list) { - ipm = list_entry(l, struct lcs_ipm_list, list); - for (im4 = rcu_dereference(in4_dev->mc_list); - im4 != NULL; im4 = rcu_dereference(im4->next_rcu)) { - lcs_get_mac_for_ipm(im4->multiaddr, buf, card->dev); - if ( (ipm->ipm.ip_addr == im4->multiaddr) && - (memcmp(buf, &ipm->ipm.mac_addr, - LCS_MAC_LENGTH) == 0) ) - break; - } - if (im4 == NULL) - ipm->ipm_state = LCS_IPM_STATE_DEL_REQUIRED; - } - spin_unlock_irqrestore(&card->ipm_lock, flags); -} - -static struct lcs_ipm_list 
*lcs_check_addr_entry(struct lcs_card *card, - struct ip_mc_list *im4, - char *buf) -{ - struct lcs_ipm_list *tmp, *ipm = NULL; - struct list_head *l; - unsigned long flags; - - LCS_DBF_TEXT(4, trace, "chkmcent"); - spin_lock_irqsave(&card->ipm_lock, flags); - list_for_each(l, &card->ipm_list) { - tmp = list_entry(l, struct lcs_ipm_list, list); - if ( (tmp->ipm.ip_addr == im4->multiaddr) && - (memcmp(buf, &tmp->ipm.mac_addr, - LCS_MAC_LENGTH) == 0) ) { - ipm = tmp; - break; - } - } - spin_unlock_irqrestore(&card->ipm_lock, flags); - return ipm; -} - -static void lcs_set_mc_addresses(struct lcs_card *card, - struct in_device *in4_dev) -{ - - struct ip_mc_list *im4; - struct lcs_ipm_list *ipm; - char buf[MAX_ADDR_LEN]; - unsigned long flags; - - LCS_DBF_TEXT(4, trace, "setmclst"); - for (im4 = rcu_dereference(in4_dev->mc_list); im4 != NULL; - im4 = rcu_dereference(im4->next_rcu)) { - lcs_get_mac_for_ipm(im4->multiaddr, buf, card->dev); - ipm = lcs_check_addr_entry(card, im4, buf); - if (ipm != NULL) - continue; /* Address already in list. 
*/ - ipm = kzalloc(sizeof(struct lcs_ipm_list), GFP_ATOMIC); - if (ipm == NULL) { - pr_info("Not enough memory to add" - " new multicast entry!\n"); - break; - } - memcpy(&ipm->ipm.mac_addr, buf, LCS_MAC_LENGTH); - ipm->ipm.ip_addr = im4->multiaddr; - ipm->ipm_state = LCS_IPM_STATE_SET_REQUIRED; - spin_lock_irqsave(&card->ipm_lock, flags); - LCS_DBF_HEX(2,trace,&ipm->ipm.ip_addr,4); - list_add(&ipm->list, &card->ipm_list); - spin_unlock_irqrestore(&card->ipm_lock, flags); - } -} - -static int -lcs_register_mc_addresses(void *data) -{ - struct lcs_card *card; - struct in_device *in4_dev; - - card = (struct lcs_card *) data; - - if (!lcs_do_run_thread(card, LCS_SET_MC_THREAD)) - return 0; - LCS_DBF_TEXT(4, trace, "regmulti"); - - in4_dev = in_dev_get(card->dev); - if (in4_dev == NULL) - goto out; - rcu_read_lock(); - lcs_remove_mc_addresses(card,in4_dev); - lcs_set_mc_addresses(card, in4_dev); - rcu_read_unlock(); - in_dev_put(in4_dev); - - netif_carrier_off(card->dev); - netif_tx_disable(card->dev); - wait_event(card->write.wait_q, - (card->write.state != LCS_CH_STATE_RUNNING)); - lcs_fix_multicast_list(card); - if (card->state == DEV_STATE_UP) { - netif_carrier_on(card->dev); - netif_wake_queue(card->dev); - } -out: - lcs_clear_thread_running_bit(card, LCS_SET_MC_THREAD); - return 0; -} -#endif /* CONFIG_IP_MULTICAST */ - -/* - * function called by net device to - * handle multicast address relevant things - */ -static void -lcs_set_multicast_list(struct net_device *dev) -{ -#ifdef CONFIG_IP_MULTICAST - struct lcs_card *card; - - LCS_DBF_TEXT(4, trace, "setmulti"); - card = (struct lcs_card *) dev->ml_priv; - - if (!lcs_set_thread_start_bit(card, LCS_SET_MC_THREAD)) - schedule_work(&card->kernel_thread_starter); -#endif /* CONFIG_IP_MULTICAST */ -} - -static long -lcs_check_irb_error(struct ccw_device *cdev, struct irb *irb) -{ - if (!IS_ERR(irb)) - return 0; - - switch (PTR_ERR(irb)) { - case -EIO: - dev_warn(&cdev->dev, - "An I/O-error occurred on the LCS 
device\n"); - LCS_DBF_TEXT(2, trace, "ckirberr"); - LCS_DBF_TEXT_(2, trace, " rc%d", -EIO); - break; - case -ETIMEDOUT: - dev_warn(&cdev->dev, - "A command timed out on the LCS device\n"); - LCS_DBF_TEXT(2, trace, "ckirberr"); - LCS_DBF_TEXT_(2, trace, " rc%d", -ETIMEDOUT); - break; - default: - dev_warn(&cdev->dev, - "An error occurred on the LCS device, rc=%ld\n", - PTR_ERR(irb)); - LCS_DBF_TEXT(2, trace, "ckirberr"); - LCS_DBF_TEXT(2, trace, " rc???"); - } - return PTR_ERR(irb); -} - -static int -lcs_get_problem(struct ccw_device *cdev, struct irb *irb) -{ - int dstat, cstat; - char *sense; - - sense = (char *) irb->ecw; - cstat = irb->scsw.cmd.cstat; - dstat = irb->scsw.cmd.dstat; - - if (cstat & (SCHN_STAT_CHN_CTRL_CHK | SCHN_STAT_INTF_CTRL_CHK | - SCHN_STAT_CHN_DATA_CHK | SCHN_STAT_CHAIN_CHECK | - SCHN_STAT_PROT_CHECK | SCHN_STAT_PROG_CHECK)) { - LCS_DBF_TEXT(2, trace, "CGENCHK"); - return 1; - } - if (dstat & DEV_STAT_UNIT_CHECK) { - if (sense[LCS_SENSE_BYTE_1] & - LCS_SENSE_RESETTING_EVENT) { - LCS_DBF_TEXT(2, trace, "REVIND"); - return 1; - } - if (sense[LCS_SENSE_BYTE_0] & - LCS_SENSE_CMD_REJECT) { - LCS_DBF_TEXT(2, trace, "CMDREJ"); - return 0; - } - if ((!sense[LCS_SENSE_BYTE_0]) && - (!sense[LCS_SENSE_BYTE_1]) && - (!sense[LCS_SENSE_BYTE_2]) && - (!sense[LCS_SENSE_BYTE_3])) { - LCS_DBF_TEXT(2, trace, "ZEROSEN"); - return 0; - } - LCS_DBF_TEXT(2, trace, "DGENCHK"); - return 1; - } - return 0; -} - -static void -lcs_schedule_recovery(struct lcs_card *card) -{ - LCS_DBF_TEXT(2, trace, "startrec"); - if (!lcs_set_thread_start_bit(card, LCS_RECOVERY_THREAD)) - schedule_work(&card->kernel_thread_starter); -} - -/* - * IRQ Handler for LCS channels - */ -static void -lcs_irq(struct ccw_device *cdev, unsigned long intparm, struct irb *irb) -{ - struct lcs_card *card; - struct lcs_channel *channel; - int rc, index; - int cstat, dstat; - - if (lcs_check_irb_error(cdev, irb)) - return; - - card = CARD_FROM_DEV(cdev); - if (card->read.ccwdev == cdev) - channel = 
&card->read; - else - channel = &card->write; - - cstat = irb->scsw.cmd.cstat; - dstat = irb->scsw.cmd.dstat; - LCS_DBF_TEXT_(5, trace, "Rint%s", dev_name(&cdev->dev)); - LCS_DBF_TEXT_(5, trace, "%4x%4x", irb->scsw.cmd.cstat, - irb->scsw.cmd.dstat); - LCS_DBF_TEXT_(5, trace, "%4x%4x", irb->scsw.cmd.fctl, - irb->scsw.cmd.actl); - - /* Check for channel and device errors presented */ - rc = lcs_get_problem(cdev, irb); - if (rc || (dstat & DEV_STAT_UNIT_EXCEP)) { - dev_warn(&cdev->dev, - "The LCS device stopped because of an error," - " dstat=0x%X, cstat=0x%X \n", - dstat, cstat); - if (rc) { - channel->state = LCS_CH_STATE_ERROR; - } - } - if (channel->state == LCS_CH_STATE_ERROR) { - lcs_schedule_recovery(card); - wake_up(&card->wait_q); - return; - } - /* How far in the ccw chain have we processed? */ - if ((channel->state != LCS_CH_STATE_INIT) && - (irb->scsw.cmd.fctl & SCSW_FCTL_START_FUNC) && - (irb->scsw.cmd.cpa != 0)) { - index = (struct ccw1 *)dma32_to_virt(irb->scsw.cmd.cpa) - - channel->ccws; - if ((irb->scsw.cmd.actl & SCSW_ACTL_SUSPENDED) || - (irb->scsw.cmd.cstat & SCHN_STAT_PCI)) - /* Bloody io subsystem tells us lies about cpa... */ - index = (index - 1) & (LCS_NUM_BUFFS - 1); - while (channel->io_idx != index) { - __lcs_processed_buffer(channel, - channel->iob + channel->io_idx); - channel->io_idx = - (channel->io_idx + 1) & (LCS_NUM_BUFFS - 1); - } - } - - if ((irb->scsw.cmd.dstat & DEV_STAT_DEV_END) || - (irb->scsw.cmd.dstat & DEV_STAT_CHN_END) || - (irb->scsw.cmd.dstat & DEV_STAT_UNIT_CHECK)) - /* Mark channel as stopped. */ - channel->state = LCS_CH_STATE_STOPPED; - else if (irb->scsw.cmd.actl & SCSW_ACTL_SUSPENDED) - /* CCW execution stopped on a suspend bit. */ - channel->state = LCS_CH_STATE_SUSPENDED; - if (irb->scsw.cmd.fctl & SCSW_FCTL_HALT_FUNC) { - if (irb->scsw.cmd.cc != 0) { - ccw_device_halt(channel->ccwdev, 0); - return; - } - /* The channel has been stopped by halt_IO. 
*/ - channel->state = LCS_CH_STATE_HALTED; - } - if (irb->scsw.cmd.fctl & SCSW_FCTL_CLEAR_FUNC) - channel->state = LCS_CH_STATE_CLEARED; - /* Do the rest in the tasklet. */ - tasklet_schedule(&channel->irq_tasklet); -} - -/* - * Tasklet for IRQ handler - */ -static void -lcs_tasklet(unsigned long data) -{ - unsigned long flags; - struct lcs_channel *channel; - struct lcs_buffer *iob; - int buf_idx; - - channel = (struct lcs_channel *) data; - LCS_DBF_TEXT_(5, trace, "tlet%s", dev_name(&channel->ccwdev->dev)); - - /* Check for processed buffers. */ - iob = channel->iob; - buf_idx = channel->buf_idx; - while (iob[buf_idx].state == LCS_BUF_STATE_PROCESSED) { - /* Do the callback thing. */ - if (iob[buf_idx].callback != NULL) - iob[buf_idx].callback(channel, iob + buf_idx); - buf_idx = (buf_idx + 1) & (LCS_NUM_BUFFS - 1); - } - channel->buf_idx = buf_idx; - - if (channel->state == LCS_CH_STATE_STOPPED) - lcs_start_channel(channel); - spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); - if (channel->state == LCS_CH_STATE_SUSPENDED && - channel->iob[channel->io_idx].state == LCS_BUF_STATE_READY) - __lcs_resume_channel(channel); - spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags); - - /* Something happened on the channel. Wake up waiters. */ - wake_up(&channel->wait_q); -} - -/* - * Finish current tx buffer and make it ready for transmit. - */ -static void -__lcs_emit_txbuffer(struct lcs_card *card) -{ - LCS_DBF_TEXT(5, trace, "emittx"); - *(__u16 *)(card->tx_buffer->data + card->tx_buffer->count) = 0; - card->tx_buffer->count += 2; - lcs_ready_buffer(&card->write, card->tx_buffer); - card->tx_buffer = NULL; - card->tx_emitted++; -} - -/* - * Callback for finished tx buffers. - */ -static void -lcs_txbuffer_cb(struct lcs_channel *channel, struct lcs_buffer *buffer) -{ - struct lcs_card *card; - - LCS_DBF_TEXT(5, trace, "txbuffcb"); - /* Put buffer back to pool. 
*/ - lcs_release_buffer(channel, buffer); - card = container_of(channel, struct lcs_card, write); - if (netif_queue_stopped(card->dev) && netif_carrier_ok(card->dev)) - netif_wake_queue(card->dev); - spin_lock(&card->lock); - card->tx_emitted--; - if (card->tx_emitted <= 0 && card->tx_buffer != NULL) - /* - * Last running tx buffer has finished. Submit partially - * filled current buffer. - */ - __lcs_emit_txbuffer(card); - spin_unlock(&card->lock); -} - -/* - * Packet transmit function called by network stack - */ -static netdev_tx_t __lcs_start_xmit(struct lcs_card *card, struct sk_buff *skb, - struct net_device *dev) -{ - struct lcs_header *header; - int rc = NETDEV_TX_OK; - - LCS_DBF_TEXT(5, trace, "hardxmit"); - if (skb == NULL) { - card->stats.tx_dropped++; - card->stats.tx_errors++; - return NETDEV_TX_OK; - } - if (card->state != DEV_STATE_UP) { - dev_kfree_skb(skb); - card->stats.tx_dropped++; - card->stats.tx_errors++; - card->stats.tx_carrier_errors++; - return NETDEV_TX_OK; - } - if (skb->protocol == htons(ETH_P_IPV6)) { - dev_kfree_skb(skb); - return NETDEV_TX_OK; - } - netif_stop_queue(card->dev); - spin_lock(&card->lock); - if (card->tx_buffer != NULL && - card->tx_buffer->count + sizeof(struct lcs_header) + - skb->len + sizeof(u16) > LCS_IOBUFFERSIZE) - /* skb too big for current tx buffer. 
*/ - __lcs_emit_txbuffer(card); - if (card->tx_buffer == NULL) { - /* Get new tx buffer */ - card->tx_buffer = lcs_get_buffer(&card->write); - if (card->tx_buffer == NULL) { - card->stats.tx_dropped++; - rc = NETDEV_TX_BUSY; - goto out; - } - card->tx_buffer->callback = lcs_txbuffer_cb; - card->tx_buffer->count = 0; - } - header = (struct lcs_header *) - (card->tx_buffer->data + card->tx_buffer->count); - card->tx_buffer->count += skb->len + sizeof(struct lcs_header); - header->offset = card->tx_buffer->count; - header->type = card->lan_type; - header->slot = card->portno; - skb_copy_from_linear_data(skb, header + 1, skb->len); - spin_unlock(&card->lock); - card->stats.tx_bytes += skb->len; - card->stats.tx_packets++; - dev_kfree_skb(skb); - netif_wake_queue(card->dev); - spin_lock(&card->lock); - if (card->tx_emitted <= 0 && card->tx_buffer != NULL) - /* If this is the first tx buffer emit it immediately. */ - __lcs_emit_txbuffer(card); -out: - spin_unlock(&card->lock); - return rc; -} - -static netdev_tx_t lcs_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct lcs_card *card; - int rc; - - LCS_DBF_TEXT(5, trace, "pktxmit"); - card = (struct lcs_card *) dev->ml_priv; - rc = __lcs_start_xmit(card, skb, dev); - return rc; -} - -/* - * send startlan and lanstat command to make LCS device ready - */ -static int -lcs_startlan_auto(struct lcs_card *card) -{ - int rc; - - LCS_DBF_TEXT(2, trace, "strtauto"); - card->lan_type = LCS_FRAME_TYPE_ENET; - rc = lcs_send_startlan(card, LCS_INITIATOR_TCPIP); - if (rc == 0) - return 0; - - return -EIO; -} - -static int -lcs_startlan(struct lcs_card *card) -{ - int rc, i; - - LCS_DBF_TEXT(2, trace, "startlan"); - rc = 0; - if (card->portno != LCS_INVALID_PORT_NO) { - if (card->lan_type == LCS_FRAME_TYPE_AUTO) - rc = lcs_startlan_auto(card); - else - rc = lcs_send_startlan(card, LCS_INITIATOR_TCPIP); - } else { - for (i = 0; i <= 16; i++) { - card->portno = i; - if (card->lan_type != LCS_FRAME_TYPE_AUTO) - rc = 
lcs_send_startlan(card, - LCS_INITIATOR_TCPIP); - else - /* autodetecting lan type */ - rc = lcs_startlan_auto(card); - if (rc == 0) - break; - } - } - if (rc == 0) - return lcs_send_lanstat(card); - return rc; -} - -/* - * LCS detect function - * setup channels and make them I/O ready - */ -static int -lcs_detect(struct lcs_card *card) -{ - int rc = 0; - - LCS_DBF_TEXT(2, setup, "lcsdetct"); - /* start/reset card */ - if (card->dev) - netif_stop_queue(card->dev); - rc = lcs_stop_channels(card); - if (rc == 0) { - rc = lcs_start_channels(card); - if (rc == 0) { - rc = lcs_send_startup(card, LCS_INITIATOR_TCPIP); - if (rc == 0) - rc = lcs_startlan(card); - } - } - if (rc == 0) { - card->state = DEV_STATE_UP; - } else { - card->state = DEV_STATE_DOWN; - card->write.state = LCS_CH_STATE_INIT; - card->read.state = LCS_CH_STATE_INIT; - } - return rc; -} - -/* - * LCS Stop card - */ -static int -lcs_stopcard(struct lcs_card *card) -{ - int rc; - - LCS_DBF_TEXT(3, setup, "stopcard"); - - if (card->read.state != LCS_CH_STATE_STOPPED && - card->write.state != LCS_CH_STATE_STOPPED && - card->read.state != LCS_CH_STATE_ERROR && - card->write.state != LCS_CH_STATE_ERROR && - card->state == DEV_STATE_UP) { - lcs_clear_multicast_list(card); - rc = lcs_send_stoplan(card,LCS_INITIATOR_TCPIP); - rc = lcs_send_shutdown(card); - } - rc = lcs_stop_channels(card); - card->state = DEV_STATE_DOWN; - - return rc; -} - -/* - * Kernel Thread helper functions for LGW initiated commands - */ -static void -lcs_start_kernel_thread(struct work_struct *work) -{ - struct lcs_card *card = container_of(work, struct lcs_card, kernel_thread_starter); - LCS_DBF_TEXT(5, trace, "krnthrd"); - if (lcs_do_start_thread(card, LCS_RECOVERY_THREAD)) - kthread_run(lcs_recovery, card, "lcs_recover"); -#ifdef CONFIG_IP_MULTICAST - if (lcs_do_start_thread(card, LCS_SET_MC_THREAD)) - kthread_run(lcs_register_mc_addresses, card, "regipm"); -#endif -} - -/* - * Process control frames. 
- */ -static void -lcs_get_control(struct lcs_card *card, struct lcs_cmd *cmd) -{ - LCS_DBF_TEXT(5, trace, "getctrl"); - if (cmd->initiator == LCS_INITIATOR_LGW) { - switch(cmd->cmd_code) { - case LCS_CMD_STARTUP: - case LCS_CMD_STARTLAN: - lcs_schedule_recovery(card); - break; - case LCS_CMD_STOPLAN: - if (card->dev) { - pr_warn("Stoplan for %s initiated by LGW\n", - card->dev->name); - netif_carrier_off(card->dev); - } - break; - default: - LCS_DBF_TEXT(5, trace, "noLGWcmd"); - break; - } - } else - lcs_notify_lancmd_waiters(card, cmd); -} - -/* - * Unpack network packet. - */ -static void -lcs_get_skb(struct lcs_card *card, char *skb_data, unsigned int skb_len) -{ - struct sk_buff *skb; - - LCS_DBF_TEXT(5, trace, "getskb"); - if (card->dev == NULL || - card->state != DEV_STATE_UP) - /* The card isn't up. Ignore the packet. */ - return; - - skb = dev_alloc_skb(skb_len); - if (skb == NULL) { - dev_err(&card->dev->dev, - " Allocating a socket buffer to interface %s failed\n", - card->dev->name); - card->stats.rx_dropped++; - return; - } - skb_put_data(skb, skb_data, skb_len); - skb->protocol = card->lan_type_trans(skb, card->dev); - card->stats.rx_bytes += skb_len; - card->stats.rx_packets++; - if (skb->protocol == htons(ETH_P_802_2)) - *((__u32 *)skb->cb) = ++card->pkt_seq; - netif_rx(skb); -} - -/* - * LCS main routine to get packets and lancmd replies from the buffers - */ -static void -lcs_get_frames_cb(struct lcs_channel *channel, struct lcs_buffer *buffer) -{ - struct lcs_card *card; - struct lcs_header *lcs_hdr; - __u16 offset; - - LCS_DBF_TEXT(5, trace, "lcsgtpkt"); - lcs_hdr = (struct lcs_header *) buffer->data; - if (lcs_hdr->offset == LCS_ILLEGAL_OFFSET) { - LCS_DBF_TEXT(4, trace, "-eiogpkt"); - return; - } - card = container_of(channel, struct lcs_card, read); - offset = 0; - while (lcs_hdr->offset != 0) { - if (lcs_hdr->offset <= 0 || - lcs_hdr->offset > LCS_IOBUFFERSIZE || - lcs_hdr->offset < offset) { - /* Offset invalid. 
*/ - card->stats.rx_length_errors++; - card->stats.rx_errors++; - return; - } - if (lcs_hdr->type == LCS_FRAME_TYPE_CONTROL) - lcs_get_control(card, (struct lcs_cmd *) lcs_hdr); - else if (lcs_hdr->type == LCS_FRAME_TYPE_ENET) - lcs_get_skb(card, (char *)(lcs_hdr + 1), - lcs_hdr->offset - offset - - sizeof(struct lcs_header)); - else - dev_info_once(&card->dev->dev, - "Unknown frame type %d\n", - lcs_hdr->type); - offset = lcs_hdr->offset; - lcs_hdr->offset = LCS_ILLEGAL_OFFSET; - lcs_hdr = (struct lcs_header *) (buffer->data + offset); - } - /* The buffer is now empty. Make it ready again. */ - lcs_ready_buffer(&card->read, buffer); -} - -/* - * get network statistics for ifconfig and other user programs - */ -static struct net_device_stats * -lcs_getstats(struct net_device *dev) -{ - struct lcs_card *card; - - LCS_DBF_TEXT(4, trace, "netstats"); - card = (struct lcs_card *) dev->ml_priv; - return &card->stats; -} - -/* - * stop lcs device - * This function will be called by user doing ifconfig xxx down - */ -static int -lcs_stop_device(struct net_device *dev) -{ - struct lcs_card *card; - int rc; - - LCS_DBF_TEXT(2, trace, "stopdev"); - card = (struct lcs_card *) dev->ml_priv; - netif_carrier_off(dev); - netif_tx_disable(dev); - dev->flags &= ~IFF_UP; - wait_event(card->write.wait_q, - (card->write.state != LCS_CH_STATE_RUNNING)); - rc = lcs_stopcard(card); - if (rc) - dev_err(&card->dev->dev, - " Shutting down the LCS device failed\n"); - return rc; -} - -/* - * start lcs device and make it runnable - * This function will be called by user doing ifconfig xxx up - */ -static int -lcs_open_device(struct net_device *dev) -{ - struct lcs_card *card; - int rc; - - LCS_DBF_TEXT(2, trace, "opendev"); - card = (struct lcs_card *) dev->ml_priv; - /* initialize statistics */ - rc = lcs_detect(card); - if (rc) { - pr_err("Error in opening device!\n"); - - } else { - dev->flags |= IFF_UP; - netif_carrier_on(dev); - netif_wake_queue(dev); - card->state = DEV_STATE_UP; - } - 
return rc; -} - -/* - * show function for portno called by cat or similar things - */ -static ssize_t -lcs_portno_show (struct device *dev, struct device_attribute *attr, char *buf) -{ - struct lcs_card *card; - - card = dev_get_drvdata(dev); - - if (!card) - return 0; - - return sysfs_emit(buf, "%d\n", card->portno); -} - -/* - * store the value which is piped to file portno - */ -static ssize_t -lcs_portno_store (struct device *dev, struct device_attribute *attr, const char *buf, size_t count) -{ - struct lcs_card *card; - int rc; - s16 value; - - card = dev_get_drvdata(dev); - - if (!card) - return 0; - - rc = kstrtos16(buf, 0, &value); - if (rc) - return -EINVAL; - /* TODO: sanity checks */ - card->portno = value; - if (card->dev) - card->dev->dev_port = card->portno; - - return count; - -} - -static DEVICE_ATTR(portno, 0644, lcs_portno_show, lcs_portno_store); - -static const char *lcs_type[] = { - "not a channel", - "2216 parallel", - "2216 channel", - "OSA LCS card", - "unknown channel type", - "unsupported channel type", -}; - -static ssize_t -lcs_type_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct ccwgroup_device *cgdev; - - cgdev = to_ccwgroupdev(dev); - if (!cgdev) - return -ENODEV; - - return sysfs_emit(buf, "%s\n", - lcs_type[cgdev->cdev[0]->id.driver_info]); -} - -static DEVICE_ATTR(type, 0444, lcs_type_show, NULL); - -static ssize_t -lcs_timeout_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct lcs_card *card; - - card = dev_get_drvdata(dev); - - return card ? 
sysfs_emit(buf, "%u\n", card->lancmd_timeout) : 0; -} - -static ssize_t -lcs_timeout_store (struct device *dev, struct device_attribute *attr, const char *buf, size_t count) -{ - struct lcs_card *card; - unsigned int value; - int rc; - - card = dev_get_drvdata(dev); - - if (!card) - return 0; - - rc = kstrtouint(buf, 0, &value); - if (rc) - return -EINVAL; - /* TODO: sanity checks */ - card->lancmd_timeout = value; - - return count; - -} - -static DEVICE_ATTR(lancmd_timeout, 0644, lcs_timeout_show, lcs_timeout_store); - -static ssize_t -lcs_dev_recover_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct lcs_card *card = dev_get_drvdata(dev); - char *tmp; - int i; - - if (!card) - return -EINVAL; - if (card->state != DEV_STATE_UP) - return -EPERM; - i = simple_strtoul(buf, &tmp, 16); - if (i == 1) - lcs_schedule_recovery(card); - return count; -} - -static DEVICE_ATTR(recover, 0200, NULL, lcs_dev_recover_store); - -static struct attribute * lcs_attrs[] = { - &dev_attr_portno.attr, - &dev_attr_type.attr, - &dev_attr_lancmd_timeout.attr, - &dev_attr_recover.attr, - NULL, -}; -static struct attribute_group lcs_attr_group = { - .attrs = lcs_attrs, -}; -static const struct attribute_group *lcs_attr_groups[] = { - &lcs_attr_group, - NULL, -}; -static const struct device_type lcs_devtype = { - .name = "lcs", - .groups = lcs_attr_groups, -}; - -/* - * lcs_probe_device is called on establishing a new ccwgroup_device. 
- */ -static int -lcs_probe_device(struct ccwgroup_device *ccwgdev) -{ - struct lcs_card *card; - - if (!get_device(&ccwgdev->dev)) - return -ENODEV; - - LCS_DBF_TEXT(2, setup, "add_dev"); - card = lcs_alloc_card(); - if (!card) { - LCS_DBF_TEXT_(2, setup, " rc%d", -ENOMEM); - put_device(&ccwgdev->dev); - return -ENOMEM; - } - dev_set_drvdata(&ccwgdev->dev, card); - ccwgdev->cdev[0]->handler = lcs_irq; - ccwgdev->cdev[1]->handler = lcs_irq; - card->gdev = ccwgdev; - INIT_WORK(&card->kernel_thread_starter, lcs_start_kernel_thread); - card->thread_start_mask = 0; - card->thread_allowed_mask = 0; - card->thread_running_mask = 0; - ccwgdev->dev.type = &lcs_devtype; - - return 0; -} - -static int -lcs_register_netdev(struct ccwgroup_device *ccwgdev) -{ - struct lcs_card *card; - - LCS_DBF_TEXT(2, setup, "regnetdv"); - card = dev_get_drvdata(&ccwgdev->dev); - if (card->dev->reg_state != NETREG_UNINITIALIZED) - return 0; - SET_NETDEV_DEV(card->dev, &ccwgdev->dev); - return register_netdev(card->dev); -} - -/* - * lcs_new_device will be called by setting the group device online. 
- */ -static const struct net_device_ops lcs_netdev_ops = { - .ndo_open = lcs_open_device, - .ndo_stop = lcs_stop_device, - .ndo_get_stats = lcs_getstats, - .ndo_start_xmit = lcs_start_xmit, -}; - -static const struct net_device_ops lcs_mc_netdev_ops = { - .ndo_open = lcs_open_device, - .ndo_stop = lcs_stop_device, - .ndo_get_stats = lcs_getstats, - .ndo_start_xmit = lcs_start_xmit, - .ndo_set_rx_mode = lcs_set_multicast_list, -}; - -static int -lcs_new_device(struct ccwgroup_device *ccwgdev) -{ - struct lcs_card *card; - struct net_device *dev=NULL; - enum lcs_dev_states recover_state; - int rc; - - card = dev_get_drvdata(&ccwgdev->dev); - if (!card) - return -ENODEV; - - LCS_DBF_TEXT(2, setup, "newdev"); - LCS_DBF_HEX(3, setup, &card, sizeof(void*)); - card->read.ccwdev = ccwgdev->cdev[0]; - card->write.ccwdev = ccwgdev->cdev[1]; - - recover_state = card->state; - rc = ccw_device_set_online(card->read.ccwdev); - if (rc) - goto out_err; - rc = ccw_device_set_online(card->write.ccwdev); - if (rc) - goto out_werr; - - LCS_DBF_TEXT(3, setup, "lcsnewdv"); - - lcs_setup_card(card); - rc = lcs_detect(card); - if (rc) { - LCS_DBF_TEXT(2, setup, "dtctfail"); - dev_err(&ccwgdev->dev, - "Detecting a network adapter for LCS devices" - " failed with rc=%d (0x%x)\n", rc, rc); - lcs_stopcard(card); - goto out; - } - if (card->dev) { - LCS_DBF_TEXT(2, setup, "samedev"); - LCS_DBF_HEX(3, setup, &card, sizeof(void*)); - goto netdev_out; - } - switch (card->lan_type) { - case LCS_FRAME_TYPE_ENET: - card->lan_type_trans = eth_type_trans; - dev = alloc_etherdev(0); - break; - default: - LCS_DBF_TEXT(3, setup, "errinit"); - pr_err(" Initialization failed\n"); - goto out; - } - if (!dev) - goto out; - card->dev = dev; - card->dev->ml_priv = card; - card->dev->netdev_ops = &lcs_netdev_ops; - card->dev->dev_port = card->portno; - eth_hw_addr_set(card->dev, card->mac); -#ifdef CONFIG_IP_MULTICAST - if (!lcs_check_multicast_support(card)) - card->dev->netdev_ops = &lcs_mc_netdev_ops; 
-#endif -netdev_out: - lcs_set_allowed_threads(card,0xffffffff); - if (recover_state == DEV_STATE_RECOVER) { - lcs_set_multicast_list(card->dev); - card->dev->flags |= IFF_UP; - netif_carrier_on(card->dev); - netif_wake_queue(card->dev); - card->state = DEV_STATE_UP; - } else { - lcs_stopcard(card); - } - - if (lcs_register_netdev(ccwgdev) != 0) - goto out; - - /* Print out supported assists: IPv6 */ - pr_info("LCS device %s %s IPv6 support\n", card->dev->name, - (card->ip_assists_supported & LCS_IPASS_IPV6_SUPPORT) ? - "with" : "without"); - /* Print out supported assist: Multicast */ - pr_info("LCS device %s %s Multicast support\n", card->dev->name, - (card->ip_assists_supported & LCS_IPASS_MULTICAST_SUPPORT) ? - "with" : "without"); - return 0; -out: - - ccw_device_set_offline(card->write.ccwdev); -out_werr: - ccw_device_set_offline(card->read.ccwdev); -out_err: - return -ENODEV; -} - -/* - * lcs_shutdown_device, called when setting the group device offline. - */ -static int -__lcs_shutdown_device(struct ccwgroup_device *ccwgdev, int recovery_mode) -{ - struct lcs_card *card; - enum lcs_dev_states recover_state; - int ret = 0, ret2 = 0, ret3 = 0; - - LCS_DBF_TEXT(3, setup, "shtdndev"); - card = dev_get_drvdata(&ccwgdev->dev); - if (!card) - return -ENODEV; - if (recovery_mode == 0) { - lcs_set_allowed_threads(card, 0); - if (lcs_wait_for_threads(card, LCS_SET_MC_THREAD)) - return -ERESTARTSYS; - } - LCS_DBF_HEX(3, setup, &card, sizeof(void*)); - recover_state = card->state; - - ret = lcs_stop_device(card->dev); - ret2 = ccw_device_set_offline(card->read.ccwdev); - ret3 = ccw_device_set_offline(card->write.ccwdev); - if (!ret) - ret = (ret2) ? 
ret2 : ret3; - if (ret) - LCS_DBF_TEXT_(3, setup, "1err:%d", ret); - if (recover_state == DEV_STATE_UP) { - card->state = DEV_STATE_RECOVER; - } - return 0; -} - -static int -lcs_shutdown_device(struct ccwgroup_device *ccwgdev) -{ - return __lcs_shutdown_device(ccwgdev, 0); -} - -/* - * drive lcs recovery after startup and startlan initiated by Lan Gateway - */ -static int -lcs_recovery(void *ptr) -{ - struct lcs_card *card; - struct ccwgroup_device *gdev; - int rc; - - card = (struct lcs_card *) ptr; - - LCS_DBF_TEXT(4, trace, "recover1"); - if (!lcs_do_run_thread(card, LCS_RECOVERY_THREAD)) - return 0; - LCS_DBF_TEXT(4, trace, "recover2"); - gdev = card->gdev; - dev_warn(&gdev->dev, - "A recovery process has been started for the LCS device\n"); - rc = __lcs_shutdown_device(gdev, 1); - rc = lcs_new_device(gdev); - if (!rc) - pr_info("Device %s successfully recovered!\n", - card->dev->name); - else - pr_info("Device %s could not be recovered!\n", - card->dev->name); - lcs_clear_thread_running_bit(card, LCS_RECOVERY_THREAD); - return 0; -} - -/* - * lcs_remove_device, free buffers and card - */ -static void -lcs_remove_device(struct ccwgroup_device *ccwgdev) -{ - struct lcs_card *card; - - card = dev_get_drvdata(&ccwgdev->dev); - if (!card) - return; - - LCS_DBF_TEXT(3, setup, "remdev"); - LCS_DBF_HEX(3, setup, &card, sizeof(void*)); - if (ccwgdev->state == CCWGROUP_ONLINE) { - lcs_shutdown_device(ccwgdev); - } - if (card->dev) - unregister_netdev(card->dev); - lcs_cleanup_card(card); - lcs_free_card(card); - dev_set_drvdata(&ccwgdev->dev, NULL); - put_device(&ccwgdev->dev); -} - -static struct ccw_device_id lcs_ids[] = { - {CCW_DEVICE(0x3088, 0x08), .driver_info = lcs_channel_type_parallel}, - {CCW_DEVICE(0x3088, 0x1f), .driver_info = lcs_channel_type_2216}, - {CCW_DEVICE(0x3088, 0x60), .driver_info = lcs_channel_type_osa2}, - {}, -}; -MODULE_DEVICE_TABLE(ccw, lcs_ids); - -static struct ccw_driver lcs_ccw_driver = { - .driver = { - .owner = THIS_MODULE, - .name = 
"lcs", - }, - .ids = lcs_ids, - .probe = ccwgroup_probe_ccwdev, - .remove = ccwgroup_remove_ccwdev, - .int_class = IRQIO_LCS, -}; - -/* - * LCS ccwgroup driver registration - */ -static struct ccwgroup_driver lcs_group_driver = { - .driver = { - .owner = THIS_MODULE, - .name = "lcs", - }, - .ccw_driver = &lcs_ccw_driver, - .setup = lcs_probe_device, - .remove = lcs_remove_device, - .set_online = lcs_new_device, - .set_offline = lcs_shutdown_device, -}; - -static ssize_t group_store(struct device_driver *ddrv, const char *buf, - size_t count) -{ - int err; - err = ccwgroup_create_dev(lcs_root_dev, &lcs_group_driver, 2, buf); - return err ? err : count; -} -static DRIVER_ATTR_WO(group); - -static struct attribute *lcs_drv_attrs[] = { - &driver_attr_group.attr, - NULL, -}; -static struct attribute_group lcs_drv_attr_group = { - .attrs = lcs_drv_attrs, -}; -static const struct attribute_group *lcs_drv_attr_groups[] = { - &lcs_drv_attr_group, - NULL, -}; - -/* - * LCS Module/Kernel initialization function - */ -static int -__init lcs_init_module(void) -{ - int rc; - - pr_info("Loading %s\n", version); - rc = lcs_register_debug_facility(); - LCS_DBF_TEXT(0, setup, "lcsinit"); - if (rc) - goto out_err; - lcs_root_dev = root_device_register("lcs"); - rc = PTR_ERR_OR_ZERO(lcs_root_dev); - if (rc) - goto register_err; - rc = ccw_driver_register(&lcs_ccw_driver); - if (rc) - goto ccw_err; - lcs_group_driver.driver.groups = lcs_drv_attr_groups; - rc = ccwgroup_driver_register(&lcs_group_driver); - if (rc) - goto ccwgroup_err; - return 0; - -ccwgroup_err: - ccw_driver_unregister(&lcs_ccw_driver); -ccw_err: - root_device_unregister(lcs_root_dev); -register_err: - lcs_unregister_debug_facility(); -out_err: - pr_err("Initializing the lcs device driver failed\n"); - return rc; -} - - -/* - * LCS module cleanup function - */ -static void -__exit lcs_cleanup_module(void) -{ - pr_info("Terminating lcs module.\n"); - LCS_DBF_TEXT(0, trace, "cleanup"); - 
ccwgroup_driver_unregister(&lcs_group_driver); - ccw_driver_unregister(&lcs_ccw_driver); - root_device_unregister(lcs_root_dev); - lcs_unregister_debug_facility(); -} - -module_init(lcs_init_module); -module_exit(lcs_cleanup_module); - -MODULE_AUTHOR("Frank Pavlic "); -MODULE_DESCRIPTION("S/390 LAN channel station device driver"); -MODULE_LICENSE("GPL"); - diff --git a/drivers/s390/net/lcs.h b/drivers/s390/net/lcs.h deleted file mode 100644 index a2699b70b050..000000000000 --- a/drivers/s390/net/lcs.h +++ /dev/null @@ -1,342 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/*lcs.h*/ - -#include -#include -#include -#include -#include -#include - -#define LCS_DBF_TEXT(level, name, text) \ - do { \ - debug_text_event(lcs_dbf_##name, level, text); \ - } while (0) - -#define LCS_DBF_HEX(level,name,addr,len) \ -do { \ - debug_event(lcs_dbf_##name,level,(void*)(addr),len); \ -} while (0) - -#define LCS_DBF_TEXT_(level,name,text...) \ - do { \ - if (debug_level_enabled(lcs_dbf_##name, level)) { \ - scnprintf(debug_buffer, sizeof(debug_buffer), text); \ - debug_text_event(lcs_dbf_##name, level, debug_buffer); \ - } \ - } while (0) - -/** - * sysfs related stuff - */ -#define CARD_FROM_DEV(cdev) \ - (struct lcs_card *) dev_get_drvdata( \ - &((struct ccwgroup_device *)dev_get_drvdata(&cdev->dev))->dev); - -/** - * Enum for classifying detected devices. 
- */ -enum lcs_channel_types { - /* Device is not a channel */ - lcs_channel_type_none, - - /* Device is a 2216 channel */ - lcs_channel_type_parallel, - - /* Device is a 2216 channel */ - lcs_channel_type_2216, - - /* Device is a OSA2 card */ - lcs_channel_type_osa2 -}; - -/** - * CCW commands used in this driver - */ -#define LCS_CCW_WRITE 0x01 -#define LCS_CCW_READ 0x02 -#define LCS_CCW_TRANSFER 0x08 - -/** - * LCS device status primitives - */ -#define LCS_CMD_STARTLAN 0x01 -#define LCS_CMD_STOPLAN 0x02 -#define LCS_CMD_LANSTAT 0x04 -#define LCS_CMD_STARTUP 0x07 -#define LCS_CMD_SHUTDOWN 0x08 -#define LCS_CMD_QIPASSIST 0xb2 -#define LCS_CMD_SETIPM 0xb4 -#define LCS_CMD_DELIPM 0xb5 - -#define LCS_INITIATOR_TCPIP 0x00 -#define LCS_INITIATOR_LGW 0x01 -#define LCS_STD_CMD_SIZE 16 -#define LCS_MULTICAST_CMD_SIZE 404 - -/** - * LCS IPASSIST MASKS,only used when multicast is switched on - */ -/* Not supported by LCS */ -#define LCS_IPASS_ARP_PROCESSING 0x0001 -#define LCS_IPASS_IN_CHECKSUM_SUPPORT 0x0002 -#define LCS_IPASS_OUT_CHECKSUM_SUPPORT 0x0004 -#define LCS_IPASS_IP_FRAG_REASSEMBLY 0x0008 -#define LCS_IPASS_IP_FILTERING 0x0010 -/* Supported by lcs 3172 */ -#define LCS_IPASS_IPV6_SUPPORT 0x0020 -#define LCS_IPASS_MULTICAST_SUPPORT 0x0040 - -/** - * LCS sense byte definitions - */ -#define LCS_SENSE_BYTE_0 0 -#define LCS_SENSE_BYTE_1 1 -#define LCS_SENSE_BYTE_2 2 -#define LCS_SENSE_BYTE_3 3 -#define LCS_SENSE_INTERFACE_DISCONNECT 0x01 -#define LCS_SENSE_EQUIPMENT_CHECK 0x10 -#define LCS_SENSE_BUS_OUT_CHECK 0x20 -#define LCS_SENSE_INTERVENTION_REQUIRED 0x40 -#define LCS_SENSE_CMD_REJECT 0x80 -#define LCS_SENSE_RESETTING_EVENT 0x80 -#define LCS_SENSE_DEVICE_ONLINE 0x20 - -/** - * LCS packet type definitions - */ -#define LCS_FRAME_TYPE_CONTROL 0 -#define LCS_FRAME_TYPE_ENET 1 -#define LCS_FRAME_TYPE_TR 2 -#define LCS_FRAME_TYPE_FDDI 7 -#define LCS_FRAME_TYPE_AUTO -1 - -/** - * some more definitions,we will sort them later - */ -#define LCS_ILLEGAL_OFFSET 0xffff 
-#define LCS_IOBUFFERSIZE 0x5000 -#define LCS_NUM_BUFFS 32 /* needs to be power of 2 */ -#define LCS_MAC_LENGTH 6 -#define LCS_INVALID_PORT_NO -1 -#define LCS_LANCMD_TIMEOUT_DEFAULT 5 - -/** - * Multicast state - */ -#define LCS_IPM_STATE_SET_REQUIRED 0 -#define LCS_IPM_STATE_DEL_REQUIRED 1 -#define LCS_IPM_STATE_ON_CARD 2 - -/** - * LCS IP Assist declarations - * seems to be only used for multicast - */ -#define LCS_IPASS_ARP_PROCESSING 0x0001 -#define LCS_IPASS_INBOUND_CSUM_SUPP 0x0002 -#define LCS_IPASS_OUTBOUND_CSUM_SUPP 0x0004 -#define LCS_IPASS_IP_FRAG_REASSEMBLY 0x0008 -#define LCS_IPASS_IP_FILTERING 0x0010 -#define LCS_IPASS_IPV6_SUPPORT 0x0020 -#define LCS_IPASS_MULTICAST_SUPPORT 0x0040 - -/** - * LCS Buffer states - */ -enum lcs_buffer_states { - LCS_BUF_STATE_EMPTY, /* buffer is empty */ - LCS_BUF_STATE_LOCKED, /* buffer is locked, don't touch */ - LCS_BUF_STATE_READY, /* buffer is ready for read/write */ - LCS_BUF_STATE_PROCESSED, -}; - -/** - * LCS Channel State Machine declarations - */ -enum lcs_channel_states { - LCS_CH_STATE_INIT, - LCS_CH_STATE_HALTED, - LCS_CH_STATE_STOPPED, - LCS_CH_STATE_RUNNING, - LCS_CH_STATE_SUSPENDED, - LCS_CH_STATE_CLEARED, - LCS_CH_STATE_ERROR, -}; - -/** - * LCS device state machine - */ -enum lcs_dev_states { - DEV_STATE_DOWN, - DEV_STATE_UP, - DEV_STATE_RECOVER, -}; - -enum lcs_threads { - LCS_SET_MC_THREAD = 1, - LCS_RECOVERY_THREAD = 2, -}; - -/** - * LCS struct declarations - */ -struct lcs_header { - __u16 offset; - __u8 type; - __u8 slot; -} __attribute__ ((packed)); - -struct lcs_ip_mac_pair { - __be32 ip_addr; - __u8 mac_addr[LCS_MAC_LENGTH]; - __u8 reserved[2]; -} __attribute__ ((packed)); - -struct lcs_ipm_list { - struct list_head list; - struct lcs_ip_mac_pair ipm; - __u8 ipm_state; -}; - -struct lcs_cmd { - __u16 offset; - __u8 type; - __u8 slot; - __u8 cmd_code; - __u8 initiator; - __u16 sequence_no; - __u16 return_code; - union { - struct { - __u8 lan_type; - __u8 portno; - __u16 parameter_count; - __u8 
operator_flags[3]; - __u8 reserved[3]; - } lcs_std_cmd; - struct { - __u16 unused1; - __u16 buff_size; - __u8 unused2[6]; - } lcs_startup; - struct { - __u8 lan_type; - __u8 portno; - __u8 unused[10]; - __u8 mac_addr[LCS_MAC_LENGTH]; - __u32 num_packets_deblocked; - __u32 num_packets_blocked; - __u32 num_packets_tx_on_lan; - __u32 num_tx_errors_detected; - __u32 num_tx_packets_disgarded; - __u32 num_packets_rx_from_lan; - __u32 num_rx_errors_detected; - __u32 num_rx_discarded_nobuffs_avail; - __u32 num_rx_packets_too_large; - } lcs_lanstat_cmd; -#ifdef CONFIG_IP_MULTICAST - struct { - __u8 lan_type; - __u8 portno; - __u16 num_ip_pairs; - __u16 ip_assists_supported; - __u16 ip_assists_enabled; - __u16 version; - struct { - struct lcs_ip_mac_pair - ip_mac_pair[32]; - __u32 response_data; - } lcs_ipass_ctlmsg __attribute ((packed)); - } lcs_qipassist __attribute__ ((packed)); -#endif /*CONFIG_IP_MULTICAST */ - } cmd __attribute__ ((packed)); -} __attribute__ ((packed)); - -/** - * Forward declarations. - */ -struct lcs_card; -struct lcs_channel; - -/** - * Definition of an lcs buffer. - */ -struct lcs_buffer { - enum lcs_buffer_states state; - void *data; - int count; - /* Callback for completion notification. */ - void (*callback)(struct lcs_channel *, struct lcs_buffer *); -}; - -struct lcs_reply { - struct list_head list; - __u16 sequence_no; - refcount_t refcnt; - /* Callback for completion notification. 
*/ - void (*callback)(struct lcs_card *, struct lcs_cmd *); - wait_queue_head_t wait_q; - struct lcs_card *card; - struct timer_list timer; - int received; - int rc; -}; - -/** - * Definition of an lcs channel - */ -struct lcs_channel { - enum lcs_channel_states state; - struct ccw_device *ccwdev; - struct ccw1 ccws[LCS_NUM_BUFFS + 1]; - wait_queue_head_t wait_q; - struct tasklet_struct irq_tasklet; - struct lcs_buffer iob[LCS_NUM_BUFFS]; - int io_idx; - int buf_idx; -}; - - -/** - * definition of the lcs card - */ -struct lcs_card { - spinlock_t lock; - spinlock_t ipm_lock; - enum lcs_dev_states state; - struct net_device *dev; - struct net_device_stats stats; - __be16 (*lan_type_trans)(struct sk_buff *skb, - struct net_device *dev); - struct ccwgroup_device *gdev; - struct lcs_channel read; - struct lcs_channel write; - struct lcs_buffer *tx_buffer; - int tx_emitted; - struct list_head lancmd_waiters; - int lancmd_timeout; - - struct work_struct kernel_thread_starter; - spinlock_t mask_lock; - unsigned long thread_start_mask; - unsigned long thread_running_mask; - unsigned long thread_allowed_mask; - wait_queue_head_t wait_q; - -#ifdef CONFIG_IP_MULTICAST - struct list_head ipm_list; -#endif - __u8 mac[LCS_MAC_LENGTH]; - __u16 ip_assists_supported; - __u16 ip_assists_enabled; - __s8 lan_type; - __u32 pkt_seq; - __u16 sequence_no; - __s16 portno; - /* Some info copied from probeinfo */ - u8 device_forced; - u8 max_port_no; - u8 hint_port_no; - s16 port_protocol_no; -} __attribute__ ((aligned(8))); - From cbe08724c18078564abefbf6591078a7c98e5e0f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Feb 2025 14:48:25 +0000 Subject: [PATCH 23/63] net: flush_backlog() small changes Add READ_ONCE() around reads of skb->dev->reg_state, because this field can be changed from other threads/cpus. 
Instead of calling dev_kfree_skb_irq() and kfree_skb() while interrupts are masked and locks held, use a temporary list and use __skb_queue_purge_reason(). Use SKB_DROP_REASON_DEV_READY drop reason to better describe why these skbs are dropped. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250204144825.316785-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 2b141f20b13b..c41d1e1cbf62 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6119,16 +6119,18 @@ EXPORT_SYMBOL(netif_receive_skb_list); static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; + struct sk_buff_head list; struct softnet_data *sd; + __skb_queue_head_init(&list); local_bh_disable(); sd = this_cpu_ptr(&softnet_data); backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); - dev_kfree_skb_irq(skb); + __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } @@ -6136,14 +6138,16 @@ static void flush_backlog(struct work_struct *work) local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); - kfree_skb(skb); + __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); + + __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY); } static bool flush_required(int cpu) From faac69a4ae5abb49e62c79c66b51bb905c9aa5ec Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 4 Feb 2025 07:58:17 +0100 Subject: [PATCH 24/63] 
r8169: don't scan PHY addresses > 0 The PHY address is a dummy, because r8169 PHY access registers don't support a PHY address. Therefore scan address 0 only. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/830637dd-4016-4a68-92b3-618fcac6589d@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 5a5eba49c651..7306c8e323d7 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5222,6 +5222,7 @@ static int r8169_mdio_register(struct rtl8169_private *tp) new_bus->priv = tp; new_bus->parent = &pdev->dev; new_bus->irq[0] = PHY_MAC_INTERRUPT; + new_bus->phy_mask = GENMASK(31, 1); snprintf(new_bus->id, MII_BUS_ID_SIZE, "r8169-%x-%x", pci_domain_nr(pdev->bus), pci_dev_id(pdev)); From 50f37fc2a39c4a8cc4813629b4cf239b71c6097d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 4 Feb 2025 22:36:54 +0100 Subject: [PATCH 25/63] ipv4: ip_gre: Fix set but not used warning in ipgre_err() if IPv4-only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_NET_IPGRE is enabled, but CONFIG_IPV6 is disabled: net/ipv4/ip_gre.c: In function ‘ipgre_err’: net/ipv4/ip_gre.c:144:22: error: variable ‘data_len’ set but not used [-Werror=unused-but-set-variable] 144 | unsigned int data_len = 0; | ^~~~~~~~ Fix this by moving all data_len processing inside the IPV6-only section that uses its result. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501121007.2GofXmh5-lkp@intel.com/ Signed-off-by: Geert Uytterhoeven Reviewed-by: Simon Horman Link: https://patch.msgid.link/d09113cfe2bfaca02f3dddf832fb5f48dd20958b.1738704881.git.geert@linux-m68k.org Signed-off-by: Jakub Kicinski --- net/ipv4/ip_gre.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index ed1b6b44faf8..c9f11a046c26 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -141,7 +141,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - unsigned int data_len = 0; struct ip_tunnel *t; if (tpi->proto == htons(ETH_P_TEB)) @@ -182,7 +181,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; - data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: @@ -190,10 +188,16 @@ static int ipgre_err(struct sk_buff *skb, u32 info, } #if IS_ENABLED(CONFIG_IPV6) - if (tpi->proto == htons(ETH_P_IPV6) && - !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, - type, data_len)) - return 0; + if (tpi->proto == htons(ETH_P_IPV6)) { + unsigned int data_len = 0; + + if (type == ICMP_TIME_EXCEEDED) + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ + + if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, + type, data_len)) + return 0; + } #endif if (t->parms.iph.daddr == 0 || From f6205f8215f12a96518ac9469ff76294ae7bd612 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 4 Feb 2025 16:55:42 +0200 Subject: [PATCH 26/63] vxlan: Annotate FDB data races The 'used' and 'updated' fields in the FDB entry structure can be accessed concurrently by multiple threads, leading to reports such as [1]. Can be reproduced using [2]. 
Suppress these reports by annotating these accesses using READ_ONCE() / WRITE_ONCE(). [1] BUG: KCSAN: data-race in vxlan_xmit / vxlan_xmit write to 0xffff942604d263a8 of 8 bytes by task 286 on cpu 0: vxlan_xmit+0xb29/0x2380 dev_hard_start_xmit+0x84/0x2f0 __dev_queue_xmit+0x45a/0x1650 packet_xmit+0x100/0x150 packet_sendmsg+0x2114/0x2ac0 __sys_sendto+0x318/0x330 __x64_sys_sendto+0x76/0x90 x64_sys_call+0x14e8/0x1c00 do_syscall_64+0x9e/0x1a0 entry_SYSCALL_64_after_hwframe+0x77/0x7f read to 0xffff942604d263a8 of 8 bytes by task 287 on cpu 2: vxlan_xmit+0xadf/0x2380 dev_hard_start_xmit+0x84/0x2f0 __dev_queue_xmit+0x45a/0x1650 packet_xmit+0x100/0x150 packet_sendmsg+0x2114/0x2ac0 __sys_sendto+0x318/0x330 __x64_sys_sendto+0x76/0x90 x64_sys_call+0x14e8/0x1c00 do_syscall_64+0x9e/0x1a0 entry_SYSCALL_64_after_hwframe+0x77/0x7f value changed: 0x00000000fffbac6e -> 0x00000000fffbac6f Reported by Kernel Concurrency Sanitizer on: CPU: 2 UID: 0 PID: 287 Comm: mausezahn Not tainted 6.13.0-rc7-01544-gb4b270f11a02 #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-3.fc41 04/01/2014 [2] #!/bin/bash set +H echo whitelist > /sys/kernel/debug/kcsan echo !vxlan_xmit > /sys/kernel/debug/kcsan ip link add name vx0 up type vxlan id 10010 dstport 4789 local 192.0.2.1 bridge fdb add 00:11:22:33:44:55 dev vx0 self static dst 198.51.100.1 taskset -c 0 mausezahn vx0 -a own -b 00:11:22:33:44:55 -c 0 -q & taskset -c 2 mausezahn vx0 -a own -b 00:11:22:33:44:55 -c 0 -q & Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250204145549.1216254-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 05c10acb2a57..2f2c6606f719 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ 
-227,9 +227,9 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, be32_to_cpu(fdb->vni))) goto nla_put_failure; - ci.ndm_used = jiffies_to_clock_t(now - fdb->used); + ci.ndm_used = jiffies_to_clock_t(now - READ_ONCE(fdb->used)); ci.ndm_confirmed = 0; - ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); + ci.ndm_updated = jiffies_to_clock_t(now - READ_ONCE(fdb->updated)); ci.ndm_refcnt = 0; if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) @@ -434,8 +434,8 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, struct vxlan_fdb *f; f = __vxlan_find_mac(vxlan, mac, vni); - if (f && f->used != jiffies) - f->used = jiffies; + if (f && READ_ONCE(f->used) != jiffies) + WRITE_ONCE(f->used, jiffies); return f; } @@ -1009,12 +1009,12 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, !(f->flags & NTF_VXLAN_ADDED_BY_USER)) { if (f->state != state) { f->state = state; - f->updated = jiffies; + WRITE_ONCE(f->updated, jiffies); notify = 1; } if (f->flags != fdb_flags) { f->flags = fdb_flags; - f->updated = jiffies; + WRITE_ONCE(f->updated, jiffies); notify = 1; } } @@ -1048,7 +1048,7 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, } if (ndm_flags & NTF_USE) - f->used = jiffies; + WRITE_ONCE(f->used, jiffies); if (notify) { if (rd == NULL) @@ -1481,7 +1481,7 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, src_mac, &rdst->remote_ip.sa, &src_ip->sa); rdst->remote_ip = *src_ip; - f->updated = jiffies; + WRITE_ONCE(f->updated, jiffies); vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); } else { u32 hash_index = fdb_head_index(vxlan, src_mac, vni); @@ -2852,7 +2852,7 @@ static void vxlan_cleanup(struct timer_list *t) if (f->flags & NTF_EXT_LEARNED) continue; - timeout = f->used + vxlan->cfg.age_interval * HZ; + timeout = READ_ONCE(f->used) + vxlan->cfg.age_interval * HZ; if (time_before_eq(timeout, jiffies)) { netdev_dbg(vxlan->dev, "garbage collect %pM\n", From 
1370c45d6e7e3cbac4b6dc71f54fd6e167848900 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 4 Feb 2025 16:55:43 +0200 Subject: [PATCH 27/63] vxlan: Read jiffies once when updating FDB 'used' time Avoid two volatile reads in the data path. Instead, read jiffies once and only if an FDB entry was found. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250204145549.1216254-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 2f2c6606f719..676a93ce3a19 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -434,8 +434,12 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, struct vxlan_fdb *f; f = __vxlan_find_mac(vxlan, mac, vni); - if (f && READ_ONCE(f->used) != jiffies) - WRITE_ONCE(f->used, jiffies); + if (f) { + unsigned long now = jiffies; + + if (READ_ONCE(f->used) != now) + WRITE_ONCE(f->used, now); + } return f; } From c4f2082bf641d270fd518e8c218196eb26ac1c3c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 4 Feb 2025 16:55:44 +0200 Subject: [PATCH 28/63] vxlan: Always refresh FDB 'updated' time when learning is enabled Currently, when learning is enabled and a packet is received from the expected remote, the 'updated' field of the FDB entry is not refreshed. This will become a problem when we switch the VXLAN driver to age out entries based on the 'updated' field. Solve this by always refreshing an FDB entry when we receive a packet with a matching source MAC address, regardless of whether it was received via the expected remote or not, as it indicates the host is alive. This is consistent with the bridge driver's FDB. 
Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250204145549.1216254-4-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 676a93ce3a19..36cb06a56aca 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1466,6 +1466,10 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, f = vxlan_find_mac(vxlan, src_mac, vni); if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); + unsigned long now = jiffies; + + if (READ_ONCE(f->updated) != now) + WRITE_ONCE(f->updated, now); if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) && rdst->remote_ifindex == ifindex)) @@ -1485,7 +1489,6 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, src_mac, &rdst->remote_ip.sa, &src_ip->sa); rdst->remote_ip = *src_ip; - WRITE_ONCE(f->updated, jiffies); vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); } else { u32 hash_index = fdb_head_index(vxlan, src_mac, vni); From 40a9994f2fbddf299655073be947e9cfc57dfdf1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 4 Feb 2025 16:55:45 +0200 Subject: [PATCH 29/63] vxlan: Refresh FDB 'updated' time upon 'NTF_USE' The 'NTF_USE' flag can be used by user space to refresh FDB entries so that they will not age out. Currently, the VXLAN driver implements it by refreshing the 'used' field in the FDB entry as this is the field according to which FDB entries are aged out. Subsequent patches will switch the VXLAN driver to age out entries based on the 'updated' field. Prepare for this change by refreshing the 'updated' field upon 'NTF_USE'. 
This is consistent with the bridge driver's FDB: # ip link add name br1 up type bridge # ip link add name swp1 master br1 up type dummy # bridge fdb add 00:11:22:33:44:55 dev swp1 master dynamic vlan 1 # sleep 10 # bridge fdb replace 00:11:22:33:44:55 dev swp1 master dynamic vlan 1 # bridge -s -j fdb get 00:11:22:33:44:55 br br1 vlan 1 | jq '.[]["updated"]' 10 # sleep 10 # bridge fdb replace 00:11:22:33:44:55 dev swp1 master use dynamic vlan 1 # bridge -s -j fdb get 00:11:22:33:44:55 br br1 vlan 1 | jq '.[]["updated"]' 0 Before: # ip link add name vx1 up type vxlan id 10010 dstport 4789 # bridge fdb add 00:11:22:33:44:55 dev vx1 self dynamic dst 198.51.100.1 # sleep 10 # bridge fdb replace 00:11:22:33:44:55 dev vx1 self dynamic dst 198.51.100.1 # bridge -s -j -p fdb get 00:11:22:33:44:55 br vx1 self | jq '.[]["updated"]' 10 # sleep 10 # bridge fdb replace 00:11:22:33:44:55 dev vx1 self use dynamic dst 198.51.100.1 # bridge -s -j -p fdb get 00:11:22:33:44:55 br vx1 self | jq '.[]["updated"]' 20 After: # ip link add name vx1 up type vxlan id 10010 dstport 4789 # bridge fdb add 00:11:22:33:44:55 dev vx1 self dynamic dst 198.51.100.1 # sleep 10 # bridge fdb replace 00:11:22:33:44:55 dev vx1 self dynamic dst 198.51.100.1 # bridge -s -j -p fdb get 00:11:22:33:44:55 br vx1 self | jq '.[]["updated"]' 10 # sleep 10 # bridge fdb replace 00:11:22:33:44:55 dev vx1 self use dynamic dst 198.51.100.1 # bridge -s -j -p fdb get 00:11:22:33:44:55 br vx1 self | jq '.[]["updated"]' 0 Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250204145549.1216254-5-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 36cb06a56aca..c73138647110 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1051,8 +1051,10 @@ static int 
vxlan_fdb_update_existing(struct vxlan_dev *vxlan, notify |= rc; } - if (ndm_flags & NTF_USE) + if (ndm_flags & NTF_USE) { WRITE_ONCE(f->used, jiffies); + WRITE_ONCE(f->updated, jiffies); + } if (notify) { if (rd == NULL) From fb2f449eca514a2dc23fdd35a0973d343f028a92 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 4 Feb 2025 16:55:46 +0200 Subject: [PATCH 30/63] vxlan: Refresh FDB 'updated' time upon user space updates When a host migrates to a different remote and a packet is received from the new remote, the corresponding FDB entry is updated and its 'updated' time is refreshed. However, when user space replaces the remote of an FDB entry, its 'updated' time is not refreshed: # ip link add name vx1 up type vxlan id 10010 dstport 4789 # bridge fdb add 00:11:22:33:44:55 dev vx1 self dynamic dst 198.51.100.1 # sleep 10 # bridge -s -j -p fdb get 00:11:22:33:44:55 br vx1 self | jq '.[]["updated"]' 10 # bridge fdb replace 00:11:22:33:44:55 dev vx1 self dynamic dst 198.51.100.2 # bridge -s -j -p fdb get 00:11:22:33:44:55 br vx1 self | jq '.[]["updated"]' 10 This can lead to the entry being aged out prematurely and it is also inconsistent with the bridge driver: # ip link add name br1 up type bridge # ip link add name swp1 master br1 up type dummy # ip link add name swp2 master br1 up type dummy # bridge fdb add 00:11:22:33:44:55 dev swp1 master dynamic vlan 1 # sleep 10 # bridge -s -j fdb get 00:11:22:33:44:55 br br1 vlan 1 | jq '.[]["updated"]' 10 # bridge fdb replace 00:11:22:33:44:55 dev swp2 master dynamic vlan 1 # bridge -s -j fdb get 00:11:22:33:44:55 br br1 vlan 1 | jq '.[]["updated"]' 0 Adjust the VXLAN driver to refresh the 'updated' time of an FDB entry whenever one of its attributes is changed by user space: # ip link add name vx1 up type vxlan id 10010 dstport 4789 # bridge fdb add 00:11:22:33:44:55 dev vx1 self dynamic dst 198.51.100.1 # sleep 10 # bridge -s -j -p fdb get 00:11:22:33:44:55 br vx1 self | jq '.[]["updated"]' 10 # bridge fdb replace 
00:11:22:33:44:55 dev vx1 self dynamic dst 198.51.100.2 # bridge -s -j -p fdb get 00:11:22:33:44:55 br vx1 self | jq '.[]["updated"]' 0 Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250204145549.1216254-6-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index c73138647110..c75fcb0679ac 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1013,12 +1013,10 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, !(f->flags & NTF_VXLAN_ADDED_BY_USER)) { if (f->state != state) { f->state = state; - WRITE_ONCE(f->updated, jiffies); notify = 1; } if (f->flags != fdb_flags) { f->flags = fdb_flags; - WRITE_ONCE(f->updated, jiffies); notify = 1; } } @@ -1060,6 +1058,7 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, if (rd == NULL) rd = first_remote_rtnl(f); + WRITE_ONCE(f->updated, jiffies); err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH, swdev_notify, extack); if (err) From b4a1d98b0fa533939128436e24df441a5c025ea7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 4 Feb 2025 16:55:47 +0200 Subject: [PATCH 31/63] vxlan: Age out FDB entries based on 'updated' time Currently, the VXLAN driver ages out FDB entries based on their 'used' time which is refreshed by both the Tx and Rx paths. 
This means that an FDB entry will not age out if traffic is only forwarded to the target host: # ip link add name vx1 up type vxlan id 10010 local 192.0.2.1 dstport 4789 learning ageing 10 # bridge fdb add 00:11:22:33:44:55 dev vx1 self dynamic dst 198.51.100.1 # bridge fdb get 00:11:22:33:44:55 br vx1 self 00:11:22:33:44:55 dev vx1 dst 198.51.100.1 self # mausezahn vx1 -a own -b 00:11:22:33:44:55 -c 0 -p 100 -q & # sleep 20 # bridge fdb get 00:11:22:33:44:55 br vx1 self 00:11:22:33:44:55 dev vx1 dst 198.51.100.1 self This is wrong as an FDB entry will remain present when we no longer have an indication that the host is still behind the current remote. It is also inconsistent with the bridge driver: # ip link add name br1 up type bridge ageing_time $((10 * 100)) # ip link add name swp1 up master br1 type dummy # bridge fdb add 00:11:22:33:44:55 dev swp1 master dynamic # bridge fdb get 00:11:22:33:44:55 br br1 00:11:22:33:44:55 dev swp1 master br1 # mausezahn br1 -a own -b 00:11:22:33:44:55 -c 0 -p 100 -q & # sleep 20 # bridge fdb get 00:11:22:33:44:55 br br1 Error: Fdb entry not found. Solve this by aging out entries based on their 'updated' time, which is not refreshed by the Tx path: # ip link add name vx1 up type vxlan id 10010 local 192.0.2.1 dstport 4789 learning ageing 10 # bridge fdb add 00:11:22:33:44:55 dev vx1 self dynamic dst 198.51.100.1 # bridge fdb get 00:11:22:33:44:55 br vx1 self 00:11:22:33:44:55 dev vx1 dst 198.51.100.1 self # mausezahn vx1 -a own -b 00:11:22:33:44:55 -c 0 -p 100 -q & # sleep 20 # bridge fdb get 00:11:22:33:44:55 br vx1 self Error: Fdb entry not found. 
But is refreshed by the Rx path: # ip address add 192.0.2.1/32 dev lo # ip link add name vx1 up type vxlan id 10010 local 192.0.2.1 dstport 4789 localbypass # ip link add name vx2 up type vxlan id 20010 local 192.0.2.1 dstport 4789 learning ageing 10 # bridge fdb add 00:11:22:33:44:55 dev vx1 self static dst 127.0.0.1 vni 20010 # mausezahn vx1 -a 00:aa:bb:cc:dd:ee -b 00:11:22:33:44:55 -c 0 -p 100 -q & # sleep 20 # bridge fdb get 00:aa:bb:cc:dd:ee br vx2 self 00:aa:bb:cc:dd:ee dev vx2 dst 127.0.0.1 self # pkill mausezahn # sleep 20 # bridge fdb get 00:aa:bb:cc:dd:ee br vx2 self Error: Fdb entry not found. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250204145549.1216254-7-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index c75fcb0679ac..01797becae09 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2860,7 +2860,7 @@ static void vxlan_cleanup(struct timer_list *t) if (f->flags & NTF_EXT_LEARNED) continue; - timeout = READ_ONCE(f->used) + vxlan->cfg.age_interval * HZ; + timeout = READ_ONCE(f->updated) + vxlan->cfg.age_interval * HZ; if (time_before_eq(timeout, jiffies)) { netdev_dbg(vxlan->dev, "garbage collect %pM\n", From 9722f834fe9a7c583591defa2cab3f652f50a5f0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 4 Feb 2025 16:55:48 +0200 Subject: [PATCH 32/63] vxlan: Avoid unnecessary updates to FDB 'used' time Now that the VXLAN driver ages out FDB entries based on their 'updated' time we can remove unnecessary updates of the 'used' time from the Rx path and the control path, so that the 'used' time is only updated by the Tx path. 
Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250204145549.1216254-8-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 01797becae09..ece5415f9013 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1049,10 +1049,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, notify |= rc; } - if (ndm_flags & NTF_USE) { - WRITE_ONCE(f->used, jiffies); + if (ndm_flags & NTF_USE) WRITE_ONCE(f->updated, jiffies); - } if (notify) { if (rd == NULL) @@ -1297,7 +1295,7 @@ int __vxlan_fdb_delete(struct vxlan_dev *vxlan, struct vxlan_fdb *f; int err = -ENOENT; - f = vxlan_find_mac(vxlan, addr, src_vni); + f = __vxlan_find_mac(vxlan, addr, src_vni); if (!f) return err; @@ -1464,7 +1462,7 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, ifindex = src_ifindex; #endif - f = vxlan_find_mac(vxlan, src_mac, vni); + f = __vxlan_find_mac(vxlan, src_mac, vni); if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); unsigned long now = jiffies; @@ -4773,7 +4771,7 @@ vxlan_fdb_offloaded_set(struct net_device *dev, spin_lock_bh(&vxlan->hash_lock[hash_index]); - f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); + f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) goto out; @@ -4829,7 +4827,7 @@ vxlan_fdb_external_learn_del(struct net_device *dev, hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); - f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); + f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) err = -ENOENT; else if (f->flags & NTF_EXT_LEARNED) From c467a98e1de0359a0d8b6d881ecc2762c918cfc7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: 
Tue, 4 Feb 2025 16:55:49 +0200 Subject: [PATCH 33/63] selftests: forwarding: vxlan_bridge_1d: Check aging while forwarding Extend the VXLAN FDB aging test case to verify that FDB entries are aged out when they only forward traffic and are not refreshed by received traffic. The test fails before "vxlan: Age out FDB entries based on 'updated' time": # ./vxlan_bridge_1d.sh [...] TEST: VXLAN: Ageing of learned FDB entry [FAIL] [...] # echo $? 1 And passes after it: # ./vxlan_bridge_1d.sh [...] TEST: VXLAN: Ageing of learned FDB entry [ OK ] [...] # echo $? 0 Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250204145549.1216254-9-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh index 3f9d50f1ef9e..180c5eca556f 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh @@ -740,6 +740,8 @@ test_learning() vxlan_flood_test $mac $dst 0 10 0 + # The entry should age out when it only forwards traffic + $MZ $h1 -c 50 -d 1sec -p 64 -b $mac -B $dst -t icmp -q & sleep 60 bridge fdb show brport vx1 | grep $mac | grep -q self From e3ad54f5bdb9432bf34673f1050e8c28eee7908a Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:02 +0200 Subject: [PATCH 34/63] net/mlx5: Add helper functions for PTP callbacks The PTP callback functions should not be used directly by internal callers. Add helpers that can be used internally and externally.
Signed-off-by: Jianbo Liu Reviewed-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Mateusz Polchlopek Signed-off-by: Paolo Abeni --- .../ethernet/mellanox/mlx5/core/lib/clock.c | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index d61a1a9297c9..eaf343756026 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -119,6 +119,13 @@ static u32 mlx5_ptp_shift_constant(u32 dev_freq_khz) ilog2((U32_MAX / NSEC_PER_MSEC) * dev_freq_khz)); } +static s32 mlx5_clock_getmaxphase(struct mlx5_core_dev *mdev) +{ + return MLX5_CAP_MCAM_FEATURE(mdev, mtutc_time_adjustment_extended_range) ? + MLX5_MTUTC_OPERATION_ADJUST_TIME_EXTENDED_MAX : + MLX5_MTUTC_OPERATION_ADJUST_TIME_MAX; +} + static s32 mlx5_ptp_getmaxphase(struct ptp_clock_info *ptp) { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); @@ -126,14 +133,12 @@ static s32 mlx5_ptp_getmaxphase(struct ptp_clock_info *ptp) mdev = container_of(clock, struct mlx5_core_dev, clock); - return MLX5_CAP_MCAM_FEATURE(mdev, mtutc_time_adjustment_extended_range) ? 
- MLX5_MTUTC_OPERATION_ADJUST_TIME_EXTENDED_MAX : - MLX5_MTUTC_OPERATION_ADJUST_TIME_MAX; + return mlx5_clock_getmaxphase(mdev); } static bool mlx5_is_mtutc_time_adj_cap(struct mlx5_core_dev *mdev, s64 delta) { - s64 max = mlx5_ptp_getmaxphase(&mdev->clock.ptp_info); + s64 max = mlx5_clock_getmaxphase(mdev); if (delta < -max || delta > max) return false; @@ -361,15 +366,12 @@ static int mlx5_ptp_settime_real_time(struct mlx5_core_dev *mdev, return mlx5_set_mtutc(mdev, in, sizeof(in)); } -static int mlx5_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) +static int mlx5_clock_settime(struct mlx5_core_dev *mdev, struct mlx5_clock *clock, + const struct timespec64 *ts) { - struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); struct mlx5_timer *timer = &clock->timer; - struct mlx5_core_dev *mdev; unsigned long flags; - mdev = container_of(clock, struct mlx5_core_dev, clock); - if (mlx5_modify_mtutc_allowed(mdev)) { int err = mlx5_ptp_settime_real_time(mdev, ts); @@ -385,6 +387,16 @@ static int mlx5_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 return 0; } +static int mlx5_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) +{ + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_core_dev *mdev; + + mdev = container_of(clock, struct mlx5_core_dev, clock); + + return mlx5_clock_settime(mdev, clock, ts); +} + static struct timespec64 mlx5_ptp_gettimex_real_time(struct mlx5_core_dev *mdev, struct ptp_system_timestamp *sts) @@ -1129,7 +1141,7 @@ static void mlx5_init_timer_clock(struct mlx5_core_dev *mdev) struct timespec64 ts; ktime_get_real_ts64(&ts); - mlx5_ptp_settime(&clock->ptp_info, &ts); + mlx5_clock_settime(mdev, clock, &ts); } } From 9f722fb105216771f3a494a83dfad445de8a7f2b Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:03 +0200 Subject: [PATCH 35/63] net/mlx5: Change parameters for PTP internal functions In later patch, the 
mlx5_clock will be allocated dynamically, its address can be obtained from mlx5_core_dev struct, but mdev can't be obtained from mlx5_clock because it can be shared by multiple interfaces. So change the parameter for such internal functions, only mdev is passed down from the callers. Signed-off-by: Jianbo Liu Reviewed-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- .../ethernet/mellanox/mlx5/core/lib/clock.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index eaf343756026..e7e4bdba02a3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -878,10 +878,8 @@ static int mlx5_query_mtpps_pin_mode(struct mlx5_core_dev *mdev, u8 pin, mtpps_size, MLX5_REG_MTPPS, 0, 0); } -static int mlx5_get_pps_pin_mode(struct mlx5_clock *clock, u8 pin) +static int mlx5_get_pps_pin_mode(struct mlx5_core_dev *mdev, u8 pin) { - struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); - u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {}; u8 mode; int err; @@ -900,8 +898,9 @@ static int mlx5_get_pps_pin_mode(struct mlx5_clock *clock, u8 pin) return PTP_PF_NONE; } -static void mlx5_init_pin_config(struct mlx5_clock *clock) +static void mlx5_init_pin_config(struct mlx5_core_dev *mdev) { + struct mlx5_clock *clock = &mdev->clock; int i; if (!clock->ptp_info.n_pins) @@ -922,7 +921,7 @@ static void mlx5_init_pin_config(struct mlx5_clock *clock) sizeof(clock->ptp_info.pin_config[i].name), "mlx5_pps%d", i); clock->ptp_info.pin_config[i].index = i; - clock->ptp_info.pin_config[i].func = mlx5_get_pps_pin_mode(clock, i); + clock->ptp_info.pin_config[i].func = mlx5_get_pps_pin_mode(mdev, i); clock->ptp_info.pin_config[i].chan = 0; } } @@ -1041,10 +1040,10 @@ static void mlx5_timecounter_init(struct mlx5_core_dev *mdev) 
ktime_to_ns(ktime_get_real())); } -static void mlx5_init_overflow_period(struct mlx5_clock *clock) +static void mlx5_init_overflow_period(struct mlx5_core_dev *mdev) { - struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); struct mlx5_ib_clock_info *clock_info = mdev->clock_info; + struct mlx5_clock *clock = &mdev->clock; struct mlx5_timer *timer = &clock->timer; u64 overflow_cycles; u64 frac = 0; @@ -1135,7 +1134,7 @@ static void mlx5_init_timer_clock(struct mlx5_core_dev *mdev) mlx5_timecounter_init(mdev); mlx5_init_clock_info(mdev); - mlx5_init_overflow_period(clock); + mlx5_init_overflow_period(mdev); if (mlx5_real_time_mode(mdev)) { struct timespec64 ts; @@ -1147,13 +1146,11 @@ static void mlx5_init_timer_clock(struct mlx5_core_dev *mdev) static void mlx5_init_pps(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; - if (!MLX5_PPS_CAP(mdev)) return; mlx5_get_pps_caps(mdev); - mlx5_init_pin_config(clock); + mlx5_init_pin_config(mdev); } void mlx5_init_clock(struct mlx5_core_dev *mdev) From ccb717a88b2ed57e464c3099d2e8b0c9db7cef21 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:04 +0200 Subject: [PATCH 36/63] net/mlx5: Add init and destruction functions for a single HW clock Move hardware clock initialization and destruction into dedicated functions, which will be used for a dynamically allocated clock. Such a clock is shared by all the devices if the queried clock identities are the same. The out_work is for the PPS out event, which can't be triggered when the clock is shared, so INIT_WORK is not moved to the initialization function. Besides, we still need to register a notifier for each device.
Signed-off-by: Jianbo Liu Reviewed-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- .../ethernet/mellanox/mlx5/core/lib/clock.c | 48 ++++++++++++------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index e7e4bdba02a3..cc0a491bf617 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -1153,17 +1153,11 @@ static void mlx5_init_pps(struct mlx5_core_dev *mdev) mlx5_init_pin_config(mdev); } -void mlx5_init_clock(struct mlx5_core_dev *mdev) +static void mlx5_init_clock_dev(struct mlx5_core_dev *mdev) { struct mlx5_clock *clock = &mdev->clock; - if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) { - mlx5_core_warn(mdev, "invalid device_frequency_khz, aborting HW clock init\n"); - return; - } - seqlock_init(&clock->lock); - INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); /* Initialize the device clock */ mlx5_init_timer_clock(mdev); @@ -1179,28 +1173,19 @@ void mlx5_init_clock(struct mlx5_core_dev *mdev) clock->ptp = NULL; } - MLX5_NB_INIT(&clock->pps_nb, mlx5_pps_event, PPS_EVENT); - mlx5_eq_notifier_register(mdev, &clock->pps_nb); - if (clock->ptp) ptp_schedule_worker(clock->ptp, 0); } -void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) +static void mlx5_destroy_clock_dev(struct mlx5_core_dev *mdev) { struct mlx5_clock *clock = &mdev->clock; - if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) - return; - - mlx5_eq_notifier_unregister(mdev, &clock->pps_nb); if (clock->ptp) { ptp_clock_unregister(clock->ptp); clock->ptp = NULL; } - cancel_work_sync(&clock->pps_info.out_work); - if (mdev->clock_info) { free_page((unsigned long)mdev->clock_info); mdev->clock_info = NULL; @@ -1208,3 +1193,32 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) kfree(clock->ptp_info.pin_config); } + +void mlx5_init_clock(struct mlx5_core_dev 
*mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) { + mlx5_core_warn(mdev, "invalid device_frequency_khz, aborting HW clock init\n"); + return; + } + + mlx5_init_clock_dev(mdev); + + INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); + MLX5_NB_INIT(&clock->pps_nb, mlx5_pps_event, PPS_EVENT); + mlx5_eq_notifier_register(mdev, &clock->pps_nb); +} + +void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) + return; + + mlx5_eq_notifier_unregister(mdev, &clock->pps_nb); + cancel_work_sync(&clock->pps_info.out_work); + + mlx5_destroy_clock_dev(mdev); +} From 355f58f10911f9654d42dcba3cbe127238b4fd94 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:05 +0200 Subject: [PATCH 37/63] net/mlx5: Add API to get mlx5_core_dev from mlx5_clock The mdev is calculated directly from mlx5_clock, as it's one of the fields in mlx5_core_dev. Move to a function so it can be easily changed in next patch. 
Signed-off-by: Jianbo Liu Reviewed-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- .../ethernet/mellanox/mlx5/core/lib/clock.c | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index cc0a491bf617..b2c88050ba36 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -77,6 +77,11 @@ enum { MLX5_MTUTC_OPERATION_ADJUST_TIME_EXTENDED_MAX = 200000, }; +static struct mlx5_core_dev *mlx5_clock_mdev_get(struct mlx5_clock *clock) +{ + return container_of(clock, struct mlx5_core_dev, clock); +} + static bool mlx5_real_time_mode(struct mlx5_core_dev *mdev) { return (mlx5_is_real_time_rq(mdev) || mlx5_is_real_time_sq(mdev)); @@ -131,7 +136,7 @@ static s32 mlx5_ptp_getmaxphase(struct ptp_clock_info *ptp) struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); struct mlx5_core_dev *mdev; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mdev = mlx5_clock_mdev_get(clock); return mlx5_clock_getmaxphase(mdev); } @@ -226,7 +231,7 @@ static int mlx5_ptp_getcrosststamp(struct ptp_clock_info *ptp, struct system_time_snapshot history_begin = {0}; struct mlx5_core_dev *mdev; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mdev = mlx5_clock_mdev_get(clock); if (!mlx5_is_ptm_source_time_available(mdev)) return -EBUSY; @@ -268,8 +273,7 @@ static u64 read_internal_timer(const struct cyclecounter *cc) { struct mlx5_timer *timer = container_of(cc, struct mlx5_timer, cycles); struct mlx5_clock *clock = container_of(timer, struct mlx5_clock, timer); - struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, - clock); + struct mlx5_core_dev *mdev = mlx5_clock_mdev_get(clock); return mlx5_read_time(mdev, NULL, false) & cc->mask; } @@ -304,8 +308,7 @@ static void 
mlx5_pps_out(struct work_struct *work) out_work); struct mlx5_clock *clock = container_of(pps_info, struct mlx5_clock, pps_info); - struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, - clock); + struct mlx5_core_dev *mdev = mlx5_clock_mdev_get(clock); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; unsigned long flags; int i; @@ -335,7 +338,7 @@ static long mlx5_timestamp_overflow(struct ptp_clock_info *ptp_info) unsigned long flags; clock = container_of(ptp_info, struct mlx5_clock, ptp_info); - mdev = container_of(clock, struct mlx5_core_dev, clock); + mdev = mlx5_clock_mdev_get(clock); timer = &clock->timer; if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) @@ -392,7 +395,7 @@ static int mlx5_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); struct mlx5_core_dev *mdev; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mdev = mlx5_clock_mdev_get(clock); return mlx5_clock_settime(mdev, clock, ts); } @@ -416,7 +419,7 @@ static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, struct mlx5_core_dev *mdev; u64 cycles, ns; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mdev = mlx5_clock_mdev_get(clock); if (mlx5_real_time_mode(mdev)) { *ts = mlx5_ptp_gettimex_real_time(mdev, sts); goto out; @@ -457,7 +460,7 @@ static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) struct mlx5_core_dev *mdev; unsigned long flags; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mdev = mlx5_clock_mdev_get(clock); if (mlx5_modify_mtutc_allowed(mdev)) { int err = mlx5_ptp_adjtime_real_time(mdev, delta); @@ -479,7 +482,7 @@ static int mlx5_ptp_adjphase(struct ptp_clock_info *ptp, s32 delta) struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); struct mlx5_core_dev *mdev; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mdev = mlx5_clock_mdev_get(clock); return 
mlx5_ptp_adjtime_real_time(mdev, delta); } @@ -512,7 +515,7 @@ static int mlx5_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) unsigned long flags; u32 mult; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mdev = mlx5_clock_mdev_get(clock); if (mlx5_modify_mtutc_allowed(mdev)) { int err = mlx5_ptp_freq_adj_real_time(mdev, scaled_ppm); @@ -539,8 +542,7 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); - struct mlx5_core_dev *mdev = - container_of(clock, struct mlx5_core_dev, clock); + struct mlx5_core_dev *mdev = mlx5_clock_mdev_get(clock); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; u32 field_select = 0; u8 pin_mode = 0; @@ -724,8 +726,7 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); - struct mlx5_core_dev *mdev = - container_of(clock, struct mlx5_core_dev, clock); + struct mlx5_core_dev *mdev = mlx5_clock_mdev_get(clock); bool rt_mode = mlx5_real_time_mode(mdev); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; u32 out_pulse_duration_ns = 0; @@ -987,7 +988,7 @@ static int mlx5_pps_event(struct notifier_block *nb, unsigned long flags; u64 ns; - mdev = container_of(clock, struct mlx5_core_dev, clock); + mdev = mlx5_clock_mdev_get(clock); switch (clock->ptp_info.pin_config[pin].func) { case PTP_PF_EXTTS: From f9beaf4fac64c84631ba9a2eb864cea6b52032a2 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:06 +0200 Subject: [PATCH 38/63] net/mlx5: Change clock in mlx5_core_dev to mlx5_clock pointer Change clock member in mlx5_core_dev to a pointer, so it can point to a clock shared by multiple functions in later patch. For now, each function has its own clock, so mdev in mlx5_clock_priv is the back pointer to the function. Later it points to one (normally the first one) of the multiple functions sharing the same clock. 
Change mlx5_init_clock() to return error if mlx5_clock is not allocated. Besides, a null clock is defined and used when hardware clock is not supported. So, the clock pointer is always pointing to something valid. Signed-off-by: Jianbo Liu Reviewed-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- .../net/ethernet/mellanox/mlx5/core/en/ptp.c | 4 +- .../net/ethernet/mellanox/mlx5/core/en/trap.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 4 +- .../mellanox/mlx5/core/en/xsk/setup.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en_main.c | 4 +- .../ethernet/mellanox/mlx5/core/lib/clock.c | 87 ++++++++++++++----- .../ethernet/mellanox/mlx5/core/lib/clock.h | 35 +++++++- .../net/ethernet/mellanox/mlx5/core/main.c | 11 ++- include/linux/mlx5/driver.h | 31 +------ 9 files changed, 116 insertions(+), 64 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index afd654583b6b..131ed97ca997 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -326,7 +326,7 @@ static int mlx5e_ptp_alloc_txqsq(struct mlx5e_ptp *c, int txq_ix, int node; sq->pdev = c->pdev; - sq->clock = &mdev->clock; + sq->clock = mdev->clock; sq->mkey_be = c->mkey_be; sq->netdev = c->netdev; sq->priv = c->priv; @@ -696,7 +696,7 @@ static int mlx5e_init_ptp_rq(struct mlx5e_ptp *c, struct mlx5e_params *params, rq->pdev = c->pdev; rq->netdev = priv->netdev; rq->priv = priv; - rq->clock = &mdev->clock; + rq->clock = mdev->clock; rq->tstamp = &priv->tstamp; rq->mdev = mdev; rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c index 53ca16cb9c41..140606fcd23b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c @@ -46,7 +46,7 @@ static 
void mlx5e_init_trap_rq(struct mlx5e_trap *t, struct mlx5e_params *params rq->pdev = t->pdev; rq->netdev = priv->netdev; rq->priv = priv; - rq->clock = &mdev->clock; + rq->clock = mdev->clock; rq->tstamp = &priv->tstamp; rq->mdev = mdev; rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 94b291662087..3cc4d55613bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -289,9 +289,9 @@ static u64 mlx5e_xsk_fill_timestamp(void *_priv) ts = get_cqe_ts(priv->cqe); if (mlx5_is_real_time_rq(priv->cq->mdev) || mlx5_is_real_time_sq(priv->cq->mdev)) - return mlx5_real_time_cyc2time(&priv->cq->mdev->clock, ts); + return mlx5_real_time_cyc2time(priv->cq->mdev->clock, ts); - return mlx5_timecounter_cyc2time(&priv->cq->mdev->clock, ts); + return mlx5_timecounter_cyc2time(priv->cq->mdev->clock, ts); } static void mlx5e_xsk_request_checksum(u16 csum_start, u16 csum_offset, void *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c index 9240cfe25d10..d743e823362a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -72,7 +72,7 @@ static int mlx5e_init_xsk_rq(struct mlx5e_channel *c, rq->netdev = c->netdev; rq->priv = c->priv; rq->tstamp = c->tstamp; - rq->clock = &mdev->clock; + rq->clock = mdev->clock; rq->icosq = &c->icosq; rq->ix = c->ix; rq->channel = c; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a814b63ed97e..c754e0c75934 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -737,7 +737,7 @@ static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param rq->netdev = c->netdev; 
rq->priv = c->priv; rq->tstamp = c->tstamp; - rq->clock = &mdev->clock; + rq->clock = mdev->clock; rq->icosq = &c->icosq; rq->ix = c->ix; rq->channel = c; @@ -1614,7 +1614,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, int err; sq->pdev = c->pdev; - sq->clock = &mdev->clock; + sq->clock = mdev->clock; sq->mkey_be = c->mkey_be; sq->netdev = c->netdev; sq->mdev = c->mdev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index b2c88050ba36..da2a21ce8060 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -77,9 +77,19 @@ enum { MLX5_MTUTC_OPERATION_ADJUST_TIME_EXTENDED_MAX = 200000, }; +struct mlx5_clock_priv { + struct mlx5_clock clock; + struct mlx5_core_dev *mdev; +}; + +static struct mlx5_clock_priv *clock_priv(struct mlx5_clock *clock) +{ + return container_of(clock, struct mlx5_clock_priv, clock); +} + static struct mlx5_core_dev *mlx5_clock_mdev_get(struct mlx5_clock *clock) { - return container_of(clock, struct mlx5_core_dev, clock); + return clock_priv(clock)->mdev; } static bool mlx5_real_time_mode(struct mlx5_core_dev *mdev) @@ -219,7 +229,7 @@ static int mlx5_mtctr_syncdevicetime(ktime_t *device_time, if (real_time_mode) *device_time = ns_to_ktime(REAL_TIME_TO_NS(device >> 32, device & U32_MAX)); else - *device_time = mlx5_timecounter_cyc2time(&mdev->clock, device); + *device_time = mlx5_timecounter_cyc2time(mdev->clock, device); return 0; } @@ -281,7 +291,7 @@ static u64 read_internal_timer(const struct cyclecounter *cc) static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev) { struct mlx5_ib_clock_info *clock_info = mdev->clock_info; - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; struct mlx5_timer *timer; u32 sign; @@ -599,7 +609,7 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, static u64 find_target_cycles(struct mlx5_core_dev *mdev, 
s64 target_ns) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; u64 cycles_now, cycles_delta; u64 nsec_now, nsec_delta; struct mlx5_timer *timer; @@ -658,7 +668,7 @@ static int mlx5_perout_conf_out_pulse_duration(struct mlx5_core_dev *mdev, struct ptp_clock_request *rq, u32 *out_pulse_duration_ns) { - struct mlx5_pps *pps_info = &mdev->clock.pps_info; + struct mlx5_pps *pps_info = &mdev->clock->pps_info; u32 out_pulse_duration; struct timespec64 ts; @@ -691,7 +701,7 @@ static int perout_conf_npps_real_time(struct mlx5_core_dev *mdev, struct ptp_clo u32 *field_select, u32 *out_pulse_duration_ns, u64 *period, u64 *time_stamp) { - struct mlx5_pps *pps_info = &mdev->clock.pps_info; + struct mlx5_pps *pps_info = &mdev->clock->pps_info; struct ptp_clock_time *time = &rq->perout.start; struct timespec64 ts; @@ -901,7 +911,7 @@ static int mlx5_get_pps_pin_mode(struct mlx5_core_dev *mdev, u8 pin) static void mlx5_init_pin_config(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; int i; if (!clock->ptp_info.n_pins) @@ -929,8 +939,8 @@ static void mlx5_init_pin_config(struct mlx5_core_dev *mdev) static void mlx5_get_pps_caps(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + struct mlx5_clock *clock = mdev->clock; mlx5_query_mtpps(mdev, out, sizeof(out)); @@ -1025,7 +1035,7 @@ static int mlx5_pps_event(struct notifier_block *nb, static void mlx5_timecounter_init(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; struct mlx5_timer *timer = &clock->timer; u32 dev_freq; @@ -1044,7 +1054,7 @@ static void mlx5_timecounter_init(struct mlx5_core_dev *mdev) static void mlx5_init_overflow_period(struct mlx5_core_dev *mdev) { struct mlx5_ib_clock_info *clock_info = mdev->clock_info; - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = 
mdev->clock; struct mlx5_timer *timer = &clock->timer; u64 overflow_cycles; u64 frac = 0; @@ -1077,7 +1087,7 @@ static void mlx5_init_overflow_period(struct mlx5_core_dev *mdev) static void mlx5_init_clock_info(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; struct mlx5_ib_clock_info *info; struct mlx5_timer *timer; @@ -1100,7 +1110,7 @@ static void mlx5_init_clock_info(struct mlx5_core_dev *mdev) static void mlx5_init_timer_max_freq_adjustment(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; u32 out[MLX5_ST_SZ_DW(mtutc_reg)] = {}; u32 in[MLX5_ST_SZ_DW(mtutc_reg)] = {}; u8 log_max_freq_adjustment = 0; @@ -1119,7 +1129,7 @@ static void mlx5_init_timer_max_freq_adjustment(struct mlx5_core_dev *mdev) static void mlx5_init_timer_clock(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; /* Configure the PHC */ clock->ptp_info = mlx5_ptp_clock_info; @@ -1156,7 +1166,7 @@ static void mlx5_init_pps(struct mlx5_core_dev *mdev) static void mlx5_init_clock_dev(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; seqlock_init(&clock->lock); @@ -1180,7 +1190,7 @@ static void mlx5_init_clock_dev(struct mlx5_core_dev *mdev) static void mlx5_destroy_clock_dev(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; if (clock->ptp) { ptp_clock_unregister(clock->ptp); @@ -1195,25 +1205,60 @@ static void mlx5_destroy_clock_dev(struct mlx5_core_dev *mdev) kfree(clock->ptp_info.pin_config); } -void mlx5_init_clock(struct mlx5_core_dev *mdev) +static void mlx5_clock_free(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock_priv *cpriv = clock_priv(mdev->clock); + + mlx5_destroy_clock_dev(mdev); + kfree(cpriv); + mdev->clock = NULL; +} + +static int mlx5_clock_alloc(struct 
mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock_priv *cpriv; + struct mlx5_clock *clock; + + cpriv = kzalloc(sizeof(*cpriv), GFP_KERNEL); + if (!cpriv) + return -ENOMEM; + + cpriv->mdev = mdev; + clock = &cpriv->clock; + mdev->clock = clock; + mlx5_init_clock_dev(mdev); + + return 0; +} + +static struct mlx5_clock null_clock; + +int mlx5_init_clock(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock; + int err; if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) { + mdev->clock = &null_clock; mlx5_core_warn(mdev, "invalid device_frequency_khz, aborting HW clock init\n"); - return; + return 0; } - mlx5_init_clock_dev(mdev); + err = mlx5_clock_alloc(mdev); + if (err) + return err; + clock = mdev->clock; INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); MLX5_NB_INIT(&clock->pps_nb, mlx5_pps_event, PPS_EVENT); mlx5_eq_notifier_register(mdev, &clock->pps_nb); + + return 0; } void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = &mdev->clock; + struct mlx5_clock *clock = mdev->clock; if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) return; @@ -1221,5 +1266,5 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) mlx5_eq_notifier_unregister(mdev, &clock->pps_nb); cancel_work_sync(&clock->pps_info.out_work); - mlx5_destroy_clock_dev(mdev); + mlx5_clock_free(mdev); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h index bd95b9f8d143..eca1dd9039be 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h @@ -33,6 +33,35 @@ #ifndef __LIB_CLOCK_H__ #define __LIB_CLOCK_H__ +#include + +#define MAX_PIN_NUM 8 +struct mlx5_pps { + u8 pin_caps[MAX_PIN_NUM]; + struct work_struct out_work; + u64 start[MAX_PIN_NUM]; + u8 enabled; + u64 min_npps_period; + u64 min_out_pulse_duration_ns; +}; + +struct mlx5_timer { + struct cyclecounter cycles; + struct timecounter tc; + u32 
nominal_c_mult; + unsigned long overflow_period; +}; + +struct mlx5_clock { + struct mlx5_nb pps_nb; + seqlock_t lock; + struct hwtstamp_config hwtstamp_config; + struct ptp_clock *ptp; + struct ptp_clock_info ptp_info; + struct mlx5_pps pps_info; + struct mlx5_timer timer; +}; + static inline bool mlx5_is_real_time_rq(struct mlx5_core_dev *mdev) { u8 rq_ts_format_cap = MLX5_CAP_GEN(mdev, rq_ts_format); @@ -54,12 +83,12 @@ static inline bool mlx5_is_real_time_sq(struct mlx5_core_dev *mdev) typedef ktime_t (*cqe_ts_to_ns)(struct mlx5_clock *, u64); #if IS_ENABLED(CONFIG_PTP_1588_CLOCK) -void mlx5_init_clock(struct mlx5_core_dev *mdev); +int mlx5_init_clock(struct mlx5_core_dev *mdev); void mlx5_cleanup_clock(struct mlx5_core_dev *mdev); static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev) { - return mdev->clock.ptp ? ptp_clock_index(mdev->clock.ptp) : -1; + return mdev->clock->ptp ? ptp_clock_index(mdev->clock->ptp) : -1; } static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock, @@ -87,7 +116,7 @@ static inline ktime_t mlx5_real_time_cyc2time(struct mlx5_clock *clock, return ns_to_ktime(time); } #else -static inline void mlx5_init_clock(struct mlx5_core_dev *mdev) {} +static inline int mlx5_init_clock(struct mlx5_core_dev *mdev) { return 0; } static inline void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) {} static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index ec956c4bcebd..996773521aee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1038,7 +1038,11 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) mlx5_init_reserved_gids(dev); - mlx5_init_clock(dev); + err = mlx5_init_clock(dev); + if (err) { + mlx5_core_err(dev, "failed to initialize hardware clock\n"); + goto err_tables_cleanup; + } dev->vxlan = mlx5_vxlan_create(dev); 
dev->geneve = mlx5_geneve_create(dev); @@ -1046,7 +1050,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) err = mlx5_init_rl_table(dev); if (err) { mlx5_core_err(dev, "Failed to init rate limiting\n"); - goto err_tables_cleanup; + goto err_clock_cleanup; } err = mlx5_mpfs_init(dev); @@ -1123,10 +1127,11 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) mlx5_mpfs_cleanup(dev); err_rl_cleanup: mlx5_cleanup_rl_table(dev); -err_tables_cleanup: +err_clock_cleanup: mlx5_geneve_destroy(dev->geneve); mlx5_vxlan_destroy(dev->vxlan); mlx5_cleanup_clock(dev); +err_tables_cleanup: mlx5_cleanup_reserved_gids(dev); mlx5_cq_debugfs_cleanup(dev); mlx5_fw_reset_cleanup(dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index af86097641b0..5dab3d8d05e4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -54,7 +54,6 @@ #include #include #include -#include #include #define MLX5_ADEV_NAME "mlx5_core" @@ -679,33 +678,7 @@ struct mlx5_rsvd_gids { struct ida ida; }; -#define MAX_PIN_NUM 8 -struct mlx5_pps { - u8 pin_caps[MAX_PIN_NUM]; - struct work_struct out_work; - u64 start[MAX_PIN_NUM]; - u8 enabled; - u64 min_npps_period; - u64 min_out_pulse_duration_ns; -}; - -struct mlx5_timer { - struct cyclecounter cycles; - struct timecounter tc; - u32 nominal_c_mult; - unsigned long overflow_period; -}; - -struct mlx5_clock { - struct mlx5_nb pps_nb; - seqlock_t lock; - struct hwtstamp_config hwtstamp_config; - struct ptp_clock *ptp; - struct ptp_clock_info ptp_info; - struct mlx5_pps pps_info; - struct mlx5_timer timer; -}; - +struct mlx5_clock; struct mlx5_dm; struct mlx5_fw_tracer; struct mlx5_vxlan; @@ -789,7 +762,7 @@ struct mlx5_core_dev { #ifdef CONFIG_MLX5_FPGA struct mlx5_fpga_device *fpga; #endif - struct mlx5_clock clock; + struct mlx5_clock *clock; struct mlx5_ib_clock_info *clock_info; struct mlx5_fw_tracer *tracer; struct mlx5_rsc_dump *rsc_dump; From 574998cf3b3f59afa9e3a6bbb609d9d4eb2023b4 Mon Sep 17 00:00:00 2001 
From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:07 +0200 Subject: [PATCH 39/63] net/mlx5: Add devcom component for the clock shared by functions Add a new devcom component for the hardware clock. When it is running in real time mode, the functions are grouped by the identity they query. According to the firmware documentation, the clock identity size is 64 bits, so it's safe to memcpy it to the component key, as the key size is also 64 bits. Signed-off-by: Jianbo Liu Reviewed-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- .../ethernet/mellanox/mlx5/core/lib/clock.c | 59 ++++++++++++++++++- .../ethernet/mellanox/mlx5/core/lib/devcom.h | 1 + include/linux/mlx5/driver.h | 2 + 3 files changed, 61 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index da2a21ce8060..7e5882ea19e0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -43,6 +43,8 @@ #include #endif /* CONFIG_X86 */ +#define MLX5_RT_CLOCK_IDENTITY_SIZE MLX5_FLD_SZ_BYTES(mrtcq_reg, rt_clock_identity) + enum { MLX5_PIN_MODE_IN = 0x0, MLX5_PIN_MODE_OUT = 0x1, @@ -77,6 +79,10 @@ enum { MLX5_MTUTC_OPERATION_ADJUST_TIME_EXTENDED_MAX = 200000, }; +struct mlx5_clock_dev_state { + struct mlx5_devcom_comp_dev *compdev; +}; + struct mlx5_clock_priv { struct mlx5_clock clock; struct mlx5_core_dev *mdev; @@ -109,6 +115,22 @@ static bool mlx5_modify_mtutc_allowed(struct mlx5_core_dev *mdev) return MLX5_CAP_MCAM_FEATURE(mdev, ptpcyc2realtime_modify); } +static int mlx5_clock_identity_get(struct mlx5_core_dev *mdev, + u8 identify[MLX5_RT_CLOCK_IDENTITY_SIZE]) +{ + u32 out[MLX5_ST_SZ_DW(mrtcq_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(mrtcq_reg)] = {}; + int err; + + err = mlx5_core_access_reg(mdev, in, sizeof(in), + out, sizeof(out), MLX5_REG_MRTCQ, 0, 0); + if (!err) + memcpy(identify, MLX5_ADDR_OF(mrtcq_reg, out, 
rt_clock_identity), + MLX5_RT_CLOCK_IDENTITY_SIZE); + + return err; +} + static u32 mlx5_ptp_shift_constant(u32 dev_freq_khz) { /* Optimal shift constant leads to corrections above just 1 scaled ppm. @@ -1231,11 +1253,26 @@ static int mlx5_clock_alloc(struct mlx5_core_dev *mdev) return 0; } +static void mlx5_shared_clock_register(struct mlx5_core_dev *mdev, u64 key) +{ + mdev->clock_state->compdev = mlx5_devcom_register_component(mdev->priv.devc, + MLX5_DEVCOM_SHARED_CLOCK, + key, NULL, mdev); +} + +static void mlx5_shared_clock_unregister(struct mlx5_core_dev *mdev) +{ + mlx5_devcom_unregister_component(mdev->clock_state->compdev); +} + static struct mlx5_clock null_clock; int mlx5_init_clock(struct mlx5_core_dev *mdev) { + u8 identity[MLX5_RT_CLOCK_IDENTITY_SIZE]; + struct mlx5_clock_dev_state *clock_state; struct mlx5_clock *clock; + u64 key; int err; if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) { @@ -1244,9 +1281,26 @@ int mlx5_init_clock(struct mlx5_core_dev *mdev) return 0; } + clock_state = kzalloc(sizeof(*clock_state), GFP_KERNEL); + if (!clock_state) + return -ENOMEM; + mdev->clock_state = clock_state; + + if (MLX5_CAP_MCAM_REG3(mdev, mrtcq) && mlx5_real_time_mode(mdev)) { + if (mlx5_clock_identity_get(mdev, identity)) { + mlx5_core_warn(mdev, "failed to get rt clock identity, create ptp dev per function\n"); + } else { + memcpy(&key, &identity, sizeof(key)); + mlx5_shared_clock_register(mdev, key); + } + } + err = mlx5_clock_alloc(mdev); - if (err) + if (err) { + kfree(clock_state); + mdev->clock_state = NULL; return err; + } clock = mdev->clock; INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); @@ -1267,4 +1321,7 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) cancel_work_sync(&clock->pps_info.out_work); mlx5_clock_free(mdev); + mlx5_shared_clock_unregister(mdev); + kfree(mdev->clock_state); + mdev->clock_state = NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h 
index d58032dd0df7..c79699b94a02 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h @@ -11,6 +11,7 @@ enum mlx5_devcom_component { MLX5_DEVCOM_MPV, MLX5_DEVCOM_HCA_PORTS, MLX5_DEVCOM_SD_GROUP, + MLX5_DEVCOM_SHARED_CLOCK, MLX5_DEVCOM_NUM_COMPONENTS, }; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5dab3d8d05e4..46bd7550adf8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -679,6 +679,7 @@ struct mlx5_rsvd_gids { }; struct mlx5_clock; +struct mlx5_clock_dev_state; struct mlx5_dm; struct mlx5_fw_tracer; struct mlx5_vxlan; @@ -763,6 +764,7 @@ struct mlx5_core_dev { struct mlx5_fpga_device *fpga; #endif struct mlx5_clock *clock; + struct mlx5_clock_dev_state *clock_state; struct mlx5_ib_clock_info *clock_info; struct mlx5_fw_tracer *tracer; struct mlx5_rsc_dump *rsc_dump; From 79faf9d76d66a1f846b61008ddf1596bd7944a08 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:08 +0200 Subject: [PATCH 40/63] net/mlx5: Move PPS notifier and out_work to clock_state The PPS notifier is currently in mlx5_clock, and mlx5_clock can be shared in later patch, so the notifier should be registered for each device to avoid any event miss. Besides, the out_work is scheduled by PPS out event which is triggered only when the device is in free running mode. So, both are moved to mlx5_core_dev's clock_state. 
Signed-off-by: Jianbo Liu Reviewed-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- .../ethernet/mellanox/mlx5/core/lib/clock.c | 37 +++++++++---------- .../ethernet/mellanox/mlx5/core/lib/clock.h | 2 - 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 7e5882ea19e0..2586b0788b40 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -80,7 +80,10 @@ enum { }; struct mlx5_clock_dev_state { + struct mlx5_core_dev *mdev; struct mlx5_devcom_comp_dev *compdev; + struct mlx5_nb pps_nb; + struct work_struct out_work; }; struct mlx5_clock_priv { @@ -336,11 +339,10 @@ static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev) static void mlx5_pps_out(struct work_struct *work) { - struct mlx5_pps *pps_info = container_of(work, struct mlx5_pps, - out_work); - struct mlx5_clock *clock = container_of(pps_info, struct mlx5_clock, - pps_info); - struct mlx5_core_dev *mdev = mlx5_clock_mdev_get(clock); + struct mlx5_clock_dev_state *clock_state = container_of(work, struct mlx5_clock_dev_state, + out_work); + struct mlx5_core_dev *mdev = clock_state->mdev; + struct mlx5_clock *clock = mdev->clock; u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; unsigned long flags; int i; @@ -1012,16 +1014,16 @@ static u64 perout_conf_next_event_timer(struct mlx5_core_dev *mdev, static int mlx5_pps_event(struct notifier_block *nb, unsigned long type, void *data) { - struct mlx5_clock *clock = mlx5_nb_cof(nb, struct mlx5_clock, pps_nb); + struct mlx5_clock_dev_state *clock_state = mlx5_nb_cof(nb, struct mlx5_clock_dev_state, + pps_nb); + struct mlx5_core_dev *mdev = clock_state->mdev; + struct mlx5_clock *clock = mdev->clock; struct ptp_clock_event ptp_event; struct mlx5_eqe *eqe = data; int pin = eqe->data.pps.pin; - struct mlx5_core_dev *mdev; 
unsigned long flags; u64 ns; - mdev = mlx5_clock_mdev_get(clock); - switch (clock->ptp_info.pin_config[pin].func) { case PTP_PF_EXTTS: ptp_event.index = pin; @@ -1045,7 +1047,7 @@ static int mlx5_pps_event(struct notifier_block *nb, write_seqlock_irqsave(&clock->lock, flags); clock->pps_info.start[pin] = ns; write_sequnlock_irqrestore(&clock->lock, flags); - schedule_work(&clock->pps_info.out_work); + schedule_work(&clock_state->out_work); break; default: mlx5_core_err(mdev, " Unhandled clock PPS event, func %d\n", @@ -1271,7 +1273,6 @@ int mlx5_init_clock(struct mlx5_core_dev *mdev) { u8 identity[MLX5_RT_CLOCK_IDENTITY_SIZE]; struct mlx5_clock_dev_state *clock_state; - struct mlx5_clock *clock; u64 key; int err; @@ -1284,6 +1285,7 @@ int mlx5_init_clock(struct mlx5_core_dev *mdev) clock_state = kzalloc(sizeof(*clock_state), GFP_KERNEL); if (!clock_state) return -ENOMEM; + clock_state->mdev = mdev; mdev->clock_state = clock_state; if (MLX5_CAP_MCAM_REG3(mdev, mrtcq) && mlx5_real_time_mode(mdev)) { @@ -1301,24 +1303,21 @@ int mlx5_init_clock(struct mlx5_core_dev *mdev) mdev->clock_state = NULL; return err; } - clock = mdev->clock; - INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); - MLX5_NB_INIT(&clock->pps_nb, mlx5_pps_event, PPS_EVENT); - mlx5_eq_notifier_register(mdev, &clock->pps_nb); + INIT_WORK(&mdev->clock_state->out_work, mlx5_pps_out); + MLX5_NB_INIT(&mdev->clock_state->pps_nb, mlx5_pps_event, PPS_EVENT); + mlx5_eq_notifier_register(mdev, &mdev->clock_state->pps_nb); return 0; } void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) { - struct mlx5_clock *clock = mdev->clock; - if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) return; - mlx5_eq_notifier_unregister(mdev, &clock->pps_nb); - cancel_work_sync(&clock->pps_info.out_work); + mlx5_eq_notifier_unregister(mdev, &mdev->clock_state->pps_nb); + cancel_work_sync(&mdev->clock_state->out_work); mlx5_clock_free(mdev); mlx5_shared_clock_unregister(mdev); diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h index eca1dd9039be..3c5fee246582 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h @@ -38,7 +38,6 @@ #define MAX_PIN_NUM 8 struct mlx5_pps { u8 pin_caps[MAX_PIN_NUM]; - struct work_struct out_work; u64 start[MAX_PIN_NUM]; u8 enabled; u64 min_npps_period; @@ -53,7 +52,6 @@ struct mlx5_timer { }; struct mlx5_clock { - struct mlx5_nb pps_nb; seqlock_t lock; struct hwtstamp_config hwtstamp_config; struct ptp_clock *ptp; From f538ffb7a22d092a8301440bc5d59488c311ea8b Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:09 +0200 Subject: [PATCH 41/63] net/mlx5: Support one PTP device per hardware clock Currently, the mlx5 driver exposes a PTP device for each network interface, resulting in multiple device nodes representing the same underlying PHC (PTP hardware clock). This causes a problem if the clock tries to synchronize to itself. For instance, when ptp4l operates on multiple interfaces following different masters, phc2sys attempts to synchronize them in automatic mode. PHC can be configured to work in free running mode or real time mode. All functions can access it directly. In this patch, we create one PTP device for each PHC when it's running in real time mode. All the functions share the same PTP device if the clock identities they query are the same, and they are already grouped by devcom in the previous commit. The first mdev in the peer list is chosen when sending MTPPS/MTUTC/MTPPSE/MRTCQ to firmware. Since the function can be unloaded at any time, we need to use a mutex lock to protect the mdev pointer used in PTP and PPS callbacks. Besides, a new one should be picked from the peer list when the current one is not available. The clock info, which is used by IB, is shared by all the interfaces using the same hardware clock. 
Signed-off-by: Jianbo Liu Reviewed-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- .../ethernet/mellanox/mlx5/core/lib/clock.c | 250 ++++++++++++++---- .../ethernet/mellanox/mlx5/core/lib/clock.h | 1 + 2 files changed, 203 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 2586b0788b40..42df3a6fda93 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -89,6 +89,7 @@ struct mlx5_clock_dev_state { struct mlx5_clock_priv { struct mlx5_clock clock; struct mlx5_core_dev *mdev; + struct mutex lock; /* protect mdev and used in PTP callbacks */ }; static struct mlx5_clock_priv *clock_priv(struct mlx5_clock *clock) @@ -96,11 +97,37 @@ static struct mlx5_clock_priv *clock_priv(struct mlx5_clock *clock) return container_of(clock, struct mlx5_clock_priv, clock); } +static void mlx5_clock_lockdep_assert(struct mlx5_clock *clock) +{ + if (!clock->shared) + return; + + lockdep_assert(lockdep_is_held(&clock_priv(clock)->lock)); +} + static struct mlx5_core_dev *mlx5_clock_mdev_get(struct mlx5_clock *clock) { + mlx5_clock_lockdep_assert(clock); + return clock_priv(clock)->mdev; } +static void mlx5_clock_lock(struct mlx5_clock *clock) +{ + if (!clock->shared) + return; + + mutex_lock(&clock_priv(clock)->lock); +} + +static void mlx5_clock_unlock(struct mlx5_clock *clock) +{ + if (!clock->shared) + return; + + mutex_unlock(&clock_priv(clock)->lock); +} + static bool mlx5_real_time_mode(struct mlx5_core_dev *mdev) { return (mlx5_is_real_time_rq(mdev) || mlx5_is_real_time_sq(mdev)); @@ -170,10 +197,14 @@ static s32 mlx5_ptp_getmaxphase(struct ptp_clock_info *ptp) { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); struct mlx5_core_dev *mdev; + s32 ret; + mlx5_clock_lock(clock); mdev = mlx5_clock_mdev_get(clock); + ret = 
mlx5_clock_getmaxphase(mdev); + mlx5_clock_unlock(clock); - return mlx5_clock_getmaxphase(mdev); + return ret; } static bool mlx5_is_mtutc_time_adj_cap(struct mlx5_core_dev *mdev, s64 delta) @@ -265,16 +296,23 @@ static int mlx5_ptp_getcrosststamp(struct ptp_clock_info *ptp, struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); struct system_time_snapshot history_begin = {0}; struct mlx5_core_dev *mdev; + int err; + mlx5_clock_lock(clock); mdev = mlx5_clock_mdev_get(clock); - if (!mlx5_is_ptm_source_time_available(mdev)) - return -EBUSY; + if (!mlx5_is_ptm_source_time_available(mdev)) { + err = -EBUSY; + goto unlock; + } ktime_get_snapshot(&history_begin); - return get_device_system_crosststamp(mlx5_mtctr_syncdevicetime, mdev, - &history_begin, cts); + err = get_device_system_crosststamp(mlx5_mtctr_syncdevicetime, mdev, + &history_begin, cts); +unlock: + mlx5_clock_unlock(clock); + return err; } #endif /* CONFIG_X86 */ @@ -372,6 +410,7 @@ static long mlx5_timestamp_overflow(struct ptp_clock_info *ptp_info) unsigned long flags; clock = container_of(ptp_info, struct mlx5_clock, ptp_info); + mlx5_clock_lock(clock); mdev = mlx5_clock_mdev_get(clock); timer = &clock->timer; @@ -384,6 +423,7 @@ static long mlx5_timestamp_overflow(struct ptp_clock_info *ptp_info) write_sequnlock_irqrestore(&clock->lock, flags); out: + mlx5_clock_unlock(clock); return timer->overflow_period; } @@ -428,10 +468,14 @@ static int mlx5_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); struct mlx5_core_dev *mdev; + int err; + mlx5_clock_lock(clock); mdev = mlx5_clock_mdev_get(clock); + err = mlx5_clock_settime(mdev, clock, ts); + mlx5_clock_unlock(clock); - return mlx5_clock_settime(mdev, clock, ts); + return err; } static @@ -453,6 +497,7 @@ static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, struct mlx5_core_dev *mdev; u64 cycles, ns; + 
mlx5_clock_lock(clock); mdev = mlx5_clock_mdev_get(clock); if (mlx5_real_time_mode(mdev)) { *ts = mlx5_ptp_gettimex_real_time(mdev, sts); @@ -463,6 +508,7 @@ static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, ns = mlx5_timecounter_cyc2time(clock, cycles); *ts = ns_to_timespec64(ns); out: + mlx5_clock_unlock(clock); return 0; } @@ -493,14 +539,16 @@ static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) struct mlx5_timer *timer = &clock->timer; struct mlx5_core_dev *mdev; unsigned long flags; + int err = 0; + mlx5_clock_lock(clock); mdev = mlx5_clock_mdev_get(clock); if (mlx5_modify_mtutc_allowed(mdev)) { - int err = mlx5_ptp_adjtime_real_time(mdev, delta); + err = mlx5_ptp_adjtime_real_time(mdev, delta); if (err) - return err; + goto unlock; } write_seqlock_irqsave(&clock->lock, flags); @@ -508,17 +556,23 @@ static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) mlx5_update_clock_info_page(mdev); write_sequnlock_irqrestore(&clock->lock, flags); - return 0; +unlock: + mlx5_clock_unlock(clock); + return err; } static int mlx5_ptp_adjphase(struct ptp_clock_info *ptp, s32 delta) { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); struct mlx5_core_dev *mdev; + int err; + mlx5_clock_lock(clock); mdev = mlx5_clock_mdev_get(clock); + err = mlx5_ptp_adjtime_real_time(mdev, delta); + mlx5_clock_unlock(clock); - return mlx5_ptp_adjtime_real_time(mdev, delta); + return err; } static int mlx5_ptp_freq_adj_real_time(struct mlx5_core_dev *mdev, long scaled_ppm) @@ -547,15 +601,17 @@ static int mlx5_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) struct mlx5_timer *timer = &clock->timer; struct mlx5_core_dev *mdev; unsigned long flags; + int err = 0; u32 mult; + mlx5_clock_lock(clock); mdev = mlx5_clock_mdev_get(clock); if (mlx5_modify_mtutc_allowed(mdev)) { - int err = mlx5_ptp_freq_adj_real_time(mdev, scaled_ppm); + err = mlx5_ptp_freq_adj_real_time(mdev, scaled_ppm); if (err) - return err; + 
goto unlock; } mult = (u32)adjust_by_scaled_ppm(timer->nominal_c_mult, scaled_ppm); @@ -567,7 +623,9 @@ static int mlx5_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) write_sequnlock_irqrestore(&clock->lock, flags); ptp_schedule_worker(clock->ptp, timer->overflow_period); - return 0; +unlock: + mlx5_clock_unlock(clock); + return err; } static int mlx5_extts_configure(struct ptp_clock_info *ptp, @@ -576,17 +634,14 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); - struct mlx5_core_dev *mdev = mlx5_clock_mdev_get(clock); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + struct mlx5_core_dev *mdev; u32 field_select = 0; u8 pin_mode = 0; u8 pattern = 0; int pin = -1; int err = 0; - if (!MLX5_PPS_CAP(mdev)) - return -EOPNOTSUPP; - /* Reject requests with unsupported flags */ if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | PTP_RISING_EDGE | @@ -617,6 +672,14 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, field_select = MLX5_MTPPS_FS_ENABLE; } + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); + + if (!MLX5_PPS_CAP(mdev)) { + err = -EOPNOTSUPP; + goto unlock; + } + MLX5_SET(mtpps_reg, in, pin, pin); MLX5_SET(mtpps_reg, in, pin_mode, pin_mode); MLX5_SET(mtpps_reg, in, pattern, pattern); @@ -625,10 +688,13 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, err = mlx5_set_mtpps(mdev, in, sizeof(in)); if (err) - return err; + goto unlock; + + err = mlx5_set_mtppse(mdev, pin, 0, MLX5_EVENT_MODE_REPETETIVE & on); - return mlx5_set_mtppse(mdev, pin, 0, - MLX5_EVENT_MODE_REPETETIVE & on); +unlock: + mlx5_clock_unlock(clock); + return err; } static u64 find_target_cycles(struct mlx5_core_dev *mdev, s64 target_ns) @@ -760,25 +826,18 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, { struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); - struct mlx5_core_dev *mdev = mlx5_clock_mdev_get(clock); - bool rt_mode = 
mlx5_real_time_mode(mdev); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; u32 out_pulse_duration_ns = 0; + struct mlx5_core_dev *mdev; u32 field_select = 0; u64 npps_period = 0; u64 time_stamp = 0; u8 pin_mode = 0; u8 pattern = 0; + bool rt_mode; int pin = -1; int err = 0; - if (!MLX5_PPS_CAP(mdev)) - return -EOPNOTSUPP; - - /* Reject requests with unsupported flags */ - if (mlx5_perout_verify_flags(mdev, rq->perout.flags)) - return -EOPNOTSUPP; - if (rq->perout.index >= clock->ptp_info.n_pins) return -EINVAL; @@ -787,14 +846,29 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, if (pin < 0) return -EBUSY; - if (on) { - bool rt_mode = mlx5_real_time_mode(mdev); + mlx5_clock_lock(clock); + mdev = mlx5_clock_mdev_get(clock); + rt_mode = mlx5_real_time_mode(mdev); + + if (!MLX5_PPS_CAP(mdev)) { + err = -EOPNOTSUPP; + goto unlock; + } + + /* Reject requests with unsupported flags */ + if (mlx5_perout_verify_flags(mdev, rq->perout.flags)) { + err = -EOPNOTSUPP; + goto unlock; + } + if (on) { pin_mode = MLX5_PIN_MODE_OUT; pattern = MLX5_OUT_PATTERN_PERIODIC; - if (rt_mode && rq->perout.start.sec > U32_MAX) - return -EINVAL; + if (rt_mode && rq->perout.start.sec > U32_MAX) { + err = -EINVAL; + goto unlock; + } field_select |= MLX5_MTPPS_FS_PIN_MODE | MLX5_MTPPS_FS_PATTERN | @@ -807,7 +881,7 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, else err = perout_conf_1pps(mdev, rq, &time_stamp, rt_mode); if (err) - return err; + goto unlock; } MLX5_SET(mtpps_reg, in, pin, pin); @@ -820,13 +894,16 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, MLX5_SET(mtpps_reg, in, out_pulse_duration_ns, out_pulse_duration_ns); err = mlx5_set_mtpps(mdev, in, sizeof(in)); if (err) - return err; + goto unlock; if (rt_mode) - return 0; + goto unlock; - return mlx5_set_mtppse(mdev, pin, 0, - MLX5_EVENT_MODE_REPETETIVE & on); + err = mlx5_set_mtppse(mdev, pin, 0, MLX5_EVENT_MODE_REPETETIVE & on); + +unlock: + mlx5_clock_unlock(clock); + return err; } static int 
mlx5_pps_configure(struct ptp_clock_info *ptp, @@ -1043,6 +1120,10 @@ static int mlx5_pps_event(struct notifier_block *nb, ptp_clock_event(clock->ptp, &ptp_event); break; case PTP_PF_PEROUT: + if (clock->shared) { + mlx5_core_warn(mdev, " Received unexpected PPS out event\n"); + break; + } ns = perout_conf_next_event_timer(mdev, clock); write_seqlock_irqsave(&clock->lock, flags); clock->pps_info.start[pin] = ns; @@ -1201,9 +1282,10 @@ static void mlx5_init_clock_dev(struct mlx5_core_dev *mdev) mlx5_init_pps(mdev); clock->ptp = ptp_clock_register(&clock->ptp_info, - &mdev->pdev->dev); + clock->shared ? NULL : &mdev->pdev->dev); if (IS_ERR(clock->ptp)) { - mlx5_core_warn(mdev, "ptp_clock_register failed %ld\n", + mlx5_core_warn(mdev, "%sptp_clock_register failed %ld\n", + clock->shared ? "shared clock " : "", PTR_ERR(clock->ptp)); clock->ptp = NULL; } @@ -1234,11 +1316,12 @@ static void mlx5_clock_free(struct mlx5_core_dev *mdev) struct mlx5_clock_priv *cpriv = clock_priv(mdev->clock); mlx5_destroy_clock_dev(mdev); + mutex_destroy(&cpriv->lock); kfree(cpriv); mdev->clock = NULL; } -static int mlx5_clock_alloc(struct mlx5_core_dev *mdev) +static int mlx5_clock_alloc(struct mlx5_core_dev *mdev, bool shared) { struct mlx5_clock_priv *cpriv; struct mlx5_clock *clock; @@ -1247,23 +1330,90 @@ static int mlx5_clock_alloc(struct mlx5_core_dev *mdev) if (!cpriv) return -ENOMEM; + mutex_init(&cpriv->lock); cpriv->mdev = mdev; clock = &cpriv->clock; + clock->shared = shared; mdev->clock = clock; + mlx5_clock_lock(clock); mlx5_init_clock_dev(mdev); + mlx5_clock_unlock(clock); + + if (!clock->shared) + return 0; + + if (!clock->ptp) { + mlx5_core_warn(mdev, "failed to create ptp dev shared by multiple functions"); + mlx5_clock_free(mdev); + return -EINVAL; + } return 0; } static void mlx5_shared_clock_register(struct mlx5_core_dev *mdev, u64 key) { + struct mlx5_core_dev *peer_dev, *next = NULL; + struct mlx5_devcom_comp_dev *pos; + mdev->clock_state->compdev = 
mlx5_devcom_register_component(mdev->priv.devc, MLX5_DEVCOM_SHARED_CLOCK, key, NULL, mdev); + if (IS_ERR(mdev->clock_state->compdev)) + return; + + mlx5_devcom_comp_lock(mdev->clock_state->compdev); + mlx5_devcom_for_each_peer_entry(mdev->clock_state->compdev, peer_dev, pos) { + if (peer_dev->clock) { + next = peer_dev; + break; + } + } + + if (next) { + mdev->clock = next->clock; + /* clock info is shared among all the functions using the same clock */ + mdev->clock_info = next->clock_info; + } else { + mlx5_clock_alloc(mdev, true); + } + mlx5_devcom_comp_unlock(mdev->clock_state->compdev); + + if (!mdev->clock) { + mlx5_devcom_unregister_component(mdev->clock_state->compdev); + mdev->clock_state->compdev = NULL; + } } static void mlx5_shared_clock_unregister(struct mlx5_core_dev *mdev) { + struct mlx5_core_dev *peer_dev, *next = NULL; + struct mlx5_clock *clock = mdev->clock; + struct mlx5_devcom_comp_dev *pos; + + mlx5_devcom_comp_lock(mdev->clock_state->compdev); + mlx5_devcom_for_each_peer_entry(mdev->clock_state->compdev, peer_dev, pos) { + if (peer_dev->clock && peer_dev != mdev) { + next = peer_dev; + break; + } + } + + if (next) { + struct mlx5_clock_priv *cpriv = clock_priv(clock); + + mlx5_clock_lock(clock); + if (mdev == cpriv->mdev) + cpriv->mdev = next; + mlx5_clock_unlock(clock); + } else { + mlx5_clock_free(mdev); + } + + mdev->clock = NULL; + mdev->clock_info = NULL; + mlx5_devcom_comp_unlock(mdev->clock_state->compdev); + mlx5_devcom_unregister_component(mdev->clock_state->compdev); } @@ -1297,11 +1447,13 @@ int mlx5_init_clock(struct mlx5_core_dev *mdev) } } - err = mlx5_clock_alloc(mdev); - if (err) { - kfree(clock_state); - mdev->clock_state = NULL; - return err; + if (!mdev->clock) { + err = mlx5_clock_alloc(mdev, false); + if (err) { + kfree(clock_state); + mdev->clock_state = NULL; + return err; + } } INIT_WORK(&mdev->clock_state->out_work, mlx5_pps_out); @@ -1319,8 +1471,10 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) 
mlx5_eq_notifier_unregister(mdev, &mdev->clock_state->pps_nb); cancel_work_sync(&mdev->clock_state->out_work); - mlx5_clock_free(mdev); - mlx5_shared_clock_unregister(mdev); + if (mdev->clock->shared) + mlx5_shared_clock_unregister(mdev); + else + mlx5_clock_free(mdev); kfree(mdev->clock_state); mdev->clock_state = NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h index 3c5fee246582..093fa131014a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h @@ -58,6 +58,7 @@ struct mlx5_clock { struct ptp_clock_info ptp_info; struct mlx5_pps pps_info; struct mlx5_timer timer; + bool shared; }; static inline bool mlx5_is_real_time_rq(struct mlx5_core_dev *mdev) From 39c1202fa9428bcb8d1242ee12f81cbcb298c020 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:10 +0200 Subject: [PATCH 42/63] net/mlx5: Generate PPS IN event on new function for shared clock As a specific function (mdev) is chosen to send MTPPSE command to firmware, the event is generated only on that function. When that function is unloaded, the PPS event can't be forwarded to the PTP device, even when there are other functions in the group, and the PTP device is not destroyed. To resolve this problem, we need to send MTPPSE again from the new function, and disarm the event on the old function after that. PPS events are handled by EQ notifier. The async EQs and notifiers are destroyed in mlx5_eq_table_destroy() which is called before mlx5_cleanup_clock(). During the period between mlx5_eq_table_destroy() and mlx5_cleanup_clock(), the events can't be handled. To avoid event loss, add mlx5_clock_unload() in mlx5_unload() to arm the event on another available function, and mlx5_clock_load() in mlx5_load() for symmetry. 
Signed-off-by: Jianbo Liu Reviewed-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- .../ethernet/mellanox/mlx5/core/lib/clock.c | 97 +++++++++++++++++-- .../ethernet/mellanox/mlx5/core/lib/clock.h | 5 + .../net/ethernet/mellanox/mlx5/core/main.c | 4 + 3 files changed, 99 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 42df3a6fda93..65a94e46edcf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -90,6 +90,7 @@ struct mlx5_clock_priv { struct mlx5_clock clock; struct mlx5_core_dev *mdev; struct mutex lock; /* protect mdev and used in PTP callbacks */ + struct mlx5_core_dev *event_mdev; }; static struct mlx5_clock_priv *clock_priv(struct mlx5_clock *clock) @@ -691,6 +692,11 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, goto unlock; err = mlx5_set_mtppse(mdev, pin, 0, MLX5_EVENT_MODE_REPETETIVE & on); + if (err) + goto unlock; + + clock->pps_info.pin_armed[pin] = on; + clock_priv(clock)->event_mdev = mdev; unlock: mlx5_clock_unlock(clock); @@ -1417,6 +1423,90 @@ static void mlx5_shared_clock_unregister(struct mlx5_core_dev *mdev) mlx5_devcom_unregister_component(mdev->clock_state->compdev); } +static void mlx5_clock_arm_pps_in_event(struct mlx5_clock *clock, + struct mlx5_core_dev *new_mdev, + struct mlx5_core_dev *old_mdev) +{ + struct ptp_clock_info *ptp_info = &clock->ptp_info; + struct mlx5_clock_priv *cpriv = clock_priv(clock); + int i; + + for (i = 0; i < ptp_info->n_pins; i++) { + if (ptp_info->pin_config[i].func != PTP_PF_EXTTS || + !clock->pps_info.pin_armed[i]) + continue; + + if (new_mdev) { + mlx5_set_mtppse(new_mdev, i, 0, MLX5_EVENT_MODE_REPETETIVE); + cpriv->event_mdev = new_mdev; + } else { + cpriv->event_mdev = NULL; + } + + if (old_mdev) + mlx5_set_mtppse(old_mdev, i, 0, MLX5_EVENT_MODE_DISABLE); + 
} +} + +void mlx5_clock_load(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = mdev->clock; + struct mlx5_clock_priv *cpriv; + + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) + return; + + INIT_WORK(&mdev->clock_state->out_work, mlx5_pps_out); + MLX5_NB_INIT(&mdev->clock_state->pps_nb, mlx5_pps_event, PPS_EVENT); + mlx5_eq_notifier_register(mdev, &mdev->clock_state->pps_nb); + + if (!clock->shared) { + mlx5_clock_arm_pps_in_event(clock, mdev, NULL); + return; + } + + cpriv = clock_priv(clock); + mlx5_devcom_comp_lock(mdev->clock_state->compdev); + mlx5_clock_lock(clock); + if (mdev == cpriv->mdev && mdev != cpriv->event_mdev) + mlx5_clock_arm_pps_in_event(clock, mdev, cpriv->event_mdev); + mlx5_clock_unlock(clock); + mlx5_devcom_comp_unlock(mdev->clock_state->compdev); +} + +void mlx5_clock_unload(struct mlx5_core_dev *mdev) +{ + struct mlx5_core_dev *peer_dev, *next = NULL; + struct mlx5_clock *clock = mdev->clock; + struct mlx5_devcom_comp_dev *pos; + + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) + return; + + if (!clock->shared) { + mlx5_clock_arm_pps_in_event(clock, NULL, mdev); + goto out; + } + + mlx5_devcom_comp_lock(mdev->clock_state->compdev); + mlx5_devcom_for_each_peer_entry(mdev->clock_state->compdev, peer_dev, pos) { + if (peer_dev->clock && peer_dev != mdev) { + next = peer_dev; + break; + } + } + + mlx5_clock_lock(clock); + if (mdev == clock_priv(clock)->event_mdev) + mlx5_clock_arm_pps_in_event(clock, next, mdev); + mlx5_clock_unlock(clock); + mlx5_devcom_comp_unlock(mdev->clock_state->compdev); + +out: + mlx5_eq_notifier_unregister(mdev, &mdev->clock_state->pps_nb); + cancel_work_sync(&mdev->clock_state->out_work); +} + static struct mlx5_clock null_clock; int mlx5_init_clock(struct mlx5_core_dev *mdev) @@ -1456,10 +1546,6 @@ int mlx5_init_clock(struct mlx5_core_dev *mdev) } } - INIT_WORK(&mdev->clock_state->out_work, mlx5_pps_out); - MLX5_NB_INIT(&mdev->clock_state->pps_nb, mlx5_pps_event, PPS_EVENT); - 
mlx5_eq_notifier_register(mdev, &mdev->clock_state->pps_nb); - return 0; } @@ -1468,9 +1554,6 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) return; - mlx5_eq_notifier_unregister(mdev, &mdev->clock_state->pps_nb); - cancel_work_sync(&mdev->clock_state->out_work); - if (mdev->clock->shared) mlx5_shared_clock_unregister(mdev); else diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h index 093fa131014a..c18a652c0faa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h @@ -42,6 +42,7 @@ struct mlx5_pps { u8 enabled; u64 min_npps_period; u64 min_out_pulse_duration_ns; + bool pin_armed[MAX_PIN_NUM]; }; struct mlx5_timer { @@ -84,6 +85,8 @@ typedef ktime_t (*cqe_ts_to_ns)(struct mlx5_clock *, u64); #if IS_ENABLED(CONFIG_PTP_1588_CLOCK) int mlx5_init_clock(struct mlx5_core_dev *mdev); void mlx5_cleanup_clock(struct mlx5_core_dev *mdev); +void mlx5_clock_load(struct mlx5_core_dev *mdev); +void mlx5_clock_unload(struct mlx5_core_dev *mdev); static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev) { @@ -117,6 +120,8 @@ static inline ktime_t mlx5_real_time_cyc2time(struct mlx5_clock *clock, #else static inline int mlx5_init_clock(struct mlx5_core_dev *mdev) { return 0; } static inline void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) {} +static inline void mlx5_clock_load(struct mlx5_core_dev *mdev) {} +static inline void mlx5_clock_unload(struct mlx5_core_dev *mdev) {} static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev) { return -1; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 996773521aee..710633d5fdbe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1364,6 +1364,8 @@ static int mlx5_load(struct mlx5_core_dev *dev) 
goto err_eq_table; } + mlx5_clock_load(dev); + err = mlx5_fw_tracer_init(dev->tracer); if (err) { mlx5_core_err(dev, "Failed to init FW tracer %d\n", err); @@ -1447,6 +1449,7 @@ static int mlx5_load(struct mlx5_core_dev *dev) mlx5_hv_vhca_cleanup(dev->hv_vhca); mlx5_fw_reset_events_stop(dev); mlx5_fw_tracer_cleanup(dev->tracer); + mlx5_clock_unload(dev); mlx5_eq_table_destroy(dev); err_eq_table: mlx5_irq_table_destroy(dev); @@ -1473,6 +1476,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev) mlx5_hv_vhca_cleanup(dev->hv_vhca); mlx5_fw_reset_events_stop(dev); mlx5_fw_tracer_cleanup(dev->tracer); + mlx5_clock_unload(dev); mlx5_eq_table_destroy(dev); mlx5_irq_table_destroy(dev); mlx5_pagealloc_stop(dev); From 4897f9b7f8bdcf93b8d3b466321fa00bb6d2e600 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:11 +0200 Subject: [PATCH 43/63] ethtool: Add support for 200Gbps per lane link modes Define 200G, 400G and 800G link modes using 200Gbps per lane. Signed-off-by: Jianbo Liu Reviewed-by: Shahar Shitrit Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- drivers/net/phy/phy-core.c | 20 ++++++++++++++++- include/uapi/linux/ethtool.h | 18 ++++++++++++++++ net/ethtool/common.c | 42 ++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 6bf3ec985f3d..f181f05cb429 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -13,7 +13,7 @@ */ const char *phy_speed_to_str(int speed) { - BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 103, + BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 121, "Enum ethtool_link_mode_bit_indices and phylib are out of sync. 
" "If a speed or mode has been added please update phy_speed_to_str " "and the PHY settings array.\n"); @@ -169,6 +169,12 @@ static const struct phy_setting settings[] = { PHY_SETTING( 800000, FULL, 800000baseDR8_2_Full ), PHY_SETTING( 800000, FULL, 800000baseSR8_Full ), PHY_SETTING( 800000, FULL, 800000baseVR8_Full ), + PHY_SETTING( 800000, FULL, 800000baseCR4_Full ), + PHY_SETTING( 800000, FULL, 800000baseKR4_Full ), + PHY_SETTING( 800000, FULL, 800000baseDR4_Full ), + PHY_SETTING( 800000, FULL, 800000baseDR4_2_Full ), + PHY_SETTING( 800000, FULL, 800000baseSR4_Full ), + PHY_SETTING( 800000, FULL, 800000baseVR4_Full ), /* 400G */ PHY_SETTING( 400000, FULL, 400000baseCR8_Full ), PHY_SETTING( 400000, FULL, 400000baseKR8_Full ), @@ -180,6 +186,12 @@ static const struct phy_setting settings[] = { PHY_SETTING( 400000, FULL, 400000baseLR4_ER4_FR4_Full ), PHY_SETTING( 400000, FULL, 400000baseDR4_Full ), PHY_SETTING( 400000, FULL, 400000baseSR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseCR2_Full ), + PHY_SETTING( 400000, FULL, 400000baseKR2_Full ), + PHY_SETTING( 400000, FULL, 400000baseDR2_Full ), + PHY_SETTING( 400000, FULL, 400000baseDR2_2_Full ), + PHY_SETTING( 400000, FULL, 400000baseSR2_Full ), + PHY_SETTING( 400000, FULL, 400000baseVR2_Full ), /* 200G */ PHY_SETTING( 200000, FULL, 200000baseCR4_Full ), PHY_SETTING( 200000, FULL, 200000baseKR4_Full ), @@ -191,6 +203,12 @@ static const struct phy_setting settings[] = { PHY_SETTING( 200000, FULL, 200000baseLR2_ER2_FR2_Full ), PHY_SETTING( 200000, FULL, 200000baseDR2_Full ), PHY_SETTING( 200000, FULL, 200000baseSR2_Full ), + PHY_SETTING( 200000, FULL, 200000baseCR_Full ), + PHY_SETTING( 200000, FULL, 200000baseKR_Full ), + PHY_SETTING( 200000, FULL, 200000baseDR_Full ), + PHY_SETTING( 200000, FULL, 200000baseDR_2_Full ), + PHY_SETTING( 200000, FULL, 200000baseSR_Full ), + PHY_SETTING( 200000, FULL, 200000baseVR_Full ), /* 100G */ PHY_SETTING( 100000, FULL, 100000baseCR4_Full ), PHY_SETTING( 100000, FULL, 
100000baseKR4_Full ), diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index d1089b88efc7..e0bd726f84c1 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2057,6 +2057,24 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_10baseT1S_Half_BIT = 100, ETHTOOL_LINK_MODE_10baseT1S_P2MP_Half_BIT = 101, ETHTOOL_LINK_MODE_10baseT1BRR_Full_BIT = 102, + ETHTOOL_LINK_MODE_200000baseCR_Full_BIT = 103, + ETHTOOL_LINK_MODE_200000baseKR_Full_BIT = 104, + ETHTOOL_LINK_MODE_200000baseDR_Full_BIT = 105, + ETHTOOL_LINK_MODE_200000baseDR_2_Full_BIT = 106, + ETHTOOL_LINK_MODE_200000baseSR_Full_BIT = 107, + ETHTOOL_LINK_MODE_200000baseVR_Full_BIT = 108, + ETHTOOL_LINK_MODE_400000baseCR2_Full_BIT = 109, + ETHTOOL_LINK_MODE_400000baseKR2_Full_BIT = 110, + ETHTOOL_LINK_MODE_400000baseDR2_Full_BIT = 111, + ETHTOOL_LINK_MODE_400000baseDR2_2_Full_BIT = 112, + ETHTOOL_LINK_MODE_400000baseSR2_Full_BIT = 113, + ETHTOOL_LINK_MODE_400000baseVR2_Full_BIT = 114, + ETHTOOL_LINK_MODE_800000baseCR4_Full_BIT = 115, + ETHTOOL_LINK_MODE_800000baseKR4_Full_BIT = 116, + ETHTOOL_LINK_MODE_800000baseDR4_Full_BIT = 117, + ETHTOOL_LINK_MODE_800000baseDR4_2_Full_BIT = 118, + ETHTOOL_LINK_MODE_800000baseSR4_Full_BIT = 119, + ETHTOOL_LINK_MODE_800000baseVR4_Full_BIT = 120, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 2bd77c94f9f1..5489d0c9d13f 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -213,6 +213,24 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(10, T1S, Half), __DEFINE_LINK_MODE_NAME(10, T1S_P2MP, Half), __DEFINE_LINK_MODE_NAME(10, T1BRR, Full), + __DEFINE_LINK_MODE_NAME(200000, CR, Full), + __DEFINE_LINK_MODE_NAME(200000, KR, Full), + __DEFINE_LINK_MODE_NAME(200000, DR, Full), + __DEFINE_LINK_MODE_NAME(200000, DR_2, Full), + __DEFINE_LINK_MODE_NAME(200000, SR, Full), + __DEFINE_LINK_MODE_NAME(200000, VR, Full), + 
__DEFINE_LINK_MODE_NAME(400000, CR2, Full), + __DEFINE_LINK_MODE_NAME(400000, KR2, Full), + __DEFINE_LINK_MODE_NAME(400000, DR2, Full), + __DEFINE_LINK_MODE_NAME(400000, DR2_2, Full), + __DEFINE_LINK_MODE_NAME(400000, SR2, Full), + __DEFINE_LINK_MODE_NAME(400000, VR2, Full), + __DEFINE_LINK_MODE_NAME(800000, CR4, Full), + __DEFINE_LINK_MODE_NAME(800000, KR4, Full), + __DEFINE_LINK_MODE_NAME(800000, DR4, Full), + __DEFINE_LINK_MODE_NAME(800000, DR4_2, Full), + __DEFINE_LINK_MODE_NAME(800000, SR4, Full), + __DEFINE_LINK_MODE_NAME(800000, VR4, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -221,8 +239,11 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_CR4 4 #define __LINK_MODE_LANES_CR8 8 #define __LINK_MODE_LANES_DR 1 +#define __LINK_MODE_LANES_DR_2 1 #define __LINK_MODE_LANES_DR2 2 +#define __LINK_MODE_LANES_DR2_2 2 #define __LINK_MODE_LANES_DR4 4 +#define __LINK_MODE_LANES_DR4_2 4 #define __LINK_MODE_LANES_DR8 8 #define __LINK_MODE_LANES_KR 1 #define __LINK_MODE_LANES_KR2 2 @@ -251,6 +272,9 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_T1L 1 #define __LINK_MODE_LANES_T1S 1 #define __LINK_MODE_LANES_T1S_P2MP 1 +#define __LINK_MODE_LANES_VR 1 +#define __LINK_MODE_LANES_VR2 2 +#define __LINK_MODE_LANES_VR4 4 #define __LINK_MODE_LANES_VR8 8 #define __LINK_MODE_LANES_DR8_2 8 #define __LINK_MODE_LANES_T1BRR 1 @@ -378,6 +402,24 @@ const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(10, T1S, Half), __DEFINE_LINK_MODE_PARAMS(10, T1S_P2MP, Half), __DEFINE_LINK_MODE_PARAMS(10, T1BRR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR_2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, VR, Full), + 
__DEFINE_LINK_MODE_PARAMS(400000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR2_2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, VR2, Full), + __DEFINE_LINK_MODE_PARAMS(800000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full), + __DEFINE_LINK_MODE_PARAMS(800000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, VR4, Full), }; static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); From ee0a4fc396f1b6fd1b34e99754896961fb67e4e3 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:12 +0200 Subject: [PATCH 44/63] net/mlx5: Add support for 200Gbps per lane link modes This patch exposes new link modes using 200Gbps per lane, including 200G, 400G and 800G modes. Signed-off-by: Jianbo Liu Reviewed-by: Shahar Shitrit Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- .../ethernet/mellanox/mlx5/core/en_ethtool.c | 21 +++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/port.c | 3 +++ include/linux/mlx5/port.h | 3 +++ 3 files changed, 27 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index cae39198b4db..9c5fcc699515 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -237,6 +237,27 @@ void mlx5e_build_ptys2ethtool_map(void) ETHTOOL_LINK_MODE_800000baseDR8_2_Full_BIT, ETHTOOL_LINK_MODE_800000baseSR8_Full_BIT, ETHTOOL_LINK_MODE_800000baseVR8_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_200GAUI_1_200GBASE_CR1_KR1, ext, + ETHTOOL_LINK_MODE_200000baseCR_Full_BIT, + ETHTOOL_LINK_MODE_200000baseKR_Full_BIT, + ETHTOOL_LINK_MODE_200000baseDR_Full_BIT, + ETHTOOL_LINK_MODE_200000baseDR_2_Full_BIT, + 
ETHTOOL_LINK_MODE_200000baseSR_Full_BIT, + ETHTOOL_LINK_MODE_200000baseVR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_400GAUI_2_400GBASE_CR2_KR2, ext, + ETHTOOL_LINK_MODE_400000baseCR2_Full_BIT, + ETHTOOL_LINK_MODE_400000baseKR2_Full_BIT, + ETHTOOL_LINK_MODE_400000baseDR2_Full_BIT, + ETHTOOL_LINK_MODE_400000baseDR2_2_Full_BIT, + ETHTOOL_LINK_MODE_400000baseSR2_Full_BIT, + ETHTOOL_LINK_MODE_400000baseVR2_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_800GAUI_4_800GBASE_CR4_KR4, ext, + ETHTOOL_LINK_MODE_800000baseCR4_Full_BIT, + ETHTOOL_LINK_MODE_800000baseKR4_Full_BIT, + ETHTOOL_LINK_MODE_800000baseDR4_Full_BIT, + ETHTOOL_LINK_MODE_800000baseDR4_2_Full_BIT, + ETHTOOL_LINK_MODE_800000baseSR4_Full_BIT, + ETHTOOL_LINK_MODE_800000baseVR4_Full_BIT); } static void mlx5e_ethtool_get_speed_arr(struct mlx5_core_dev *mdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 50931584132b..3995df064101 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -1105,6 +1105,9 @@ static const u32 mlx5e_ext_link_speed[MLX5E_EXT_LINK_MODES_NUMBER] = { [MLX5E_200GAUI_2_200GBASE_CR2_KR2] = 200000, [MLX5E_400GAUI_4_400GBASE_CR4_KR4] = 400000, [MLX5E_800GAUI_8_800GBASE_CR8_KR8] = 800000, + [MLX5E_200GAUI_1_200GBASE_CR1_KR1] = 200000, + [MLX5E_400GAUI_2_400GBASE_CR2_KR2] = 400000, + [MLX5E_800GAUI_4_800GBASE_CR4_KR4] = 800000, }; int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext, diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index e68d42b8ce65..fd625e0dd869 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -115,9 +115,12 @@ enum mlx5e_ext_link_mode { MLX5E_100GAUI_1_100GBASE_CR_KR = 11, MLX5E_200GAUI_4_200GBASE_CR4_KR4 = 12, MLX5E_200GAUI_2_200GBASE_CR2_KR2 = 13, + MLX5E_200GAUI_1_200GBASE_CR1_KR1 = 14, MLX5E_400GAUI_8_400GBASE_CR8 = 15, MLX5E_400GAUI_4_400GBASE_CR4_KR4 = 16, + 
MLX5E_400GAUI_2_400GBASE_CR2_KR2 = 17, MLX5E_800GAUI_8_800GBASE_CR8_KR8 = 19, + MLX5E_800GAUI_4_800GBASE_CR4_KR4 = 20, MLX5E_EXT_LINK_MODES_NUMBER, }; From 4e343c11efbbc9da8ac6e1a2f23704c9313e056d Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:13 +0200 Subject: [PATCH 45/63] net/mlx5e: Support FEC settings for 200G per lane link modes Add support to show and config FEC by ethtool for 200G/lane link modes. The RS encoding setting is mapped, and can be overridden to FEC_RS_544_514_INTERLEAVED_QUAD for these modes. Signed-off-by: Jianbo Liu Reviewed-by: Shahar Shitrit Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- .../net/ethernet/mellanox/mlx5/core/en/port.c | 64 ++++++++++++++++--- .../net/ethernet/mellanox/mlx5/core/en/port.h | 1 + .../ethernet/mellanox/mlx5/core/en_ethtool.c | 1 + 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c index 5f6a0605e4ae..f62fbfb67a1b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c @@ -296,11 +296,16 @@ enum mlx5e_fec_supported_link_mode { MLX5E_FEC_SUPPORTED_LINK_MODE_200G_2X, MLX5E_FEC_SUPPORTED_LINK_MODE_400G_4X, MLX5E_FEC_SUPPORTED_LINK_MODE_800G_8X, + MLX5E_FEC_SUPPORTED_LINK_MODE_200G_1X, + MLX5E_FEC_SUPPORTED_LINK_MODE_400G_2X, + MLX5E_FEC_SUPPORTED_LINK_MODE_800G_4X, + MLX5E_FEC_SUPPORTED_LINK_MODE_1600G_8X, MLX5E_MAX_FEC_SUPPORTED_LINK_MODE, }; #define MLX5E_FEC_FIRST_50G_PER_LANE_MODE MLX5E_FEC_SUPPORTED_LINK_MODE_50G_1X #define MLX5E_FEC_FIRST_100G_PER_LANE_MODE MLX5E_FEC_SUPPORTED_LINK_MODE_100G_1X +#define MLX5E_FEC_FIRST_200G_PER_LANE_MODE MLX5E_FEC_SUPPORTED_LINK_MODE_200G_1X #define MLX5E_FEC_OVERRIDE_ADMIN_POLICY(buf, policy, write, link) \ do { \ @@ -320,8 +325,10 @@ static bool mlx5e_is_fec_supported_link_mode(struct mlx5_core_dev *dev, return link_mode < MLX5E_FEC_FIRST_50G_PER_LANE_MODE || 
(link_mode < MLX5E_FEC_FIRST_100G_PER_LANE_MODE && MLX5_CAP_PCAM_FEATURE(dev, fec_50G_per_lane_in_pplm)) || - (link_mode >= MLX5E_FEC_FIRST_100G_PER_LANE_MODE && - MLX5_CAP_PCAM_FEATURE(dev, fec_100G_per_lane_in_pplm)); + (link_mode < MLX5E_FEC_FIRST_200G_PER_LANE_MODE && + MLX5_CAP_PCAM_FEATURE(dev, fec_100G_per_lane_in_pplm)) || + (link_mode >= MLX5E_FEC_FIRST_200G_PER_LANE_MODE && + MLX5_CAP_PCAM_FEATURE(dev, fec_200G_per_lane_in_pplm)); } /* get/set FEC admin field for a given speed */ @@ -368,6 +375,18 @@ static int mlx5e_fec_admin_field(u32 *pplm, u16 *fec_policy, bool write, case MLX5E_FEC_SUPPORTED_LINK_MODE_800G_8X: MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 800g_8x); break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_200G_1X: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 200g_1x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_400G_2X: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 400g_2x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_800G_4X: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 800g_4x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_1600G_8X: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 1600g_8x); + break; default: return -EINVAL; } @@ -421,6 +440,18 @@ static int mlx5e_get_fec_cap_field(u32 *pplm, u16 *fec_cap, case MLX5E_FEC_SUPPORTED_LINK_MODE_800G_8X: *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 800g_8x); break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_200G_1X: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 200g_1x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_400G_2X: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 400g_2x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_800G_4X: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 800g_4x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_1600G_8X: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 1600g_8x); + break; default: return -EINVAL; } @@ -494,6 +525,26 @@ int mlx5e_get_fec_mode(struct mlx5_core_dev *dev, u32 *fec_mode_active, return 0; 
} +static u16 mlx5e_remap_fec_conf_mode(enum mlx5e_fec_supported_link_mode link_mode, + u16 conf_fec) +{ + /* RS fec in ethtool is originally mapped to MLX5E_FEC_RS_528_514. + * For link modes up to 25G per lane, the value is kept. + * For 50G or 100G per lane, it's remapped to MLX5E_FEC_RS_544_514. + * For 200G per lane, remapped to MLX5E_FEC_RS_544_514_INTERLEAVED_QUAD. + */ + if (conf_fec != BIT(MLX5E_FEC_RS_528_514)) + return conf_fec; + + if (link_mode >= MLX5E_FEC_FIRST_200G_PER_LANE_MODE) + return BIT(MLX5E_FEC_RS_544_514_INTERLEAVED_QUAD); + + if (link_mode >= MLX5E_FEC_FIRST_50G_PER_LANE_MODE) + return BIT(MLX5E_FEC_RS_544_514); + + return conf_fec; +} + int mlx5e_set_fec_mode(struct mlx5_core_dev *dev, u16 fec_policy) { bool fec_50g_per_lane = MLX5_CAP_PCAM_FEATURE(dev, fec_50G_per_lane_in_pplm); @@ -530,14 +581,7 @@ int mlx5e_set_fec_mode(struct mlx5_core_dev *dev, u16 fec_policy) if (!mlx5e_is_fec_supported_link_mode(dev, i)) break; - /* RS fec in ethtool is mapped to MLX5E_FEC_RS_528_514 - * to link modes up to 25G per lane and to - * MLX5E_FEC_RS_544_514 in the new link modes based on - * 50G or 100G per lane - */ - if (conf_fec == (1 << MLX5E_FEC_RS_528_514) && - i >= MLX5E_FEC_FIRST_50G_PER_LANE_MODE) - conf_fec = (1 << MLX5E_FEC_RS_544_514); + conf_fec = mlx5e_remap_fec_conf_mode(i, conf_fec); mlx5e_get_fec_cap_field(out, &fec_caps, i); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h index d1da225f35da..fa2283dd383b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h @@ -61,6 +61,7 @@ enum { MLX5E_FEC_NOFEC, MLX5E_FEC_FIRECODE, MLX5E_FEC_RS_528_514, + MLX5E_FEC_RS_544_514_INTERLEAVED_QUAD = 4, MLX5E_FEC_RS_544_514 = 7, MLX5E_FEC_LLRS_272_257_1 = 9, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 9c5fcc699515..f9113cb13a0c 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -952,6 +952,7 @@ static const u32 pplm_fec_2_ethtool[] = { [MLX5E_FEC_RS_528_514] = ETHTOOL_FEC_RS, [MLX5E_FEC_RS_544_514] = ETHTOOL_FEC_RS, [MLX5E_FEC_LLRS_272_257_1] = ETHTOOL_FEC_LLRS, + [MLX5E_FEC_RS_544_514_INTERLEAVED_QUAD] = ETHTOOL_FEC_RS, }; static u32 pplm2ethtool_fec(u_long fec_mode, unsigned long size) From 6fa15a20b7c34e6558dddd7a63494a68058225ef Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 3 Feb 2025 23:35:14 +0200 Subject: [PATCH 46/63] net/mlx5: Remove stray semicolon in LAG port selection table creation Remove the stray semicolon in the mlx5_ldev_for_each_reverse() loop. Signed-off-by: Gal Pressman Signed-off-by: Tariq Toukan Reviewed-by: Kalesh AP Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c index bde79cac33a9..d832a12ffec0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c @@ -97,7 +97,7 @@ static int mlx5_lag_create_port_sel_table(struct mlx5_lag *ldev, mlx5_del_flow_rules(lag_definer->rules[idx]); } j = ldev->buckets; - }; + } goto destroy_fg; } } From 96d64a1ab79516d6e19aa2c0d42b02251d8694ce Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 3 Feb 2025 23:35:15 +0200 Subject: [PATCH 47/63] net/mlx5e: Remove unused mlx5e_tc_flow_action struct Commit 67efaf45930d ("net/mlx5e: TC, Remove CT action reordering") removed the usage of mlx5e_tc_flow_action struct, remove the struct as well. 
Signed-off-by: Gal Pressman Reviewed-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Reviewed-by: Kalesh AP Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h index d6c12d0ea55b..2e528b2c34d6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h @@ -73,11 +73,6 @@ struct mlx5e_tc_act { bool is_terminating_action; }; -struct mlx5e_tc_flow_action { - unsigned int num_entries; - struct flow_action_entry **entries; -}; - extern struct mlx5e_tc_act mlx5e_tc_act_drop; extern struct mlx5e_tc_act mlx5e_tc_act_trap; extern struct mlx5e_tc_act mlx5e_tc_act_accept; From 689805dcc474c2accb5cffbbcea1c06ee4a54570 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Mon, 3 Feb 2025 23:35:16 +0200 Subject: [PATCH 48/63] net/mlx5e: Avoid WARN_ON when configuring MQPRIO with HTB offload enabled When attempting to enable MQPRIO while HTB offload is already configured, the driver currently returns `-EINVAL` and triggers a `WARN_ON`, leading to an unnecessary call trace. Update the code to handle this case more gracefully by returning `-EOPNOTSUPP` instead, while also providing a helpful user message. 
Signed-off-by: Carolina Jubran Reviewed-by: Yael Chemla Reviewed-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Reviewed-by: Kalesh AP Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c754e0c75934..2fdc86432ac0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3816,8 +3816,11 @@ static int mlx5e_setup_tc_mqprio(struct mlx5e_priv *priv, /* MQPRIO is another toplevel qdisc that can't be attached * simultaneously with the offloaded HTB. */ - if (WARN_ON(mlx5e_selq_is_htb_enabled(&priv->selq))) - return -EINVAL; + if (mlx5e_selq_is_htb_enabled(&priv->selq)) { + NL_SET_ERR_MSG_MOD(mqprio->extack, + "MQPRIO cannot be configured when HTB offload is enabled."); + return -EOPNOTSUPP; + } switch (mqprio->mode) { case TC_MQPRIO_MODE_DCB: From 7e8b24e24ac46038e48c9a042e7d9b31855cbca5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 3 Feb 2025 13:55:09 -0800 Subject: [PATCH 49/63] tools: ynl-gen: don't output external constants A definition with a "header" property is an "external" definition for C code, as in it is defined already in another C header file. Other languages will need the exact value but C codegen should not recreate it. So don't output those definitions in the uAPI header. 
Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250203215510.1288728-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/net/ynl/pyynl/ynl_gen_c.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index c2eabc90dce8..aa08b8b1463d 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -2549,6 +2549,9 @@ def render_uapi(family, cw): defines = [] for const in family['definitions']: + if const.get('header'): + continue + if const['type'] != 'const': cw.writes_defines(defines) defines = [] From fa796178e5eb0078a9a6c36f60fd6494cfc3f81d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 3 Feb 2025 13:55:10 -0800 Subject: [PATCH 50/63] tools: ynl-gen: support limits using definitions Support using defines / constants in integer checks. Carolina will need this for rate API extensions. Reported-by: Carolina Jubran Link: https://lore.kernel.org/1e886aaf-e1eb-4f1a-b7ef-f63b350a3320@nvidia.com Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250203215510.1288728-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/netlink/genetlink-c.yaml | 5 +++-- Documentation/netlink/genetlink-legacy.yaml | 5 +++-- Documentation/netlink/genetlink.yaml | 5 +++-- tools/net/ynl/pyynl/ynl_gen_c.py | 5 ++++- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/Documentation/netlink/genetlink-c.yaml b/Documentation/netlink/genetlink-c.yaml index 9660ffb1ed6a..44f2226160ca 100644 --- a/Documentation/netlink/genetlink-c.yaml +++ b/Documentation/netlink/genetlink-c.yaml @@ -14,9 +14,10 @@ $defs: pattern: ^[0-9A-Za-z_-]+( - 1)?$ minimum: 0 len-or-limit: - # literal int or limit based on fixed-width type e.g. u8-min, u16-max, etc. + # literal int, const name, or limit based on fixed-width type + # e.g. u8-min, u16-max, etc. 
type: [ string, integer ] - pattern: ^[su](8|16|32|64)-(min|max)$ + pattern: ^[0-9A-Za-z_-]+$ minimum: 0 # Schema for specs diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml index 16380e12cabe..ed64acf1bef7 100644 --- a/Documentation/netlink/genetlink-legacy.yaml +++ b/Documentation/netlink/genetlink-legacy.yaml @@ -14,9 +14,10 @@ $defs: pattern: ^[0-9A-Za-z_-]+( - 1)?$ minimum: 0 len-or-limit: - # literal int or limit based on fixed-width type e.g. u8-min, u16-max, etc. + # literal int, const name, or limit based on fixed-width type + # e.g. u8-min, u16-max, etc. type: [ string, integer ] - pattern: ^[su](8|16|32|64)-(min|max)$ + pattern: ^[0-9A-Za-z_-]+$ minimum: 0 # Schema for specs diff --git a/Documentation/netlink/genetlink.yaml b/Documentation/netlink/genetlink.yaml index b036227b46f1..e43e50dba2e4 100644 --- a/Documentation/netlink/genetlink.yaml +++ b/Documentation/netlink/genetlink.yaml @@ -14,9 +14,10 @@ $defs: pattern: ^[0-9A-Za-z_-]+( - 1)?$ minimum: 0 len-or-limit: - # literal int or limit based on fixed-width type e.g. u8-min, u16-max, etc. + # literal int, const name, or limit based on fixed-width type + # e.g. u8-min, u16-max, etc. 
type: [ string, integer ] - pattern: ^[su](8|16|32|64)-(min|max)$ + pattern: ^[0-9A-Za-z_-]+$ minimum: 0 # Schema for specs diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index aa08b8b1463d..b22082fd660e 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -100,7 +100,7 @@ def get_limit(self, limit, default=None): if isinstance(value, int): return value if value in self.family.consts: - raise Exception("Resolving family constants not implemented, yet") + return self.family.consts[value]["value"] return limit_to_number(value) def get_limit_str(self, limit, default=None, suffix=''): @@ -110,6 +110,9 @@ def get_limit_str(self, limit, default=None, suffix=''): if isinstance(value, int): return str(value) + suffix if value in self.family.consts: + const = self.family.consts[value] + if const.get('header'): + return c_upper(value) return c_upper(f"{self.family['name']}-{value}") return c_upper(value) From 79c0c4689bdf5d0032275f40c8fffe257235a679 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 3 Feb 2025 17:00:37 -0800 Subject: [PATCH 51/63] eth: fbnic: add MAC address TCAM to debugfs Add read only access to the 32-entry MAC address TCAM via debugfs. BMC filtering shares the same table so this is quite useful to access during debug. See next commit for an example output. 
Signed-off-by: Alexander Duyck Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250204010038.1404268-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- .../net/ethernet/meta/fbnic/fbnic_debugfs.c | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_debugfs.c b/drivers/net/ethernet/meta/fbnic/fbnic_debugfs.c index 59951b5abdb7..ac80981f67c0 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_debugfs.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_debugfs.c @@ -10,6 +10,40 @@ static struct dentry *fbnic_dbg_root; +static void fbnic_dbg_desc_break(struct seq_file *s, int i) +{ + while (i--) + seq_putc(s, '-'); + + seq_putc(s, '\n'); +} + +static int fbnic_dbg_mac_addr_show(struct seq_file *s, void *v) +{ + struct fbnic_dev *fbd = s->private; + char hdr[80]; + int i; + + /* Generate Header */ + snprintf(hdr, sizeof(hdr), "%3s %s %-17s %s\n", + "Idx", "S", "TCAM Bitmap", "Addr/Mask"); + seq_puts(s, hdr); + fbnic_dbg_desc_break(s, strnlen(hdr, sizeof(hdr))); + + for (i = 0; i < FBNIC_RPC_TCAM_MACDA_NUM_ENTRIES; i++) { + struct fbnic_mac_addr *mac_addr = &fbd->mac_addr[i]; + + seq_printf(s, "%02d %d %64pb %pm\n", + i, mac_addr->state, mac_addr->act_tcam, + mac_addr->value.addr8); + seq_printf(s, " %pm\n", + mac_addr->mask.addr8); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(fbnic_dbg_mac_addr); + static int fbnic_dbg_pcie_stats_show(struct seq_file *s, void *v) { struct fbnic_dev *fbd = s->private; @@ -48,6 +82,8 @@ void fbnic_dbg_fbd_init(struct fbnic_dev *fbd) fbd->dbg_fbd = debugfs_create_dir(name, fbnic_dbg_root); debugfs_create_file("pcie_stats", 0400, fbd->dbg_fbd, fbd, &fbnic_dbg_pcie_stats_fops); + debugfs_create_file("mac_addr", 0400, fbd->dbg_fbd, fbd, + &fbnic_dbg_mac_addr_fops); } void fbnic_dbg_fbd_exit(struct fbnic_dev *fbd) From 09717c28b76c30b1dc8c261c855ffb2406abab2e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 3 Feb 2025 17:00:38 -0800 Subject: [PATCH 
52/63] eth: fbnic: set IFF_UNICAST_FLT to avoid enabling promiscuous mode when adding unicast addrs I realized when we were adding unicast addresses we were enabling promiscuous mode. I did a bit of digging and realized we had overlooked setting the driver private flag to indicate we supported unicast filtering. Example below shows the table with 00deadbeef01 as the main NIC address, and 5 additional addresses in the 00deadbeefX0 format. # cat $dbgfs/mac_addr Idx S TCAM Bitmap Addr/Mask ---------------------------------- 00 0 00000000,00000000 000000000000 000000000000 01 0 00000000,00000000 000000000000 000000000000 02 0 00000000,00000000 000000000000 000000000000 ... 24 0 00000000,00000000 000000000000 000000000000 25 1 00100000,00000000 00deadbeef50 000000000000 26 1 00100000,00000000 00deadbeef40 000000000000 27 1 00100000,00000000 00deadbeef30 000000000000 28 1 00100000,00000000 00deadbeef20 000000000000 29 1 00100000,00000000 00deadbeef10 000000000000 30 1 00100000,00000000 00deadbeef01 000000000000 31 0 00000000,00000000 000000000000 000000000000 Before rule 31 would be active. With this change it correctly sticks to just the unicast filters. 
Signed-off-by: Alexander Duyck Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250204010038.1404268-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_netdev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index 7a96b6ee773f..1db57c42333e 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -628,6 +628,8 @@ struct net_device *fbnic_netdev_alloc(struct fbnic_dev *fbd) fbnic_rss_key_fill(fbn->rss_key); fbnic_rss_init_en_mask(fbn); + netdev->priv_flags |= IFF_UNICAST_FLT; + netdev->features |= NETIF_F_RXHASH | NETIF_F_SG | From 0bdcfaf84a9428c49f971be9024505b9e3b43038 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 5 Feb 2025 09:33:52 -0800 Subject: [PATCH 53/63] tools: ynl: add all headers to makefile deps The Makefile.deps lists uAPI headers to make the build work when system headers are older than in-tree headers. The problem doesn't occur for new headers, because system headers are not there at all. But out-of-tree YNL clone on GH also uses this header to identify header dependencies, and one day the system headers will exist, and will get out of date. So let's add the headers we missed. 
I don't think this is a fix, but FWIW the commits which added the missing headers are: commit 04e65df94b31 ("netlink: spec: add shaper YAML spec") commit 49922401c219 ("ethtool: separate definitions that are gonna be generated") Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250205173352.446704-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/Makefile.deps | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 0712b5e82eb7..d027a07c1e2c 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -17,9 +17,11 @@ get_hdr_inc=-D$(1) -include $(UAPI_PATH)/linux/$(2) CFLAGS_devlink:=$(call get_hdr_inc,_LINUX_DEVLINK_H_,devlink.h) CFLAGS_dpll:=$(call get_hdr_inc,_LINUX_DPLL_H,dpll.h) CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_ETHTOOL_H,ethtool.h) \ - $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h) + $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h) \ + $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_GENERATED_H,ethtool_netlink_generated.h) CFLAGS_handshake:=$(call get_hdr_inc,_LINUX_HANDSHAKE_H,handshake.h) CFLAGS_mptcp_pm:=$(call get_hdr_inc,_LINUX_MPTCP_PM_H,mptcp_pm.h) +CFLAGS_net_shaper:=$(call get_hdr_inc,_LINUX_NET_SHAPER_H,net_shaper.h) CFLAGS_netdev:=$(call get_hdr_inc,_LINUX_NETDEV_H,netdev.h) CFLAGS_nlctrl:=$(call get_hdr_inc,__LINUX_GENERIC_NETLINK_H,genetlink.h) CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_NETLINK_H,nfsd_netlink.h) From 8d522566ae9cb3f0609ddb2a6ce3f4f39988043c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:12 -0800 Subject: [PATCH 54/63] net: page_pool: don't cast mp param to devmem page_pool_check_memory_provider() is a generic path and shouldn't assume anything about the actual type of the memory provider argument. It's fine while devmem is the only provider, but cast away the devmem specific binding types to avoid confusion. 
Reviewed-by: Jakub Kicinski Reviewed-by: Mina Almasry Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-2-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- net/core/page_pool_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 6677e0c2e256..d5e214c30c31 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -356,7 +356,7 @@ void page_pool_unlist(struct page_pool *pool) int page_pool_check_memory_provider(struct net_device *dev, struct netdev_rx_queue *rxq) { - struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv; + void *binding = rxq->mp_params.mp_priv; struct page_pool *pool; struct hlist_node *n; From 297d389e9e5bd4704b438257b08bc852281e51ce Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:13 -0800 Subject: [PATCH 55/63] net: prefix devmem specific helpers Add prefixes to all helpers that are specific to devmem TCP, i.e. net_iov_binding[_id]. 
Reviewed-by: Jakub Kicinski Reviewed-by: Mina Almasry Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-3-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- net/core/devmem.c | 2 +- net/core/devmem.h | 14 +++++++------- net/ipv4/tcp.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/net/core/devmem.c b/net/core/devmem.c index 3bba3f018df0..66cd1ab9224f 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -94,7 +94,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) void net_devmem_free_dmabuf(struct net_iov *niov) { - struct net_devmem_dmabuf_binding *binding = net_iov_binding(niov); + struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov); unsigned long dma_addr = net_devmem_get_dma_addr(niov); if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr, diff --git a/net/core/devmem.h b/net/core/devmem.h index 76099ef9c482..99782ddeca40 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -86,11 +86,16 @@ static inline unsigned int net_iov_idx(const struct net_iov *niov) } static inline struct net_devmem_dmabuf_binding * -net_iov_binding(const struct net_iov *niov) +net_devmem_iov_binding(const struct net_iov *niov) { return net_iov_owner(niov)->binding; } +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) +{ + return net_devmem_iov_binding(niov)->id; +} + static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) { struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); @@ -99,11 +104,6 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); } -static inline u32 net_iov_binding_id(const struct net_iov *niov) -{ - return net_iov_owner(niov)->binding->id; -} - static inline void net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) { @@ -171,7 +171,7 @@ static inline unsigned long 
net_iov_virtual_addr(const struct net_iov *niov) return 0; } -static inline u32 net_iov_binding_id(const struct net_iov *niov) +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) { return 0; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0d704bda6c41..b872de9a8271 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2494,7 +2494,7 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, /* Will perform the exchange later */ dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx]; - dmabuf_cmsg.dmabuf_id = net_iov_binding_id(niov); + dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov); offset += copy; remaining_len -= copy; From 7d60fa9e1ab1e4618b2342d54b2035a0e44d19c6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:14 -0800 Subject: [PATCH 56/63] net: generalise net_iov chunk owners Currently net_iov stores a pointer to struct dmabuf_genpool_chunk_owner, which serves as a useful abstraction to share data and provide a context. However, it's too devmem specific, and we want to reuse it for other memory providers, and for that we need to decouple net_iov from devmem. Make net_iov point to a new base structure called net_iov_area, which dmabuf_genpool_chunk_owner extends.
Reviewed-by: Mina Almasry Acked-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-4-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 21 ++++++++++++++++++++- net/core/devmem.c | 25 +++++++++++++------------ net/core/devmem.h | 25 +++++++++---------------- 3 files changed, 42 insertions(+), 29 deletions(-) diff --git a/include/net/netmem.h b/include/net/netmem.h index 1b58faa4f20f..c61d5b21e7b4 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -24,11 +24,20 @@ struct net_iov { unsigned long __unused_padding; unsigned long pp_magic; struct page_pool *pp; - struct dmabuf_genpool_chunk_owner *owner; + struct net_iov_area *owner; unsigned long dma_addr; atomic_long_t pp_ref_count; }; +struct net_iov_area { + /* Array of net_iovs for this area. */ + struct net_iov *niovs; + size_t num_niovs; + + /* Offset into the dma-buf where this chunk starts. */ + unsigned long base_virtual; +}; + /* These fields in struct page are used by the page_pool and net stack: * * struct { @@ -54,6 +63,16 @@ NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr); NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); #undef NET_IOV_ASSERT_OFFSET +static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov) +{ + return niov->owner; +} + +static inline unsigned int net_iov_idx(const struct net_iov *niov) +{ + return niov - net_iov_owner(niov)->niovs; +} + /* netmem */ /** diff --git a/net/core/devmem.c b/net/core/devmem.c index 66cd1ab9224f..fb0dddcb4e60 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -33,14 +33,15 @@ static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, { struct dmabuf_genpool_chunk_owner *owner = chunk->owner; - kvfree(owner->niovs); + kvfree(owner->area.niovs); kfree(owner); } static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) { - struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); + struct 
dmabuf_genpool_chunk_owner *owner; + owner = net_devmem_iov_to_chunk_owner(niov); return owner->base_dma_addr + ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } @@ -83,7 +84,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) offset = dma_addr - owner->base_dma_addr; index = offset / PAGE_SIZE; - niov = &owner->niovs[index]; + niov = &owner->area.niovs[index]; niov->pp_magic = 0; niov->pp = NULL; @@ -261,9 +262,9 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, goto err_free_chunks; } - owner->base_virtual = virtual; + owner->area.base_virtual = virtual; owner->base_dma_addr = dma_addr; - owner->num_niovs = len / PAGE_SIZE; + owner->area.num_niovs = len / PAGE_SIZE; owner->binding = binding; err = gen_pool_add_owner(binding->chunk_pool, dma_addr, @@ -275,17 +276,17 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, goto err_free_chunks; } - owner->niovs = kvmalloc_array(owner->num_niovs, - sizeof(*owner->niovs), - GFP_KERNEL); - if (!owner->niovs) { + owner->area.niovs = kvmalloc_array(owner->area.num_niovs, + sizeof(*owner->area.niovs), + GFP_KERNEL); + if (!owner->area.niovs) { err = -ENOMEM; goto err_free_chunks; } - for (i = 0; i < owner->num_niovs; i++) { - niov = &owner->niovs[i]; - niov->owner = owner; + for (i = 0; i < owner->area.num_niovs; i++) { + niov = &owner->area.niovs[i]; + niov->owner = &owner->area; page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); } diff --git a/net/core/devmem.h b/net/core/devmem.h index 99782ddeca40..a2b9913e9a17 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -10,6 +10,8 @@ #ifndef _NET_DEVMEM_H #define _NET_DEVMEM_H +#include + struct netlink_ext_ack; struct net_devmem_dmabuf_binding { @@ -51,17 +53,11 @@ struct net_devmem_dmabuf_binding { * allocations from this chunk. */ struct dmabuf_genpool_chunk_owner { - /* Offset into the dma-buf where this chunk starts. 
*/ - unsigned long base_virtual; + struct net_iov_area area; + struct net_devmem_dmabuf_binding *binding; /* dma_addr of the start of the chunk. */ dma_addr_t base_dma_addr; - - /* Array of net_iovs for this chunk. */ - struct net_iov *niovs; - size_t num_niovs; - - struct net_devmem_dmabuf_binding *binding; }; void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); @@ -75,20 +71,17 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, void dev_dmabuf_uninstall(struct net_device *dev); static inline struct dmabuf_genpool_chunk_owner * -net_iov_owner(const struct net_iov *niov) +net_devmem_iov_to_chunk_owner(const struct net_iov *niov) { - return niov->owner; -} + struct net_iov_area *owner = net_iov_owner(niov); -static inline unsigned int net_iov_idx(const struct net_iov *niov) -{ - return niov - net_iov_owner(niov)->niovs; + return container_of(owner, struct dmabuf_genpool_chunk_owner, area); } static inline struct net_devmem_dmabuf_binding * net_devmem_iov_binding(const struct net_iov *niov) { - return net_iov_owner(niov)->binding; + return net_devmem_iov_to_chunk_owner(niov)->binding; } static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) @@ -98,7 +91,7 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) { - struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); + struct net_iov_area *owner = net_iov_owner(niov); return owner->base_virtual + ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); From 57afb483015768903029c8336ee287f4b03c1235 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:15 -0800 Subject: [PATCH 57/63] net: page_pool: create hooks for custom memory providers A spin off from the original page pool memory providers patch by Jakub, which allows extending page pools with custom allocators. 
One of such providers is devmem TCP, and the other is io_uring zerocopy added in following patches. Link: https://lore.kernel.org/netdev/20230707183935.997267-7-kuba@kernel.org/ Co-developed-by: Jakub Kicinski # initial mp proposal Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-5-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 15 +++++++++++++++ include/net/page_pool/types.h | 4 ++++ net/core/devmem.c | 15 ++++++++++++++- net/core/page_pool.c | 23 +++++++++++++++-------- 4 files changed, 48 insertions(+), 9 deletions(-) create mode 100644 include/net/page_pool/memory_provider.h diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h new file mode 100644 index 000000000000..e49d0a52629d --- /dev/null +++ b/include/net/page_pool/memory_provider.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_PAGE_POOL_MEMORY_PROVIDER_H +#define _NET_PAGE_POOL_MEMORY_PROVIDER_H + +#include +#include + +struct memory_provider_ops { + netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp); + bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem); + int (*init)(struct page_pool *pool); + void (*destroy)(struct page_pool *pool); +}; + +#endif diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 7f405672b089..36eb57d73abc 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -152,8 +152,11 @@ struct page_pool_stats { */ #define PAGE_POOL_FRAG_GROUP_ALIGN (4 * sizeof(long)) +struct memory_provider_ops; + struct pp_memory_provider_params { void *mp_priv; + const struct memory_provider_ops *mp_ops; }; struct page_pool { @@ -216,6 +219,7 @@ struct page_pool { struct ptr_ring ring; void *mp_priv; + const struct memory_provider_ops *mp_ops; #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ diff --git a/net/core/devmem.c 
b/net/core/devmem.c index fb0dddcb4e60..c81625ca57c6 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "devmem.h" @@ -27,6 +28,8 @@ /* Protected by rtnl_lock() */ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); +static const struct memory_provider_ops dmabuf_devmem_ops; + static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, struct gen_pool_chunk *chunk, void *not_used) @@ -118,6 +121,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) WARN_ON(rxq->mp_params.mp_priv != binding); rxq->mp_params.mp_priv = NULL; + rxq->mp_params.mp_ops = NULL; rxq_idx = get_netdev_rx_queue_index(rxq); @@ -153,7 +157,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, } rxq = __netif_get_rx_queue(dev, rxq_idx); - if (rxq->mp_params.mp_priv) { + if (rxq->mp_params.mp_ops) { NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); return -EEXIST; } @@ -171,6 +175,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return err; rxq->mp_params.mp_priv = binding; + rxq->mp_params.mp_ops = &dmabuf_devmem_ops; err = netdev_rx_queue_restart(dev, rxq_idx); if (err) @@ -180,6 +185,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, err_xa_erase: rxq->mp_params.mp_priv = NULL; + rxq->mp_params.mp_ops = NULL; xa_erase(&binding->bound_rxqs, xa_idx); return err; @@ -399,3 +405,10 @@ bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) /* We don't want the page pool put_page()ing our net_iovs. 
*/ return false; } + +static const struct memory_provider_ops dmabuf_devmem_ops = { + .init = mp_dmabuf_devmem_init, + .destroy = mp_dmabuf_devmem_destroy, + .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, + .release_netmem = mp_dmabuf_devmem_release_page, +}; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f5e908c9e7ad..d632cf2c91c3 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -13,6 +13,7 @@ #include #include +#include #include #include @@ -285,13 +286,19 @@ static int page_pool_init(struct page_pool *pool, rxq = __netif_get_rx_queue(pool->slow.netdev, pool->slow.queue_idx); pool->mp_priv = rxq->mp_params.mp_priv; + pool->mp_ops = rxq->mp_params.mp_ops; } - if (pool->mp_priv) { + if (pool->mp_ops) { if (!pool->dma_map || !pool->dma_sync) return -EOPNOTSUPP; - err = mp_dmabuf_devmem_init(pool); + if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { + err = -EFAULT; + goto free_ptr_ring; + } + + err = pool->mp_ops->init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, err); @@ -587,8 +594,8 @@ netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) return netmem; /* Slow-path: cache empty, do real allocation */ - if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) - netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp); + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + netmem = pool->mp_ops->alloc_netmems(pool, gfp); else netmem = __page_pool_alloc_pages_slow(pool, gfp); return netmem; @@ -679,8 +686,8 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) bool put; put = true; - if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) - put = mp_dmabuf_devmem_release_page(pool, netmem); + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + put = pool->mp_ops->release_netmem(pool, netmem); else __page_pool_release_page_dma(pool, netmem); @@ -1048,8 +1055,8 @@ static void 
__page_pool_destroy(struct page_pool *pool) page_pool_unlist(pool); page_pool_uninit(pool); - if (pool->mp_priv) { - mp_dmabuf_devmem_destroy(pool); + if (pool->mp_ops) { + pool->mp_ops->destroy(pool); static_branch_dec(&page_pool_mem_providers); } From dcc0113acd3b77cca3c7e805fffd8ea4c5a675b3 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 4 Feb 2025 13:56:16 -0800 Subject: [PATCH 58/63] netdev: add io_uring memory provider info Add a nested attribute for io_uring memory provider info. For now it is empty and its presence indicates that a particular page pool or queue has an io_uring memory provider attached. $ ./cli.py --spec netlink/specs/netdev.yaml --dump page-pool-get [{'id': 80, 'ifindex': 2, 'inflight': 64, 'inflight-mem': 262144, 'napi-id': 525}, {'id': 79, 'ifindex': 2, 'inflight': 320, 'inflight-mem': 1310720, 'io_uring': {}, 'napi-id': 525}, ... $ ./cli.py --spec netlink/specs/netdev.yaml --dump queue-get [{'id': 0, 'ifindex': 1, 'type': 'rx'}, {'id': 0, 'ifindex': 1, 'type': 'tx'}, {'id': 0, 'ifindex': 2, 'napi-id': 513, 'type': 'rx'}, {'id': 1, 'ifindex': 2, 'napi-id': 514, 'type': 'rx'}, ... {'id': 12, 'ifindex': 2, 'io_uring': {}, 'napi-id': 525, 'type': 'rx'}, ... Reviewed-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-6-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 15 +++++++++++++++ include/uapi/linux/netdev.h | 7 +++++++ tools/include/uapi/linux/netdev.h | 7 +++++++ 3 files changed, 29 insertions(+) diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index cbb544bd6c84..288923e965ae 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -114,6 +114,9 @@ attribute-sets: doc: Bitmask of enabled AF_XDP features. 
type: u64 enum: xsk-flags + - + name: io-uring-provider-info + attributes: [] - name: page-pool attributes: @@ -171,6 +174,11 @@ attribute-sets: name: dmabuf doc: ID of the dmabuf this page-pool is attached to. type: u32 + - + name: io-uring + doc: io-uring memory provider information. + type: nest + nested-attributes: io-uring-provider-info - name: page-pool-info subset-of: page-pool @@ -296,6 +304,11 @@ attribute-sets: name: dmabuf doc: ID of the dmabuf attached to this queue, if any. type: u32 + - + name: io-uring + doc: io_uring memory provider information. + type: nest + nested-attributes: io-uring-provider-info - name: qstats @@ -572,6 +585,7 @@ operations: - inflight-mem - detach-time - dmabuf + - io-uring dump: reply: *pp-reply config-cond: page-pool @@ -637,6 +651,7 @@ operations: - napi-id - ifindex - dmabuf + - io-uring dump: request: attributes: diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e4be227d3ad6..6c6ee183802d 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -86,6 +86,11 @@ enum { NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) }; +enum { + __NETDEV_A_IO_URING_PROVIDER_INFO_MAX, + NETDEV_A_IO_URING_PROVIDER_INFO_MAX = (__NETDEV_A_IO_URING_PROVIDER_INFO_MAX - 1) +}; + enum { NETDEV_A_PAGE_POOL_ID = 1, NETDEV_A_PAGE_POOL_IFINDEX, @@ -94,6 +99,7 @@ enum { NETDEV_A_PAGE_POOL_INFLIGHT_MEM, NETDEV_A_PAGE_POOL_DETACH_TIME, NETDEV_A_PAGE_POOL_DMABUF, + NETDEV_A_PAGE_POOL_IO_URING, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) @@ -136,6 +142,7 @@ enum { NETDEV_A_QUEUE_TYPE, NETDEV_A_QUEUE_NAPI_ID, NETDEV_A_QUEUE_DMABUF, + NETDEV_A_QUEUE_IO_URING, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index e4be227d3ad6..6c6ee183802d 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -86,6 +86,11 @@ enum { NETDEV_A_DEV_MAX = 
(__NETDEV_A_DEV_MAX - 1) }; +enum { + __NETDEV_A_IO_URING_PROVIDER_INFO_MAX, + NETDEV_A_IO_URING_PROVIDER_INFO_MAX = (__NETDEV_A_IO_URING_PROVIDER_INFO_MAX - 1) +}; + enum { NETDEV_A_PAGE_POOL_ID = 1, NETDEV_A_PAGE_POOL_IFINDEX, @@ -94,6 +99,7 @@ enum { NETDEV_A_PAGE_POOL_INFLIGHT_MEM, NETDEV_A_PAGE_POOL_DETACH_TIME, NETDEV_A_PAGE_POOL_DMABUF, + NETDEV_A_PAGE_POOL_IO_URING, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) @@ -136,6 +142,7 @@ enum { NETDEV_A_QUEUE_TYPE, NETDEV_A_QUEUE_NAPI_ID, NETDEV_A_QUEUE_DMABUF, + NETDEV_A_QUEUE_IO_URING, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) From 2508a46f920a3130e35ab2183a70fc93f0aaaee4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:17 -0800 Subject: [PATCH 59/63] net: page_pool: add callback for mp info printing Add a mandatory callback that prints information about the memory provider to netlink. Reviewed-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-7-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 5 +++++ net/core/devmem.c | 10 ++++++++++ net/core/netdev-genl.c | 11 ++++++----- net/core/page_pool_user.c | 5 ++--- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index e49d0a52629d..6d10a0959d00 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -5,11 +5,16 @@ #include #include +struct netdev_rx_queue; +struct sk_buff; + struct memory_provider_ops { netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp); bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem); int (*init)(struct page_pool *pool); void (*destroy)(struct page_pool *pool); + int (*nl_fill)(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq); }; #endif diff --git 
a/net/core/devmem.c b/net/core/devmem.c index c81625ca57c6..63b790326c7d 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -406,9 +406,19 @@ bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) return false; } +static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq) +{ + const struct net_devmem_dmabuf_binding *binding = mp_priv; + int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF; + + return nla_put_u32(rsp, type, binding->id); +} + static const struct memory_provider_ops dmabuf_devmem_ops = { .init = mp_dmabuf_devmem_init, .destroy = mp_dmabuf_devmem_destroy, .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, .release_netmem = mp_dmabuf_devmem_release_page, + .nl_fill = mp_dmabuf_devmem_nl_fill, }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 715f85c6b62e..5b459b4fef46 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "dev.h" #include "devmem.h" @@ -368,7 +369,7 @@ static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { - struct net_devmem_dmabuf_binding *binding; + struct pp_memory_provider_params *params; struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; @@ -385,15 +386,15 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); + if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, rxq->napi->napi_id)) goto nla_put_failure; - binding = rxq->mp_params.mp_priv; - if (binding && - nla_put_u32(rsp, NETDEV_A_QUEUE_DMABUF, binding->id)) + params = &rxq->mp_params; + if (params->mp_ops && + params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) goto nla_put_failure; - break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); diff --git 
a/net/core/page_pool_user.c b/net/core/page_pool_user.c index d5e214c30c31..9d8a3d8597fa 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -8,9 +8,9 @@ #include #include #include +#include #include -#include "devmem.h" #include "page_pool_priv.h" #include "netdev-genl-gen.h" @@ -216,7 +216,6 @@ static int page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { - struct net_devmem_dmabuf_binding *binding = pool->mp_priv; size_t inflight, refsz; unsigned int napi_id; void *hdr; @@ -249,7 +248,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, pool->user.detach_time)) goto err_cancel; - if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) + if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL)) goto err_cancel; genlmsg_end(rsp, hdr); From f8350a4358fc9c4801f2f4229cf9b6e678055d9a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:18 -0800 Subject: [PATCH 60/63] net: page_pool: add a mp hook to unregister_netdevice* Devmem TCP needs a hook in unregister_netdevice_many_notify() to upkeep the set tracking queues it's bound to, i.e. ->bound_rxqs. Instead of devmem sticking directly out of the generic path, add a mp function.
Reviewed-by: Jakub Kicinski Reviewed-by: Mina Almasry Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-8-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 1 + net/core/dev.c | 16 ++++++++++- net/core/devmem.c | 36 +++++++++++-------------- net/core/devmem.h | 5 ---- 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index 6d10a0959d00..36469a7e649f 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -15,6 +15,7 @@ struct memory_provider_ops { void (*destroy)(struct page_pool *pool); int (*nl_fill)(void *mp_priv, struct sk_buff *rsp, struct netdev_rx_queue *rxq); + void (*uninstall)(void *mp_priv, struct netdev_rx_queue *rxq); }; #endif diff --git a/net/core/dev.c b/net/core/dev.c index c0021cbd28fc..384b9109932a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -159,6 +159,7 @@ #include #include #include +#include #include #include @@ -11724,6 +11725,19 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_queue); +static void dev_memory_provider_uninstall(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct netdev_rx_queue *rxq = &dev->_rx[i]; + struct pp_memory_provider_params *p = &rxq->mp_params; + + if (p->mp_ops && p->mp_ops->uninstall) + p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); + } +} + void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { @@ -11778,7 +11792,7 @@ void unregister_netdevice_many_notify(struct list_head *head, dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); - dev_dmabuf_uninstall(dev); + dev_memory_provider_uninstall(dev); netdev_offload_xstats_disable_all(dev); diff --git 
a/net/core/devmem.c b/net/core/devmem.c index 63b790326c7d..cbac6419fcc4 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -320,26 +320,6 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, return ERR_PTR(err); } -void dev_dmabuf_uninstall(struct net_device *dev) -{ - struct net_devmem_dmabuf_binding *binding; - struct netdev_rx_queue *rxq; - unsigned long xa_idx; - unsigned int i; - - for (i = 0; i < dev->real_num_rx_queues; i++) { - binding = dev->_rx[i].mp_params.mp_priv; - if (!binding) - continue; - - xa_for_each(&binding->bound_rxqs, xa_idx, rxq) - if (rxq == &dev->_rx[i]) { - xa_erase(&binding->bound_rxqs, xa_idx); - break; - } - } -} - /*** "Dmabuf devmem memory provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) @@ -415,10 +395,26 @@ static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp, return nla_put_u32(rsp, type, binding->id); } +static void mp_dmabuf_devmem_uninstall(void *mp_priv, + struct netdev_rx_queue *rxq) +{ + struct net_devmem_dmabuf_binding *binding = mp_priv; + struct netdev_rx_queue *bound_rxq; + unsigned long xa_idx; + + xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) { + if (bound_rxq == rxq) { + xa_erase(&binding->bound_rxqs, xa_idx); + break; + } + } +} + static const struct memory_provider_ops dmabuf_devmem_ops = { .init = mp_dmabuf_devmem_init, .destroy = mp_dmabuf_devmem_destroy, .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, .release_netmem = mp_dmabuf_devmem_release_page, .nl_fill = mp_dmabuf_devmem_nl_fill, + .uninstall = mp_dmabuf_devmem_uninstall, }; diff --git a/net/core/devmem.h b/net/core/devmem.h index a2b9913e9a17..8e999fe2ae67 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -68,7 +68,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); -void dev_dmabuf_uninstall(struct 
net_device *dev); static inline struct dmabuf_genpool_chunk_owner * net_devmem_iov_to_chunk_owner(const struct net_iov *niov) @@ -145,10 +144,6 @@ net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return -EOPNOTSUPP; } -static inline void dev_dmabuf_uninstall(struct net_device *dev) -{ -} - static inline struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) { From 69e39537b66232103633f685b208f293bf9a15b5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:19 -0800 Subject: [PATCH 61/63] net: prepare for non devmem TCP memory providers There is a good bunch of places in generic paths assuming that the only page pool memory provider is devmem TCP. As we want to reuse the net_iov and provider infrastructure, we need to patch it up and explicitly check the provider type when we branch into devmem TCP code. Reviewed-by: Mina Almasry Reviewed-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-9-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- net/core/devmem.c | 5 +++++ net/core/devmem.h | 7 +++++++ net/ipv4/tcp.c | 5 +++++ 3 files changed, 17 insertions(+) diff --git a/net/core/devmem.c b/net/core/devmem.c index cbac6419fcc4..7c6e0b5b6acb 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -30,6 +30,11 @@ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); static const struct memory_provider_ops dmabuf_devmem_ops; +bool net_is_devmem_iov(struct net_iov *niov) +{ + return niov->pp->mp_ops == &dmabuf_devmem_ops; +} + static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, struct gen_pool_chunk *chunk, void *not_used) diff --git a/net/core/devmem.h b/net/core/devmem.h index 8e999fe2ae67..7fc158d52729 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -115,6 +115,8 @@ struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void 
net_devmem_free_dmabuf(struct net_iov *ppiov); +bool net_is_devmem_iov(struct net_iov *niov); + #else struct net_devmem_dmabuf_binding; @@ -163,6 +165,11 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) { return 0; } + +static inline bool net_is_devmem_iov(struct net_iov *niov) +{ + return false; +} #endif #endif /* _NET_DEVMEM_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b872de9a8271..7f43d31c9400 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2476,6 +2476,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, } niov = skb_frag_net_iov(frag); + if (!net_is_devmem_iov(niov)) { + err = -ENODEV; + goto out; + } + end = start + skb_frag_size(frag); copy = end - offset; From 56102c013fa7b8dbba8c5d5f7e042ad5f18cf4ec Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:20 -0800 Subject: [PATCH 62/63] net: page_pool: add memory provider helpers Add helpers for memory providers to interact with page pools. net_mp_niov_{set,clear}_page_pool() serve to [dis]associate a net_iov with a page pool. If used, the memory provider is responsible to match "set" calls with "clear" once a net_iov is not going to be used by a page pool anymore, changing a page pool, etc. 
Acked-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-10-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 19 +++++++++++++++++ net/core/page_pool.c | 28 +++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index 36469a7e649f..4f0ffb8f6a0a 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -18,4 +18,23 @@ struct memory_provider_ops { void (*uninstall)(void *mp_priv, struct netdev_rx_queue *rxq); }; +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); +void net_mp_niov_clear_page_pool(struct net_iov *niov); + +/** + * net_mp_netmem_place_in_cache() - give a netmem to a page pool + * @pool: the page pool to place the netmem into + * @netmem: netmem to give + * + * Push an accounted netmem into the page pool's allocation cache. The caller + * must ensure that there is space in the cache. It should only be called off + * the mp_ops->alloc_netmems() path. + */ +static inline void net_mp_netmem_place_in_cache(struct page_pool *pool, + netmem_ref netmem) +{ + pool->alloc.cache[pool->alloc.count++] = netmem; +} + #endif diff --git a/net/core/page_pool.c b/net/core/page_pool.c index d632cf2c91c3..686bd4a117d9 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -1197,3 +1197,31 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); + +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) +{ + return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); +} + +/* Associate a niov with a page pool. 
Should follow with a matching + * net_mp_niov_clear_page_pool() + */ +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); +} + +/* Disassociate a niov from a page pool. Should only be used in the + * ->release_netmem() path. + */ +void net_mp_niov_clear_page_pool(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_clear_pp_info(netmem); +} From 6e18ed929d3ba9b3b92ba5894f9233686b3e3ec1 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 4 Feb 2025 13:56:21 -0800 Subject: [PATCH 63/63] net: add helpers for setting a memory provider on an rx queue Add helpers that properly prep or remove a memory provider for an rx queue then restart the queue. Reviewed-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-11-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 5 ++ net/core/netdev_rx_queue.c | 69 +++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index 4f0ffb8f6a0a..b3e665897767 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -22,6 +22,11 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); void net_mp_niov_clear_page_pool(struct net_iov *niov); +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p); +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p); + /** * net_mp_netmem_place_in_cache() - give a netmem to a page pool * @pool: the page pool to place 
the netmem into diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index db82786fa0c4..db46880f37cc 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "page_pool_priv.h" @@ -80,3 +81,71 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) return err; } EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); + +static int __net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p) +{ + struct netdev_rx_queue *rxq; + int ret; + + if (ifq_idx >= dev->real_num_rx_queues) + return -EINVAL; + ifq_idx = array_index_nospec(ifq_idx, dev->real_num_rx_queues); + + rxq = __netif_get_rx_queue(dev, ifq_idx); + if (rxq->mp_params.mp_ops) + return -EEXIST; + + rxq->mp_params = *p; + ret = netdev_rx_queue_restart(dev, ifq_idx); + if (ret) { + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + } + return ret; +} + +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p) +{ + int ret; + + rtnl_lock(); + ret = __net_mp_open_rxq(dev, ifq_idx, p); + rtnl_unlock(); + return ret; +} + +static void __net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + struct netdev_rx_queue *rxq; + + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) + return; + + rxq = __netif_get_rx_queue(dev, ifq_idx); + + /* Callers holding a netdev ref may get here after we already + * went thru shutdown via dev_memory_provider_uninstall(). 
+ */ + if (dev->reg_state > NETREG_REGISTERED && + !rxq->mp_params.mp_ops) + return; + + if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || + rxq->mp_params.mp_priv != old_p->mp_priv)) + return; + + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + WARN_ON(netdev_rx_queue_restart(dev, ifq_idx)); +} + +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + rtnl_lock(); + __net_mp_close_rxq(dev, ifq_idx, old_p); + rtnl_unlock(); +}