diff --git a/Documentation/networking/device_drivers/ethernet/index.rst b/Documentation/networking/device_drivers/ethernet/index.rst index 5196905582c5b..392969ac88ad8 100644 --- a/Documentation/networking/device_drivers/ethernet/index.rst +++ b/Documentation/networking/device_drivers/ethernet/index.rst @@ -39,7 +39,7 @@ Contents: intel/ice marvell/octeontx2 marvell/octeon_ep - mellanox/mlx5 + mellanox/mlx5/index microsoft/netvsc neterion/s2io netronome/nfp diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst deleted file mode 100644 index 6969652f593c9..0000000000000 --- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst +++ /dev/null @@ -1,746 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB - -================================================= -Mellanox ConnectX(R) mlx5 core VPI Network Driver -================================================= - -Copyright (c) 2019, Mellanox Technologies LTD. - -Contents -======== - -- `Enabling the driver and kconfig options`_ -- `Devlink info`_ -- `Devlink parameters`_ -- `Bridge offload`_ -- `mlx5 subfunction`_ -- `mlx5 function attributes`_ -- `Devlink health reporters`_ -- `mlx5 tracepoints`_ - -Enabling the driver and kconfig options -======================================= - -| mlx5 core is modular and most of the major mlx5 core driver features can be selected (compiled in/out) -| at build time via kernel Kconfig flags. -| Basic features, ethernet net device rx/tx offloads and XDP, are available with the most basic flags -| CONFIG_MLX5_CORE=y/m and CONFIG_MLX5_CORE_EN=y. -| For the list of advanced features, please see below. - -**CONFIG_MLX5_CORE=(y/m/n)** (module mlx5_core.ko) - -| The driver can be enabled by choosing CONFIG_MLX5_CORE=y/m in kernel config. -| This will provide mlx5 core driver for mlx5 ulps to interface with (mlx5e, mlx5_ib). 
- - -**CONFIG_MLX5_CORE_EN=(y/n)** - -| Choosing this option will allow basic ethernet netdevice support with all of the standard rx/tx offloads. -| mlx5e is the mlx5 ulp driver which provides netdevice kernel interface, when chosen, mlx5e will be -| built-in into mlx5_core.ko. - - -**CONFIG_MLX5_EN_ARFS=(y/n)** - -| Enables Hardware-accelerated receive flow steering (arfs) support, and ntuple filtering. -| https://community.mellanox.com/s/article/howto-configure-arfs-on-connectx-4 - - -**CONFIG_MLX5_EN_RXNFC=(y/n)** - -| Enables ethtool receive network flow classification, which allows user defined -| flow rules to direct traffic into arbitrary rx queue via ethtool set/get_rxnfc API. - - -**CONFIG_MLX5_CORE_EN_DCB=(y/n)**: - -| Enables `Data Center Bridging (DCB) Support `_. - - -**CONFIG_MLX5_MPFS=(y/n)** - -| Ethernet Multi-Physical Function Switch (MPFS) support in ConnectX NIC. -| MPFs is required for when `Multi-Host `_ configuration is enabled to allow passing -| user configured unicast MAC addresses to the requesting PF. - - -**CONFIG_MLX5_ESWITCH=(y/n)** - -| Ethernet SRIOV E-Switch support in ConnectX NIC. E-Switch provides internal SRIOV packet steering -| and switching for the enabled VFs and PF in two available modes: -| 1) `Legacy SRIOV mode (L2 mac vlan steering based) `_. -| 2) `Switchdev mode (eswitch offloads) `_. - - -**CONFIG_MLX5_CORE_IPOIB=(y/n)** - -| IPoIB offloads & acceleration support. -| Requires CONFIG_MLX5_CORE_EN to provide an accelerated interface for the rdma -| IPoIB ulp netdevice. - - -**CONFIG_MLX5_FPGA=(y/n)** - -| Build support for the Innova family of network cards by Mellanox Technologies. -| Innova network cards are comprised of a ConnectX chip and an FPGA chip on one board. -| If you select this option, the mlx5_core driver will include the Innova FPGA core and allow -| building sandbox-specific client drivers. - - -**CONFIG_MLX5_EN_IPSEC=(y/n)** - -| Enables `IPSec XFRM cryptography-offload acceleration `_. 
- -**CONFIG_MLX5_EN_TLS=(y/n)** - -| TLS cryptography-offload acceleration. - - -**CONFIG_MLX5_INFINIBAND=(y/n/m)** (module mlx5_ib.ko) - -| Provides low-level InfiniBand/RDMA and `RoCE `_ support. - -**CONFIG_MLX5_SF=(y/n)** - -| Build support for subfunction. -| Subfunctons are more light weight than PCI SRIOV VFs. Choosing this option -| will enable support for creating subfunction devices. - -**External options** ( Choose if the corresponding mlx5 feature is required ) - -- CONFIG_PTP_1588_CLOCK: When chosen, mlx5 ptp support will be enabled -- CONFIG_VXLAN: When chosen, mlx5 vxlan support will be enabled. -- CONFIG_MLXFW: When chosen, mlx5 firmware flashing support will be enabled (via devlink and ethtool). - -Devlink info -============ - -The devlink info reports the running and stored firmware versions on device. -It also prints the device PSID which represents the HCA board type ID. - -User command example:: - - $ devlink dev info pci/0000:00:06.0 - pci/0000:00:06.0: - driver mlx5_core - versions: - fixed: - fw.psid MT_0000000009 - running: - fw.version 16.26.0100 - stored: - fw.version 16.26.0100 - -Devlink parameters -================== - -flow_steering_mode: Device flow steering mode ---------------------------------------------- -The flow steering mode parameter controls the flow steering mode of the driver. -Two modes are supported: -1. 'dmfs' - Device managed flow steering. -2. 'smfs' - Software/Driver managed flow steering. - -In DMFS mode, the HW steering entities are created and managed through the -Firmware. -In SMFS mode, the HW steering entities are created and managed though by -the driver directly into hardware without firmware intervention. - -SMFS mode is faster and provides better rule insertion rate compared to default DMFS mode. 
- -User command examples: - -- Set SMFS flow steering mode:: - - $ devlink dev param set pci/0000:06:00.0 name flow_steering_mode value "smfs" cmode runtime - -- Read device flow steering mode:: - - $ devlink dev param show pci/0000:06:00.0 name flow_steering_mode - pci/0000:06:00.0: - name flow_steering_mode type driver-specific - values: - cmode runtime value smfs - -enable_roce: RoCE enablement state ----------------------------------- -RoCE enablement state controls driver support for RoCE traffic. -When RoCE is disabled, there is no gid table, only raw ethernet QPs are supported and traffic on the well-known UDP RoCE port is handled as raw ethernet traffic. - -To change RoCE enablement state, a user must change the driverinit cmode value and run devlink reload. - -User command examples: - -- Disable RoCE:: - - $ devlink dev param set pci/0000:06:00.0 name enable_roce value false cmode driverinit - $ devlink dev reload pci/0000:06:00.0 - -- Read RoCE enablement state:: - - $ devlink dev param show pci/0000:06:00.0 name enable_roce - pci/0000:06:00.0: - name enable_roce type generic - values: - cmode driverinit value true - -esw_port_metadata: Eswitch port metadata state ----------------------------------------------- -When applicable, disabling eswitch metadata can increase packet rate -up to 20% depending on the use case and packet sizes. - -Eswitch port metadata state controls whether to internally tag packets with -metadata. Metadata tagging must be enabled for multi-port RoCE, failover -between representors and stacked devices. -By default metadata is enabled on the supported devices in E-switch. -Metadata is applicable only for E-switch in switchdev mode and -users may disable it when NONE of the below use cases will be in use: -1. HCA is in Dual/multi-port RoCE mode. -2. VF/SF representor bonding (Usually used for Live migration) -3. Stacked devices - -When metadata is disabled, the above use cases will fail to initialize if -users try to enable them. 
- -- Show eswitch port metadata:: - - $ devlink dev param show pci/0000:06:00.0 name esw_port_metadata - pci/0000:06:00.0: - name esw_port_metadata type driver-specific - values: - cmode runtime value true - -- Disable eswitch port metadata:: - - $ devlink dev param set pci/0000:06:00.0 name esw_port_metadata value false cmode runtime - -- Change eswitch mode to switchdev mode where after choosing the metadata value:: - - $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev - -Bridge offload -============== -The mlx5 driver implements support for offloading bridge rules when in switchdev -mode. Linux bridge FDBs are automatically offloaded when mlx5 switchdev -representor is attached to bridge. - -- Change device to switchdev mode:: - - $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev - -- Attach mlx5 switchdev representor 'enp8s0f0' to bridge netdev 'bridge1':: - - $ ip link set enp8s0f0 master bridge1 - -VLANs ------ -Following bridge VLAN functions are supported by mlx5: - -- VLAN filtering (including multiple VLANs per port):: - - $ ip link set bridge1 type bridge vlan_filtering 1 - $ bridge vlan add dev enp8s0f0 vid 2-3 - -- VLAN push on bridge ingress:: - - $ bridge vlan add dev enp8s0f0 vid 3 pvid - -- VLAN pop on bridge egress:: - - $ bridge vlan add dev enp8s0f0 vid 3 untagged - -mlx5 subfunction -================ -mlx5 supports subfunction management using devlink port (see :ref:`Documentation/networking/devlink/devlink-port.rst `) interface. - -A subfunction has its own function capabilities and its own resources. This -means a subfunction has its own dedicated queues (txq, rxq, cq, eq). These -queues are neither shared nor stolen from the parent PCI function. - -When a subfunction is RDMA capable, it has its own QP1, GID table, and RDMA -resources neither shared nor stolen from the parent PCI function. - -A subfunction has a dedicated window in PCI BAR space that is not shared -with the other subfunctions or the parent PCI function. 
This ensures that all -devices (netdev, rdma, vdpa, etc.) of the subfunction accesses only assigned -PCI BAR space. - -A subfunction supports eswitch representation through which it supports tc -offloads. The user configures eswitch to send/receive packets from/to -the subfunction port. - -Subfunctions share PCI level resources such as PCI MSI-X IRQs with -other subfunctions and/or with its parent PCI function. - -Example mlx5 software, system, and device view:: - - _______ - | admin | - | user |---------- - |_______| | - | | - ____|____ __|______ _________________ - | | | | | | - | devlink | | tc tool | | user | - | tool | |_________| | applications | - |_________| | |_________________| - | | | | - | | | | Userspace - +---------|-------------|-------------------|----------|--------------------+ - | | +----------+ +----------+ Kernel - | | | netdev | | rdma dev | - | | +----------+ +----------+ - (devlink port add/del | ^ ^ - port function set) | | | - | | +---------------| - _____|___ | | _______|_______ - | | | | | mlx5 class | - | devlink | +------------+ | | drivers | - | kernel | | rep netdev | | |(mlx5_core,ib) | - |_________| +------------+ | |_______________| - | | | ^ - (devlink ops) | | (probe/remove) - _________|________ | | ____|________ - | subfunction | | +---------------+ | subfunction | - | management driver|----- | subfunction |---| driver | - | (mlx5_core) | | auxiliary dev | | (mlx5_core) | - |__________________| +---------------+ |_____________| - | ^ - (sf add/del, vhca events) | - | (device add/del) - _____|____ ____|________ - | | | subfunction | - | PCI NIC |--- activate/deactivate events--->| host driver | - |__________| | (mlx5_core) | - |_____________| - -Subfunction is created using devlink port interface. 
- -- Change device to switchdev mode:: - - $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev - -- Add a devlink port of subfunction flavour:: - - $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 - pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false - function: - hw_addr 00:00:00:00:00:00 state inactive opstate detached - -- Show a devlink port of the subfunction:: - - $ devlink port show pci/0000:06:00.0/32768 - pci/0000:06:00.0/32768: type eth netdev enp6s0pf0sf88 flavour pcisf pfnum 0 sfnum 88 - function: - hw_addr 00:00:00:00:00:00 state inactive opstate detached - -- Delete a devlink port of subfunction after use:: - - $ devlink port del pci/0000:06:00.0/32768 - -mlx5 function attributes -======================== -The mlx5 driver provides a mechanism to setup PCI VF/SF function attributes in -a unified way for SmartNIC and non-SmartNIC. - -This is supported only when the eswitch mode is set to switchdev. Port function -configuration of the PCI VF/SF is supported through devlink eswitch port. - -Port function attributes should be set before PCI VF/SF is enumerated by the -driver. - -MAC address setup ------------------ -mlx5 driver support devlink port function attr mechanism to setup MAC -address. (refer to Documentation/networking/devlink/devlink-port.rst) - -RoCE capability setup ---------------------- -Not all mlx5 PCI devices/SFs require RoCE capability. - -When RoCE capability is disabled, it saves 1 Mbytes worth of system memory per -PCI devices/SF. - -mlx5 driver support devlink port function attr mechanism to setup RoCE -capability. (refer to Documentation/networking/devlink/devlink-port.rst) - -migratable capability setup ---------------------------- -User who wants mlx5 PCI VFs to be able to perform live migration need to -explicitly enable the VF migratable capability. 
- -mlx5 driver support devlink port function attr mechanism to setup migratable -capability. (refer to Documentation/networking/devlink/devlink-port.rst) - -SF state setup --------------- -To use the SF, the user must activate the SF using the SF function state -attribute. - -- Get the state of the SF identified by its unique devlink port index:: - - $ devlink port show ens2f0npf0sf88 - pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false - function: - hw_addr 00:00:00:00:88:88 state inactive opstate detached - -- Activate the function and verify its state is active:: - - $ devlink port function set ens2f0npf0sf88 state active - - $ devlink port show ens2f0npf0sf88 - pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false - function: - hw_addr 00:00:00:00:88:88 state active opstate detached - -Upon function activation, the PF driver instance gets the event from the device -that a particular SF was activated. It's the cue to put the device on bus, probe -it and instantiate the devlink instance and class specific auxiliary devices -for it. 
- -- Show the auxiliary device and port of the subfunction:: - - $ devlink dev show - devlink dev show auxiliary/mlx5_core.sf.4 - - $ devlink port show auxiliary/mlx5_core.sf.4/1 - auxiliary/mlx5_core.sf.4/1: type eth netdev p0sf88 flavour virtual port 0 splittable false - - $ rdma link show mlx5_0/1 - link mlx5_0/1 state ACTIVE physical_state LINK_UP netdev p0sf88 - - $ rdma dev show - 8: rocep6s0f1: node_type ca fw 16.29.0550 node_guid 248a:0703:00b3:d113 sys_image_guid 248a:0703:00b3:d112 - 13: mlx5_0: node_type ca fw 16.29.0550 node_guid 0000:00ff:fe00:8888 sys_image_guid 248a:0703:00b3:d112 - -- Subfunction auxiliary device and class device hierarchy:: - - mlx5_core.sf.4 - (subfunction auxiliary device) - /\ - / \ - / \ - / \ - / \ - mlx5_core.eth.4 mlx5_core.rdma.4 - (sf eth aux dev) (sf rdma aux dev) - | | - | | - p0sf88 mlx5_0 - (sf netdev) (sf rdma device) - -Additionally, the SF port also gets the event when the driver attaches to the -auxiliary device of the subfunction. This results in changing the operational -state of the function. This provides visibility to the user to decide when is it -safe to delete the SF port for graceful termination of the subfunction. - -- Show the SF port operational state:: - - $ devlink port show ens2f0npf0sf88 - pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false - function: - hw_addr 00:00:00:00:88:88 state active opstate attached - -Devlink health reporters -======================== - -tx reporter ------------ -The tx reporter is responsible for reporting and recovering of the following two error scenarios: - -- tx timeout - Report on kernel tx timeout detection. - Recover by searching lost interrupts. -- tx error completion - Report on error tx completion. - Recover by flushing the tx queue and reset it. - -tx reporter also support on demand diagnose callback, on which it provides -real time information of its send queues status. 
- -User commands examples: - -- Diagnose send queues status:: - - $ devlink health diagnose pci/0000:82:00.0 reporter tx - -NOTE: This command has valid output only when interface is up, otherwise the command has empty output. - -- Show number of tx errors indicated, number of recover flows ended successfully, - is autorecover enabled and graceful period from last recover:: - - $ devlink health show pci/0000:82:00.0 reporter tx - -rx reporter ------------ -The rx reporter is responsible for reporting and recovering of the following two error scenarios: - -- rx queues' initialization (population) timeout - Population of rx queues' descriptors on ring initialization is done - in napi context via triggering an irq. In case of a failure to get - the minimum amount of descriptors, a timeout would occur, and - descriptors could be recovered by polling the EQ (Event Queue). -- rx completions with errors (reported by HW on interrupt context) - Report on rx completion error. - Recover (if needed) by flushing the related queue and reset it. - -rx reporter also supports on demand diagnose callback, on which it -provides real time information of its receive queues' status. - -- Diagnose rx queues' status and corresponding completion queue:: - - $ devlink health diagnose pci/0000:82:00.0 reporter rx - -NOTE: This command has valid output only when interface is up. Otherwise, the command has empty output. - -- Show number of rx errors indicated, number of recover flows ended successfully, - is autorecover enabled, and graceful period from last recover:: - - $ devlink health show pci/0000:82:00.0 reporter rx - -fw reporter ------------ -The fw reporter implements `diagnose` and `dump` callbacks. -It follows symptoms of fw error such as fw syndrome by triggering -fw core dump and storing it into the dump buffer. -The fw reporter diagnose command can be triggered any time by the user to check -current fw status. 
- -User commands examples: - -- Check fw heath status:: - - $ devlink health diagnose pci/0000:82:00.0 reporter fw - -- Read FW core dump if already stored or trigger new one:: - - $ devlink health dump show pci/0000:82:00.0 reporter fw - -NOTE: This command can run only on the PF which has fw tracer ownership, -running it on other PF or any VF will return "Operation not permitted". - -fw fatal reporter ------------------ -The fw fatal reporter implements `dump` and `recover` callbacks. -It follows fatal errors indications by CR-space dump and recover flow. -The CR-space dump uses vsc interface which is valid even if the FW command -interface is not functional, which is the case in most FW fatal errors. -The recover function runs recover flow which reloads the driver and triggers fw -reset if needed. -On firmware error, the health buffer is dumped into the dmesg. The log -level is derived from the error's severity (given in health buffer). - -User commands examples: - -- Run fw recover flow manually:: - - $ devlink health recover pci/0000:82:00.0 reporter fw_fatal - -- Read FW CR-space dump if already stored or trigger new one:: - - $ devlink health dump show pci/0000:82:00.1 reporter fw_fatal - -NOTE: This command can run only on PF. - -mlx5 tracepoints -================ - -mlx5 driver provides internal tracepoints for tracking and debugging using -kernel tracepoints interfaces (refer to Documentation/trace/ftrace.rst). - -For the list of support mlx5 events, check `/sys/kernel/debug/tracing/events/mlx5/`. - -tc and eswitch offloads tracepoints: - -- mlx5e_configure_flower: trace flower filter actions and cookies offloaded to mlx5:: - - $ echo mlx5:mlx5e_configure_flower >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... 
- tc-6535 [019] ...1 2672.404466: mlx5e_configure_flower: cookie=0000000067874a55 actions= REDIRECT - -- mlx5e_delete_flower: trace flower filter actions and cookies deleted from mlx5:: - - $ echo mlx5:mlx5e_delete_flower >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - tc-6569 [010] .N.1 2686.379075: mlx5e_delete_flower: cookie=0000000067874a55 actions= NULL - -- mlx5e_stats_flower: trace flower stats request:: - - $ echo mlx5:mlx5e_stats_flower >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - tc-6546 [010] ...1 2679.704889: mlx5e_stats_flower: cookie=0000000060eb3d6a bytes=0 packets=0 lastused=4295560217 - -- mlx5e_tc_update_neigh_used_value: trace tunnel rule neigh update value offloaded to mlx5:: - - $ echo mlx5:mlx5e_tc_update_neigh_used_value >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - kworker/u48:4-8806 [009] ...1 55117.882428: mlx5e_tc_update_neigh_used_value: netdev: ens1f0 IPv4: 1.1.1.10 IPv6: ::ffff:1.1.1.10 neigh_used=1 - -- mlx5e_rep_neigh_update: trace neigh update tasks scheduled due to neigh state change events:: - - $ echo mlx5:mlx5e_rep_neigh_update >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - kworker/u48:7-2221 [009] ...1 1475.387435: mlx5e_rep_neigh_update: netdev: ens1f0 MAC: 24:8a:07:9a:17:9a IPv4: 1.1.1.10 IPv6: ::ffff:1.1.1.10 neigh_connected=1 - -Bridge offloads tracepoints: - -- mlx5_esw_bridge_fdb_entry_init: trace bridge FDB entry offloaded to mlx5:: - - $ echo mlx5:mlx5_esw_bridge_fdb_entry_init >> set_event - $ cat /sys/kernel/debug/tracing/trace - ... 
- kworker/u20:9-2217 [003] ...1 318.582243: mlx5_esw_bridge_fdb_entry_init: net_device=enp8s0f0_0 addr=e4:fd:05:08:00:02 vid=0 flags=0 used=0 - -- mlx5_esw_bridge_fdb_entry_cleanup: trace bridge FDB entry deleted from mlx5:: - - $ echo mlx5:mlx5_esw_bridge_fdb_entry_cleanup >> set_event - $ cat /sys/kernel/debug/tracing/trace - ... - ip-2581 [005] ...1 318.629871: mlx5_esw_bridge_fdb_entry_cleanup: net_device=enp8s0f0_1 addr=e4:fd:05:08:00:03 vid=0 flags=0 used=16 - -- mlx5_esw_bridge_fdb_entry_refresh: trace bridge FDB entry offload refreshed in - mlx5:: - - $ echo mlx5:mlx5_esw_bridge_fdb_entry_refresh >> set_event - $ cat /sys/kernel/debug/tracing/trace - ... - kworker/u20:8-3849 [003] ...1 466716: mlx5_esw_bridge_fdb_entry_refresh: net_device=enp8s0f0_0 addr=e4:fd:05:08:00:02 vid=3 flags=0 used=0 - -- mlx5_esw_bridge_vlan_create: trace bridge VLAN object add on mlx5 - representor:: - - $ echo mlx5:mlx5_esw_bridge_vlan_create >> set_event - $ cat /sys/kernel/debug/tracing/trace - ... - ip-2560 [007] ...1 318.460258: mlx5_esw_bridge_vlan_create: vid=1 flags=6 - -- mlx5_esw_bridge_vlan_cleanup: trace bridge VLAN object delete from mlx5 - representor:: - - $ echo mlx5:mlx5_esw_bridge_vlan_cleanup >> set_event - $ cat /sys/kernel/debug/tracing/trace - ... - bridge-2582 [007] ...1 318.653496: mlx5_esw_bridge_vlan_cleanup: vid=2 flags=8 - -- mlx5_esw_bridge_vport_init: trace mlx5 vport assigned with bridge upper - device:: - - $ echo mlx5:mlx5_esw_bridge_vport_init >> set_event - $ cat /sys/kernel/debug/tracing/trace - ... - ip-2560 [007] ...1 318.458915: mlx5_esw_bridge_vport_init: vport_num=1 - -- mlx5_esw_bridge_vport_cleanup: trace mlx5 vport removed from bridge upper - device:: - - $ echo mlx5:mlx5_esw_bridge_vport_cleanup >> set_event - $ cat /sys/kernel/debug/tracing/trace - ... 
- ip-5387 [000] ...1 573713: mlx5_esw_bridge_vport_cleanup: vport_num=1 - -Eswitch QoS tracepoints: - -- mlx5_esw_vport_qos_create: trace creation of transmit scheduler arbiter for vport:: - - $ echo mlx5:mlx5_esw_vport_qos_create >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - <...>-23496 [018] .... 73136.838831: mlx5_esw_vport_qos_create: (0000:82:00.0) vport=2 tsar_ix=4 bw_share=0, max_rate=0 group=000000007b576bb3 - -- mlx5_esw_vport_qos_config: trace configuration of transmit scheduler arbiter for vport:: - - $ echo mlx5:mlx5_esw_vport_qos_config >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - <...>-26548 [023] .... 75754.223823: mlx5_esw_vport_qos_config: (0000:82:00.0) vport=1 tsar_ix=3 bw_share=34, max_rate=10000 group=000000007b576bb3 - -- mlx5_esw_vport_qos_destroy: trace deletion of transmit scheduler arbiter for vport:: - - $ echo mlx5:mlx5_esw_vport_qos_destroy >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - <...>-27418 [004] .... 76546.680901: mlx5_esw_vport_qos_destroy: (0000:82:00.0) vport=1 tsar_ix=3 - -- mlx5_esw_group_qos_create: trace creation of transmit scheduler arbiter for rate group:: - - $ echo mlx5:mlx5_esw_group_qos_create >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - <...>-26578 [008] .... 75776.022112: mlx5_esw_group_qos_create: (0000:82:00.0) group=000000008dac63ea tsar_ix=5 - -- mlx5_esw_group_qos_config: trace configuration of transmit scheduler arbiter for rate group:: - - $ echo mlx5:mlx5_esw_group_qos_config >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - <...>-27303 [020] .... 
76461.455356: mlx5_esw_group_qos_config: (0000:82:00.0) group=000000008dac63ea tsar_ix=5 bw_share=100 max_rate=20000 - -- mlx5_esw_group_qos_destroy: trace deletion of transmit scheduler arbiter for group:: - - $ echo mlx5:mlx5_esw_group_qos_destroy >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - <...>-27418 [006] .... 76547.187258: mlx5_esw_group_qos_destroy: (0000:82:00.0) group=000000007b576bb3 tsar_ix=1 - -SF tracepoints: - -- mlx5_sf_add: trace addition of the SF port:: - - $ echo mlx5:mlx5_sf_add >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - devlink-9363 [031] ..... 24610.188722: mlx5_sf_add: (0000:06:00.0) port_index=32768 controller=0 hw_id=0x8000 sfnum=88 - -- mlx5_sf_free: trace freeing of the SF port:: - - $ echo mlx5:mlx5_sf_free >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - devlink-9830 [038] ..... 26300.404749: mlx5_sf_free: (0000:06:00.0) port_index=32768 controller=0 hw_id=0x8000 - -- mlx5_sf_hwc_alloc: trace allocating of the hardware SF context:: - - $ echo mlx5:mlx5_sf_hwc_alloc >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - devlink-9775 [031] ..... 26296.385259: mlx5_sf_hwc_alloc: (0000:06:00.0) controller=0 hw_id=0x8000 sfnum=88 - -- mlx5_sf_hwc_free: trace freeing of the hardware SF context:: - - $ echo mlx5:mlx5_sf_hwc_free >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - kworker/u128:3-9093 [046] ..... 24625.365771: mlx5_sf_hwc_free: (0000:06:00.0) hw_id=0x8000 - -- mlx5_sf_hwc_deferred_free : trace deferred freeing of the hardware SF context:: - - $ echo mlx5:mlx5_sf_hwc_deferred_free >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - devlink-9519 [046] ..... 
24624.400271: mlx5_sf_hwc_deferred_free: (0000:06:00.0) hw_id=0x8000 - -- mlx5_sf_vhca_event: trace SF vhca event and state:: - - $ echo mlx5:mlx5_sf_vhca_event >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - kworker/u128:3-9093 [046] ..... 24625.365525: mlx5_sf_vhca_event: (0000:06:00.0) hw_id=0x8000 sfnum=88 vhca_state=1 - -- mlx5_sf_dev_add : trace SF device add event:: - - $ echo mlx5:mlx5_sf_dev_add>> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - kworker/u128:3-9093 [000] ..... 24616.524495: mlx5_sf_dev_add: (0000:06:00.0) sfdev=00000000fc5d96fd aux_id=4 hw_id=0x8000 sfnum=88 - -- mlx5_sf_dev_del : trace SF device delete event:: - - $ echo mlx5:mlx5_sf_dev_del >> /sys/kernel/debug/tracing/set_event - $ cat /sys/kernel/debug/tracing/trace - ... - kworker/u128:3-9093 [044] ..... 24624.400749: mlx5_sf_dev_del: (0000:06:00.0) sfdev=00000000fc5d96fd aux_id=4 hw_id=0x8000 sfnum=88 diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst new file mode 100644 index 0000000000000..4cd8e869762bf --- /dev/null +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst @@ -0,0 +1,1302 @@ +.. SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +.. include:: <isonum.txt> + +================ +Ethtool counters +================ + +:Copyright: |copy| 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +Contents +======== + +- `Overview`_ +- `Groups`_ +- `Types`_ +- `Descriptions`_ + +Overview +======== + +There are several counter groups based on where the counter is being counted. In +addition, each group of counters may have different counter types. 
+ +These counter groups are based on which component in a networking setup, +illustrated below, that they describe:: + + ---------------------------------------- + | | + ---------------------------------------- ---------------------------------------- | + | Hypervisor | | VM | | + | | | | | + | ------------------- --------------- | | ------------------- --------------- | | + | | Ethernet driver | | RDMA driver | | | | Ethernet driver | | RDMA driver | | | + | ------------------- --------------- | | ------------------- --------------- | | + | | | | | | | | | + | ------------------- | | ------------------- | | + | | | | | |-- + ---------------------------------------- ---------------------------------------- + | | + ------------- ----------------------------- + | | + ------ ------ ------ ------ ------ ------ ------ + -----| PF |----------------------| VF |-| VF |-| VF |----- --| PF |--- --| PF |--- --| PF |--- + | ------ ------ ------ ------ | | ------ | | ------ | | ------ | + | | | | | | | | + | | | | | | | | + | | | | | | | | + | eSwitch | | eSwitch | | eSwitch | | eSwitch | + ---------------------------------------------------------- ----------- ----------- ----------- + ------------------------------------------------------------------------------- + | | + | | + | Uplink (no counters) | + ------------------------------------------------------------------------------- + --------------------------------------------------------------- + | | + | | + | MPFS (no counters) | + --------------------------------------------------------------- + | + | + | Port + +Groups +====== + +Ring + Software counters populated by the driver stack. + +Netdev + An aggregation of software ring counters. + +vPort counters + Traffic counters and drops due to steering or no buffers. May indicate issues + with NIC. These counters include Ethernet traffic counters (including Raw + Ethernet) and RDMA/RoCE traffic counters. 
+ +Physical port counters + Counters that collect statistics about the PFs and VFs. May indicate issues + with NIC, link, or network. This measuring point holds information on + standardized counters like IEEE 802.3, RFC 2863, RFC 2819, RFC 3635 and + additional counters like flow control, FEC and more. Physical port counters + are not exposed to virtual machines. + +Priority Port Counters + A set of the physical port counters, per priority per port. + +Types +===== + +Counters are divided into three types. + +Traffic Informative Counters + Counters which count traffic. These counters can be used for load estimation + or for general debug. + +Traffic Acceleration Counters + Counters which count traffic that was accelerated by the Mellanox driver or by + hardware. The counters are an additional layer to the informative counter set, + and the same traffic is counted in both informative and acceleration counters. + +.. [#accel] Traffic acceleration counter. + +Error Counters + Increment of these counters might indicate a problem. Each of these counters + has an explanation and corrective action. + +Statistics can be fetched via the `ip link` or `ethtool` commands. `ethtool` +provides more detailed information:: + + ip -s link show <if-name> + ethtool -S <if-name> + +Descriptions +============ + +XSK, PTP, and QoS counters that are similar to counters defined previously will +not be separately listed. For example, `ptp_tx[i]_packets` will not be +explicitly documented since `tx[i]_packets` describes the behavior of both +counters, except `ptp_tx[i]_packets` is only counted when precision time +protocol is used. + +Ring / Netdev Counter +---------------------------- +The following counters are available per ring or software port. + +These counters provide information on the amount of traffic that was accelerated +by the NIC. The counters are counting the accelerated traffic in addition to the +standard counters which count it (i.e. accelerated traffic is counted twice). 
+ +The counter names in the table below refer to both ring and port counters. The +notation for ring counters includes the [i] index without the brackets. The +notation for port counters doesn't include the [i]. A counter name +`rx[i]_packets` will be printed as `rx0_packets` for ring 0 and `rx_packets` for +the software port. + +.. flat-table:: Ring / Software Port Counter Table + :widths: 2 3 1 + + * - Counter + - Description + - Type + + * - `rx[i]_packets` + - The number of packets received on ring i. + - Informative + + * - `rx[i]_bytes` + - The number of bytes received on ring i. + - Informative + + * - `tx[i]_packets` + - The number of packets transmitted on ring i. + - Informative + + * - `tx[i]_bytes` + - The number of bytes transmitted on ring i. + - Informative + + * - `tx[i]_recover` + - The number of times the SQ was recovered. + - Error + + * - `tx[i]_cqes` + - Number of CQE events on SQ issued on ring i. + - Informative + + * - `tx[i]_cqe_err` + - The number of error CQEs encountered on the SQ for ring i. + - Error + + * - `tx[i]_tso_packets` + - The number of TSO packets transmitted on ring i [#accel]_. + - Acceleration + + * - `tx[i]_tso_bytes` + - The number of TSO bytes transmitted on ring i [#accel]_. + - Acceleration + + * - `tx[i]_tso_inner_packets` + - The number of TSO packets which are indicated to carry internal + encapsulation transmitted on ring i [#accel]_. + - Acceleration + + * - `tx[i]_tso_inner_bytes` + - The number of TSO bytes which are indicated to carry internal + encapsulation transmitted on ring i [#accel]_. + - Acceleration + + * - `rx[i]_gro_packets` + - Number of received packets processed using hardware-accelerated GRO. The + number of hardware GRO offloaded packets received on ring i. + - Acceleration + + * - `rx[i]_gro_bytes` + - Number of received bytes processed using hardware-accelerated GRO. The + number of hardware GRO offloaded bytes received on ring i.
+ - Acceleration + + * - `rx[i]_gro_skbs` + - The number of receive SKBs constructed while performing + hardware-accelerated GRO. + - Informative + + * - `rx[i]_gro_match_packets` + - Number of received packets processed using hardware-accelerated GRO that + met the flow table match criteria. + - Informative + + * - `rx[i]_gro_large_hds` + - Number of receive packets using hardware-accelerated GRO that have large + headers that require additional memory to be allocated. + - Informative + + * - `rx[i]_lro_packets` + - The number of LRO packets received on ring i [#accel]_. + - Acceleration + + * - `rx[i]_lro_bytes` + - The number of LRO bytes received on ring i [#accel]_. + - Acceleration + + * - `rx[i]_ecn_mark` + - The number of received packets where the ECN mark was turned on. + - Informative + + * - `rx_oversize_pkts_buffer` + - The number of dropped received packets due to length which arrived to RQ + and exceed software buffer size allocated by the device for incoming + traffic. It might imply that the device MTU is larger than the software + buffers size. + - Error + + * - `rx_oversize_pkts_sw_drop` + - Number of received packets dropped in software because the CQE data is + larger than the MTU size. + - Error + + * - `rx[i]_csum_unnecessary` + - Packets received with a `CHECKSUM_UNNECESSARY` on ring i [#accel]_. + - Acceleration + + * - `rx[i]_csum_unnecessary_inner` + - Packets received with inner encapsulation with a `CHECKSUM_UNNECESSARY` + on ring i [#accel]_. + - Acceleration + + * - `rx[i]_csum_none` + - Packets received with a `CHECKSUM_NONE` on ring i [#accel]_. + - Acceleration + + * - `rx[i]_csum_complete` + - Packets received with a `CHECKSUM_COMPLETE` on ring i [#accel]_. + - Acceleration + + * - `rx[i]_csum_complete_tail` + - Number of received packets that had checksum calculation computed, + potentially needed padding, and were able to do so with + `CHECKSUM_PARTIAL`. 
+ - Informative + + * - `rx[i]_csum_complete_tail_slow` + - Number of received packets that need padding larger than eight bytes for + the checksum. + - Informative + + * - `tx[i]_csum_partial` + - Packets transmitted with a `CHECKSUM_PARTIAL` on ring i [#accel]_. + - Acceleration + + * - `tx[i]_csum_partial_inner` + - Packets transmitted with inner encapsulation with a `CHECKSUM_PARTIAL` on + ring i [#accel]_. + - Acceleration + + * - `tx[i]_csum_none` + - Packets transmitted with no hardware checksum acceleration on ring i. + - Informative + + * - `tx[i]_stopped` / `tx_queue_stopped` [#ring_global]_ + - Events where SQ was full on ring i. If this counter is increased, check + the amount of buffers allocated for transmission. + - Informative + + * - `tx[i]_wake` / `tx_queue_wake` [#ring_global]_ + - Events where SQ was full and has become not full on ring i. + - Informative + + * - `tx[i]_dropped` / `tx_queue_dropped` [#ring_global]_ + - Packets transmitted that were dropped due to DMA mapping failure on + ring i. If this counter is increased, check the amount of buffers + allocated for transmission. + - Error + + * - `tx[i]_nop` + - The number of nop WQEs (empty WQEs) inserted into the SQ (related to + ring i) because the end of the cyclic buffer was reached. When nearing + the end of the cyclic buffer, the driver may add those empty WQEs to + avoid handling a state where a WQE starts at the end of the queue and ends + at the beginning of the queue. This is a normal condition. + - Informative + + * - `tx[i]_added_vlan_packets` + - The number of packets sent where vlan tag insertion was offloaded to the + hardware. + - Acceleration + + * - `rx[i]_removed_vlan_packets` + - The number of packets received where vlan tag stripping was offloaded to + the hardware. + - Acceleration + + * - `rx[i]_wqe_err` + - The number of wrong opcodes received on ring i.
+ - Error + + * - `rx[i]_mpwqe_frag` + - The number of WQEs that failed to allocate a compound page and hence + fragmented MPWQEs (Multi Packet WQEs) were used on ring i. If this + counter rises, it may suggest that there is not enough memory for large + pages, so the driver allocated fragmented pages. This is not an abnormal + condition. + - Informative + + * - `rx[i]_mpwqe_filler_cqes` + - The number of filler CQE events that were issued on ring i. + - Informative + + * - `rx[i]_mpwqe_filler_strides` + - The number of strides consumed by filler CQEs on ring i. + - Informative + + * - `tx[i]_mpwqe_blks` + - The number of send blocks processed from Multi-Packet WQEs (mpwqe). + - Informative + + * - `tx[i]_mpwqe_pkts` + - The number of send packets processed from Multi-Packet WQEs (mpwqe). + - Informative + + * - `rx[i]_cqe_compress_blks` + - The number of receive blocks with CQE compression on ring i [#accel]_. + - Acceleration + + * - `rx[i]_cqe_compress_pkts` + - The number of receive packets with CQE compression on ring i [#accel]_. + - Acceleration + + * - `rx[i]_cache_reuse` + - The number of events of successful reuse of a page from a driver's + internal page cache. + - Acceleration + + * - `rx[i]_cache_full` + - The number of events of full internal page cache where driver can't put a + page back to the cache for recycling (page will be freed). + - Acceleration + + * - `rx[i]_cache_empty` + - The number of events where cache was empty - no page to give. Driver + shall allocate new page. + - Acceleration + + * - `rx[i]_cache_busy` + - The number of events where cache head was busy and cannot be recycled. + Driver allocated new page. + - Acceleration + + * - `rx[i]_cache_waive` + - The number of cache evacuations. This can occur due to page move to + another NUMA node or page was pfmemalloc-ed and should be freed as soon + as possible. + - Acceleration + + * - `rx[i]_arfs_err` + - Number of flow rules that failed to be added to the flow table.
+ - Error + + * - `rx[i]_recover` + - The number of times the RQ was recovered. + - Error + + * - `tx[i]_xmit_more` + - The number of packets sent with `xmit_more` indication set on the skbuff + (no doorbell). + - Acceleration + + * - `ch[i]_poll` + - The number of invocations of NAPI poll of channel i. + - Informative + + * - `ch[i]_arm` + - The number of times the NAPI poll function completed and armed the + completion queues on channel i. + - Informative + + * - `ch[i]_aff_change` + - The number of times the NAPI poll function explicitly stopped execution + on a CPU due to a change in affinity, on channel i. + - Informative + + * - `ch[i]_events` + - The number of hard interrupt events on the completion queues of channel i. + - Informative + + * - `ch[i]_eq_rearm` + - The number of times the EQ was recovered. + - Error + + * - `ch[i]_force_irq` + - Number of times NAPI is triggered by XSK wakeups by posting a NOP to + ICOSQ. + - Acceleration + + * - `rx[i]_congst_umr` + - The number of times an outstanding UMR request is delayed due to + congestion, on ring i. + - Informative + + * - `rx_pp_alloc_fast` + - Number of successful fast path allocations. + - Informative + + * - `rx_pp_alloc_slow` + - Number of slow path order-0 allocations. + - Informative + + * - `rx_pp_alloc_slow_high_order` + - Number of slow path high order allocations. + - Informative + + * - `rx_pp_alloc_empty` + - Counter is incremented when ptr ring is empty, so a slow path allocation + was forced. + - Informative + + * - `rx_pp_alloc_refill` + - Counter is incremented when an allocation which triggered a refill of the + cache. + - Informative + + * - `rx_pp_alloc_waive` + - Counter is incremented when pages obtained from the ptr ring that cannot + be added to the cache due to a NUMA mismatch. + - Informative + + * - `rx_pp_recycle_cached` + - Counter is incremented when recycling placed page in the page pool cache. 
+ - Informative + + * - `rx_pp_recycle_cache_full` + - Counter is incremented when page pool cache was full. + - Informative + + * - `rx_pp_recycle_ring` + - Counter is incremented when a page is placed into the ptr ring. + - Informative + + * - `rx_pp_recycle_ring_full` + - Counter is incremented when a page is released from page pool because the + ptr ring was full. + - Informative + + * - `rx_pp_recycle_released_ref` + - Counter is incremented when a page is released (and not recycled) because + refcnt > 1. + - Informative + + * - `rx[i]_xsk_buff_alloc_err` + - The number of times allocating an skb or XSK buffer failed in the XSK RQ + context. + - Error + + * - `rx[i]_xsk_arfs_err` + - aRFS (accelerated Receive Flow Steering) does not occur in the XSK RQ + context, so this counter should never increment. + - Error + + * - `rx[i]_xdp_tx_xmit` + - The number of packets forwarded back to the port due to XDP program + `XDP_TX` action (bouncing). These packets are not counted by other + software counters. These packets are counted by physical port and vPort + counters. + - Informative + + * - `rx[i]_xdp_tx_mpwqe` + - Number of multi-packet WQEs transmitted by the netdev and `XDP_TX`-ed by + the netdev during the RQ context. + - Acceleration + + * - `rx[i]_xdp_tx_inlnw` + - Number of WQE data segments transmitted where the data could be inlined + in the WQE and then `XDP_TX`-ed during the RQ context. + - Acceleration + + * - `rx[i]_xdp_tx_nops` + - Number of NOP WQEBBs (WQE building blocks) posted to the XDP SQ. + - Acceleration + + * - `rx[i]_xdp_tx_full` + - The number of packets that should have been forwarded back to the port + due to `XDP_TX` action but were dropped due to full tx queue. These packets + are not counted by other software counters. These packets are counted by + physical port and vPort counters. You may open more rx queues and spread + traffic rx over all queues and/or increase rx ring size.
+ - Error + + * - `rx[i]_xdp_tx_err` + - The number of times an `XDP_TX` error such as frame too long and frame + too short occurred on `XDP_TX` ring of RX ring. + - Error + + * - `rx[i]_xdp_tx_cqes` / `rx_xdp_tx_cqe` [#ring_global]_ + - The number of completions received on the CQ of the `XDP_TX` ring. + - Informative + + * - `rx[i]_xdp_drop` + - The number of packets dropped due to XDP program `XDP_DROP` action. These + packets are not counted by other software counters. These packets are + counted by physical port and vPort counters. + - Informative + + * - `rx[i]_xdp_redirect` + - The number of times an XDP redirect action was triggered on ring i. + - Acceleration + + * - `tx[i]_xdp_xmit` + - The number of packets redirected to the interface (due to XDP redirect). + These packets are not counted by other software counters. These packets + are counted by physical port and vPort counters. + - Informative + + * - `tx[i]_xdp_full` + - The number of packets redirected to the interface (due to XDP redirect), + but were dropped due to full tx queue. These packets are not counted by + other software counters. You may enlarge tx queues. + - Informative + + * - `tx[i]_xdp_mpwqe` + - Number of multi-packet WQEs offloaded onto the NIC that were + `XDP_REDIRECT`-ed from other netdevs. + - Acceleration + + * - `tx[i]_xdp_inlnw` + - Number of WQE data segments where the data could be inlined in the WQE + where the data segments were `XDP_REDIRECT`-ed from other netdevs. + - Acceleration + + * - `tx[i]_xdp_nops` + - Number of NOP WQEBBs (WQE building blocks) posted to the SQ that were + `XDP_REDIRECT`-ed from other netdevs. + - Acceleration + + * - `tx[i]_xdp_err` + - The number of packets redirected to the interface (due to XDP redirect) + but were dropped due to error such as frame too long and frame too short. + - Error + + * - `tx[i]_xdp_cqes` + - The number of completions received for packets redirected to the + interface (due to XDP redirect) on the CQ.
+ - Informative + + * - `tx[i]_xsk_xmit` + - The number of packets transmitted using XSK zerocopy functionality. + - Acceleration + + * - `tx[i]_xsk_mpwqe` + - Number of multi-packet WQEs offloaded onto the NIC that were + `XDP_REDIRECT`-ed from other netdevs. + - Acceleration + + * - `tx[i]_xsk_inlnw` + - Number of WQE data segments where the data could be inlined in the WQE + that are transmitted using XSK zerocopy. + - Acceleration + + * - `tx[i]_xsk_full` + - Number of times doorbell is rung in XSK zerocopy mode when SQ is full. + - Error + + * - `tx[i]_xsk_err` + - Number of errors that occurred in XSK zerocopy mode such as if the data + size is larger than the MTU size. + - Error + + * - `tx[i]_xsk_cqes` + - Number of CQEs processed in XSK zerocopy mode. + - Acceleration + + * - `tx_tls_ctx` + - Number of TLS TX HW offload contexts added to device for encryption. + - Acceleration + + * - `tx_tls_del` + - Number of TLS TX HW offload contexts removed from device (connection + closed). + - Acceleration + + * - `tx_tls_pool_alloc` + - Number of times a unit of work is successfully allocated in the TLS HW + offload pool. + - Acceleration + + * - `tx_tls_pool_free` + - Number of times a unit of work is freed in the TLS HW offload pool. + - Acceleration + + * - `rx_tls_ctx` + - Number of TLS RX HW offload contexts added to device for decryption. + - Acceleration + + * - `rx_tls_del` + - Number of TLS RX HW offload contexts deleted from device (connection has + finished). + - Acceleration + + * - `rx[i]_tls_decrypted_packets` + - Number of successfully decrypted RX packets which were part of a TLS + stream. + - Acceleration + + * - `rx[i]_tls_decrypted_bytes` + - Number of TLS payload bytes in RX packets which were successfully + decrypted. + - Acceleration + + * - `rx[i]_tls_resync_req_pkt` + - Number of received TLS packets with a resync request. + - Acceleration + + * - `rx[i]_tls_resync_req_start` + - Number of times the TLS async resync request was started. 
+ - Acceleration + + * - `rx[i]_tls_resync_req_end` + - Number of times the TLS async resync request properly ended with + providing the HW tracked tcp-seq. + - Acceleration + + * - `rx[i]_tls_resync_req_skip` + - Number of times the TLS async resync request procedure was started but + not properly ended. + - Error + + * - `rx[i]_tls_resync_res_ok` + - Number of times the TLS resync response call to the driver was + successfully handled. + - Acceleration + + * - `rx[i]_tls_resync_res_retry` + - Number of times the TLS resync response call to the driver was + reattempted when ICOSQ is full. + - Error + + * - `rx[i]_tls_resync_res_skip` + - Number of times the TLS resync response call to the driver was terminated + unsuccessfully. + - Error + + * - `rx[i]_tls_err` + - Number of times when CQE TLS offload was problematic. + - Error + + * - `tx[i]_tls_encrypted_packets` + - The number of send packets that are TLS encrypted by the kernel. + - Acceleration + + * - `tx[i]_tls_encrypted_bytes` + - The number of send bytes that are TLS encrypted by the kernel. + - Acceleration + + * - `tx[i]_tls_ooo` + - Number of times out of order TLS SQE fragments were handled on ring i. + - Acceleration + + * - `tx[i]_tls_dump_packets` + - Number of TLS decrypted packets copied over from NIC over DMA. + - Acceleration + + * - `tx[i]_tls_dump_bytes` + - Number of TLS decrypted bytes copied over from NIC over DMA. + - Acceleration + + * - `tx[i]_tls_resync_bytes` + - Number of TLS bytes requested to be resynchronized in order to be + decrypted. + - Acceleration + + * - `tx[i]_tls_skip_no_sync_data` + - Number of TLS send data that can safely be skipped / do not need to be + decrypted. + - Acceleration + + * - `tx[i]_tls_drop_no_sync_data` + - Number of TLS send data that were dropped due to retransmission of TLS + data. 
+ - Acceleration + + * - `ptp_cq[i]_abort` + - Number of times a CQE has to be skipped in precision time protocol due to + a skew between the port timestamp and CQE timestamp being greater than + 128 seconds. + - Error + + * - `ptp_cq[i]_abort_abs_diff_ns` + - Accumulation of time differences between the port timestamp and CQE + timestamp when the difference is greater than 128 seconds in precision + time protocol. + - Error + +.. [#ring_global] The corresponding ring and global counters do not share the + same name (i.e. do not follow the common naming scheme). + +vPort Counters +-------------- +Counters on the NIC port that is connected to a eSwitch. + +.. flat-table:: vPort Counter Table + :widths: 2 3 1 + + * - Counter + - Description + - Type + + * - `rx_vport_unicast_packets` + - Unicast packets received, steered to a port including Raw Ethernet + QP/DPDK traffic, excluding RDMA traffic. + - Informative + + * - `rx_vport_unicast_bytes` + - Unicast bytes received, steered to a port including Raw Ethernet QP/DPDK + traffic, excluding RDMA traffic. + - Informative + + * - `tx_vport_unicast_packets` + - Unicast packets transmitted, steered from a port including Raw Ethernet + QP/DPDK traffic, excluding RDMA traffic. + - Informative + + * - `tx_vport_unicast_bytes` + - Unicast bytes transmitted, steered from a port including Raw Ethernet + QP/DPDK traffic, excluding RDMA traffic. + - Informative + + * - `rx_vport_multicast_packets` + - Multicast packets received, steered to a port including Raw Ethernet + QP/DPDK traffic, excluding RDMA traffic. + - Informative + + * - `rx_vport_multicast_bytes` + - Multicast bytes received, steered to a port including Raw Ethernet + QP/DPDK traffic, excluding RDMA traffic. + - Informative + + * - `tx_vport_multicast_packets` + - Multicast packets transmitted, steered from a port including Raw Ethernet + QP/DPDK traffic, excluding RDMA traffic. 
+ - Informative + + * - `tx_vport_multicast_bytes` + - Multicast bytes transmitted, steered from a port including Raw Ethernet + QP/DPDK traffic, excluding RDMA traffic. + - Informative + + * - `rx_vport_broadcast_packets` + - Broadcast packets received, steered to a port including Raw Ethernet + QP/DPDK traffic, excluding RDMA traffic. + - Informative + + * - `rx_vport_broadcast_bytes` + - Broadcast bytes received, steered to a port including Raw Ethernet + QP/DPDK traffic, excluding RDMA traffic. + - Informative + + * - `tx_vport_broadcast_packets` + - Broadcast packets transmitted, steered from a port including Raw Ethernet + QP/DPDK traffic, excluding RDMA traffic. + - Informative + + * - `tx_vport_broadcast_bytes` + - Broadcast bytes transmitted, steered from a port including Raw Ethernet + QP/DPDK traffic, excluding RDMA traffic. + - Informative + + * - `rx_vport_rdma_unicast_packets` + - RDMA unicast packets received, steered to a port (counters counts + RoCE/UD/RC traffic) [#accel]_. + - Acceleration + + * - `rx_vport_rdma_unicast_bytes` + - RDMA unicast bytes received, steered to a port (counters counts + RoCE/UD/RC traffic) [#accel]_. + - Acceleration + + * - `tx_vport_rdma_unicast_packets` + - RDMA unicast packets transmitted, steered from a port (counters counts + RoCE/UD/RC traffic) [#accel]_. + - Acceleration + + * - `tx_vport_rdma_unicast_bytes` + - RDMA unicast bytes transmitted, steered from a port (counters counts + RoCE/UD/RC traffic) [#accel]_. + - Acceleration + + * - `rx_vport_rdma_multicast_packets` + - RDMA multicast packets received, steered to a port (counters counts + RoCE/UD/RC traffic) [#accel]_. + - Acceleration + + * - `rx_vport_rdma_multicast_bytes` + - RDMA multicast bytes received, steered to a port (counters counts + RoCE/UD/RC traffic) [#accel]_. + - Acceleration + + * - `tx_vport_rdma_multicast_packets` + - RDMA multicast packets transmitted, steered from a port (counters counts + RoCE/UD/RC traffic) [#accel]_. 
+ - Acceleration + + * - `tx_vport_rdma_multicast_bytes` + - RDMA multicast bytes transmitted, steered from a port (counter counts + RoCE/UD/RC traffic) [#accel]_. + - Acceleration + + * - `rx_steer_missed_packets` + - Number of packets that were received by the NIC but were discarded + because they did not match any flow in the NIC flow table. + - Error + + * - `rx_packets` + - Representor only: packets received, that were handled by the hypervisor. + - Informative + + * - `rx_bytes` + - Representor only: bytes received, that were handled by the hypervisor. + - Informative + + * - `tx_packets` + - Representor only: packets transmitted, that were handled by the + hypervisor. + - Informative + + * - `tx_bytes` + - Representor only: bytes transmitted, that were handled by the hypervisor. + - Informative + + * - `dev_internal_queue_oob` + - The number of dropped packets due to lack of receive WQEs for an internal + device RQ. + - Error + +Physical Port Counters +---------------------- +The physical port counters are the counters on the external port connecting the +adapter to the network. This measuring point holds information on standardized +counters like IEEE 802.3, RFC 2863, RFC 2819, RFC 3635 and additional counters +like flow control, FEC and more. + +.. flat-table:: Physical Port Counter Table + :widths: 2 3 1 + + * - Counter + - Description + - Type + + * - `rx_packets_phy` + - The number of packets received on the physical port. This counter doesn’t + include packets that were discarded due to FCS, frame size and similar + errors. + - Informative + + * - `tx_packets_phy` + - The number of packets transmitted on the physical port. + - Informative + + * - `rx_bytes_phy` + - The number of bytes received on the physical port, including Ethernet + header and FCS. + - Informative + + * - `tx_bytes_phy` + - The number of bytes transmitted on the physical port.
+ - Informative + + * - `rx_multicast_phy` + - The number of multicast packets received on the physical port. + - Informative + + * - `tx_multicast_phy` + - The number of multicast packets transmitted on the physical port. + - Informative + + * - `rx_broadcast_phy` + - The number of broadcast packets received on the physical port. + - Informative + + * - `tx_broadcast_phy` + - The number of broadcast packets transmitted on the physical port. + - Informative + + * - `rx_crc_errors_phy` + - The number of dropped received packets due to FCS (Frame Check Sequence) + error on the physical port. If this counter is increasing at a high rate, + check the link quality using the `rx_symbol_err_phy` and + `rx_corrected_bits_phy` counters below. + - Error + + * - `rx_in_range_len_errors_phy` + - The number of received packets dropped due to length/type errors on a + physical port. + - Error + + * - `rx_out_of_range_len_phy` + - The number of received packets dropped due to length greater than allowed + on a physical port. If this counter is increasing, it implies that the + peer connected to the adapter has a larger MTU configured. Using the same + MTU configuration shall resolve this issue. + - Error + + * - `rx_oversize_pkts_phy` + - The number of dropped received packets due to length which exceeds MTU + size on a physical port. If this counter is increasing, it implies that + the peer connected to the adapter has a larger MTU configured. Using the + same MTU configuration shall resolve this issue. + - Error + + * - `rx_symbol_err_phy` + - The number of received packets dropped due to physical coding errors + (symbol errors) on a physical port. + - Error + + * - `rx_mac_control_phy` + - The number of MAC control packets received on the physical port. + - Informative + + * - `tx_mac_control_phy` + - The number of MAC control packets transmitted on the physical port. + - Informative + + * - `rx_pause_ctrl_phy` + - The number of link layer pause packets received on a physical port.
 If + this counter is increasing, it implies that the network is congested and + cannot absorb the traffic coming from the adapter. + - Informative + + * - `tx_pause_ctrl_phy` + - The number of link layer pause packets transmitted on a physical port. If + this counter is increasing, it implies that the NIC is congested and + cannot absorb the traffic coming from the network. + - Informative + + * - `rx_unsupported_op_phy` + - The number of MAC control packets received with unsupported opcode on a + physical port. + - Error + + * - `rx_discards_phy` + - The number of received packets dropped due to lack of buffers on a + physical port. If this counter is increasing, it implies that the adapter + is congested and cannot absorb the traffic coming from the network. + - Error + + * - `tx_discards_phy` + - The number of packets which were discarded on transmission, even though + no errors were detected. The drop might occur due to link in down state, + head of line drop, pause from the network, etc. + - Error + + * - `tx_errors_phy` + - The number of transmitted packets dropped due to a length which exceeds + MTU size on a physical port. + - Error + + * - `rx_undersize_pkts_phy` + - The number of received packets dropped due to length which is shorter + than 64 bytes on a physical port. If this counter is increasing, it + implies that the peer connected to the adapter has a non-standard MTU + configured or a malformed packet had arrived. + - Error + + * - `rx_fragments_phy` + - The number of received packets dropped due to a length which is shorter + than 64 bytes and has FCS error on a physical port. If this counter is + increasing, it implies that the peer connected to the adapter has a + non-standard MTU configured. + - Error + + * - `rx_jabbers_phy` + - The number of received packets dropped due to a length which is longer + than 64 bytes and had FCS error on a physical port.
+ - Error + + * - `rx_64_bytes_phy` + - The number of packets received on the physical port with size of 64 bytes. + - Informative + + * - `rx_65_to_127_bytes_phy` + - The number of packets received on the physical port with size of 65 to + 127 bytes. + - Informative + + * - `rx_128_to_255_bytes_phy` + - The number of packets received on the physical port with size of 128 to + 255 bytes. + - Informative + + * - `rx_256_to_511_bytes_phy` + - The number of packets received on the physical port with size of 256 to + 511 bytes. + - Informative + + * - `rx_512_to_1023_bytes_phy` + - The number of packets received on the physical port with size of 512 to + 1023 bytes. + - Informative + + * - `rx_1024_to_1518_bytes_phy` + - The number of packets received on the physical port with size of 1024 to + 1518 bytes. + - Informative + + * - `rx_1519_to_2047_bytes_phy` + - The number of packets received on the physical port with size of 1519 to + 2047 bytes. + - Informative + + * - `rx_2048_to_4095_bytes_phy` + - The number of packets received on the physical port with size of 2048 to + 4095 bytes. + - Informative + + * - `rx_4096_to_8191_bytes_phy` + - The number of packets received on the physical port with size of 4096 to + 8191 bytes. + - Informative + + * - `rx_8192_to_10239_bytes_phy` + - The number of packets received on the physical port with size of 8192 to + 10239 bytes. + - Informative + + * - `link_down_events_phy` + - The number of times where the link operative state changed to down. In + case this counter is increasing it may indicate port flapping. You may + need to replace the cable/transceiver. + - Error + + * - `rx_out_of_buffer` + - Number of times receive queue had no software buffers allocated for the + adapter's incoming traffic. + - Error + + * - `module_bus_stuck` + - The number of times that module's I\ :sup:`2`\C bus (data or clock) + short-wire was detected. You may need to replace the cable/transceiver.
+ - Error + + * - `module_high_temp` + - The number of times that the module temperature was too high. If this + issue persists, you may need to check the ambient temperature or replace + the cable/transceiver module. + - Error + + * - `module_bad_shorted` + - The number of times that the module cables were shorted. You may need to + replace the cable/transceiver module. + - Error + + * - `module_unplug` + - The number of times that module was ejected. + - Informative + + * - `rx_buffer_passed_thres_phy` + - The number of events where the port receive buffer was over 85% full. + - Informative + + * - `tx_pause_storm_warning_events` + - The number of times the device was sending pauses for a long period of + time. + - Informative + + * - `tx_pause_storm_error_events` + - The number of times the device was sending pauses for a long period of + time, reaching time out and disabling transmission of pause frames. During + the period in which pause frames were disabled, drops could have + occurred. + - Error + + * - `rx[i]_buff_alloc_err` + - Failed to allocate a buffer for a received packet (or SKB) on ring i. + - Error + + * - `rx_bits_phy` + - This counter provides information on the total amount of traffic that + could have been received and can be used as a guideline to measure the + ratio of errored traffic in `rx_pcs_symbol_err_phy` and + `rx_corrected_bits_phy`. + - Informative + + * - `rx_pcs_symbol_err_phy` + - This counter counts the number of symbol errors that were not corrected by + the FEC correction algorithm or that occurred while FEC was not active on + this interface. If this counter is increasing, it implies that the link + between the NIC and the network is suffering from high BER, and that + traffic is lost. You may need to replace the cable/transceiver. The error + rate is the number of `rx_pcs_symbol_err_phy` divided by the number of + `rx_bits_phy` on a specific time frame.
+ - Error + + * - `rx_corrected_bits_phy` + - The number of corrected bits on this port according to active FEC + (RS/FC). If this counter is increasing, it implies that the link between + the NIC and the network is suffering from high BER. The corrected bit + rate is the number of `rx_corrected_bits_phy` divided by the number of + `rx_bits_phy` on a specific time frame. + - Error + + * - `rx_err_lane_[l]_phy` + - This counter counts the number of physical raw errors per lane l index. + The counter counts errors before FEC corrections. If this counter is + increasing, it implies that the link between the NIC and the network is + suffering from high BER, and that traffic might be lost. You may need to + replace the cable/transceiver. Please check in accordance with + `rx_corrected_bits_phy`. + - Error + + * - `rx_global_pause` + - The number of pause packets received on the physical port. If this + counter is increasing, it implies that the network is congested and + cannot absorb the traffic coming from the adapter. Note: This counter is + only enabled when global pause mode is enabled. + - Informative + + * - `rx_global_pause_duration` + - The duration of pause received (in microSec) on the physical port. The + counter represents the time the port did not send any traffic. If this + counter is increasing, it implies that the network is congested and + cannot absorb the traffic coming from the adapter. Note: This counter is + only enabled when global pause mode is enabled. + - Informative + + * - `tx_global_pause` + - The number of pause packets transmitted on a physical port. If this + counter is increasing, it implies that the adapter is congested and + cannot absorb the traffic coming from the network. Note: This counter is + only enabled when global pause mode is enabled. + - Informative + + * - `tx_global_pause_duration` + - The duration of pause transmitter (in microSec) on the physical port. 
+ Note: This counter is only enabled when global pause mode is enabled. + - Informative + + * - `rx_global_pause_transition` + - The number of times a transition from Xoff to Xon on the physical port + has occurred. Note: This counter is only enabled when global pause mode + is enabled. + - Informative + + * - `rx_if_down_packets` + - The number of received packets that were dropped due to interface down. + - Informative + +Priority Port Counters +---------------------- +The following counters are physical port counters that are counted per L2 +priority (0-7). + +**Note:** `p` in the counter name represents the priority. + +.. flat-table:: Priority Port Counter Table + :widths: 2 3 1 + + * - Counter + - Description + - Type + + * - `rx_prio[p]_bytes` + - The number of bytes received with priority p on the physical port. + - Informative + + * - `rx_prio[p]_packets` + - The number of packets received with priority p on the physical port. + - Informative + + * - `tx_prio[p]_bytes` + - The number of bytes transmitted on priority p on the physical port. + - Informative + + * - `tx_prio[p]_packets` + - The number of packets transmitted on priority p on the physical port. + - Informative + + * - `rx_prio[p]_pause` + - The number of pause packets received with priority p on a physical port. + If this counter is increasing, it implies that the network is congested + and cannot absorb the traffic coming from the adapter. Note: This counter + is available only if PFC was enabled on priority p. + - Informative + + * - `rx_prio[p]_pause_duration` + - The duration of pause received (in microSec) on priority p on the + physical port. The counter represents the time the port did not send any + traffic on this priority. If this counter is increasing, it implies that + the network is congested and cannot absorb the traffic coming from the + adapter. Note: This counter is available only if PFC was enabled on + priority p. 
+ - Informative + + * - `rx_prio[p]_pause_transition` + - The number of times a transition from Xoff to Xon on priority p on the + physical port has occurred. Note: This counter is available only if PFC + was enabled on priority p. + - Informative + + * - `tx_prio[p]_pause` + - The number of pause packets transmitted on priority p on a physical port. + If this counter is increasing, it implies that the adapter is congested + and cannot absorb the traffic coming from the network. Note: This counter + is available only if PFC was enabled on priority p. + - Informative + + * - `tx_prio[p]_pause_duration` + - The duration of pause transmitted (in microSec) on priority p on the + physical port. Note: This counter is available only if PFC was enabled on + priority p. + - Informative + + * - `rx_prio[p]_buf_discard` + - The number of packets discarded by device due to lack of per host receive + buffers. + - Informative + + * - `rx_prio[p]_cong_discard` + - The number of packets discarded by device due to per host congestion. + - Informative + + * - `rx_prio[p]_marked` + - The number of packets ECN marked by device due to per host congestion. + - Informative + + * - `rx_prio[p]_discards` + - The number of packets discarded by device due to lack of receive buffers. + - Informative + +Device Counters +--------------- +.. flat-table:: Device Counter Table + :widths: 2 3 1 + + * - Counter + - Description + - Type + + * - `rx_pci_signal_integrity` + - Counts physical layer PCIe signal integrity errors, the number of + transitions to recovery due to Framing errors and CRC (dlp and tlp). If + this counter is rising, try moving the adapter card to a different slot + to rule out a bad PCI slot. Validate that you are running with the latest + firmware available and latest server BIOS version.
+ - Error + + * - `tx_pci_signal_integrity` + - Counts physical layer PCIe signal integrity errors, the number of + transitions to recovery initiated by the other side (moving to recovery + due to getting TS/EIEOS). If this counter is rising, try moving the + adapter card to a different slot to rule out a bad PCI slot. Validate + that you are running with the latest firmware available and latest server + BIOS version. + - Error + + * - `outbound_pci_buffer_overflow` + - The number of packets dropped due to pci buffer overflow. If this counter + is rising at a high rate, it might indicate that the receive traffic rate + for a host is larger than the PCIe bus and therefore congestion occurs. + - Informative + + * - `outbound_pci_stalled_rd` + - The percentage (in the range 0...100) of time within the last second that + the NIC had outbound non-posted read requests but could not perform the + operation due to insufficient posted credits. + - Informative + + * - `outbound_pci_stalled_wr` + - The percentage (in the range 0...100) of time within the last second that + the NIC had outbound posted write requests but could not perform the + operation due to insufficient posted credits. + - Informative + + * - `outbound_pci_stalled_rd_events` + - The number of seconds where `outbound_pci_stalled_rd` was above 30%. + - Informative + + * - `outbound_pci_stalled_wr_events` + - The number of seconds where `outbound_pci_stalled_wr` was above 30%. + - Informative + + * - `dev_out_of_buffer` + - The number of times the device owned queue had not enough buffers + allocated. + - Error diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst new file mode 100644 index 0000000000000..9b5c40ba7f0da --- /dev/null +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst @@ -0,0 +1,224 @@ +.. SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +..
include:: <isonum.txt> + +======= +Devlink +======= + +:Copyright: |copy| 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +Contents +======== + +- `Info`_ +- `Parameters`_ +- `Health reporters`_ + +Info +==== + +The devlink info reports the running and stored firmware versions on device. +It also prints the device PSID which represents the HCA board type ID. + +User command example:: + + $ devlink dev info pci/0000:00:06.0 + pci/0000:00:06.0: + driver mlx5_core + versions: + fixed: + fw.psid MT_0000000009 + running: + fw.version 16.26.0100 + stored: + fw.version 16.26.0100 + +Parameters +========== + +flow_steering_mode: Device flow steering mode +--------------------------------------------- +The flow steering mode parameter controls the flow steering mode of the driver. +Two modes are supported: +1. 'dmfs' - Device managed flow steering. +2. 'smfs' - Software/Driver managed flow steering. + +In DMFS mode, the HW steering entities are created and managed through the +Firmware. +In SMFS mode, the HW steering entities are created and managed by +the driver directly in hardware without firmware intervention. + +SMFS mode is faster and provides better rule insertion rate compared to default DMFS mode. + +User command examples: + +- Set SMFS flow steering mode:: + + $ devlink dev param set pci/0000:06:00.0 name flow_steering_mode value "smfs" cmode runtime + +- Read device flow steering mode:: + + $ devlink dev param show pci/0000:06:00.0 name flow_steering_mode + pci/0000:06:00.0: + name flow_steering_mode type driver-specific + values: + cmode runtime value smfs + +enable_roce: RoCE enablement state +---------------------------------- +If the device supports RoCE disablement, RoCE enablement state controls device +support for RoCE capability. Otherwise, the control occurs in the driver stack. +When RoCE is disabled at the driver level, only raw ethernet QPs are supported.
+ +To change RoCE enablement state, a user must change the driverinit cmode value +and run devlink reload. + +User command examples: + +- Disable RoCE:: + + $ devlink dev param set pci/0000:06:00.0 name enable_roce value false cmode driverinit + $ devlink dev reload pci/0000:06:00.0 + +- Read RoCE enablement state:: + + $ devlink dev param show pci/0000:06:00.0 name enable_roce + pci/0000:06:00.0: + name enable_roce type generic + values: + cmode driverinit value true + +esw_port_metadata: Eswitch port metadata state +---------------------------------------------- +When applicable, disabling eswitch metadata can increase packet rate +up to 20% depending on the use case and packet sizes. + +Eswitch port metadata state controls whether to internally tag packets with +metadata. Metadata tagging must be enabled for multi-port RoCE, failover +between representors and stacked devices. +By default metadata is enabled on the supported devices in E-switch. +Metadata is applicable only for E-switch in switchdev mode and +users may disable it when NONE of the below use cases will be in use: +1. HCA is in Dual/multi-port RoCE mode. +2. VF/SF representor bonding (Usually used for Live migration) +3. Stacked devices + +When metadata is disabled, the above use cases will fail to initialize if +users try to enable them. 
+ +- Show eswitch port metadata:: + + $ devlink dev param show pci/0000:06:00.0 name esw_port_metadata + pci/0000:06:00.0: + name esw_port_metadata type driver-specific + values: + cmode runtime value true + +- Disable eswitch port metadata:: + + $ devlink dev param set pci/0000:06:00.0 name esw_port_metadata value false cmode runtime + +- Change eswitch mode to switchdev mode after choosing the metadata value:: + + $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev + +Health reporters +================ + +tx reporter +----------- +The tx reporter is responsible for reporting and recovering from the following two error scenarios: + +- tx timeout + Report on kernel tx timeout detection. + Recover by searching lost interrupts. +- tx error completion + Report on error tx completion. + Recover by flushing the tx queue and resetting it. + +tx reporter also supports on demand diagnose callback, on which it provides +real time information of its send queues status. + +User commands examples: + +- Diagnose send queues status:: + + $ devlink health diagnose pci/0000:82:00.0 reporter tx + +NOTE: This command has valid output only when interface is up. Otherwise, the command has empty output. + +- Show number of tx errors indicated, number of recover flows ended successfully, + is autorecover enabled and graceful period from last recover:: + + $ devlink health show pci/0000:82:00.0 reporter tx + +rx reporter +----------- +The rx reporter is responsible for reporting and recovering from the following two error scenarios: + +- rx queues' initialization (population) timeout + Population of rx queues' descriptors on ring initialization is done + in napi context via triggering an irq. In case of a failure to get + the minimum amount of descriptors, a timeout would occur, and + descriptors could be recovered by polling the EQ (Event Queue). +- rx completions with errors (reported by HW on interrupt context) + Report on rx completion error.
+ Recover (if needed) by flushing the related queue and resetting it. + +rx reporter also supports on demand diagnose callback, on which it +provides real time information of its receive queues' status. + +- Diagnose rx queues' status and corresponding completion queue:: + + $ devlink health diagnose pci/0000:82:00.0 reporter rx + +NOTE: This command has valid output only when interface is up. Otherwise, the command has empty output. + +- Show number of rx errors indicated, number of recover flows ended successfully, + is autorecover enabled, and graceful period from last recover:: + + $ devlink health show pci/0000:82:00.0 reporter rx + +fw reporter +----------- +The fw reporter implements `diagnose` and `dump` callbacks. +It follows symptoms of fw error such as fw syndrome by triggering +fw core dump and storing it into the dump buffer. +The fw reporter diagnose command can be triggered any time by the user to check +current fw status. + +User commands examples: + +- Check fw health status:: + + $ devlink health diagnose pci/0000:82:00.0 reporter fw + +- Read FW core dump if already stored or trigger new one:: + + $ devlink health dump show pci/0000:82:00.0 reporter fw + +NOTE: This command can run only on the PF which has fw tracer ownership, +running it on other PF or any VF will return "Operation not permitted". + +fw fatal reporter +----------------- +The fw fatal reporter implements `dump` and `recover` callbacks. +It follows fatal error indications by CR-space dump and recover flow. +The CR-space dump uses vsc interface which is valid even if the FW command +interface is not functional, which is the case in most FW fatal errors. +The recover function runs recover flow which reloads the driver and triggers fw +reset if needed. +On firmware error, the health buffer is dumped into the dmesg. The log +level is derived from the error's severity (given in health buffer).
+ +User commands examples: + +- Run fw recover flow manually:: + + $ devlink health recover pci/0000:82:00.0 reporter fw_fatal + +- Read FW CR-space dump if already stored or trigger new one:: + + $ devlink health dump show pci/0000:82:00.1 reporter fw_fatal + +NOTE: This command can run only on PF. diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/index.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/index.rst new file mode 100644 index 0000000000000..3fdcd6b61ccfa --- /dev/null +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/index.rst @@ -0,0 +1,26 @@ +.. SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +.. include:: <isonum.txt> + +Mellanox ConnectX(R) mlx5 core VPI Network Driver +================================================= + +:Copyright: |copy| 2019, Mellanox Technologies LTD. +:Copyright: |copy| 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +Contents: + +.. toctree:: + :maxdepth: 2 + + kconfig + devlink + switchdev + tracepoints + counters + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/kconfig.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/kconfig.rst new file mode 100644 index 0000000000000..43b1f7e87ec47 --- /dev/null +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/kconfig.rst @@ -0,0 +1,168 @@ +.. SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +.. include:: <isonum.txt> + +======================================= +Enabling the driver and kconfig options +======================================= + +:Copyright: |copy| 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +| mlx5 core is modular and most of the major mlx5 core driver features can be selected (compiled in/out) +| at build time via kernel Kconfig flags.
+| Basic features, ethernet net device rx/tx offloads and XDP, are available with the most basic flags +| CONFIG_MLX5_CORE=y/m and CONFIG_MLX5_CORE_EN=y. +| For the list of advanced features, please see below. + +**CONFIG_MLX5_BRIDGE=(y/n)** + +| Enable :ref:`Ethernet Bridging (BRIDGE) offloading support <mlx5_bridge_offload>`. +| This will provide the ability to add representors of mlx5 uplink and VF +| ports to Bridge and offloading rules for traffic between such ports. +| Supports VLANs (trunk and access modes). + + +**CONFIG_MLX5_CORE=(y/m/n)** (module mlx5_core.ko) + +| The driver can be enabled by choosing CONFIG_MLX5_CORE=y/m in kernel config. +| This will provide mlx5 core driver for mlx5 ulps to interface with (mlx5e, mlx5_ib). + + +**CONFIG_MLX5_CORE_EN=(y/n)** + +| Choosing this option will allow basic ethernet netdevice support with all of the standard rx/tx offloads. +| mlx5e is the mlx5 ulp driver which provides netdevice kernel interface, when chosen, mlx5e will be +| built-in into mlx5_core.ko. + + +**CONFIG_MLX5_CORE_EN_DCB=(y/n)** + +| Enables `Data Center Bridging (DCB) Support `_. + + +**CONFIG_MLX5_CORE_IPOIB=(y/n)** + +| IPoIB offloads & acceleration support. +| Requires CONFIG_MLX5_CORE_EN to provide an accelerated interface for the rdma +| IPoIB ulp netdevice. + + +**CONFIG_MLX5_CLS_ACT=(y/n)** + +| Enables offload support for TC classifier action (NET_CLS_ACT). +| Works in both native NIC mode and Switchdev SRIOV mode. +| Flow-based classifiers, such as those registered through +| `tc-flower(8)`, are processed by the device, rather than the +| host. Actions that would then overwrite matching classification +| results would then be instant due to the offload. + + +**CONFIG_MLX5_EN_ARFS=(y/n)** + +| Enables Hardware-accelerated receive flow steering (arfs) support, and ntuple filtering. +| https://community.mellanox.com/s/article/howto-configure-arfs-on-connectx-4 + + +**CONFIG_MLX5_EN_IPSEC=(y/n)** + +| Enables `IPSec XFRM cryptography-offload acceleration `_.
+ + +**CONFIG_MLX5_EN_MACSEC=(y/n)** + +| Build support for MACsec cryptography-offload acceleration in the NIC. + + +**CONFIG_MLX5_EN_RXNFC=(y/n)** + +| Enables ethtool receive network flow classification, which allows user defined +| flow rules to direct traffic into arbitrary rx queue via ethtool set/get_rxnfc API. + + +**CONFIG_MLX5_EN_TLS=(y/n)** + +| TLS cryptography-offload acceleration. + + +**CONFIG_MLX5_ESWITCH=(y/n)** + +| Ethernet SRIOV E-Switch support in ConnectX NIC. E-Switch provides internal SRIOV packet steering +| and switching for the enabled VFs and PF in two available modes: +| 1) `Legacy SRIOV mode (L2 mac vlan steering based) `_. +| 2) `Switchdev mode (eswitch offloads) `_. + + +**CONFIG_MLX5_FPGA=(y/n)** + +| Build support for the Innova family of network cards by Mellanox Technologies. +| Innova network cards are comprised of a ConnectX chip and an FPGA chip on one board. +| If you select this option, the mlx5_core driver will include the Innova FPGA core and allow +| building sandbox-specific client drivers. + + +**CONFIG_MLX5_INFINIBAND=(y/n/m)** (module mlx5_ib.ko) + +| Provides low-level InfiniBand/RDMA and `RoCE `_ support. + + +**CONFIG_MLX5_MPFS=(y/n)** + +| Ethernet Multi-Physical Function Switch (MPFS) support in ConnectX NIC. +| MPFS is required when `Multi-Host `_ configuration is enabled to allow passing +| user configured unicast MAC addresses to the requesting PF. + + +**CONFIG_MLX5_SF=(y/n)** + +| Build support for subfunction. +| Subfunctions are more lightweight than PCI SRIOV VFs. Choosing this option +| will enable support for creating subfunction devices. + + +**CONFIG_MLX5_SF_MANAGER=(y/n)** + +| Build support for subfunction port in the NIC. A Mellanox subfunction +| port is managed through devlink. A subfunction supports RDMA, netdevice +| and vdpa device. It is similar to a SRIOV VF but it doesn't require +| SRIOV support.
+ + +**CONFIG_MLX5_SW_STEERING=(y/n)** + +| Build support for software-managed steering in the NIC. + + +**CONFIG_MLX5_TC_CT=(y/n)** + +| Support offloading connection tracking rules via tc ct action. + + +**CONFIG_MLX5_TC_SAMPLE=(y/n)** + +| Support offloading sample rules via tc sample action. + + +**CONFIG_MLX5_VDPA=(y/n)** + +| Support library for Mellanox VDPA drivers. Provides code that is +| common for all types of VDPA drivers. The following drivers are planned: +| net, block. + + +**CONFIG_MLX5_VDPA_NET=(y/n)** + +| VDPA network driver for ConnectX6 and newer. Provides offloading +| of virtio net datapath such that descriptors put on the ring will +| be executed by the hardware. It also supports a variety of stateless +| offloads depending on the actual device used and firmware version. + + +**CONFIG_MLX5_VFIO_PCI=(y/n)** + +| This provides migration support for MLX5 devices using the VFIO framework. + + +**External options** ( Choose if the corresponding mlx5 feature is required ) + +- CONFIG_MLXFW: When chosen, mlx5 firmware flashing support will be enabled (via devlink and ethtool). +- CONFIG_PTP_1588_CLOCK: When chosen, mlx5 ptp support will be enabled. +- CONFIG_VXLAN: When chosen, mlx5 vxlan support will be enabled. diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/switchdev.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/switchdev.rst new file mode 100644 index 0000000000000..01deedb715975 --- /dev/null +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/switchdev.rst @@ -0,0 +1,239 @@ +.. SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +.. include:: <isonum.txt> + +========= +Switchdev +========= + +:Copyright: |copy| 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +.. _mlx5_bridge_offload: + +Bridge offload +============== + +The mlx5 driver implements support for offloading bridge rules when in switchdev +mode.
Linux bridge FDBs are automatically offloaded when mlx5 switchdev +representor is attached to bridge. + +- Change device to switchdev mode:: + + $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev + +- Attach mlx5 switchdev representor 'enp8s0f0' to bridge netdev 'bridge1':: + + $ ip link set enp8s0f0 master bridge1 + +VLANs +----- + +The following bridge VLAN functions are supported by mlx5: + +- VLAN filtering (including multiple VLANs per port):: + + $ ip link set bridge1 type bridge vlan_filtering 1 + $ bridge vlan add dev enp8s0f0 vid 2-3 + +- VLAN push on bridge ingress:: + + $ bridge vlan add dev enp8s0f0 vid 3 pvid + +- VLAN pop on bridge egress:: + + $ bridge vlan add dev enp8s0f0 vid 3 untagged + +Subfunction +=========== + +mlx5 supports subfunction management using the devlink port interface (see :ref:`Documentation/networking/devlink/devlink-port.rst <devlink_port>`). + +A subfunction has its own function capabilities and its own resources. This +means a subfunction has its own dedicated queues (txq, rxq, cq, eq). These +queues are neither shared nor stolen from the parent PCI function. + +When a subfunction is RDMA capable, it has its own QP1, GID table, and RDMA +resources neither shared nor stolen from the parent PCI function. + +A subfunction has a dedicated window in PCI BAR space that is not shared +with the other subfunctions or the parent PCI function. This ensures that all +devices (netdev, rdma, vdpa, etc.) of the subfunction access only the assigned +PCI BAR space. + +A subfunction supports eswitch representation through which it supports tc +offloads. The user configures eswitch to send/receive packets from/to +the subfunction port. + +Subfunctions share PCI level resources such as PCI MSI-X IRQs with +other subfunctions and/or with their parent PCI function.
+ +Example mlx5 software, system, and device view:: + + _______ + | admin | + | user |---------- + |_______| | + | | + ____|____ __|______ _________________ + | | | | | | + | devlink | | tc tool | | user | + | tool | |_________| | applications | + |_________| | |_________________| + | | | | + | | | | Userspace + +---------|-------------|-------------------|----------|--------------------+ + | | +----------+ +----------+ Kernel + | | | netdev | | rdma dev | + | | +----------+ +----------+ + (devlink port add/del | ^ ^ + port function set) | | | + | | +---------------| + _____|___ | | _______|_______ + | | | | | mlx5 class | + | devlink | +------------+ | | drivers | + | kernel | | rep netdev | | |(mlx5_core,ib) | + |_________| +------------+ | |_______________| + | | | ^ + (devlink ops) | | (probe/remove) + _________|________ | | ____|________ + | subfunction | | +---------------+ | subfunction | + | management driver|----- | subfunction |---| driver | + | (mlx5_core) | | auxiliary dev | | (mlx5_core) | + |__________________| +---------------+ |_____________| + | ^ + (sf add/del, vhca events) | + | (device add/del) + _____|____ ____|________ + | | | subfunction | + | PCI NIC |--- activate/deactivate events--->| host driver | + |__________| | (mlx5_core) | + |_____________| + +Subfunction is created using devlink port interface. 
+ +- Change device to switchdev mode:: + + $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev + +- Add a devlink port of subfunction flavour:: + + $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 + pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false + function: + hw_addr 00:00:00:00:00:00 state inactive opstate detached + +- Show a devlink port of the subfunction:: + + $ devlink port show pci/0000:06:00.0/32768 + pci/0000:06:00.0/32768: type eth netdev enp6s0pf0sf88 flavour pcisf pfnum 0 sfnum 88 + function: + hw_addr 00:00:00:00:00:00 state inactive opstate detached + +- Delete a devlink port of subfunction after use:: + + $ devlink port del pci/0000:06:00.0/32768 + +Function attributes +=================== + +The mlx5 driver provides a mechanism to set up PCI VF/SF function attributes in +a unified way for SmartNIC and non-SmartNIC. + +This is supported only when the eswitch mode is set to switchdev. Port function +configuration of the PCI VF/SF is supported through devlink eswitch port. + +Port function attributes should be set before PCI VF/SF is enumerated by the +driver. + +MAC address setup +----------------- + +mlx5 driver supports devlink port function attr mechanism to set up MAC +address. (refer to Documentation/networking/devlink/devlink-port.rst) + +RoCE capability setup +~~~~~~~~~~~~~~~~~~~~~ +Not all mlx5 PCI devices/SFs require RoCE capability. + +When RoCE capability is disabled, it saves 1 Mbyte worth of system memory per +PCI device/SF. + +mlx5 driver supports devlink port function attr mechanism to set up RoCE +capability. (refer to Documentation/networking/devlink/devlink-port.rst) + +migratable capability setup +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Users who want mlx5 PCI VFs to be able to perform live migration need to +explicitly enable the VF migratable capability. + +mlx5 driver supports devlink port function attr mechanism to set up migratable +capability.
(refer to Documentation/networking/devlink/devlink-port.rst) + +SF state setup +-------------- + +To use the SF, the user must activate the SF using the SF function state +attribute. + +- Get the state of the SF identified by its unique devlink port index:: + + $ devlink port show ens2f0npf0sf88 + pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false + function: + hw_addr 00:00:00:00:88:88 state inactive opstate detached + +- Activate the function and verify its state is active:: + + $ devlink port function set ens2f0npf0sf88 state active + + $ devlink port show ens2f0npf0sf88 + pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false + function: + hw_addr 00:00:00:00:88:88 state active opstate detached + +Upon function activation, the PF driver instance gets the event from the device +that a particular SF was activated. It's the cue to put the device on bus, probe +it and instantiate the devlink instance and class specific auxiliary devices +for it. 
+ +- Show the auxiliary device and port of the subfunction:: + + $ devlink dev show + devlink dev show auxiliary/mlx5_core.sf.4 + + $ devlink port show auxiliary/mlx5_core.sf.4/1 + auxiliary/mlx5_core.sf.4/1: type eth netdev p0sf88 flavour virtual port 0 splittable false + + $ rdma link show mlx5_0/1 + link mlx5_0/1 state ACTIVE physical_state LINK_UP netdev p0sf88 + + $ rdma dev show + 8: rocep6s0f1: node_type ca fw 16.29.0550 node_guid 248a:0703:00b3:d113 sys_image_guid 248a:0703:00b3:d112 + 13: mlx5_0: node_type ca fw 16.29.0550 node_guid 0000:00ff:fe00:8888 sys_image_guid 248a:0703:00b3:d112 + +- Subfunction auxiliary device and class device hierarchy:: + + mlx5_core.sf.4 + (subfunction auxiliary device) + /\ + / \ + / \ + / \ + / \ + mlx5_core.eth.4 mlx5_core.rdma.4 + (sf eth aux dev) (sf rdma aux dev) + | | + | | + p0sf88 mlx5_0 + (sf netdev) (sf rdma device) + +Additionally, the SF port also gets the event when the driver attaches to the +auxiliary device of the subfunction. This results in changing the operational +state of the function. This provides visibility to the user to decide when it is +safe to delete the SF port for graceful termination of the subfunction. + +- Show the SF port operational state:: + + $ devlink port show ens2f0npf0sf88 + pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false + function: + hw_addr 00:00:00:00:88:88 state active opstate attached diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/tracepoints.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/tracepoints.rst new file mode 100644 index 0000000000000..a9d3e123adc4b --- /dev/null +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/tracepoints.rst @@ -0,0 +1,229 @@ +.. SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +.. include:: <isonum.txt> + +=========== +Tracepoints +=========== + +:Copyright: |copy| 2023, NVIDIA CORPORATION & AFFILIATES.
All rights reserved. + +mlx5 driver provides internal tracepoints for tracking and debugging using +kernel tracepoints interfaces (refer to Documentation/trace/ftrace.rst). + +For the list of supported mlx5 events, check `/sys/kernel/debug/tracing/events/mlx5/`. + +tc and eswitch offloads tracepoints: + +- mlx5e_configure_flower: trace flower filter actions and cookies offloaded to mlx5:: + + $ echo mlx5:mlx5e_configure_flower >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + tc-6535 [019] ...1 2672.404466: mlx5e_configure_flower: cookie=0000000067874a55 actions= REDIRECT + +- mlx5e_delete_flower: trace flower filter actions and cookies deleted from mlx5:: + + $ echo mlx5:mlx5e_delete_flower >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + tc-6569 [010] .N.1 2686.379075: mlx5e_delete_flower: cookie=0000000067874a55 actions= NULL + +- mlx5e_stats_flower: trace flower stats request:: + + $ echo mlx5:mlx5e_stats_flower >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + tc-6546 [010] ...1 2679.704889: mlx5e_stats_flower: cookie=0000000060eb3d6a bytes=0 packets=0 lastused=4295560217 + +- mlx5e_tc_update_neigh_used_value: trace tunnel rule neigh update value offloaded to mlx5:: + + $ echo mlx5:mlx5e_tc_update_neigh_used_value >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + kworker/u48:4-8806 [009] ...1 55117.882428: mlx5e_tc_update_neigh_used_value: netdev: ens1f0 IPv4: 1.1.1.10 IPv6: ::ffff:1.1.1.10 neigh_used=1 + +- mlx5e_rep_neigh_update: trace neigh update tasks scheduled due to neigh state change events:: + + $ echo mlx5:mlx5e_rep_neigh_update >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ...
+ kworker/u48:7-2221 [009] ...1 1475.387435: mlx5e_rep_neigh_update: netdev: ens1f0 MAC: 24:8a:07:9a:17:9a IPv4: 1.1.1.10 IPv6: ::ffff:1.1.1.10 neigh_connected=1 + +Bridge offloads tracepoints: + +- mlx5_esw_bridge_fdb_entry_init: trace bridge FDB entry offloaded to mlx5:: + + $ echo mlx5:mlx5_esw_bridge_fdb_entry_init >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + kworker/u20:9-2217 [003] ...1 318.582243: mlx5_esw_bridge_fdb_entry_init: net_device=enp8s0f0_0 addr=e4:fd:05:08:00:02 vid=0 flags=0 used=0 + +- mlx5_esw_bridge_fdb_entry_cleanup: trace bridge FDB entry deleted from mlx5:: + + $ echo mlx5:mlx5_esw_bridge_fdb_entry_cleanup >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + ip-2581 [005] ...1 318.629871: mlx5_esw_bridge_fdb_entry_cleanup: net_device=enp8s0f0_1 addr=e4:fd:05:08:00:03 vid=0 flags=0 used=16 + +- mlx5_esw_bridge_fdb_entry_refresh: trace bridge FDB entry offload refreshed in + mlx5:: + + $ echo mlx5:mlx5_esw_bridge_fdb_entry_refresh >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + kworker/u20:8-3849 [003] ...1 466716: mlx5_esw_bridge_fdb_entry_refresh: net_device=enp8s0f0_0 addr=e4:fd:05:08:00:02 vid=3 flags=0 used=0 + +- mlx5_esw_bridge_vlan_create: trace bridge VLAN object add on mlx5 + representor:: + + $ echo mlx5:mlx5_esw_bridge_vlan_create >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + ip-2560 [007] ...1 318.460258: mlx5_esw_bridge_vlan_create: vid=1 flags=6 + +- mlx5_esw_bridge_vlan_cleanup: trace bridge VLAN object delete from mlx5 + representor:: + + $ echo mlx5:mlx5_esw_bridge_vlan_cleanup >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + bridge-2582 [007] ...1 318.653496: mlx5_esw_bridge_vlan_cleanup: vid=2 flags=8 + +- mlx5_esw_bridge_vport_init: trace mlx5 vport assigned with bridge upper + device:: + + $ echo mlx5:mlx5_esw_bridge_vport_init >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ...
+ ip-2560 [007] ...1 318.458915: mlx5_esw_bridge_vport_init: vport_num=1 + +- mlx5_esw_bridge_vport_cleanup: trace mlx5 vport removed from bridge upper + device:: + + $ echo mlx5:mlx5_esw_bridge_vport_cleanup >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + ip-5387 [000] ...1 573713: mlx5_esw_bridge_vport_cleanup: vport_num=1 + +Eswitch QoS tracepoints: + +- mlx5_esw_vport_qos_create: trace creation of transmit scheduler arbiter for vport:: + + $ echo mlx5:mlx5_esw_vport_qos_create >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + <...>-23496 [018] .... 73136.838831: mlx5_esw_vport_qos_create: (0000:82:00.0) vport=2 tsar_ix=4 bw_share=0, max_rate=0 group=000000007b576bb3 + +- mlx5_esw_vport_qos_config: trace configuration of transmit scheduler arbiter for vport:: + + $ echo mlx5:mlx5_esw_vport_qos_config >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + <...>-26548 [023] .... 75754.223823: mlx5_esw_vport_qos_config: (0000:82:00.0) vport=1 tsar_ix=3 bw_share=34, max_rate=10000 group=000000007b576bb3 + +- mlx5_esw_vport_qos_destroy: trace deletion of transmit scheduler arbiter for vport:: + + $ echo mlx5:mlx5_esw_vport_qos_destroy >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + <...>-27418 [004] .... 76546.680901: mlx5_esw_vport_qos_destroy: (0000:82:00.0) vport=1 tsar_ix=3 + +- mlx5_esw_group_qos_create: trace creation of transmit scheduler arbiter for rate group:: + + $ echo mlx5:mlx5_esw_group_qos_create >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + <...>-26578 [008] .... 75776.022112: mlx5_esw_group_qos_create: (0000:82:00.0) group=000000008dac63ea tsar_ix=5 + +- mlx5_esw_group_qos_config: trace configuration of transmit scheduler arbiter for rate group:: + + $ echo mlx5:mlx5_esw_group_qos_config >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ...
+ <...>-27303 [020] .... 76461.455356: mlx5_esw_group_qos_config: (0000:82:00.0) group=000000008dac63ea tsar_ix=5 bw_share=100 max_rate=20000 + +- mlx5_esw_group_qos_destroy: trace deletion of transmit scheduler arbiter for group:: + + $ echo mlx5:mlx5_esw_group_qos_destroy >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + <...>-27418 [006] .... 76547.187258: mlx5_esw_group_qos_destroy: (0000:82:00.0) group=000000007b576bb3 tsar_ix=1 + +SF tracepoints: + +- mlx5_sf_add: trace addition of the SF port:: + + $ echo mlx5:mlx5_sf_add >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + devlink-9363 [031] ..... 24610.188722: mlx5_sf_add: (0000:06:00.0) port_index=32768 controller=0 hw_id=0x8000 sfnum=88 + +- mlx5_sf_free: trace freeing of the SF port:: + + $ echo mlx5:mlx5_sf_free >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + devlink-9830 [038] ..... 26300.404749: mlx5_sf_free: (0000:06:00.0) port_index=32768 controller=0 hw_id=0x8000 + +- mlx5_sf_activate: trace activation of the SF port:: + + $ echo mlx5:mlx5_sf_activate >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + devlink-29841 [008] ..... 3669.635095: mlx5_sf_activate: (0000:08:00.0) port_index=32768 controller=0 hw_id=0x8000 + +- mlx5_sf_deactivate: trace deactivation of the SF port:: + + $ echo mlx5:mlx5_sf_deactivate >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + devlink-29994 [008] ..... 4015.969467: mlx5_sf_deactivate: (0000:08:00.0) port_index=32768 controller=0 hw_id=0x8000 + +- mlx5_sf_hwc_alloc: trace allocating of the hardware SF context:: + + $ echo mlx5:mlx5_sf_hwc_alloc >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + devlink-9775 [031] ..... 
26296.385259: mlx5_sf_hwc_alloc: (0000:06:00.0) controller=0 hw_id=0x8000 sfnum=88 + +- mlx5_sf_hwc_free: trace freeing of the hardware SF context:: + + $ echo mlx5:mlx5_sf_hwc_free >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + kworker/u128:3-9093 [046] ..... 24625.365771: mlx5_sf_hwc_free: (0000:06:00.0) hw_id=0x8000 + +- mlx5_sf_hwc_deferred_free: trace deferred freeing of the hardware SF context:: + + $ echo mlx5:mlx5_sf_hwc_deferred_free >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + devlink-9519 [046] ..... 24624.400271: mlx5_sf_hwc_deferred_free: (0000:06:00.0) hw_id=0x8000 + +- mlx5_sf_update_state: trace state updates for SF contexts:: + + $ echo mlx5:mlx5_sf_update_state >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + kworker/u20:3-29490 [009] ..... 4141.453530: mlx5_sf_update_state: (0000:08:00.0) port_index=32768 controller=0 hw_id=0x8000 state=2 + +- mlx5_sf_vhca_event: trace SF vhca event and state:: + + $ echo mlx5:mlx5_sf_vhca_event >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + kworker/u128:3-9093 [046] ..... 24625.365525: mlx5_sf_vhca_event: (0000:06:00.0) hw_id=0x8000 sfnum=88 vhca_state=1 + +- mlx5_sf_dev_add: trace SF device add event:: + + $ echo mlx5:mlx5_sf_dev_add >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + kworker/u128:3-9093 [000] ..... 24616.524495: mlx5_sf_dev_add: (0000:06:00.0) sfdev=00000000fc5d96fd aux_id=4 hw_id=0x8000 sfnum=88 + +- mlx5_sf_dev_del: trace SF device delete event:: + + $ echo mlx5:mlx5_sf_dev_del >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + kworker/u128:3-9093 [044] .....
24624.400749: mlx5_sf_dev_del: (0000:06:00.0) sfdev=00000000fc5d96fd aux_id=4 hw_id=0x8000 sfnum=88 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 6f8723cc68741..125c7cb7d839c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -454,6 +454,7 @@ struct mlx5e_txqsq { struct mlx5_clock *clock; struct net_device *netdev; struct mlx5_core_dev *mdev; + struct mlx5e_channel *channel; struct mlx5e_priv *priv; /* control path */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index 8469e9c386706..9a1bc93b7dc6e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -771,8 +771,8 @@ void mlx5e_ptp_activate_channel(struct mlx5e_ptp *c) if (test_bit(MLX5E_PTP_STATE_RX, c->state)) { mlx5e_ptp_rx_set_fs(c->priv); mlx5e_activate_rq(&c->rq); - mlx5e_trigger_napi_sched(&c->napi); } + mlx5e_trigger_napi_sched(&c->napi); } void mlx5e_ptp_deactivate_channel(struct mlx5e_ptp *c) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index b195dbbf6c90f..41e356d9d785f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -81,6 +81,10 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) sq->stats->recover++; clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); mlx5e_activate_txqsq(sq); + if (sq->channel) + mlx5e_trigger_napi_icosq(sq->channel); + else + mlx5e_trigger_napi_sched(sq->cq.napi); return 0; out: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c index 78c427b38048d..c095a12346deb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c @@ -232,9 +232,9 @@ parse_mirred(struct mlx5e_tc_act_parse_state *parse_state, parse_state->ifindexes[if_count] = out_dev->ifindex; parse_state->if_count++; is_uplink_rep = mlx5e_eswitch_uplink_rep(out_dev); - err = mlx5_lag_do_mirred(priv->mdev, out_dev); - if (err) - return err; + + if (mlx5_lag_mpesw_do_mirred(priv->mdev, out_dev, extack)) + return -EOPNOTSUPP; out_dev = get_fdb_out_dev(uplink_dev, out_dev); if (!out_dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index e84c3400ba1d8..7b0d3de0ec6c7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -158,6 +158,11 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry, attrs->family = x->props.family; attrs->type = x->xso.type; attrs->reqid = x->props.reqid; + attrs->upspec.dport = ntohs(x->sel.dport); + attrs->upspec.dport_mask = ntohs(x->sel.dport_mask); + attrs->upspec.sport = ntohs(x->sel.sport); + attrs->upspec.sport_mask = ntohs(x->sel.sport_mask); + attrs->upspec.proto = x->sel.proto; mlx5e_ipsec_init_limits(sa_entry, attrs); } @@ -221,6 +226,13 @@ static int mlx5e_xfrm_validate_state(struct mlx5_core_dev *mdev, NL_SET_ERR_MSG_MOD(extack, "Cannot offload xfrm states with geniv other than seqiv"); return -EINVAL; } + + if (x->sel.proto != IPPROTO_IP && + (x->sel.proto != IPPROTO_UDP || x->xso.dir != XFRM_DEV_OFFLOAD_OUT)) { + NL_SET_ERR_MSG_MOD(extack, "Device does not support upper protocol other than UDP, and only Tx direction"); + return -EINVAL; + } + switch (x->xso.type) { case XFRM_DEV_OFFLOAD_CRYPTO: if (!(mlx5_ipsec_device_caps(mdev) & MLX5_IPSEC_CAP_CRYPTO)) { @@ -517,6 +529,12 @@ static int mlx5e_xfrm_validate_policy(struct xfrm_policy *x, return -EINVAL; } + if (x->selector.proto != IPPROTO_IP && + (x->selector.proto != IPPROTO_UDP || 
x->xdo.dir != XFRM_DEV_OFFLOAD_OUT)) { + NL_SET_ERR_MSG_MOD(extack, "Device does not support upper protocol other than UDP, and only Tx direction"); + return -EINVAL; + } + return 0; } @@ -537,6 +555,11 @@ mlx5e_ipsec_build_accel_pol_attrs(struct mlx5e_ipsec_pol_entry *pol_entry, attrs->action = x->action; attrs->type = XFRM_DEV_OFFLOAD_PACKET; attrs->reqid = x->xfrm_vec[0].reqid; + attrs->upspec.dport = ntohs(sel->dport); + attrs->upspec.dport_mask = ntohs(sel->dport_mask); + attrs->upspec.sport = ntohs(sel->sport); + attrs->upspec.sport_mask = ntohs(sel->sport_mask); + attrs->upspec.proto = sel->proto; } static int mlx5e_xfrm_add_policy(struct xfrm_policy *x, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h index 8bed9c3610754..b387adca9c203 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h @@ -52,6 +52,14 @@ struct aes_gcm_keymat { u32 aes_key[256 / 32]; }; +struct upspec { + u16 dport; + u16 dport_mask; + u16 sport; + u16 sport_mask; + u8 proto; +}; + struct mlx5_accel_esp_xfrm_attrs { u32 esn; u32 spi; @@ -68,6 +76,7 @@ struct mlx5_accel_esp_xfrm_attrs { __be32 a6[4]; } daddr; + struct upspec upspec; u8 dir : 2; u8 esn_overlap : 1; u8 esn_trigger : 1; @@ -181,6 +190,7 @@ struct mlx5_accel_pol_xfrm_attrs { __be32 a6[4]; } daddr; + struct upspec upspec; u8 family; u8 action; u8 type : 2; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index 9f19f4b59a70c..5da6fe68eea6c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -467,6 +467,28 @@ static void setup_fte_reg_c0(struct mlx5_flow_spec *spec, u32 reqid) misc_parameters_2.metadata_reg_c_0, reqid); } +/* Match UDP as the upper-layer protocol and, when non-zero, its destination/source ports. */ +static void setup_fte_upper_proto_match(struct mlx5_flow_spec *spec, struct
upspec *upspec) +{ + if (upspec->proto != IPPROTO_UDP) + return; + + spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, spec->match_criteria, ip_protocol); + MLX5_SET(fte_match_set_lyr_2_4, spec->match_value, ip_protocol, upspec->proto); + if (upspec->dport) { + MLX5_SET(fte_match_set_lyr_2_4, spec->match_criteria, udp_dport, + upspec->dport_mask); + MLX5_SET(fte_match_set_lyr_2_4, spec->match_value, udp_dport, upspec->dport); + } + + if (upspec->sport) { + MLX5_SET(fte_match_set_lyr_2_4, spec->match_criteria, udp_sport, + upspec->sport_mask); + MLX5_SET(fte_match_set_lyr_2_4, spec->match_value, udp_sport, upspec->sport); + } +} + static int setup_modify_header(struct mlx5_core_dev *mdev, u32 val, u8 dir, struct mlx5_flow_act *flow_act) { @@ -654,6 +676,7 @@ static int tx_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry) setup_fte_addr6(spec, attrs->saddr.a6, attrs->daddr.a6); setup_fte_no_frags(spec); + setup_fte_upper_proto_match(spec, &attrs->upspec); switch (attrs->type) { case XFRM_DEV_OFFLOAD_CRYPTO: @@ -728,6 +751,7 @@ static int tx_add_policy(struct mlx5e_ipsec_pol_entry *pol_entry) setup_fte_addr6(spec, attrs->saddr.a6, attrs->daddr.a6); setup_fte_no_frags(spec); + setup_fte_upper_proto_match(spec, &attrs->upspec); err = setup_modify_header(mdev, attrs->reqid, XFRM_DEV_OFFLOAD_OUT, &flow_act); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 0e87432ec6f1c..27f90baac7688 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1470,6 +1470,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, sq->mkey_be = c->mkey_be; sq->netdev = c->netdev; sq->mdev = c->mdev; + sq->channel = c; sq->priv = c->priv; sq->ch_ix = c->ix; sq->txq_ix = txq_ix; @@ -2482,8 +2483,6 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c) mlx5e_activate_xsk(c); else
mlx5e_activate_rq(&c->rq); - - mlx5e_trigger_napi_icosq(c); } static void mlx5e_deactivate_channel(struct mlx5e_channel *c) @@ -2575,13 +2574,19 @@ int mlx5e_open_channels(struct mlx5e_priv *priv, return err; } -static void mlx5e_activate_channels(struct mlx5e_channels *chs) +static void mlx5e_activate_channels(struct mlx5e_priv *priv, struct mlx5e_channels *chs) { int i; for (i = 0; i < chs->num; i++) mlx5e_activate_channel(chs->c[i]); + if (priv->htb) + mlx5e_qos_activate_queues(priv); + + for (i = 0; i < chs->num; i++) + mlx5e_trigger_napi_icosq(chs->c[i]); + if (chs->ptp) mlx5e_ptp_activate_channel(chs->ptp); } @@ -2888,9 +2893,7 @@ static void mlx5e_build_txq_maps(struct mlx5e_priv *priv) void mlx5e_activate_priv_channels(struct mlx5e_priv *priv) { mlx5e_build_txq_maps(priv); - mlx5e_activate_channels(&priv->channels); - if (priv->htb) - mlx5e_qos_activate_queues(priv); + mlx5e_activate_channels(priv, &priv->channels); mlx5e_xdp_tx_enable(priv); /* dev_watchdog() wants all TX queues to be started when the carrier is diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c index eff92dc0927c1..11fefb99d6852 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c @@ -172,6 +172,7 @@ enum mlx5_ptys_rate { MLX5_PTYS_RATE_EDR = 1 << 5, MLX5_PTYS_RATE_HDR = 1 << 6, MLX5_PTYS_RATE_NDR = 1 << 7, + MLX5_PTYS_RATE_XDR = 1 << 8, }; static inline int mlx5_ptys_rate_enum_to_int(enum mlx5_ptys_rate rate) @@ -185,6 +186,7 @@ static inline int mlx5_ptys_rate_enum_to_int(enum mlx5_ptys_rate rate) case MLX5_PTYS_RATE_EDR: return 25000; case MLX5_PTYS_RATE_HDR: return 50000; case MLX5_PTYS_RATE_NDR: return 100000; + case MLX5_PTYS_RATE_XDR: return 200000; default: return -1; } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c index b8feaf0f5c4c8..f4b777d4e1086 
100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c @@ -22,7 +22,7 @@ static int type_show(struct seq_file *file, void *priv) struct mlx5_lag *ldev; char *mode = NULL; - ldev = dev->priv.lag; + ldev = mlx5_lag_dev(dev); mutex_lock(&ldev->lock); if (__mlx5_lag_is_active(ldev)) mode = get_str_mode_type(ldev); @@ -41,7 +41,7 @@ static int port_sel_mode_show(struct seq_file *file, void *priv) int ret = 0; char *mode; - ldev = dev->priv.lag; + ldev = mlx5_lag_dev(dev); mutex_lock(&ldev->lock); if (__mlx5_lag_is_active(ldev)) mode = mlx5_get_str_port_sel_mode(ldev->mode, ldev->mode_flags); @@ -61,7 +61,7 @@ static int state_show(struct seq_file *file, void *priv) struct mlx5_lag *ldev; bool active; - ldev = dev->priv.lag; + ldev = mlx5_lag_dev(dev); mutex_lock(&ldev->lock); active = __mlx5_lag_is_active(ldev); mutex_unlock(&ldev->lock); @@ -77,7 +77,7 @@ static int flags_show(struct seq_file *file, void *priv) bool shared_fdb; bool lag_active; - ldev = dev->priv.lag; + ldev = mlx5_lag_dev(dev); mutex_lock(&ldev->lock); lag_active = __mlx5_lag_is_active(ldev); if (!lag_active) @@ -108,7 +108,7 @@ static int mapping_show(struct seq_file *file, void *priv) int num_ports; int i; - ldev = dev->priv.lag; + ldev = mlx5_lag_dev(dev); mutex_lock(&ldev->lock); lag_active = __mlx5_lag_is_active(ldev); if (lag_active) { @@ -142,7 +142,7 @@ static int members_show(struct seq_file *file, void *priv) struct mlx5_lag *ldev; int i; - ldev = dev->priv.lag; + ldev = mlx5_lag_dev(dev); mutex_lock(&ldev->lock); for (i = 0; i < ldev->ports; i++) { if (!ldev->pf[i].dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index ad32b80e85018..dbf218cac5355 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -1187,7 +1187,7 @@ static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev) 
tmp_dev = mlx5_get_next_phys_dev_lag(dev); if (tmp_dev) - ldev = tmp_dev->priv.lag; + ldev = mlx5_lag_dev(tmp_dev); if (!ldev) { ldev = mlx5_lag_dev_alloc(dev); @@ -1386,8 +1386,7 @@ bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev) spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); - res = ldev && __mlx5_lag_is_sriov(ldev) && - test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags); + res = ldev && test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags); spin_unlock_irqrestore(&lag_lock, flags); return res; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index f30ac2de639f9..66013bef99391 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -50,19 +50,6 @@ struct lag_tracker { enum netdev_lag_hash hash_type; }; -enum mpesw_op { - MLX5_MPESW_OP_ENABLE, - MLX5_MPESW_OP_DISABLE, -}; - -struct mlx5_mpesw_work_st { - struct work_struct work; - struct mlx5_lag *lag; - enum mpesw_op op; - struct completion comp; - int result; -}; - /* LAG data of a ConnectX card. * It serves both its phys functions. 
*/ @@ -124,8 +111,6 @@ int mlx5_activate_lag(struct mlx5_lag *ldev, int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, struct net_device *ndev); bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev); -void mlx5_lag_del_mpesw_rule(struct mlx5_core_dev *dev); -int mlx5_lag_add_mpesw_rule(struct mlx5_core_dev *dev); char *mlx5_get_str_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags); void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c index d9fcb9ed726f5..d85a8dfc153d6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c @@ -28,13 +28,9 @@ static bool mlx5_lag_multipath_check_prereq(struct mlx5_lag *ldev) bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev) { - struct mlx5_lag *ldev; - bool res; - - ldev = mlx5_lag_dev(dev); - res = ldev && __mlx5_lag_is_multipath(ldev); + struct mlx5_lag *ldev = mlx5_lag_dev(dev); - return res; + return ldev && __mlx5_lag_is_multipath(ldev); } /** diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c index c17e8f1ec9146..3799f89ed1a6a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c @@ -58,7 +58,7 @@ static void mlx5_mpesw_work(struct work_struct *work) static int mlx5_lag_mpesw_queue_work(struct mlx5_core_dev *dev, enum mpesw_op op) { - struct mlx5_lag *ldev = dev->priv.lag; + struct mlx5_lag *ldev = mlx5_lag_dev(dev); struct mlx5_mpesw_work_st *work; int err = 0; @@ -96,25 +96,27 @@ int mlx5_lag_add_mpesw_rule(struct mlx5_core_dev *dev) return mlx5_lag_mpesw_queue_work(dev, MLX5_MPESW_OP_ENABLE); } -int mlx5_lag_do_mirred(struct mlx5_core_dev *mdev, struct net_device *out_dev) +int mlx5_lag_mpesw_do_mirred(struct mlx5_core_dev *mdev, + struct net_device *out_dev, + struct netlink_ext_ack 
*extack) { - struct mlx5_lag *ldev = mdev->priv.lag; + struct mlx5_lag *ldev = mlx5_lag_dev(mdev); if (!netif_is_bond_master(out_dev) || !ldev) return 0; - if (ldev->mode == MLX5_LAG_MODE_MPESW) - return -EOPNOTSUPP; + if (ldev->mode != MLX5_LAG_MODE_MPESW) + return 0; - return 0; + NL_SET_ERR_MSG_MOD(extack, "can't forward to bond in mpesw mode"); + return -EOPNOTSUPP; } bool mlx5_lag_mpesw_is_activated(struct mlx5_core_dev *dev) { - bool ret; + struct mlx5_lag *ldev = mlx5_lag_dev(dev); - ret = dev->priv.lag && dev->priv.lag->mode == MLX5_LAG_MODE_MPESW; - return ret; + return ldev && ldev->mode == MLX5_LAG_MODE_MPESW; } void mlx5_lag_mpesw_init(struct mlx5_lag *ldev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h index 88e8daffcf92e..818f19b5a984e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h @@ -12,8 +12,25 @@ struct lag_mpesw { atomic_t mpesw_rule_count; }; -int mlx5_lag_do_mirred(struct mlx5_core_dev *mdev, struct net_device *out_dev); +enum mpesw_op { + MLX5_MPESW_OP_ENABLE, + MLX5_MPESW_OP_DISABLE, +}; + +struct mlx5_mpesw_work_st { + struct work_struct work; + struct mlx5_lag *lag; + enum mpesw_op op; + struct completion comp; + int result; +}; + +int mlx5_lag_mpesw_do_mirred(struct mlx5_core_dev *mdev, + struct net_device *out_dev, + struct netlink_ext_ack *extack); bool mlx5_lag_mpesw_is_activated(struct mlx5_core_dev *dev); +void mlx5_lag_del_mpesw_rule(struct mlx5_core_dev *dev); +int mlx5_lag_add_mpesw_rule(struct mlx5_core_dev *dev); #if IS_ENABLED(CONFIG_MLX5_ESWITCH) void mlx5_lag_mpesw_init(struct mlx5_lag *ldev); void mlx5_lag_mpesw_cleanup(struct mlx5_lag *ldev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 75510a12ab026..4c9a402110595 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -362,7 +362,7 @@ static int mlx5_ptp_adjphase(struct ptp_clock_info *ptp, s32 delta) return mlx5_ptp_adjtime(ptp, delta); } -static int mlx5_ptp_adjfreq_real_time(struct mlx5_core_dev *mdev, s32 freq) +static int mlx5_ptp_freq_adj_real_time(struct mlx5_core_dev *mdev, long scaled_ppm) { u32 in[MLX5_ST_SZ_DW(mtutc_reg)] = {}; @@ -370,7 +370,15 @@ static int mlx5_ptp_adjfreq_real_time(struct mlx5_core_dev *mdev, s32 freq) return 0; MLX5_SET(mtutc_reg, in, operation, MLX5_MTUTC_OPERATION_ADJUST_FREQ_UTC); - MLX5_SET(mtutc_reg, in, freq_adjustment, freq); + + if (MLX5_CAP_MCAM_FEATURE(mdev, mtutc_freq_adj_units)) { + MLX5_SET(mtutc_reg, in, freq_adj_units, + MLX5_MTUTC_FREQ_ADJ_UNITS_SCALED_PPM); + MLX5_SET(mtutc_reg, in, freq_adjustment, scaled_ppm); + } else { + MLX5_SET(mtutc_reg, in, freq_adj_units, MLX5_MTUTC_FREQ_ADJ_UNITS_PPB); + MLX5_SET(mtutc_reg, in, freq_adjustment, scaled_ppm_to_ppb(scaled_ppm)); + } return mlx5_set_mtutc(mdev, in, sizeof(in)); } @@ -385,7 +393,8 @@ static int mlx5_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) int err; mdev = container_of(clock, struct mlx5_core_dev, clock); - err = mlx5_ptp_adjfreq_real_time(mdev, scaled_ppm_to_ppb(scaled_ppm)); + + err = mlx5_ptp_freq_adj_real_time(mdev, scaled_ppm); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 60596357bfc7a..96e57f1812a46 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -211,7 +211,8 @@ static int alloc_4k(struct mlx5_core_dev *dev, u64 *addr, u32 function) n = find_first_bit(&fp->bitmask, 8 * sizeof(fp->bitmask)); if (n >= MLX5_NUM_4K_IN_PAGE) { - mlx5_core_warn(dev, "alloc 4k bug\n"); + mlx5_core_warn(dev, "alloc 4k bug: fw page = 0x%llx, n = %u, bitmask: %lu, max num of 4K pages: %d\n", + fp->addr, n, fp->bitmask, MLX5_NUM_4K_IN_PAGE); return 
-ENOENT; } clear_bit(n, &fp->bitmask); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1b6201bb04c17..7cf6a78fea07f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9925,6 +9925,11 @@ struct mlx5_ifc_mpegc_reg_bits { u8 reserved_at_60[0x100]; }; +enum { + MLX5_MTUTC_FREQ_ADJ_UNITS_PPB = 0x0, + MLX5_MTUTC_FREQ_ADJ_UNITS_SCALED_PPM = 0x1, +}; + enum { MLX5_MTUTC_OPERATION_SET_TIME_IMMEDIATE = 0x1, MLX5_MTUTC_OPERATION_ADJUST_TIME = 0x2, @@ -9932,7 +9937,9 @@ enum { }; struct mlx5_ifc_mtutc_reg_bits { - u8 reserved_at_0[0x1c]; + u8 reserved_at_0[0x5]; + u8 freq_adj_units[0x3]; + u8 reserved_at_8[0x14]; u8 operation[0x4]; u8 freq_adjustment[0x20]; @@ -10005,7 +10012,8 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x51]; + u8 reserved_at_0[0x50]; + u8 mtutc_freq_adj_units[0x1]; u8 mtutc_time_adjustment_extended_range[0x1]; u8 reserved_at_52[0xb]; u8 mcia_32dwords[0x1];