From 9dd3d5d258aceb37bdf09c8b91fa448f58ea81f0 Mon Sep 17 00:00:00 2001
From: Shahar Shitrit
Date: Thu, 13 Feb 2025 11:46:38 +0200
Subject: [PATCH 1/4] net/mlx5: Apply rate-limiting to high temperature warning

Wrap the high temperature warning in a temperature event with
a call to net_ratelimit() to prevent flooding the kernel log
with repeated warning messages when temperature exceeds the
threshold multiple times within a short duration.

Signed-off-by: Shahar Shitrit
Signed-off-by: Tariq Toukan
Reviewed-by: Mateusz Polchlopek
Link: https://patch.msgid.link/20250213094641.226501-2-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/mellanox/mlx5/core/events.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index d91ea53eb394d..e8beb6289d018 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -165,9 +165,10 @@ static int temp_warn(struct notifier_block *nb, unsigned long type, void *data)
 	value_lsb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_lsb);
 	value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb);
 
-	mlx5_core_warn(events->dev,
-		       "High temperature on sensors with bit set %llx %llx",
-		       value_msb, value_lsb);
+	if (net_ratelimit())
+		mlx5_core_warn(events->dev,
+			       "High temperature on sensors with bit set %llx %llx",
+			       value_msb, value_lsb);
 
 	return NOTIFY_OK;
 }

From b9b72ce0f5f4679fc878ad010658a72f30595118 Mon Sep 17 00:00:00 2001
From: Shahar Shitrit
Date: Thu, 13 Feb 2025 11:46:39 +0200
Subject: [PATCH 2/4] net/mlx5: Prefix temperature event bitmap with '0x' for clarity

Prepend '0x' to the sensor bitmap in the warning message
to clearly indicate that the bitmap is in hexadecimal format.

Signed-off-by: Shahar Shitrit
Signed-off-by: Tariq Toukan
Reviewed-by: Mateusz Polchlopek
Link: https://patch.msgid.link/20250213094641.226501-3-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/mellanox/mlx5/core/events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index e8beb6289d018..a661aa522a9ad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -167,7 +167,7 @@ static int temp_warn(struct notifier_block *nb, unsigned long type, void *data)
 
 	if (net_ratelimit())
 		mlx5_core_warn(events->dev,
-			       "High temperature on sensors with bit set %llx %llx",
+			       "High temperature on sensors with bit set %#llx %#llx",
 			       value_msb, value_lsb);
 
 	return NOTIFY_OK;

From 633f16d7e07c129a36b882c05379e01ce5bdb542 Mon Sep 17 00:00:00 2001
From: Shahar Shitrit
Date: Thu, 13 Feb 2025 11:46:40 +0200
Subject: [PATCH 3/4] net/mlx5: Modify LSB bitmask in temperature event to include only the first bit

In the sensor_count field of the MTEWE register, bits 1-62 are
supported only for unmanaged switches, not for NICs, and bit 63
is reserved for internal use. To prevent confusing output that
may include set bits that are not relevant to NIC sensors, we
update the bitmask to retain only the first bit, which
corresponds to the sensor ASIC.

Signed-off-by: Shahar Shitrit
Signed-off-by: Tariq Toukan
Reviewed-by: Mateusz Polchlopek
Link: https://patch.msgid.link/20250213094641.226501-4-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/mellanox/mlx5/core/events.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index a661aa522a9ad..e85a9042e3c24 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -163,6 +163,10 @@ static int temp_warn(struct notifier_block *nb, unsigned long type, void *data)
 	u64 value_msb;
 
 	value_lsb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_lsb);
+	/* bit 1-63 are not supported for NICs,
+	 * hence read only bit 0 (asic) from lsb.
+	 */
+	value_lsb &= 0x1;
 	value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb);
 
 	if (net_ratelimit())

From 46fd50cfcc12368bed9ae5257cc3beaea5b3c193 Mon Sep 17 00:00:00 2001
From: Shahar Shitrit
Date: Thu, 13 Feb 2025 11:46:41 +0200
Subject: [PATCH 4/4] net/mlx5: Add sensor name to temperature event message

Previously, a temperature event message included a bitmap indicating
which sensors detect high temperatures. To enhance clarity, we modify
the message format to explicitly list the names of the overheating
sensors, alongside the sensors bitmap. If HWMON is not configured,
the event message remains unchanged.

Signed-off-by: Shahar Shitrit
Reviewed-by: Carolina Jubran
Signed-off-by: Tariq Toukan
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250213094641.226501-5-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski
---
 .../net/ethernet/mellanox/mlx5/core/events.c | 31 +++++++++++++++++--
 .../net/ethernet/mellanox/mlx5/core/hwmon.c  |  5 +++
 .../net/ethernet/mellanox/mlx5/core/hwmon.h  |  1 +
 3 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index e85a9042e3c24..01c5f5990f9ae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -6,6 +6,7 @@
 #include "mlx5_core.h"
 #include "lib/eq.h"
 #include "lib/events.h"
+#include "hwmon.h"
 
 struct mlx5_event_nb {
 	struct mlx5_nb nb;
@@ -153,11 +154,28 @@ static int any_notifier(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+#if IS_ENABLED(CONFIG_HWMON)
+static void print_sensor_names_in_bit_set(struct mlx5_core_dev *dev, struct mlx5_hwmon *hwmon,
+					  u64 bit_set, int bit_set_offset)
+{
+	unsigned long *bit_set_ptr = (unsigned long *)&bit_set;
+	int num_bits = sizeof(bit_set) * BITS_PER_BYTE;
+	int i;
+
+	for_each_set_bit(i, bit_set_ptr, num_bits) {
+		const char *sensor_name = hwmon_get_sensor_name(hwmon, i + bit_set_offset);
+
+		mlx5_core_warn(dev, "Sensor name[%d]: %s\n", i + bit_set_offset, sensor_name);
+	}
+}
+#endif /* CONFIG_HWMON */
+
 /* type == MLX5_EVENT_TYPE_TEMP_WARN_EVENT */
 static int temp_warn(struct notifier_block *nb, unsigned long type, void *data)
 {
 	struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb);
 	struct mlx5_events *events = event_nb->ctx;
+	struct mlx5_core_dev *dev = events->dev;
 	struct mlx5_eqe *eqe = data;
 	u64 value_lsb;
 	u64 value_msb;
@@ -169,10 +187,17 @@ static int temp_warn(struct notifier_block *nb, unsigned long type, void *data)
 	value_lsb &= 0x1;
 	value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb);
 
-	if (net_ratelimit())
-		mlx5_core_warn(events->dev,
-			       "High temperature on sensors with bit set %#llx %#llx",
+	if (net_ratelimit()) {
+		mlx5_core_warn(dev, "High temperature on sensors with bit set %#llx %#llx.\n",
 			       value_msb, value_lsb);
+#if IS_ENABLED(CONFIG_HWMON)
+		if (dev->hwmon) {
+			print_sensor_names_in_bit_set(dev, dev->hwmon, value_lsb, 0);
+			print_sensor_names_in_bit_set(dev, dev->hwmon, value_msb,
+						      sizeof(value_lsb) * BITS_PER_BYTE);
+		}
+#endif
+	}
 
 	return NOTIFY_OK;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/hwmon.c b/drivers/net/ethernet/mellanox/mlx5/core/hwmon.c
index 353f81dccd1ce..4ba2636d7fb6c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/hwmon.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/hwmon.c
@@ -416,3 +416,8 @@ void mlx5_hwmon_dev_unregister(struct mlx5_core_dev *mdev)
 	mlx5_hwmon_free(hwmon);
 	mdev->hwmon = NULL;
 }
+
+const char *hwmon_get_sensor_name(struct mlx5_hwmon *hwmon, int channel)
+{
+	return hwmon->temp_channel_desc[channel].sensor_name;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/hwmon.h b/drivers/net/ethernet/mellanox/mlx5/core/hwmon.h
index 999654a9b9da5..f38271c22c105 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/hwmon.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/hwmon.h
@@ -10,6 +10,7 @@
 
 int mlx5_hwmon_dev_register(struct mlx5_core_dev *mdev);
 void mlx5_hwmon_dev_unregister(struct mlx5_core_dev *mdev);
+const char *hwmon_get_sensor_name(struct mlx5_hwmon *hwmon, int channel);
 
 #else
 static inline int mlx5_hwmon_dev_register(struct mlx5_core_dev *mdev)