Skip to content

Commit

Permalink
Merge branch 'mlx5-add-sensor-name-in-temperature-message'
Browse files Browse the repository at this point in the history
Tariq Toukan says:

====================
mlx5: Add sensor name in temperature message

This small series from Shahar adds the sensors names to the temperature
event messages, in addition to the existing bitmap indicators.
This improves human readability.

Series starts with simple refactoring and modifications. The top patch
adds the sensors names.
====================

Link: https://patch.msgid.link/20250213094641.226501-1-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
Jakub Kicinski committed Feb 18, 2025
2 parents 6626f11 + 46fd50c commit fe3340a
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 3 deletions.
36 changes: 33 additions & 3 deletions drivers/net/ethernet/mellanox/mlx5/core/events.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "mlx5_core.h"
#include "lib/eq.h"
#include "lib/events.h"
#include "hwmon.h"

struct mlx5_event_nb {
struct mlx5_nb nb;
Expand Down Expand Up @@ -153,21 +154,50 @@ static int any_notifier(struct notifier_block *nb,
return NOTIFY_OK;
}

#if IS_ENABLED(CONFIG_HWMON)
static void print_sensor_names_in_bit_set(struct mlx5_core_dev *dev, struct mlx5_hwmon *hwmon,
u64 bit_set, int bit_set_offset)
{
unsigned long *bit_set_ptr = (unsigned long *)&bit_set;
int num_bits = sizeof(bit_set) * BITS_PER_BYTE;
int i;

for_each_set_bit(i, bit_set_ptr, num_bits) {
const char *sensor_name = hwmon_get_sensor_name(hwmon, i + bit_set_offset);

mlx5_core_warn(dev, "Sensor name[%d]: %s\n", i + bit_set_offset, sensor_name);
}
}
#endif /* CONFIG_HWMON */

/* type == MLX5_EVENT_TYPE_TEMP_WARN_EVENT */
static int temp_warn(struct notifier_block *nb, unsigned long type, void *data)
{
struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb);
struct mlx5_events *events = event_nb->ctx;
struct mlx5_core_dev *dev = events->dev;
struct mlx5_eqe *eqe = data;
u64 value_lsb;
u64 value_msb;

value_lsb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_lsb);
/* bit 1-63 are not supported for NICs,
* hence read only bit 0 (asic) from lsb.
*/
value_lsb &= 0x1;
value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb);

mlx5_core_warn(events->dev,
"High temperature on sensors with bit set %llx %llx",
value_msb, value_lsb);
if (net_ratelimit()) {
mlx5_core_warn(dev, "High temperature on sensors with bit set %#llx %#llx.\n",
value_msb, value_lsb);
#if IS_ENABLED(CONFIG_HWMON)
if (dev->hwmon) {
print_sensor_names_in_bit_set(dev, dev->hwmon, value_lsb, 0);
print_sensor_names_in_bit_set(dev, dev->hwmon, value_msb,
sizeof(value_lsb) * BITS_PER_BYTE);
}
#endif
}

return NOTIFY_OK;
}
Expand Down
5 changes: 5 additions & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/hwmon.c
Original file line number Diff line number Diff line change
Expand Up @@ -416,3 +416,8 @@ void mlx5_hwmon_dev_unregister(struct mlx5_core_dev *mdev)
mlx5_hwmon_free(hwmon);
mdev->hwmon = NULL;
}

const char *hwmon_get_sensor_name(struct mlx5_hwmon *hwmon, int channel)
{
return hwmon->temp_channel_desc[channel].sensor_name;
}
1 change: 1 addition & 0 deletions drivers/net/ethernet/mellanox/mlx5/core/hwmon.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

int mlx5_hwmon_dev_register(struct mlx5_core_dev *mdev);
void mlx5_hwmon_dev_unregister(struct mlx5_core_dev *mdev);
const char *hwmon_get_sensor_name(struct mlx5_hwmon *hwmon, int channel);

#else
static inline int mlx5_hwmon_dev_register(struct mlx5_core_dev *mdev)
Expand Down

0 comments on commit fe3340a

Please sign in to comment.