From b5d7b2f04ebcff740f44ef4d295b3401aeb029f4 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 26 Feb 2025 14:25:40 +0200 Subject: [PATCH 1/4] net/mlx5: Avoid report two health errors on same syndrome If the health counter has not increased for a few polling intervals, the miss counter will reach the max misses threshold and a health report will be triggered for the FW health reporter. If a syndrome is found on the same health poll, another health report will be triggered. Avoid two health reports on the same syndrome by marking this syndrome as already known. Signed-off-by: Moshe Shemesh Reviewed-by: Shahar Shitrit Signed-off-by: Tariq Toukan Reviewed-by: Kalesh AP Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index a6329ca2d9bff..52c8035547be5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -799,6 +799,7 @@ static void poll_health(struct timer_list *t) health->prev = count; if (health->miss_counter == MAX_MISSES) { mlx5_core_err(dev, "device's health compromised - reached miss count\n"); + health->synd = ioread8(&h->synd); print_health_info(dev); queue_work(health->wq, &health->report_work); } From 6bdce277a32632045648abaf3386bb5229670e68 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 26 Feb 2025 14:25:41 +0200 Subject: [PATCH 2/4] net/mlx5: Log health buffer data on any syndrome Currently, health buffer data is logged either when a FW fatal error is detected or when the miss counter reaches the max misses threshold. Log the health buffer whenever a new health syndrome is detected. Signed-off-by: Moshe Shemesh Reviewed-by: Shahar Shitrit Signed-off-by: Tariq Toukan Reviewed-by: Kalesh AP Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 52c8035547be5..665cbce891757 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -806,8 +806,10 @@ static void poll_health(struct timer_list *t) prev_synd = health->synd; health->synd = ioread8(&h->synd); - if (health->synd && health->synd != prev_synd) + if (health->synd && health->synd != prev_synd) { + print_health_info(dev); queue_work(health->wq, &health->report_work); + } out: mod_timer(&health->timer, get_next_poll_jiffies(dev)); From 63f26199721fdf9bf6be74c8daf3df4f6e7e80ea Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Wed, 26 Feb 2025 14:25:42 +0200 Subject: [PATCH 3/4] net/mlx5: Expose crr in health buffer Expose the crr bit in struct health buffer. When set, it indicates that the error cannot be recovered from without a flow involving a cold reset. Add its value to the health buffer info log. Signed-off-by: Shahar Shitrit Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Reviewed-by: Michal Swiatkowski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 665cbce891757..c7ff646e08650 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -96,6 +96,11 @@ static int mlx5_health_get_rfr(u8 rfr_severity) return rfr_severity >> MLX5_RFR_BIT_OFFSET; } +static int mlx5_health_get_crr(u8 rfr_severity) +{ + return (rfr_severity >> MLX5_CRR_BIT_OFFSET) & 0x01; +} + static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; @@ -442,12 +447,15 @@ static void print_health_info(struct mlx5_core_dev *dev) mlx5_log(dev, severity, "time %u\n", ioread32be(&h->time)); mlx5_log(dev, severity, "hw_id 0x%08x\n", ioread32be(&h->hw_id)); mlx5_log(dev, severity, "rfr %d\n", mlx5_health_get_rfr(rfr_severity)); + mlx5_log(dev, severity, "crr %d\n", mlx5_health_get_crr(rfr_severity)); mlx5_log(dev, severity, "severity %d (%s)\n", severity, mlx5_loglevel_str(severity)); mlx5_log(dev, severity, "irisc_index %d\n", ioread8(&h->irisc_index)); mlx5_log(dev, severity, "synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd))); mlx5_log(dev, severity, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); mlx5_log(dev, severity, "raw fw_ver 0x%08x\n", ioread32be(&h->fw_ver)); + if (mlx5_health_get_crr(rfr_severity)) + mlx5_core_warn(dev, "Cold reset is required\n"); } static int From 680173b6bb6b7b521af6a50e9ff14bb1b0cdf931 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Wed, 26 Feb 2025 14:25:43 +0200 Subject: [PATCH 4/4] net/mlx5: Add trust lockdown error to health syndrome print function Add the new health syndrome value to hsynd_str() function to indicate that the device got a trust lockdown fault. 
Signed-off-by: Shahar Shitrit Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Reviewed-by: Kalesh AP Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index c7ff646e08650..91613d5a36cd4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -380,6 +380,8 @@ static const char *hsynd_str(u8 synd) return "High temperature"; case MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PCI_POISONED_ERR: return "ICM fetch PCI data poisoned error"; + case MLX5_INITIAL_SEG_HEALTH_SYNDROME_TRUST_LOCKDOWN_ERR: + return "Trust lockdown error"; default: return "unrecognized error"; }