Skip to content

Commit

Permalink
Merge branch 'mlx5-health-syndrome'
Browse files Browse the repository at this point in the history
Tariq Toukan says:

====================
mlx5: Trust lockdown health syndrome

This series introduces a new error type in the health syndrome,
specifically for trust lock-down.  Additionally, it exposes the CRR bit
in the health buffer, which, when set, indicates that the error cannot
be recovered without a process involving a cold reset. We add The CRR
bit value to the health buffer info log and update it to be logged on
any syndrome.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Feb 28, 2025
2 parents 544a882 + 680173b commit 56794b5
Showing 1 changed file with 14 additions and 1 deletion.
15 changes: 14 additions & 1 deletion drivers/net/ethernet/mellanox/mlx5/core/health.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ static int mlx5_health_get_rfr(u8 rfr_severity)
return rfr_severity >> MLX5_RFR_BIT_OFFSET;
}

static int mlx5_health_get_crr(u8 rfr_severity)
{
return (rfr_severity >> MLX5_CRR_BIT_OFFSET) & 0x01;
}

static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;
Expand Down Expand Up @@ -375,6 +380,8 @@ static const char *hsynd_str(u8 synd)
return "High temperature";
case MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PCI_POISONED_ERR:
return "ICM fetch PCI data poisoned error";
case MLX5_INITIAL_SEG_HEALTH_SYNDROME_TRUST_LOCKDOWN_ERR:
return "Trust lockdown error";
default:
return "unrecognized error";
}
Expand Down Expand Up @@ -442,12 +449,15 @@ static void print_health_info(struct mlx5_core_dev *dev)
mlx5_log(dev, severity, "time %u\n", ioread32be(&h->time));
mlx5_log(dev, severity, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
mlx5_log(dev, severity, "rfr %d\n", mlx5_health_get_rfr(rfr_severity));
mlx5_log(dev, severity, "crr %d\n", mlx5_health_get_crr(rfr_severity));
mlx5_log(dev, severity, "severity %d (%s)\n", severity, mlx5_loglevel_str(severity));
mlx5_log(dev, severity, "irisc_index %d\n", ioread8(&h->irisc_index));
mlx5_log(dev, severity, "synd 0x%x: %s\n", ioread8(&h->synd),
hsynd_str(ioread8(&h->synd)));
mlx5_log(dev, severity, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
mlx5_log(dev, severity, "raw fw_ver 0x%08x\n", ioread32be(&h->fw_ver));
if (mlx5_health_get_crr(rfr_severity))
mlx5_core_warn(dev, "Cold reset is required\n");
}

static int
Expand Down Expand Up @@ -799,14 +809,17 @@ static void poll_health(struct timer_list *t)
health->prev = count;
if (health->miss_counter == MAX_MISSES) {
mlx5_core_err(dev, "device's health compromised - reached miss count\n");
health->synd = ioread8(&h->synd);
print_health_info(dev);
queue_work(health->wq, &health->report_work);
}

prev_synd = health->synd;
health->synd = ioread8(&h->synd);
if (health->synd && health->synd != prev_synd)
if (health->synd && health->synd != prev_synd) {
print_health_info(dev);
queue_work(health->wq, &health->report_work);
}

out:
mod_timer(&health->timer, get_next_poll_jiffies(dev));
Expand Down

0 comments on commit 56794b5

Please sign in to comment.