Skip to content

Commit

Permalink
RAS/AMD/FMPM: Save SPA values
Browse files Browse the repository at this point in the history
The system physical address (SPA) of an error is not a stable value. It
will change depending on the location of the memory: parts can be
swapped. And it will change depending on memory topology: NUMA nodes
and/or interleaving can be adjusted.

Therefore, the SPA value is not part of the "FRU Memory Poison" record
format. And it will not be saved to persistent storage.

However, the SPA values can be helpful during debug and for system
admins during run time.

Save the SPA values in a separate structure. This is updated when
records are restored and when new errors are saved.

  [ bp: Make error messages more user friendly and add and correct
    comments. ]

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20240301143748.854090-3-yazen.ghannam@amd.com
  • Loading branch information
Yazen Ghannam authored and Borislav Petkov (AMD) committed Mar 1, 2024
1 parent 9d2b6fa commit 838850c
Showing 1 changed file with 71 additions and 1 deletion.
72 changes: 71 additions & 1 deletion drivers/ras/amd/fmpm.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,11 @@ struct fru_rec {
*/
static struct fru_rec **fru_records;

/*
 * System physical addresses array.
 *
 * Flat array of spa_nr_entries slots: max_nr_entries consecutive slots per
 * FRU, in the same order as fru_records[]. The slot for a descriptor is
 * (FRU index * max_nr_entries) + descriptor entry index.
 */
static u64 *spa_entries;

/* Value of a spa_entries[] slot that holds no translated address. */
#define INVALID_SPA	(~0ULL)

#define CPER_CREATOR_FMP \
GUID_INIT(0xcd5c2993, 0xf4b2, 0x41b2, 0xb5, 0xd4, 0xf9, 0xc3, \
0xa0, 0x33, 0x08, 0x75)
Expand All @@ -120,7 +125,7 @@ static struct fru_rec **fru_records;
0x12, 0x0a, 0x44, 0x58)

/**
* DOC: fru_poison_entries (byte)
* DOC: max_nr_entries (byte)
* Maximum number of descriptor entries possible for each FRU.
*
* Values between '1' and '255' are valid.
Expand All @@ -140,6 +145,9 @@ static unsigned int max_nr_fru;
/* Total length of record including headers and list of descriptor entries. */
static size_t max_rec_len;

/*
 * Total number of SPA entries across all FRUs. Computed as
 * max_nr_fru * max_nr_entries, so it is always a multiple of max_nr_entries.
 */
static unsigned int spa_nr_entries;

/*
* Protect the local records cache in fru_records and prevent concurrent
* writes to storage. This is only needed after init once notifier block
Expand Down Expand Up @@ -269,6 +277,54 @@ static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd)
return false;
}

/*
 * Translate a descriptor's address into a system physical address (SPA) and
 * cache the result in spa_entries[].
 *
 * @rec:   FRU record owning the descriptor; it is looked up in fru_records[]
 *         to determine the FRU index.
 * @entry: Index of the descriptor within @rec; must be below max_nr_entries.
 * @addr:  Address handed to the translation helper as atl_err.addr.
 * @id:    Value handed to the translation helper as atl_err.ipid.
 * @cpu:   Value handed to the translation helper as atl_err.cpu.
 *
 * spa_entries[] holds max_nr_entries slots per FRU, so the target slot is
 * (FRU index * max_nr_entries) + @entry. Nothing is written when the record
 * or slot cannot be located, or when the translation fails.
 */
static void save_spa(struct fru_rec *rec, unsigned int entry,
		     u64 addr, u64 id, unsigned int cpu)
{
	unsigned int i, fru_idx, spa_entry;
	struct atl_err a_err;
	unsigned long spa;

	if (entry >= max_nr_entries) {
		pr_warn_once("FRU descriptor entry %u out-of-bounds (max: %d)\n",
			     entry, max_nr_entries);
		return;
	}

	/*
	 * spa_nr_entries is always multiple of max_nr_entries, so stepping by
	 * max_nr_entries visits the first slot of each FRU in turn.
	 */
	for (i = 0; i < spa_nr_entries; i += max_nr_entries) {
		fru_idx = i / max_nr_entries;
		if (fru_records[fru_idx] == rec)
			break;
	}

	/*
	 * The loop ran off the end: @rec is not in fru_records[]. There is no
	 * meaningful index to report here, 'i' is just the exhausted offset.
	 */
	if (i >= spa_nr_entries) {
		pr_warn_once("FRU record not found\n");
		return;
	}

	spa_entry = i + entry;
	if (spa_entry >= spa_nr_entries) {
		pr_warn_once("spa_entries[] index out-of-bounds\n");
		return;
	}

	memset(&a_err, 0, sizeof(a_err));

	a_err.addr = addr;
	a_err.ipid = id;
	a_err.cpu  = cpu;

	spa = amd_convert_umc_mca_addr_to_sys_addr(&a_err);
	if (IS_ERR_VALUE(spa)) {
		pr_debug("Failed to get system address\n");
		return;
	}

	spa_entries[spa_entry] = spa;
	pr_debug("fru_idx: %u, entry: %u, spa_entry: %u, spa: 0x%016llx\n",
		 fru_idx, entry, spa_entry, spa_entries[spa_entry]);
}

static void update_fru_record(struct fru_rec *rec, struct mce *m)
{
struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
Expand Down Expand Up @@ -301,6 +357,7 @@ static void update_fru_record(struct fru_rec *rec, struct mce *m)
entry = fmp->nr_entries;

save_fpd:
save_spa(rec, entry, m->addr, m->ipid, m->extcpu);
fpd_dest = &rec->entries[entry];
memcpy(fpd_dest, &fpd, sizeof(struct cper_fru_poison_desc));

Expand Down Expand Up @@ -385,6 +442,7 @@ static void retire_mem_fmp(struct fru_rec *rec)
continue;

retire_dram_row(fpd->addr, fpd->hw_id, err_cpu);
save_spa(rec, i, fpd->addr, fpd->hw_id, err_cpu);
}
}

Expand Down Expand Up @@ -696,6 +754,8 @@ static int get_system_info(void)
if (!max_nr_entries)
max_nr_entries = FMPM_DEFAULT_MAX_NR_ENTRIES;

spa_nr_entries = max_nr_fru * max_nr_entries;

max_rec_len = sizeof(struct fru_rec);
max_rec_len += sizeof(struct cper_fru_poison_desc) * max_nr_entries;

Expand All @@ -714,6 +774,7 @@ static void free_records(void)
kfree(rec);

kfree(fru_records);
kfree(spa_entries);
}

static int allocate_records(void)
Expand All @@ -734,6 +795,15 @@ static int allocate_records(void)
}
}

spa_entries = kcalloc(spa_nr_entries, sizeof(u64), GFP_KERNEL);
if (!spa_entries) {
ret = -ENOMEM;
goto out_free;
}

for (i = 0; i < spa_nr_entries; i++)
spa_entries[i] = INVALID_SPA;

return ret;

out_free:
Expand Down

0 comments on commit 838850c

Please sign in to comment.