Skip to content

Commit

Permalink
EDAC/i10nm: Skip the absent memory controllers
Browse files Browse the repository at this point in the history
Some Sapphire Rapids workstations' absent memory controllers
still appear as PCIe devices that fool the i10nm_edac driver
and result in "shift exponent -66 is negative" call traces
from skx_get_dimm_info().

Skip the absent memory controllers to avoid the call traces.

Reported-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Closes: https://lore.kernel.org/linux-edac/CAAd53p41Ku1m1rapeqb1xtD+kKuk+BaUW=dumuoF0ZO3GhFjFA@mail.gmail.com/T/#m5de16dce60a8c836ec235868c7c16e3fefad0cc2
Tested-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Reported-by: Koba Ko <koba.ko@canonical.com>
Closes: https://lore.kernel.org/linux-edac/SA1PR11MB71305B71CCCC3D9305835202892AA@SA1PR11MB7130.namprd11.prod.outlook.com/T/#t
Tested-by: Koba Ko <koba.ko@canonical.com>
Fixes: d4dc89d ("EDAC, i10nm: Add a driver for Intel 10nm server processors")
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20230710013232.59712-1-qiuxu.zhuo@intel.com
  • Loading branch information
Qiuxu Zhuo authored and Tony Luck committed Jul 24, 2023
1 parent 6eaae19 commit c545f5e
Showing 1 changed file with 49 additions and 5 deletions.
54 changes: 49 additions & 5 deletions drivers/edac/i10nm_base.c
Original file line number Diff line number Diff line change
Expand Up @@ -658,13 +658,49 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi
return mdev;
}

/**
* i10nm_imc_absent() - Check whether the memory controller @imc is absent
*
* @imc : The pointer to the structure of memory controller EDAC device.
*
* RETURNS : true if the memory controller EDAC device is absent, false otherwise.
*/
static bool i10nm_imc_absent(struct skx_imc *imc)
{
u32 mcmtr;
int i;

switch (res_cfg->type) {
case SPR:
for (i = 0; i < res_cfg->ddr_chan_num; i++) {
mcmtr = I10NM_GET_MCMTR(imc, i);
edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr);
if (mcmtr != ~0)
return false;
}

/*
* Some workstations' absent memory controllers still
* appear as PCIe devices, misleading the EDAC driver.
* By observing that the MMIO registers of these absent
* memory controllers consistently hold the value of ~0.
*
* We identify a memory controller as absent by checking
* if its MMIO register "mcmtr" == ~0 in all its channels.
*/
return true;
default:
return false;
}
}

static int i10nm_get_ddr_munits(void)
{
struct pci_dev *mdev;
void __iomem *mbase;
unsigned long size;
struct skx_dev *d;
int i, j = 0;
int i, lmc, j = 0;
u32 reg, off;
u64 base;

Expand All @@ -690,7 +726,7 @@ static int i10nm_get_ddr_munits(void)
edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n",
j++, base, reg);

for (i = 0; i < res_cfg->ddr_imc_num; i++) {
for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) {
mdev = get_ddr_munit(d, i, &off, &size);

if (i == 0 && !mdev) {
Expand All @@ -700,8 +736,6 @@ static int i10nm_get_ddr_munits(void)
if (!mdev)
continue;

d->imc[i].mdev = mdev;

edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n",
i, base + off, size, reg);

Expand All @@ -712,7 +746,17 @@ static int i10nm_get_ddr_munits(void)
return -ENODEV;
}

d->imc[i].mbase = mbase;
d->imc[lmc].mbase = mbase;
if (i10nm_imc_absent(&d->imc[lmc])) {
pci_dev_put(mdev);
iounmap(mbase);
d->imc[lmc].mbase = NULL;
edac_dbg(2, "Skip absent mc%d\n", i);
continue;
} else {
d->imc[lmc].mdev = mdev;
lmc++;
}
}
}

Expand Down

0 comments on commit c545f5e

Please sign in to comment.