Skip to content

Commit

Permalink
EDAC/ie31200: Switch Raptor Lake-S to interrupt mode
Browse files Browse the repository at this point in the history
Raptor Lake-S SoCs notify correctable memory errors via CMCI (Corrected
Machine Check Interrupt). Switch Raptor Lake-S EDAC support from polling
to interrupt mode by registering the callback to the MCE decode notifier
chain.

Note that as Raptor Lake-S SoCs may not recover from uncorrectable memory
errors, the system will hang as soon as this type of error occurs, and the
registered callback on the MCE decode chain will not be executed. This is
the expected behavior.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Tested-by: Gary Wang <gary.c.wang@intel.com>
Link: https://lore.kernel.org/r/20250310011411.31685-12-qiuxu.zhuo@intel.com
  • Loading branch information
Qiuxu Zhuo authored and Tony Luck committed Mar 10, 2025
1 parent d074228 commit a5db1b2
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 7 deletions.
2 changes: 1 addition & 1 deletion drivers/edac/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ config EDAC_I3200

config EDAC_IE31200
tristate "Intel e312xx"
depends on PCI && X86
depends on PCI && X86 && X86_MCE_INTEL
help
Support for error detection and correction on the Intel
E3-1200 based DRAM controllers.
Expand Down
83 changes: 77 additions & 6 deletions drivers/edac/ie31200_edac.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include <linux/edac.h>

#include <linux/io-64-nonatomic-lo-hi.h>
#include <asm/mce.h>
#include "edac_module.h"

#define EDAC_MOD_STR "ie31200_edac"
Expand Down Expand Up @@ -123,6 +124,7 @@ static int ie31200_registered = 1;

struct res_config {
enum mem_type mtype;
bool cmci;
int imc_num;
/* Host MMIO configuration register */
u64 reg_mchbar_mask;
Expand Down Expand Up @@ -172,6 +174,7 @@ struct ie31200_error_info {
u16 errsts;
u16 errsts2;
u64 eccerrlog[IE31200_CHANNELS];
u64 erraddr;
};

static const struct ie31200_dev_info ie31200_devs[] = {
Expand Down Expand Up @@ -327,13 +330,13 @@ static void ie31200_process_error_info(struct mem_ctl_info *mci,
log = info->eccerrlog[channel];
if (log & cfg->reg_eccerrlog_ue_mask) {
edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
0, 0, 0,
info->erraddr >> PAGE_SHIFT, 0, 0,
field_get(cfg->reg_eccerrlog_rank_mask, log),
channel, -1,
"ie31200 UE", "");
} else if (log & cfg->reg_eccerrlog_ce_mask) {
edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
0, 0,
info->erraddr >> PAGE_SHIFT, 0,
field_get(cfg->reg_eccerrlog_syndrome_mask, log),
field_get(cfg->reg_eccerrlog_rank_mask, log),
channel, -1,
Expand All @@ -342,14 +345,20 @@ static void ie31200_process_error_info(struct mem_ctl_info *mci,
}
}

static void ie31200_check(struct mem_ctl_info *mci)
static void __ie31200_check(struct mem_ctl_info *mci, struct mce *mce)
{
struct ie31200_error_info info;

info.erraddr = mce ? mce->addr : 0;
ie31200_get_and_clear_error_info(mci, &info);
ie31200_process_error_info(mci, &info);
}

/*
 * Check entry point that takes only the memory controller: runs the common
 * check with no machine-check record, so the error address is reported as 0.
 */
static void ie31200_check(struct mem_ctl_info *mci)
{
__ie31200_check(mci, NULL);
}

static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev, struct res_config *cfg, int mc)
{
union {
Expand Down Expand Up @@ -459,7 +468,7 @@ static int ie31200_register_mci(struct pci_dev *pdev, struct res_config *cfg, in
mci->mod_name = EDAC_MOD_STR;
mci->ctl_name = ie31200_devs[mc].ctl_name;
mci->dev_name = pci_name(pdev);
mci->edac_check = ie31200_check;
mci->edac_check = cfg->cmci ? NULL : ie31200_check;
mci->ctl_page_to_phys = NULL;
priv = mci->pvt_info;
priv->window = window;
Expand Down Expand Up @@ -499,6 +508,58 @@ static int ie31200_register_mci(struct pci_dev *pdev, struct res_config *cfg, in
return ret;
}

/*
 * Run the error check on every registered memory controller, passing along
 * the machine-check record. Unpopulated slots in ie31200_pvt.priv[] are
 * skipped.
 */
static void mce_check(struct mce *mce)
{
	int idx;

	for (idx = 0; idx < IE31200_IMC_NUM; idx++) {
		struct ie31200_priv *imc = ie31200_pvt.priv[idx];

		if (imc)
			__ie31200_check(imc->mci, mce);
	}
}

/*
 * Notifier callback for machine-check records.
 *
 * @nb:   notifier block (unused).
 * @val:  notifier event value (unused).
 * @data: pointer to the struct mce being decoded.
 *
 * Records already flagged MCE_HANDLED_CEC, and records whose status does not
 * match the memory-error pattern, are ignored. Otherwise the record is
 * dumped at debug level, every memory controller is checked, and the record
 * is flagged MCE_HANDLED_EDAC.
 *
 * Always returns NOTIFY_DONE.
 */
static int mce_handler(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = data;
	const char *kind;

	if (m->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	/*
	 * Only memory related errors are of interest here.
	 * MCI_STATUS_ADDRV is deliberately not tested because some CPUs
	 * do not set it.
	 */
	if (((m->status & 0xefff) >> 7) != 1)
		return NOTIFY_DONE;

	if (m->mcgstatus & MCG_STATUS_MCIP)
		kind = "Exception";
	else
		kind = "Event";

	edac_dbg(0, "CPU %d: Machine Check %s: 0x%llx Bank %d: 0x%llx\n",
		 m->extcpu, kind, m->mcgstatus, m->bank, m->status);
	edac_dbg(0, "TSC 0x%llx\n", m->tsc);
	edac_dbg(0, "ADDR 0x%llx\n", m->addr);
	edac_dbg(0, "MISC 0x%llx\n", m->misc);
	edac_dbg(0, "PROCESSOR %u:0x%x TIME %llu SOCKET %u APIC 0x%x\n",
		 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);

	/* Check the controllers first, then mark the record as handled. */
	mce_check(m);
	m->kflags |= MCE_HANDLED_EDAC;

	return NOTIFY_DONE;
}

/*
 * Notifier block that routes machine-check records to mce_handler() at
 * MCE_PRIO_EDAC priority. It is added to the MCE decode chain only for
 * configurations with cfg->cmci set.
 */
static struct notifier_block ie31200_mce_dec = {
.notifier_call = mce_handler,
.priority = MCE_PRIO_EDAC,
};

static void ie31200_unregister_mcis(void)
{
struct ie31200_priv *priv;
Expand Down Expand Up @@ -534,6 +595,13 @@ static int ie31200_probe1(struct pci_dev *pdev, struct res_config *cfg)
goto fail_register;
}

if (cfg->cmci) {
mce_register_decode_chain(&ie31200_mce_dec);
edac_op_state = EDAC_OPSTATE_INT;
} else {
edac_op_state = EDAC_OPSTATE_POLL;
}

/* get this far and it's successful. */
edac_dbg(3, "MC: success\n");
return 0;
Expand All @@ -560,9 +628,13 @@ static int ie31200_init_one(struct pci_dev *pdev,

/*
 * PCI remove path: drop the reference held on mci_pdev, detach the MCE
 * notifier when the configuration uses CMCI, then unregister all memory
 * controllers.
 *
 * The private data of the first controller is fetched up front and used to
 * reach the configuration.
 */
static void ie31200_remove_one(struct pci_dev *pdev)
{
	struct ie31200_priv *first_imc = ie31200_pvt.priv[0];

	edac_dbg(0, "\n");

	pci_dev_put(mci_pdev);
	mci_pdev = NULL;

	/* Detach the notifier before the controllers it walks go away. */
	if (first_imc->cfg->cmci) {
		mce_unregister_decode_chain(&ie31200_mce_dec);
	}

	ie31200_unregister_mcis();
}

Expand Down Expand Up @@ -612,6 +684,7 @@ static struct res_config skl_cfg = {

struct res_config rpl_s_cfg = {
.mtype = MEM_DDR5,
.cmci = true,
.imc_num = 2,
.reg_mchbar_mask = GENMASK_ULL(41, 17),
.reg_mchbar_window_size = BIT_ULL(16),
Expand Down Expand Up @@ -677,8 +750,6 @@ static int __init ie31200_init(void)
int pci_rc, i;

edac_dbg(3, "MC:\n");
/* Ensure that the OPSTATE is set correctly for POLL or NMI */
opstate_init();

pci_rc = pci_register_driver(&ie31200_driver);
if (pci_rc < 0)
Expand Down

0 comments on commit a5db1b2

Please sign in to comment.