Skip to content

Commit

Permalink
i7core_edac: decode mcelog error and send it via edac interface
Browse files Browse the repository at this point in the history
Enriches mcelog error by using the encoded information at MCE status and
misc registers (IA32_MCx_STATUS, IA32_MCx_MISC).

Some fixes are still needed here, in order to properly fill the EDAC
fields.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
  • Loading branch information
Mauro Carvalho Chehab committed May 10, 2010
1 parent ba6c5c6 commit 8a2f118
Showing 1 changed file with 70 additions and 22 deletions.
92 changes: 70 additions & 22 deletions drivers/edac/i7core_edac.c
Original file line number Diff line number Diff line change
Expand Up @@ -1319,33 +1319,75 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket)
pvt->last_ce_count[socket][0] = new0;
}

/*
* According with tables E-11 and E-12 of chapter E.3.3 of Intel 64 and IA-32
* Architectures Software Developer’s Manual Volume 3B.
* The MCA registers are the following ones:
* struct mce field MCA Register
* m->status MSR_IA32_MC0_STATUS
* m->addr MSR_IA32_MC0_ADDR
* m->misc MSR_IA32_MC0_MISC
* m->mcgstatus MSR_IA32_MCG_STATUS
* In the case of Nehalem, the error information is masked at .status and .misc
* fields
*/
static void i7core_mce_output_error(struct mem_ctl_info *mci,
struct mce *m)
{
debugf0("CPU %d: Machine Check Exception: %16Lx"
"Bank %d: %016Lx\n",
m->cpu, m->mcgstatus, m->bank, m->status);
if (m->ip) {
debugf0("RIP%s %02x:<%016Lx>\n",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
m->cs, m->ip);
char *type="NON-FATAL";
char *err, *msg;
unsigned long error = m->status & 0x1ff0000l;
u32 core_err_cnt = (m->status >> 38) && 0x7fff;
u32 dimm = (m->misc >> 16) & 0x3;
u32 channel = (m->misc >> 18) & 0x3;
u32 syndrome = m->misc >> 32;
u32 errnum = find_first_bit(&error, 32);

switch (errnum) {
case 16:
err = "read ECC error";
break;
case 17:
err = "RAS ECC error";
break;
case 18:
err = "write parity error";
break;
case 19:
err = "redundacy loss";
break;
case 20:
err = "reserved";
break;
case 21:
err = "memory range error";
break;
case 22:
err = "RTID out of range";
break;
case 23:
err = "address parity error";
break;
case 24:
err = "byte enable parity error";
break;
default:
err = "unknown";
}
printk(KERN_EMERG "TSC %llx ", m->tsc);
if (m->addr)
printk("ADDR %llx ", m->addr);
if (m->misc)
printk("MISC %llx ", m->misc);

#if 0
snprintf(msg, sizeof(msg),
"%s (Branch=%d DRAM-Bank=%d Buffer ID = %d RDWR=%s "
"RAS=%d CAS=%d %s Err=0x%lx (%s))",
type, branch >> 1, bank, buf_id, rdwr_str(rdwr), ras, cas,
type, allErrors, error_name[errnum]);
msg = kasprintf(GFP_ATOMIC,
"%s (addr = 0x%08llx Bank=0x%08x, Dimm=%d, Channel=%d, "
"syndrome=0x%08x total error count=%d Err=%d (%s))\n",
type, (long long) m->addr, m->bank, dimm, channel,
syndrome, core_err_cnt,errnum, err);

debugf0("%s", msg);

/* Call the helper to output message */
edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg);
#endif
edac_mc_handle_fbd_ue(mci, 0 /* FIXME: should be rank here */,
0, 0 /* FIXME: should be channel here */, msg);

kfree(msg);
}

/*
Expand Down Expand Up @@ -1398,6 +1440,13 @@ static int i7core_mce_check_error(void *priv, struct mce *mce)

debugf0(__FILE__ ": %s()\n", __func__);

/*
* Just let mcelog handle it if the error is
* outside the memory controller
*/
if (((mce->status & 0xffff) >> 7) != 1)
return 0;

spin_lock_irqsave(&pvt->mce_lock, flags);
if (pvt->mce_count < MCE_LOG_LEN) {
memcpy(&pvt->mce_entry[pvt->mce_count], mce, sizeof(*mce));
Expand All @@ -1406,8 +1455,7 @@ static int i7core_mce_check_error(void *priv, struct mce *mce)
spin_unlock_irqrestore(&pvt->mce_lock, flags);

/* Advice mcelog that the error were handled */
// return 1;
return 0; // Let's duplicate the log
return 1;
}

/*
Expand Down

0 comments on commit 8a2f118

Please sign in to comment.