Skip to content

Commit

Permalink
x86/MCE: Dump MCE to dmesg if no consumers
Browse files Browse the repository at this point in the history
When there are no error record consumers registered with the kernel, the
only thing that appears in dmesg is something like:

  [  300.000326] mce: [Hardware Error]: Machine check events logged

and the error records are gone. Which is seriously counterproductive.

So let's dump them to dmesg instead, in such a case.

Requested-by: Eric Morton <Eric.Morton@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/20161101120911.13163-4-bp@alien8.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
  • Loading branch information
Borislav Petkov authored and Thomas Gleixner committed Nov 8, 2016
1 parent 8c203db commit cd9c57c
Showing 1 changed file with 46 additions and 6 deletions.
52 changes: 46 additions & 6 deletions arch/x86/kernel/cpu/mcheck/mce.c
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log);

static struct notifier_block mce_srao_nb;

static atomic_t num_notifiers;

void mce_register_decode_chain(struct notifier_block *nb)
{
atomic_inc(&num_notifiers);

/* Ensure SRAO notifier has the highest priority in the decode chain. */
if (nb != &mce_srao_nb && nb->priority == INT_MAX)
nb->priority -= 1;
Expand All @@ -219,6 +223,8 @@ EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
atomic_dec(&num_notifiers);

atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
Expand Down Expand Up @@ -270,17 +276,17 @@ struct mca_msr_regs msr_ops = {
.misc = misc_reg
};

static void print_mce(struct mce *m)
static void __print_mce(struct mce *m)
{
int ret = 0;

pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
m->extcpu, m->mcgstatus, m->bank, m->status);
pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
m->extcpu,
(m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
m->mcgstatus, m->bank, m->status);

if (m->ip) {
pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
m->cs, m->ip);
m->cs, m->ip);

if (m->cs == __KERNEL_CS)
print_symbol("{%s}", m->ip);
Expand Down Expand Up @@ -308,6 +314,13 @@ static void print_mce(struct mce *m)
pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
cpu_data(m->extcpu).microcode);
}

static void print_mce(struct mce *m)
{
int ret = 0;

__print_mce(m);

/*
* Print out human-readable details about the MCE error,
Expand Down Expand Up @@ -569,6 +582,32 @@ static struct notifier_block mce_srao_nb = {
.priority = INT_MAX,
};

static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *m = (struct mce *)data;

if (!m)
return NOTIFY_DONE;

/*
* Run the default notifier if we have only the SRAO
* notifier and us registered.
*/
if (atomic_read(&num_notifiers) > 2)
return NOTIFY_DONE;

__print_mce(m);

return NOTIFY_DONE;
}

static struct notifier_block mce_default_nb = {
.notifier_call = mce_default_notifier,
/* lowest prio, we want it to run last. */
.priority = 0,
};

/*
* Read ADDR and MISC registers.
*/
Expand Down Expand Up @@ -2138,6 +2177,7 @@ int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&mce_srao_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();

INIT_WORK(&mce_work, mce_process_work);
Expand Down

0 comments on commit cd9c57c

Please sign in to comment.