Skip to content

Commit

Permalink
Merge tag 'ras_for_3.14' of git://git.kernel.org/pub/scm/linux/kernel…
Browse files Browse the repository at this point in the history
…/git/bp/bp into x86/ras

Pull RAS updates from Borislav Petkov:

  * Add the functionality to override error reporting agents as some
  machines are sporting a new extended error logging capability which, if
  done properly in the BIOS, makes a corresponding EDAC module redundant,
  from Gong Chen.

  * PCIe AER tracepoint severity levels fix, from Rui Wang.

  * Error path correction for the mce device init, from Levente Kurusa.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
  • Loading branch information
Ingo Molnar committed Dec 16, 2013
2 parents 319e2e3 + 42139eb commit 0149522
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 9 deletions.
8 changes: 8 additions & 0 deletions Documentation/kernel-parameters.txt
Original file line number Diff line number Diff line change
Expand Up @@ -881,6 +881,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.

The xen output can only be used by Xen PV guests.

edac_report= [HW,EDAC] Control how to report EDAC event
Format: {"on" | "off" | "force"}
on: enable EDAC to report H/W event. May be overridden
by other higher priority error reporting module.
off: disable H/W event reporting through EDAC.
force: enforce the use of EDAC to report H/W event.
default: on.

ekgdboc= [X86,KGDB] Allow early kernel console debugging
ekgdboc=kbd

Expand Down
4 changes: 3 additions & 1 deletion arch/x86/kernel/cpu/mcheck/mce.c
Original file line number Diff line number Diff line change
Expand Up @@ -2272,8 +2272,10 @@ static int mce_device_create(unsigned int cpu)
dev->release = &mce_device_release;

err = device_register(dev);
if (err)
if (err) {
put_device(dev);
return err;
}

for (i = 0; mce_device_attrs[i]; i++) {
err = device_create_file(dev, mce_device_attrs[i]);
Expand Down
18 changes: 16 additions & 2 deletions drivers/acpi/acpi_extlog.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <acpi/acpi_bus.h>
#include <linux/cper.h>
#include <linux/ratelimit.h>
#include <linux/edac.h>
#include <asm/cpu.h>
#include <asm/mce.h>

Expand Down Expand Up @@ -43,6 +44,8 @@ struct extlog_l1_head {
u8 rev1[12];
};

static int old_edac_report_status;

static u8 extlog_dsm_uuid[] = "663E35AF-CC10-41A4-88EA-5470AF055295";

/* L1 table related physical address */
Expand Down Expand Up @@ -150,7 +153,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,

rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);

return NOTIFY_DONE;
return NOTIFY_STOP;
}

static int extlog_get_dsm(acpi_handle handle, int rev, int func, u64 *ret)
Expand Down Expand Up @@ -231,8 +234,12 @@ static int __init extlog_init(void)
u64 cap;
int rc;

rc = -ENODEV;
if (get_edac_report_status() == EDAC_REPORTING_FORCE) {
pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n");
return -EPERM;
}

rc = -ENODEV;
rdmsrl(MSR_IA32_MCG_CAP, cap);
if (!(cap & MCG_ELOG_P))
return rc;
Expand Down Expand Up @@ -287,6 +294,12 @@ static int __init extlog_init(void)
if (elog_buf == NULL)
goto err_release_elog;

/*
* eMCA event report method has higher priority than EDAC method,
* unless EDAC event report method is mandatory.
*/
old_edac_report_status = get_edac_report_status();
set_edac_report_status(EDAC_REPORTING_DISABLED);
mce_register_decode_chain(&extlog_mce_dec);
/* enable OS to be involved to take over management from BIOS */
((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;
Expand All @@ -308,6 +321,7 @@ static int __init extlog_init(void)

static void __exit extlog_exit(void)
{
set_edac_report_status(old_edac_report_status);
mce_unregister_decode_chain(&extlog_mce_dec);
((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN;
if (extlog_l1_addr)
Expand Down
19 changes: 19 additions & 0 deletions drivers/edac/edac_stub.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,25 @@ EXPORT_SYMBOL_GPL(edac_err_assert);

static atomic_t edac_subsys_valid = ATOMIC_INIT(0);

/*
 * Current EDAC reporting mode, one of the EDAC_REPORTING_* values.
 * Starts out as EDAC_REPORTING_ENABLED and is read and written through
 * get_edac_report_status() / set_edac_report_status() in <linux/edac.h>.
 * Exported (GPL-only) because those inline accessors reference it directly.
 */
int edac_report_status = EDAC_REPORTING_ENABLED;
EXPORT_SYMBOL_GPL(edac_report_status);

/*
 * Parse the "edac_report=" kernel command-line option.
 *
 * @str: text following "edac_report="; expected to be exactly "on",
 *       "off" or "force".
 *
 * Returns 1 when the value was recognised and the reporting mode was
 * updated, and 0 when the value is not one of the three keywords, in
 * which case the mode is left untouched. This follows the __setup()
 * convention that a non-zero return means "option handled" and zero means
 * "not handled". A NULL @str still yields -EINVAL.
 */
static int __init edac_report_setup(char *str)
{
	if (!str)
		return -EINVAL;

	/*
	 * Match whole keywords: a prefix comparison would silently accept
	 * values such as "only", "offline" or "forceful".
	 */
	if (!strcmp(str, "on"))
		set_edac_report_status(EDAC_REPORTING_ENABLED);
	else if (!strcmp(str, "off"))
		set_edac_report_status(EDAC_REPORTING_DISABLED);
	else if (!strcmp(str, "force"))
		set_edac_report_status(EDAC_REPORTING_FORCE);
	else
		return 0;	/* unrecognised value: report it as not handled */

	return 1;
}
__setup("edac_report=", edac_report_setup);

/*
* called to determine if there is an EDAC driver interested in
* knowing an event (such as NMI) occurred
Expand Down
6 changes: 5 additions & 1 deletion drivers/edac/sb_edac.c
Original file line number Diff line number Diff line change
Expand Up @@ -1829,6 +1829,9 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
struct mem_ctl_info *mci;
struct sbridge_pvt *pvt;

if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
return NOTIFY_DONE;

mci = get_mci_for_node_id(mce->socketid);
if (!mci)
return NOTIFY_BAD;
Expand Down Expand Up @@ -2142,9 +2145,10 @@ static int __init sbridge_init(void)
opstate_init();

pci_rc = pci_register_driver(&sbridge_driver);

if (pci_rc >= 0) {
mce_register_decode_chain(&sbridge_mce_dec);
if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
sbridge_printk(KERN_WARNING, "Loading driver, error reporting disabled.\n");
return 0;
}

Expand Down
28 changes: 28 additions & 0 deletions include/linux/edac.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,34 @@ extern void edac_atomic_assert_error(void);
extern struct bus_type *edac_get_sysfs_subsys(void);
extern void edac_put_sysfs_subsys(void);

/*
 * EDAC error-reporting modes, selectable with the "edac_report=" boot
 * option ("on", "off" and "force" respectively).
 */
enum {
	/* EDAC reports H/W events; a higher-priority reporter may override it. */
	EDAC_REPORTING_ENABLED	= 0,
	/* H/W event reporting through EDAC is switched off. */
	EDAC_REPORTING_DISABLED	= 1,
	/* EDAC reporting is enforced and must not be overridden. */
	EDAC_REPORTING_FORCE	= 2,
};

/* Reporting-mode variable; read and written by the accessors below. */
extern int edac_report_status;

#if defined(CONFIG_EDAC)

/* Return the current EDAC reporting mode (an EDAC_REPORTING_* value). */
static inline int get_edac_report_status(void)
{
	return edac_report_status;
}

/* Store @new (an EDAC_REPORTING_* value) as the EDAC reporting mode. */
static inline void set_edac_report_status(int new)
{
	edac_report_status = new;
}

#else /* !CONFIG_EDAC */

/* When CONFIG_EDAC is not defined the mode always reads as disabled. */
static inline int get_edac_report_status(void)
{
	return EDAC_REPORTING_DISABLED;
}

/* When CONFIG_EDAC is not defined there is no state to update; @new is ignored. */
static inline void set_edac_report_status(int new)
{
}

#endif /* CONFIG_EDAC */

static inline void opstate_init(void)
{
switch (edac_op_state) {
Expand Down
10 changes: 5 additions & 5 deletions include/trace/events/ras.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#define _TRACE_AER_H

#include <linux/tracepoint.h>
#include <linux/edac.h>
#include <linux/aer.h>


/*
Expand Down Expand Up @@ -63,10 +63,10 @@ TRACE_EVENT(aer_event,

TP_printk("%s PCIe Bus Error: severity=%s, %s\n",
__get_str(dev_name),
__entry->severity == HW_EVENT_ERR_CORRECTED ? "Corrected" :
__entry->severity == HW_EVENT_ERR_FATAL ?
"Fatal" : "Uncorrected",
__entry->severity == HW_EVENT_ERR_CORRECTED ?
__entry->severity == AER_CORRECTABLE ? "Corrected" :
__entry->severity == AER_FATAL ?
"Fatal" : "Uncorrected, non-fatal",
__entry->severity == AER_CORRECTABLE ?
__print_flags(__entry->status, "|", aer_correctable_errors) :
__print_flags(__entry->status, "|", aer_uncorrectable_errors))
);
Expand Down

0 comments on commit 0149522

Please sign in to comment.