Skip to content

Commit

Permalink
Merge tag 'please-pull-extlog-trace' into x86/ras
Browse files Browse the repository at this point in the history
Report extended error information ("extlog") using
a trace/event.  Provide a mechanism for a smart
daemon collecting this information to tell the kernel
to skip logging corrected errors to the console.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
  • Loading branch information
H. Peter Anvin committed Jul 14, 2014
2 parents 27c9341 + 7c76bb5 commit cbd4ebc
Show file tree
Hide file tree
Showing 22 changed files with 506 additions and 166 deletions.
2 changes: 2 additions & 0 deletions drivers/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -176,4 +176,6 @@ source "drivers/powercap/Kconfig"

source "drivers/mcb/Kconfig"

source "drivers/ras/Kconfig"

endmenu
1 change: 1 addition & 0 deletions drivers/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,4 @@ obj-$(CONFIG_NTB) += ntb/
obj-$(CONFIG_FMC) += fmc/
obj-$(CONFIG_POWERCAP) += powercap/
obj-$(CONFIG_MCB) += mcb/
obj-$(CONFIG_RAS) += ras/
4 changes: 3 additions & 1 deletion drivers/acpi/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ config ACPI_EXTLOG
tristate "Extended Error Log support"
depends on X86_MCE && X86_LOCAL_APIC
select UEFI_CPER
select RAS
default n
help
Certain usages such as Predictive Failure Analysis (PFA) require
Expand All @@ -384,6 +385,7 @@ config ACPI_EXTLOG

Enhanced MCA Logging allows firmware to provide additional error
information to system software, synchronous with MCE or CMCI. This
driver adds support for that functionality.
driver adds support for that functionality with corresponding
tracepoint which carries that information to userspace.

endif # ACPI
46 changes: 35 additions & 11 deletions drivers/acpi/acpi_extlog.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
#include <linux/cper.h>
#include <linux/ratelimit.h>
#include <linux/edac.h>
#include <linux/ras.h>
#include <asm/cpu.h>
#include <asm/mce.h>

#include "apei/apei-internal.h"
#include <ras/ras_event.h>

#define EXT_ELOG_ENTRY_MASK GENMASK_ULL(51, 0) /* elog entry address mask */

Expand Down Expand Up @@ -137,8 +139,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
struct mce *mce = (struct mce *)data;
int bank = mce->bank;
int cpu = mce->extcpu;
struct acpi_generic_status *estatus;
int rc;
struct acpi_generic_status *estatus, *tmp;
struct acpi_generic_data *gdata;
const uuid_le *fru_id = &NULL_UUID_LE;
char *fru_text = "";
uuid_le *sec_type;
static u32 err_seq;

estatus = extlog_elog_entry_check(cpu, bank);
if (estatus == NULL)
Expand All @@ -148,8 +154,29 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
/* clear record status to enable BIOS to update it again */
estatus->block_status = 0;

rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
tmp = (struct acpi_generic_status *)elog_buf;

if (!ras_userspace_consumers()) {
print_extlog_rcd(NULL, tmp, cpu);
goto out;
}

/* log event via trace */
err_seq++;
gdata = (struct acpi_generic_data *)(tmp + 1);
if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
fru_id = (uuid_le *)gdata->fru_id;
if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
fru_text = gdata->fru_text;
sec_type = (uuid_le *)gdata->section_type;
if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem = (void *)(gdata + 1);
if (gdata->error_data_length >= sizeof(*mem))
trace_extlog_mem_event(mem, err_seq, fru_id, fru_text,
(u8)gdata->error_severity);
}

out:
return NOTIFY_STOP;
}

Expand Down Expand Up @@ -196,19 +223,16 @@ static int __init extlog_init(void)
u64 cap;
int rc;

rdmsrl(MSR_IA32_MCG_CAP, cap);

if (!(cap & MCG_ELOG_P) || !extlog_get_l1addr())
return -ENODEV;

if (get_edac_report_status() == EDAC_REPORTING_FORCE) {
pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n");
return -EPERM;
}

rc = -ENODEV;
rdmsrl(MSR_IA32_MCG_CAP, cap);
if (!(cap & MCG_ELOG_P))
return rc;

if (!extlog_get_l1addr())
return rc;

rc = -EINVAL;
/* get L1 header to fetch necessary information */
l1_hdr_size = sizeof(struct extlog_l1_head);
Expand Down
1 change: 1 addition & 0 deletions drivers/edac/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ config EDAC_MCE_INJ

config EDAC_MM_EDAC
tristate "Main Memory EDAC (Error Detection And Correction) reporting"
select RAS
help
Some systems are able to detect and correct errors in main
memory. EDAC can report statistics on memory error
Expand Down
3 changes: 0 additions & 3 deletions drivers/edac/edac_mc.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@
#include <asm/edac.h>
#include "edac_core.h"
#include "edac_module.h"

#define CREATE_TRACE_POINTS
#define TRACE_INCLUDE_PATH ../../include/ras
#include <ras/ras_event.h>

/* lock to memory controller's control array */
Expand Down
Loading

0 comments on commit cbd4ebc

Please sign in to comment.