Skip to content

Commit

Permalink
Merge tag 'amd_severity' of git://git.kernel.org/pub/scm/linux/kernel…
Browse files Browse the repository at this point in the history
…/git/ras/ras into x86/ras

Pull RAS update from Borislav Petkov:

  "This has been long in the making - an AMD-specific MCE-severity grading
   function. And it is actually readable at a quick glance. Further error
   recovery actions will be based on its output.

   Patches tested on every relevant AMD family out there."

Signed-off-by: Ingo Molnar <mingo@kernel.org>
  • Loading branch information
Ingo Molnar committed Mar 31, 2015
2 parents c9ce871 + 43eaa2a commit f5c8a10
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 2 deletions.
8 changes: 8 additions & 0 deletions arch/x86/include/asm/mce.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,12 @@ struct mca_config {
u32 rip_msr;
};

/*
 * Vendor-specific MCA feature flags, packed as bit-fields into one
 * 64-bit word.
 */
struct mce_vendor_flags {
	/* Error overflow recovery is available: cpuid_ebx(80000007), bit 0 */
	__u64 overflow_recov	: 1;

	/* Bits not assigned to any flag yet */
	__u64 __reserved_0	: 63;
};
extern struct mce_vendor_flags mce_flags;

extern struct mca_config mca_cfg;
extern void mce_register_decode_chain(struct notifier_block *nb);
extern void mce_unregister_decode_chain(struct notifier_block *nb);
Expand All @@ -128,9 +134,11 @@ extern int mce_p5_enabled;
#ifdef CONFIG_X86_MCE
int mcheck_init(void);
void mcheck_cpu_init(struct cpuinfo_x86 *c);
/* Selects the vendor-specific MCE severity grading function. */
void mcheck_vendor_init_severity(void);
#else
/* No-op stubs used when CONFIG_X86_MCE is not set. */
static inline int mcheck_init(void) { return 0; }
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
static inline void mcheck_vendor_init_severity(void) {}
#endif

#ifdef CONFIG_X86_ANCIENT_MCE
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/kernel/cpu/mcheck/mce-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ struct mce_bank {
char attrname[ATTR_LEN]; /* attribute name */
};

int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);

extern struct mce_bank *mce_banks;
Expand Down
67 changes: 66 additions & 1 deletion arch/x86/kernel/cpu/mcheck/mce-severity.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,62 @@ static int error_context(struct mce *m)
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}

int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
/*
 * Grade a machine check error on AMD hardware.
 *
 * The grading follows the AMD Error Scope Hierarchy table found in newer
 * BKDGs, for example 49125_15h_Models_30h-3Fh_BKDG.pdf, section
 * "RAS Features".
 *
 * @m:        the logged error; status and mcgstatus are examined, and the
 *            record is passed to error_context() to tell kernel from user
 *            context.
 * @tolerant: unused here; kept so the signature matches the mce_severity hook.
 * @msg:      unused here; *msg is never written by this function.
 * @is_excp:  unused here.
 *
 * Returns one of MCE_PANIC_SEVERITY, MCE_AR_SEVERITY, MCE_UC_SEVERITY,
 * MCE_DEFERRED_SEVERITY or MCE_KEEP_SEVERITY.
 */
static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
{
	enum context ctx = error_context(m);

	/* Processor context is corrupt: nothing can be salvaged, panic. */
	if (m->status & MCI_STATUS_PCC)
		return MCE_PANIC_SEVERITY;

	if (!(m->status & MCI_STATUS_UC)) {
		/*
		 * Deferred error: the poll handler picks these up and adds
		 * them to mce_ring so memory-failure can take recovery
		 * actions.
		 */
		if (m->status & MCI_STATUS_DEFERRED)
			return MCE_DEFERRED_SEVERITY;

		/*
		 * Corrected error: the poll handler picks these up and leaves
		 * the decoding to EDAC.
		 */
		return MCE_KEEP_SEVERITY;
	}

	/*
	 * Uncorrected error from here on.
	 *
	 * Without the overflow_recov flag an overflow means at least one
	 * error was not logged, so panic. Anything else is graded
	 * MCE_UC_SEVERITY so that the error is logged and the #MC handler
	 * exits.
	 */
	if (!mce_flags.overflow_recov)
		return (m->status & MCI_STATUS_OVER) ? MCE_PANIC_SEVERITY
						     : MCE_UC_SEVERITY;

	/*
	 * With overflow_recov set, software can try to contain the error.
	 * That is not possible when the error hit kernel context and the
	 * saved return IP is not valid.
	 */
	if (ctx == IN_KERNEL && !(m->mcgstatus & MCG_STATUS_RIPV))
		return MCE_PANIC_SEVERITY;

	/* Kill the current process to prolong system operation. */
	return MCE_AR_SEVERITY;
}

static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
{
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
enum context ctx = error_context(m);
Expand Down Expand Up @@ -216,6 +271,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
}
}

/*
 * Severity grading hook, called through a pointer so that the grading
 * function can be chosen per vendor. Defaults to mce_severity_intel;
 * mcheck_vendor_init_severity() points it at mce_severity_amd on AMD.
 */
int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
mce_severity_intel;

/*
 * Pick the MCE severity grading function for the boot CPU's vendor.
 *
 * Only AMD needs a switch: every other vendor keeps whatever mce_severity
 * already points to.
 */
void __init mcheck_vendor_init_severity(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
		return;

	mce_severity = mce_severity_amd;
}

#ifdef CONFIG_DEBUG_FS
static void *s_start(struct seq_file *f, loff_t *pos)
{
Expand Down
10 changes: 10 additions & 0 deletions arch/x86/kernel/cpu/mcheck/mce.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
DEFINE_PER_CPU(unsigned, mce_exception_count);

struct mce_bank *mce_banks __read_mostly;
struct mce_vendor_flags mce_flags __read_mostly;

struct mca_config mca_cfg __read_mostly = {
.bootlog = -1,
Expand Down Expand Up @@ -1534,6 +1535,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && cfg->banks > 0)
mce_banks[0].ctl = 0;

/*
* overflow_recov is supported for F15h Models 00h-0fh
* even though we don't have a CPUID bit for it.
*/
if (c->x86 == 0x15 && c->x86_model <= 0xf)
mce_flags.overflow_recov = 1;

/*
* Turn off MC4_MISC thresholding banks on those models since
* they're not supported there.
Expand Down Expand Up @@ -1633,6 +1641,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
break;
default:
break;
Expand Down Expand Up @@ -2017,6 +2026,7 @@ __setup("mce", mcheck_enable);
/*
 * Boot-time machine check setup: runs the Intel thermal init and then
 * selects the vendor-specific severity grading function.
 *
 * Always returns 0.
 */
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
/* Switch mce_severity to the vendor-specific grader where one exists. */
mcheck_vendor_init_severity();

return 0;
}
Expand Down

0 comments on commit f5c8a10

Please sign in to comment.