Skip to content

Commit

Permalink
Merge branch 'x86/ras' into x86/core, to fix conflicts
Browse files Browse the repository at this point in the history
Conflicts:
	arch/x86/include/asm/irq_vectors.h

Signed-off-by: Ingo Molnar <mingo@kernel.org>
  • Loading branch information
Ingo Molnar committed Jun 7, 2015
2 parents c8e56d2 + 243d657 commit c2f9b0a
Show file tree
Hide file tree
Showing 16 changed files with 271 additions and 52 deletions.
3 changes: 3 additions & 0 deletions Documentation/x86/x86_64/boot-options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ Machine check
(e.g. BIOS or hardware monitoring applications), conflicting
with OS's error handling, and you cannot deactivate the agent,
then this option will be a help.
mce=no_lmce
Do not opt-in to Local MCE delivery. Use legacy method
to broadcast MCEs.
mce=bootlog
Enable logging of machine checks left over from booting.
Disabled by default on AMD because some BIOS leave bogus ones.
Expand Down
3 changes: 3 additions & 0 deletions arch/x86/include/asm/entry_arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
#endif

#ifdef CONFIG_X86_MCE_AMD
BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR)
#endif
#endif
3 changes: 3 additions & 0 deletions arch/x86/include/asm/hardirq.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ typedef struct {
#ifdef CONFIG_X86_MCE_THRESHOLD
unsigned int irq_threshold_count;
#endif
#ifdef CONFIG_X86_MCE_AMD
unsigned int irq_deferred_error_count;
#endif
#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
unsigned int irq_hv_callback_count;
#endif
Expand Down
2 changes: 2 additions & 0 deletions arch/x86/include/asm/hw_irq.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ extern asmlinkage void reschedule_interrupt(void);
extern asmlinkage void irq_move_cleanup_interrupt(void);
extern asmlinkage void reboot_interrupt(void);
extern asmlinkage void threshold_interrupt(void);
extern asmlinkage void deferred_error_interrupt(void);

extern asmlinkage void call_function_interrupt(void);
extern asmlinkage void call_function_single_interrupt(void);
Expand All @@ -54,6 +55,7 @@ extern void trace_spurious_interrupt(void);
extern void trace_thermal_interrupt(void);
extern void trace_reschedule_interrupt(void);
extern void trace_threshold_interrupt(void);
extern void trace_deferred_error_interrupt(void);
extern void trace_call_function_interrupt(void);
extern void trace_call_function_single_interrupt(void);
#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt
Expand Down
11 changes: 6 additions & 5 deletions arch/x86/include/asm/irq_vectors.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,22 +83,23 @@
*/
#define X86_PLATFORM_IPI_VECTOR 0xf7

/* Vector for KVM to deliver posted interrupt IPI */
#ifdef CONFIG_HAVE_KVM
#define POSTED_INTR_VECTOR 0xf2
#define POSTED_INTR_WAKEUP_VECTOR 0xf1
#endif

/*
* IRQ work vector:
*/
#define IRQ_WORK_VECTOR 0xf6

#define UV_BAU_MESSAGE 0xf5
#define DEFERRED_ERROR_VECTOR 0xf4

/* Vector on which hypervisor callbacks will be delivered */
#define HYPERVISOR_CALLBACK_VECTOR 0xf3

/* Vector for KVM to deliver posted interrupt IPI */
#ifdef CONFIG_HAVE_KVM
#define POSTED_INTR_VECTOR 0xf2
#endif

/*
* Local APIC timer IRQ vector is on a different priority level,
* to work around the 'lost local interrupt if more than 2 IRQ
Expand Down
28 changes: 26 additions & 2 deletions arch/x86/include/asm/mce.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,16 @@
#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
#define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */
#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */

/* MCG_STATUS register defines */
#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
#define MCG_STATUS_LMCES (1ULL<<3) /* LMCE signaled */

/* MCG_EXT_CTL register defines */
#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */

/* MCi_STATUS register defines */
#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
Expand Down Expand Up @@ -104,6 +109,7 @@ struct mce_log {
struct mca_config {
bool dont_log_ce;
bool cmci_disabled;
bool lmce_disabled;
bool ignore_ce;
bool disabled;
bool ser;
Expand All @@ -117,8 +123,19 @@ struct mca_config {
};

struct mce_vendor_flags {
__u64 overflow_recov : 1, /* cpuid_ebx(80000007) */
__reserved_0 : 63;
/*
* overflow recovery cpuid bit indicates that overflow
* conditions are not fatal
*/
__u64 overflow_recov : 1,

/*
* SUCCOR stands for S/W UnCorrectable error COntainment
* and Recovery. It indicates support for data poisoning
* in HW and deferred error interrupts.
*/
succor : 1,
__reserved_0 : 62;
};
extern struct mce_vendor_flags mce_flags;

Expand Down Expand Up @@ -168,12 +185,16 @@ void cmci_clear(void);
void cmci_reenable(void);
void cmci_rediscover(void);
void cmci_recheck(void);
void lmce_clear(void);
void lmce_enable(void);
#else
static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
static inline void cmci_clear(void) {}
static inline void cmci_reenable(void) {}
static inline void cmci_rediscover(void) {}
static inline void cmci_recheck(void) {}
static inline void lmce_clear(void) {}
static inline void lmce_enable(void) {}
#endif

#ifdef CONFIG_X86_MCE_AMD
Expand Down Expand Up @@ -223,6 +244,9 @@ void do_machine_check(struct pt_regs *, long);
extern void (*mce_threshold_vector)(void);
extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

/* Deferred error interrupt handler */
extern void (*deferred_error_int_vector)(void);

/*
* Thermal handler
*/
Expand Down
6 changes: 6 additions & 0 deletions arch/x86/include/asm/trace/irq_vectors.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ DEFINE_IRQ_VECTOR_EVENT(call_function_single);
*/
DEFINE_IRQ_VECTOR_EVENT(threshold_apic);

/*
* deferred_error_apic - called when entering/exiting a deferred apic interrupt
* vector handler
*/
DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic);

/*
* thermal_apic - called when entering/exiting a thermal apic interrupt
* vector handler
Expand Down
3 changes: 2 additions & 1 deletion arch/x86/include/asm/traps.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ extern int panic_on_unrecovered_nmi;
void math_emulate(struct math_emu_info *);
#ifndef CONFIG_X86_32
asmlinkage void smp_thermal_interrupt(void);
asmlinkage void mce_threshold_interrupt(void);
asmlinkage void smp_threshold_interrupt(void);
asmlinkage void smp_deferred_error_interrupt(void);
#endif

extern enum ctx_state ist_enter(struct pt_regs *regs);
Expand Down
2 changes: 2 additions & 0 deletions arch/x86/include/uapi/asm/msr-index.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
#define MSR_IA32_MCG_CAP 0x00000179
#define MSR_IA32_MCG_STATUS 0x0000017a
#define MSR_IA32_MCG_CTL 0x0000017b
#define MSR_IA32_MCG_EXT_CTL 0x000004d0

#define MSR_OFFCORE_RSP_0 0x000001a6
#define MSR_OFFCORE_RSP_1 0x000001a7
Expand Down Expand Up @@ -380,6 +381,7 @@
#define FEATURE_CONTROL_LOCKED (1<<0)
#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
#define FEATURE_CONTROL_LMCE (1<<20)

#define MSR_IA32_APICBASE 0x0000001b
#define MSR_IA32_APICBASE_BSP (1<<8)
Expand Down
50 changes: 38 additions & 12 deletions arch/x86/kernel/cpu/mcheck/mce.c
Original file line number Diff line number Diff line change
Expand Up @@ -1050,6 +1050,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
char *msg = "Unknown";
u64 recover_paddr = ~0ull;
int flags = MF_ACTION_REQUIRED;
int lmce = 0;

prev_state = ist_enter(regs);

Expand Down Expand Up @@ -1077,11 +1078,20 @@ void do_machine_check(struct pt_regs *regs, long error_code)
kill_it = 1;

/*
* Go through all the banks in exclusion of the other CPUs.
* This way we don't report duplicated events on shared banks
* because the first one to see it will clear it.
* Check if this MCE is signaled to only this logical processor
*/
order = mce_start(&no_way_out);
if (m.mcgstatus & MCG_STATUS_LMCES)
lmce = 1;
else {
/*
* Go through all the banks in exclusion of the other CPUs.
* This way we don't report duplicated events on shared banks
* because the first one to see it will clear it.
* If this is a Local MCE, then no need to perform rendezvous.
*/
order = mce_start(&no_way_out);
}

for (i = 0; i < cfg->banks; i++) {
__clear_bit(i, toclear);
if (!test_bit(i, valid_banks))
Expand Down Expand Up @@ -1158,8 +1168,18 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* Do most of the synchronization with other CPUs.
* When there's any problem use only local no_way_out state.
*/
if (mce_end(order) < 0)
no_way_out = worst >= MCE_PANIC_SEVERITY;
if (!lmce) {
if (mce_end(order) < 0)
no_way_out = worst >= MCE_PANIC_SEVERITY;
} else {
/*
* Local MCE skipped calling mce_reign()
* If we found a fatal error, we need to panic here.
*/
if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
mce_panic("Machine check from unknown source",
NULL, NULL);
}

/*
* At insane "tolerant" levels we take no action. Otherwise
Expand Down Expand Up @@ -1640,10 +1660,16 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_intel_feature_init(c);
mce_adjust_timer = cmci_intel_adjust_timer;
break;
case X86_VENDOR_AMD:

case X86_VENDOR_AMD: {
u32 ebx = cpuid_ebx(0x80000007);

mce_amd_feature_init(c);
mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
mce_flags.overflow_recov = !!(ebx & BIT(0));
mce_flags.succor = !!(ebx & BIT(1));
break;
}

default:
break;
}
Expand Down Expand Up @@ -1979,6 +2005,7 @@ void mce_disable_bank(int bank)
/*
* mce=off Disables machine check
* mce=no_cmci Disables CMCI
* mce=no_lmce Disables LMCE
* mce=dont_log_ce Clears corrected events silently, no log created for CEs.
* mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
Expand All @@ -2002,6 +2029,8 @@ static int __init mcheck_enable(char *str)
cfg->disabled = true;
else if (!strcmp(str, "no_cmci"))
cfg->cmci_disabled = true;
else if (!strcmp(str, "no_lmce"))
cfg->lmce_disabled = true;
else if (!strcmp(str, "dont_log_ce"))
cfg->dont_log_ce = true;
else if (!strcmp(str, "ignore_ce"))
Expand All @@ -2011,11 +2040,8 @@ static int __init mcheck_enable(char *str)
else if (!strcmp(str, "bios_cmci_threshold"))
cfg->bios_cmci_threshold = true;
else if (isdigit(str[0])) {
get_option(&str, &(cfg->tolerant));
if (*str == ',') {
++str;
if (get_option(&str, &cfg->tolerant) == 2)
get_option(&str, &(cfg->monarch_timeout));
}
} else {
pr_info("mce argument %s ignored. Please use /sys\n", str);
return 0;
Expand Down
Loading

0 comments on commit c2f9b0a

Please sign in to comment.