Skip to content

Commit

Permalink
Merge branch 'x86-mce-for-linus' of git://git.kernel.org/pub/scm/linu…
Browse files Browse the repository at this point in the history
…x/kernel/git/tip/tip

Pull x86/MCE update from Ingo Molnar:
 "Various MCE robustness enhancements.

  One of the changes adds CMCI (Corrected Machine Check Interrupt) poll
  mode on Intel Nehalem+ CPUs, which mode is automatically entered when
  the rate of messages is too high - and exited once the storm is over.

  An MCE events storm will roughly look like this:

   [ 5342.740616] mce: [Hardware Error]: Machine check events logged
   [ 5342.746501] mce: [Hardware Error]: Machine check events logged
   [ 5342.757971] CMCI storm detected: switching to poll mode
   [ 5372.674957] CMCI storm subsided: switching to interrupt mode

  This should make such events more survivable"

* 'x86-mce-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Provide boot argument to honour bios-set CMCI threshold
  x86, MCE: Remove unused defines
  x86, mce: Enable MCA support by default
  x86/mce: Add CMCI poll mode
  x86/mce: Make cmci_discover() quiet
  x86: mce: Remove the frozen cases in the hotplug code
  x86: mce: Split timer init
  x86: mce: Serialize mce injection
  x86: mce: Disable preemption when calling raise_local()
  • Loading branch information
Linus Torvalds committed Oct 1, 2012
2 parents ac07f5c + 39ba501 commit 7687b80
Show file tree
Hide file tree
Showing 7 changed files with 244 additions and 59 deletions.
7 changes: 7 additions & 0 deletions Documentation/x86/x86_64/boot-options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,13 @@ Machine check
monarchtimeout:
Sets the time in us to wait for other CPUs on machine checks. 0
to disable.
mce=bios_cmci_threshold
Don't overwrite the bios-set CMCI threshold. This boot option
prevents Linux from overwriting the CMCI threshold set by the
bios. Without this option, Linux always sets the CMCI
threshold to 1. Enabling this may make memory predictive failure
analysis less effective if the bios sets thresholds for memory
errors since we will not see details for all errors.

nomce (for compatibility with i386): same as mce=off

Expand Down
1 change: 1 addition & 0 deletions arch/x86/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -874,6 +874,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS

config X86_MCE
bool "Machine Check / overheating reporting"
default y
---help---
Machine Check support allows the processor to notify the
kernel if it detects a problem (e.g. overheating, data corruption).
Expand Down
13 changes: 2 additions & 11 deletions arch/x86/include/asm/mce.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,19 +116,9 @@ struct mce_log {
/* Software defined banks */
#define MCE_EXTENDED_BANK 128
#define MCE_THERMAL_BANK MCE_EXTENDED_BANK + 0

#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */
#define K8_MCE_THRESHOLD_BANK_0 (MCE_THRESHOLD_BASE + 0 * 9)
#define K8_MCE_THRESHOLD_BANK_1 (MCE_THRESHOLD_BASE + 1 * 9)
#define K8_MCE_THRESHOLD_BANK_2 (MCE_THRESHOLD_BASE + 2 * 9)
#define K8_MCE_THRESHOLD_BANK_3 (MCE_THRESHOLD_BASE + 3 * 9)
#define K8_MCE_THRESHOLD_BANK_4 (MCE_THRESHOLD_BASE + 4 * 9)
#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)

#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1)

#ifdef __KERNEL__

extern void mce_register_decode_chain(struct notifier_block *nb);
extern void mce_unregister_decode_chain(struct notifier_block *nb);

Expand Down Expand Up @@ -171,6 +161,7 @@ DECLARE_PER_CPU(struct device *, mce_device);
#ifdef CONFIG_X86_MCE_INTEL
extern int mce_cmci_disabled;
extern int mce_ignore_ce;
extern int mce_bios_cmci_threshold;
void mce_intel_feature_init(struct cpuinfo_x86 *c);
void cmci_clear(void);
void cmci_reenable(void);
Expand Down
8 changes: 8 additions & 0 deletions arch/x86/kernel/cpu/mcheck/mce-inject.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
}

static cpumask_var_t mce_inject_cpumask;
static DEFINE_MUTEX(mce_inject_mutex);

static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
{
Expand Down Expand Up @@ -194,7 +195,11 @@ static void raise_mce(struct mce *m)
put_online_cpus();
} else
#endif
{
preempt_disable();
raise_local();
preempt_enable();
}
}

/* Error injection interface */
Expand Down Expand Up @@ -225,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
* so do it a jiffie or two later everywhere.
*/
schedule_timeout(2);

mutex_lock(&mce_inject_mutex);
raise_mce(&m);
mutex_unlock(&mce_inject_mutex);
return usize;
}

Expand Down
12 changes: 12 additions & 0 deletions arch/x86/kernel/cpu/mcheck/mce-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,18 @@ extern int mce_ser;

extern struct mce_bank *mce_banks;

#ifdef CONFIG_X86_MCE_INTEL
unsigned long mce_intel_adjust_timer(unsigned long interval);
void mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
#else
# define mce_intel_adjust_timer mce_adjust_timer_default
static inline void mce_intel_cmci_poll(void) { }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
#endif

void mce_timer_kick(unsigned long interval);

#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
ssize_t apei_read_mce(struct mce *m, u64 *record_id);
Expand Down
94 changes: 69 additions & 25 deletions arch/x86/kernel/cpu/mcheck/mce.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ static int mce_dont_log_ce __read_mostly;
int mce_cmci_disabled __read_mostly;
int mce_ignore_ce __read_mostly;
int mce_ser __read_mostly;
int mce_bios_cmci_threshold __read_mostly;

struct mce_bank *mce_banks __read_mostly;

Expand Down Expand Up @@ -1266,6 +1267,14 @@ static unsigned long check_interval = 5 * 60; /* 5 minutes */
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static unsigned long mce_adjust_timer_default(unsigned long interval)
{
return interval;
}

static unsigned long (*mce_adjust_timer)(unsigned long interval) =
mce_adjust_timer_default;

static void mce_timer_fn(unsigned long data)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
Expand All @@ -1276,21 +1285,46 @@ static void mce_timer_fn(unsigned long data)
if (mce_available(__this_cpu_ptr(&cpu_info))) {
machine_check_poll(MCP_TIMESTAMP,
&__get_cpu_var(mce_poll_banks));
mce_intel_cmci_poll();
}

/*
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
iv = __this_cpu_read(mce_next_interval);
if (mce_notify_irq())
if (mce_notify_irq()) {
iv = max(iv / 2, (unsigned long) HZ/100);
else
} else {
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
iv = mce_adjust_timer(iv);
}
__this_cpu_write(mce_next_interval, iv);
/* Might have become 0 after CMCI storm subsided */
if (iv) {
t->expires = jiffies + iv;
add_timer_on(t, smp_processor_id());
}
}

t->expires = jiffies + iv;
add_timer_on(t, smp_processor_id());
/*
* Ensure that the timer is firing in @interval from now.
*/
void mce_timer_kick(unsigned long interval)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
unsigned long when = jiffies + interval;
unsigned long iv = __this_cpu_read(mce_next_interval);

if (timer_pending(t)) {
if (time_before(when, t->expires))
mod_timer_pinned(t, when);
} else {
t->expires = round_jiffies(when);
add_timer_on(t, smp_processor_id());
}
if (interval < iv)
__this_cpu_write(mce_next_interval, interval);
}

/* Must not be called in IRQ context where del_timer_sync() can deadlock */
Expand Down Expand Up @@ -1585,6 +1619,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
mce_adjust_timer = mce_intel_adjust_timer;
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
Expand All @@ -1594,23 +1629,28 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
}
}

static void __mcheck_cpu_init_timer(void)
static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
unsigned long iv = check_interval * HZ;
unsigned long iv = mce_adjust_timer(check_interval * HZ);

setup_timer(t, mce_timer_fn, smp_processor_id());
__this_cpu_write(mce_next_interval, iv);

if (mce_ignore_ce)
if (mce_ignore_ce || !iv)
return;

__this_cpu_write(mce_next_interval, iv);
if (!iv)
return;
t->expires = round_jiffies(jiffies + iv);
add_timer_on(t, smp_processor_id());
}

static void __mcheck_cpu_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
unsigned int cpu = smp_processor_id();

setup_timer(t, mce_timer_fn, cpu);
mce_start_timer(cpu, t);
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
Expand Down Expand Up @@ -1907,6 +1947,7 @@ static struct miscdevice mce_chrdev_device = {
* check, or 0 to not wait
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
* mce=bios_cmci_threshold Don't program the CMCI threshold
*/
static int __init mcheck_enable(char *str)
{
Expand All @@ -1926,6 +1967,8 @@ static int __init mcheck_enable(char *str)
mce_ignore_ce = 1;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
mce_bootlog = (str[0] == 'b');
else if (!strcmp(str, "bios_cmci_threshold"))
mce_bios_cmci_threshold = 1;
else if (isdigit(str[0])) {
get_option(&str, &tolerant);
if (*str == ',') {
Expand Down Expand Up @@ -2166,6 +2209,11 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
&mce_cmci_disabled
};

static struct dev_ext_attribute dev_attr_bios_cmci_threshold = {
__ATTR(bios_cmci_threshold, 0444, device_show_int, NULL),
&mce_bios_cmci_threshold
};

static struct device_attribute *mce_device_attrs[] = {
&dev_attr_tolerant.attr,
&dev_attr_check_interval.attr,
Expand All @@ -2174,6 +2222,7 @@ static struct device_attribute *mce_device_attrs[] = {
&dev_attr_dont_log_ce.attr,
&dev_attr_ignore_ce.attr,
&dev_attr_cmci_disabled.attr,
&dev_attr_bios_cmci_threshold.attr,
NULL
};

Expand Down Expand Up @@ -2294,38 +2343,33 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
unsigned int cpu = (unsigned long)hcpu;
struct timer_list *t = &per_cpu(mce_timer, cpu);

switch (action) {
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
mce_device_create(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
mce_device_remove(cpu);
mce_intel_hcpu_update(cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
del_timer_sync(t);
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
del_timer_sync(t);
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
if (!mce_ignore_ce && check_interval) {
t->expires = round_jiffies(jiffies +
per_cpu(mce_next_interval, cpu));
add_timer_on(t, cpu);
}
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
mce_start_timer(cpu, t);
break;
case CPU_POST_DEAD:
}

if (action == CPU_POST_DEAD) {
/* intentionally ignoring frozen here */
cmci_rediscover(cpu);
break;
}

return NOTIFY_OK;
}

Expand Down
Loading

0 comments on commit 7687b80

Please sign in to comment.