Skip to content

Commit

Permalink
Merge branch 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linu…
Browse files Browse the repository at this point in the history
…x/kernel/git/tip/tip

Pull x86 RAS changes from Ingo Molnar:
 "[ The reason for drivers/ updates is that Boris asked for the
    drivers/edac/ changes to go via x86/ras in this cycle ]

  Main changes:

   - AMD CPUs:
      . Add ECC event decoding support for new F15h models
      . Various erratum fixes
      . Fix single-channel on dual-channel-controllers bug.

   - Intel CPUs:
      . UC uncorrectable memory error parsing fix
      . Add support for CMC (Corrected Machine Check) 'FF' (Firmware
        First) flag in the APEI HEST

   - Various cleanups and fixes"

* 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  amd64_edac: Fix incorrect wraparounds
  amd64_edac: Correct erratum 505 range
  cpc925_edac: Use proper array termination
  x86/mce, acpi/apei: Only disable banks listed in HEST if mce is configured
  amd64_edac: Get rid of boot_cpu_data accesses
  amd64_edac: Add ECC decoding support for newer F15h models
  x86, amd_nb: Clarify F15h, model 30h GART and L3 support
  pci_ids: Add PCI device ID functions 3 and 4 for newer F15h models.
  x38_edac: Make a local function static
  i3200_edac: Make a local function static
  x86/mce: Pay no attention to 'F' bit in MCACOD when parsing 'UC' errors
  APEI/ERST: Fix error message formatting
  amd64_edac: Fix single-channel setups
  EDAC: Replace strict_strtol() with kstrtol()
  mce: acpi/apei: Soft-offline a page on firmware GHES notification
  mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  • Loading branch information
Linus Torvalds committed Sep 4, 2013
2 parents bb8c470 + ead6fa9 commit b20c99e
Show file tree
Hide file tree
Showing 20 changed files with 523 additions and 135 deletions.
5 changes: 5 additions & 0 deletions Documentation/x86/x86_64/boot-options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,11 @@ ACPI

acpi=noirq Don't route interrupts

acpi=nocmcff Disable firmware first mode for corrected errors. This
disables parsing the HEST CMC error source to check if
firmware has set the FF flag. This may result in
duplicate corrected error reports.

PCI

pci=off Don't use PCI
Expand Down
2 changes: 2 additions & 0 deletions arch/x86/include/asm/acpi.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ extern int acpi_pci_disabled;
extern int acpi_skip_timer_override;
extern int acpi_use_timer_override;
extern int acpi_fix_pin2_polarity;
extern int acpi_disable_cmcff;

extern u8 acpi_sci_flags;
extern int acpi_sci_override_gsi;
Expand Down Expand Up @@ -168,6 +169,7 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)

#define acpi_lapic 0
#define acpi_ioapic 0
#define acpi_disable_cmcff 0
static inline void acpi_noirq_set(void) { }
static inline void acpi_disable_pci(void) { }
static inline void disable_acpi(void) { }
Expand Down
16 changes: 14 additions & 2 deletions arch/x86/include/asm/mce.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,20 @@
#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
#define MCI_STATUS_AR (1ULL<<55) /* Action required */
#define MCACOD 0xffff /* MCA Error Code */

/*
* Note that the full MCACOD field of IA32_MCi_STATUS MSR is
* bits 15:0. But bit 12 is the 'F' bit, defined for corrected
* errors to indicate that errors are being filtered by hardware.
* We should mask out bit 12 when looking for specific signatures
* of uncorrected errors - so the F bit is deliberately skipped
* in this #define.
*/
#define MCACOD 0xefff /* MCA Error Code */

/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */
#define MCACOD_SCRUBMSK 0xfff0
#define MCACOD_SCRUBMSK 0xeff0 /* Skip bit 12 ('F' bit) */
#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */
#define MCACOD_DATA 0x0134 /* Data Load */
#define MCACOD_INSTR 0x0150 /* Instruction Fetch */
Expand Down Expand Up @@ -188,6 +197,9 @@ extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
const char __user *ubuf,
size_t usize, loff_t *off));

/* Disable CMCI/polling for MCA bank claimed by firmware */
extern void mce_disable_bank(int bank);

/*
* Exception handler
*/
Expand Down
5 changes: 5 additions & 0 deletions arch/x86/kernel/acpi/boot.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ EXPORT_SYMBOL(acpi_pci_disabled);
int acpi_lapic;
int acpi_ioapic;
int acpi_strict;
int acpi_disable_cmcff;

u8 acpi_sci_flags __initdata;
int acpi_sci_override_gsi __initdata;
Expand Down Expand Up @@ -1622,6 +1623,10 @@ static int __init parse_acpi(char *arg)
/* "acpi=copy_dsdt" copys DSDT */
else if (strcmp(arg, "copy_dsdt") == 0) {
acpi_gbl_copy_dsdt_locally = 1;
}
/* "acpi=nocmcff" disables FF mode for corrected errors */
else if (strcmp(arg, "nocmcff") == 0) {
acpi_disable_cmcff = 1;
} else {
/* Core will printk when we return error. */
return -EINVAL;
Expand Down
13 changes: 11 additions & 2 deletions arch/x86/kernel/amd_nb.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{}
};
EXPORT_SYMBOL(amd_nb_misc_ids);

static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
{}
};
Expand Down Expand Up @@ -81,12 +83,19 @@ int amd_cache_northbridges(void)
next_northbridge(misc, amd_nb_misc_ids);
node_to_amd_nb(i)->link = link =
next_northbridge(link, amd_nb_link_ids);
}
}

/* GART present only on Fam15h upto model 0fh */
if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
boot_cpu_data.x86 == 0x15)
(boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
amd_northbridges.flags |= AMD_NB_GART;

/*
* Check for L3 cache presence.
*/
if (!cpuid_edx(0x80000006))
return 0;

/*
* Some CPU families support L3 Cache Index Disable. There are some
* limitations because of E382 and E388 on family 0x10.
Expand Down
3 changes: 3 additions & 0 deletions arch/x86/kernel/cpu/mcheck/mce-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,18 @@ int mce_severity(struct mce *a, int tolerant, char **msg);
struct dentry *mce_get_debugfs_dir(void);

extern struct mce_bank *mce_banks;
extern mce_banks_t mce_banks_ce_disabled;

#ifdef CONFIG_X86_MCE_INTEL
unsigned long mce_intel_adjust_timer(unsigned long interval);
void mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
#else
# define mce_intel_adjust_timer mce_adjust_timer_default
static inline void mce_intel_cmci_poll(void) { }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
#endif

void mce_timer_kick(unsigned long interval);
Expand Down
28 changes: 28 additions & 0 deletions arch/x86/kernel/cpu/mcheck/mce.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,15 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/*
* MCA banks controlled through firmware first for corrected errors.
* This is a global list of banks for which we won't enable CMCI and we
* won't poll. Firmware controls these banks and is responsible for
* reporting corrected errors through GHES. Uncorrected/recoverable
* errors are still notified through a machine check.
*/
mce_banks_t mce_banks_ce_disabled;

static DEFINE_PER_CPU(struct work_struct, mce_work);

static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
Expand Down Expand Up @@ -1935,6 +1944,25 @@ static struct miscdevice mce_chrdev_device = {
&mce_chrdev_ops,
};

static void __mce_disable_bank(void *arg)
{
int bank = *((int *)arg);
__clear_bit(bank, __get_cpu_var(mce_poll_banks));
cmci_disable_bank(bank);
}

void mce_disable_bank(int bank)
{
if (bank >= mca_cfg.banks) {
pr_warn(FW_BUG
"Ignoring request to disable invalid MCA bank %d.\n",
bank);
return;
}
set_bit(bank, mce_banks_ce_disabled);
on_each_cpu(__mce_disable_bank, &bank, 1);
}

/*
* mce=off Disables machine check
* mce=no_cmci Disables CMCI
Expand Down
42 changes: 32 additions & 10 deletions arch/x86/kernel/cpu/mcheck/mce_intel.c
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,10 @@ static void cmci_discover(int banks)
if (test_bit(i, owned))
continue;

/* Skip banks in firmware first mode */
if (test_bit(i, mce_banks_ce_disabled))
continue;

rdmsrl(MSR_IA32_MCx_CTL2(i), val);

/* Already owned by someone else? */
Expand Down Expand Up @@ -271,6 +275,19 @@ void cmci_recheck(void)
local_irq_restore(flags);
}

/* Caller must hold the lock on cmci_discover_lock */
static void __cmci_disable_bank(int bank)
{
u64 val;

if (!test_bit(bank, __get_cpu_var(mce_banks_owned)))
return;
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
__clear_bit(bank, __get_cpu_var(mce_banks_owned));
}

/*
* Disable CMCI on this CPU for all banks it owns when it goes down.
* This allows other CPUs to claim the banks on rediscovery.
Expand All @@ -280,20 +297,12 @@ void cmci_clear(void)
unsigned long flags;
int i;
int banks;
u64 val;

if (!cmci_supported(&banks))
return;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
continue;
/* Disable CMCI */
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
__clear_bit(i, __get_cpu_var(mce_banks_owned));
}
for (i = 0; i < banks; i++)
__cmci_disable_bank(i);
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}

Expand Down Expand Up @@ -327,6 +336,19 @@ void cmci_reenable(void)
cmci_discover(banks);
}

void cmci_disable_bank(int bank)
{
int banks;
unsigned long flags;

if (!cmci_supported(&banks))
return;

raw_spin_lock_irqsave(&cmci_discover_lock, flags);
__cmci_disable_bank(bank);
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}

static void intel_init_cmci(void)
{
int banks;
Expand Down
Loading

0 comments on commit b20c99e

Please sign in to comment.