Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Ingo Molnar:
 "The main changes were:

   - Lots of enhancements for AMD SMCA (Scalable MCA
     features/extensions) systems: extract, decode and print more
     hardware error information and add matching support on the
     injection/testing side as well. (Yazen Ghannam)

   - Various MCE handling improvements on modern Intel Xeons. (Tony
     Luck)

   - Plus misc fixes and enhancements"

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (21 commits)
  x86/RAS/mce_amd_inj: Remove debugfs dir recursively on exit
  x86/RAS/mce_amd_inj: Fix signed wrap around when decrementing index 'i'
  x86/RAS/mce_amd_inj: Fix some W= warnings
  x86/MCE/AMD, EDAC: Handle reserved bank 4 on Fam17h properly
  x86/mce/AMD: Extract the error address on SMCA systems
  x86/mce, EDAC/mce_amd: Print MCA_SYND and MCA_IPID during MCE on SMCA systems
  x86/mce/AMD: Save MCA_IPID in MCE struct on SMCA systems
  x86/mce/AMD: Ensure the deferred error interrupt is of type APIC on SMCA systems
  x86/mce/AMD: Update sysfs bank names for SMCA systems
  x86/mce/AMD, EDAC/mce_amd: Define and use tables for known SMCA IP types
  EDAC/mce_amd: Use SMCA prefix for error descriptions arrays
  EDAC/mce_amd: Add missing SMCA error descriptions
  x86/mce/AMD: Read MSRs on the CPU allocating the threshold blocks
  x86/RAS: Add syndrome support to mce_amd_inj
  EDAC/mce_amd: Print syndrome register value on SMCA systems
  x86/mce: Add support for new MCA_SYND register
  x86/mce/AMD: Use msr_ops.misc() in allocate_threshold_blocks()
  x86/mce: Drop X86_FEATURE_MCE_RECOVERY and the related model string test
  x86/mce: Improve memcpy_mcsafe()
  x86/mce: Add PCI quirks to identify Xeons with machine check recovery
  ...
Linus Torvalds committed Oct 3, 2016
2 parents 12b7bcb + b199ac6 commit e606d81
Showing 14 changed files with 407 additions and 286 deletions.
1 change: 0 additions & 1 deletion arch/x86/include/asm/cpufeatures.h
@@ -106,7 +106,6 @@
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */

/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
66 changes: 36 additions & 30 deletions arch/x86/include/asm/mce.h
@@ -40,9 +40,10 @@
#define MCI_STATUS_AR (1ULL<<55) /* Action required */

/* AMD-specific bits */
#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */
#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */

/*
* McaX field if set indicates a given bank supports MCA extensions:
@@ -110,6 +111,7 @@
#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003
#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
#define MSR_AMD64_SMCA_MC0_SYND 0xc0002006
#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
@@ -119,6 +121,7 @@
#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_SYND(x) (MSR_AMD64_SMCA_MC0_SYND + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
@@ -334,44 +337,47 @@ extern void apei_mce_report_mem_error(int corrected,
* Scalable MCA.
*/
#ifdef CONFIG_X86_MCE_AMD
enum amd_ip_types {
SMCA_F17H_CORE = 0, /* Core errors */
SMCA_DF, /* Data Fabric */
SMCA_UMC, /* Unified Memory Controller */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
SMCA_SMU, /* System Management Unit */
N_AMD_IP_TYPES
};

struct amd_hwid {
const char *name;
unsigned int hwid;
};

extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];

enum amd_core_mca_blocks {
/* These may be used by multiple smca_hwid_mcatypes */
enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 cache */
SMCA_DE, /* Decoder unit */
RES, /* Reserved */
SMCA_EX, /* Execution unit */
SMCA_L2_CACHE, /* L2 Cache */
SMCA_DE, /* Decoder Unit */
SMCA_EX, /* Execution Unit */
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 cache */
N_CORE_MCA_BLOCKS
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
SMCA_SMU, /* System Management Unit */
N_SMCA_BANK_TYPES
};

extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];
struct smca_bank_name {
const char *name; /* Short name for sysfs */
const char *long_name; /* Long name for pretty-printing */
};

extern struct smca_bank_name smca_bank_names[N_SMCA_BANK_TYPES];

#define HWID_MCATYPE(hwid, mcatype) ((hwid << 16) | mcatype)

enum amd_df_mca_blocks {
SMCA_CS = 0, /* Coherent Slave */
SMCA_PIE, /* Power management, Interrupts, etc */
N_DF_BLOCKS
struct smca_hwid_mcatype {
unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
u32 hwid_mcatype; /* (hwid,mcatype) tuple */
u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */
};

extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
struct smca_bank_info {
struct smca_hwid_mcatype *type;
u32 type_instance;
};

extern struct smca_bank_info smca_banks[MAX_NR_BANKS];

#endif

#endif /* _ASM_X86_MCE_H */
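
For illustration, here is how the AMD MCE code can describe a known bank with the new types above. The array name and all numeric values (hardware ID, MCA type, extended-error-code bitmap) are made up for this sketch and are not taken from the commit:

/*
 * Sketch: one detection-table entry per known SMCA bank type. The
 * (hwid, mcatype) pair reported in MCA_IPID is packed into a single
 * u32 by HWID_MCATYPE() so a bank can be matched with one compare;
 * xec_bitmap lists which extended error codes the bank may report.
 * Values below are illustrative only.
 */
static struct smca_hwid_mcatype example_hwid_mcatypes[] = {
	/* { bank_type, hwid_mcatype,            xec_bitmap } */
	{  SMCA_LS,     HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF },
	{  SMCA_IF,     HWID_MCATYPE(0xB1, 0x0), 0x003FFF },
};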
5 changes: 1 addition & 4 deletions arch/x86/include/asm/pmem.h
@@ -46,10 +46,7 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)

static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
{
if (static_cpu_has(X86_FEATURE_MCE_RECOVERY))
return memcpy_mcsafe(dst, src, n);
memcpy(dst, src, n);
return 0;
return memcpy_mcsafe(dst, src, n);
}

/**
19 changes: 18 additions & 1 deletion arch/x86/include/asm/string_64.h
@@ -2,6 +2,7 @@
#define _ASM_X86_STRING_64_H

#ifdef __KERNEL__
#include <linux/jump_label.h>

/* Written 2002 by Andi Kleen */

@@ -78,6 +79,9 @@ int strcmp(const char *cs, const char *ct);
#define memset(s, c, n) __memset(s, c, n)
#endif

__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);
DECLARE_STATIC_KEY_FALSE(mcsafe_key);

/**
* memcpy_mcsafe - copy memory with indication if a machine check happened
*
@@ -86,10 +90,23 @@ int strcmp(const char *cs, const char *ct);
* @cnt: number of bytes to copy
*
* Low level memory copy function that catches machine checks
* We only call into the "safe" function on systems that can
* actually do machine check recovery. Everyone else can just
* use memcpy().
*
* Return 0 for success, -EFAULT for fail
*/
int memcpy_mcsafe(void *dst, const void *src, size_t cnt);
static __always_inline __must_check int
memcpy_mcsafe(void *dst, const void *src, size_t cnt)
{
#ifdef CONFIG_X86_MCE
if (static_branch_unlikely(&mcsafe_key))
return memcpy_mcsafe_unrolled(dst, src, cnt);
else
#endif
memcpy(dst, src, cnt);
return 0;
}

#endif /* __KERNEL__ */

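
A caller-side sketch of the reworked interface follows. The function name is hypothetical (modelled on the arch_memcpy_from_pmem() simplification above) and is not part of this commit; with the static-key wrapper, callers no longer need to test a CPU feature bit themselves:

#include <linux/string.h>	/* pulls in memcpy_mcsafe() via asm/string_64.h */
#include <linux/errno.h>

/* Hypothetical caller: copy from possibly-poisoned memory. */
static int example_copy_from_pmem(void *dst, const void *src, size_t n)
{
	/*
	 * memcpy_mcsafe() returns 0 on success and -EFAULT if a machine
	 * check was taken during the copy; it dispatches to
	 * memcpy_mcsafe_unrolled() only when mcsafe_key has been enabled,
	 * and falls back to a plain memcpy() otherwise.
	 */
	if (memcpy_mcsafe(dst, src, n))
		return -EIO;	/* caller-specific error mapping, illustrative */
	return 0;
}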
2 changes: 2 additions & 0 deletions arch/x86/include/uapi/asm/mce.h
@@ -26,6 +26,8 @@ struct mce {
__u32 socketid; /* CPU socket ID */
__u32 apicid; /* CPU initial apic ID */
__u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
__u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
__u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
};

#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
44 changes: 33 additions & 11 deletions arch/x86/kernel/cpu/mcheck/mce.c
@@ -41,6 +41,7 @@
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
#include <linux/jump_label.h>

#include <asm/processor.h>
#include <asm/traps.h>
@@ -292,6 +293,13 @@ static void print_mce(struct mce *m)
if (m->misc)
pr_cont("MISC %llx ", m->misc);

if (mce_flags.smca) {
if (m->synd)
pr_cont("SYND %llx ", m->synd);
if (m->ipid)
pr_cont("IPID %llx ", m->ipid);
}

pr_cont("\n");
/*
* Note this output is parsed by external tools and old fields
@@ -568,6 +576,7 @@ static void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(msr_ops.misc(i));

if (m->status & MCI_STATUS_ADDRV) {
m->addr = mce_rdmsrl(msr_ops.addr(i));

@@ -579,6 +588,23 @@ static void mce_read_aux(struct mce *m, int i)
m->addr >>= shift;
m->addr <<= shift;
}

/*
* Extract [55:<lsb>] where lsb is the least significant
* *valid* bit of the address bits.
*/
if (mce_flags.smca) {
u8 lsb = (m->addr >> 56) & 0x3f;

m->addr &= GENMASK_ULL(55, lsb);
}
}

if (mce_flags.smca) {
m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));

if (m->status & MCI_STATUS_SYNDV)
m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
}
}

@@ -1633,17 +1659,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)

if (c->x86 == 6 && c->x86_model == 45)
quirk_no_way_out = quirk_sandybridge_ifu;
/*
* MCG_CAP.MCG_SER_P is necessary but not sufficient to know
* whether this processor will actually generate recoverable
* machine checks. Check to see if this is an E7 model Xeon.
* We can't do a model number check because E5 and E7 use the
* same model number. E5 doesn't support recovery, E7 does.
*/
if (mca_cfg.recovery || (mca_cfg.ser &&
!strncmp(c->x86_model_id,
"Intel(R) Xeon(R) CPU E7-", 24)))
set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY);
}
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = 0;
@@ -2080,6 +2095,7 @@ void mce_disable_bank(int bank)
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
* mce=bios_cmci_threshold Don't program the CMCI threshold
* mce=recovery force enable memcpy_mcsafe()
*/
static int __init mcheck_enable(char *str)
{
@@ -2676,8 +2692,14 @@ static int __init mcheck_debugfs_init(void)
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
#endif

DEFINE_STATIC_KEY_FALSE(mcsafe_key);
EXPORT_SYMBOL_GPL(mcsafe_key);

static int __init mcheck_late_init(void)
{
if (mca_cfg.recovery)
static_branch_inc(&mcsafe_key);

mcheck_debugfs_init();

/*
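
As a worked example of the SMCA address masking added in the mce_read_aux() hunk above: the helper name and the sample address below are invented for illustration; only the masking arithmetic comes from the patch.

#include <linux/bitops.h>	/* GENMASK_ULL() */
#include <linux/types.h>

static u64 example_smca_addr_mask(u64 addr)
{
	/* e.g. addr = 0x0c0000000123f567: bits [61:56] = 0x0c = 12 */
	u8 lsb = (addr >> 56) & 0x3f;		/* least significant *valid* address bit */

	/* GENMASK_ULL(55, 12) = 0x00fffffffffff000 */
	return addr & GENMASK_ULL(55, lsb);	/* -> 0x000000000123f000, bits [63:56] and [11:0] cleared */
}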
(Diffs for the remaining 8 changed files are not shown.)
