Skip to content

Commit

Permalink
amd64_edac: Remove polling mechanism
Browse files Browse the repository at this point in the history
Switch to reusing the mcheck core's machine check polling mechanism
instead of duplicating functionality by using the EDAC polling routine.

Correct formatting while at it.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Acked-by: Doug Thompson <dougthompson@xmission.com>
  • Loading branch information
Borislav Petkov committed Aug 3, 2010
1 parent 98a5ae2 commit f434755
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 126 deletions.
118 changes: 0 additions & 118 deletions drivers/edac/amd64_edac.c
Original file line number Diff line number Diff line change
Expand Up @@ -1978,107 +1978,6 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
return map_err_sym_to_channel(err_sym, pvt->syn_type);
}

/*
 * Snapshot the northbridge error registers into @regs.
 *
 * NB Status High is read first because it carries the valid bit. Only when
 * that bit is set are NB Status Low, NB Address Low, NB Address High and the
 * NB Configuration register read as well.
 *
 * Returns:
 *	- 1: @regs has been filled in with valid error information
 *	- 0: no valid error is flagged, or a register read returned non-zero
 */
static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
				     struct err_regs *regs)
{
	struct amd64_pvt *pvt = mci->pvt_info;
	struct pci_dev *f3 = pvt->misc_f3_ctl;

	/* Status High gates everything else. */
	if (amd64_read_pci_cfg(f3, K8_NBSH, &regs->nbsh))
		return 0;

	if ((regs->nbsh & K8_NBSH_VALID_BIT) == 0)
		return 0;

	/*
	 * An error is flagged: collect the remaining registers in order and
	 * give up at the first read that does not succeed.
	 */
	if (amd64_read_pci_cfg(f3, K8_NBSL, &regs->nbsl))
		return 0;

	if (amd64_read_pci_cfg(f3, K8_NBEAL, &regs->nbeal))
		return 0;

	if (amd64_read_pci_cfg(f3, K8_NBEAH, &regs->nbeah))
		return 0;

	if (amd64_read_pci_cfg(f3, K8_NBCFG, &regs->nbcfg))
		return 0;

	return 1;
}

/*
* This function is called to retrieve the error data from hardware and store it
* in the info structure.
*
* Returns:
* - 1: if a valid error is found
* - 0: if no error is found
*/
static int amd64_get_error_info(struct mem_ctl_info *mci,
struct err_regs *info)
{
struct amd64_pvt *pvt;
struct err_regs regs;

pvt = mci->pvt_info;

if (!amd64_get_error_info_regs(mci, info))
return 0;

/*
* Here's the problem with the K8's EDAC reporting: There are four
* registers which report pieces of error information. They are shared
* between CEs and UEs. Furthermore, contrary to what is stated in the
* BKDG, the overflow bit is never used! Every error always updates the
* reporting registers.
*
* Can you see the race condition? All four error reporting registers
* must be read before a new error updates them! There is no way to read
* all four registers atomically. The best than can be done is to detect
* that a race has occured and then report the error without any kind of
* precision.
*
* What is still positive is that errors are still reported and thus
* problems can still be detected - just not localized because the
* syndrome and address are spread out across registers.
*
* Grrrrr!!!!! Here's hoping that AMD fixes this in some future K8 rev.
* UEs and CEs should have separate register sets with proper overflow
* bits that are used! At very least the problem can be fixed by
* honoring the ErrValid bit in 'nbsh' and not updating registers - just
* set the overflow bit - unless the current error is CE and the new
* error is UE which would be the only situation for overwriting the
* current values.
*/

regs = *info;

/* Use info from the second read - most current */
if (unlikely(!amd64_get_error_info_regs(mci, info)))
return 0;

/* clear the error bits in hardware */
pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);

/* Check for the possible race condition */
if ((regs.nbsh != info->nbsh) ||
(regs.nbsl != info->nbsl) ||
(regs.nbeah != info->nbeah) ||
(regs.nbeal != info->nbeal)) {
amd64_mc_printk(mci, KERN_WARNING,
"hardware STATUS read access race condition "
"detected!\n");
return 0;
}
return 1;
}

/*
* Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
* ADDRESS and process.
Expand Down Expand Up @@ -2202,20 +2101,6 @@ void amd64_decode_bus_error(int node_id, struct err_regs *regs)

}

/*
 * Polling 'check' callback invoked from the EDAC core: look for a pending
 * northbridge error and, if one was captured, pass it on for decoding and
 * error processing.
 */
static void amd64_check(struct mem_ctl_info *mci)
{
	struct amd64_pvt *pvt;
	struct err_regs regs;

	/* Nothing captured - nothing to decode. */
	if (!amd64_get_error_info(mci, &regs))
		return;

	pvt = mci->pvt_info;
	amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
}

/*
* Input:
* 1) struct amd64_pvt which contains pvt->dram_f2_ctl pointer
Expand Down Expand Up @@ -2756,9 +2641,6 @@ static void amd64_setup_mci_misc_attributes(struct mem_ctl_info *mci)
mci->dev_name = pci_name(pvt->dram_f2_ctl);
mci->ctl_page_to_phys = NULL;

/* IMPORTANT: Set the polling 'check' function in this module */
mci->edac_check = amd64_check;

/* memory scrubber interface */
mci->set_sdram_scrub_rate = amd64_set_scrub_rate;
mci->get_sdram_scrub_rate = amd64_get_scrub_rate;
Expand Down
16 changes: 8 additions & 8 deletions drivers/edac/edac_mce_amd.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ static void amd_decode_dc_mce(u64 mc0_status)
u32 ec = mc0_status & 0xffff;
u32 xec = (mc0_status >> 16) & 0xf;

pr_emerg(" Data Cache Error");
pr_emerg("Data Cache Error");

if (xec == 1 && TLB_ERROR(ec))
pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
Expand Down Expand Up @@ -176,7 +176,7 @@ static void amd_decode_ic_mce(u64 mc1_status)
u32 ec = mc1_status & 0xffff;
u32 xec = (mc1_status >> 16) & 0xf;

pr_emerg(" Instruction Cache Error");
pr_emerg("Instruction Cache Error");

if (xec == 1 && TLB_ERROR(ec))
pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
Expand Down Expand Up @@ -233,7 +233,7 @@ static void amd_decode_bu_mce(u64 mc2_status)
u32 ec = mc2_status & 0xffff;
u32 xec = (mc2_status >> 16) & 0xf;

pr_emerg(" Bus Unit Error");
pr_emerg("Bus Unit Error");

if (xec == 0x1)
pr_cont(" in the write data buffers.\n");
Expand Down Expand Up @@ -275,7 +275,7 @@ static void amd_decode_ls_mce(u64 mc3_status)
u32 ec = mc3_status & 0xffff;
u32 xec = (mc3_status >> 16) & 0xf;

pr_emerg(" Load Store Error");
pr_emerg("Load Store Error");

if (xec == 0x0) {
u8 rrrr = (ec >> 4) & 0xf;
Expand Down Expand Up @@ -304,7 +304,7 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
if (TLB_ERROR(ec) && !report_gart_errors)
return;

pr_emerg(" Northbridge Error, node %d", node_id);
pr_emerg("Northbridge Error, node %d", node_id);

/*
* F10h, revD can disable ErrCpu[3:0] so check that first and also the
Expand Down Expand Up @@ -342,13 +342,13 @@ static void amd_decode_fr_mce(u64 mc5_status)
static inline void amd_decode_err_code(unsigned int ec)
{
if (TLB_ERROR(ec)) {
pr_emerg(" Transaction: %s, Cache Level %s\n",
pr_emerg("Transaction: %s, Cache Level %s\n",
TT_MSG(ec), LL_MSG(ec));
} else if (MEM_ERROR(ec)) {
pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s",
pr_emerg("Transaction: %s, Type: %s, Cache Level: %s",
RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
} else if (BUS_ERROR(ec)) {
pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, "
pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, "
"Participating Processor: %s\n",
RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
PP_MSG(ec));
Expand Down

0 comments on commit f434755

Please sign in to comment.