From f92cae45263b25cdb4c4d24e297e07945d3bc01b Mon Sep 17 00:00:00 2001 From: Ashish Shenoy Date: Wed, 22 Feb 2012 17:20:38 -0800 Subject: [PATCH 01/10] amd64_edac: Fix missing csrows sysfs nodes While initializing the array of csrow attribute instances, a few csrows were uninitialized. This happened because the module only performed a check for DRAM base ctl register0's and not DRAM base ctl register1's chip select enable bit. There could be systems with DIMMs populated on only single memory channel whereas the module also assumed that a dual channel dimm had double the memory size of a single memory channel instead of checking the memory on each channel. This patch fixes these above issues. Signed-off-by: Ashish Shenoy Signed-off-by: Prasanna S. Panchamukhi Link: http://lkml.kernel.org/r/4F459CFA.5090604@riverbed.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index c9eee6d33e9a7..b9424dcde906f 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2133,6 +2133,7 @@ static void read_mc_regs(struct amd64_pvt *pvt) static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) { u32 cs_mode, nr_pages; + u32 dbam = dct ? pvt->dbam1 : pvt->dbam0; /* * The math on this doesn't look right on the surface because x/2*4 can @@ -2141,16 +2142,10 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) * number of bits to shift the DBAM register to extract the proper CSROW * field. */ - cs_mode = (pvt->dbam0 >> ((csrow_nr / 2) * 4)) & 0xF; + cs_mode = (dbam >> ((csrow_nr / 2) * 4)) & 0xF; nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode) << (20 - PAGE_SHIFT); - /* - * If dual channel then double the memory size of single channel. 
- * Channel count is 1 or 2 - */ - nr_pages <<= (pvt->channel_count - 1); - debugf0(" (csrow=%d) DBAM map index= %d\n", csrow_nr, cs_mode); debugf0(" nr_pages= %u channel-count = %d\n", nr_pages, pvt->channel_count); @@ -2181,7 +2176,7 @@ static int init_csrows(struct mem_ctl_info *mci) for_each_chip_select(i, 0, pvt) { csrow = &mci->csrows[i]; - if (!csrow_enabled(i, 0, pvt)) { + if (!csrow_enabled(i, 0, pvt) && !csrow_enabled(i, 1, pvt)) { debugf1("----CSROW %d EMPTY for node %d\n", i, pvt->mc_node_id); continue; @@ -2191,7 +2186,10 @@ static int init_csrows(struct mem_ctl_info *mci) i, pvt->mc_node_id); empty = 0; - csrow->nr_pages = amd64_csrow_nr_pages(pvt, 0, i); + if (csrow_enabled(i, 0, pvt)) + csrow->nr_pages = amd64_csrow_nr_pages(pvt, 0, i); + if (csrow_enabled(i, 1, pvt)) + csrow->nr_pages += amd64_csrow_nr_pages(pvt, 1, i); find_csrow_limits(mci, i, &input_addr_min, &input_addr_max); sys_addr = input_addr_to_sys_addr(mci, input_addr_min); csrow->first_page = (u32) (sys_addr >> PAGE_SHIFT); From 11b0a31473edf74b70ab6f8fe857b61bff82d7cc Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 9 Nov 2011 21:28:43 +0100 Subject: [PATCH 02/10] amd64_edac: Fix K8 revD and later chip select sizes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix DRAM chip select sizes calculation for K8, revisions D and E. 
Reported-by: Niklas Söderlund --- drivers/edac/amd64_edac.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index b9424dcde906f..03807283aca47 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1132,12 +1132,36 @@ static int k8_dbam_to_chip_select(struct amd64_pvt *pvt, u8 dct, return ddr2_cs_size(cs_mode, dclr & WIDTH_128); } else if (pvt->ext_model >= K8_REV_D) { + unsigned diff; WARN_ON(cs_mode > 10); - if (cs_mode == 3 || cs_mode == 8) - return 32 << (cs_mode - 1); - else - return 32 << cs_mode; + /* + * the below calculation, besides trying to win an obfuscated C + * contest, maps cs_mode values to DIMM chip select sizes. The + * mappings are: + * + * cs_mode CS size (mb) + * ======= ============ + * 0 32 + * 1 64 + * 2 128 + * 3 128 + * 4 256 + * 5 512 + * 6 256 + * 7 512 + * 8 1024 + * 9 1024 + * 10 2048 + * + * Basically, it calculates a value with which to shift the + * smallest CS size of 32MB. + * + * ddr[23]_cs_size have a similar purpose. + */ + diff = cs_mode/3 + (unsigned)(cs_mode > 5); + + return 32 << (cs_mode - diff); } else { WARN_ON(cs_mode > 6); From 5e8e19bf6c3c9d8ecf74e2a7fdae99a76949bdf6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 21 Sep 2011 14:10:43 +0200 Subject: [PATCH 03/10] EDAC: Correct scrub rate API The original scrub rate API definition states that if scrub rate accessors are not implemented, a negative value (-1) should be written to the sysfs file (/sys/devices/system/edac/mc/mcN/sdram_scrub_rate, where N is the memory controller number on the system). This is counter-intuitive and awkward at the very least because, when setting the scrub rate, userspace has to write to sysfs and then read it back to check the error status of the operation. As Tony notes, it would be best not to have the sdram_scrub_rate in sysfs if scrub rate support is not implemented.
It is too late about that and a bunch of drivers on a bunch of arches would need to be changed and tested which is not a trivial task ATM. Instead, settle for the next best thing of returning -ENODEV when implementation is missing and -EINVAL when there was an error encountered while setting the scrub rate. Reported-by: Han Pingtian Cc: Tony Luck Link: http://lkml.kernel.org/r/20110916105856.GA13253@hpt.nay.redhat.com Signed-off-by: Borislav Petkov --- Documentation/edac.txt | 4 ++-- drivers/edac/edac_mc_sysfs.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/edac.txt b/Documentation/edac.txt index 249822cde82bc..fdcc49fad8e11 100644 --- a/Documentation/edac.txt +++ b/Documentation/edac.txt @@ -334,8 +334,8 @@ Sdram memory scrubbing rate: Reading the file will return the actual scrubbing rate employed. - If configuration fails or memory scrubbing is not implemented, the value - of the attribute file will be -1. + If configuration fails or memory scrubbing is not implemented, accessing + that attribute will fail. diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index d56e63477d5cd..e9a28f576d144 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -452,7 +452,7 @@ static ssize_t mci_sdram_scrub_rate_store(struct mem_ctl_info *mci, int new_bw = 0; if (!mci->set_sdram_scrub_rate) - return -EINVAL; + return -ENODEV; if (strict_strtoul(data, 10, &bandwidth) < 0) return -EINVAL; @@ -475,7 +475,7 @@ static ssize_t mci_sdram_scrub_rate_show(struct mem_ctl_info *mci, char *data) int bandwidth = 0; if (!mci->get_sdram_scrub_rate) - return -EINVAL; + return -ENODEV; bandwidth = mci->get_sdram_scrub_rate(mci); if (bandwidth < 0) { From 36c46f31df910b092aaaed27c7c616bb8e2302a1 Mon Sep 17 00:00:00 2001 From: Lionel Debroux Date: Mon, 27 Feb 2012 07:41:47 +0100 Subject: [PATCH 04/10] EDAC: Make pci_device_id tables __devinitconst. 
These const tables are currently marked __devinitdata, but Documentation/PCI/pci.txt says: "o The ID table array should be marked __devinitconst; this is done automatically if the table is declared with DEFINE_PCI_DEVICE_TABLE()." So use DEFINE_PCI_DEVICE_TABLE(x). Based on PaX and earlier work by Andi Kleen. Signed-off-by: Lionel Debroux Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 2 +- drivers/edac/amd76x_edac.c | 2 +- drivers/edac/e752x_edac.c | 2 +- drivers/edac/e7xxx_edac.c | 2 +- drivers/edac/i3000_edac.c | 2 +- drivers/edac/i3200_edac.c | 2 +- drivers/edac/i5000_edac.c | 2 +- drivers/edac/i5100_edac.c | 2 +- drivers/edac/i5400_edac.c | 2 +- drivers/edac/i7300_edac.c | 2 +- drivers/edac/i7core_edac.c | 2 +- drivers/edac/i82443bxgx_edac.c | 2 +- drivers/edac/i82860_edac.c | 2 +- drivers/edac/i82875p_edac.c | 2 +- drivers/edac/i82975x_edac.c | 2 +- drivers/edac/r82600_edac.c | 2 +- drivers/edac/sb_edac.c | 2 +- drivers/edac/x38_edac.c | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 03807283aca47..7ef73c919c5da 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2707,7 +2707,7 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev) * PCI core identifies what devices are on a system during boot, and then * inquiry this table to see if this driver is for a given device found. 
*/ -static const struct pci_device_id amd64_pci_table[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(amd64_pci_table) = { { .vendor = PCI_VENDOR_ID_AMD, .device = PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, diff --git a/drivers/edac/amd76x_edac.c b/drivers/edac/amd76x_edac.c index e47e73bbbcc55..f8fd3c807bde0 100644 --- a/drivers/edac/amd76x_edac.c +++ b/drivers/edac/amd76x_edac.c @@ -321,7 +321,7 @@ static void __devexit amd76x_remove_one(struct pci_dev *pdev) edac_mc_free(mci); } -static const struct pci_device_id amd76x_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(amd76x_pci_tbl) = { { PCI_VEND_DEV(AMD, FE_GATE_700C), PCI_ANY_ID, PCI_ANY_ID, 0, 0, AMD762}, diff --git a/drivers/edac/e752x_edac.c b/drivers/edac/e752x_edac.c index 1af531a11d21f..41223261ede9b 100644 --- a/drivers/edac/e752x_edac.c +++ b/drivers/edac/e752x_edac.c @@ -1380,7 +1380,7 @@ static void __devexit e752x_remove_one(struct pci_dev *pdev) edac_mc_free(mci); } -static const struct pci_device_id e752x_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(e752x_pci_tbl) = { { PCI_VEND_DEV(INTEL, 7520_0), PCI_ANY_ID, PCI_ANY_ID, 0, 0, E7520}, diff --git a/drivers/edac/e7xxx_edac.c b/drivers/edac/e7xxx_edac.c index 6ffb6d23281f6..68dea87b72e63 100644 --- a/drivers/edac/e7xxx_edac.c +++ b/drivers/edac/e7xxx_edac.c @@ -525,7 +525,7 @@ static void __devexit e7xxx_remove_one(struct pci_dev *pdev) edac_mc_free(mci); } -static const struct pci_device_id e7xxx_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(e7xxx_pci_tbl) = { { PCI_VEND_DEV(INTEL, 7205_0), PCI_ANY_ID, PCI_ANY_ID, 0, 0, E7205}, diff --git a/drivers/edac/i3000_edac.c b/drivers/edac/i3000_edac.c index c0510b3d70353..277689a688413 100644 --- a/drivers/edac/i3000_edac.c +++ b/drivers/edac/i3000_edac.c @@ -470,7 +470,7 @@ static void __devexit i3000_remove_one(struct pci_dev *pdev) edac_mc_free(mci); } -static const struct pci_device_id i3000_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(i3000_pci_tbl) = 
{ { PCI_VEND_DEV(INTEL, 3000_HB), PCI_ANY_ID, PCI_ANY_ID, 0, 0, I3000}, diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c index 73f55e2008c2d..046808c6357df 100644 --- a/drivers/edac/i3200_edac.c +++ b/drivers/edac/i3200_edac.c @@ -445,7 +445,7 @@ static void __devexit i3200_remove_one(struct pci_dev *pdev) edac_mc_free(mci); } -static const struct pci_device_id i3200_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(i3200_pci_tbl) = { { PCI_VEND_DEV(INTEL, 3200_HB), PCI_ANY_ID, PCI_ANY_ID, 0, 0, I3200}, diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c index 4dc3ac25a4226..a2680d8e744b1 100644 --- a/drivers/edac/i5000_edac.c +++ b/drivers/edac/i5000_edac.c @@ -1516,7 +1516,7 @@ static void __devexit i5000_remove_one(struct pci_dev *pdev) * * The "E500P" device is the first device supported. */ -static const struct pci_device_id i5000_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(i5000_pci_tbl) = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_I5000_DEV16), .driver_data = I5000P}, diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c index bcbdeeca48b89..2e23547b2f24e 100644 --- a/drivers/edac/i5100_edac.c +++ b/drivers/edac/i5100_edac.c @@ -1051,7 +1051,7 @@ static void __devexit i5100_remove_one(struct pci_dev *pdev) edac_mc_free(mci); } -static const struct pci_device_id i5100_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(i5100_pci_tbl) = { /* Device 16, Function 0, Channel 0 Memory Map, Error Flag/Mask, ... */ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5100_16) }, { 0, } diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c index 74d6ec342afbf..67ec9626a3305 100644 --- a/drivers/edac/i5400_edac.c +++ b/drivers/edac/i5400_edac.c @@ -1383,7 +1383,7 @@ static void __devexit i5400_remove_one(struct pci_dev *pdev) * * The "E500P" device is the first device supported. 
*/ -static const struct pci_device_id i5400_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(i5400_pci_tbl) = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5400_ERR)}, {0,} /* 0 terminated list. */ }; diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c index 6104dba380b62..3bafa3bca1487 100644 --- a/drivers/edac/i7300_edac.c +++ b/drivers/edac/i7300_edac.c @@ -1192,7 +1192,7 @@ static void __devexit i7300_remove_one(struct pci_dev *pdev) * * Has only 8086:360c PCI ID */ -static const struct pci_device_id i7300_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(i7300_pci_tbl) = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_I7300_MCH_ERR)}, {0,} /* 0 terminated list. */ }; diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index 8568d9b618750..85226ccf52907 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c @@ -391,7 +391,7 @@ static const struct pci_id_table pci_dev_table[] = { /* * pci_device_id table for which devices we are looking for */ -static const struct pci_device_id i7core_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(i7core_pci_tbl) = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_X58_HUB_MGMT)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_LYNNFIELD_QPI_LINK0)}, {0,} /* 0 terminated list. 
*/ diff --git a/drivers/edac/i82443bxgx_edac.c b/drivers/edac/i82443bxgx_edac.c index 4329d39f902cd..3bf2b2f490e7d 100644 --- a/drivers/edac/i82443bxgx_edac.c +++ b/drivers/edac/i82443bxgx_edac.c @@ -380,7 +380,7 @@ static void __devexit i82443bxgx_edacmc_remove_one(struct pci_dev *pdev) EXPORT_SYMBOL_GPL(i82443bxgx_edacmc_remove_one); -static const struct pci_device_id i82443bxgx_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(i82443bxgx_pci_tbl) = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443BX_0)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443BX_2)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0)}, diff --git a/drivers/edac/i82860_edac.c b/drivers/edac/i82860_edac.c index 931a057750491..c779092d18d1c 100644 --- a/drivers/edac/i82860_edac.c +++ b/drivers/edac/i82860_edac.c @@ -270,7 +270,7 @@ static void __devexit i82860_remove_one(struct pci_dev *pdev) edac_mc_free(mci); } -static const struct pci_device_id i82860_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(i82860_pci_tbl) = { { PCI_VEND_DEV(INTEL, 82860_0), PCI_ANY_ID, PCI_ANY_ID, 0, 0, I82860}, diff --git a/drivers/edac/i82875p_edac.c b/drivers/edac/i82875p_edac.c index 33864c63c6840..10f15d85fb5ee 100644 --- a/drivers/edac/i82875p_edac.c +++ b/drivers/edac/i82875p_edac.c @@ -511,7 +511,7 @@ static void __devexit i82875p_remove_one(struct pci_dev *pdev) edac_mc_free(mci); } -static const struct pci_device_id i82875p_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(i82875p_pci_tbl) = { { PCI_VEND_DEV(INTEL, 82875_0), PCI_ANY_ID, PCI_ANY_ID, 0, 0, I82875P}, diff --git a/drivers/edac/i82975x_edac.c b/drivers/edac/i82975x_edac.c index 4184e0171f007..0cd8368f88f8c 100644 --- a/drivers/edac/i82975x_edac.c +++ b/drivers/edac/i82975x_edac.c @@ -612,7 +612,7 @@ static void __devexit i82975x_remove_one(struct pci_dev *pdev) edac_mc_free(mci); } -static const struct pci_device_id i82975x_pci_tbl[] __devinitdata = { +static 
DEFINE_PCI_DEVICE_TABLE(i82975x_pci_tbl) = { { PCI_VEND_DEV(INTEL, 82975_0), PCI_ANY_ID, PCI_ANY_ID, 0, 0, I82975X diff --git a/drivers/edac/r82600_edac.c b/drivers/edac/r82600_edac.c index e294e1b3616cf..6d908ad72d645 100644 --- a/drivers/edac/r82600_edac.c +++ b/drivers/edac/r82600_edac.c @@ -373,7 +373,7 @@ static void __devexit r82600_remove_one(struct pci_dev *pdev) edac_mc_free(mci); } -static const struct pci_device_id r82600_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(r82600_pci_tbl) = { { PCI_DEVICE(PCI_VENDOR_ID_RADISYS, R82600_BRIDGE_ID) }, diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 1dc118d83cc6a..3a605f7777125 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -367,7 +367,7 @@ static const struct pci_id_table pci_dev_descr_sbridge_table[] = { /* * pci_device_id table for which devices we are looking for */ -static const struct pci_device_id sbridge_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(sbridge_pci_tbl) = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TA)}, {0,} /* 0 terminated list. */ }; diff --git a/drivers/edac/x38_edac.c b/drivers/edac/x38_edac.c index b6f47de152f31..a438297389e5d 100644 --- a/drivers/edac/x38_edac.c +++ b/drivers/edac/x38_edac.c @@ -440,7 +440,7 @@ static void __devexit x38_remove_one(struct pci_dev *pdev) edac_mc_free(mci); } -static const struct pci_device_id x38_pci_tbl[] __devinitdata = { +static DEFINE_PCI_DEVICE_TABLE(x38_pci_tbl) = { { PCI_VEND_DEV(INTEL, X38_HB), PCI_ANY_ID, PCI_ANY_ID, 0, 0, X38}, From 344f0a0631e1b2784859fbe2351d99dce2652b77 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 15 Nov 2011 17:10:58 +0100 Subject: [PATCH 05/10] MCE, AMD: Correct some MC0 error types Use "System Read Data Error" as a more general name for MC0 bus errors on F15h and update some error definitions. 
Signed-off-by: Borislav Petkov Reviewed-by: Andreas Herrmann --- drivers/edac/mce_amd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index bd926ea2e00c5..0ee1c0a115a2b 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -255,10 +255,9 @@ static bool f15h_dc_mce(u16 ec, u8 xec) } else if (BUS_ERROR(ec)) { if (!xec) - pr_cont("during system linefill.\n"); + pr_cont("System Read Data Error.\n"); else - pr_cont(" Internal %s condition.\n", - ((xec == 1) ? "livelock" : "deadlock")); + pr_cont(" Internal error condition type %d.\n", xec); } else ret = false; From 6c1173a61e63c32bd40cb1e6dd16343240a328eb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 21 Nov 2011 19:45:34 +0100 Subject: [PATCH 06/10] MCE, AMD: Correct ucode patch buffer description This MC1 error signature is called differently now, fix it. Signed-off-by: Borislav Petkov Reviewed-by: Andreas Herrmann --- drivers/edac/mce_amd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 0ee1c0a115a2b..5626e17a6b91a 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -88,7 +88,7 @@ static const char * const f15h_ic_mce_desc[] = { "Parity error for IC probe tag valid bit", "PFB non-cacheable bit parity error", "PFB valid bit parity error", /* xec = 0xd */ - "patch RAM", /* xec = 010 */ + "Microcode Patch Buffer", /* xec = 010 */ "uop queue", "insn buffer", "predecode buffer", @@ -354,7 +354,11 @@ static bool f15h_ic_mce(u16 ec, u8 xec) pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]); break; - case 0x10 ... 0x14: + case 0x10: + pr_cont("%s.\n", f15h_ic_mce_desc[xec-4]); + break; + + case 0x11 ... 
0x14: pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]); break; From b64a99c1752d2b6525a5011a8e473f8f8a4bdd79 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 23 Nov 2011 14:50:44 +0100 Subject: [PATCH 07/10] MCE, AMD: Correct VB data error description Sync with latest BKDG error types. Signed-off-by: Borislav Petkov Reviewed-by: Andreas Herrmann --- drivers/edac/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 5626e17a6b91a..bf6dd9978aa78 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -104,7 +104,7 @@ static const char * const f15h_cu_mce_desc[] = { "WCC Tag ECC error", "WCC Data ECC error", "WCB Data parity error", - "VB Data/ECC error", + "VB Data ECC or parity error", "L2 Tag ECC error", /* xec = 0x10 */ "Hard L2 Tag ECC error", "Multiple hits on L2 tag", From 68782673e6dd69054a9b75b0983a5e45e16f6625 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 24 Nov 2011 21:29:57 +0100 Subject: [PATCH 08/10] MCE, AMD: Rework NB MCE signatures Correct their formulation, replace per-family functions with a single, unified lookup table. 
Signed-off-by: Borislav Petkov Reviewed-by: Andreas Herrmann --- drivers/edac/mce_amd.c | 176 +++++++++++------------------------------ drivers/edac/mce_amd.h | 1 - 2 files changed, 48 insertions(+), 129 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index bf6dd9978aa78..f6ebe5e9a57f4 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -64,17 +64,6 @@ EXPORT_SYMBOL_GPL(to_msgs); const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; EXPORT_SYMBOL_GPL(ii_msgs); -static const char *f10h_nb_mce_desc[] = { - "HT link data error", - "Protocol error (link, L3, probe filter, etc.)", - "Parity error in NB-internal arrays", - "Link Retry due to IO link transmission error", - "L3 ECC data cache error", - "ECC error in L3 cache tag", - "L3 LRU parity bits error", - "ECC Error in the Probe Filter directory" -}; - static const char * const f15h_ic_mce_desc[] = { "UC during a demand linefill from L2", "Parity error during data load from IC", @@ -112,6 +101,28 @@ static const char * const f15h_cu_mce_desc[] = { "PRB address parity error" }; +static const char *nb_mce_desc[] = { + "DRAM ECC error detected on the NB", + "CRC error detected on HT link", + "Link-defined sync error packets detected on HT link", + "HT Master abort", + "HT Target abort", + "Invalid GART PTE entry during GART table walk", + "Unsupported atomic RMW received from an IO link", + "Watchdog timeout due to lack of progress", + "DRAM ECC error detected on the NB", + "SVM DMA Exclusion Vector error", + "HT data error detected on link", + "Protocol error (link, L3, probe filter)", + "NB internal arrays parity error", + "DRAM addr/ctl signals parity error", + "IO link transmission error", + "L3 data cache ECC error", /* xec = 0x1c */ + "L3 cache tag error", + "L3 LRU parity bits error", + "ECC Error in the Probe Filter directory" +}; + static const char * const fr_ex_mce_desc[] = { "CPU Watchdog timer expire", "Wakeup array dest tag", @@ -499,58 +510,31 @@ static void 
amd_decode_ls_mce(struct mce *m) pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); } -static bool k8_nb_mce(u16 ec, u8 xec) +void amd_decode_nb_mce(struct mce *m) { - bool ret = true; - - switch (xec) { - case 0x1: - pr_cont("CRC error detected on HT link.\n"); - break; - - case 0x5: - pr_cont("Invalid GART PTE entry during GART table walk.\n"); - break; - - case 0x6: - pr_cont("Unsupported atomic RMW received from an IO link.\n"); - break; - - case 0x0: - case 0x8: - if (boot_cpu_data.x86 == 0x11) - return false; - - pr_cont("DRAM ECC error detected on the NB.\n"); - break; - - case 0xd: - pr_cont("Parity error on the DRAM addr/ctl signals.\n"); - break; - - default: - ret = false; - break; - } + struct cpuinfo_x86 *c = &boot_cpu_data; + int node_id = amd_get_nb_id(m->extcpu); + u16 ec = EC(m->status); + u8 xec = XEC(m->status, 0x1f); + u8 offset = 0; - return ret; -} + pr_emerg(HW_ERR "Northbridge Error (node %d): ", node_id); -static bool f10h_nb_mce(u16 ec, u8 xec) -{ - bool ret = true; - u8 offset = 0; + switch (xec) { + case 0x0 ... 0xe: - if (k8_nb_mce(ec, xec)) - return true; + /* special handling for DRAM ECCs */ + if (xec == 0x0 || xec == 0x8) { + /* no ECCs on F11h */ + if (c->x86 == 0x11) + goto wrong_nb_mce; - switch(xec) { - case 0xa ... 0xc: - offset = 10; - break; + pr_cont("%s.\n", nb_mce_desc[xec]); - case 0xe: - offset = 11; + if (nb_bus_decoder) + nb_bus_decoder(node_id, m); + return; + } break; case 0xf: @@ -559,83 +543,25 @@ static bool f10h_nb_mce(u16 ec, u8 xec) else if (BUS_ERROR(ec)) pr_cont("DMA Exclusion Vector Table Walk error.\n"); else - ret = false; - - goto out; - break; + goto wrong_nb_mce; + return; case 0x19: if (boot_cpu_data.x86 == 0x15) pr_cont("Compute Unit Data Error.\n"); else - ret = false; - - goto out; - break; + goto wrong_nb_mce; + return; case 0x1c ... 
0x1f: - offset = 24; + offset = 13; break; default: - ret = false; - - goto out; - break; - } - - pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]); - -out: - return ret; -} - -static bool nb_noop_mce(u16 ec, u8 xec) -{ - return false; -} - -void amd_decode_nb_mce(struct mce *m) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - int node_id = amd_get_nb_id(m->extcpu); - u16 ec = EC(m->status); - u8 xec = XEC(m->status, 0x1f); - - pr_emerg(HW_ERR "Northbridge Error (node %d): ", node_id); - - switch (xec) { - case 0x2: - pr_cont("Sync error (sync packets on HT link detected).\n"); - return; - - case 0x3: - pr_cont("HT Master abort.\n"); - return; - - case 0x4: - pr_cont("HT Target abort.\n"); - return; - - case 0x7: - pr_cont("NB Watchdog timeout.\n"); - return; - - case 0x9: - pr_cont("SVM DMA Exclusion Vector error.\n"); - return; - - default: - break; - } - - if (!fam_ops->nb_mce(ec, xec)) goto wrong_nb_mce; + } - if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x15) - if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder) - nb_bus_decoder(node_id, m); - + pr_cont("%s.\n", nb_mce_desc[xec - offset]); return; wrong_nb_mce: @@ -844,39 +770,33 @@ static int __init mce_amd_init(void) case 0xf: fam_ops->dc_mce = k8_dc_mce; fam_ops->ic_mce = k8_ic_mce; - fam_ops->nb_mce = k8_nb_mce; break; case 0x10: fam_ops->dc_mce = f10h_dc_mce; fam_ops->ic_mce = k8_ic_mce; - fam_ops->nb_mce = f10h_nb_mce; break; case 0x11: fam_ops->dc_mce = k8_dc_mce; fam_ops->ic_mce = k8_ic_mce; - fam_ops->nb_mce = f10h_nb_mce; break; case 0x12: fam_ops->dc_mce = f12h_dc_mce; fam_ops->ic_mce = k8_ic_mce; - fam_ops->nb_mce = nb_noop_mce; break; case 0x14: nb_err_cpumask = 0x3; fam_ops->dc_mce = f14h_dc_mce; fam_ops->ic_mce = f14h_ic_mce; - fam_ops->nb_mce = nb_noop_mce; break; case 0x15: xec_mask = 0x1f; fam_ops->dc_mce = f15h_dc_mce; fam_ops->ic_mce = f15h_ic_mce; - fam_ops->nb_mce = f10h_nb_mce; break; default: diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 0106747e240c7..6fcf599e691f8 
100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -82,7 +82,6 @@ extern const char *ii_msgs[]; struct amd_decoder_ops { bool (*dc_mce)(u16, u8); bool (*ic_mce)(u16, u8); - bool (*nb_mce)(u16, u8); }; void amd_report_gart_errors(bool); From ae615b4b5f0b875cbe8a029239436c6aed8c0ef4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 25 Nov 2011 15:42:59 +0100 Subject: [PATCH 09/10] MCE, AMD: Correct bank 5 error signatures ... and remove superfluous ErrorCodeExt check. Signed-off-by: Borislav Petkov Reviewed-by: Andreas Herrmann --- drivers/edac/mce_amd.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index f6ebe5e9a57f4..88a92974b78c6 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -136,7 +136,7 @@ static const char * const fr_ex_mce_desc[] = { "Physical register file AG0 port", "Physical register file AG1 port", "Flag register file", - "DE correctable error could not be corrected" + "DE error occurred" }; static bool f12h_dc_mce(u16 ec, u8 xec) @@ -577,9 +577,6 @@ static void amd_decode_fr_mce(struct mce *m) if (c->x86 == 0xf || c->x86 == 0x11) goto wrong_fr_mce; - if (c->x86 != 0x15 && xec != 0x0) - goto wrong_fr_mce; - pr_emerg(HW_ERR "%s Error: ", (c->x86 == 0x15 ? "Execution Unit" : "FIROB")); From ebe2aea86872622d4352cd71d55298fedf69a7bb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 Nov 2011 19:03:25 +0100 Subject: [PATCH 10/10] MCE, AMD: Constify error tables ... so that checkpatch can chill out. 
Signed-off-by: Borislav Petkov Reviewed-by: Andreas Herrmann --- drivers/edac/mce_amd.c | 14 +++++++------- drivers/edac/mce_amd.h | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 88a92974b78c6..36e1486eb9aa0 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -39,29 +39,29 @@ EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); */ /* transaction type */ -const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; +const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; EXPORT_SYMBOL_GPL(tt_msgs); /* cache level */ -const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; +const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; EXPORT_SYMBOL_GPL(ll_msgs); /* memory transaction type */ -const char *rrrr_msgs[] = { +const char * const rrrr_msgs[] = { "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP" }; EXPORT_SYMBOL_GPL(rrrr_msgs); /* participating processor */ -const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; +const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; EXPORT_SYMBOL_GPL(pp_msgs); /* request timeout */ -const char *to_msgs[] = { "no timeout", "timed out" }; +const char * const to_msgs[] = { "no timeout", "timed out" }; EXPORT_SYMBOL_GPL(to_msgs); /* memory or i/o */ -const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; +const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; EXPORT_SYMBOL_GPL(ii_msgs); static const char * const f15h_ic_mce_desc[] = { @@ -101,7 +101,7 @@ static const char * const f15h_cu_mce_desc[] = { "PRB address parity error" }; -static const char *nb_mce_desc[] = { +static const char * const nb_mce_desc[] = { "DRAM ECC error detected on the NB", "CRC error detected on HT link", "Link-defined sync error packets detected on HT link", diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 6fcf599e691f8..c6074c5cd1ef4 100644 --- a/drivers/edac/mce_amd.h +++ 
b/drivers/edac/mce_amd.h @@ -69,12 +69,12 @@ enum rrrr_ids { R4_SNOOP, }; -extern const char *tt_msgs[]; -extern const char *ll_msgs[]; -extern const char *rrrr_msgs[]; -extern const char *pp_msgs[]; -extern const char *to_msgs[]; -extern const char *ii_msgs[]; +extern const char * const tt_msgs[]; +extern const char * const ll_msgs[]; +extern const char * const rrrr_msgs[]; +extern const char * const pp_msgs[]; +extern const char * const to_msgs[]; +extern const char * const ii_msgs[]; /* * per-family decoder ops