From c67ea7d22eeb133363e8227342cf14cbf6eaa9a0 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Wed, 4 Jan 2023 14:15:11 +0530 Subject: [PATCH 01/15] dt-bindings: edac: Add bindings for Xilinx ZynqMP OCM Add bindings for Xilinx ZynqMP OCM controller. Signed-off-by: Shubhrajyoti Datta Co-developed-by: Sai Krishna Potthuri Signed-off-by: Sai Krishna Potthuri Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20230104084512.1855243-2-sai.krishna.potthuri@amd.com --- .../xlnx,zynqmp-ocmc-1.0.yaml | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 Documentation/devicetree/bindings/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml diff --git a/Documentation/devicetree/bindings/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml b/Documentation/devicetree/bindings/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml new file mode 100644 index 0000000000000..ca9fc747bf4fe --- /dev/null +++ b/Documentation/devicetree/bindings/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Xilinx Zynqmp OCM(On-Chip Memory) Controller + +maintainers: + - Shubhrajyoti Datta + - Sai Krishna Potthuri + +description: | + The OCM supports 64-bit wide ECC functionality to detect multi-bit errors + and recover from a single-bit memory fault.On a write, if all bytes are + being written, the ECC is generated and written into the ECC RAM along with + the write-data that is written into the data RAM. If one or more bytes are + not written, then the read operation results in an correctable error or + uncorrectable error. + +properties: + compatible: + const: xlnx,zynqmp-ocmc-1.0 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + +additionalProperties: false + +examples: + - | + #include + memory-controller@ff960000 { + compatible = "xlnx,zynqmp-ocmc-1.0"; + reg = <0xff960000 0x1000>; + interrupts = ; + }; From 3bd2706c910fd328e4ab96ae0aabdcd7c4a90fbf Mon Sep 17 00:00:00 2001 From: Sai Krishna Potthuri Date: Wed, 4 Jan 2023 14:15:12 +0530 Subject: [PATCH 02/15] EDAC/zynqmp: Add EDAC support for Xilinx ZynqMP OCM Add EDAC support for Xilinx ZynqMP OCM Controller, so this driver reports CE and UE errors upon interrupt generation. Also add debugfs files for error injection. On Xilinx ZynqMP platform, both OCM Controller driver(zynqmp_edac) and DDR Memory Controller driver(synopsys_edac) co-exist which means both can be loaded at a time. This scenario is tested on Xilinx ZynqMP platform. Fix following issue reported by the robot: "MAINTAINERS references a file that doesn't exist: Documentation/devicetree/bindings/edac/xlnx,zynqmp-ocmc.yaml" [ bp: - Massage commit message - s/EDAC_ZYNQMP_OCM/EDAC_ZYNQMP/ - Touchups ] Reported-by: kernel test robot Co-developed-by: Shubhrajyoti Datta Signed-off-by: Shubhrajyoti Datta Signed-off-by: Sai Krishna Potthuri Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230104084512.1855243-3-sai.krishna.potthuri@amd.com --- MAINTAINERS | 7 + drivers/edac/Kconfig | 8 + drivers/edac/Makefile | 1 + drivers/edac/zynqmp_edac.c | 467 +++++++++++++++++++++++++++++++++++++ 4 files changed, 483 insertions(+) create mode 100644 drivers/edac/zynqmp_edac.c diff --git a/MAINTAINERS b/MAINTAINERS index a36df9ed283d3..27cc2a00cc753 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22908,6 +22908,13 @@ F: Documentation/devicetree/bindings/dma/xilinx/xlnx,zynqmp-dpdma.yaml F: drivers/dma/xilinx/xilinx_dpdma.c F: include/dt-bindings/dma/xlnx-zynqmp-dpdma.h +XILINX ZYNQMP OCM EDAC DRIVER +M: Shubhrajyoti Datta +M: Sai Krishna Potthuri +S: Maintained +F: Documentation/devicetree/bindings/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml +F: drivers/edac/zynqmp_edac.c + XILINX ZYNQMP PSGTR PHY DRIVER M: Anurag Kumar Vulisha M: Laurent Pinchart diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 4cfdefbd744da..68f576700911f 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -542,4 +542,12 @@ config EDAC_DMC520 Support for error detection and correction on the SoCs with ARM DMC-520 DRAM controller. +config EDAC_ZYNQMP + tristate "Xilinx ZynqMP OCM Controller" + depends on ARCH_ZYNQMP || COMPILE_TEST + help + This driver supports error detection and correction for the + Xilinx ZynqMP OCM (On Chip Memory) controller. It can also be + built as a module. In that case it will be called zynqmp_edac. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 2d1641a27a28f..9b025c5b30617 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -84,3 +84,4 @@ obj-$(CONFIG_EDAC_QCOM) += qcom_edac.o obj-$(CONFIG_EDAC_ASPEED) += aspeed_edac.o obj-$(CONFIG_EDAC_BLUEFIELD) += bluefield_edac.o obj-$(CONFIG_EDAC_DMC520) += dmc520_edac.o +obj-$(CONFIG_EDAC_ZYNQMP) += zynqmp_edac.o diff --git a/drivers/edac/zynqmp_edac.c b/drivers/edac/zynqmp_edac.c new file mode 100644 index 0000000000000..ac7d1e0b324c7 --- /dev/null +++ b/drivers/edac/zynqmp_edac.c @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Xilinx ZynqMP OCM ECC Driver + * + * Copyright (C) 2022 Advanced Micro Devices, Inc. + */ + +#include +#include +#include +#include +#include +#include + +#include "edac_module.h" + +#define ZYNQMP_OCM_EDAC_MSG_SIZE 256 + +#define ZYNQMP_OCM_EDAC_STRING "zynqmp_ocm" + +/* Error/Interrupt registers */ +#define ERR_CTRL_OFST 0x0 +#define OCM_ISR_OFST 0x04 +#define OCM_IMR_OFST 0x08 +#define OCM_IEN_OFST 0x0C +#define OCM_IDS_OFST 0x10 + +/* ECC control register */ +#define ECC_CTRL_OFST 0x14 + +/* Correctable error info registers */ +#define CE_FFA_OFST 0x1C +#define CE_FFD0_OFST 0x20 +#define CE_FFD1_OFST 0x24 +#define CE_FFD2_OFST 0x28 +#define CE_FFD3_OFST 0x2C +#define CE_FFE_OFST 0x30 + +/* Uncorrectable error info registers */ +#define UE_FFA_OFST 0x34 +#define UE_FFD0_OFST 0x38 +#define UE_FFD1_OFST 0x3C +#define UE_FFD2_OFST 0x40 +#define UE_FFD3_OFST 0x44 +#define UE_FFE_OFST 0x48 + +/* ECC control register bit field definitions */ +#define ECC_CTRL_CLR_CE_ERR 0x40 +#define ECC_CTRL_CLR_UE_ERR 0x80 + +/* Fault injection data and count registers */ +#define OCM_FID0_OFST 0x4C +#define OCM_FID1_OFST 0x50 +#define OCM_FID2_OFST 0x54 +#define OCM_FID3_OFST 0x58 +#define OCM_FIC_OFST 0x74 + +#define UE_MAX_BITPOS_LOWER 31 +#define UE_MIN_BITPOS_UPPER 32 +#define UE_MAX_BITPOS_UPPER 63 + +/* Interrupt masks */ +#define OCM_CEINTR_MASK BIT(6) +#define OCM_UEINTR_MASK BIT(7) +#define OCM_ECC_ENABLE_MASK BIT(0) + +#define OCM_FICOUNT_MASK GENMASK(23, 0) +#define OCM_NUM_UE_BITPOS 2 +#define OCM_BASEVAL 0xFFFC0000 +#define EDAC_DEVICE "ZynqMP-OCM" + +/** + * struct ecc_error_info - ECC error log information + * @addr: Fault generated at this address + * @fault_lo: Generated fault data (lower 32-bit) + * @fault_hi: Generated fault data (upper 32-bit) + */ +struct ecc_error_info { + u32 addr; + u32 fault_lo; + u32 fault_hi; +}; + +/** + * struct ecc_status - ECC status information to report + * @ce_cnt: Correctable error count + * @ue_cnt: Uncorrectable error count + * @ceinfo: Correctable error log information + * @ueinfo: Uncorrectable error log information + */ +struct ecc_status { + u32 ce_cnt; + u32 ue_cnt; + struct ecc_error_info ceinfo; + struct ecc_error_info ueinfo; +}; + +/** + * struct edac_priv - OCM private instance data + * @baseaddr: Base address of the OCM + * @message: Buffer for framing the event specific info + * @stat: ECC status information + * @ce_cnt: Correctable Error count + * @ue_cnt: Uncorrectable Error count + * @debugfs_dir: Directory entry for debugfs + * @ce_bitpos: Bit position for Correctable Error + * @ue_bitpos: Array to store UnCorrectable Error bit positions + * @fault_injection_cnt: Fault Injection Counter value + */ +struct edac_priv { + void __iomem *baseaddr; + char message[ZYNQMP_OCM_EDAC_MSG_SIZE]; + struct ecc_status stat; + u32 ce_cnt; + u32 ue_cnt; +#ifdef CONFIG_EDAC_DEBUG + struct dentry *debugfs_dir; + u8 ce_bitpos; + u8 ue_bitpos[OCM_NUM_UE_BITPOS]; + u32 fault_injection_cnt; +#endif +}; + +/** + * get_error_info - Get the current ECC error info + * @base: Pointer to the base address of the OCM + * @p: Pointer to the OCM ECC status structure + * @mask: Status register mask value + * + * Determines there is any ECC error or not + * + */ +static void get_error_info(void __iomem *base, struct ecc_status *p, int mask) +{ + if (mask & OCM_CEINTR_MASK) { + p->ce_cnt++; + p->ceinfo.fault_lo = readl(base + CE_FFD0_OFST); + p->ceinfo.fault_hi = readl(base + CE_FFD1_OFST); + p->ceinfo.addr = (OCM_BASEVAL | readl(base + CE_FFA_OFST)); + writel(ECC_CTRL_CLR_CE_ERR, base + OCM_ISR_OFST); + } else if (mask & OCM_UEINTR_MASK) { + p->ue_cnt++; + p->ueinfo.fault_lo = readl(base + UE_FFD0_OFST); + p->ueinfo.fault_hi = readl(base + UE_FFD1_OFST); + p->ueinfo.addr = (OCM_BASEVAL | readl(base + UE_FFA_OFST)); + writel(ECC_CTRL_CLR_UE_ERR, base + OCM_ISR_OFST); + } +} + +/** + * handle_error - Handle error types CE and UE + * @dci: Pointer to the EDAC device instance + * @p: Pointer to the OCM ECC status structure + * + * Handles correctable and uncorrectable errors. + */ +static void handle_error(struct edac_device_ctl_info *dci, struct ecc_status *p) +{ + struct edac_priv *priv = dci->pvt_info; + struct ecc_error_info *pinf; + + if (p->ce_cnt) { + pinf = &p->ceinfo; + snprintf(priv->message, ZYNQMP_OCM_EDAC_MSG_SIZE, + "\nOCM ECC error type :%s\nAddr: [0x%x]\nFault Data[0x%08x%08x]", + "CE", pinf->addr, pinf->fault_hi, pinf->fault_lo); + edac_device_handle_ce(dci, 0, 0, priv->message); + } + + if (p->ue_cnt) { + pinf = &p->ueinfo; + snprintf(priv->message, ZYNQMP_OCM_EDAC_MSG_SIZE, + "\nOCM ECC error type :%s\nAddr: [0x%x]\nFault Data[0x%08x%08x]", + "UE", pinf->addr, pinf->fault_hi, pinf->fault_lo); + edac_device_handle_ue(dci, 0, 0, priv->message); + } + + memset(p, 0, sizeof(*p)); +} + +/** + * intr_handler - ISR routine + * @irq: irq number + * @dev_id: device id pointer + * + * Return: IRQ_NONE, if CE/UE interrupt not set or IRQ_HANDLED otherwise + */ +static irqreturn_t intr_handler(int irq, void *dev_id) +{ + struct edac_device_ctl_info *dci = dev_id; + struct edac_priv *priv = dci->pvt_info; + int regval; + + regval = readl(priv->baseaddr + OCM_ISR_OFST); + if (!(regval & (OCM_CEINTR_MASK | OCM_UEINTR_MASK))) { + WARN_ONCE(1, "Unhandled IRQ%d, ISR: 0x%x", irq, regval); + return IRQ_NONE; + } + + get_error_info(priv->baseaddr, &priv->stat, regval); + + priv->ce_cnt += priv->stat.ce_cnt; + priv->ue_cnt += priv->stat.ue_cnt; + handle_error(dci, &priv->stat); + + return IRQ_HANDLED; +} + +/** + * get_eccstate - Return the ECC status + * @base: Pointer to the OCM base address + * + * Get the ECC enable/disable status + * + * Return: ECC status 0/1. + */ +static bool get_eccstate(void __iomem *base) +{ + return readl(base + ECC_CTRL_OFST) & OCM_ECC_ENABLE_MASK; +} + +#ifdef CONFIG_EDAC_DEBUG +/** + * write_fault_count - write fault injection count + * @priv: Pointer to the EDAC private struct + * + * Update the fault injection count register, once the counter reaches + * zero, it injects errors + */ +static void write_fault_count(struct edac_priv *priv) +{ + u32 ficount = priv->fault_injection_cnt; + + if (ficount & ~OCM_FICOUNT_MASK) { + ficount &= OCM_FICOUNT_MASK; + edac_printk(KERN_INFO, EDAC_DEVICE, + "Fault injection count value truncated to %d\n", ficount); + } + + writel(ficount, priv->baseaddr + OCM_FIC_OFST); +} + +/* + * To get the Correctable Error injected, the following steps are needed: + * - Setup the optional Fault Injection Count: + * echo > /sys/kernel/debug/edac/ocm/inject_fault_count + * - Write the Correctable Error bit position value: + * echo > /sys/kernel/debug/edac/ocm/inject_ce_bitpos + */ +static ssize_t inject_ce_write(struct file *file, const char __user *data, + size_t count, loff_t *ppos) +{ + struct edac_device_ctl_info *edac_dev = file->private_data; + struct edac_priv *priv = edac_dev->pvt_info; + int ret; + + if (!data) + return -EFAULT; + + ret = kstrtou8_from_user(data, count, 0, &priv->ce_bitpos); + if (ret) + return ret; + + if (priv->ce_bitpos > UE_MAX_BITPOS_UPPER) + return -EINVAL; + + if (priv->ce_bitpos <= UE_MAX_BITPOS_LOWER) { + writel(BIT(priv->ce_bitpos), priv->baseaddr + OCM_FID0_OFST); + writel(0, priv->baseaddr + OCM_FID1_OFST); + } else { + writel(BIT(priv->ce_bitpos - UE_MIN_BITPOS_UPPER), + priv->baseaddr + OCM_FID1_OFST); + writel(0, priv->baseaddr + OCM_FID0_OFST); + } + + write_fault_count(priv); + + return count; +} + +static const struct file_operations inject_ce_fops = { + .open = simple_open, + .write = inject_ce_write, + .llseek = generic_file_llseek, +}; + +/* + * To get the Uncorrectable Error injected, the following steps are needed: + * - Setup the optional Fault Injection Count: + * echo > /sys/kernel/debug/edac/ocm/inject_fault_count + * - Write the Uncorrectable Error bit position values: + * echo , > /sys/kernel/debug/edac/ocm/inject_ue_bitpos + */ +static ssize_t inject_ue_write(struct file *file, const char __user *data, + size_t count, loff_t *ppos) +{ + struct edac_device_ctl_info *edac_dev = file->private_data; + struct edac_priv *priv = edac_dev->pvt_info; + char buf[6], *pbuf, *token[2]; + u64 ue_bitpos; + int i, ret; + u8 len; + + if (!data) + return -EFAULT; + + len = min_t(size_t, count, sizeof(buf)); + if (copy_from_user(buf, data, len)) + return -EFAULT; + + buf[len] = '\0'; + pbuf = &buf[0]; + for (i = 0; i < OCM_NUM_UE_BITPOS; i++) + token[i] = strsep(&pbuf, ","); + + ret = kstrtou8(token[0], 0, &priv->ue_bitpos[0]); + if (ret) + return ret; + + ret = kstrtou8(token[1], 0, &priv->ue_bitpos[1]); + if (ret) + return ret; + + if (priv->ue_bitpos[0] > UE_MAX_BITPOS_UPPER || + priv->ue_bitpos[1] > UE_MAX_BITPOS_UPPER) + return -EINVAL; + + if (priv->ue_bitpos[0] == priv->ue_bitpos[1]) { + edac_printk(KERN_ERR, EDAC_DEVICE, "Bit positions should not be equal\n"); + return -EINVAL; + } + + ue_bitpos = BIT(priv->ue_bitpos[0]) | BIT(priv->ue_bitpos[1]); + + writel((u32)ue_bitpos, priv->baseaddr + OCM_FID0_OFST); + writel((u32)(ue_bitpos >> 32), priv->baseaddr + OCM_FID1_OFST); + + write_fault_count(priv); + + return count; +} + +static const struct file_operations inject_ue_fops = { + .open = simple_open, + .write = inject_ue_write, + .llseek = generic_file_llseek, +}; + +static void setup_debugfs(struct edac_device_ctl_info *edac_dev) +{ + struct edac_priv *priv = edac_dev->pvt_info; + + priv->debugfs_dir = edac_debugfs_create_dir("ocm"); + if (!priv->debugfs_dir) + return; + + edac_debugfs_create_x32("inject_fault_count", 0644, priv->debugfs_dir, + &priv->fault_injection_cnt); + edac_debugfs_create_file("inject_ue_bitpos", 0644, priv->debugfs_dir, + edac_dev, &inject_ue_fops); + edac_debugfs_create_file("inject_ce_bitpos", 0644, priv->debugfs_dir, + edac_dev, &inject_ce_fops); +} +#endif + +static int edac_probe(struct platform_device *pdev) +{ + struct edac_device_ctl_info *dci; + struct edac_priv *priv; + void __iomem *baseaddr; + struct resource *res; + int irq, ret; + + baseaddr = devm_platform_get_and_ioremap_resource(pdev, 0, &res); + if (IS_ERR(baseaddr)) + return PTR_ERR(baseaddr); + + if (!get_eccstate(baseaddr)) { + edac_printk(KERN_INFO, EDAC_DEVICE, "ECC not enabled\n"); + return -ENXIO; + } + + dci = edac_device_alloc_ctl_info(sizeof(*priv), ZYNQMP_OCM_EDAC_STRING, + 1, ZYNQMP_OCM_EDAC_STRING, 1, 0, NULL, 0, + edac_device_alloc_index()); + if (!dci) + return -ENOMEM; + + priv = dci->pvt_info; + platform_set_drvdata(pdev, dci); + dci->dev = &pdev->dev; + priv->baseaddr = baseaddr; + dci->mod_name = pdev->dev.driver->name; + dci->ctl_name = ZYNQMP_OCM_EDAC_STRING; + dci->dev_name = dev_name(&pdev->dev); + + irq = platform_get_irq(pdev, 0); + if (irq < 0) { + ret = irq; + goto free_dev_ctl; + } + + ret = devm_request_irq(&pdev->dev, irq, intr_handler, 0, + dev_name(&pdev->dev), dci); + if (ret) { + edac_printk(KERN_ERR, EDAC_DEVICE, "Failed to request Irq\n"); + goto free_dev_ctl; + } + + /* Enable UE, CE interrupts */ + writel((OCM_CEINTR_MASK | OCM_UEINTR_MASK), priv->baseaddr + OCM_IEN_OFST); + +#ifdef CONFIG_EDAC_DEBUG + setup_debugfs(dci); +#endif + + ret = edac_device_add_device(dci); + if (ret) + goto free_dev_ctl; + + return 0; + +free_dev_ctl: + edac_device_free_ctl_info(dci); + + return ret; +} + +static int edac_remove(struct platform_device *pdev) +{ + struct edac_device_ctl_info *dci = platform_get_drvdata(pdev); + struct edac_priv *priv = dci->pvt_info; + + /* Disable UE, CE interrupts */ + writel((OCM_CEINTR_MASK | OCM_UEINTR_MASK), priv->baseaddr + OCM_IDS_OFST); + +#ifdef CONFIG_EDAC_DEBUG + debugfs_remove_recursive(priv->debugfs_dir); +#endif + + edac_device_del_device(&pdev->dev); + edac_device_free_ctl_info(dci); + + return 0; +} + +static const struct of_device_id zynqmp_ocm_edac_match[] = { + { .compatible = "xlnx,zynqmp-ocmc-1.0"}, + { /* end of table */ } +}; + +MODULE_DEVICE_TABLE(of, zynqmp_ocm_edac_match); + +static struct platform_driver zynqmp_ocm_edac_driver = { + .driver = { + .name = "zynqmp-ocm-edac", + .of_match_table = zynqmp_ocm_edac_match, + }, + .probe = edac_probe, + .remove = edac_remove, +}; + +module_platform_driver(zynqmp_ocm_edac_driver); + +MODULE_AUTHOR("Advanced Micro Devices, Inc"); +MODULE_DESCRIPTION("Xilinx ZynqMP OCM ECC driver"); +MODULE_LICENSE("GPL"); From 8d8fcc391f50e2d9686d42b85e9b1db89b1bbb35 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Wed, 18 Jan 2023 20:38:49 +0530 Subject: [PATCH 03/15] EDAC/qcom: Add platform_device_id table for module autoloading Add a device ID table so that the driver loads automatically when the associated platform_device gets registered. Reported-by: Andrew Halaney Signed-off-by: Manivannan Sadhasivam Signed-off-by: Borislav Petkov (AMD) Tested-by: Steev Klimaszewski # Thinkpad X13s Tested-by: Andrew Halaney # sa8540p-ride Link: https://lore.kernel.org/r/20230118150904.26913-3-manivannan.sadhasivam@linaro.org --- drivers/edac/qcom_edac.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/edac/qcom_edac.c b/drivers/edac/qcom_edac.c index 97a27e42dd610..9e77fa84e84fb 100644 --- a/drivers/edac/qcom_edac.c +++ b/drivers/edac/qcom_edac.c @@ -397,12 +397,19 @@ static int qcom_llcc_edac_remove(struct platform_device *pdev) return 0; } +static const struct platform_device_id qcom_llcc_edac_id_table[] = { + { .name = "qcom_llcc_edac" }, + {} +}; +MODULE_DEVICE_TABLE(platform, qcom_llcc_edac_id_table); + static struct platform_driver qcom_llcc_edac_driver = { .probe = qcom_llcc_edac_probe, .remove = qcom_llcc_edac_remove, .driver = { .name = "qcom_llcc_edac", }, + .id_table = qcom_llcc_edac_id_table, }; module_platform_driver(qcom_llcc_edac_driver); From 6e8746cb735166eaf3ceb086b31bb0431f5e3532 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 13 Jan 2023 11:27:58 +0800 Subject: [PATCH 04/15] EDAC/skx_common: Enable EDAC support for the "near" memory The current {skx,i10nm}_edac miss the EDAC support to decode errors from the 1st level memory (the fast "near" memory as cache) of the 2-level memory system. Introduce a helper function skx_error_in_mem() to check whether errors are from memory at the beginning of skx_mce_check_error(). As long as the errors are from memory (either the 1-level memory system or the 2-level memory system), decode the errors. Reported-and-tested-by: Youquan Song Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20230113032802.41752-1-qiuxu.zhuo@intel.com --- drivers/edac/skx_common.c | 18 ++++++++++++------ drivers/edac/skx_common.h | 24 ++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index f0f8e98f6efb2..5fdcb6d5b96b6 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -632,12 +632,18 @@ static bool skx_error_in_1st_level_mem(const struct mce *m) if (!skx_mem_cfg_2lm) return false; - errcode = GET_BITFIELD(m->status, 0, 15); + errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK; - if ((errcode & 0xef80) != 0x280) - return false; + return errcode == MCACOD_EXT_MEM_ERR; +} - return true; +static bool skx_error_in_mem(const struct mce *m) +{ + u32 errcode; + + errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK; + + return (errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR); } int skx_mce_check_error(struct notifier_block *nb, unsigned long val, @@ -651,8 +657,8 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, if (mce->kflags & MCE_HANDLED_CEC) return NOTIFY_DONE; - /* ignore unless this is memory related with an address */ - if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV)) + /* Ignore unless this is memory related with an address */ + if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV)) return NOTIFY_DONE; memset(&res, 0, sizeof(res)); diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 0cbadd3d2cd39..3120326572649 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -56,6 +56,30 @@ #define MCI_MISC_ECC_MODE(m) (((m) >> 59) & 15) #define MCI_MISC_ECC_DDRT 8 /* read from DDRT */ +/* + * According to Intel Architecture spec vol 3B, + * Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding" + * memory errors should fit one of these masks: + * 000f 0000 1mmm cccc (binary) + * 000f 0010 1mmm cccc (binary) [RAM used as cache] + * where: + * f = Correction Report Filtering Bit. If 1, subsequent errors + * won't be shown + * mmm = error type + * cccc = channel + */ +#define MCACOD_MEM_ERR_MASK 0xef80 +/* + * Errors from either the memory of the 1-level memory system or the + * 2nd level memory (the slow "far" memory) of the 2-level memory system. + */ +#define MCACOD_MEM_CTL_ERR 0x80 +/* + * Errors from the 1st level memory (the fast "near" memory as cache) + * of the 2-level memory system. + */ +#define MCACOD_EXT_MEM_ERR 0x280 + /* * Each cpu socket contains some pci devices that provide global * information, and also some that are local to each of the two From d2415e2e5330fc11f4c688fa518751bdc90259f5 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 13 Jan 2023 11:27:59 +0800 Subject: [PATCH 05/15] EDAC/skx_common: Delete duplicated and unreachable code skx_mce_check_error() returns early if the error isn't from memory. So when skx_mce_output_error() is invoked from skx_mce_check_error(), it doesn't need to re-check whether the error is from memory. Delete the duplicated and unreachable code from skx_mce_output_error(). Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20230113032802.41752-1-qiuxu.zhuo@intel.com --- drivers/edac/skx_common.c | 58 ++++++++++++++------------------------- 1 file changed, 21 insertions(+), 37 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 5fdcb6d5b96b6..fd8186fe005c8 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -560,44 +560,28 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, tp_event = HW_EVENT_ERR_CORRECTED; } - /* - * According to Intel Architecture spec vol 3B, - * Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding" - * memory errors should fit one of these masks: - * 000f 0000 1mmm cccc (binary) - * 000f 0010 1mmm cccc (binary) [RAM used as cache] - * where: - * f = Correction Report Filtering Bit. If 1, subsequent errors - * won't be shown - * mmm = error type - * cccc = channel - * If the mask doesn't match, report an error to the parsing logic - */ - if (!((errcode & 0xef80) == 0x80 || (errcode & 0xef80) == 0x280)) { - optype = "Can't parse: it is not a mem"; - } else { - switch (optypenum) { - case 0: - optype = "generic undef request error"; - break; - case 1: - optype = "memory read error"; - break; - case 2: - optype = "memory write error"; - break; - case 3: - optype = "addr/cmd error"; - break; - case 4: - optype = "memory scrubbing error"; - scrub_err = true; - break; - default: - optype = "reserved"; - break; - } + switch (optypenum) { + case 0: + optype = "generic undef request error"; + break; + case 1: + optype = "memory read error"; + break; + case 2: + optype = "memory write error"; + break; + case 3: + optype = "addr/cmd error"; + break; + case 4: + optype = "memory scrubbing error"; + scrub_err = true; + break; + default: + optype = "reserved"; + break; } + if (res->decoded_by_adxl) { len = snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s", overflow ? " OVERFLOW" : "", From e4b2bc6616e21f4a7ce4e7452f716e3db8fe66b6 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 13 Jan 2023 11:28:00 +0800 Subject: [PATCH 06/15] EDAC/i10nm: Add Intel Emerald Rapids server support The Emerald Rapids CPU model uses similar memory controller registers as Sapphire Rapids server. Add Emerald Rapids CPU model number ID for EDAC support. Tested-by: Li Zhang Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20230113032802.41752-1-qiuxu.zhuo@intel.com --- drivers/edac/i10nm_base.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 65aeea53e2df4..e11726f7fe365 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -637,6 +637,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0xf), &i10nm_cfg1), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x0, 0xf), &i10nm_cfg1), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SAPPHIRERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(EMERALDRAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), {} }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); From dd7814b78539416c6e561eeaa0951b3e88ac799e Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 13 Jan 2023 11:28:01 +0800 Subject: [PATCH 07/15] EDAC/i10nm: Make more configurations CPU model specific The numbers of memory controllers per socket, channels per memory controller, DIMMs per channel and the triples of bus/device/function of PCI devices used in i10nm_edac can be CPU model specific. Add new fields to the structure res_config for above numbers and triples to make them CPU model specific. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20230113032802.41752-1-qiuxu.zhuo@intel.com --- drivers/edac/i10nm_base.c | 131 ++++++++++++++++++++++++++------------ drivers/edac/skx_common.h | 32 +++++++++- 2 files changed, 121 insertions(+), 42 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index e11726f7fe365..5682d64cceb6e 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -148,35 +148,47 @@ static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable static void enable_retry_rd_err_log(bool enable) { + int i, j, imc_num, chan_num; struct skx_imc *imc; struct skx_dev *d; - int i, j; edac_dbg(2, "\n"); - list_for_each_entry(d, i10nm_edac_list, list) - for (i = 0; i < I10NM_NUM_IMC; i++) { + list_for_each_entry(d, i10nm_edac_list, list) { + imc_num = res_cfg->ddr_imc_num; + chan_num = res_cfg->ddr_chan_num; + + for (i = 0; i < imc_num; i++) { imc = &d->imc[i]; if (!imc->mbase) continue; - for (j = 0; j < I10NM_NUM_CHANNELS; j++) { - if (imc->hbm_mc) { - __enable_retry_rd_err_log(imc, j, enable, - res_cfg->offsets_scrub_hbm0, - res_cfg->offsets_demand_hbm0, - NULL); - __enable_retry_rd_err_log(imc, j, enable, - res_cfg->offsets_scrub_hbm1, - res_cfg->offsets_demand_hbm1, - NULL); - } else { - __enable_retry_rd_err_log(imc, j, enable, - res_cfg->offsets_scrub, - res_cfg->offsets_demand, - res_cfg->offsets_demand2); - } + for (j = 0; j < chan_num; j++) + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub, + res_cfg->offsets_demand, + res_cfg->offsets_demand2); + } + + imc_num += res_cfg->hbm_imc_num; + chan_num = res_cfg->hbm_chan_num; + + for (; i < imc_num; i++) { + imc = &d->imc[i]; + if (!imc->mbase || !imc->hbm_mc) + continue; + + for (j = 0; j < chan_num; j++) { + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub_hbm0, + res_cfg->offsets_demand_hbm0, + NULL); + __enable_retry_rd_err_log(imc, j, enable, + res_cfg->offsets_scrub_hbm1, + res_cfg->offsets_demand_hbm1, + NULL); } + } } } @@ -318,9 +330,9 @@ static bool i10nm_check_2lm(struct res_config *cfg) int i; list_for_each_entry(d, i10nm_edac_list, list) { - d->sad_all = pci_get_dev_wrapper(d->seg, d->bus[1], - PCI_SLOT(cfg->sad_all_devfn), - PCI_FUNC(cfg->sad_all_devfn)); + d->sad_all = pci_get_dev_wrapper(d->seg, d->bus[res_cfg->sad_all_bdf.bus], + res_cfg->sad_all_bdf.dev, + res_cfg->sad_all_bdf.fun); if (!d->sad_all) continue; @@ -444,11 +456,15 @@ static int i10nm_get_ddr_munits(void) u64 base; list_for_each_entry(d, i10nm_edac_list, list) { - d->util_all = pci_get_dev_wrapper(d->seg, d->bus[1], 29, 1); + d->util_all = pci_get_dev_wrapper(d->seg, d->bus[res_cfg->util_all_bdf.bus], + res_cfg->util_all_bdf.dev, + res_cfg->util_all_bdf.fun); if (!d->util_all) return -ENODEV; - d->uracu = pci_get_dev_wrapper(d->seg, d->bus[0], 0, 1); + d->uracu = pci_get_dev_wrapper(d->seg, d->bus[res_cfg->uracu_bdf.bus], + res_cfg->uracu_bdf.dev, + res_cfg->uracu_bdf.fun); if (!d->uracu) return -ENODEV; @@ -461,9 +477,10 @@ static int i10nm_get_ddr_munits(void) edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n", j++, base, reg); - for (i = 0; i < I10NM_NUM_DDR_IMC; i++) { - mdev = pci_get_dev_wrapper(d->seg, d->bus[0], - 12 + i, 0); + for (i = 0; i < res_cfg->ddr_imc_num; i++) { + mdev = pci_get_dev_wrapper(d->seg, d->bus[res_cfg->ddr_mdev_bdf.bus], + res_cfg->ddr_mdev_bdf.dev + i, + res_cfg->ddr_mdev_bdf.fun); if (i == 0 && !mdev) { i10nm_printk(KERN_ERR, "No IMC found\n"); return -ENODEV; @@ -519,7 +536,9 @@ static int i10nm_get_hbm_munits(void) u64 base; list_for_each_entry(d, i10nm_edac_list, list) { - d->pcu_cr3 = pci_get_dev_wrapper(d->seg, d->bus[1], 30, 3); + d->pcu_cr3 = pci_get_dev_wrapper(d->seg, d->bus[res_cfg->pcu_cr3_bdf.bus], + res_cfg->pcu_cr3_bdf.dev, + res_cfg->pcu_cr3_bdf.fun); if (!d->pcu_cr3) return -ENODEV; @@ -540,11 +559,13 @@ static int i10nm_get_hbm_munits(void) } base += I10NM_GET_HBM_IMC_MMIO_OFFSET(reg); - lmc = I10NM_NUM_DDR_IMC; + lmc = res_cfg->ddr_imc_num; + + for (i = 0; i < res_cfg->hbm_imc_num; i++) { + mdev = pci_get_dev_wrapper(d->seg, d->bus[res_cfg->hbm_mdev_bdf.bus], + res_cfg->hbm_mdev_bdf.dev + i / 4, + res_cfg->hbm_mdev_bdf.fun + i % 4); - for (i = 0; i < I10NM_NUM_HBM_IMC; i++) { - mdev = pci_get_dev_wrapper(d->seg, d->bus[0], - 12 + i / 4, 1 + i % 4); if (i == 0 && !mdev) { i10nm_printk(KERN_ERR, "No hbm mc found\n"); return -ENODEV; @@ -594,8 +615,16 @@ static struct res_config i10nm_cfg0 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xcc, + .ddr_imc_num = 4, + .ddr_chan_num = 2, + .ddr_dimm_num = 2, .ddr_chan_mmio_sz = 0x4000, - .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_bdf = {1, 29, 0}, + .pcu_cr3_bdf = {1, 30, 3}, + .util_all_bdf = {1, 29, 1}, + .uracu_bdf = {0, 0, 1}, + .ddr_mdev_bdf = {0, 12, 0}, + .hbm_mdev_bdf = {0, 12, 1}, .sad_all_offset = 0x108, .offsets_scrub = offsets_scrub_icx, .offsets_demand = offsets_demand_icx, @@ -605,8 +634,16 @@ static struct res_config i10nm_cfg1 = { .type = I10NM, .decs_did = 0x3452, .busno_cfg_offset = 0xd0, + .ddr_imc_num = 4, + .ddr_chan_num = 2, + .ddr_dimm_num = 2, .ddr_chan_mmio_sz = 0x4000, - .sad_all_devfn = PCI_DEVFN(29, 0), + .sad_all_bdf = {1, 29, 0}, + .pcu_cr3_bdf = {1, 30, 3}, + .util_all_bdf = {1, 29, 1}, + .uracu_bdf = {0, 0, 1}, + .ddr_mdev_bdf = {0, 12, 0}, + .hbm_mdev_bdf = {0, 12, 1}, .sad_all_offset = 0x108, .offsets_scrub = offsets_scrub_icx, .offsets_demand = offsets_demand_icx, @@ -616,10 +653,21 @@ static struct res_config spr_cfg = { .type = SPR, .decs_did = 0x3252, .busno_cfg_offset = 0xd0, + .ddr_imc_num = 4, + .ddr_chan_num = 2, + .ddr_dimm_num = 2, + .hbm_imc_num = 16, + .hbm_chan_num = 2, + .hbm_dimm_num = 1, .ddr_chan_mmio_sz = 0x8000, .hbm_chan_mmio_sz = 0x4000, .support_ddr5 = true, - .sad_all_devfn = PCI_DEVFN(10, 0), + .sad_all_bdf = {1, 10, 0}, + .pcu_cr3_bdf = {1, 30, 3}, + .util_all_bdf = {1, 29, 1}, + .uracu_bdf = {0, 0, 1}, + .ddr_mdev_bdf = {0, 12, 0}, + .hbm_mdev_bdf = {0, 12, 1}, .sad_all_offset = 0x300, .offsets_scrub = offsets_scrub_spr, .offsets_scrub_hbm0 = offsets_scrub_spr_hbm0, @@ -753,6 +801,7 @@ static int __init i10nm_init(void) struct skx_dev *d; int rc, i, off[3] = {0xd0, 0xc8, 0xcc}; u64 tolm, tohm; + int imc_num; edac_dbg(2, "\n"); @@ -793,6 +842,8 @@ static int __init i10nm_init(void) if (i10nm_get_hbm_munits() && rc) goto fail; + imc_num = res_cfg->ddr_imc_num + res_cfg->hbm_imc_num; + list_for_each_entry(d, i10nm_edac_list, list) { rc = skx_get_src_id(d, 0xf8, &src_id); if (rc < 0) @@ -803,7 +854,7 @@ static int __init i10nm_init(void) goto fail; edac_dbg(2, "src_id = %d node_id = %d\n", src_id, node_id); - for (i = 0; i < I10NM_NUM_IMC; i++) { + for (i = 0; i < imc_num; i++) { if (!d->imc[i].mdev) continue; @@ -813,12 +864,12 @@ static int __init i10nm_init(void) d->imc[i].node_id = node_id; if (d->imc[i].hbm_mc) { d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz; - d->imc[i].num_channels = I10NM_NUM_HBM_CHANNELS; - d->imc[i].num_dimms = I10NM_NUM_HBM_DIMMS; + d->imc[i].num_channels = cfg->hbm_chan_num; + d->imc[i].num_dimms = cfg->hbm_dimm_num; } else { d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz; - d->imc[i].num_channels = I10NM_NUM_DDR_CHANNELS; - d->imc[i].num_dimms = I10NM_NUM_DDR_DIMMS; + d->imc[i].num_channels = cfg->ddr_chan_num; + d->imc[i].num_dimms = cfg->ddr_dimm_num; } rc = skx_register_mci(&d->imc[i], d->imc[i].mdev, diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 3120326572649..982e1bcb1edf3 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -173,19 +173,47 @@ struct decoded_addr { bool decoded_by_adxl; }; +struct pci_bdf { + u32 bus : 8; + u32 dev : 5; + u32 fun : 3; +}; + struct res_config { enum type type; /* Configuration agent device ID */ unsigned int decs_did; /* Default bus number configuration register offset */ int busno_cfg_offset; + /* DDR memory controllers per socket */ + int ddr_imc_num; + /* DDR channels per DDR memory controller */ + int ddr_chan_num; + /* DDR DIMMs per DDR memory channel */ + int ddr_dimm_num; /* Per DDR channel memory-mapped I/O size */ int ddr_chan_mmio_sz; + /* HBM memory controllers per socket */ + int hbm_imc_num; + /* HBM channels per HBM memory controller */ + int hbm_chan_num; + /* HBM DIMMs per HBM memory channel */ + int hbm_dimm_num; /* Per HBM channel memory-mapped I/O size */ int hbm_chan_mmio_sz; bool support_ddr5; - /* SAD device number and function number */ - unsigned int sad_all_devfn; + /* SAD device BDF */ + struct pci_bdf sad_all_bdf; + /* PCU device BDF */ + struct pci_bdf pcu_cr3_bdf; + /* UTIL device BDF */ + struct pci_bdf util_all_bdf; + /* URACU device BDF */ + struct pci_bdf uracu_bdf; + /* DDR mdev device BDF */ + struct pci_bdf ddr_mdev_bdf; + /* HBM mdev device BDF */ + struct pci_bdf hbm_mdev_bdf; int sad_all_offset; /* Offsets of retry_rd_err_log registers */ u32 *offsets_scrub; From ba987eaaabf99b462cdfed86274e3455d5126349 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 13 Jan 2023 11:28:02 +0800 Subject: [PATCH 08/15] EDAC/i10nm: Add Intel Granite Rapids server support The Granite Rapids CPU model uses similar memory controller registers as Sapphire Rapids server but with some different configurations: - Various memory controller numbers for different Granite Rapids CPUs. So detect the number of present memory controllers at run time. - Different MMIO offsets of memory controllers. - Different triples of bus/dev/fun of some PCI devices used in i10nm_edac. Add above configurations and Granite Rapids CPU model ID for EDAC support. [Tony: Fixed 2 typos s/strcture/structure/] Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20230113032802.41752-1-qiuxu.zhuo@intel.com --- drivers/edac/i10nm_base.c | 237 ++++++++++++++++++++++++++++++++++---- drivers/edac/skx_common.h | 5 +- 2 files changed, 217 insertions(+), 25 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 5682d64cceb6e..46034310b78ee 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -13,7 +13,7 @@ #include "edac_module.h" #include "skx_common.h" -#define I10NM_REVISION "v0.0.5" +#define I10NM_REVISION "v0.0.6" #define EDAC_MOD_STR "i10nm_edac" /* Debug macros */ @@ -22,25 +22,34 @@ #define I10NM_GET_SCK_BAR(d, reg) \ pci_read_config_dword((d)->uracu, 0xd0, &(reg)) -#define I10NM_GET_IMC_BAR(d, i, reg) \ - pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg)) +#define I10NM_GET_IMC_BAR(d, i, reg) \ + pci_read_config_dword((d)->uracu, \ + (res_cfg->type == GNR ? 0xd4 : 0xd8) + (i) * 4, &(reg)) #define I10NM_GET_SAD(d, offset, i, reg)\ - pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg)) + pci_read_config_dword((d)->sad_all, (offset) + (i) * \ + (res_cfg->type == GNR ? 12 : 8), &(reg)) #define I10NM_GET_HBM_IMC_BAR(d, reg) \ pci_read_config_dword((d)->uracu, 0xd4, &(reg)) #define I10NM_GET_CAPID3_CFG(d, reg) \ - pci_read_config_dword((d)->pcu_cr3, 0x90, &(reg)) + pci_read_config_dword((d)->pcu_cr3, \ + res_cfg->type == GNR ? 0x290 : 0x90, &(reg)) +#define I10NM_GET_CAPID5_CFG(d, reg) \ + pci_read_config_dword((d)->pcu_cr3, \ + res_cfg->type == GNR ? 0x298 : 0x98, &(reg)) #define I10NM_GET_DIMMMTR(m, i, j) \ - readl((m)->mbase + ((m)->hbm_mc ? 0x80c : 0x2080c) + \ + readl((m)->mbase + ((m)->hbm_mc ? 0x80c : \ + (res_cfg->type == GNR ? 0xc0c : 0x2080c)) + \ (i) * (m)->chan_mmio_sz + (j) * 4) #define I10NM_GET_MCDDRTCFG(m, i) \ readl((m)->mbase + ((m)->hbm_mc ? 0x970 : 0x20970) + \ (i) * (m)->chan_mmio_sz) #define I10NM_GET_MCMTR(m, i) \ - readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : 0x20ef8) + \ + readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : \ + (res_cfg->type == GNR ? 0xaf8 : 0x20ef8)) + \ (i) * (m)->chan_mmio_sz) #define I10NM_GET_AMAP(m, i) \ - readl((m)->mbase + ((m)->hbm_mc ? 0x814 : 0x20814) + \ + readl((m)->mbase + ((m)->hbm_mc ? 0x814 : \ + (res_cfg->type == GNR ? 0xc14 : 0x20814)) + \ (i) * (m)->chan_mmio_sz) #define I10NM_GET_REG32(m, i, offset) \ readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) @@ -56,7 +65,10 @@ #define I10NM_GET_HBM_IMC_MMIO_OFFSET(reg) \ ((GET_BITFIELD(reg, 0, 10) << 12) + 0x140000) +#define I10NM_GNR_IMC_MMIO_OFFSET 0x24c000 +#define I10NM_GNR_IMC_MMIO_SIZE 0x4000 #define I10NM_HBM_IMC_MMIO_SIZE 0x9000 +#define I10NM_DDR_IMC_CH_CNT(reg) GET_BITFIELD(reg, 21, 24) #define I10NM_IS_HBM_PRESENT(reg) GET_BITFIELD(reg, 27, 30) #define I10NM_IS_HBM_IMC(reg) GET_BITFIELD(reg, 29, 29) @@ -323,6 +335,79 @@ static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, return pdev; } +/** + * i10nm_get_imc_num() - Get the number of present DDR memory controllers. + * + * @cfg : The pointer to the structure of EDAC resource configurations. + * + * For Granite Rapids CPUs, the number of present DDR memory controllers read + * at runtime overwrites the value statically configured in @cfg->ddr_imc_num. + * For other CPUs, the number of present DDR memory controllers is statically + * configured in @cfg->ddr_imc_num. + * + * RETURNS : 0 on success, < 0 on failure. + */ +static int i10nm_get_imc_num(struct res_config *cfg) +{ + int n, imc_num, chan_num = 0; + struct skx_dev *d; + u32 reg; + + list_for_each_entry(d, i10nm_edac_list, list) { + d->pcu_cr3 = pci_get_dev_wrapper(d->seg, d->bus[res_cfg->pcu_cr3_bdf.bus], + res_cfg->pcu_cr3_bdf.dev, + res_cfg->pcu_cr3_bdf.fun); + if (!d->pcu_cr3) + continue; + + if (I10NM_GET_CAPID5_CFG(d, reg)) + continue; + + n = I10NM_DDR_IMC_CH_CNT(reg); + + if (!chan_num) { + chan_num = n; + edac_dbg(2, "Get DDR CH number: %d\n", chan_num); + } else if (chan_num != n) { + i10nm_printk(KERN_NOTICE, "Get DDR CH numbers: %d, %d\n", chan_num, n); + } + } + + switch (cfg->type) { + case GNR: + /* + * One channel per DDR memory controller for Granite Rapids CPUs. + */ + imc_num = chan_num; + + if (!imc_num) { + i10nm_printk(KERN_ERR, "Invalid DDR MC number\n"); + return -ENODEV; + } + + if (imc_num > I10NM_NUM_DDR_IMC) { + i10nm_printk(KERN_ERR, "Need to make I10NM_NUM_DDR_IMC >= %d\n", imc_num); + return -EINVAL; + } + + if (cfg->ddr_imc_num != imc_num) { + /* + * Store the number of present DDR memory controllers. + */ + cfg->ddr_imc_num = imc_num; + edac_dbg(2, "Set DDR MC number: %d", imc_num); + } + + return 0; + default: + /* + * For other CPUs, the number of present DDR memory controllers + * is statically pre-configured in cfg->ddr_imc_num. + */ + return 0; + } +} + static bool i10nm_check_2lm(struct res_config *cfg) { struct skx_dev *d; @@ -445,6 +530,98 @@ static bool i10nm_mc_decode(struct decoded_addr *res) return true; } +/** + * get_gnr_mdev() - Get the PCI device of the @logical_idx-th DDR memory controller. + * + * @d : The pointer to the structure of CPU socket EDAC device. + * @logical_idx : The logical index of the present memory controller (0 ~ max present MC# - 1). + * @physical_idx : To store the corresponding physical index of @logical_idx. + * + * RETURNS : The PCI device of the @logical_idx-th DDR memory controller, NULL on failure. + */ +static struct pci_dev *get_gnr_mdev(struct skx_dev *d, int logical_idx, int *physical_idx) +{ +#define GNR_MAX_IMC_PCI_CNT 28 + + struct pci_dev *mdev; + int i, logical = 0; + + /* + * Detect present memory controllers from { PCI device: 8-5, function 7-1 } + */ + for (i = 0; i < GNR_MAX_IMC_PCI_CNT; i++) { + mdev = pci_get_dev_wrapper(d->seg, + d->bus[res_cfg->ddr_mdev_bdf.bus], + res_cfg->ddr_mdev_bdf.dev + i / 7, + res_cfg->ddr_mdev_bdf.fun + i % 7); + + if (mdev) { + if (logical == logical_idx) { + *physical_idx = i; + return mdev; + } + + pci_dev_put(mdev); + logical++; + } + } + + return NULL; +} + +/** + * get_ddr_munit() - Get the resource of the i-th DDR memory controller. + * + * @d : The pointer to the structure of CPU socket EDAC device. + * @i : The index of the CPU socket relative DDR memory controller. + * @offset : To store the MMIO offset of the i-th DDR memory controller. + * @size : To store the MMIO size of the i-th DDR memory controller. + * + * RETURNS : The PCI device of the i-th DDR memory controller, NULL on failure. + */ +static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsigned long *size) +{ + struct pci_dev *mdev; + int physical_idx; + u32 reg; + + switch (res_cfg->type) { + case GNR: + if (I10NM_GET_IMC_BAR(d, 0, reg)) { + i10nm_printk(KERN_ERR, "Failed to get mc0 bar\n"); + return NULL; + } + + mdev = get_gnr_mdev(d, i, &physical_idx); + if (!mdev) + return NULL; + + *offset = I10NM_GET_IMC_MMIO_OFFSET(reg) + + I10NM_GNR_IMC_MMIO_OFFSET + + physical_idx * I10NM_GNR_IMC_MMIO_SIZE; + *size = I10NM_GNR_IMC_MMIO_SIZE; + + break; + default: + if (I10NM_GET_IMC_BAR(d, i, reg)) { + i10nm_printk(KERN_ERR, "Failed to get mc%d bar\n", i); + return NULL; + } + + mdev = pci_get_dev_wrapper(d->seg, + d->bus[res_cfg->ddr_mdev_bdf.bus], + res_cfg->ddr_mdev_bdf.dev + i, + res_cfg->ddr_mdev_bdf.fun); + if (!mdev) + return NULL; + + *offset = I10NM_GET_IMC_MMIO_OFFSET(reg); + *size = I10NM_GET_IMC_MMIO_SIZE(reg); + } + + return mdev; +} + static int i10nm_get_ddr_munits(void) { struct pci_dev *mdev; @@ -478,9 +655,8 @@ static int i10nm_get_ddr_munits(void) j++, base, reg); for (i = 0; i < res_cfg->ddr_imc_num; i++) { - mdev = pci_get_dev_wrapper(d->seg, d->bus[res_cfg->ddr_mdev_bdf.bus], - res_cfg->ddr_mdev_bdf.dev + i, - res_cfg->ddr_mdev_bdf.fun); + mdev = get_ddr_munit(d, i, &off, &size); + if (i == 0 && !mdev) { i10nm_printk(KERN_ERR, "No IMC found\n"); return -ENODEV; @@ -490,13 +666,6 @@ static int i10nm_get_ddr_munits(void) d->imc[i].mdev = mdev; - if (I10NM_GET_IMC_BAR(d, i, reg)) { - i10nm_printk(KERN_ERR, "Failed to get mc bar\n"); - return -ENODEV; - } - - off = I10NM_GET_IMC_MMIO_OFFSET(reg); - size = I10NM_GET_IMC_MMIO_SIZE(reg); edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n", i, base + off, size, reg); @@ -536,9 +705,6 @@ static int i10nm_get_hbm_munits(void) u64 base; list_for_each_entry(d, i10nm_edac_list, list) { - d->pcu_cr3 = pci_get_dev_wrapper(d->seg, d->bus[res_cfg->pcu_cr3_bdf.bus], - res_cfg->pcu_cr3_bdf.dev, - res_cfg->pcu_cr3_bdf.fun); if (!d->pcu_cr3) return -ENODEV; @@ -678,6 +844,23 @@ static struct res_config spr_cfg = { .offsets_demand_hbm1 = offsets_demand_spr_hbm1, }; +static struct res_config gnr_cfg = { + .type = GNR, + .decs_did = 0x3252, + .busno_cfg_offset = 0xd0, + .ddr_imc_num = 12, + .ddr_chan_num = 1, + .ddr_dimm_num = 2, + .ddr_chan_mmio_sz = 0x4000, + .support_ddr5 = true, + .sad_all_bdf = {0, 13, 0}, + .pcu_cr3_bdf = {0, 5, 0}, + .util_all_bdf = {0, 13, 1}, + .uracu_bdf = {0, 0, 1}, + .ddr_mdev_bdf = {0, 5, 1}, + .sad_all_offset = 0x300, +}; + static const struct x86_cpu_id i10nm_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPINGS(0x0, 0x3), &i10nm_cfg0), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPINGS(0x4, 0xf), &i10nm_cfg1), @@ -686,6 +869,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x0, 0xf), &i10nm_cfg1), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SAPPHIRERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(EMERALDRAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(GRANITERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), {} }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); @@ -705,7 +889,7 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, { struct skx_pvt *pvt = mci->pvt_info; struct skx_imc *imc = pvt->imc; - u32 mtr, amap, mcddrtcfg; + u32 mtr, amap, mcddrtcfg = 0; struct dimm_info *dimm; int i, j, ndimms; @@ -715,7 +899,10 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, ndimms = 0; amap = I10NM_GET_AMAP(imc, i); - mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); + + if (res_cfg->type != GNR) + mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); + for (j = 0; j < imc->num_dimms; j++) { dimm = edac_get_dimm(mci, i, j, 0); mtr = I10NM_GET_DIMMMTR(imc, i, j); @@ -834,6 +1021,10 @@ static int __init i10nm_init(void) return -ENODEV; } + rc = i10nm_get_imc_num(cfg); + if (rc < 0) + goto fail; + mem_cfg_2lm = i10nm_check_2lm(cfg); skx_set_mem_cfg(mem_cfg_2lm); diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 982e1bcb1edf3..b6d3607dffe27 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -33,7 +33,7 @@ #define SKX_NUM_CHANNELS 3 /* Channels per memory controller */ #define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */ -#define I10NM_NUM_DDR_IMC 4 +#define I10NM_NUM_DDR_IMC 12 #define I10NM_NUM_DDR_CHANNELS 2 #define I10NM_NUM_DDR_DIMMS 2 @@ -129,7 +129,8 @@ struct skx_pvt { enum type { SKX, I10NM, - SPR + SPR, + GNR }; enum { From 221aa03fb4e0740b1f4d9ecbf3d9af9fd9f7f85e Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Mon, 6 Feb 2023 09:14:23 +0800 Subject: [PATCH 09/15] EDAC/i10nm: Add driver decoder for Sapphire Rapids server Intel SDM (December 2022) vol3B 17.13.2 contains IMC MC error codes for Sapphire Rapids. Current i10nm_edac only supports firmware decoder (ACPI DSM methods) for Sapphire Rapids. So add the driver decoder (decoding DDR memory errors via extracting error information from the IMC MC error codes) for Sapphire Rapids for better decoding performance. Co-developed-by: Qiuxu Zhuo Signed-off-by: Qiuxu Zhuo Signed-off-by: Youquan Song Signed-off-by: Tony Luck --- drivers/edac/i10nm_base.c | 102 ++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 33 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 46034310b78ee..0a4691792801d 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -434,20 +434,39 @@ static bool i10nm_check_2lm(struct res_config *cfg) } /* - * Check whether the error comes from DDRT by ICX/Tremont model specific error code. - * Refer to SDM vol3B 16.11.3 Intel IMC MC error codes for IA32_MCi_STATUS. + * Check whether the error comes from DDRT by ICX/Tremont/SPR model specific error code. + * Refer to SDM vol3B 17.11.3/17.13.2 Intel IMC MC error codes for IA32_MCi_STATUS. */ static bool i10nm_mscod_is_ddrt(u32 mscod) { - switch (mscod) { - case 0x0106: case 0x0107: - case 0x0800: case 0x0804: - case 0x0806 ... 0x0808: - case 0x080a ... 0x080e: - case 0x0810: case 0x0811: - case 0x0816: case 0x081e: - case 0x081f: - return true; + switch (res_cfg->type) { + case I10NM: + switch (mscod) { + case 0x0106: case 0x0107: + case 0x0800: case 0x0804: + case 0x0806 ... 0x0808: + case 0x080a ... 0x080e: + case 0x0810: case 0x0811: + case 0x0816: case 0x081e: + case 0x081f: + return true; + } + + break; + case SPR: + switch (mscod) { + case 0x0800: case 0x0804: + case 0x0806 ... 0x0808: + case 0x080a ... 0x080e: + case 0x0810: case 0x0811: + case 0x0816: case 0x081e: + case 0x081f: + return true; + } + + break; + default: + return false; } return false; @@ -455,6 +474,7 @@ static bool i10nm_mscod_is_ddrt(u32 mscod) static bool i10nm_mc_decode_available(struct mce *mce) { +#define ICX_IMCx_CHy 0x06666000 u8 bank; if (!decoding_via_mca || mem_cfg_2lm) @@ -468,21 +488,26 @@ static bool i10nm_mc_decode_available(struct mce *mce) switch (res_cfg->type) { case I10NM: - if (bank < 13 || bank > 26) - return false; - - /* DDRT errors can't be decoded from MCA bank registers */ - if (MCI_MISC_ECC_MODE(mce->misc) == MCI_MISC_ECC_DDRT) + /* Check whether the bank is one of {13,14,17,18,21,22,25,26} */ + if (!(ICX_IMCx_CHy & (1 << bank))) return false; - - if (i10nm_mscod_is_ddrt(MCI_STATUS_MSCOD(mce->status))) + break; + case SPR: + if (bank < 13 || bank > 20) return false; - - /* Check whether one of {13,14,17,18,21,22,25,26} */ - return ((bank - 13) & BIT(1)) == 0; + break; default: return false; } + + /* DDRT errors can't be decoded from MCA bank registers */ + if (MCI_MISC_ECC_MODE(mce->misc) == MCI_MISC_ECC_DDRT) + return false; + + if (i10nm_mscod_is_ddrt(MCI_STATUS_MSCOD(mce->status))) + return false; + + return true; } static bool i10nm_mc_decode(struct decoded_addr *res) @@ -504,9 +529,29 @@ static bool i10nm_mc_decode(struct decoded_addr *res) switch (res_cfg->type) { case I10NM: - bank = m->bank - 13; - res->imc = bank / 4; - res->channel = bank % 2; + bank = m->bank - 13; + res->imc = bank / 4; + res->channel = bank % 2; + res->column = GET_BITFIELD(m->misc, 9, 18) << 2; + res->row = GET_BITFIELD(m->misc, 19, 39); + res->bank_group = GET_BITFIELD(m->misc, 40, 41); + res->bank_address = GET_BITFIELD(m->misc, 42, 43); + res->bank_group |= GET_BITFIELD(m->misc, 44, 44) << 2; + res->rank = GET_BITFIELD(m->misc, 56, 58); + res->dimm = res->rank >> 2; + res->rank = res->rank % 4; + break; + case SPR: + bank = m->bank - 13; + res->imc = bank / 2; + res->channel = bank % 2; + res->column = GET_BITFIELD(m->misc, 9, 18) << 2; + res->row = GET_BITFIELD(m->misc, 19, 36); + res->bank_group = GET_BITFIELD(m->misc, 37, 38); + res->bank_address = GET_BITFIELD(m->misc, 39, 40); + res->bank_group |= GET_BITFIELD(m->misc, 41, 41) << 2; + res->rank = GET_BITFIELD(m->misc, 57, 57); + res->dimm = GET_BITFIELD(m->misc, 58, 58); break; default: return false; @@ -518,15 +563,6 @@ static bool i10nm_mc_decode(struct decoded_addr *res) return false; } - res->column = GET_BITFIELD(m->misc, 9, 18) << 2; - res->row = GET_BITFIELD(m->misc, 19, 39); - res->bank_group = GET_BITFIELD(m->misc, 40, 41); - res->bank_address = GET_BITFIELD(m->misc, 42, 43); - res->bank_group |= GET_BITFIELD(m->misc, 44, 44) << 2; - res->rank = GET_BITFIELD(m->misc, 56, 58); - res->dimm = res->rank >> 2; - res->rank = res->rank % 4; - return true; } From fdce765a13384cf411d456472d396d8db2b3114f Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2023 17:03:58 +0000 Subject: [PATCH 10/15] EDAC/amd64: Don't set up EDAC PCI control on Family 17h+ EDAC PCI control is used to detect/report legacy PCI errors like "Parity" and "SERROR". Modern AMD systems use PCIe Advanced Error Reporting (AER), and legacy PCI errors should not be reported. Remove EDAC PCI control setup on AMD Family 17h and later systems. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-2-yazen.ghannam@amd.com --- drivers/edac/amd64_edac.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index e3318e5575a34..2cc7336a5121c 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -4370,12 +4370,12 @@ static int __init amd64_edac_init(void) } /* register stuff with EDAC MCE */ - if (boot_cpu_data.x86 >= 0x17) + if (boot_cpu_data.x86 >= 0x17) { amd_register_ecc_decoder(decode_umc_error); - else + } else { amd_register_ecc_decoder(decode_bus_error); - - setup_pci_device(); + setup_pci_device(); + } #ifdef CONFIG_X86_32 amd64_err("%s on 32-bit is unsupported. USE AT YOUR OWN RISK!\n", EDAC_MOD_STR); From 6e241bc93c9f0ae6f76ed65b44132948ca70fd76 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2023 17:03:59 +0000 Subject: [PATCH 11/15] EDAC/amd64: Remove scrub rate control for Family 17h and later The scrub registers on AMD Family 17h and later may be inaccessible to the OS. Furthermore, hardware designers recommend that the scrubbing feature is managed by the firmware. Remove support for the sdram_scrub_rate interface for AMD Family 17h systems and later by not setting the scrub function pointers. The EDAC MC core will then not expose the scrub files in sysfs. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-3-yazen.ghannam@amd.com --- drivers/edac/amd64_edac.c | 33 +++++---------------------------- drivers/edac/amd64_edac.h | 2 -- 2 files changed, 5 insertions(+), 30 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 2cc7336a5121c..07a89df0d4f4b 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -182,21 +182,6 @@ static inline int amd64_read_dct_pci_cfg(struct amd64_pvt *pvt, u8 dct, * other archs, we might not have access to the caches directly. */ -static inline void __f17h_set_scrubval(struct amd64_pvt *pvt, u32 scrubval) -{ - /* - * Fam17h supports scrub values between 0x5 and 0x14. Also, the values - * are shifted down by 0x5, so scrubval 0x5 is written to the register - * as 0x0, scrubval 0x6 as 0x1, etc. - */ - if (scrubval >= 0x5 && scrubval <= 0x14) { - scrubval -= 0x5; - pci_write_bits32(pvt->F6, F17H_SCR_LIMIT_ADDR, scrubval, 0xF); - pci_write_bits32(pvt->F6, F17H_SCR_BASE_ADDR, 1, 0x1); - } else { - pci_write_bits32(pvt->F6, F17H_SCR_BASE_ADDR, 0, 0x1); - } -} /* * Scan the scrub rate mapping table for a close or matching bandwidth value to * issue. If requested is too big, then use last maximum value found. @@ -229,9 +214,7 @@ static int __set_scrub_rate(struct amd64_pvt *pvt, u32 new_bw, u32 min_rate) scrubval = scrubrates[i].scrubval; - if (pvt->umc) { - __f17h_set_scrubval(pvt, scrubval); - } else if (pvt->fam == 0x15 && pvt->model == 0x60) { + if (pvt->fam == 0x15 && pvt->model == 0x60) { f15h_select_dct(pvt, 0); pci_write_bits32(pvt->F2, F15H_M60H_SCRCTRL, scrubval, 0x001F); f15h_select_dct(pvt, 1); @@ -271,16 +254,7 @@ static int get_scrub_rate(struct mem_ctl_info *mci) int i, retval = -EINVAL; u32 scrubval = 0; - if (pvt->umc) { - amd64_read_pci_cfg(pvt->F6, F17H_SCR_BASE_ADDR, &scrubval); - if (scrubval & BIT(0)) { - amd64_read_pci_cfg(pvt->F6, F17H_SCR_LIMIT_ADDR, &scrubval); - scrubval &= 0xF; - scrubval += 0x5; - } else { - scrubval = 0; - } - } else if (pvt->fam == 0x15) { + if (pvt->fam == 0x15) { /* Erratum #505 */ if (pvt->model < 0x10) f15h_select_dct(pvt, 0); @@ -3967,6 +3941,9 @@ static void setup_mci_misc_attrs(struct mem_ctl_info *mci) mci->dev_name = pci_name(pvt->F3); mci->ctl_page_to_phys = NULL; + if (pvt->fam >= 0x17) + return; + /* memory scrubber interface */ mci->set_sdram_scrub_rate = set_scrub_rate; mci->get_sdram_scrub_rate = get_scrub_rate; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 38e5ad95d0109..48f1d97e12743 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -215,8 +215,6 @@ #define DCT_SEL_HI 0x114 #define F15H_M60H_SCRCTRL 0x1C8 -#define F17H_SCR_BASE_ADDR 0x48 -#define F17H_SCR_LIMIT_ADDR 0x4C /* * Function 3 - Misc Control From 6229235f7c6616c96ac06fd1094520b037410f46 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2023 17:04:00 +0000 Subject: [PATCH 12/15] EDAC/amd64: Remove PCI Function 6 PCI Function 6 is used on Family 17h and later to access scrub registers. With scrub access removed, this function has no other use. Remove all Function 6 PCI IDs and related code. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-4-yazen.ghannam@amd.com --- drivers/edac/amd64_edac.c | 22 +--------------------- drivers/edac/amd64_edac.h | 12 ++---------- 2 files changed, 3 insertions(+), 31 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 07a89df0d4f4b..dce2179ad4549 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2906,7 +2906,6 @@ static struct amd64_family_type family_types[] = { [F17_CPUS] = { .ctl_name = "F17h", .f0_id = PCI_DEVICE_ID_AMD_17H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_DF_F6, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2916,7 +2915,6 @@ static struct amd64_family_type family_types[] = { [F17_M10H_CPUS] = { .ctl_name = "F17h_M10h", .f0_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F6, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2926,7 +2924,6 @@ static struct amd64_family_type family_types[] = { [F17_M30H_CPUS] = { .ctl_name = "F17h_M30h", .f0_id = PCI_DEVICE_ID_AMD_17H_M30H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_M30H_DF_F6, .max_mcs = 8, .ops = { .early_channel_count = f17_early_channel_count, @@ -2936,7 +2933,6 @@ static struct amd64_family_type family_types[] = { [F17_M60H_CPUS] = { .ctl_name = "F17h_M60h", .f0_id = PCI_DEVICE_ID_AMD_17H_M60H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_M60H_DF_F6, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2946,7 +2942,6 @@ static struct amd64_family_type family_types[] = { [F17_M70H_CPUS] = { .ctl_name = "F17h_M70h", .f0_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F6, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2956,7 +2951,6 @@ static struct amd64_family_type family_types[] = { [F19_CPUS] = { .ctl_name = "F19h", .f0_id = PCI_DEVICE_ID_AMD_19H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_19H_DF_F6, .max_mcs = 8, .ops = { .early_channel_count = f17_early_channel_count, @@ -2966,7 +2960,6 @@ static struct amd64_family_type family_types[] = { [F19_M10H_CPUS] = { .ctl_name = "F19h_M10h", .f0_id = PCI_DEVICE_ID_AMD_19H_M10H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_19H_M10H_DF_F6, .max_mcs = 12, .flags.zn_regs_v2 = 1, .ops = { @@ -2977,7 +2970,6 @@ static struct amd64_family_type family_types[] = { [F19_M50H_CPUS] = { .ctl_name = "F19h_M50h", .f0_id = PCI_DEVICE_ID_AMD_19H_M50H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_19H_M50H_DF_F6, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -3290,7 +3282,7 @@ static void decode_umc_error(int node_id, struct mce *m) /* * Use pvt->F3 which contains the F3 CPU PCI device to get the related * F1 (AddrMap) and F2 (Dct) devices. Return negative value on error. - * Reserve F0 and F6 on systems with a UMC. + * Reserve F0 on systems with a UMC. */ static int reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) @@ -3302,21 +3294,11 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) return -ENODEV; } - pvt->F6 = pci_get_related_function(pvt->F3->vendor, pci_id2, pvt->F3); - if (!pvt->F6) { - pci_dev_put(pvt->F0); - pvt->F0 = NULL; - - edac_dbg(1, "F6 not found: device 0x%x\n", pci_id2); - return -ENODEV; - } - if (!pci_ctl_dev) pci_ctl_dev = &pvt->F0->dev; edac_dbg(1, "F0: %s\n", pci_name(pvt->F0)); edac_dbg(1, "F3: %s\n", pci_name(pvt->F3)); - edac_dbg(1, "F6: %s\n", pci_name(pvt->F6)); return 0; } @@ -3352,7 +3334,6 @@ static void free_mc_sibling_devs(struct amd64_pvt *pvt) { if (pvt->umc) { pci_dev_put(pvt->F0); - pci_dev_put(pvt->F6); } else { pci_dev_put(pvt->F1); pci_dev_put(pvt->F2); @@ -4078,7 +4059,6 @@ static int hw_info_get(struct amd64_pvt *pvt) return -ENOMEM; pci_id1 = fam_type->f0_id; - pci_id2 = fam_type->f6_id; } else { pci_id1 = fam_type->f1_id; pci_id2 = fam_type->f2_id; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 48f1d97e12743..2d5ea9ca3868f 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -115,21 +115,13 @@ #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F1 0x1581 #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F2 0x1582 #define PCI_DEVICE_ID_AMD_17H_DF_F0 0x1460 -#define PCI_DEVICE_ID_AMD_17H_DF_F6 0x1466 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F0 0x15e8 -#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F6 0x15ee #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F0 0x1490 -#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F6 0x1496 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F0 0x1448 -#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F6 0x144e #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F0 0x1440 -#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F6 0x1446 #define PCI_DEVICE_ID_AMD_19H_DF_F0 0x1650 -#define PCI_DEVICE_ID_AMD_19H_DF_F6 0x1656 #define PCI_DEVICE_ID_AMD_19H_M10H_DF_F0 0x14ad -#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F6 0x14b3 #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F0 0x166a -#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F6 0x1670 /* * Function 1 - Address Map @@ -354,7 +346,7 @@ struct amd64_pvt { struct low_ops *ops; /* pci_device handles which we utilize */ - struct pci_dev *F0, *F1, *F2, *F3, *F6; + struct pci_dev *F0, *F1, *F2, *F3; u16 mc_node_id; /* MC index of this MC node */ u8 fam; /* CPU family */ @@ -501,7 +493,7 @@ struct amd64_family_flags { struct amd64_family_type { const char *ctl_name; - u16 f0_id, f1_id, f2_id, f6_id; + u16 f0_id, f1_id, f2_id; /* Maximum number of memory controllers per die/node. */ u8 max_mcs; struct amd64_family_flags flags; From cf981562e6270397a2b0dc871a1f11ccc2a687e8 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2023 17:04:01 +0000 Subject: [PATCH 13/15] EDAC/amd64: Remove PCI Function 0 PCI Function 0 is used on Family 17h and later only to read the "dhar" value. This value is printed and provided through a module-specific debug sysfs file. The value is not used for any Family 17h and later code, and it does not have any apparent debug value on these systems. Remove "dhar", Function 0 PCI IDs, and all related code. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-5-yazen.ghannam@amd.com --- drivers/edac/amd64_edac.c | 38 +++++--------------------------------- drivers/edac/amd64_edac.h | 12 ++---------- 2 files changed, 7 insertions(+), 43 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index dce2179ad4549..352cbcda53f92 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1428,9 +1428,6 @@ static void __dump_misc_regs_df(struct amd64_pvt *pvt) debug_display_dimm_sizes_df(pvt, i); } - - edac_dbg(1, "F0x104 (DRAM Hole Address): 0x%08x, base: 0x%08x\n", - pvt->dhar, dhar_base(pvt)); } /* Display and decode various NB registers for debug purposes. */ @@ -1465,6 +1462,8 @@ static void __dump_misc_regs(struct amd64_pvt *pvt) /* Only if NOT ganged does dclr1 have valid info */ if (!dct_ganging_enabled(pvt)) debug_dump_dramcfg_low(pvt, pvt->dclr1, 1); + + edac_dbg(1, " DramHoleValid: %s\n", dhar_valid(pvt) ? "yes" : "no"); } /* Display and decode various NB registers for debug purposes. */ @@ -1475,8 +1474,6 @@ static void dump_misc_regs(struct amd64_pvt *pvt) else __dump_misc_regs(pvt); - edac_dbg(1, " DramHoleValid: %s\n", dhar_valid(pvt) ? "yes" : "no"); - amd64_info("using x%u syndromes.\n", pvt->ecc_sym_sz); } @@ -2905,7 +2902,6 @@ static struct amd64_family_type family_types[] = { }, [F17_CPUS] = { .ctl_name = "F17h", - .f0_id = PCI_DEVICE_ID_AMD_17H_DF_F0, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2914,7 +2910,6 @@ static struct amd64_family_type family_types[] = { }, [F17_M10H_CPUS] = { .ctl_name = "F17h_M10h", - .f0_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F0, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2923,7 +2918,6 @@ static struct amd64_family_type family_types[] = { }, [F17_M30H_CPUS] = { .ctl_name = "F17h_M30h", - .f0_id = PCI_DEVICE_ID_AMD_17H_M30H_DF_F0, .max_mcs = 8, .ops = { .early_channel_count = f17_early_channel_count, @@ -2932,7 +2926,6 @@ static struct amd64_family_type family_types[] = { }, [F17_M60H_CPUS] = { .ctl_name = "F17h_M60h", - .f0_id = PCI_DEVICE_ID_AMD_17H_M60H_DF_F0, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2941,7 +2934,6 @@ static struct amd64_family_type family_types[] = { }, [F17_M70H_CPUS] = { .ctl_name = "F17h_M70h", - .f0_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F0, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2950,7 +2942,6 @@ static struct amd64_family_type family_types[] = { }, [F19_CPUS] = { .ctl_name = "F19h", - .f0_id = PCI_DEVICE_ID_AMD_19H_DF_F0, .max_mcs = 8, .ops = { .early_channel_count = f17_early_channel_count, @@ -2959,7 +2950,6 @@ static struct amd64_family_type family_types[] = { }, [F19_M10H_CPUS] = { .ctl_name = "F19h_M10h", - .f0_id = PCI_DEVICE_ID_AMD_19H_M10H_DF_F0, .max_mcs = 12, .flags.zn_regs_v2 = 1, .ops = { @@ -2969,7 +2959,6 @@ static struct amd64_family_type family_types[] = { }, [F19_M50H_CPUS] = { .ctl_name = "F19h_M50h", - .f0_id = PCI_DEVICE_ID_AMD_19H_M50H_DF_F0, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -3282,26 +3271,12 @@ static void decode_umc_error(int node_id, struct mce *m) /* * Use pvt->F3 which contains the F3 CPU PCI device to get the related * F1 (AddrMap) and F2 (Dct) devices. Return negative value on error. - * Reserve F0 on systems with a UMC. */ static int reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) { - if (pvt->umc) { - pvt->F0 = pci_get_related_function(pvt->F3->vendor, pci_id1, pvt->F3); - if (!pvt->F0) { - edac_dbg(1, "F0 not found, device 0x%x\n", pci_id1); - return -ENODEV; - } - - if (!pci_ctl_dev) - pci_ctl_dev = &pvt->F0->dev; - - edac_dbg(1, "F0: %s\n", pci_name(pvt->F0)); - edac_dbg(1, "F3: %s\n", pci_name(pvt->F3)); - + if (pvt->umc) return 0; - } /* Reserve the ADDRESS MAP Device */ pvt->F1 = pci_get_related_function(pvt->F3->vendor, pci_id1, pvt->F3); @@ -3333,7 +3308,7 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) static void free_mc_sibling_devs(struct amd64_pvt *pvt) { if (pvt->umc) { - pci_dev_put(pvt->F0); + return; } else { pci_dev_put(pvt->F1); pci_dev_put(pvt->F2); @@ -3423,7 +3398,6 @@ static void read_mc_regs(struct amd64_pvt *pvt) if (pvt->umc) { __read_mc_regs_df(pvt); - amd64_read_pci_cfg(pvt->F0, DF_DHAR, &pvt->dhar); goto skip; } @@ -4057,8 +4031,6 @@ static int hw_info_get(struct amd64_pvt *pvt) pvt->umc = kcalloc(fam_type->max_mcs, sizeof(struct amd64_umc), GFP_KERNEL); if (!pvt->umc) return -ENOMEM; - - pci_id1 = fam_type->f0_id; } else { pci_id1 = fam_type->f1_id; pci_id2 = fam_type->f2_id; @@ -4075,7 +4047,7 @@ static int hw_info_get(struct amd64_pvt *pvt) static void hw_info_put(struct amd64_pvt *pvt) { - if (pvt->F0 || pvt->F1) + if (pvt->F1) free_mc_sibling_devs(pvt); kfree(pvt->umc); diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 2d5ea9ca3868f..398fb58dacbf4 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -114,14 +114,6 @@ #define PCI_DEVICE_ID_AMD_16H_NB_F2 0x1532 #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F1 0x1581 #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F2 0x1582 -#define PCI_DEVICE_ID_AMD_17H_DF_F0 0x1460 -#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F0 0x15e8 -#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F0 0x1490 -#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F0 0x1448 -#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F0 0x1440 -#define PCI_DEVICE_ID_AMD_19H_DF_F0 0x1650 -#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F0 0x14ad -#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F0 0x166a /* * Function 1 - Address Map @@ -346,7 +338,7 @@ struct amd64_pvt { struct low_ops *ops; /* pci_device handles which we utilize */ - struct pci_dev *F0, *F1, *F2, *F3; + struct pci_dev *F1, *F2, *F3; u16 mc_node_id; /* MC index of this MC node */ u8 fam; /* CPU family */ @@ -493,7 +485,7 @@ struct amd64_family_flags { struct amd64_family_type { const char *ctl_name; - u16 f0_id, f1_id, f2_id; + u16 f1_id, f2_id; /* Maximum number of memory controllers per die/node. */ u8 max_mcs; struct amd64_family_flags flags; From c4605bde334367b22bbf43cbbef0d1b7c75433dc Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2023 17:04:02 +0000 Subject: [PATCH 14/15] EDAC/amd64: Remove early_channel_count() The early_channel_count() function seems to have been useful in the past for knowing how many EDAC mci structures to populate. However, this is no longer needed as the maximum channel count for a system is used instead. Remove the early_channel_count() helper functions and related code. Use the size of the channel layer when iterating over channel structures. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-6-yazen.ghannam@amd.com --- drivers/edac/amd64_edac.c | 116 +------------------------------------- drivers/edac/amd64_edac.h | 2 - 2 files changed, 2 insertions(+), 116 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 352cbcda53f92..1c4bef1cdf28f 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1703,24 +1703,6 @@ static void determine_memory_type(struct amd64_pvt *pvt) pvt->dram_type = (pvt->dclr0 & BIT(16)) ? MEM_DDR3 : MEM_RDDR3; } -/* Get the number of DCT channels the memory controller is using. */ -static int k8_early_channel_count(struct amd64_pvt *pvt) -{ - int flag; - - if (pvt->ext_model >= K8_REV_F) - /* RevF (NPT) and later */ - flag = pvt->dclr0 & WIDTH_128; - else - /* RevE and earlier */ - flag = pvt->dclr0 & REVE_WIDTH_128; - - /* not used */ - pvt->dclr1 = 0; - - return (flag) ? 2 : 1; -} - /* On F10h and later ErrAddr is MC4_ADDR[47:1] */ static u64 get_error_address(struct amd64_pvt *pvt, struct mce *m) { @@ -1972,69 +1954,6 @@ static int k8_dbam_to_chip_select(struct amd64_pvt *pvt, u8 dct, } } -/* - * Get the number of DCT channels in use. - * - * Return: - * number of Memory Channels in operation - * Pass back: - * contents of the DCL0_LOW register - */ -static int f1x_early_channel_count(struct amd64_pvt *pvt) -{ - int i, j, channels = 0; - - /* On F10h, if we are in 128 bit mode, then we are using 2 channels */ - if (pvt->fam == 0x10 && (pvt->dclr0 & WIDTH_128)) - return 2; - - /* - * Need to check if in unganged mode: In such, there are 2 channels, - * but they are not in 128 bit mode and thus the above 'dclr0' status - * bit will be OFF. - * - * Need to check DCT0[0] and DCT1[0] to see if only one of them has - * their CSEnable bit on. If so, then SINGLE DIMM case. - */ - edac_dbg(0, "Data width is not 128 bits - need more decoding\n"); - - /* - * Check DRAM Bank Address Mapping values for each DIMM to see if there - * is more than just one DIMM present in unganged mode. Need to check - * both controllers since DIMMs can be placed in either one. - */ - for (i = 0; i < 2; i++) { - u32 dbam = (i ? pvt->dbam1 : pvt->dbam0); - - for (j = 0; j < 4; j++) { - if (DBAM_DIMM(j, dbam) > 0) { - channels++; - break; - } - } - } - - if (channels > 2) - channels = 2; - - amd64_info("MCT channel count: %d\n", channels); - - return channels; -} - -static int f17_early_channel_count(struct amd64_pvt *pvt) -{ - int i, channels = 0; - - /* SDP Control bit 31 (SdpInit) is clear for unused UMC channels */ - for_each_umc(i) - channels += !!(pvt->umc[i].sdp_ctrl & UMC_SDP_INIT); - - amd64_info("MCT channel count: %d\n", channels); - - return channels; -} - static int ddr3_cs_size(unsigned i, bool dct_width) { unsigned shift = 0; @@ -2829,7 +2748,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, .max_mcs = 2, .ops = { - .early_channel_count = k8_early_channel_count, .map_sysaddr_to_csrow = k8_map_sysaddr_to_csrow, .dbam_to_cs = k8_dbam_to_chip_select, } @@ -2840,7 +2758,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_10H_NB_DRAM, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f10_dbam_to_chip_select, } @@ -2851,7 +2768,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_15H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f15_dbam_to_chip_select, } @@ -2862,7 +2778,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f16_dbam_to_chip_select, } @@ -2873,7 +2788,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f15_m60h_dbam_to_chip_select, } @@ -2884,7 +2798,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_16H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f16_dbam_to_chip_select, } @@ -2895,7 +2808,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f16_dbam_to_chip_select, } @@ -2904,7 +2816,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h", .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2912,7 +2823,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h_M10h", .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2920,7 +2830,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h_M30h", .max_mcs = 8, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2928,7 +2837,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h_M60h", .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2936,7 +2844,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h_M70h", .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2944,7 +2851,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F19h", .max_mcs = 8, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2953,7 +2859,6 @@ static struct amd64_family_type family_types[] = { .max_mcs = 12, .flags.zn_regs_v2 = 1, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2961,7 +2866,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F19h_M50h", .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -3620,7 +3524,7 @@ static int init_csrows(struct mem_ctl_info *mci) : EDAC_SECDED; } - for (j = 0; j < pvt->channel_count; j++) { + for (j = 0; j < fam_type->max_mcs; j++) { dimm = csrow->channels[j]->dimm; dimm->mtype = pvt->dram_type; dimm->edac_mode = edac_mode; @@ -4057,28 +3961,12 @@ static int init_one_instance(struct amd64_pvt *pvt) { struct mem_ctl_info *mci = NULL; struct edac_mc_layer layers[2]; - int ret = -EINVAL; + int ret = -ENOMEM; - /* - * We need to determine how many memory channels there are. Then use - * that information for calculating the size of the dynamic instance - * tables in the 'mci' structure. - */ - pvt->channel_count = pvt->ops->early_channel_count(pvt); - if (pvt->channel_count < 0) - return ret; - - ret = -ENOMEM; layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; layers[0].size = pvt->csels[0].b_cnt; layers[0].is_virt_csrow = true; layers[1].type = EDAC_MC_LAYER_CHANNEL; - - /* - * Always allocate two channels since we can have setups with DIMMs on - * only one channel. Also, this simplifies handling later for the price - * of a couple of KBs tops. - */ layers[1].size = fam_type->max_mcs; layers[1].is_virt_csrow = false; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 398fb58dacbf4..e4329dff8cf2e 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -346,7 +346,6 @@ struct amd64_pvt { u8 stepping; /* ... stepping */ int ext_model; /* extended model value of this node */ - int channel_count; /* Raw registers */ u32 dclr0; /* DRAM Configuration Low DCT0 reg */ @@ -466,7 +465,6 @@ struct ecc_settings { * functions and per device encoding/decoding logic. */ struct low_ops { - int (*early_channel_count) (struct amd64_pvt *pvt); void (*map_sysaddr_to_csrow) (struct mem_ctl_info *mci, u64 sys_addr, struct err_info *); int (*dbam_to_cs) (struct amd64_pvt *pvt, u8 dct, From 28980db94742f9f2fb0f68ea35f2171b38007bae Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 14 Feb 2023 10:42:22 +0100 Subject: [PATCH 15/15] EDAC/amd64: Shut up an -Werror,-Wsometimes-uninitialized clang false positive Reportedly, clang cannot do interprocedural analysis: https://lore.kernel.org/r/20230213-amd64_edac-wsometimes-uninitialized-v1-1-5bde32b89e02@kernel.org and see that those arguments won't be used uninitialized. So, yeah, the code's fine even without this. Normally, such a "fix" won't be applied but that warning gets automatically enabled in -Wall builds and when CONFIG_WERROR is set in allmodconfig builds, the build fails. So shut it up with a minimal fix as this code will see more reorganization very soon. [ bp: Write commit message. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/Y%2BqdVHidnrrKvxiD@dev-arch.thelio-3990X --- drivers/edac/amd64_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 1c4bef1cdf28f..5b42533f306a7 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3928,7 +3928,7 @@ static const struct attribute_group *amd64_edac_attr_groups[] = { static int hw_info_get(struct amd64_pvt *pvt) { - u16 pci_id1, pci_id2; + u16 pci_id1 = 0, pci_id2 = 0; int ret; if (pvt->fam >= 0x17) {