Skip to content

Commit

Permalink
Merge tag 'edac_updates_for_6.2' of git://git.kernel.org/pub/scm/linu…
Browse files Browse the repository at this point in the history
…x/kernel/git/ras/ras

Pull EDAC updates from Borislav Petkov:

 - Make ghes_edac a simple module like the rest of the EDAC drivers and
   drop the forced built-in only configuration by disentangling it from
   GHES (Jia He)

 - The usual small cleanups and improvements all over EDAC land

* tag 'edac_updates_for_6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras:
  EDAC/i10nm: fix refcount leak in pci_get_dev_wrapper()
  EDAC/i5400: Fix typo in comment: vaious -> various
  EDAC/mc_sysfs: Increase legacy channel support to 12
  MAINTAINERS: Make Mauro EDAC reviewer
  MAINTAINERS: Make Manivannan Sadhasivam the maintainer of qcom_edac
  EDAC/igen6: Return the correct error type when not the MC owner
  apei/ghes: Use xchg_release() for updating new cache slot instead of cmpxchg()
  EDAC: Check for GHES preference in the chipset-specific EDAC drivers
  EDAC/ghes: Make ghes_edac a proper module
  EDAC/ghes: Prepare to make ghes_edac a proper module
  EDAC/ghes: Add a notifier for reporting memory errors
  efi/cper: Export several helpers for ghes_edac to use
  EDAC/i5000: Mark as BROKEN
  • Loading branch information
Linus Torvalds committed Dec 12, 2022
2 parents 40deb5e + 3919430 commit 7adcadb
Show file tree
Hide file tree
Showing 19 changed files with 194 additions and 72 deletions.
5 changes: 2 additions & 3 deletions MAINTAINERS
Original file line number Diff line number Diff line change
Expand Up @@ -7386,9 +7386,9 @@ F: drivers/edac/thunderx_edac*

EDAC-CORE
M: Borislav Petkov <bp@alien8.de>
M: Mauro Carvalho Chehab <mchehab@kernel.org>
M: Tony Luck <tony.luck@intel.com>
R: James Morse <james.morse@arm.com>
R: Mauro Carvalho Chehab <mchehab@kernel.org>
R: Robert Richter <rric@kernel.org>
L: linux-edac@vger.kernel.org
S: Supported
Expand Down Expand Up @@ -7505,8 +7505,7 @@ S: Maintained
F: drivers/edac/pnd2_edac.[ch]

EDAC-QCOM
M: Channagoud Kadabi <ckadabi@codeaurora.org>
M: Venkata Narendra Kumar Gutta <vnkgutta@codeaurora.org>
M: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
L: linux-arm-msm@vger.kernel.org
L: linux-edac@vger.kernel.org
S: Maintained
Expand Down
66 changes: 63 additions & 3 deletions drivers/acpi/apei/ghes.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@
#define FIX_APEI_GHES_SDEI_CRITICAL __end_of_fixed_addresses
#endif

static ATOMIC_NOTIFIER_HEAD(ghes_report_chain);

static inline bool is_hest_type_generic_v2(struct ghes *ghes)
{
return ghes->generic->header.type == ACPI_HEST_TYPE_GENERIC_ERROR_V2;
Expand All @@ -107,6 +109,13 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes)
bool ghes_disable;
module_param_named(disable, ghes_disable, bool, 0);

/*
* "ghes.edac_force_enable" forcibly enables ghes_edac and skips the platform
* check.
*/
static bool ghes_edac_force_enable;
module_param_named(edac_force_enable, ghes_edac_force_enable, bool, 0);

/*
* All error sources notified with HED (Hardware Error Device) share a
* single notifier callback, so they need to be linked and checked one
Expand All @@ -118,6 +127,13 @@ module_param_named(disable, ghes_disable, bool, 0);
static LIST_HEAD(ghes_hed);
static DEFINE_MUTEX(ghes_list_mutex);

/*
* A list of GHES devices which are given to the corresponding EDAC driver
* ghes_edac for further use.
*/
static LIST_HEAD(ghes_devs);
static DEFINE_MUTEX(ghes_devs_mutex);

/*
* Because the memory area used to transfer hardware error information
* from BIOS to Linux can be determined only in NMI, IRQ or timer
Expand Down Expand Up @@ -645,7 +661,7 @@ static bool ghes_do_proc(struct ghes *ghes,
if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);

ghes_edac_report_mem_error(sev, mem_err);
atomic_notifier_call_chain(&ghes_report_chain, sev, mem_err);

arch_apei_report_mem_error(sev, mem_err);
queued = ghes_handle_memory_failure(gdata, sev);
Expand Down Expand Up @@ -1382,7 +1398,11 @@ static int ghes_probe(struct platform_device *ghes_dev)

platform_set_drvdata(ghes_dev, ghes);

ghes_edac_register(ghes, &ghes_dev->dev);
ghes->dev = &ghes_dev->dev;

mutex_lock(&ghes_devs_mutex);
list_add_tail(&ghes->elist, &ghes_devs);
mutex_unlock(&ghes_devs_mutex);

/* Handle any pending errors right away */
spin_lock_irqsave(&ghes_notify_lock_irq, flags);
Expand Down Expand Up @@ -1446,7 +1466,9 @@ static int ghes_remove(struct platform_device *ghes_dev)

ghes_fini(ghes);

ghes_edac_unregister(ghes);
mutex_lock(&ghes_devs_mutex);
list_del(&ghes->elist);
mutex_unlock(&ghes_devs_mutex);

kfree(ghes);

Expand Down Expand Up @@ -1501,3 +1523,41 @@ void __init acpi_ghes_init(void)
else
pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n");
}

/*
* Known x86 systems that prefer GHES error reporting:
*/
static struct acpi_platform_list plat_list[] = {
{"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions},
{ } /* End */
};

struct list_head *ghes_get_devices(void)
{
int idx = -1;

if (IS_ENABLED(CONFIG_X86)) {
idx = acpi_match_platform_list(plat_list);
if (idx < 0) {
if (!ghes_edac_force_enable)
return NULL;

pr_warn_once("Force-loading ghes_edac on an unsupported platform. You're on your own!\n");
}
}

return &ghes_devs;
}
EXPORT_SYMBOL_GPL(ghes_get_devices);

void ghes_register_report_chain(struct notifier_block *nb)
{
atomic_notifier_chain_register(&ghes_report_chain, nb);
}
EXPORT_SYMBOL_GPL(ghes_register_report_chain);

void ghes_unregister_report_chain(struct notifier_block *nb)
{
atomic_notifier_chain_unregister(&ghes_report_chain, nb);
}
EXPORT_SYMBOL_GPL(ghes_unregister_report_chain);
5 changes: 3 additions & 2 deletions drivers/edac/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ config EDAC_DECODE_MCE
has been initialized.

config EDAC_GHES
bool "Output ACPI APEI/GHES BIOS detected errors via EDAC"
depends on ACPI_APEI_GHES && (EDAC=y)
tristate "Output ACPI APEI/GHES BIOS detected errors via EDAC"
depends on ACPI_APEI_GHES
select UEFI_CPER
help
Not all machines support hardware-driven error report. Some of those
Expand Down Expand Up @@ -211,6 +211,7 @@ config EDAC_R82600
config EDAC_I5000
tristate "Intel Greencreek/Blackford chipset"
depends on X86 && PCI
depends on BROKEN
help
Support for error detection and correction the Intel
Greekcreek/Blackford chipsets.
Expand Down
3 changes: 3 additions & 0 deletions drivers/edac/amd64_edac.c
Original file line number Diff line number Diff line change
Expand Up @@ -4329,6 +4329,9 @@ static int __init amd64_edac_init(void)
int err = -ENODEV;
int i;

if (ghes_get_devices())
return -EBUSY;

owner = edac_get_owner();
if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR)))
return -EBUSY;
Expand Down
3 changes: 3 additions & 0 deletions drivers/edac/armada_xp_edac.c
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,9 @@ static int __init armada_xp_edac_init(void)
{
int res;

if (ghes_get_devices())
return -EBUSY;

/* only polling is supported */
edac_op_state = EDAC_OPSTATE_POLL;

Expand Down
24 changes: 24 additions & 0 deletions drivers/edac/edac_mc_sysfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,14 @@ DEVICE_CHANNEL(ch6_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 6);
DEVICE_CHANNEL(ch7_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 7);
DEVICE_CHANNEL(ch8_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 8);
DEVICE_CHANNEL(ch9_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 9);
DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 10);
DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 11);

/* Total possible dynamic DIMM Label attribute file table */
static struct attribute *dynamic_csrow_dimm_attr[] = {
Expand All @@ -309,6 +317,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = {
&dev_attr_legacy_ch5_dimm_label.attr.attr,
&dev_attr_legacy_ch6_dimm_label.attr.attr,
&dev_attr_legacy_ch7_dimm_label.attr.attr,
&dev_attr_legacy_ch8_dimm_label.attr.attr,
&dev_attr_legacy_ch9_dimm_label.attr.attr,
&dev_attr_legacy_ch10_dimm_label.attr.attr,
&dev_attr_legacy_ch11_dimm_label.attr.attr,
NULL
};

Expand All @@ -329,6 +341,14 @@ DEVICE_CHANNEL(ch6_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 6);
DEVICE_CHANNEL(ch7_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 7);
DEVICE_CHANNEL(ch8_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 8);
DEVICE_CHANNEL(ch9_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 9);
DEVICE_CHANNEL(ch10_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 10);
DEVICE_CHANNEL(ch11_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 11);

/* Total possible dynamic ce_count attribute file table */
static struct attribute *dynamic_csrow_ce_count_attr[] = {
Expand All @@ -340,6 +360,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = {
&dev_attr_legacy_ch5_ce_count.attr.attr,
&dev_attr_legacy_ch6_ce_count.attr.attr,
&dev_attr_legacy_ch7_ce_count.attr.attr,
&dev_attr_legacy_ch8_ce_count.attr.attr,
&dev_attr_legacy_ch9_ce_count.attr.attr,
&dev_attr_legacy_ch10_ce_count.attr.attr,
&dev_attr_legacy_ch11_ce_count.attr.attr,
NULL
};

Expand Down
1 change: 1 addition & 0 deletions drivers/edac/edac_module.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#ifndef __EDAC_MODULE_H__
#define __EDAC_MODULE_H__

#include <acpi/ghes.h>
#include "edac_mc.h"
#include "edac_pci.h"
#include "edac_device.h"
Expand Down
Loading

0 comments on commit 7adcadb

Please sign in to comment.