Skip to content

Commit

Permalink
---
Browse files Browse the repository at this point in the history
yaml
---
r: 199266
b: refs/heads/master
c: edbe77b
h: refs/heads/master
v: v3
  • Loading branch information
Len Brown committed May 28, 2010
1 parent 0caa3bc commit 7917ff6
Show file tree
Hide file tree
Showing 50 changed files with 4,437 additions and 409 deletions.
2 changes: 1 addition & 1 deletion [refs]
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
---
refs/heads/master: 0dc698b93f3eecdda43b22232131324eb41e510c
refs/heads/master: edbe77ba94217868caf5f391d2a083729bef3742
59 changes: 59 additions & 0 deletions trunk/Documentation/acpi/apei/einj.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
APEI Error INJection
~~~~~~~~~~~~~~~~~~~~

EINJ provides a hardware error injection mechanism.
It is very useful for debugging and testing of other APEI and RAS features.

To use EINJ, make sure the following are enabled in your kernel
configuration:

CONFIG_DEBUG_FS
CONFIG_ACPI_APEI
CONFIG_ACPI_APEI_EINJ

The user interface of EINJ is in the debug file system, under the
directory apei/einj. The following files are provided.

- available_error_type
Reading this file returns the error injection capability of the
platform, that is, which error types are supported. The error type
definitions are as follows; the left field is the error type value and
the right field is the error description.

0x00000001 Processor Correctable
0x00000002 Processor Uncorrectable non-fatal
0x00000004 Processor Uncorrectable fatal
0x00000008 Memory Correctable
0x00000010 Memory Uncorrectable non-fatal
0x00000020 Memory Uncorrectable fatal
0x00000040 PCI Express Correctable
0x00000080 PCI Express Uncorrectable non-fatal
0x00000100 PCI Express Uncorrectable fatal
0x00000200 Platform Correctable
0x00000400 Platform Uncorrectable non-fatal
0x00000800 Platform Uncorrectable fatal

The format of the file contents is as above, except that only the
lines for the available error types are present.

- error_type
This file is used to set the error type value. The error type value
is defined in "available_error_type" description.

- error_inject
Write any integer to this file to trigger the error
injection. Before this, please specify all necessary error
parameters.

- param1
This file is used to set the first error parameter value. Effect of
parameter depends on error_type specified. For memory error, this is
physical memory address.

- param2
This file is used to set the second error parameter value. Effect of
parameter depends on error_type specified. For memory error, this is
physical memory address mask.

For more information about EINJ, please refer to ACPI specification
version 4.0, section 17.5.
9 changes: 9 additions & 0 deletions trunk/Documentation/kernel-parameters.txt
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,10 @@ and is between 256 and 4096 characters. It is defined in the file
Default value is 0.
Value can be changed at runtime via /selinux/enforce.

erst_disable [ACPI]
Disable Error Record Serialization Table (ERST)
support.

ether= [HW,NET] Ethernet cards parameters
This option is obsoleted by the "netdev=" option, which
has equivalent usage. See its documentation for details.
Expand Down Expand Up @@ -843,6 +847,11 @@ and is between 256 and 4096 characters. It is defined in the file
hd= [EIDE] (E)IDE hard drive subsystem geometry
Format: <cyl>,<head>,<sect>

hest_disable [ACPI]
Disable Hardware Error Source Table (HEST) support;
corresponding firmware-first mode error processing
logic will be disabled.

highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact
size of <nn>. This works even on boxes that have no
highmem otherwise. This also works to reduce highmem
Expand Down
5 changes: 4 additions & 1 deletion trunk/arch/ia64/pci/pci.c
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,11 @@ pcibios_setup_root_windows(struct pci_bus *bus, struct pci_controller *ctrl)
}

struct pci_bus * __devinit
pci_acpi_scan_root(struct acpi_device *device, int domain, int bus)
pci_acpi_scan_root(struct acpi_pci_root *root)
{
struct acpi_device *device = root->device;
int domain = root->segment;
int bus = root->secondary.start;
struct pci_controller *controller;
unsigned int windows = 0;
struct pci_bus *pbus;
Expand Down
8 changes: 8 additions & 0 deletions trunk/arch/x86/include/asm/mce.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,5 +225,13 @@ extern void mcheck_intel_therm_init(void);
static inline void mcheck_intel_therm_init(void) { }
#endif

/*
* Used by APEI to report memory error via /dev/mcelog
*
* struct cper_sec_mem_err is only forward-declared here: the prototype
* below needs nothing more than a pointer to it.
*/

struct cper_sec_mem_err;
extern void apei_mce_report_mem_error(int corrected,
struct cper_sec_mem_err *mem_err);

#endif /* __KERNEL__ */
#endif /* _ASM_X86_MCE_H */
2 changes: 0 additions & 2 deletions trunk/arch/x86/kernel/acpi/sleep.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,6 @@ static int __init acpi_sleep_setup(char *str)
#endif
if (strncmp(str, "old_ordering", 12) == 0)
acpi_old_suspend_ordering();
if (strncmp(str, "sci_force_enable", 16) == 0)
acpi_set_sci_en_on_resume();
str = strchr(str, ',');
if (str != NULL)
str += strspn(str, ", \t");
Expand Down
2 changes: 2 additions & 0 deletions trunk/arch/x86/kernel/cpu/mcheck/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o

obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o

obj-$(CONFIG_ACPI_APEI) += mce-apei.o
138 changes: 138 additions & 0 deletions trunk/arch/x86/kernel/cpu/mcheck/mce-apei.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/*
* Bridge between MCE and APEI
*
* On some machines, corrected memory errors are reported via APEI
* generic hardware error source (GHES) instead of corrected Machine
* Check. These corrected memory errors can be reported to user space
* through /dev/mcelog via faking a corrected Machine Check, so that
* the error memory page can be offlined by /sbin/mcelog if the error
* count for one page is beyond the threshold.
*
* For fatal MCE, save MCE record into persistent storage via ERST, so
* that the MCE record can be logged after reboot via ERST.
*
* Copyright 2010 Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/cper.h>
#include <acpi/apei.h>
#include <asm/mce.h>

#include "mce-internal.h"

void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
{
struct mce m;

/* Only corrected MC is reported */
if (!corrected)
return;

mce_setup(&m);
m.bank = 1;
/* Fake a memory read corrected error with unknown channel */
m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
m.addr = mem_err->physical_addr;
mce_log(&m);
mce_notify_irq();
}
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);

/*
 * UUIDs (built with UUID_LE) that mark the MCE records in this file:
 * CPER_CREATOR_MCE is stored in the record header's creator_id by
 * apei_write_mce() and compared against it in apei_read_mce();
 * CPER_SECTION_TYPE_MCE is stored as the section_type of the section
 * that carries the struct mce.
 */
#define CPER_CREATOR_MCE \
UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
0x64, 0x90, 0xb8, 0x9d)
#define CPER_SECTION_TYPE_MCE \
UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
0x04, 0x4a, 0x38, 0xfc)

/*
* Record layout handed to erst_write() and filled in by erst_read_next():
* a CPER record header, a single section descriptor, and the struct mce
* itself as the section body.
*
* CPER specification (in UEFI specification 2.3 appendix N) requires
* byte-packed.
*/
struct cper_mce_record {
struct cper_record_header hdr;	/* CPER record header */
struct cper_section_descriptor sec_hdr;	/* descriptor of the one section */
struct mce mce;	/* section body: the MCE record */
} __packed;

int apei_write_mce(struct mce *m)
{
struct cper_mce_record rcd;

memset(&rcd, 0, sizeof(rcd));
memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
rcd.hdr.revision = CPER_RECORD_REV;
rcd.hdr.signature_end = CPER_SIG_END;
rcd.hdr.section_count = 1;
rcd.hdr.error_severity = CPER_SER_FATAL;
/* timestamp, platform_id, partition_id are all invalid */
rcd.hdr.validation_bits = 0;
rcd.hdr.record_length = sizeof(rcd);
rcd.hdr.creator_id = CPER_CREATOR_MCE;
rcd.hdr.notification_type = CPER_NOTIFY_MCE;
rcd.hdr.record_id = cper_next_record_id();
rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;

rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
rcd.sec_hdr.section_length = sizeof(rcd.mce);
rcd.sec_hdr.revision = CPER_SEC_REV;
/* fru_id and fru_text is invalid */
rcd.sec_hdr.validation_bits = 0;
rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
rcd.sec_hdr.section_severity = CPER_SER_FATAL;

memcpy(&rcd.mce, m, sizeof(*m));

return erst_write(&rcd.hdr);
}

/*
 * Read the next record from ERST and extract the MCE record from it.
 *
 * @m:         filled with the stored struct mce on success.
 * @record_id: filled with the CPER record_id of the record on success.
 *
 * Returns sizeof(*m) on success. If erst_read_next() returns a value
 * <= 0, that value is returned unchanged. Returns -EIO when the record
 * read does not have the size of struct cper_mce_record or was not
 * created with CPER_CREATOR_MCE.
 */
ssize_t apei_read_mce(struct mce *m, u64 *record_id)
{
	struct cper_mce_record rcd;
	ssize_t len;

	len = erst_read_next(&rcd.hdr, sizeof(rcd));
	if (len <= 0)
		return len;
	/* Can not skip other records in storage via ERST unless clear them */
	else if (len != sizeof(rcd) ||
		 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
		/* Terminate the message with a newline so the log line is complete */
		if (printk_ratelimit())
			pr_warning(
			"MCE-APEI: Can not skip the unknown record in ERST\n");
		return -EIO;
	}

	memcpy(m, &rcd.mce, sizeof(*m));
	*record_id = rcd.hdr.record_id;

	return sizeof(*m);
}

/*
 * Check whether there is record in ERST.
 *
 * Returns the value of erst_get_record_count() unchanged.
 * NOTE(review): presumably non-zero means records are pending and a
 * negative value is an error -- confirm against the ERST code.
 */
int apei_check_mce(void)
{
return erst_get_record_count();
}

/*
 * Pass @record_id (as filled in by apei_read_mce()) to erst_clear() and
 * return its result.
 */
int apei_clear_mce(u64 record_id)
{
return erst_clear(record_id);
}
23 changes: 23 additions & 0 deletions trunk/arch/x86/kernel/cpu/mcheck/mce-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,26 @@ extern int mce_ser;

extern struct mce_bank *mce_banks;

/*
 * Interface for storing/retrieving MCE records through APEI. With
 * CONFIG_ACPI_APEI set only the prototypes are declared here; otherwise
 * the inline stubs below are used.
 */
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
ssize_t apei_read_mce(struct mce *m, u64 *record_id);
int apei_check_mce(void);
int apei_clear_mce(u64 record_id);
#else
/* Stub: writing is not possible, always fails with -EINVAL */
static inline int apei_write_mce(struct mce *m)
{
return -EINVAL;
}
/* Stub: always returns 0 and leaves @m and @record_id untouched */
static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
{
return 0;
}
/* Stub: always returns 0 */
static inline int apei_check_mce(void)
{
return 0;
}
/* Stub: clearing is not possible, always fails with -EINVAL */
static inline int apei_clear_mce(u64 record_id)
{
return -EINVAL;
}
#endif
Loading

0 comments on commit 7917ff6

Please sign in to comment.