From 55e5914ba443a1d446f648e1443f809df9d59840 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Thu, 2 Feb 2012 13:54:44 +0100
Subject: [PATCH]

--- yaml ---
r: 288651
b: refs/heads/master
c: 73323f541fe5f55a3b8a5c3d565bfc1efd64abf6
h: refs/heads/master
i:
  288649: 7627ccec427d3927533cfe2dd54639f3ec5cd546
  288647: 4c3f44a57ce96ffe25c8b28508f9d9417d98a15e
v: v3
---
 [refs]                                   |  2 +-
 trunk/Documentation/lockup-watchdogs.txt | 63 ------------------
 trunk/Documentation/nmi_watchdog.txt     | 83 ++++++++++++++++++++++++
 trunk/arch/x86/include/asm/inat.h        |  5 +-
 trunk/arch/x86/include/asm/insn.h        | 18 ++--
 trunk/arch/x86/lib/inat.c                | 36 +++++-----
 trunk/arch/x86/lib/insn.c                | 13 ++--
 trunk/kernel/watchdog.c                  | 24 +++----
 trunk/lib/Kconfig.debug                  | 18 ++---
 trunk/tools/perf/util/header.c           | 77 ++++++++++++++++----
 10 files changed, 199 insertions(+), 140 deletions(-)
 delete mode 100644 trunk/Documentation/lockup-watchdogs.txt
 create mode 100644 trunk/Documentation/nmi_watchdog.txt

diff --git a/[refs] b/[refs]
index 7faff1b5e47d..ee9643948bc8 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: f8d98f1095210da708a59f3a0b6fd267ad8f3f03
+refs/heads/master: 73323f541fe5f55a3b8a5c3d565bfc1efd64abf6
diff --git a/trunk/Documentation/lockup-watchdogs.txt b/trunk/Documentation/lockup-watchdogs.txt
deleted file mode 100644
index d2a36602ca8d..000000000000
--- a/trunk/Documentation/lockup-watchdogs.txt
+++ /dev/null
@@ -1,63 +0,0 @@
-===============================================================
-Softlockup detector and hardlockup detector (aka nmi_watchdog)
-===============================================================
-
-The Linux kernel can act as a watchdog to detect both soft and hard
-lockups.
-
-A 'softlockup' is defined as a bug that causes the kernel to loop in
-kernel mode for more than 20 seconds (see "Implementation" below for
-details), without giving other tasks a chance to run. The current
-stack trace is displayed upon detection and, by default, the system
-will stay locked up. Alternatively, the kernel can be configured to
-panic; a sysctl, "kernel.softlockup_panic", a kernel parameter,
-"softlockup_panic" (see "Documentation/kernel-parameters.txt" for
-details), and a compile option, "BOOTPARAM_HARDLOCKUP_PANIC", are
-provided for this.
-
-A 'hardlockup' is defined as a bug that causes the CPU to loop in
-kernel mode for more than 10 seconds (see "Implementation" below for
-details), without letting other interrupts have a chance to run.
-Similarly to the softlockup case, the current stack trace is displayed
-upon detection and the system will stay locked up unless the default
-behavior is changed, which can be done through a compile time knob,
-"BOOTPARAM_HARDLOCKUP_PANIC", and a kernel parameter, "nmi_watchdog"
-(see "Documentation/kernel-parameters.txt" for details).
-
-The panic option can be used in combination with panic_timeout (this
-timeout is set through the confusingly named "kernel.panic" sysctl),
-to cause the system to reboot automatically after a specified amount
-of time.
-
-=== Implementation ===
-
-The soft and hard lockup detectors are built on top of the hrtimer and
-perf subsystems, respectively. A direct consequence of this is that,
-in principle, they should work in any architecture where these
-subsystems are present.
-
-A periodic hrtimer runs to generate interrupts and kick the watchdog
-task. An NMI perf event is generated every "watchdog_thresh"
-(compile-time initialized to 10 and configurable through sysctl of the
-same name) seconds to check for hardlockups. If any CPU in the system
-does not receive any hrtimer interrupt during that time the
-'hardlockup detector' (the handler for the NMI perf event) will
-generate a kernel warning or call panic, depending on the
-configuration.
-
-The watchdog task is a high priority kernel thread that updates a
-timestamp every time it is scheduled. If that timestamp is not updated
-for 2*watchdog_thresh seconds (the softlockup threshold) the
-'softlockup detector' (coded inside the hrtimer callback function)
-will dump useful debug information to the system log, after which it
-will call panic if it was instructed to do so or resume execution of
-other kernel code.
-
-The period of the hrtimer is 2*watchdog_thresh/5, which means it has
-two or three chances to generate an interrupt before the hardlockup
-detector kicks in.
-
-As explained above, a kernel knob is provided that allows
-administrators to configure the period of the hrtimer and the perf
-event. The right value for a particular environment is a trade-off
-between fast response to lockups and detection overhead.
diff --git a/trunk/Documentation/nmi_watchdog.txt b/trunk/Documentation/nmi_watchdog.txt
new file mode 100644
index 000000000000..bf9f80a98282
--- /dev/null
+++ b/trunk/Documentation/nmi_watchdog.txt
@@ -0,0 +1,83 @@
+
+[NMI watchdog is available for x86 and x86-64 architectures]
+
+Is your system locking up unpredictably? No keyboard activity, just
+a frustrating complete hard lockup? Do you want to help us debug
+such lockups? If so, then this document is definitely for you.
+
+On many x86/x86-64 type systems there is a feature that enables
+us to generate 'watchdog NMI interrupts'. (NMI: Non-Maskable Interrupt,
+which gets executed even if the system is otherwise locked up hard.)
+This can be used to debug hard kernel lockups. By executing periodic
+NMI interrupts, the kernel can monitor whether any CPU has locked up,
+and print out debugging messages if so.
+
+In order to use the NMI watchdog, you need to have APIC support in your
+kernel. For SMP kernels, APIC support gets compiled in automatically. For
+UP, enable either CONFIG_X86_UP_APIC (Processor type and features -> Local
+APIC support on uniprocessors) or CONFIG_X86_UP_IOAPIC (Processor type and
+features -> IO-APIC support on uniprocessors) in your kernel config.
+CONFIG_X86_UP_APIC is for uniprocessor machines without an IO-APIC.
+CONFIG_X86_UP_IOAPIC is for uniprocessor machines with an IO-APIC. [Note:
+certain kernel debugging options, such as Kernel Stack Meter or Kernel
+Tracer, may implicitly disable the NMI watchdog.]
+
+For x86-64, the needed APIC is always compiled in.
+
+Using the local APIC (nmi_watchdog=2) needs the first performance register,
+so you can't use it for other purposes (such as high precision performance
+profiling). However, at least oprofile and the perfctr driver disable the
+local APIC NMI watchdog automatically.
+
+To actually enable the NMI watchdog, use the 'nmi_watchdog=N' boot
+parameter. E.g. the relevant lilo.conf entry:
+
+   append="nmi_watchdog=1"
+
+For SMP machines and UP machines with an IO-APIC use nmi_watchdog=1.
+For UP machines without an IO-APIC use nmi_watchdog=2; this only works
+for some processor types. If in doubt, boot with nmi_watchdog=1 and
+check the NMI count in /proc/interrupts; if the count is zero then
+reboot with nmi_watchdog=2 and check the NMI count. If it is still
+zero then report the problem; you probably have a processor that needs
+to be added to the nmi code.
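
[Editorial aside, not part of the patch text: the /proc/interrupts check
suggested above is easy to automate. A minimal C sketch that prints the
per-CPU NMI counts follows; it assumes nothing beyond the /proc/interrupts
layout described in the paragraph above.]

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
      char line[4096];
      FILE *f = fopen("/proc/interrupts", "r");

      if (!f) {
          perror("/proc/interrupts");
          return 1;
      }
      /* print the row labelled "NMI:"; one count column per CPU */
      while (fgets(line, sizeof(line), f)) {
          const char *p = line + strspn(line, " \t");

          if (!strncmp(p, "NMI:", 4)) {
              fputs(line, stdout);
              break;
          }
      }
      fclose(f);
      return 0;
  }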
+
+A 'lockup' is the following scenario: if any CPU in the system does not
+execute the periodic local timer interrupt for more than 5 seconds, then
+the NMI handler generates an oops and kills the process. This
+'controlled crash' (and the resulting kernel messages) can be used to
+debug the lockup. Thus whenever the lockup happens, wait 5 seconds and
+the oops will show up automatically. If the kernel produces no messages
+then the system has crashed so hard (e.g. hardware-wise) that either it
+cannot even accept NMI interrupts, or the crash has made the kernel
+unable to print messages.
+
+Be aware that when using the local APIC, the frequency of NMI interrupts
+it generates depends on the system load. The local APIC NMI watchdog,
+lacking a better source, uses the "cycles unhalted" event. As you may
+guess, it doesn't tick when the CPU is in the halted state (which happens
+when the system is idle), but if your system locks up on anything but the
+"hlt" processor instruction, the watchdog will trigger very soon as the
+"cycles unhalted" event will happen every clock tick. If it locks up on
+"hlt", then you are out of luck -- the event will not happen at all and the
+watchdog won't trigger. This is a shortcoming of the local APIC watchdog
+-- unfortunately there is no "clock ticks" event that would work all the
+time. The I/O APIC watchdog is driven externally and has no such shortcoming.
+But its NMI frequency is much higher, resulting in a more significant hit
+to the overall system performance.
+
+On x86 nmi_watchdog is disabled by default, so you have to enable it with
+a boot time parameter.
+
+It's possible to disable the NMI watchdog at run time by writing "0" to
+/proc/sys/kernel/nmi_watchdog. Writing "1" to the same file will re-enable
+the NMI watchdog. Note that you still need to use the "nmi_watchdog="
+parameter at boot time.
+
+NOTE: In kernels prior to 2.4.2-ac18 the NMI-oopser is enabled unconditionally
+on x86 SMP boxes.
+
+[ feel free to send bug reports, suggestions and patches to
+  Ingo Molnar or the Linux SMP mailing
+  list at ]
+
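[Editorial aside, not part of the patch text: the run-time toggle the new
document describes can also be driven from a program. A minimal C sketch,
assuming only the /proc/sys/kernel/nmi_watchdog interface named above
(needs root):]

  #include <stdio.h>

  /* write "0" or "1" to /proc/sys/kernel/nmi_watchdog */
  static int set_nmi_watchdog(int enable)
  {
      FILE *f = fopen("/proc/sys/kernel/nmi_watchdog", "w");

      if (!f)
          return -1;
      if (fprintf(f, "%d\n", enable ? 1 : 0) < 0) {
          fclose(f);
          return -1;
      }
      return fclose(f) ? -1 : 0;
  }

  int main(void)
  {
      /* disable, then re-enable; the boot-time "nmi_watchdog="
       * parameter must still have been given, as noted above */
      if (set_nmi_watchdog(0) || set_nmi_watchdog(1)) {
          perror("nmi_watchdog");
          return 1;
      }
      return 0;
  }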
diff --git a/trunk/arch/x86/include/asm/inat.h b/trunk/arch/x86/include/asm/inat.h
index 74a2e312e8a2..205b063e3e32 100644
--- a/trunk/arch/x86/include/asm/inat.h
+++ b/trunk/arch/x86/include/asm/inat.h
@@ -97,12 +97,11 @@
 
 /* Attribute search APIs */
 extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
-extern int inat_get_last_prefix_id(insn_byte_t last_pfx);
 extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
-					     int lpfx_id,
+					     insn_byte_t last_pfx,
 					     insn_attr_t esc_attr);
 extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
-					    int lpfx_id,
+					    insn_byte_t last_pfx,
 					    insn_attr_t esc_attr);
 extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
 					  insn_byte_t vex_m,
diff --git a/trunk/arch/x86/include/asm/insn.h b/trunk/arch/x86/include/asm/insn.h
index 48eb30a86062..74df3f1eddfd 100644
--- a/trunk/arch/x86/include/asm/insn.h
+++ b/trunk/arch/x86/include/asm/insn.h
@@ -96,6 +96,12 @@ struct insn {
 #define X86_VEX_P(vex)	((vex) & 0x03)	/* VEX3 Byte2, VEX2 Byte1 */
 #define X86_VEX_M_MAX	0x1f		/* VEX3.M Maximum value */
 
+/* The last prefix is needed for two-byte and three-byte opcodes */
+static inline insn_byte_t insn_last_prefix(struct insn *insn)
+{
+	return insn->prefixes.bytes[3];
+}
+
 extern void insn_init(struct insn *insn, const void *kaddr, int x86_64);
 extern void insn_get_prefixes(struct insn *insn);
 extern void insn_get_opcode(struct insn *insn);
@@ -154,18 +160,6 @@ static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
 	return X86_VEX_P(insn->vex_prefix.bytes[2]);
 }
 
-/* Get the last prefix id from last prefix or VEX prefix */
-static inline int insn_last_prefix_id(struct insn *insn)
-{
-	if (insn_is_avx(insn))
-		return insn_vex_p_bits(insn);	/* VEX_p is a SIMD prefix id */
-
-	if (insn->prefixes.bytes[3])
-		return inat_get_last_prefix_id(insn->prefixes.bytes[3]);
-
-	return 0;
-}
-
 /* Offset of each field from kaddr */
 static inline int insn_offset_rex_prefix(struct insn *insn)
 {
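[Editorial aside, not part of the patch text: after this change, escape and
group lookups receive the raw last-prefix byte that insn_last_prefix()
returns instead of a pre-computed prefix id. A hedged in-kernel sketch of
how a caller sees the helper; it assumes the decoder headers above plus the
kernel's printk facility, and is illustrative only:]

  #include <linux/printk.h>
  #include <asm/insn.h>

  static void insn_last_prefix_example(void)
  {
      /* popcnt %eax,%eax: 0xf3 is the mandatory prefix selecting the
       * popcnt variant of the two-byte opcode 0x0f 0xb8 */
      static const unsigned char buf[] = { 0xf3, 0x0f, 0xb8, 0xc0 };
      struct insn insn;

      insn_init(&insn, buf, 1);    /* 1: decode as x86-64 */
      insn_get_opcode(&insn);      /* collects prefixes first if needed */

      /* inat_get_escape_attribute() now receives 0xf3 directly */
      pr_info("last prefix: %#x, opcode bytes: %d\n",
              insn_last_prefix(&insn), insn.opcode.nbytes);
  }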
diff --git a/trunk/arch/x86/lib/inat.c b/trunk/arch/x86/lib/inat.c
index c1f01a8e9f65..88ad5fbda6e1 100644
--- a/trunk/arch/x86/lib/inat.c
+++ b/trunk/arch/x86/lib/inat.c
@@ -29,46 +29,46 @@ insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
 	return inat_primary_table[opcode];
 }
 
-int inat_get_last_prefix_id(insn_byte_t last_pfx)
-{
-	insn_attr_t lpfx_attr;
-
-	lpfx_attr = inat_get_opcode_attribute(last_pfx);
-	return inat_last_prefix_id(lpfx_attr);
-}
-
-insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id,
+insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx,
 				      insn_attr_t esc_attr)
 {
 	const insn_attr_t *table;
-	int n;
+	insn_attr_t lpfx_attr;
+	int n, m = 0;
 
 	n = inat_escape_id(esc_attr);
-
+	if (last_pfx) {
+		lpfx_attr = inat_get_opcode_attribute(last_pfx);
+		m = inat_last_prefix_id(lpfx_attr);
+	}
 	table = inat_escape_tables[n][0];
 	if (!table)
 		return 0;
-	if (inat_has_variant(table[opcode]) && lpfx_id) {
-		table = inat_escape_tables[n][lpfx_id];
+	if (inat_has_variant(table[opcode]) && m) {
+		table = inat_escape_tables[n][m];
 		if (!table)
 			return 0;
 	}
 	return table[opcode];
 }
 
-insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id,
+insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx,
 				     insn_attr_t grp_attr)
 {
 	const insn_attr_t *table;
-	int n;
+	insn_attr_t lpfx_attr;
+	int n, m = 0;
 
 	n = inat_group_id(grp_attr);
-
+	if (last_pfx) {
+		lpfx_attr = inat_get_opcode_attribute(last_pfx);
+		m = inat_last_prefix_id(lpfx_attr);
+	}
 	table = inat_group_tables[n][0];
 	if (!table)
 		return inat_group_common_attribute(grp_attr);
-	if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) {
-		table = inat_group_tables[n][lpfx_id];
+	if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) {
+		table = inat_group_tables[n][m];
 		if (!table)
 			return inat_group_common_attribute(grp_attr);
 	}
diff --git a/trunk/arch/x86/lib/insn.c b/trunk/arch/x86/lib/insn.c
index 25feb1ae71c5..5a1f9f3e3fbb 100644
--- a/trunk/arch/x86/lib/insn.c
+++ b/trunk/arch/x86/lib/insn.c
@@ -185,8 +185,7 @@ void insn_get_prefixes(struct insn *insn)
 void insn_get_opcode(struct insn *insn)
 {
 	struct insn_field *opcode = &insn->opcode;
-	insn_byte_t op;
-	int pfx_id;
+	insn_byte_t op, pfx;
 	if (opcode->got)
 		return;
 	if (!insn->prefixes.got)
@@ -213,8 +212,8 @@ void insn_get_opcode(struct insn *insn)
 		/* Get escaped opcode */
 		op = get_next(insn_byte_t, insn);
 		opcode->bytes[opcode->nbytes++] = op;
-		pfx_id = insn_last_prefix_id(insn);
-		insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr);
+		pfx = insn_last_prefix(insn);
+		insn->attr = inat_get_escape_attribute(op, pfx, insn->attr);
 	}
 	if (inat_must_vex(insn->attr))
 		insn->attr = 0;	/* This instruction is bad */
@@ -236,7 +235,7 @@ void insn_get_opcode(struct insn *insn)
 void insn_get_modrm(struct insn *insn)
 {
 	struct insn_field *modrm = &insn->modrm;
-	insn_byte_t pfx_id, mod;
+	insn_byte_t pfx, mod;
 	if (modrm->got)
 		return;
 	if (!insn->opcode.got)
@@ -247,8 +246,8 @@ void insn_get_modrm(struct insn *insn)
 		modrm->value = mod;
 		modrm->nbytes = 1;
 		if (inat_is_group(insn->attr)) {
-			pfx_id = insn_last_prefix_id(insn);
-			insn->attr = inat_get_group_attribute(mod, pfx_id,
+			pfx = insn_last_prefix(insn);
+			insn->attr = inat_get_group_attribute(mod, pfx,
 							      insn->attr);
 			if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
 				insn->attr = 0;	/* This is bad */
diff --git a/trunk/kernel/watchdog.c b/trunk/kernel/watchdog.c
index 14bc092fb12c..d117262deba3 100644
--- a/trunk/kernel/watchdog.c
+++ b/trunk/kernel/watchdog.c
@@ -3,9 +3,12 @@
 *
 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
 *
- * Note: Most of this code is borrowed heavily from the original softlockup
- * detector, so thanks to Ingo for the initial implementation.
- * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
+ * this code detects hard lockups: incidents where, on a CPU, the
+ * kernel does not respond to anything except NMI.
+ *
+ * Note: Most of this code is borrowed heavily from softlockup.c,
+ * so thanks to Ingo for the initial implementation.
+ * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
 * to those contributors as well.
 */
 
@@ -114,10 +117,9 @@ static unsigned long get_sample_period(void)
 {
 	/*
 	 * convert watchdog_thresh from seconds to ns
-	 * the divide by 5 is to give hrtimer several chances (two
-	 * or three with the current relation between the soft
-	 * and hard thresholds) to increment before the
-	 * hardlockup detector generates a warning
+	 * the divide by 5 is to give hrtimer 5 chances to
+	 * increment before the hardlockup detector generates
+	 * a warning
 	 */
 	return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
 }
@@ -334,11 +336,9 @@ static int watchdog(void *unused)
 	set_current_state(TASK_INTERRUPTIBLE);
 
 	/*
-	 * Run briefly (kicked by the hrtimer callback function) once every
-	 * get_sample_period() seconds (4 seconds by default) to reset the
-	 * softlockup timestamp. If this gets delayed for more than
-	 * 2*watchdog_thresh seconds then the debug-printout triggers in
-	 * watchdog_timer_fn().
+	 * Run briefly once per second to reset the softlockup timestamp.
+	 * If this gets delayed for more than 60 seconds then the
+	 * debug-printout triggers in watchdog_timer_fn().
 	 */
 	while (!kthread_should_stop()) {
 		__touch_watchdog();
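[Editorial aside, not part of the patch text: worked through, the
get_sample_period() arithmetic above matches the "every 10-12 seconds"
wording restored in lib/Kconfig.debug below. A minimal user-space C sketch,
assuming the 60-second softlockup threshold the restored comments refer to:]

  #include <stdio.h>

  #define NSEC_PER_SEC 1000000000ULL

  int main(void)
  {
      unsigned long long softlockup_thresh = 60;  /* seconds, per the text */
      /* divide by 5: the hrtimer gets several chances to update the
       * timestamp before the hardlockup detector warns */
      unsigned long long period = softlockup_thresh * (NSEC_PER_SEC / 5);

      printf("hrtimer period: %llu ns = %llu s\n",
             period, period / NSEC_PER_SEC);  /* 12000000000 ns = 12 s */
      return 0;
  }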
diff --git a/trunk/lib/Kconfig.debug b/trunk/lib/Kconfig.debug
index 9739c0b45e93..8745ac7d1f75 100644
--- a/trunk/lib/Kconfig.debug
+++ b/trunk/lib/Kconfig.debug
@@ -166,21 +166,18 @@ config LOCKUP_DETECTOR
 	  hard and soft lockups.
 
 	  Softlockups are bugs that cause the kernel to loop in kernel
-	  mode for more than 20 seconds, without giving other tasks a
+	  mode for more than 60 seconds, without giving other tasks a
 	  chance to run. The current stack trace is displayed upon
 	  detection and the system will stay locked up.
 
 	  Hardlockups are bugs that cause the CPU to loop in kernel mode
-	  for more than 10 seconds, without letting other interrupts have a
+	  for more than 60 seconds, without letting other interrupts have a
 	  chance to run. The current stack trace is displayed upon detection
 	  and the system will stay locked up.
 
 	  The overhead should be minimal. A periodic hrtimer runs to
-	  generate interrupts and kick the watchdog task every 4 seconds.
-	  An NMI is generated every 10 seconds or so to check for hardlockups.
-
-	  The frequency of hrtimer and NMI events and the soft and hard lockup
-	  thresholds can be controlled through the sysctl watchdog_thresh.
+	  generate interrupts and kick the watchdog task every 10-12 seconds.
+	  An NMI is generated every 60 seconds or so to check for hardlockups.
 
 config HARDLOCKUP_DETECTOR
 	def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI && \
@@ -192,8 +189,7 @@ config BOOTPARAM_HARDLOCKUP_PANIC
 	help
 	  Say Y here to enable the kernel to panic on "hard lockups",
 	  which are bugs that cause the kernel to loop in kernel
-	  mode with interrupts disabled for more than 10 seconds (configurable
-	  using the watchdog_thresh sysctl).
+	  mode with interrupts disabled for more than 60 seconds.
 
 	  Say N if unsure.
 
@@ -210,8 +206,8 @@ config BOOTPARAM_SOFTLOCKUP_PANIC
 	help
 	  Say Y here to enable the kernel to panic on "soft lockups",
 	  which are bugs that cause the kernel to loop in kernel
-	  mode for more than 20 seconds (configurable using the watchdog_thresh
-	  sysctl), without giving other tasks a chance to run.
+	  mode for more than 60 seconds, without giving other tasks a
+	  chance to run.
 
 	  The panic can be used in combination with panic_timeout,
 	  to cause the system to reboot automatically after a
diff --git a/trunk/tools/perf/util/header.c b/trunk/tools/perf/util/header.c
index ecd7f4dd7eea..6f4187de4b72 100644
--- a/trunk/tools/perf/util/header.c
+++ b/trunk/tools/perf/util/header.c
@@ -63,9 +63,20 @@ char *perf_header__find_event(u64 id)
 	return NULL;
 }
 
-static const char *__perf_magic = "PERFFILE";
+/*
+ * magic2 = "PERFILE2"
+ * must be stored as a numerical value so that the
+ * endianness determines the memory layout. That way we
+ * are able to detect the endianness when reading the
+ * perf.data file back.
+ *
+ * we also check for the legacy (PERFFILE) format.
+ */
+static const char *__perf_magic1 = "PERFFILE";
+static const u64 __perf_magic2 = 0x32454c4946524550ULL;
+static const u64 __perf_magic2_sw = 0x50455246494c4532ULL;
 
-#define PERF_MAGIC	(*(u64 *)__perf_magic)
+#define PERF_MAGIC	__perf_magic2
 
 struct perf_file_attr {
 	struct perf_event_attr	attr;
@@ -1620,24 +1631,59 @@ int perf_header__process_sections(struct perf_header *header, int fd,
 	return err;
 }
 
+static int check_magic_endian(u64 *magic, struct perf_file_header *header,
+			      struct perf_header *ph)
+{
+	int ret;
+
+	/* check for legacy format */
+	ret = memcmp(magic, __perf_magic1, sizeof(*magic));
+	if (ret == 0) {
+		pr_debug("legacy perf.data format\n");
+		if (!header)
+			return -1;
+
+		if (header->attr_size != sizeof(struct perf_file_attr)) {
+			u64 attr_size = bswap_64(header->attr_size);
+
+			if (attr_size != sizeof(struct perf_file_attr))
+				return -1;
+
+			ph->needs_swap = true;
+		}
+		return 0;
+	}
+
+	/* check magic number with same endianness */
+	if (*magic == __perf_magic2)
+		return 0;
+
+	/* check magic number but opposite endianness */
+	if (*magic != __perf_magic2_sw)
+		return -1;
+
+	ph->needs_swap = true;
+
+	return 0;
+}
+
 int perf_file_header__read(struct perf_file_header *header,
 			   struct perf_header *ph, int fd)
 {
+	int ret;
+
 	lseek(fd, 0, SEEK_SET);
 
-	if (readn(fd, header, sizeof(*header)) <= 0 ||
-	    memcmp(&header->magic, __perf_magic, sizeof(header->magic)))
+	ret = readn(fd, header, sizeof(*header));
+	if (ret <= 0)
 		return -1;
 
-	if (header->attr_size != sizeof(struct perf_file_attr)) {
-		u64 attr_size = bswap_64(header->attr_size);
-
-		if (attr_size != sizeof(struct perf_file_attr))
-			return -1;
+	if (check_magic_endian(&header->magic, header, ph) < 0)
+		return -1;
 
+	if (ph->needs_swap) {
 		mem_bswap_64(header, offsetof(struct perf_file_header,
-			     adds_features));
-		ph->needs_swap = true;
+			     adds_features));
 	}
 
 	if (header->size != sizeof(*header)) {
@@ -1873,8 +1919,13 @@ static int perf_file_header__read_pipe(struct perf_pipe_file_header *header,
 				       struct perf_header *ph, int fd,
 				       bool repipe)
 {
-	if (readn(fd, header, sizeof(*header)) <= 0 ||
-	    memcmp(&header->magic, __perf_magic, sizeof(header->magic)))
+	int ret;
+
+	ret = readn(fd, header, sizeof(*header));
+	if (ret <= 0)
+		return -1;
+
+	if (check_magic_endian(&header->magic, NULL, ph) < 0)
 		return -1;
 
 	if (repipe && do_write(STDOUT_FILENO, header, sizeof(*header)) < 0)
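
[Editorial aside, not part of the patch text: the reason magic2 is stored as
a number is visible in check_magic_endian() above. Written natively by perf,
the u64 0x32454c4946524550 lands on disk as the bytes "PERFILE2" on a
little-endian host; a reader of the opposite endianness sees
0x50455246494c4532 (__perf_magic2_sw) and sets needs_swap. A standalone C
sketch of the same detection logic, assuming a file written on a
little-endian machine:]

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  #define MAGIC2    0x32454c4946524550ULL  /* "PERFILE2", native order */
  #define MAGIC2_SW 0x50455246494c4532ULL  /* byte-swapped form */

  int main(void)
  {
      /* simulate the first 8 bytes of a perf.data file written
       * on a little-endian machine */
      uint64_t magic;
      memcpy(&magic, "PERFILE2", sizeof(magic));

      if (magic == MAGIC2)
          puts("same endianness as the writer, no swapping needed");
      else if (magic == MAGIC2_SW)
          puts("opposite endianness, set needs_swap");
      else
          puts("not a perf.data v2 file");
      return 0;
  }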