Skip to content

Commit

Permalink
lockup_detector: Combine nmi_watchdog and softlockup detector
Browse files Browse the repository at this point in the history
The new nmi_watchdog (which uses the perf event subsystem) is very
similar in structure to the softlockup detector.  Using Ingo's
suggestion, I combined the two functionalities into one file:
kernel/watchdog.c.

Now both the nmi_watchdog (or hardlockup detector) and softlockup
detector sit on top of the perf event subsystem, which is run every
60 seconds or so to see if there are any lockups.

To detect hardlockups (cpus not responding to interrupts), I
implemented an hrtimer that runs 5 times for every perf event
overflow event.  If that stops counting on a cpu, then the cpu is
most likely in trouble.

To detect softlockups (tasks not yielding to the scheduler), I used the
previous kthread idea that now gets kicked every time the hrtimer fires.
If the kthread isn't being scheduled, neither is anyone else, and the
warning is printed to the console.

I tested this on x86_64 and both the softlockup and hardlockup paths
work.

V2:
- cleaned up the Kconfig and softlockup combination
- surrounded hardlockup cases with #ifdef CONFIG_PERF_EVENTS_NMI
- separated out the softlockup case from the perf event subsystem
- re-arranged the enabling/disabling nmi watchdog from proc space
- added cpumasks for hardlockup failure cases
- removed fallback to soft events if no PMU exists for hard events

V3:
- comment cleanups
- drop support for older softlockup code
- per_cpu cleanups
- completely remove software clock based hardlockup detector
- use per_cpu masking on hard/soft lockup detection
- #ifdef cleanups
- rename config option NMI_WATCHDOG to LOCKUP_DETECTOR
- documentation additions

V4:
- documentation fixes
- convert per_cpu to __get_cpu_var
- powerpc compile fixes

V5:
- split apart warn flags for hard and soft lockups

TODO:
- figure out how to make an arch-agnostic clock2cycles call
  (if possible) to feed into perf events as a sample period

[fweisbec: merged conflict patch]

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
LKML-Reference: <1273266711-18706-2-git-send-email-dzickus@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
  • Loading branch information
Don Zickus authored and Frederic Weisbecker committed May 12, 2010
1 parent a9aa1d0 commit 58687ac
Show file tree
Hide file tree
Showing 12 changed files with 650 additions and 29 deletions.
2 changes: 2 additions & 0 deletions Documentation/kernel-parameters.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1777,6 +1777,8 @@ and is between 256 and 4096 characters. It is defined in the file

nousb [USB] Disable the USB subsystem

nowatchdog [KNL] Disable the lockup detector.

nowb [ARM]

nox2apic [X86-64,APIC] Do not enable x2APIC mode.
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/include/asm/nmi.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu);

extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
extern int check_nmi_watchdog(void);
#if !defined(CONFIG_NMI_WATCHDOG)
#if !defined(CONFIG_LOCKUP_DETECTOR)
extern int nmi_watchdog_enabled;
#endif
extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
Expand Down
4 changes: 2 additions & 2 deletions arch/x86/kernel/apic/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
#

obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o
ifneq ($(CONFIG_NMI_WATCHDOG),y)
ifneq ($(CONFIG_LOCKUP_DETECTOR),y)
obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
endif
obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o
obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o

obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SMP) += ipi.o
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/kernel/apic/hw_nmi.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs)

u64 hw_nmi_get_sample_period(void)
{
return cpu_khz * 1000;
return (u64)(cpu_khz) * 1000 * 60;
}

#ifdef ARCH_HAS_NMI_WATCHDOG
Expand Down
4 changes: 2 additions & 2 deletions arch/x86/kernel/traps.c
Original file line number Diff line number Diff line change
Expand Up @@ -406,15 +406,15 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
== NOTIFY_STOP)
return;

#ifndef CONFIG_NMI_WATCHDOG
#ifndef CONFIG_LOCKUP_DETECTOR
/*
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
*/
if (nmi_watchdog_tick(regs, reason))
return;
if (!do_nmi_callback(regs, cpu))
#endif /* !CONFIG_NMI_WATCHDOG */
#endif /* !CONFIG_LOCKUP_DETECTOR */
unknown_nmi_error(reason, regs);
#else
unknown_nmi_error(reason, regs);
Expand Down
8 changes: 4 additions & 4 deletions include/linux/nmi.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ extern void touch_nmi_watchdog(void);
extern void acpi_nmi_disable(void);
extern void acpi_nmi_enable(void);
#else
#ifndef CONFIG_NMI_WATCHDOG
#ifndef CONFIG_LOCKUP_DETECTOR
static inline void touch_nmi_watchdog(void)
{
touch_softlockup_watchdog();
Expand Down Expand Up @@ -51,12 +51,12 @@ static inline bool trigger_all_cpu_backtrace(void)
}
#endif

#ifdef CONFIG_NMI_WATCHDOG
#ifdef CONFIG_LOCKUP_DETECTOR
int hw_nmi_is_cpu_stuck(struct pt_regs *);
u64 hw_nmi_get_sample_period(void);
extern int nmi_watchdog_enabled;
extern int watchdog_enabled;
struct ctl_table;
extern int proc_nmi_enabled(struct ctl_table *, int ,
extern int proc_dowatchdog_enabled(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
#endif

Expand Down
6 changes: 6 additions & 0 deletions include/linux/sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,12 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
size_t *lenp, loff_t *ppos);
#endif

#ifdef CONFIG_LOCKUP_DETECTOR
extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos);
#endif

/* Attach to any functions which should be ignored in wchan output. */
#define __sched __attribute__((__section__(".sched.text")))

Expand Down
5 changes: 4 additions & 1 deletion init/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -944,8 +944,11 @@ config PERF_USE_VMALLOC

config PERF_EVENTS_NMI
bool
depends on PERF_EVENTS
help
Arch has support for nmi_watchdog
System hardware can generate an NMI using the perf event
subsystem. Also has support for calculating CPU cycle events
to determine how many clock cycles in a given period.

menu "Kernel Performance Events And Counters"

Expand Down
3 changes: 1 addition & 2 deletions kernel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,8 @@ obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_NMI_WATCHDOG) += nmi_watchdog.o
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
Expand Down
21 changes: 15 additions & 6 deletions kernel/sysctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
#include <scsi/sg.h>
#endif

#ifdef CONFIG_NMI_WATCHDOG
#ifdef CONFIG_LOCKUP_DETECTOR
#include <linux/nmi.h>
#endif

Expand Down Expand Up @@ -686,16 +686,25 @@ static struct ctl_table kern_table[] = {
.mode = 0444,
.proc_handler = proc_dointvec,
},
#if defined(CONFIG_NMI_WATCHDOG)
#if defined(CONFIG_LOCKUP_DETECTOR)
{
.procname = "nmi_watchdog",
.data = &nmi_watchdog_enabled,
.procname = "watchdog",
.data = &watchdog_enabled,
.maxlen = sizeof (int),
.mode = 0644,
.proc_handler = proc_nmi_enabled,
.proc_handler = proc_dowatchdog_enabled,
},
{
.procname = "watchdog_thresh",
.data = &softlockup_thresh,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dowatchdog_thresh,
.extra1 = &neg_one,
.extra2 = &sixty,
},
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_NMI_WATCHDOG)
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
{
.procname = "unknown_nmi_panic",
.data = &unknown_nmi_panic,
Expand Down
Loading

0 comments on commit 58687ac

Please sign in to comment.