Skip to content

Commit

Permalink
Merge tag 'clocksource.2023.02.06b' of git://git.kernel.org/pub/scm/l…
Browse files Browse the repository at this point in the history
…inux/kernel/git/paulmck/linux-rcu into timers/core

Pull clocksource watchdog changes from Paul McKenney:

     o	Improvements to clocksource-watchdog console messages.

     o	Loosening of the clocksource-watchdog skew criteria to match
     	those of NTP (500 parts per million, relaxed from 400 parts
     	per million).  If it is good enough for NTP, it is good enough
     	for the clocksource watchdog.

     o	Suspend clocksource-watchdog checking temporarily when high
     	memory latencies are detected.	This avoids the false-positive
     	clock-skew events that have been seen on production systems
     	running memory-intensive workloads.

     o	On systems where the TSC is deemed trustworthy, use it as the
     	watchdog timesource, but only when specifically requested using
     	the tsc=watchdog kernel boot parameter.  This permits clock-skew
     	events to be detected, but avoids forcing workloads to use the
     	slow HPET and ACPI PM timers.  These last two timers are slow
     	enough to cause systems to be needlessly marked bad on the one
     	hand, and real skew does sometimes happen on production systems
     	running production workloads on the other.  And sometimes it is
     	the fault of the TSC, or at least of the firmware that told the
     	kernel to program the TSC with the wrong frequency.

     o	Add a tsc=revalidate kernel boot parameter to allow the kernel
     	to diagnose cases where the TSC hardware works fine, but was told
     	by firmware to tick at the wrong frequency.  Such cases are rare,
     	but they really have happened on production systems.

Link: https://lore.kernel.org/r/20230210193640.GA3325193@paulmck-ThinkPad-P17-Gen-1
  • Loading branch information
Thomas Gleixner committed Feb 13, 2023
2 parents 7b0f95f + 0051293 commit ab407a1
Show file tree
Hide file tree
Showing 7 changed files with 123 additions and 29 deletions.
10 changes: 10 additions & 0 deletions Documentation/admin-guide/kernel-parameters.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6369,6 +6369,16 @@
in situations with strict latency requirements (where
interruptions from clocksource watchdog are not
acceptable).
[x86] recalibrate: force recalibration against a HW timer
(HPET or PM timer) on systems whose TSC frequency was
obtained from HW or FW using either an MSR or CPUID(0x15).
Warn if the difference is more than 500 ppm.
[x86] watchdog: Use TSC as the watchdog clocksource with
which to check other HW timers (HPET or PM timer), but
only on systems where TSC has been deemed trustworthy.
This will be suppressed by an earlier tsc=nowatchdog and
can be overridden by a later tsc=nowatchdog. A console
message will flag any such suppression or overriding.

tsc_early_khz= [X86] Skip early TSC calibration and use the given
value instead. Useful when the early TSC frequency discovery
Expand Down
1 change: 1 addition & 0 deletions arch/x86/include/asm/time.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
extern void hpet_time_init(void);
extern void time_init(void);
extern bool pit_timer_init(void);
extern bool tsc_clocksource_watchdog_disabled(void);

extern struct clock_event_device *global_clock_event;

Expand Down
2 changes: 2 additions & 0 deletions arch/x86/kernel/hpet.c
Original file line number Diff line number Diff line change
Expand Up @@ -1091,6 +1091,8 @@ int __init hpet_enable(void)
if (!hpet_counting())
goto out_nohpet;

if (tsc_clocksource_watchdog_disabled())
clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY;
clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);

if (id & HPET_ID_LEGSUP) {
Expand Down
55 changes: 50 additions & 5 deletions arch/x86/kernel/tsc.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc);

int tsc_clocksource_reliable;

static int __read_mostly tsc_force_recalibrate;

static u32 art_to_tsc_numerator;
static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
Expand Down Expand Up @@ -292,6 +294,7 @@ __setup("notsc", notsc_setup);

static int no_sched_irq_time;
static int no_tsc_watchdog;
static int tsc_as_watchdog;

static int __init tsc_setup(char *str)
{
Expand All @@ -301,8 +304,22 @@ static int __init tsc_setup(char *str)
no_sched_irq_time = 1;
if (!strcmp(str, "unstable"))
mark_tsc_unstable("boot parameter");
if (!strcmp(str, "nowatchdog"))
if (!strcmp(str, "nowatchdog")) {
no_tsc_watchdog = 1;
if (tsc_as_watchdog)
pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n",
__func__);
tsc_as_watchdog = 0;
}
if (!strcmp(str, "recalibrate"))
tsc_force_recalibrate = 1;
if (!strcmp(str, "watchdog")) {
if (no_tsc_watchdog)
pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n",
__func__);
else
tsc_as_watchdog = 1;
}
return 1;
}

Expand Down Expand Up @@ -1186,6 +1203,12 @@ static void __init tsc_disable_clocksource_watchdog(void)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}

bool tsc_clocksource_watchdog_disabled(void)
{
return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) &&
tsc_as_watchdog && !no_tsc_watchdog;
}

static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
Expand Down Expand Up @@ -1374,6 +1397,25 @@ static void tsc_refine_calibration_work(struct work_struct *work)
else
freq = calc_pmtimer_ref(delta, ref_start, ref_stop);

/* Will hit this only if tsc_force_recalibrate has been set */
if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {

/* Warn if the deviation exceeds 500 ppm */
if (abs(tsc_khz - freq) > (tsc_khz >> 11)) {
pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n");
pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n",
(unsigned long)tsc_khz / 1000,
(unsigned long)tsc_khz % 1000);
}

pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n",
hpet ? "HPET" : "PM_TIMER",
(unsigned long)freq / 1000,
(unsigned long)freq % 1000);

return;
}

/* Make sure we're within 1% */
if (abs(tsc_khz - freq) > tsc_khz/100)
goto out;
Expand Down Expand Up @@ -1407,8 +1449,10 @@ static int __init init_tsc_clocksource(void)
if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
return 0;

if (tsc_unstable)
goto unreg;
if (tsc_unstable) {
clocksource_unregister(&clocksource_tsc_early);
return 0;
}

if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
Expand All @@ -1421,9 +1465,10 @@ static int __init init_tsc_clocksource(void)
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
clocksource_register_khz(&clocksource_tsc, tsc_khz);
unreg:
clocksource_unregister(&clocksource_tsc_early);
return 0;

if (!tsc_force_recalibrate)
return 0;
}

schedule_delayed_work(&tsc_irqwork, 0);
Expand Down
6 changes: 4 additions & 2 deletions drivers/clocksource/acpi_pm.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <linux/pci.h>
#include <linux/delay.h>
#include <asm/io.h>
#include <asm/time.h>

/*
* The I/O port the PMTMR resides at.
Expand Down Expand Up @@ -210,8 +211,9 @@ static int __init init_acpi_pm_clocksource(void)
return -ENODEV;
}

return clocksource_register_hz(&clocksource_acpi_pm,
PMTMR_TICKS_PER_SEC);
if (tsc_clocksource_watchdog_disabled())
clocksource_acpi_pm.flags |= CLOCK_SOURCE_MUST_VERIFY;
return clocksource_register_hz(&clocksource_acpi_pm, PMTMR_TICKS_PER_SEC);
}

/* We use fs_initcall because we want the PCI fixups to have run
Expand Down
6 changes: 5 additions & 1 deletion kernel/time/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -200,10 +200,14 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
int "Clocksource watchdog maximum allowable skew (in μs)"
depends on CLOCKSOURCE_WATCHDOG
range 50 1000
default 100
default 125
help
Specify the maximum amount of allowable watchdog skew in
microseconds before reporting the clocksource to be unstable.
The default is based on a half-second clocksource watchdog
interval and NTP's maximum frequency drift of 500 parts
per million. If the clocksource is good enough for NTP,
it is good enough for the clocksource watchdog!

endmenu
endif
72 changes: 51 additions & 21 deletions kernel/time/clocksource.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ static char override_name[CS_NAME_LEN];
static int finished_booting;
static u64 suspend_start;

/*
* Interval: 0.5sec.
*/
#define WATCHDOG_INTERVAL (HZ >> 1)

/*
* Threshold: 0.0312s, when doubled: 0.0625s.
* Also a default for cs->uncertainty_margin when registering clocks.
Expand All @@ -106,11 +111,14 @@ static u64 suspend_start;
* clocksource surrounding a read of the clocksource being validated.
* This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
* a lower bound for cs->uncertainty_margin values when registering clocks.
*
* The default of 500 parts per million is based on NTP's limits.
* If a clocksource is good enough for NTP, it is good enough for us!
*/
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#else
#define MAX_SKEW_USEC 100
#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
#endif

#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
Expand Down Expand Up @@ -140,11 +148,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags)
static int clocksource_watchdog_kthread(void *data);
static void __clocksource_change_rating(struct clocksource *cs, int rating);

/*
* Interval: 0.5sec.
*/
#define WATCHDOG_INTERVAL (HZ >> 1)

static void clocksource_watchdog_work(struct work_struct *work)
{
/*
Expand Down Expand Up @@ -257,8 +260,8 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
goto skip_test;
}

pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
smp_processor_id(), watchdog->name, wd_delay, nretries);
pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
return WD_READ_UNSTABLE;

skip_test:
Expand Down Expand Up @@ -384,13 +387,23 @@ void clocksource_verify_percpu(struct clocksource *cs)
}
EXPORT_SYMBOL_GPL(clocksource_verify_percpu);

static inline void clocksource_reset_watchdog(void)
{
struct clocksource *cs;

list_for_each_entry(cs, &watchdog_list, wd_list)
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
}


static void clocksource_watchdog(struct timer_list *unused)
{
u64 csnow, wdnow, cslast, wdlast, delta;
int next_cpu, reset_pending;
int64_t wd_nsec, cs_nsec;
struct clocksource *cs;
enum wd_read_status read_ret;
unsigned long extra_wait = 0;
u32 md;

spin_lock(&watchdog_lock);
Expand All @@ -410,13 +423,30 @@ static void clocksource_watchdog(struct timer_list *unused)

read_ret = cs_watchdog_read(cs, &csnow, &wdnow);

if (read_ret != WD_READ_SUCCESS) {
if (read_ret == WD_READ_UNSTABLE)
/* Clock readout unreliable, so give it up. */
__clocksource_unstable(cs);
if (read_ret == WD_READ_UNSTABLE) {
/* Clock readout unreliable, so give it up. */
__clocksource_unstable(cs);
continue;
}

/*
* When WD_READ_SKIP is returned, it means the system is likely
* under very heavy load, where the latency of reading
* watchdog/clocksource is very big, and affect the accuracy of
* watchdog check. So give system some space and suspend the
* watchdog check for 5 minutes.
*/
if (read_ret == WD_READ_SKIP) {
/*
* As the watchdog timer will be suspended, and
* cs->last could keep unchanged for 5 minutes, reset
* the counters.
*/
clocksource_reset_watchdog();
extra_wait = HZ * 300;
break;
}

/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
atomic_read(&watchdog_reset_pending)) {
Expand All @@ -443,12 +473,20 @@ static void clocksource_watchdog(struct timer_list *unused)
/* Check the deviation from the watchdog clocksource. */
md = cs->uncertainty_margin + watchdog->uncertainty_margin;
if (abs(cs_nsec - wd_nsec) > md) {
u64 cs_wd_msec;
u64 wd_msec;
u32 wd_rem;

pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
smp_processor_id(), cs->name);
pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
cs->name, cs_nsec, csnow, cslast, cs->mask);
cs_wd_msec = div_u64_rem(cs_nsec - wd_nsec, 1000U * 1000U, &wd_rem);
wd_msec = div_u64_rem(wd_nsec, 1000U * 1000U, &wd_rem);
pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
if (curr_clocksource == cs)
pr_warn(" '%s' is current clocksource.\n", cs->name);
else if (curr_clocksource)
Expand Down Expand Up @@ -512,7 +550,7 @@ static void clocksource_watchdog(struct timer_list *unused)
* pair clocksource_stop_watchdog() clocksource_start_watchdog().
*/
if (!timer_pending(&watchdog_timer)) {
watchdog_timer.expires += WATCHDOG_INTERVAL;
watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
add_timer_on(&watchdog_timer, next_cpu);
}
out:
Expand All @@ -537,14 +575,6 @@ static inline void clocksource_stop_watchdog(void)
watchdog_running = 0;
}

static inline void clocksource_reset_watchdog(void)
{
struct clocksource *cs;

list_for_each_entry(cs, &watchdog_list, wd_list)
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
}

static void clocksource_resume_watchdog(void)
{
atomic_inc(&watchdog_reset_pending);
Expand Down

0 comments on commit ab407a1

Please sign in to comment.