From 2bbb6817c0ac1b5f2a68d720f364f98eeb1ac4fd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Oct 2011 16:01:00 +0200 Subject: [PATCH] nohz: Allow rcu extended quiescent state handling seperately from tick stop It is assumed that rcu won't be used once we switch to tickless mode and until we restart the tick. However this is not always true, as in x86-64 where we dereference the idle notifiers after the tick is stopped. To prepare for fixing this, add two new APIs: tick_nohz_idle_enter_norcu() and tick_nohz_idle_exit_norcu(). If no use of RCU is made in the idle loop between tick_nohz_enter_idle() and tick_nohz_exit_idle() calls, the arch must instead call the new *_norcu() version such that the arch doesn't need to call rcu_idle_enter() and rcu_idle_exit(). Otherwise the arch must call tick_nohz_enter_idle() and tick_nohz_exit_idle() and also call explicitly: - rcu_idle_enter() after its last use of RCU before the CPU is put to sleep. - rcu_idle_exit() before the first use of RCU after the CPU is woken up. Signed-off-by: Frederic Weisbecker Cc: Mike Frysinger Cc: Guan Xuetao Cc: David Miller Cc: Chris Metcalf Cc: Hans-Christian Egtvedt Cc: Ralf Baechle Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Russell King Cc: Paul Mackerras Cc: Heiko Carstens Cc: Paul Mundt Signed-off-by: Paul E. McKenney --- arch/arm/kernel/process.c | 4 +-- arch/avr32/kernel/process.c | 4 +-- arch/blackfin/kernel/process.c | 4 +-- arch/microblaze/kernel/process.c | 4 +-- arch/mips/kernel/process.c | 4 +-- arch/openrisc/kernel/idle.c | 4 +-- arch/powerpc/kernel/idle.c | 4 +-- arch/powerpc/platforms/iseries/setup.c | 8 ++--- arch/s390/kernel/process.c | 4 +-- arch/sh/kernel/idle.c | 4 +-- arch/sparc/kernel/process_64.c | 4 +-- arch/tile/kernel/process.c | 4 +-- arch/um/kernel/process.c | 4 +-- arch/unicore32/kernel/process.c | 4 +-- arch/x86/kernel/process_32.c | 4 +-- arch/x86/kernel/process_64.c | 4 +-- include/linux/tick.h | 46 ++++++++++++++++++++++++-- kernel/time/tick-sched.c | 25 +++++++------- 18 files changed, 90 insertions(+), 49 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 3f1f8daf703cf..47e34c091276f 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -183,7 +183,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); leds_event(led_idle_start); while (!need_resched()) { #ifdef CONFIG_HOTPLUG_CPU @@ -213,7 +213,7 @@ void cpu_idle(void) } } leds_event(led_idle_end); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 6ee7952248dbe..34c8c703bb161 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -34,10 +34,10 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) cpu_idle_sleep(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 7b141b5c9e8d6..57e07498a0e76 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -88,10 +88,10 @@ void cpu_idle(void) #endif if (!idle) idle = default_idle; - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) idle(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 5407f09b4be46..13d59f34b94e1 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -103,10 +103,10 @@ void cpu_idle(void) if (!idle) idle = default_idle; - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) idle(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index c11e5ca2a434a..17fb3a270160d 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -56,7 +56,7 @@ void __noreturn cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched() && cpu_online(cpu)) { #ifdef CONFIG_MIPS_MT_SMTC extern void smtc_idle_loop_hook(void); @@ -77,7 +77,7 @@ void __noreturn cpu_idle(void) system_state == SYSTEM_BOOTING)) play_dead(); #endif - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/openrisc/kernel/idle.c b/arch/openrisc/kernel/idle.c index fb6a9bf40006d..2e82cd0fa5e19 100644 --- a/arch/openrisc/kernel/idle.c +++ b/arch/openrisc/kernel/idle.c @@ -51,7 +51,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { check_pgt_cache(); @@ -69,7 +69,7 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 878572f70ac5f..2e782a36d8f2e 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -56,7 +56,7 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched() && !cpu_should_die()) { ppc64_runlatch_off(); @@ -93,7 +93,7 @@ void cpu_idle(void) HMT_medium(); ppc64_runlatch_on(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); if (cpu_should_die()) cpu_die(); diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index e83dfaf89f697..d69d3d185e892 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -563,7 +563,7 @@ static void yield_shared_processor(void) static void iseries_shared_idle(void) { while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched() && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); @@ -577,7 +577,7 @@ static void iseries_shared_idle(void) } ppc64_runlatch_on(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); if (hvlpevent_is_pending()) process_iSeries_events(); @@ -593,7 +593,7 @@ static void iseries_dedicated_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); if (!need_resched()) { while (!need_resched()) { ppc64_runlatch_off(); @@ -610,7 +610,7 @@ static void iseries_dedicated_idle(void) } ppc64_runlatch_on(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 6224f9dbbc1f8..6fa987367ae67 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -91,10 +91,10 @@ static void default_idle(void) void cpu_idle(void) { for (;;) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) default_idle(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index 6015743020a07..ad58e7535a7c4 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -89,7 +89,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { check_pgt_cache(); @@ -111,7 +111,7 @@ void cpu_idle(void) start_critical_timings(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 9c2795ba2cfe5..4a0e7d79cb926 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -95,12 +95,12 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while(1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched() && !cpu_is_offline(cpu)) sparc64_yield(cpu); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 920e674aedb94..53ac89595ab15 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -85,7 +85,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { if (cpu_is_offline(cpu)) BUG(); /* no HOTPLUG_CPU */ @@ -105,7 +105,7 @@ void cpu_idle(void) local_irq_enable(); current_thread_info()->status |= TS_POLLING; } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index cfb657e928491..55d2cf455f630 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -246,10 +246,10 @@ void default_idle(void) if (need_resched()) schedule(); - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); nsecs = disable_timer(); idle_sleep(nsecs); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); } } diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index 9999b9a84d467..095ff5a57928f 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -55,7 +55,7 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { local_irq_disable(); stop_critical_timings(); @@ -63,7 +63,7 @@ void cpu_idle(void) local_irq_enable(); start_critical_timings(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6d9d4d52cac5a..f94da3920c36b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -99,7 +99,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { check_pgt_cache(); @@ -116,7 +116,7 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b069e9d7875f7..18e8cf3581f66 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { rmb(); @@ -149,7 +149,7 @@ void cpu_idle(void) __exit_idle(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/include/linux/tick.h b/include/linux/tick.h index 0df1d50a408ae..327434a057572 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -7,6 +7,7 @@ #define _LINUX_TICK_H #include +#include #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -121,18 +122,57 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ -extern void tick_nohz_idle_enter(void); +extern void __tick_nohz_idle_enter(void); +static inline void tick_nohz_idle_enter(void) +{ + local_irq_disable(); + __tick_nohz_idle_enter(); + local_irq_enable(); +} extern void tick_nohz_idle_exit(void); + +/* + * Call this pair of function if the arch doesn't make any use + * of RCU in-between. You won't need to call rcu_idle_enter() and + * rcu_idle_exit(). + * Otherwise you need to call tick_nohz_idle_enter() and tick_nohz_idle_exit() + * and explicitly tell RCU about the window around the place the CPU enters low + * power mode where no RCU use is made. This is done by calling rcu_idle_enter() + * after the last use of RCU before the CPU is put to sleep and by calling + * rcu_idle_exit() before the first use of RCU after the CPU woke up. + */ +static inline void tick_nohz_idle_enter_norcu(void) +{ + /* + * Also call rcu_idle_enter() in the irq disabled section even + * if it disables irq itself. + * Just an optimization that prevents from an interrupt happening + * between it and __tick_nohz_idle_enter() to lose time to help + * completing a grace period while we could be in extended grace + * period already. + */ + local_irq_disable(); + __tick_nohz_idle_enter(); + rcu_idle_enter(); + local_irq_enable(); +} +static inline void tick_nohz_idle_exit_norcu(void) +{ + rcu_idle_exit(); + tick_nohz_idle_exit(); +} extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_idle_enter(void) +static inline void tick_nohz_idle_enter(void) { } +static inline void tick_nohz_idle_exit(void) { } +static inline void tick_nohz_idle_enter_norcu(void) { rcu_idle_enter(); } -static inline void tick_nohz_idle_exit(void) +static inline void tick_nohz_idle_exit_norcu(void) { rcu_idle_exit(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 266c242dc354d..c76aefe764b0f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -453,18 +453,22 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) * * When the next event is more than a tick into the future, stop the idle tick * Called when we start the idle loop. - * This also enters into RCU extended quiescent state so that this CPU doesn't - * need anymore to be part of any global grace period completion. This way - * the tick can be stopped safely as we don't need to report quiescent states. + * + * If no use of RCU is made in the idle loop between + * tick_nohz_idle_enter() and tick_nohz_idle_exit() calls, then + * tick_nohz_idle_enter_norcu() should be called instead and the arch + * doesn't need to call rcu_idle_enter() and rcu_idle_exit() explicitly. + * + * Otherwise the arch is responsible of calling: + * + * - rcu_idle_enter() after its last use of RCU before the CPU is put + * to sleep. + * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. */ -void tick_nohz_idle_enter(void) +void __tick_nohz_idle_enter(void) { struct tick_sched *ts; - WARN_ON_ONCE(irqs_disabled()); - - local_irq_disable(); - ts = &__get_cpu_var(tick_cpu_sched); /* * set ts->inidle unconditionally. even if the system did not @@ -473,9 +477,6 @@ void tick_nohz_idle_enter(void) */ ts->inidle = 1; tick_nohz_stop_sched_tick(ts); - rcu_idle_enter(); - - local_irq_enable(); } /** @@ -551,7 +552,7 @@ void tick_nohz_idle_exit(void) ktime_t now; local_irq_disable(); - rcu_idle_exit(); + if (ts->idle_active || (ts->inidle && ts->tick_stopped)) now = ktime_get();