From 5c889690aa089cc0f36f5cf4abb4d4f0ed81b4da Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 19 Jul 2013 15:09:25 -0700 Subject: [PATCH 01/11] mm: Place preemption point in do_mlockall() loop There is a loop in do_mlockall() that lacks a preemption point, which means that the following can happen on non-preemptible builds of the kernel: > My fuzz tester keeps hitting this. Every instance shows the non-irq stack > came in from mlockall. I'm only seeing this on one box, but that has more > ram (8gb) than my other machines, which might explain it. > > Dave > > INFO: rcu_preempt self-detected stall on CPU { 3} (t=6500 jiffies g=470344 c=470343 q=0) > sending NMI to all CPUs: > NMI backtrace for cpu 3 > CPU: 3 PID: 29664 Comm: trinity-child2 Not tainted 3.11.0-rc1+ #32 > task: ffff88023e743fc0 ti: ffff88022f6f2000 task.ti: ffff88022f6f2000 > RIP: 0010:[] [] trace_hardirqs_off_caller+0x21/0xb0 > RSP: 0018:ffff880244e03c30 EFLAGS: 00000046 > RAX: ffff88023e743fc0 RBX: 0000000000000001 RCX: 000000000000003c > RDX: 000000000000000f RSI: 0000000000000004 RDI: ffffffff81033cab > RBP: ffff880244e03c38 R08: ffff880243288a80 R09: 0000000000000001 > R10: 0000000000000000 R11: 0000000000000001 R12: ffff880243288a80 > R13: ffff8802437eda40 R14: 0000000000080000 R15: 000000000000d010 > FS: 00007f50ae33b740(0000) GS:ffff880244e00000(0000) knlGS:0000000000000000 > CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > CR2: 000000000097f000 CR3: 0000000240fa0000 CR4: 00000000001407e0 > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 > Stack: > ffffffff810bf86d ffff880244e03c98 ffffffff81033cab 0000000000000096 > 000000000000d008 0000000300000002 0000000000000004 0000000000000003 > 0000000000002710 ffffffff81c50d00 ffffffff81c50d00 ffff880244fcde00 > Call Trace: > > [] ? trace_hardirqs_off+0xd/0x10 > [] __x2apic_send_IPI_mask+0x1ab/0x1c0 > [] x2apic_send_IPI_all+0x1c/0x20 > [] arch_trigger_all_cpu_backtrace+0x65/0xa0 > [] rcu_check_callbacks+0x331/0x8e0 > [] ? hrtimer_run_queues+0x20/0x180 > [] ? sched_clock_cpu+0xb5/0x100 > [] update_process_times+0x47/0x80 > [] tick_sched_handle.isra.16+0x25/0x60 > [] tick_sched_timer+0x41/0x60 > [] __run_hrtimer+0x81/0x4e0 > [] ? tick_sched_do_timer+0x60/0x60 > [] hrtimer_interrupt+0xff/0x240 > [] local_apic_timer_interrupt+0x34/0x60 > [] smp_apic_timer_interrupt+0x3f/0x60 > [] apic_timer_interrupt+0x6f/0x80 > [] ? retint_restore_args+0xe/0xe > [] ? __do_softirq+0xb1/0x440 > [] irq_exit+0xcd/0xe0 > [] smp_apic_timer_interrupt+0x45/0x60 > [] apic_timer_interrupt+0x6f/0x80 > > [] ? retint_restore_args+0xe/0xe > [] ? wait_for_completion_killable+0x170/0x170 > [] ? preempt_schedule_irq+0x53/0x90 > [] retint_kernel+0x26/0x30 > [] ? queue_work_on+0x43/0x90 > [] schedule_on_each_cpu+0xc9/0x1a0 > [] ? lru_add_drain+0x50/0x50 > [] lru_add_drain_all+0x15/0x20 > [] SyS_mlockall+0xa5/0x1a0 > [] tracesys+0xdd/0xe2 This commit addresses this problem by inserting the required preemption point. Reported-by: Dave Jones Signed-off-by: Paul E. McKenney Cc: KOSAKI Motohiro Cc: Michel Lespinasse Cc: Andrew Morton Cc: Linus Torvalds --- mm/mlock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mlock.c b/mm/mlock.c index d63802663242e..67ba6da7d0e31 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -736,6 +736,7 @@ static int do_mlockall(int flags) /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + cond_resched(); } out: return 0; From b3f2d02598fcf16933f72a57bbba7edb22ad8eda Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Aug 2013 14:37:47 -0700 Subject: [PATCH 02/11] rcu: Use proper cpp macro for ->gp_flags One of the ->gp_flags assignments used a raw number rather than the cpp macro that was intended for this purpose, which this commit fixes. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 32618b3fe4e6a..e0fa1920cd67d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1452,7 +1452,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rdp = this_cpu_ptr(rsp->rda); rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ if (cpu_needs_another_gp(rsp, rdp)) - rsp->gp_flags = 1; + rsp->gp_flags = RCU_GP_FLAG_INIT; raw_spin_unlock_irq(&rnp->lock); } From 01896f7e0a122e8f20082e24f6f9a340034b9c01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Aug 2013 12:14:32 -0700 Subject: [PATCH 03/11] rcu: Convert local functions to static The rcu_cpu_stall_timeout kernel parameter, the rcu_dynticks per-CPU variable, and the rcu_gp_fqs() function are used only locally. This commit therefore marks them as static. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcupdate.c | 2 +- kernel/rcutree.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index b02a339836b43..3260a1074b488 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -298,7 +298,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #endif int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ -int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; +static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); module_param(rcu_cpu_stall_timeout, int, 0644); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e0fa1920cd67d..2712b89911432 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -222,7 +222,7 @@ void rcu_note_context_switch(int cpu) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { +static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), #ifdef CONFIG_NO_HZ_FULL_SYSIDLE @@ -1366,7 +1366,7 @@ static int rcu_gp_init(struct rcu_state *rsp) /* * Do one round of quiescent-state forcing. */ -int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) +static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) { int fqs_state = fqs_state_in; bool isidle = false; From 829511d8aa7a2179bba57ab4ab277d6f9c77ae5b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Aug 2013 12:21:57 -0700 Subject: [PATCH 04/11] rcu: Fix dubious "if" condition in __call_rcu_nocb_enqueue() This commit replaces an incorrect (but fortunately functional) bitwise OR ("|") operator with the correct logical OR ("||"). Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 130c97b027f2e..6f9aecef8ab63 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2108,7 +2108,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, /* If we are not being polled and there is a kthread, awaken it ... */ t = ACCESS_ONCE(rdp->nocb_kthread); - if (rcu_nocb_poll | !t) + if (rcu_nocb_poll || !t) return; len = atomic_long_read(&rdp->nocb_q_count); if (old_rhpp == &rdp->nocb_head) { From 2a855b644c310d5db5a80b8816c0c7748c167977 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Aug 2013 09:40:42 -0700 Subject: [PATCH 05/11] rcu: Make list_splice_init_rcu() account for RCU readers The list_splice_init_rcu() function allows a list visible to RCU readers to be spliced into another list visible to RCU readers. This is OK, except for the use of INIT_LIST_HEAD(), which does pointer updates without doing anything to make those updates safe for concurrent readers. Of course, most of the time INIT_LIST_HEAD() is being used in reader-free contexts, such as initialization or cleanup, so it is OK for it to update pointers in an unsafe-for-RCU-readers manner. This commit therefore creates an INIT_LIST_HEAD_RCU() that uses ACCESS_ONCE() to make the updates reader-safe. The reason that we can use ACCESS_ONCE() instead of the more typical rcu_assign_pointer() is that list_splice_init_rcu() is updating the pointers to reference something that is already visible to readers, so that there is no problem with pre-initialized values. Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4106721c4e5e3..45a0a9e81478c 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -18,6 +18,21 @@ * be used anywhere you would want to use a list_empty_rcu(). */ +/* + * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers + * @list: list to be initialized + * + * You should instead use INIT_LIST_HEAD() for normal initialization and + * cleanup tasks, when readers have no access to the list being initialized. + * However, if the list being initialized is visible to readers, you + * need to keep the compiler from being too mischievous. + */ +static inline void INIT_LIST_HEAD_RCU(struct list_head *list) +{ + ACCESS_ONCE(list->next) = list; + ACCESS_ONCE(list->prev) = list; +} + /* * return the ->next pointer of a list_head in an rcu safe * way, we must not access it directly @@ -191,9 +206,13 @@ static inline void list_splice_init_rcu(struct list_head *list, if (list_empty(list)) return; - /* "first" and "last" tracking list, so initialize it. */ + /* + * "first" and "last" tracking list, so initialize it. RCU readers + * have access to this list, so we must use INIT_LIST_HEAD_RCU() + * instead of INIT_LIST_HEAD(). + */ - INIT_LIST_HEAD(list); + INIT_LIST_HEAD_RCU(list); /* * At this point, the list body still points to the source list. From c9d4b0af9e0609cc525c55de18229fde7c926d61 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 31 Aug 2013 13:34:10 -0700 Subject: [PATCH 06/11] rcu: Replace __get_cpu_var() uses __get_cpu_var() is used for multiple purposes in the kernel source. One of them is address calculation via the form &__get_cpu_var(x). This calculates the address for the instance of the percpu variable of the current processor based on an offset. Other use cases are for storing and retrieving data from the current processors percpu area. __get_cpu_var() can be used as an lvalue when writing data or on the right side of an assignment. __get_cpu_var() is defined as : __get_cpu_var() always only does an address determination. However, store and retrieve operations could use a segment prefix (or global register on other platforms) to avoid the address calculation. this_cpu_write() and this_cpu_read() can directly take an offset into a percpu area and use optimized assembly code to read and write per cpu variables. This patch converts __get_cpu_var into either an explicit address calculation using this_cpu_ptr() or into a use of this_cpu operations that use the offset. Thereby address calcualtions are avoided and less registers are used when code is generated. At the end of the patchset all uses of __get_cpu_var have been removed so the macro is removed too. The patchset includes passes over all arches as well. Once these operations are used throughout then specialized macros can be defined in non -x86 arches as well in order to optimize per cpu access by f.e. using a global register that may be set to the per cpu base. Transformations done to __get_cpu_var() 1. Determine the address of the percpu instance of the current processor. DEFINE_PER_CPU(int, y); int *x = &__get_cpu_var(y); Converts to int *x = this_cpu_ptr(&y); 2. Same as #1 but this time an array structure is involved. DEFINE_PER_CPU(int, y[20]); int *x = __get_cpu_var(y); Converts to int *x = this_cpu_ptr(y); 3. Retrieve the content of the current processors instance of a per cpu variable. DEFINE_PER_CPU(int, u); int x = __get_cpu_var(y) Converts to int x = __this_cpu_read(y); 4. Retrieve the content of a percpu struct DEFINE_PER_CPU(struct mystruct, y); struct mystruct x = __get_cpu_var(y); Converts to memcpy(this_cpu_ptr(&x), y, sizeof(x)); 5. Assignment to a per cpu variable DEFINE_PER_CPU(int, y) __get_cpu_var(y) = x; Converts to this_cpu_write(y, x); 6. Increment/Decrement etc of a per cpu variable DEFINE_PER_CPU(int, y); __get_cpu_var(y)++ Converts to this_cpu_inc(y) Signed-off-by: Christoph Lameter [ paulmck: Address conflicts. ] Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 22 +++++++++++----------- kernel/rcutree_plugin.h | 14 +++++++------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2712b89911432..8eb9cfd9e2b16 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -407,7 +407,7 @@ static void rcu_eqs_enter(bool user) long long oldval; struct rcu_dynticks *rdtp; - rdtp = &__get_cpu_var(rcu_dynticks); + rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) @@ -435,7 +435,7 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); - rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); + rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -478,7 +478,7 @@ void rcu_irq_exit(void) struct rcu_dynticks *rdtp; local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); + rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; WARN_ON_ONCE(rdtp->dynticks_nesting < 0); @@ -528,7 +528,7 @@ static void rcu_eqs_exit(bool user) struct rcu_dynticks *rdtp; long long oldval; - rdtp = &__get_cpu_var(rcu_dynticks); + rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(oldval < 0); if (oldval & DYNTICK_TASK_NEST_MASK) @@ -555,7 +555,7 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); - rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); + rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -599,7 +599,7 @@ void rcu_irq_enter(void) long long oldval; local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); + rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(rdtp->dynticks_nesting == 0); @@ -620,7 +620,7 @@ void rcu_irq_enter(void) */ void rcu_nmi_enter(void) { - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); if (rdtp->dynticks_nmi_nesting == 0 && (atomic_read(&rdtp->dynticks) & 0x1)) @@ -642,7 +642,7 @@ void rcu_nmi_enter(void) */ void rcu_nmi_exit(void) { - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); if (rdtp->dynticks_nmi_nesting == 0 || --rdtp->dynticks_nmi_nesting != 0) @@ -665,7 +665,7 @@ int rcu_is_cpu_idle(void) int ret; preempt_disable(); - ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; + ret = (atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1) == 0; preempt_enable(); return ret; } @@ -703,7 +703,7 @@ bool rcu_lockdep_current_cpu_online(void) if (in_nmi()) return 1; preempt_disable(); - rdp = &__get_cpu_var(rcu_sched_data); + rdp = this_cpu_ptr(&rcu_sched_data); rnp = rdp->mynode; ret = (rdp->grpmask & rnp->qsmaskinit) || !rcu_scheduler_fully_active; @@ -723,7 +723,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); */ static int rcu_is_cpu_rrupt_from_idle(void) { - return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; + return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; } /* diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6f9aecef8ab63..c684f7ab37fa7 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -660,7 +660,7 @@ static void rcu_preempt_check_callbacks(int cpu) static void rcu_preempt_do_callbacks(void) { - rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); + rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); } #endif /* #ifdef CONFIG_RCU_BOOST */ @@ -1332,7 +1332,7 @@ static void invoke_rcu_callbacks_kthread(void) */ static bool rcu_is_callbacks_kthread(void) { - return __get_cpu_var(rcu_cpu_kthread_task) == current; + return __this_cpu_read(rcu_cpu_kthread_task) == current; } #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) @@ -1382,8 +1382,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, static void rcu_kthread_do_work(void) { - rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); - rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); + rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); rcu_preempt_do_callbacks(); } @@ -1402,7 +1402,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu) static int rcu_cpu_kthread_should_run(unsigned int cpu) { - return __get_cpu_var(rcu_cpu_has_work); + return __this_cpu_read(rcu_cpu_has_work); } /* @@ -1412,8 +1412,8 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu) */ static void rcu_cpu_kthread(unsigned int cpu) { - unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); - char work, *workp = &__get_cpu_var(rcu_cpu_has_work); + unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); + char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); int spincnt; for (spincnt = 0; spincnt < 10; spincnt++) { From 289828e62de0334a0d01c0f65df91cd47d3a9e05 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 31 Aug 2013 19:23:29 -0700 Subject: [PATCH 07/11] rcu: Silence unused-variable warnings The "idle" variable in both rcu_eqs_enter_common() and rcu_eqs_exit_common() is only used in a WARN_ON_ONCE(). If the kernel is built disabling WARN_ON_ONCE(), the compiler will complain (rightly) that "idle" is unused. This commit therefore adds a __maybe_unused to the declaration of both variables. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8eb9cfd9e2b16..e6f2e8f141405 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -371,7 +371,8 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, { trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); + struct task_struct *idle __maybe_unused = + idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); ftrace_dump(DUMP_ORIG); @@ -508,7 +509,8 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); + struct task_struct *idle __maybe_unused = + idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); From 69c8d28c96445e28f081fcd987e34ea2afa65039 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Sep 2013 09:52:20 -0700 Subject: [PATCH 08/11] rcu: Micro-optimize rcu_cpu_has_callbacks() The for_each_rcu_flavor() loop unconditionally scans all flavors, even when the first flavor might have some non-lazy callbacks. Once the loop has seen a non-lazy callback, further passes through the loop cannot change the state. This is not a huge problem, given that there can be at most three RCU flavors (RCU-bh, RCU-preempt, and RCU-sched), but this code is on the path to idle, so speeding it up even a small amount would have some benefit. This commit therefore does two things: 1. Rearranges the order of the list of RCU flavors in order to place the most active flavor first in the list. The most active RCU flavor is RCU-preempt, or, if there is no RCU-preempt, RCU-sched. 2. Reworks the for_each_rcu_flavor() to exit early when the first non-lazy callback is seen, or, in the case where the caller does not care about non-lazy callbacks (RCU_FAST_NO_HZ=n), when the first callback is seen. Reported-by: Chen Gang Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e6f2e8f141405..49464aded7f7a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2727,10 +2727,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) for_each_rcu_flavor(rsp) { rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->qlen != rdp->qlen_lazy) + if (!rdp->nxtlist) + continue; + hc = true; + if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { al = false; - if (rdp->nxtlist) - hc = true; + break; + } } if (all_lazy) *all_lazy = al; @@ -3297,8 +3300,8 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); + rcu_init_one(&rcu_sched_state, &rcu_sched_data); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); From 26cdfedf6a902345f8604ea8e0b7dd2566b37a46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Sep 2013 10:51:13 -0700 Subject: [PATCH 09/11] rcu: Reject memory-order-induced stall-warning false positives If a system is idle from an RCU perspective for longer than specified by CONFIG_RCU_CPU_STALL_TIMEOUT, and if one CPU starts a grace period just as a second checks for CPU stalls, and if this second CPU happens to see the old value of rsp->jiffies_stall, it will incorrectly report a CPU stall. This is quite rare, but apparently occurs deterministically on systems with about 6TB of memory. This commit therefore orders accesses to the data used to determine whether or not a CPU stall is in progress. Grace-period initialization and cleanup first increments rsp->completed to mark the end of the previous grace period, then records the current jiffies in rsp->gp_start, then records the jiffies at which a stall can be expected to occur in rsp->jiffies_stall, and finally increments rsp->gpnum to mark the start of the new grace period. Now, this ordering by itself does not prevent false positives. For example, if grace-period initialization was delayed between recording rsp->gp_start and rsp->jiffies_stall, the CPU stall warning code might still see an old value of rsp->jiffies_stall. Therefore, this commit also orders the CPU stall warning accesses as well, loading rsp->gpnum and jiffies, then rsp->jiffies_stall, then rsp->gp_start, and finally rsp->completed. This ordering means that the false-positive scenario in the previous paragraph would result in rsp->completed being greater than or equal to rsp->gpnum, which is never valid for a CPU stall, allowing the false positive to be rejected. Furthermore, any fetch that gets an old value of rsp->jiffies_stall must also get an old value of rsp->gpnum, which will again be rejected by the comparison of rsp->gpnum and rsp->completed. Situations where rsp->gp_start is later than rsp->jiffies_stall are also rejected, as are situations where jiffies is less than rsp->jiffies_stall. Although use of unsynchronized accesses means that there are likely still some false-positive scenarios (synchronization has proven to be a very bad idea on large systems), this should get rid of a large class of these scenarios. Reported-by: Fabian Herschel Reported-by: Michal Hocko Signed-off-by: Paul E. McKenney Reviewed-by: Michal Hocko Tested-by: Jochen Striepe --- kernel/rcutree.c | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 49464aded7f7a..b618d72bd8ecb 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -804,8 +804,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, static void record_gp_stall_check_time(struct rcu_state *rsp) { - rsp->gp_start = jiffies; - rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); + unsigned long j = ACCESS_ONCE(jiffies); + + rsp->gp_start = j; + smp_wmb(); /* Record start time before stall time. */ + rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); } /* @@ -934,17 +937,48 @@ static void print_cpu_stall(struct rcu_state *rsp) static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { + unsigned long completed; + unsigned long gpnum; + unsigned long gps; unsigned long j; unsigned long js; struct rcu_node *rnp; - if (rcu_cpu_stall_suppress) + if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) return; j = ACCESS_ONCE(jiffies); + + /* + * Lots of memory barriers to reject false positives. + * + * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, + * then rsp->gp_start, and finally rsp->completed. These values + * are updated in the opposite order with memory barriers (or + * equivalent) during grace-period initialization and cleanup. + * Now, a false positive can occur if we get an new value of + * rsp->gp_start and a old value of rsp->jiffies_stall. But given + * the memory barriers, the only way that this can happen is if one + * grace period ends and another starts between these two fetches. + * Detect this by comparing rsp->completed with the previous fetch + * from rsp->gpnum. + * + * Given this check, comparisons of jiffies, rsp->jiffies_stall, + * and rsp->gp_start suffice to forestall false positives. + */ + gpnum = ACCESS_ONCE(rsp->gpnum); + smp_rmb(); /* Pick up ->gpnum first... */ js = ACCESS_ONCE(rsp->jiffies_stall); + smp_rmb(); /* ...then ->jiffies_stall before the rest... */ + gps = ACCESS_ONCE(rsp->gp_start); + smp_rmb(); /* ...and finally ->gp_start before ->completed. */ + completed = ACCESS_ONCE(rsp->completed); + if (ULONG_CMP_GE(completed, gpnum) || + ULONG_CMP_LT(j, js) || + ULONG_CMP_GE(gps, js)) + return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; if (rcu_gp_in_progress(rsp) && - (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { + (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); @@ -1317,9 +1351,10 @@ static int rcu_gp_init(struct rcu_state *rsp) } /* Advance to a new grace period and initialize state. */ + record_gp_stall_check_time(rsp); + smp_wmb(); /* Record GP times before starting GP. */ rsp->gpnum++; trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); - record_gp_stall_check_time(rsp); raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ From 0d75292467b0c8554d70c751a35af6514202ac28 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 17 Aug 2013 18:08:37 -0700 Subject: [PATCH 10/11] rcu: Have rcutiny tracepoints use tracepoint_string() This commit extends the work done in f7f7bac9 (rcu: Have the RCU tracepoints use the tracepoint_string infrastructure) to cover rcutiny. Signed-off-by: Paul E. McKenney Cc: Steven Rostedt --- kernel/rcu.h | 7 +++++++ kernel/rcutiny.c | 17 ++++++++++------- kernel/rcutree.c | 7 ------- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/kernel/rcu.h b/kernel/rcu.h index 77131966c4adc..7859a0a3951ea 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -122,4 +122,11 @@ int rcu_jiffies_till_stall_check(void); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ +/* + * Strings used in tracepoints need to be exported via the + * tracing system such that tools like perf and trace-cmd can + * translate the string address pointers to actual text. + */ +#define TPS(x) tracepoint_string(x) + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 9ed6075dc5624..e99eb5fb10afc 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -35,6 +35,7 @@ #include #include #include +#include #ifdef CONFIG_RCU_TRACE #include @@ -58,16 +59,17 @@ static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; static void rcu_idle_enter_common(long long newval) { if (newval) { - RCU_TRACE(trace_rcu_dyntick("--=", + RCU_TRACE(trace_rcu_dyntick(TPS("--="), rcu_dynticks_nesting, newval)); rcu_dynticks_nesting = newval; return; } - RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); + RCU_TRACE(trace_rcu_dyntick(TPS("Start"), + rcu_dynticks_nesting, newval)); if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); - RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", + RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), rcu_dynticks_nesting, newval)); ftrace_dump(DUMP_ALL); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", @@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit); static void rcu_idle_exit_common(long long oldval) { if (oldval) { - RCU_TRACE(trace_rcu_dyntick("++=", + RCU_TRACE(trace_rcu_dyntick(TPS("++="), oldval, rcu_dynticks_nesting)); return; } - RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); + RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); - RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", + RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), oldval, rcu_dynticks_nesting)); ftrace_dump(DUMP_ALL); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", @@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) RCU_TRACE(cb_count++); } RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), + RCU_TRACE(trace_rcu_batch_end(rcp->name, + cb_count, 0, need_resched(), is_idle_task(current), false)); } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b618d72bd8ecb..62aab5ceefe9b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -61,13 +61,6 @@ #include "rcu.h" -/* - * Strings used in tracepoints need to be exported via the - * tracing system such that tools like perf and trace-cmd can - * translate the string address pointers to actual text. - */ -#define TPS(x) tracepoint_string(x) - /* Data structures. */ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; From 5d5a08003d3e678372e375d99c65a24e0d33d2f5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sun, 15 Sep 2013 17:29:17 +0400 Subject: [PATCH 11/11] rcu: Fix CONFIG_RCU_NOCB_CPU_ALL panic on machines with sparse CPU mask Some architectures have sparse cpu mask. UltraSparc's cpuinfo for example: CPU0: online CPU2: online So, set only possible CPUs when CONFIG_RCU_NOCB_CPU_ALL is enabled. Also, check that user passes right 'rcu_nocbs=' option. Signed-off-by: Kirill Tkhai CC: Dipankar Sarma [ paulmck: Fix pr_info() issue noted by scripts/checkpatch.pl. ] Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c684f7ab37fa7..1855d66bf7056 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void) #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ #ifdef CONFIG_RCU_NOCB_CPU_ALL pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_setall(rcu_nocb_mask); + cpumask_copy(rcu_nocb_mask, cpu_possible_mask); #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ if (have_rcu_nocb_mask) { + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { + pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); + cpumask_and(rcu_nocb_mask, cpu_possible_mask, + rcu_nocb_mask); + } cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); if (rcu_nocb_poll)