Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RCU changes from Ingo Molnar:
 "The main changes in this cycle were:

   - changes permitting use of call_rcu() and friends very early in
     boot, for example, before rcu_init() is invoked.

   - add in-kernel API to enable and disable expediting of normal RCU
     grace periods.

   - improve RCU's handling of (hotplug-) outgoing CPUs.

   - NO_HZ_FULL_SYSIDLE fixes.

   - tiny-RCU updates to make it more tiny.

   - documentation updates.

   - miscellaneous fixes"

* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (58 commits)
  cpu: Provide smpboot_thread_init() on !CONFIG_SMP kernels as well
  cpu: Defer smpboot kthread unparking until CPU known to scheduler
  rcu: Associate quiescent-state reports with grace period
  rcu: Yet another fix for preemption and CPU hotplug
  rcu: Add diagnostics to grace-period cleanup
  rcutorture: Default to grace-period-initialization delays
  rcu: Handle outgoing CPUs on exit from idle loop
  cpu: Make CPU-offline idle-loop transition point more precise
  rcu: Eliminate ->onoff_mutex from rcu_node structure
  rcu: Process offlining and onlining only at grace-period start
  rcu: Move rcu_report_unblock_qs_rnp() to common code
  rcu: Rework preemptible expedited bitmask handling
  rcu: Remove event tracing from rcu_cpu_notify(), used by offline CPUs
  rcutorture: Enable slow grace-period initializations
  rcu: Provide diagnostic option to slow down grace-period initialization
  rcu: Detect stalls caused by failure to propagate up rcu_node tree
  rcu: Eliminate empty HOTPLUG_CPU ifdef
  rcu: Simplify sync_rcu_preempt_exp_init()
  rcu: Put all orphan-callback-related code under same comment
  rcu: Consolidate offline-CPU callback initialization
  ...
Linus Torvalds committed Apr 14, 2015
2 parents eeee78c + 590ee7d commit 078838d
Showing 31 changed files with 986 additions and 440 deletions.
45 changes: 23 additions & 22 deletions Documentation/atomic_ops.txt
@@ -201,11 +201,11 @@ These routines add 1 and subtract 1, respectively, from the given
atomic_t and return the new counter value after the operation is
performed.

Unlike the above routines, it is required that explicit memory
barriers are performed before and after the operation. It must be
done such that all memory operations before and after the atomic
operation calls are strongly ordered with respect to the atomic
operation itself.
Unlike the above routines, it is required that these primitives
include explicit memory barriers that are performed before and after
the operation. It must be done such that all memory operations before
and after the atomic operation calls are strongly ordered with respect
to the atomic operation itself.

For example, it should behave as if a smp_mb() call existed both
before and after the atomic operation.
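
A minimal sketch of that requirement (an editorial illustration, not part
of this patch; __arch_inc_return_relaxed() is a hypothetical unordered
primitive):

        /* The ordering must be as if the increment were bracketed
         * by full barriers on both sides. */
        smp_mb();                            /* order all earlier accesses */
        new = __arch_inc_return_relaxed(v);  /* hypothetical raw increment */
        smp_mb();                            /* order all later accesses   */
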
@@ -233,21 +233,21 @@ These two routines increment and decrement by 1, respectively, the
given atomic counter. They return a boolean indicating whether the
resulting counter value was zero or not.

It requires explicit memory barrier semantics around the operation as
above.
Again, these primitives provide explicit memory barrier semantics around
the atomic operation.

int atomic_sub_and_test(int i, atomic_t *v);

This is identical to atomic_dec_and_test() except that an explicit
decrement is given instead of the implicit "1". It requires explicit
memory barrier semantics around the operation.
decrement is given instead of the implicit "1". This primitive must
provide explicit memory barrier semantics around the operation.

int atomic_add_negative(int i, atomic_t *v);

The given increment is added to the given atomic counter value. A
boolean is returned which indicates whether the resulting counter value
is negative. It requires explicit memory barrier semantics around the
operation.
The given increment is added to the given atomic counter value. A boolean
is returned which indicates whether the resulting counter value is negative.
This primitive must provide explicit memory barrier semantics around
the operation.

Then:

@@ -257,7 +257,7 @@ This performs an atomic exchange operation on the atomic variable v, setting
the given new value. It returns the old value that the atomic variable v had
just before the operation.

atomic_xchg requires explicit memory barriers around the operation.
atomic_xchg must provide explicit memory barriers around the operation.

int atomic_cmpxchg(atomic_t *v, int old, int new);

@@ -266,7 +266,7 @@ with the given old and new values. Like all atomic_xxx operations,
atomic_cmpxchg will only satisfy its atomicity semantics as long as all
other accesses of *v are performed through atomic_xxx operations.

atomic_cmpxchg requires explicit memory barriers around the operation.
atomic_cmpxchg must provide explicit memory barriers around the operation.

The semantics for atomic_cmpxchg are the same as those defined for 'cas'
below.
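
A typical use is a compare-and-swap retry loop. A minimal sketch
(atomic_sat_inc() is a hypothetical helper, not part of this patch):

        static int atomic_sat_inc(atomic_t *v, int limit)
        {
                int old, new;

                do {
                        old = atomic_read(v);
                        if (old >= limit)
                                return old;     /* saturated; no store */
                        new = old + 1;
                } while (atomic_cmpxchg(v, old, new) != old);

                return new;
        }
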
@@ -279,8 +279,8 @@ If the atomic value v is not equal to u, this function adds a to v, and
returns non-zero. If v is equal to u then it returns zero. This is done as
an atomic operation.

atomic_add_unless requires explicit memory barriers around the operation
unless it fails (returns 0).
atomic_add_unless must provide explicit memory barriers around the
operation unless it fails (returns 0).

atomic_inc_not_zero, equivalent to atomic_add_unless(v, 1, 0)
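
A common use of atomic_inc_not_zero is taking a reference during a
lockless lookup only if the object is not already being freed (a sketch;
obj and its refcnt field are hypothetical):

        if (!atomic_inc_not_zero(&obj->refcnt))
                return NULL;    /* lost the race with the final put */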

@@ -460,9 +460,9 @@ the return value into an int. There are other places where things
like this occur as well.

These routines, like the atomic_t counter operations returning values,
require explicit memory barrier semantics around their execution. All
memory operations before the atomic bit operation call must be made
visible globally before the atomic bit operation is made visible.
must provide explicit memory barrier semantics around their execution.
All memory operations before the atomic bit operation call must be
made visible globally before the atomic bit operation is made visible.
Likewise, the atomic bit operation must be visible globally before any
subsequent memory operation is made visible. For example:

@@ -536,8 +536,9 @@ except that two underscores are prefixed to the interface name.
These non-atomic variants also do not require any special memory
barrier semantics.

The routines xchg() and cmpxchg() need the same exact memory barriers
as the atomic and bit operations returning values.
The routines xchg() and cmpxchg() must provide the same exact
memory-barrier semantics as the atomic and bit operations returning
values.
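
For example (an editorial sketch; cur_obj and free_obj() are
hypothetical), those implied barriers are what make xchg() safe for
publishing initialized data:

        obj->state = READY;             /* plain stores ...            */
        old = xchg(&cur_obj, obj);      /* ... ordered before the swap */
        if (old)
                free_obj(old);          /* hypothetical cleanup helper */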

Spinlocks and rwlocks have memory barrier expectations as well.
The rule to follow is simple:
20 changes: 15 additions & 5 deletions Documentation/kernel-parameters.txt
@@ -2969,6 +2969,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Set maximum number of finished RCU callbacks to
process in one batch.

rcutree.gp_init_delay= [KNL]
Set the number of jiffies to delay each step of
RCU grace-period initialization. This only has
effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT is
set.
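
For example (hypothetical value), a kernel built with
CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y could be booted with:

        rcutree.gp_init_delay=3

to insert a three-jiffy delay at each initialization step.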

rcutree.rcu_fanout_leaf= [KNL]
Increase the number of CPUs assigned to each
leaf rcu_node structure. Useful for very large
@@ -2992,11 +2998,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
value is one, and maximum value is HZ.

rcutree.kthread_prio= [KNL,BOOT]
Set the SCHED_FIFO priority of the RCU
per-CPU kthreads (rcuc/N). This value is also
used for the priority of the RCU boost threads
(rcub/N). Valid values are 1-99 and the default
is 1 (the least-favored priority).
Set the SCHED_FIFO priority of the RCU per-CPU
kthreads (rcuc/N). This value is also used for
the priority of the RCU boost threads (rcub/N)
and for the RCU grace-period kthreads (rcu_bh,
rcu_preempt, and rcu_sched). If RCU_BOOST is
set, valid values are 1-99 and the default is 1
(the least-favored priority). Otherwise, when
RCU_BOOST is not set, valid values are 0-99 and
the default is zero (non-realtime operation).
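
For example (hypothetical value), an RCU_BOOST=y real-time system
might boot with:

        rcutree.kthread_prio=50

to run the rcuc/N, rcub/N, and grace-period kthreads at SCHED_FIFO
priority 50.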

rcutree.rcu_nocb_leader_stride= [KNL]
Set the number of NOCB kthread groups, which
34 changes: 21 additions & 13 deletions Documentation/kernel-per-CPU-kthreads.txt
@@ -190,33 +190,37 @@ To reduce its OS jitter, do any of the following:
on each CPU, including cs_dbs_timer() and od_dbs_timer().
WARNING: Please check your CPU specifications to
make sure that this is safe on your particular system.
d. It is not possible to entirely get rid of OS jitter
from vmstat_update() on CONFIG_SMP=y systems, but you
can decrease its frequency by writing a large value
to /proc/sys/vm/stat_interval. The default value is
HZ, for an interval of one second. Of course, larger
values will make your virtual-memory statistics update
more slowly. Of course, you can also run your workload
at a real-time priority, thus preempting vmstat_update(),
d. As of v3.18, Christoph Lameter's on-demand vmstat workers
commit prevents OS jitter due to vmstat_update() on
CONFIG_SMP=y systems. Before v3.18, it is not possible
to entirely get rid of the OS jitter, but you can
decrease its frequency by writing a large value to
/proc/sys/vm/stat_interval. The default value is HZ,
for an interval of one second. Of course, larger values
will make your virtual-memory statistics update more
slowly. Alternatively, you can run your workload at
a real-time priority, thus preempting vmstat_update(),
but if your workload is CPU-bound, this is a bad idea.
However, there is an RFC patch from Christoph Lameter
(based on an earlier one from Gilad Ben-Yossef) that
reduces or even eliminates vmstat overhead for some
workloads at https://lkml.org/lkml/2013/9/4/379.
e. If running on high-end powerpc servers, build with
e. Boot with "elevator=noop" to avoid workqueue use by
the block layer.
f. If running on high-end powerpc servers, build with
CONFIG_PPC_RTAS_DAEMON=n. This prevents the RTAS
daemon from running on each CPU every second or so.
(This will require editing Kconfig files and will defeat
this platform's RAS functionality.) This avoids jitter
due to the rtas_event_scan() function.
WARNING: Please check your CPU specifications to
make sure that this is safe on your particular system.
f. If running on Cell Processor, build your kernel with
g. If running on Cell Processor, build your kernel with
CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from
spu_gov_work().
WARNING: Please check your CPU specifications to
make sure that this is safe on your particular system.
g. If running on PowerMAC, build your kernel with
h. If running on PowerMAC, build your kernel with
CONFIG_PMAC_RACKMETER=n to disable the CPU-meter,
avoiding OS jitter from rackmeter_do_timer().

@@ -258,8 +262,12 @@ Purpose: Detect software lockups on each CPU.
To reduce its OS jitter, do at least one of the following:
1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
kthreads from being created in the first place.
2. Echo a zero to /proc/sys/kernel/watchdog to disable the
2. Boot with "nosoftlockup=0", which will also prevent these kthreads
from being created. Other related watchdog and softlockup boot
parameters may be found in Documentation/kernel-parameters.txt
and Documentation/watchdog/watchdog-parameters.txt.
3. Echo a zero to /proc/sys/kernel/watchdog to disable the
watchdog timer.
3. Echo a large number to /proc/sys/kernel/watchdog_thresh in
4. Echo a large number to /proc/sys/kernel/watchdog_thresh in
order to reduce the frequency of OS jitter due to the watchdog
timer down to a level that is acceptable for your workload.
42 changes: 29 additions & 13 deletions Documentation/memory-barriers.txt
@@ -592,9 +592,9 @@ See also the subsection on "Cache Coherency" for a more thorough example.
CONTROL DEPENDENCIES
--------------------

A control dependency requires a full read memory barrier, not simply a data
dependency barrier to make it work correctly. Consider the following bit of
code:
A load-load control dependency requires a full read memory barrier, not
simply a data dependency barrier to make it work correctly. Consider the
following bit of code:

q = ACCESS_ONCE(a);
if (q) {
@@ -615,14 +615,15 @@ case what's actually required is:
}

However, stores are not speculated. This means that ordering -is- provided
in the following example:
for load-store control dependencies, as in the following example:

q = ACCESS_ONCE(a);
if (q) {
ACCESS_ONCE(b) = p;
}

Please note that ACCESS_ONCE() is not optional! Without the
Control dependencies pair normally with other types of barriers.
That said, please note that ACCESS_ONCE() is not optional! Without the
ACCESS_ONCE(), the compiler might combine the load from 'a' with other loads from
'a', and the store to 'b' with other stores to 'b', with possible highly
counterintuitive effects on ordering.
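
To illustrate the hazard (an editorial sketch): if the compiler can
prove that 'a' is always non-zero, it is entitled to drop the branch
entirely, emitting:

        q = a;
        b = p;          /* store no longer conditional, so the CPU is
                           free to reorder it before the load from 'a' */
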
@@ -813,6 +814,8 @@ In summary:
barrier() can help to preserve your control dependency. Please
see the Compiler Barrier section for more information.

(*) Control dependencies pair normally with other types of barriers.

(*) Control dependencies do -not- provide transitivity. If you
need transitivity, use smp_mb().

@@ -823,14 +826,14 @@ SMP BARRIER PAIRING
When dealing with CPU-CPU interactions, certain types of memory barrier should
always be paired. A lack of appropriate pairing is almost certainly an error.

General barriers pair with each other, though they also pair with
most other types of barriers, albeit without transitivity. An acquire
barrier pairs with a release barrier, but both may also pair with other
barriers, including of course general barriers. A write barrier pairs
with a data dependency barrier, an acquire barrier, a release barrier,
a read barrier, or a general barrier. Similarly a read barrier or a
data dependency barrier pairs with a write barrier, an acquire barrier,
a release barrier, or a general barrier:
General barriers pair with each other, though they also pair with most
other types of barriers, albeit without transitivity. An acquire barrier
pairs with a release barrier, but both may also pair with other barriers,
including of course general barriers. A write barrier pairs with a data
dependency barrier, a control dependency, an acquire barrier, a release
barrier, a read barrier, or a general barrier. Similarly a read barrier,
control dependency, or a data dependency barrier pairs with a write
barrier, an acquire barrier, a release barrier, or a general barrier:

CPU 1 CPU 2
=============== ===============
@@ -850,6 +853,19 @@ Or:
<data dependency barrier>
y = *x;

Or even:

CPU 1 CPU 2
=============== ===============================
r1 = ACCESS_ONCE(y);
<general barrier>
ACCESS_ONCE(x) = 1;   if (r2 = ACCESS_ONCE(x)) {
<implicit control dependency>
ACCESS_ONCE(y) = 1;
}

assert(r1 == 0 || r2 == 0);

Basically, the read barrier always has to be there, even though it can be of
the "weaker" type.

10 changes: 3 additions & 7 deletions Documentation/timers/NO_HZ.txt
@@ -158,13 +158,9 @@ not come for free:
to the need to inform kernel subsystems (such as RCU) about
the change in mode.

3. POSIX CPU timers on adaptive-tick CPUs may miss their deadlines
(perhaps indefinitely) because they currently rely on
scheduling-tick interrupts. This will likely be fixed in
one of two ways: (1) Prevent CPUs with POSIX CPU timers from
entering adaptive-tick mode, or (2) Use hrtimers or other
adaptive-ticks-immune mechanism to cause the POSIX CPU timer to
fire properly.
3. POSIX CPU timers prevent CPUs from entering adaptive-tick mode.
Real-time applications needing to take actions based on CPU time
consumption need to use other means of doing so.

4. If there are more perf events pending than the hardware can
accommodate, they are normally round-robined so as to collect
6 changes: 2 additions & 4 deletions arch/blackfin/mach-common/smp.c
@@ -413,16 +413,14 @@ int __cpu_disable(void)
return 0;
}

static DECLARE_COMPLETION(cpu_killed);

int __cpu_die(unsigned int cpu)
{
return wait_for_completion_timeout(&cpu_killed, 5000);
return cpu_wait_death(cpu, 5);
}

void cpu_die(void)
{
complete(&cpu_killed);
(void)cpu_report_death();

atomic_dec(&init_mm.mm_users);
atomic_dec(&init_mm.mm_count);
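
For reference, the arch-neutral helpers that replace the per-arch
completion (declarations as added by this merge; the comments state
semantics inferred from the callers shown here):

        /* Surviving CPU: wait up to @seconds for @cpu to report death;
         * returns nonzero (true) on success, zero on timeout. */
        bool cpu_wait_death(unsigned int cpu, int seconds);

        /* Dying CPU: report death to any waiter. */
        bool cpu_report_death(void);
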
5 changes: 2 additions & 3 deletions arch/metag/kernel/smp.c
@@ -261,7 +261,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
}

#ifdef CONFIG_HOTPLUG_CPU
static DECLARE_COMPLETION(cpu_killed);

/*
* __cpu_disable runs on the processor to be shutdown.
@@ -299,7 +298,7 @@ int __cpu_disable(void)
*/
void __cpu_die(unsigned int cpu)
{
if (!wait_for_completion_timeout(&cpu_killed, msecs_to_jiffies(1)))
if (!cpu_wait_death(cpu, 1))
pr_err("CPU%u: unable to kill\n", cpu);
}

@@ -314,7 +313,7 @@ void cpu_die(void)
local_irq_disable();
idle_task_exit();

complete(&cpu_killed);
(void)cpu_report_death();

asm ("XOR TXENABLE, D0Re0,D0Re0\n");
}
2 changes: 0 additions & 2 deletions arch/x86/include/asm/cpu.h
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action);
#endif
#endif

DECLARE_PER_CPU(int, cpu_state);

int mwait_usable(const struct cpuinfo_x86 *);

#endif /* _ASM_X86_CPU_H */
2 changes: 1 addition & 1 deletion arch/x86/include/asm/smp.h
@@ -150,13 +150,13 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
}

void cpu_disable_common(void);
void cpu_die_common(unsigned int cpu);
void native_smp_prepare_boot_cpu(void);
void native_smp_prepare_cpus(unsigned int max_cpus);
void native_smp_cpus_done(unsigned int max_cpus);
void common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_cpu_disable(void);
int common_cpu_die(unsigned int cpu);
void native_cpu_die(unsigned int cpu);
void native_play_dead(void);
void play_dead_common(void);
