From 97b116b02275a5ca0e2fa9c5735abb3ba63ca31b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 23 Aug 2012 14:11:25 +0200
Subject: [PATCH] --- yaml ---

r: 323556
b: refs/heads/master
c: 6a6c0272f17cc80a8286d915f2ddf31557c2d559
h: refs/heads/master
v: v3
---
 [refs]                            |  2 +-
 trunk/arch/alpha/kernel/process.c |  3 +-
 trunk/arch/alpha/kernel/smp.c     |  1 +
 trunk/arch/x86/kernel/cpuid.c     |  5 --
 trunk/arch/x86/kernel/msr.c       |  5 --
 trunk/kernel/rcutree.c            | 93 ++++++++++++++++++++++++-------
 trunk/kernel/rcutree.h            |  3 +
 trunk/kernel/rcutree_trace.c      |  4 +-
 trunk/kernel/sched/core.c         | 41 +++++++-------
 9 files changed, 102 insertions(+), 55 deletions(-)

diff --git a/[refs] b/[refs]
index ae5ef0d1370c..7adf5c9f74b9 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: 429227bbe55647aa42f8f63cac61e4544e248629
+refs/heads/master: 6a6c0272f17cc80a8286d915f2ddf31557c2d559
diff --git a/trunk/arch/alpha/kernel/process.c b/trunk/arch/alpha/kernel/process.c
index d6fde98b74b3..db56f31dbf70 100644
--- a/trunk/arch/alpha/kernel/process.c
+++ b/trunk/arch/alpha/kernel/process.c
@@ -56,7 +56,8 @@ cpu_idle(void)
 
 		while (!need_resched())
 			cpu_relax();
-		schedule();
+
+		schedule_preempt_disabled();
 	}
 }
 
diff --git a/trunk/arch/alpha/kernel/smp.c b/trunk/arch/alpha/kernel/smp.c
index 35ddc02bfa4a..a41ad90a97a6 100644
--- a/trunk/arch/alpha/kernel/smp.c
+++ b/trunk/arch/alpha/kernel/smp.c
@@ -166,6 +166,7 @@ smp_callin(void)
 	DBGS(("smp_callin: commencing CPU %d current %p active_mm %p\n",
 	      cpuid, current, current->active_mm));
 
+	preempt_disable();
 	/* Do nothing. */
 	cpu_idle();
 }
diff --git a/trunk/arch/x86/kernel/cpuid.c b/trunk/arch/x86/kernel/cpuid.c
index 60c78917190c..39472dd2323f 100644
--- a/trunk/arch/x86/kernel/cpuid.c
+++ b/trunk/arch/x86/kernel/cpuid.c
@@ -199,14 +199,12 @@ static int __init cpuid_init(void)
 		goto out_chrdev;
 	}
 	cpuid_class->devnode = cpuid_devnode;
-	get_online_cpus();
 	for_each_online_cpu(i) {
 		err = cpuid_device_create(i);
 		if (err != 0)
 			goto out_class;
 	}
 	register_hotcpu_notifier(&cpuid_class_cpu_notifier);
-	put_online_cpus();
 
 	err = 0;
 	goto out;
@@ -216,7 +214,6 @@ static int __init cpuid_init(void)
 	for_each_online_cpu(i) {
 		cpuid_device_destroy(i);
 	}
-	put_online_cpus();
 	class_destroy(cpuid_class);
 out_chrdev:
 	__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
@@ -228,13 +225,11 @@ static void __exit cpuid_exit(void)
 {
 	int cpu = 0;
 
-	get_online_cpus();
 	for_each_online_cpu(cpu)
 		cpuid_device_destroy(cpu);
 	class_destroy(cpuid_class);
 	__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
 	unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
-	put_online_cpus();
 }
 
 module_init(cpuid_init);
diff --git a/trunk/arch/x86/kernel/msr.c b/trunk/arch/x86/kernel/msr.c
index a7c5661f8496..eb113693f043 100644
--- a/trunk/arch/x86/kernel/msr.c
+++ b/trunk/arch/x86/kernel/msr.c
@@ -257,14 +257,12 @@ static int __init msr_init(void)
 		goto out_chrdev;
 	}
 	msr_class->devnode = msr_devnode;
-	get_online_cpus();
 	for_each_online_cpu(i) {
 		err = msr_device_create(i);
 		if (err != 0)
 			goto out_class;
 	}
 	register_hotcpu_notifier(&msr_class_cpu_notifier);
-	put_online_cpus();
 
 	err = 0;
 	goto out;
@@ -273,7 +271,6 @@ static int __init msr_init(void)
 	i = 0;
 	for_each_online_cpu(i)
 		msr_device_destroy(i);
-	put_online_cpus();
 	class_destroy(msr_class);
 out_chrdev:
 	__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
@@ -284,13 +281,11 @@ static int __init msr_init(void)
 static void __exit msr_exit(void)
 {
 	int cpu = 0;
-	get_online_cpus();
 	for_each_online_cpu(cpu)
 		msr_device_destroy(cpu);
 	class_destroy(msr_class);
 	__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
 	unregister_hotcpu_notifier(&msr_class_cpu_notifier);
-	put_online_cpus();
 }
 
 module_init(msr_init);
diff --git a/trunk/kernel/rcutree.c b/trunk/kernel/rcutree.c
index be76c80a14d1..f7bcd9e6c054 100644
--- a/trunk/kernel/rcutree.c
+++ b/trunk/kernel/rcutree.c
@@ -1392,6 +1392,17 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
 	int i;
 	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
 
+	/*
+	 * If there is an rcu_barrier() operation in progress, then
+	 * only the task doing that operation is permitted to adopt
+	 * callbacks.  To do otherwise breaks rcu_barrier() and friends
+	 * by causing them to fail to wait for the callbacks in the
+	 * orphanage.
+	 */
+	if (rsp->rcu_barrier_in_progress &&
+	    rsp->rcu_barrier_in_progress != current)
+		return;
+
 	/* Do the accounting first. */
 	rdp->qlen_lazy += rsp->qlen_lazy;
 	rdp->qlen += rsp->qlen;
@@ -1446,8 +1457,9 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
  * The CPU has been completely removed, and some other CPU is reporting
  * this fact from process context.  Do the remainder of the cleanup,
  * including orphaning the outgoing CPU's RCU callbacks, and also
- * adopting them.  There can only be one CPU hotplug operation at a time,
- * so no other CPU can be attempting to update rcu_cpu_kthread_task.
+ * adopting them, if there is no _rcu_barrier() instance running.
+ * There can only be one CPU hotplug operation at a time, so no other
+ * CPU can be attempting to update rcu_cpu_kthread_task.
  */
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
@@ -1505,13 +1517,14 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
 		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
 		  cpu, rdp->qlen, rdp->nxtlist);
-	init_callback_list(rdp);
-	/* Disallow further callbacks on this CPU. */
-	rdp->nxttail[RCU_NEXT_TAIL] = NULL;
 }
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
 
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+}
+
 static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 {
 }
@@ -1930,12 +1943,6 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	rdp = this_cpu_ptr(rsp->rda);
 
 	/* Add the callback to our list. */
-	if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) {
-		/* _call_rcu() is illegal on offline CPU; leak the callback. */
-		WARN_ON_ONCE(1);
-		local_irq_restore(flags);
-		return;
-	}
 	ACCESS_ONCE(rdp->qlen)++;
 	if (lazy)
 		rdp->qlen_lazy++;
@@ -2321,10 +2328,13 @@ static void rcu_barrier_func(void *type)
 static void _rcu_barrier(struct rcu_state *rsp)
 {
 	int cpu;
+	unsigned long flags;
 	struct rcu_data *rdp;
+	struct rcu_data rd;
 	unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
 	unsigned long snap_done;
 
+	init_rcu_head_on_stack(&rd.barrier_head);
 	_rcu_barrier_trace(rsp, "Begin", -1, snap);
 
 	/* Take mutex to serialize concurrent rcu_barrier() requests. */
@@ -2364,30 +2374,70 @@ static void _rcu_barrier(struct rcu_state *rsp)
 	/*
 	 * Initialize the count to one rather than to zero in order to
 	 * avoid a too-soon return to zero in case of a short grace period
-	 * (or preemption of this task).  Exclude CPU-hotplug operations
-	 * to ensure that no offline CPU has callbacks queued.
+	 * (or preemption of this task).  Also flag this task as doing
+	 * an rcu_barrier().  This will prevent anyone else from adopting
+	 * orphaned callbacks, which could otherwise cause failure if a
+	 * CPU went offline and quickly came back online.  To see this,
+	 * consider the following sequence of events:
+	 *
+	 * 1.	We cause CPU 0 to post an rcu_barrier_callback() callback.
+	 * 2.	CPU 1 goes offline, orphaning its callbacks.
+	 * 3.	CPU 0 adopts CPU 1's orphaned callbacks.
+	 * 4.	CPU 1 comes back online.
+	 * 5.	We cause CPU 1 to post an rcu_barrier_callback() callback.
+	 * 6.	Both rcu_barrier_callback() callbacks are invoked, awakening
+	 *	us -- but before CPU 1's orphaned callbacks are invoked!!!
 	 */
 	init_completion(&rsp->barrier_completion);
 	atomic_set(&rsp->barrier_cpu_count, 1);
-	get_online_cpus();
+	raw_spin_lock_irqsave(&rsp->onofflock, flags);
+	rsp->rcu_barrier_in_progress = current;
+	raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 
 	/*
-	 * Force each CPU with callbacks to register a new callback.
-	 * When that callback is invoked, we will know that all of the
-	 * corresponding CPU's preceding callbacks have been invoked.
+	 * Force every CPU with callbacks to register a new callback
+	 * that will tell us when all the preceding callbacks have
+	 * been invoked.  If an offline CPU has callbacks, wait for
+	 * it to either come back online or to finish orphaning those
+	 * callbacks.
 	 */
-	for_each_online_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
+		preempt_disable();
 		rdp = per_cpu_ptr(rsp->rda, cpu);
-		if (ACCESS_ONCE(rdp->qlen)) {
+		if (cpu_is_offline(cpu)) {
+			_rcu_barrier_trace(rsp, "Offline", cpu,
+					   rsp->n_barrier_done);
+			preempt_enable();
+			while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
+				schedule_timeout_interruptible(1);
+		} else if (ACCESS_ONCE(rdp->qlen)) {
 			_rcu_barrier_trace(rsp, "OnlineQ", cpu,
 					   rsp->n_barrier_done);
 			smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
+			preempt_enable();
 		} else {
 			_rcu_barrier_trace(rsp, "OnlineNQ", cpu,
 					   rsp->n_barrier_done);
+			preempt_enable();
 		}
 	}
-	put_online_cpus();
+
+	/*
+	 * Now that all online CPUs have rcu_barrier_callback() callbacks
+	 * posted, we can adopt all of the orphaned callbacks and place
+	 * an rcu_barrier_callback() callback after them.  When that is done,
+	 * we are guaranteed to have an rcu_barrier_callback() callback
+	 * following every callback that could possibly have been
+	 * registered before _rcu_barrier() was called.
+	 */
+	raw_spin_lock_irqsave(&rsp->onofflock, flags);
+	rcu_adopt_orphan_cbs(rsp);
+	rsp->rcu_barrier_in_progress = NULL;
+	raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+	atomic_inc(&rsp->barrier_cpu_count);
+	smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
+	rd.rsp = rsp;
+	rsp->call(&rd.barrier_head, rcu_barrier_callback);
 
 	/*
 	 * Now that we have an rcu_barrier_callback() callback on each
@@ -2408,6 +2458,8 @@ static void _rcu_barrier(struct rcu_state *rsp)
 
 	/* Other rcu_barrier() invocations can now safely proceed. */
 	mutex_unlock(&rsp->barrier_mutex);
+
+	destroy_rcu_head_on_stack(&rd.barrier_head);
 }
 
 /**
@@ -2473,7 +2525,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 	rdp->qlen_last_fqs_check = 0;
 	rdp->n_force_qs_snap = rsp->n_force_qs;
 	rdp->blimit = blimit;
-	init_callback_list(rdp);  /* Re-enable callbacks on this CPU. */
 	rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
 	atomic_set(&rdp->dynticks->dynticks,
 		   (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
diff --git a/trunk/kernel/rcutree.h b/trunk/kernel/rcutree.h
index 94dfdf1f31f5..4d29169f2124 100644
--- a/trunk/kernel/rcutree.h
+++ b/trunk/kernel/rcutree.h
@@ -398,6 +398,9 @@ struct rcu_state {
 	struct rcu_head **orphan_donetail;	/* Tail of above. */
 	long qlen_lazy;				/* Number of lazy callbacks. */
 	long qlen;				/* Total number of callbacks. */
+	struct task_struct *rcu_barrier_in_progress;
+						/* Task doing rcu_barrier(), */
+						/* or NULL if no barrier. */
 	struct mutex barrier_mutex;		/* Guards barrier fields. */
 	atomic_t barrier_cpu_count;		/* # CPUs waiting on. */
 	struct completion barrier_completion;	/* Wake at barrier end. */
diff --git a/trunk/kernel/rcutree_trace.c b/trunk/kernel/rcutree_trace.c
index 6a2e52a85d77..abffb486e94e 100644
--- a/trunk/kernel/rcutree_trace.c
+++ b/trunk/kernel/rcutree_trace.c
@@ -51,8 +51,8 @@ static int show_rcubarrier(struct seq_file *m, void *unused)
 	struct rcu_state *rsp;
 
 	for_each_rcu_flavor(rsp)
-		seq_printf(m, "%s: bcc: %d nbd: %lu\n",
-			   rsp->name,
+		seq_printf(m, "%s: %c bcc: %d nbd: %lu\n",
+			   rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.',
 			   atomic_read(&rsp->barrier_cpu_count),
 			   rsp->n_barrier_done);
 	return 0;
diff --git a/trunk/kernel/sched/core.c b/trunk/kernel/sched/core.c
index 8c38b5e7ce47..fbf1fd098dc6 100644
--- a/trunk/kernel/sched/core.c
+++ b/trunk/kernel/sched/core.c
@@ -5304,17 +5304,27 @@ void idle_task_exit(void)
 }
 
 /*
- * Since this CPU is going 'away' for a while, fold any nr_active delta
- * we might have.  Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable.
- *
- * Also see the comment "Global load-average calculations".
+ * While a dead CPU has no uninterruptible tasks queued at this point,
+ * it might still have a nonzero ->nr_uninterruptible counter, because
+ * for performance reasons the counter is not strictly tracking tasks to
+ * their home CPUs.  So we just add the counter to another CPU's counter,
+ * to keep the global sum constant after CPU-down:
  */
-static void calc_load_migrate(struct rq *rq)
+static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
-	long delta = calc_load_fold_active(rq);
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
+	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
+
+	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
+	rq_src->nr_uninterruptible = 0;
+}
+
+/*
+ * Remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+	rq->calc_load_active = 0;
 }
 
 /*
@@ -5607,18 +5617,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		migrate_tasks(cpu);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
-		break;
 
-	case CPU_DEAD:
-		{
-			struct rq *dest_rq;
-
-			local_irq_save(flags);
-			dest_rq = cpu_rq(smp_processor_id());
-			raw_spin_lock(&dest_rq->lock);
-			calc_load_migrate(rq);
-			raw_spin_unlock_irqrestore(&dest_rq->lock, flags);
-		}
+		migrate_nr_uninterruptible(rq);
+		calc_global_load_remove(rq);
 		break;
 #endif
 	}
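
For reviewers unfamiliar with the counting scheme in _rcu_barrier() above:
the count is initialized to one rather than zero, one reference is added per
posted rcu_barrier_callback(), and the initial "bias" reference is dropped
last, so the count cannot hit zero while callbacks are still being posted.
The same scheme can be modeled in plain userspace C.  The sketch below is
illustrative only; every name in it (barrier_count, barrier_callback, worker,
NWORKERS) is invented for the example and none of it is kernel API.

/*
 * Userspace model of the _rcu_barrier() counting scheme (illustrative
 * only; nothing here is kernel code).  The count starts at 1 (the bias)
 * so that a worker that finishes immediately cannot drive it to zero
 * before all workers have been posted; the poster drops the bias last,
 * mirroring the atomic_set(..., 1) / atomic_dec_and_test() pattern.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define NWORKERS 4

static atomic_int barrier_count;
static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;
static int done;	/* protected by done_lock */

/* Analogue of rcu_barrier_callback(): the last decrement signals. */
static void barrier_callback(void)
{
	if (atomic_fetch_sub(&barrier_count, 1) == 1) {
		pthread_mutex_lock(&done_lock);
		done = 1;
		pthread_cond_broadcast(&done_cv);
		pthread_mutex_unlock(&done_lock);
	}
}

/* Each worker "invokes" its earlier callbacks, then the barrier one. */
static void *worker(void *arg)
{
	(void)arg;
	usleep(1000);	/* stand-in for invoking preceding callbacks */
	barrier_callback();
	return NULL;
}

int main(void)
{
	pthread_t tids[NWORKERS];
	int i;

	atomic_store(&barrier_count, 1);	/* the initial bias */

	for (i = 0; i < NWORKERS; i++) {
		atomic_fetch_add(&barrier_count, 1);	/* one per "CPU" */
		pthread_create(&tids[i], NULL, worker, NULL);
	}

	barrier_callback();	/* drop the bias; may be the last reference */

	pthread_mutex_lock(&done_lock);
	while (!done)
		pthread_cond_wait(&done_cv, &done_lock);
	pthread_mutex_unlock(&done_lock);

	for (i = 0; i < NWORKERS; i++)
		pthread_join(tids[i], NULL);
	printf("all callbacks completed\n");
	return 0;
}

Build with "cc -pthread"; the final line prints only after every worker's
callback has run, which is the guarantee rcu_barrier() provides for RCU
callbacks.  Dropping the bias before posting all workers would reintroduce
exactly the too-soon-return-to-zero problem the comment above describes.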