From 115869f435f7ff8bf35df9eb5605d417f3220028 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Fri, 11 Jul 2008 13:34:54 +0100
Subject: [PATCH]

--- yaml ---
r: 105242
b: refs/heads/master
c: 577b4a58d2e74a4d48050eeea3e3f952ce04eb86
h: refs/heads/master
v: v3
---
 [refs]                        |   2 +-
 trunk/include/linux/cpumask.h |   6 +-
 trunk/include/linux/cpuset.h  |   7 ---
 trunk/include/linux/sched.h   |  11 +---
 trunk/init/main.c             |   7 ---
 trunk/kernel/cpu.c            |  40 ++-----------
 trunk/kernel/cpuset.c         |   2 +-
 trunk/kernel/sched.c          | 108 ++++++++++++++++++++--------------
 trunk/kernel/sched_fair.c     |   3 -
 trunk/kernel/sched_rt.c       |  77 ++++++++++--------------
 10 files changed, 103 insertions(+), 160 deletions(-)

diff --git a/[refs] b/[refs]
index bd1e859d257d..f7b901cca55d 100644
--- a/[refs]
+++ b/[refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: 1b427c153a08fdbc092c2bdbf845b92fda58d857
+refs/heads/master: 577b4a58d2e74a4d48050eeea3e3f952ce04eb86
diff --git a/trunk/include/linux/cpumask.h b/trunk/include/linux/cpumask.h
index d614d2472798..c24875bd9c5b 100644
--- a/trunk/include/linux/cpumask.h
+++ b/trunk/include/linux/cpumask.h
@@ -359,14 +359,13 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
 
 /*
  * The following particular system cpumasks and operations manage
- * possible, present, active and online cpus. Each of them is a fixed size
+ * possible, present and online cpus. Each of them is a fixed size
  * bitmap of size NR_CPUS.
  *
  *  #ifdef CONFIG_HOTPLUG_CPU
  *     cpu_possible_map - has bit 'cpu' set iff cpu is populatable
  *     cpu_present_map  - has bit 'cpu' set iff cpu is populated
  *     cpu_online_map   - has bit 'cpu' set iff cpu available to scheduler
- *     cpu_active_map   - has bit 'cpu' set iff cpu available to migration
 *  #else
 *     cpu_possible_map - has bit 'cpu' set iff cpu is populated
 *     cpu_present_map  - copy of cpu_possible_map
@@ -417,7 +416,6 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
 extern cpumask_t cpu_possible_map;
 extern cpumask_t cpu_online_map;
 extern cpumask_t cpu_present_map;
-extern cpumask_t cpu_active_map;
 
 #if NR_CPUS > 1
 #define num_online_cpus() cpus_weight(cpu_online_map)
@@ -426,7 +424,6 @@ extern cpumask_t cpu_active_map;
 #define cpu_online(cpu) cpu_isset((cpu), cpu_online_map)
 #define cpu_possible(cpu) cpu_isset((cpu), cpu_possible_map)
 #define cpu_present(cpu) cpu_isset((cpu), cpu_present_map)
-#define cpu_active(cpu) cpu_isset((cpu), cpu_active_map)
 #else
 #define num_online_cpus() 1
 #define num_possible_cpus() 1
@@ -434,7 +431,6 @@ extern cpumask_t cpu_active_map;
 #define cpu_online(cpu) ((cpu) == 0)
 #define cpu_possible(cpu) ((cpu) == 0)
 #define cpu_present(cpu) ((cpu) == 0)
-#define cpu_active(cpu) ((cpu) == 0)
 #endif
 
 #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
diff --git a/trunk/include/linux/cpuset.h b/trunk/include/linux/cpuset.h
index e8f450c499b0..038578362b47 100644
--- a/trunk/include/linux/cpuset.h
+++ b/trunk/include/linux/cpuset.h
@@ -78,8 +78,6 @@ extern void cpuset_track_online_nodes(void);
 
 extern int current_cpuset_is_being_rebound(void);
 
-extern void rebuild_sched_domains(void);
-
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
@@ -158,11 +156,6 @@ static inline int current_cpuset_is_being_rebound(void)
 	return 0;
 }
 
-static inline void rebuild_sched_domains(void)
-{
-	partition_sched_domains(0, NULL, NULL);
-}
-
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h
index 26da921530fe..1941d8b5cf11 100644
--- a/trunk/include/linux/sched.h
+++ b/trunk/include/linux/sched.h
@@ -824,16 +824,7 @@ extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 				    struct sched_domain_attr *dattr_new);
 extern int arch_reinit_sched_domains(void);
 
-#else /* CONFIG_SMP */
-
-struct sched_domain_attr;
-
-static inline void
-partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
-			struct sched_domain_attr *dattr_new)
-{
-}
-#endif /* !CONFIG_SMP */
+#endif /* CONFIG_SMP */
 
 struct io_context; /* See blkdev.h */
 #define NGROUPS_SMALL 32
diff --git a/trunk/init/main.c b/trunk/init/main.c
index dd25259530ea..edeace036fd9 100644
--- a/trunk/init/main.c
+++ b/trunk/init/main.c
@@ -415,13 +415,6 @@ static void __init smp_init(void)
 {
 	unsigned int cpu;
 
-	/*
-	 * Set up the current CPU as possible to migrate to.
-	 * The other ones will be done by cpu_up/cpu_down()
-	 */
-	cpu = smp_processor_id();
-	cpu_set(cpu, cpu_active_map);
-
 	/* FIXME: This should be done in userspace --RR */
 	for_each_present_cpu(cpu) {
 		if (num_online_cpus() >= setup_max_cpus)
diff --git a/trunk/kernel/cpu.c b/trunk/kernel/cpu.c
index 033603c1d7c3..cfb1d43ab801 100644
--- a/trunk/kernel/cpu.c
+++ b/trunk/kernel/cpu.c
@@ -64,8 +64,6 @@ void __init cpu_hotplug_init(void)
 	cpu_hotplug.refcount = 0;
 }
 
-cpumask_t cpu_active_map;
-
 #ifdef CONFIG_HOTPLUG_CPU
 
 void get_online_cpus(void)
@@ -293,30 +291,11 @@ int __ref cpu_down(unsigned int cpu)
 	int err = 0;
 
 	cpu_maps_update_begin();
-
-	if (cpu_hotplug_disabled) {
+	if (cpu_hotplug_disabled)
 		err = -EBUSY;
-		goto out;
-	}
-
-	cpu_clear(cpu, cpu_active_map);
-
-	/*
-	 * Make sure the all cpus did the reschedule and are not
-	 * using stale version of the cpu_active_map.
-	 * This is not strictly necessary becuase stop_machine()
-	 * that we run down the line already provides the required
-	 * synchronization. But it's really a side effect and we do not
-	 * want to depend on the innards of the stop_machine here.
-	 */
-	synchronize_sched();
-
-	err = _cpu_down(cpu, 0);
+	else
+		err = _cpu_down(cpu, 0);
 
-	if (cpu_online(cpu))
-		cpu_set(cpu, cpu_active_map);
-
-out:
 	cpu_maps_update_done();
 	return err;
 }
@@ -376,18 +355,11 @@ int __cpuinit cpu_up(unsigned int cpu)
 	}
 
 	cpu_maps_update_begin();
-
-	if (cpu_hotplug_disabled) {
+	if (cpu_hotplug_disabled)
 		err = -EBUSY;
-		goto out;
-	}
-
-	err = _cpu_up(cpu, 0);
+	else
+		err = _cpu_up(cpu, 0);
 
-	if (cpu_online(cpu))
-		cpu_set(cpu, cpu_active_map);
-
-out:
 	cpu_maps_update_done();
 	return err;
 }
diff --git a/trunk/kernel/cpuset.c b/trunk/kernel/cpuset.c
index 3c3ef02f65f1..459d601947a8 100644
--- a/trunk/kernel/cpuset.c
+++ b/trunk/kernel/cpuset.c
@@ -564,7 +564,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  * partition_sched_domains().
  */
 
-void rebuild_sched_domains(void)
+static void rebuild_sched_domains(void)
 {
 	struct kfifo *q; /* queue of cpusets to be scanned */
 	struct cpuset *cp; /* scans q */
diff --git a/trunk/kernel/sched.c b/trunk/kernel/sched.c
index c237624a8a04..1ee18dbb4516 100644
--- a/trunk/kernel/sched.c
+++ b/trunk/kernel/sched.c
@@ -2881,7 +2881,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 
 	rq = task_rq_lock(p, &flags);
 	if (!cpu_isset(dest_cpu, p->cpus_allowed)
-	    || unlikely(!cpu_active(dest_cpu)))
+	    || unlikely(cpu_is_offline(dest_cpu)))
 		goto out;
 
 	/* force the process onto the specified CPU */
@@ -3849,7 +3849,7 @@ int select_nohz_load_balancer(int stop_tick)
 		/*
 		 * If we are going offline and still the leader, give up!
 		 */
-		if (!cpu_active(cpu) &&
+		if (cpu_is_offline(cpu) &&
 		    atomic_read(&nohz.load_balancer) == cpu) {
 			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
 				BUG();
@@ -5876,7 +5876,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	struct rq *rq_dest, *rq_src;
 	int ret = 0, on_rq;
 
-	if (unlikely(!cpu_active(dest_cpu)))
+	if (unlikely(cpu_is_offline(dest_cpu)))
 		return ret;
 
 	rq_src = cpu_rq(src_cpu);
@@ -7553,6 +7553,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 {
 }
 
+/*
+ * Free current domain masks.
+ * Called after all cpus are attached to NULL domain.
+ */
+static void free_sched_domains(void)
+{
+	ndoms_cur = 0;
+	if (doms_cur != &fallback_doms)
+		kfree(doms_cur);
+	doms_cur = &fallback_doms;
+}
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -7631,7 +7643,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * ownership of it and will kfree it when done with it. If the caller
 * failed the kmalloc call, then it can pass in doms_new == NULL,
 * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
+ * 'fallback_doms'.
 *
 * Call with hotplug lock held
 */
@@ -7645,8 +7657,12 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
-	if (doms_new == NULL)
-		ndoms_new = 0;
+	if (doms_new == NULL) {
+		ndoms_new = 1;
+		doms_new = &fallback_doms;
+		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+		dattr_new = NULL;
+	}
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
@@ -7661,14 +7677,6 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 		;
 	}
 
-	if (doms_new == NULL) {
-		ndoms_cur = 0;
-		ndoms_new = 1;
-		doms_new = &fallback_doms;
-		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
-	}
-
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur; j++) {
@@ -7699,10 +7707,17 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 int arch_reinit_sched_domains(void)
 {
+	int err;
+
 	get_online_cpus();
-	rebuild_sched_domains();
+	mutex_lock(&sched_domains_mutex);
+	detach_destroy_domains(&cpu_online_map);
+	free_sched_domains();
+	err = arch_init_sched_domains(&cpu_online_map);
+	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
-	return 0;
+
+	return err;
 }
 
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7768,30 +7783,14 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
-#ifndef CONFIG_CPUSETS
 /*
- * Add online and remove offline CPUs from the scheduler domains.
- * When cpusets are enabled they take over this function.
+ * Force a reinitialization of the sched domains hierarchy. The domains
+ * and groups cannot be updated in place without racing with the balancing
+ * code, so we temporarily attach all running cpus to the NULL domain
+ * which will prevent rebalancing while the sched domains are recalculated.
 */
 static int update_sched_domains(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
-{
-	switch (action) {
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		partition_sched_domains(0, NULL, NULL);
-		return NOTIFY_OK;
-
-	default:
-		return NOTIFY_DONE;
-	}
-}
-#endif
-
-static int update_runtime(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
 {
 	int cpu = (int)(long)hcpu;
 
@@ -7799,18 +7798,44 @@ static int update_runtime(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		disable_runtime(cpu_rq(cpu));
+		/* fall-through */
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		detach_destroy_domains(&cpu_online_map);
+		free_sched_domains();
 		return NOTIFY_OK;
+
 
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 		enable_runtime(cpu_rq(cpu));
-		return NOTIFY_OK;
-
+		/* fall-through */
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		/*
+		 * Fall through and re-initialise the domains.
+		 */
+		break;
 	default:
 		return NOTIFY_DONE;
 	}
+
+#ifndef CONFIG_CPUSETS
+	/*
+	 * Create default domain partitioning if cpusets are disabled.
+	 * Otherwise we let cpusets rebuild the domains based on the
+	 * current setup.
+	 */
+
+	/* The hotplug lock is already held by cpu_up/cpu_down */
+	arch_init_sched_domains(&cpu_online_map);
+#endif
+
+	return NOTIFY_OK;
 }
 
 void __init sched_init_smp(void)
@@ -7830,15 +7855,8 @@ void __init sched_init_smp(void)
 	cpu_set(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
-
-#ifndef CONFIG_CPUSETS
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
-#endif
-
-	/* RT runtime code needs to handle some hotplug events */
-	hotcpu_notifier(update_runtime, 0);
-
 	init_hrtick();
 
 	/* Move init over to a non-isolated CPU */
diff --git a/trunk/kernel/sched_fair.c b/trunk/kernel/sched_fair.c
index d924c679dfac..f2aa987027d6 100644
--- a/trunk/kernel/sched_fair.c
+++ b/trunk/kernel/sched_fair.c
@@ -1004,8 +1004,6 @@ static void yield_task_fair(struct rq *rq)
 * not idle and an idle cpu is available. The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
- * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (cpu_active_map)
 *
 * Returns the CPU we should wake onto.
 */
@@ -1033,7 +1031,6 @@ static int wake_idle(int cpu, struct task_struct *p)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
 			&& !task_hot(p, task_rq(p)->clock, sd))) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
-			cpus_and(tmp, tmp, cpu_active_map);
 			for_each_cpu_mask(i, tmp) {
 				if (idle_cpu(i)) {
 					if (i != task_cpu(p)) {
diff --git a/trunk/kernel/sched_rt.c b/trunk/kernel/sched_rt.c
index 50735bb96149..147004c651c0 100644
--- a/trunk/kernel/sched_rt.c
+++ b/trunk/kernel/sched_rt.c
@@ -505,7 +505,9 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+#ifdef CONFIG_SMP
 		struct rq *rq = rq_of_rt_rq(rt_rq);
+#endif
 
 		rt_rq->highest_prio = rt_se_prio(rt_se);
 #ifdef CONFIG_SMP
@@ -599,7 +601,11 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
 	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
 		return;
 
-	list_add_tail(&rt_se->run_list, queue);
+	if (rt_se->nr_cpus_allowed == 1)
+		list_add(&rt_se->run_list, queue);
+	else
+		list_add_tail(&rt_se->run_list, queue);
+
 	__set_bit(rt_se_prio(rt_se), array->bitmap);
 
 	inc_rt_tasks(rt_se, rt_rq);
@@ -684,34 +690,32 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 * Put task to the end of the run list without the overhead of dequeue
 * followed by enqueue.
 */
-static void
-requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
+static
+void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
 {
-	if (on_rt_rq(rt_se)) {
-		struct rt_prio_array *array = &rt_rq->active;
-		struct list_head *queue = array->queue + rt_se_prio(rt_se);
+	struct rt_prio_array *array = &rt_rq->active;
 
-		if (head)
-			list_move(&rt_se->run_list, queue);
-		else
-			list_move_tail(&rt_se->run_list, queue);
+	if (on_rt_rq(rt_se)) {
+		list_del_init(&rt_se->run_list);
+		list_add_tail(&rt_se->run_list,
+			      array->queue + rt_se_prio(rt_se));
 	}
 }
 
-static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
+static void requeue_task_rt(struct rq *rq, struct task_struct *p)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
 	struct rt_rq *rt_rq;
 
 	for_each_sched_rt_entity(rt_se) {
 		rt_rq = rt_rq_of_se(rt_se);
-		requeue_rt_entity(rt_rq, rt_se, head);
+		requeue_rt_entity(rt_rq, rt_se);
 	}
 }
 
 static void yield_task_rt(struct rq *rq)
 {
-	requeue_task_rt(rq, rq->curr, 0);
+	requeue_task_rt(rq, rq->curr);
 }
 
 #ifdef CONFIG_SMP
@@ -751,30 +755,6 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 	 */
 	return task_cpu(p);
 }
-
-static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
-{
-	cpumask_t mask;
-
-	if (rq->curr->rt.nr_cpus_allowed == 1)
-		return;
-
-	if (p->rt.nr_cpus_allowed != 1
-	    && cpupri_find(&rq->rd->cpupri, p, &mask))
-		return;
-
-	if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
-		return;
-
-	/*
-	 * There appears to be other cpus that can accept
-	 * current and none to run 'p', so lets reschedule
-	 * to try and push current away:
-	 */
-	requeue_task_rt(rq, p, 1);
-	resched_task(rq->curr);
-}
-
 #endif /* CONFIG_SMP */
 
 /*
@@ -800,8 +780,18 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 	 * to move current somewhere else, making room for our non-migratable
 	 * task.
 	 */
-	if (p->prio == rq->curr->prio && !need_resched())
-		check_preempt_equal_prio(rq, p);
+	if((p->prio == rq->curr->prio)
+	   && p->rt.nr_cpus_allowed == 1
+	   && rq->curr->rt.nr_cpus_allowed != 1) {
+		cpumask_t mask;
+
+		if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
+			/*
+			 * There appears to be other cpus that can accept
+			 * current, so lets reschedule to try and push it away
+			 */
+			resched_task(rq->curr);
+	}
 #endif
 }
 
@@ -933,13 +923,6 @@ static int find_lowest_rq(struct task_struct *task)
 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
 		return -1; /* No targets found */
 
-	/*
-	 * Only consider CPUs that are usable for migration.
-	 * I guess we might want to change cpupri_find() to ignore those
-	 * in the first place.
-	 */
-	cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
-
 	/*
 	 * At this point we have built a mask of cpus representing the
 	 * lowest priority tasks in the system. Now we want to elect
@@ -1434,7 +1417,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 	 * on the queue:
 	 */
 	if (p->rt.run_list.prev != p->rt.run_list.next) {
-		requeue_task_rt(rq, p, 0);
+		requeue_task_rt(rq, p);
 		set_tsk_need_resched(p);
 	}
 }
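
A note on the mechanism the patch restores: update_sched_domains() is wired into CPU hotplug through hotcpu_notifier(), so the scheduler tears down and rebuilds its domains on events such as CPU_ONLINE and CPU_DEAD rather than relying on cpu_up()/cpu_down() touching a cpu_active_map. Below is a minimal sketch of that registration pattern under the 2.6.26-era API; the callback and init-function names are illustrative only and are not part of this patch.

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>

/*
 * Hypothetical hotplug callback, following the same shape as
 * update_sched_domains() above: react to the transitions of interest,
 * return NOTIFY_DONE for everything else.
 */
static int example_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		/* Rebuild whatever per-CPU state depends on 'cpu'. */
		printk(KERN_INFO "cpu %d changed state (action %lu)\n",
		       cpu, action);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static int __init example_hotplug_init(void)
{
	/* Priority 0, matching hotcpu_notifier(update_sched_domains, 0). */
	hotcpu_notifier(example_cpu_callback, 0);
	return 0;
}
device_initcall(example_hotplug_init);

The callback runs with the hotplug lock already held for the UP/DOWN prepare and done notifications, which is why the restored update_sched_domains() can call arch_init_sched_domains() directly without taking get_online_cpus() itself.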