From e3334a482b78c90a8f89b0ef1118771cba526e1b Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Sun, 27 Nov 2011 21:43:10 +0000 Subject: [PATCH] --- yaml --- r: 298354 b: refs/heads/master c: 01f23e1630d944f7085cd8fd5793e31ea91c03d8 h: refs/heads/master v: v3 --- [refs] | 2 +- trunk/Documentation/scheduler/sched-stats.txt | 3 +- trunk/MAINTAINERS | 4 +- trunk/arch/arm/kernel/process.c | 4 +- trunk/arch/arm/kernel/smp.c | 7 + trunk/arch/avr32/kernel/process.c | 4 +- trunk/arch/blackfin/kernel/process.c | 4 +- trunk/arch/cris/kernel/process.c | 4 +- trunk/arch/frv/kernel/process.c | 4 +- trunk/arch/h8300/kernel/process.c | 4 +- trunk/arch/hexagon/kernel/smp.c | 2 + trunk/arch/ia64/kernel/process.c | 4 +- trunk/arch/m32r/kernel/process.c | 4 +- trunk/arch/m68k/kernel/process_mm.c | 4 +- trunk/arch/m68k/kernel/process_no.c | 4 +- trunk/arch/microblaze/kernel/process.c | 4 +- trunk/arch/mips/kernel/process.c | 4 +- trunk/arch/mn10300/kernel/process.c | 4 +- trunk/arch/parisc/kernel/process.c | 4 +- trunk/arch/powerpc/kernel/idle.c | 8 +- trunk/arch/powerpc/platforms/iseries/setup.c | 8 +- trunk/arch/s390/kernel/process.c | 4 +- trunk/arch/s390/kernel/smp.c | 6 + trunk/arch/score/kernel/process.c | 4 +- trunk/arch/sh/kernel/idle.c | 4 +- trunk/arch/sparc/kernel/process_32.c | 8 +- trunk/arch/sparc/kernel/process_64.c | 10 +- trunk/arch/tile/kernel/process.c | 4 +- trunk/arch/x86/include/asm/timer.h | 8 +- trunk/arch/x86/kernel/process_32.c | 4 +- trunk/arch/x86/kernel/process_64.c | 4 +- trunk/arch/x86/kernel/smpboot.c | 18 + trunk/arch/x86/kernel/tsc.c | 3 +- trunk/arch/xtensa/kernel/process.c | 4 +- trunk/block/blk-softirq.c | 16 +- trunk/block/blk.h | 16 + trunk/fs/proc/base.c | 3 +- trunk/include/linux/cpuset.h | 6 +- trunk/include/linux/init_task.h | 2 +- trunk/include/linux/kernel.h | 13 - trunk/include/linux/preempt.h | 5 +- trunk/include/linux/printk.h | 10 - trunk/include/linux/sched.h | 41 +- trunk/include/linux/wait.h | 5 +- trunk/init/main.c | 5 +- trunk/kernel/cpuset.c | 21 +- trunk/kernel/mutex.c | 4 +- trunk/kernel/printk.c | 40 +- trunk/kernel/sched/auto_group.c | 12 +- trunk/kernel/sched/core.c | 198 +++------ trunk/kernel/sched/debug.c | 1 + trunk/kernel/sched/fair.c | 408 ++++++++++-------- trunk/kernel/sched/rt.c | 45 +- trunk/kernel/sched/sched.h | 18 +- trunk/kernel/sched/stats.c | 4 +- trunk/kernel/softirq.c | 8 +- 56 files changed, 526 insertions(+), 526 deletions(-) diff --git a/[refs] b/[refs] index 2700e53c598c..bce2aa989b9d 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 6135fc1eb4b1c9ae5f535507ed59591bab51e630 +refs/heads/master: 01f23e1630d944f7085cd8fd5793e31ea91c03d8 diff --git a/trunk/Documentation/scheduler/sched-stats.txt b/trunk/Documentation/scheduler/sched-stats.txt index 8259b34a66ae..1cd5d51bc761 100644 --- a/trunk/Documentation/scheduler/sched-stats.txt +++ b/trunk/Documentation/scheduler/sched-stats.txt @@ -38,8 +38,7 @@ First field is a sched_yield() statistic: 1) # of times sched_yield() was called Next three are schedule() statistics: - 2) This field is a legacy array expiration count field used in the O(1) - scheduler. We kept it for ABI compatibility, but it is always set to zero. 
+ 2) # of times we switched to the expired queue and reused it 3) # of times schedule() was called 4) # of times schedule() left the processor idle diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS index 1ad6a06b0180..3321d75c6c7f 100644 --- a/trunk/MAINTAINERS +++ b/trunk/MAINTAINERS @@ -5112,7 +5112,7 @@ F: kernel/delayacct.c PERFORMANCE EVENTS SUBSYSTEM M: Peter Zijlstra M: Paul Mackerras -M: Ingo Molnar +M: Ingo Molnar M: Arnaldo Carvalho de Melo T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core S: Supported @@ -5736,7 +5736,7 @@ S: Maintained F: drivers/watchdog/sc1200wdt.c SCHEDULER -M: Ingo Molnar +M: Ingo Molnar M: Peter Zijlstra T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core S: Maintained diff --git a/trunk/arch/arm/kernel/process.c b/trunk/arch/arm/kernel/process.c index c2ae3cd331fe..971d65c253a9 100644 --- a/trunk/arch/arm/kernel/process.c +++ b/trunk/arch/arm/kernel/process.c @@ -239,7 +239,9 @@ void cpu_idle(void) leds_event(led_idle_end); rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/arm/kernel/smp.c b/trunk/arch/arm/kernel/smp.c index d616ed51e7a7..cdeb727527d3 100644 --- a/trunk/arch/arm/kernel/smp.c +++ b/trunk/arch/arm/kernel/smp.c @@ -295,6 +295,13 @@ asmlinkage void __cpuinit secondary_start_kernel(void) */ percpu_timer_setup(); + while (!cpu_active(cpu)) + cpu_relax(); + + /* + * cpu_active bit is set, so it's safe to enalbe interrupts + * now. + */ local_irq_enable(); local_fiq_enable(); diff --git a/trunk/arch/avr32/kernel/process.c b/trunk/arch/avr32/kernel/process.c index 92c5af98a6f7..ea3395750324 100644 --- a/trunk/arch/avr32/kernel/process.c +++ b/trunk/arch/avr32/kernel/process.c @@ -40,7 +40,9 @@ void cpu_idle(void) cpu_idle_sleep(); rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/blackfin/kernel/process.c b/trunk/arch/blackfin/kernel/process.c index a80a643f3691..8dd0416673cb 100644 --- a/trunk/arch/blackfin/kernel/process.c +++ b/trunk/arch/blackfin/kernel/process.c @@ -94,7 +94,9 @@ void cpu_idle(void) idle(); rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/cris/kernel/process.c b/trunk/arch/cris/kernel/process.c index d8f50ff6fadd..aa585e4e979e 100644 --- a/trunk/arch/cris/kernel/process.c +++ b/trunk/arch/cris/kernel/process.c @@ -115,7 +115,9 @@ void cpu_idle (void) idle = default_idle; idle(); } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/frv/kernel/process.c b/trunk/arch/frv/kernel/process.c index 29cc49783787..3901df1213c0 100644 --- a/trunk/arch/frv/kernel/process.c +++ b/trunk/arch/frv/kernel/process.c @@ -92,7 +92,9 @@ void cpu_idle(void) idle(); } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/h8300/kernel/process.c b/trunk/arch/h8300/kernel/process.c index 1a173b35f475..933bd388efb2 100644 --- a/trunk/arch/h8300/kernel/process.c +++ b/trunk/arch/h8300/kernel/process.c @@ -81,7 +81,9 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git 
a/trunk/arch/hexagon/kernel/smp.c b/trunk/arch/hexagon/kernel/smp.c index 0123c63e9a3a..c871a2cffaef 100644 --- a/trunk/arch/hexagon/kernel/smp.c +++ b/trunk/arch/hexagon/kernel/smp.c @@ -179,6 +179,8 @@ void __cpuinit start_secondary(void) printk(KERN_INFO "%s cpu %d\n", __func__, current_thread_info()->cpu); set_cpu_online(cpu, true); + while (!cpumask_test_cpu(cpu, cpu_active_mask)) + cpu_relax(); local_irq_enable(); cpu_idle(); diff --git a/trunk/arch/ia64/kernel/process.c b/trunk/arch/ia64/kernel/process.c index 9dc52b63fc87..6d33c5cc94f0 100644 --- a/trunk/arch/ia64/kernel/process.c +++ b/trunk/arch/ia64/kernel/process.c @@ -330,7 +330,9 @@ cpu_idle (void) normal_xtp(); #endif } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); check_pgt_cache(); if (cpu_is_offline(cpu)) play_dead(); diff --git a/trunk/arch/m32r/kernel/process.c b/trunk/arch/m32r/kernel/process.c index 3a4a32b27208..422bea9f1dbc 100644 --- a/trunk/arch/m32r/kernel/process.c +++ b/trunk/arch/m32r/kernel/process.c @@ -90,7 +90,9 @@ void cpu_idle (void) idle(); } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/m68k/kernel/process_mm.c b/trunk/arch/m68k/kernel/process_mm.c index fe4186b5fc32..099283ee1a8f 100644 --- a/trunk/arch/m68k/kernel/process_mm.c +++ b/trunk/arch/m68k/kernel/process_mm.c @@ -78,7 +78,9 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/m68k/kernel/process_no.c b/trunk/arch/m68k/kernel/process_no.c index f7fe6c348595..5e1078cabe0e 100644 --- a/trunk/arch/m68k/kernel/process_no.c +++ b/trunk/arch/m68k/kernel/process_no.c @@ -73,7 +73,9 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { idle(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/microblaze/kernel/process.c b/trunk/arch/microblaze/kernel/process.c index 9155f7d92669..7dcb5bfffb75 100644 --- a/trunk/arch/microblaze/kernel/process.c +++ b/trunk/arch/microblaze/kernel/process.c @@ -110,7 +110,9 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); check_pgt_cache(); } } diff --git a/trunk/arch/mips/kernel/process.c b/trunk/arch/mips/kernel/process.c index 61f1cb45a1d5..7955409051c4 100644 --- a/trunk/arch/mips/kernel/process.c +++ b/trunk/arch/mips/kernel/process.c @@ -80,7 +80,9 @@ void __noreturn cpu_idle(void) #endif rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/mn10300/kernel/process.c b/trunk/arch/mn10300/kernel/process.c index cac401d37f75..28eec3102535 100644 --- a/trunk/arch/mn10300/kernel/process.c +++ b/trunk/arch/mn10300/kernel/process.c @@ -123,7 +123,9 @@ void cpu_idle(void) idle(); } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/parisc/kernel/process.c b/trunk/arch/parisc/kernel/process.c index d4b94b395c16..62c60b87d039 100644 --- a/trunk/arch/parisc/kernel/process.c +++ b/trunk/arch/parisc/kernel/process.c @@ -71,7 +71,9 @@ void cpu_idle(void) while (1) { while (!need_resched()) barrier(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + 
schedule(); + preempt_disable(); check_pgt_cache(); } } diff --git a/trunk/arch/powerpc/kernel/idle.c b/trunk/arch/powerpc/kernel/idle.c index c97fc60c790c..0a48bf5db6c8 100644 --- a/trunk/arch/powerpc/kernel/idle.c +++ b/trunk/arch/powerpc/kernel/idle.c @@ -101,11 +101,11 @@ void cpu_idle(void) ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); - if (cpu_should_die()) { - sched_preempt_enable_no_resched(); + preempt_enable_no_resched(); + if (cpu_should_die()) cpu_die(); - } - schedule_preempt_disabled(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/powerpc/platforms/iseries/setup.c b/trunk/arch/powerpc/platforms/iseries/setup.c index a5fbf4cb6329..8fc62586a973 100644 --- a/trunk/arch/powerpc/platforms/iseries/setup.c +++ b/trunk/arch/powerpc/platforms/iseries/setup.c @@ -584,7 +584,9 @@ static void iseries_shared_idle(void) if (hvlpevent_is_pending()) process_iSeries_events(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } @@ -613,7 +615,9 @@ static void iseries_dedicated_idle(void) ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/s390/kernel/process.c b/trunk/arch/s390/kernel/process.c index 7618085b4164..e795933eb2cb 100644 --- a/trunk/arch/s390/kernel/process.c +++ b/trunk/arch/s390/kernel/process.c @@ -97,7 +97,9 @@ void cpu_idle(void) tick_nohz_idle_exit(); if (test_thread_flag(TIF_MCCK_PENDING)) s390_handle_mcck(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/s390/kernel/smp.c b/trunk/arch/s390/kernel/smp.c index b0e28c47ab83..2398ce6b15ae 100644 --- a/trunk/arch/s390/kernel/smp.c +++ b/trunk/arch/s390/kernel/smp.c @@ -550,6 +550,12 @@ int __cpuinit start_secondary(void *cpuvoid) S390_lowcore.restart_psw.addr = PSW_ADDR_AMODE | (unsigned long) psw_restart_int_handler; __ctl_set_bit(0, 28); /* Enable lowcore protection */ + /* + * Wait until the cpu which brought this one up marked it + * active before enabling interrupts. 
+ */ + while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) + cpu_relax(); local_irq_enable(); /* cpu_idle will call schedule for us */ cpu_idle(); diff --git a/trunk/arch/score/kernel/process.c b/trunk/arch/score/kernel/process.c index 2707023c7563..25d08030a883 100644 --- a/trunk/arch/score/kernel/process.c +++ b/trunk/arch/score/kernel/process.c @@ -53,7 +53,9 @@ void __noreturn cpu_idle(void) while (!need_resched()) barrier(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/sh/kernel/idle.c b/trunk/arch/sh/kernel/idle.c index 7e4892826563..406508d4ce74 100644 --- a/trunk/arch/sh/kernel/idle.c +++ b/trunk/arch/sh/kernel/idle.c @@ -114,7 +114,9 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/sparc/kernel/process_32.c b/trunk/arch/sparc/kernel/process_32.c index 935fdbcd88c2..f793742eec2b 100644 --- a/trunk/arch/sparc/kernel/process_32.c +++ b/trunk/arch/sparc/kernel/process_32.c @@ -113,7 +113,9 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); check_pgt_cache(); } } @@ -136,7 +138,9 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); check_pgt_cache(); } } diff --git a/trunk/arch/sparc/kernel/process_64.c b/trunk/arch/sparc/kernel/process_64.c index 06b5b5fc20c7..39d8b05201a2 100644 --- a/trunk/arch/sparc/kernel/process_64.c +++ b/trunk/arch/sparc/kernel/process_64.c @@ -104,13 +104,15 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); + preempt_enable_no_resched(); + #ifdef CONFIG_HOTPLUG_CPU - if (cpu_is_offline(cpu)) { - sched_preempt_enable_no_resched(); + if (cpu_is_offline(cpu)) cpu_play_dead(); - } #endif - schedule_preempt_disabled(); + + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/tile/kernel/process.c b/trunk/arch/tile/kernel/process.c index 6ae495ef2b99..4c1ac6e5347a 100644 --- a/trunk/arch/tile/kernel/process.c +++ b/trunk/arch/tile/kernel/process.c @@ -108,7 +108,9 @@ void cpu_idle(void) } rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/x86/include/asm/timer.h b/trunk/arch/x86/include/asm/timer.h index 34baa0eb5d0c..431793e5d484 100644 --- a/trunk/arch/x86/include/asm/timer.h +++ b/trunk/arch/x86/include/asm/timer.h @@ -57,10 +57,14 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); static inline unsigned long long __cycles_2_ns(unsigned long long cyc) { + unsigned long long quot; + unsigned long long rem; int cpu = smp_processor_id(); unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), - (1UL << CYC2NS_SCALE_FACTOR)); + quot = (cyc >> CYC2NS_SCALE_FACTOR); + rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1); + ns += quot * per_cpu(cyc2ns, cpu) + + ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR); return ns; } diff --git a/trunk/arch/x86/kernel/process_32.c b/trunk/arch/x86/kernel/process_32.c index 49888fefe794..c08d1ff12b7c 100644 --- a/trunk/arch/x86/kernel/process_32.c +++ b/trunk/arch/x86/kernel/process_32.c @@ -119,7 +119,9 @@ void cpu_idle(void) } rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + 
preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/x86/kernel/process_64.c b/trunk/arch/x86/kernel/process_64.c index e34257c70c28..cfa5c90c01db 100644 --- a/trunk/arch/x86/kernel/process_64.c +++ b/trunk/arch/x86/kernel/process_64.c @@ -156,7 +156,9 @@ void cpu_idle(void) } tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/x86/kernel/smpboot.c b/trunk/arch/x86/kernel/smpboot.c index 89571a0c4a49..66d250c00d11 100644 --- a/trunk/arch/x86/kernel/smpboot.c +++ b/trunk/arch/x86/kernel/smpboot.c @@ -219,9 +219,14 @@ static void __cpuinit smp_callin(void) * Update loops_per_jiffy in cpu_data. Previous call to * smp_store_cpu_info() stored a value that is close but not as * accurate as the value just calculated. + * + * Need to enable IRQs because it can take longer and then + * the NMI watchdog might kill us. */ + local_irq_enable(); calibrate_delay(); cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; + local_irq_disable(); pr_debug("Stack at about %p\n", &cpuid); /* @@ -286,6 +291,19 @@ notrace static void __cpuinit start_secondary(void *unused) per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; x86_platform.nmi_init(); + /* + * Wait until the cpu which brought this one up marked it + * online before enabling interrupts. If we don't do that then + * we can end up waking up the softirq thread before this cpu + * reached the active state, which makes the scheduler unhappy + * and schedule the softirq thread on the wrong cpu. This is + * only observable with forced threaded interrupts, but in + * theory it could also happen w/o them. It's just way harder + * to achieve. + */ + while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) + cpu_relax(); + /* enable local interrupts */ local_irq_enable(); diff --git a/trunk/arch/x86/kernel/tsc.c b/trunk/arch/x86/kernel/tsc.c index 183c5925a9fe..a62c201c97ec 100644 --- a/trunk/arch/x86/kernel/tsc.c +++ b/trunk/arch/x86/kernel/tsc.c @@ -620,8 +620,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) if (cpu_khz) { *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; - *offset = ns_now - mult_frac(tsc_now, *scale, - (1UL << CYC2NS_SCALE_FACTOR)); + *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); } sched_clock_idle_wakeup_event(0); diff --git a/trunk/arch/xtensa/kernel/process.c b/trunk/arch/xtensa/kernel/process.c index 2c9004770c4e..47041e7c088c 100644 --- a/trunk/arch/xtensa/kernel/process.c +++ b/trunk/arch/xtensa/kernel/process.c @@ -113,7 +113,9 @@ void cpu_idle(void) while (1) { while (!need_resched()) platform_idle(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/block/blk-softirq.c b/trunk/block/blk-softirq.c index 467c8de88642..1366a89d8e66 100644 --- a/trunk/block/blk-softirq.c +++ b/trunk/block/blk-softirq.c @@ -8,7 +8,6 @@ #include #include #include -#include #include "blk.h" @@ -104,10 +103,9 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = { void __blk_complete_request(struct request *req) { - int ccpu, cpu; + int ccpu, cpu, group_cpu = NR_CPUS; struct request_queue *q = req->q; unsigned long flags; - bool shared = false; BUG_ON(!q->softirq_done_fn); @@ -119,20 +117,22 @@ void __blk_complete_request(struct request *req) */ if (req->cpu != -1) { ccpu = req->cpu; - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) - shared = cpus_share_cache(cpu, ccpu); + if 
(!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { + ccpu = blk_cpu_to_group(ccpu); + group_cpu = blk_cpu_to_group(cpu); + } } else ccpu = cpu; /* - * If current CPU and requested CPU share a cache, run the softirq on - * the current CPU. One might concern this is just like + * If current CPU and requested CPU are in the same group, running + * softirq in current CPU. One might concern this is just like * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is * running in interrupt handler, and currently I/O controller doesn't * support multiple interrupts, so current CPU is unique actually. This * avoids IPI sending from current CPU to the first CPU of a group. */ - if (ccpu == cpu || shared) { + if (ccpu == cpu || ccpu == group_cpu) { struct list_head *list; do_local: list = &__get_cpu_var(blk_cpu_done); diff --git a/trunk/block/blk.h b/trunk/block/blk.h index d45be871329e..9c12f80882b0 100644 --- a/trunk/block/blk.h +++ b/trunk/block/blk.h @@ -166,6 +166,22 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } +static inline int blk_cpu_to_group(int cpu) +{ + int group = NR_CPUS; +#ifdef CONFIG_SCHED_MC + const struct cpumask *mask = cpu_coregroup_mask(cpu); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_SMT) + group = cpumask_first(topology_thread_cpumask(cpu)); +#else + return cpu; +#endif + if (likely(group < NR_CPUS)) + return group; + return cpu; +} + /* * Contribute to IO statistics IFF: * diff --git a/trunk/fs/proc/base.c b/trunk/fs/proc/base.c index 965d4bde3a3b..d4548dd49b02 100644 --- a/trunk/fs/proc/base.c +++ b/trunk/fs/proc/base.c @@ -1310,7 +1310,8 @@ sched_autogroup_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - err = proc_sched_autogroup_set_nice(p, nice); + err = nice; + err = proc_sched_autogroup_set_nice(p, &err); if (err) count = err; diff --git a/trunk/include/linux/cpuset.h b/trunk/include/linux/cpuset.h index e0ffaf061ab7..e9eaec522655 100644 --- a/trunk/include/linux/cpuset.h +++ b/trunk/include/linux/cpuset.h @@ -22,7 +22,7 @@ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_update_active_cpus(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); -extern void cpuset_cpus_allowed_fallback(struct task_struct *p); +extern int cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -144,8 +144,10 @@ static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_copy(mask, cpu_possible_mask); } -static inline void cpuset_cpus_allowed_fallback(struct task_struct *p) +static inline int cpuset_cpus_allowed_fallback(struct task_struct *p) { + do_set_cpus_allowed(p, cpu_possible_mask); + return cpumask_any(cpu_active_mask); } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) diff --git a/trunk/include/linux/init_task.h b/trunk/include/linux/init_task.h index f994d51f70f2..9c66b1ada9d7 100644 --- a/trunk/include/linux/init_task.h +++ b/trunk/include/linux/init_task.h @@ -149,7 +149,7 @@ extern struct cred init_cred; }, \ .rt = { \ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ - .time_slice = RR_TIMESLICE, \ + .time_slice = HZ, \ .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ diff --git a/trunk/include/linux/kernel.h b/trunk/include/linux/kernel.h index d801acb5e680..e8343422240a 100644 
--- a/trunk/include/linux/kernel.h +++ b/trunk/include/linux/kernel.h @@ -85,19 +85,6 @@ } \ ) -/* - * Multiplies an integer by a fraction, while avoiding unnecessary - * overflow or loss of precision. - */ -#define mult_frac(x, numer, denom)( \ -{ \ - typeof(x) quot = (x) / (denom); \ - typeof(x) rem = (x) % (denom); \ - (quot * (numer)) + ((rem * (numer)) / (denom)); \ -} \ -) - - #define _RET_IP_ (unsigned long)__builtin_return_address(0) #define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) diff --git a/trunk/include/linux/preempt.h b/trunk/include/linux/preempt.h index 5a710b9c578e..58969b2a8a82 100644 --- a/trunk/include/linux/preempt.h +++ b/trunk/include/linux/preempt.h @@ -48,14 +48,12 @@ do { \ barrier(); \ } while (0) -#define sched_preempt_enable_no_resched() \ +#define preempt_enable_no_resched() \ do { \ barrier(); \ dec_preempt_count(); \ } while (0) -#define preempt_enable_no_resched() sched_preempt_enable_no_resched() - #define preempt_enable() \ do { \ preempt_enable_no_resched(); \ @@ -94,7 +92,6 @@ do { \ #else /* !CONFIG_PREEMPT_COUNT */ #define preempt_disable() do { } while (0) -#define sched_preempt_enable_no_resched() do { } while (0) #define preempt_enable_no_resched() do { } while (0) #define preempt_enable() do { } while (0) diff --git a/trunk/include/linux/printk.h b/trunk/include/linux/printk.h index 1f77a4174ee0..f0e22f75143f 100644 --- a/trunk/include/linux/printk.h +++ b/trunk/include/linux/printk.h @@ -100,11 +100,6 @@ int vprintk(const char *fmt, va_list args); asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); -/* - * Special printk facility for scheduler use only, _DO_NOT_USE_ ! - */ -__printf(1, 2) __cold int printk_sched(const char *fmt, ...); - /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use @@ -132,11 +127,6 @@ int printk(const char *s, ...) { return 0; } -static inline __printf(1, 2) __cold -int printk_sched(const char *s, ...) -{ - return 0; -} static inline int printk_ratelimit(void) { return 0; diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index 0cd002cc9e6a..0657368bd78f 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -361,7 +361,6 @@ extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); -extern void schedule_preempt_disabled(void); extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; @@ -906,7 +905,6 @@ struct sched_group_power { * single CPU. */ unsigned int power, power_orig; - unsigned long next_update; /* * Number of busy cpus in this group. 
*/ @@ -1054,8 +1052,6 @@ static inline int test_sd_parent(struct sched_domain *sd, int flag) unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); -bool cpus_share_cache(int this_cpu, int that_cpu); - #else /* CONFIG_SMP */ struct sched_domain_attr; @@ -1065,12 +1061,6 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { } - -static inline bool cpus_share_cache(int this_cpu, int that_cpu) -{ - return true; -} - #endif /* !CONFIG_SMP */ @@ -1235,12 +1225,6 @@ struct sched_rt_entity { #endif }; -/* - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. - */ -#define RR_TIMESLICE (100 * HZ / 1000) - struct rcu_node; enum perf_event_task_context { @@ -2064,7 +2048,7 @@ extern void sched_autogroup_fork(struct signal_struct *sig); extern void sched_autogroup_exit(struct signal_struct *sig); #ifdef CONFIG_PROC_FS extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); -extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); +extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice); #endif #else static inline void sched_autogroup_create_attach(struct task_struct *p) { } @@ -2081,20 +2065,12 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice; extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); extern void rt_mutex_adjust_pi(struct task_struct *p); -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) -{ - return tsk->pi_blocked_on != NULL; -} #else static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } # define rt_mutex_adjust_pi(p) do { } while (0) -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) -{ - return false; -} #endif extern bool yield_to(struct task_struct *p, bool preempt); @@ -2413,15 +2389,12 @@ static inline void task_unlock(struct task_struct *p) extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, unsigned long *flags); -static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, - unsigned long *flags) -{ - struct sighand_struct *ret; - - ret = __lock_task_sighand(tsk, flags); - (void)__cond_lock(&tsk->sighand->siglock, ret); - return ret; -} +#define lock_task_sighand(tsk, flags) \ +({ struct sighand_struct *__ss; \ + __cond_lock(&(tsk)->sighand->siglock, \ + (__ss = __lock_task_sighand(tsk, flags))); \ + __ss; \ +}) \ static inline void unlock_task_sighand(struct task_struct *tsk, unsigned long *flags) diff --git a/trunk/include/linux/wait.h b/trunk/include/linux/wait.h index 7d9a9e990ce6..a9ce45e8501c 100644 --- a/trunk/include/linux/wait.h +++ b/trunk/include/linux/wait.h @@ -157,7 +157,7 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_bit(wait_queue_head_t *, void *, int); int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); @@ -170,8 +170,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define 
wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) -#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) -#define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) +#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) diff --git a/trunk/init/main.c b/trunk/init/main.c index 4990f7ec776a..ff49a6dacfbb 100644 --- a/trunk/init/main.c +++ b/trunk/init/main.c @@ -374,8 +374,11 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + /* Call into cpu_idle with preempt disabled */ + preempt_disable(); cpu_idle(); } diff --git a/trunk/kernel/cpuset.c b/trunk/kernel/cpuset.c index 4ef4d7ecb9fb..a09ac2b9a661 100644 --- a/trunk/kernel/cpuset.c +++ b/trunk/kernel/cpuset.c @@ -2195,9 +2195,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) mutex_unlock(&callback_mutex); } -void cpuset_cpus_allowed_fallback(struct task_struct *tsk) +int cpuset_cpus_allowed_fallback(struct task_struct *tsk) { const struct cpuset *cs; + int cpu; rcu_read_lock(); cs = task_cs(tsk); @@ -2218,10 +2219,22 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary * set any mask even if it is not right from task_cs() pov, * the pending set_cpus_allowed_ptr() will fix things. - * - * select_fallback_rq() will fix things ups and set cpu_possible_mask - * if required. */ + + cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask); + if (cpu >= nr_cpu_ids) { + /* + * Either tsk->cpus_allowed is wrong (see above) or it + * is actually empty. The latter case is only possible + * if we are racing with remove_tasks_in_empty_cpuset(). + * Like above we can temporary set any mask and rely on + * set_cpus_allowed_ptr() as synchronization point. 
+ */ + do_set_cpus_allowed(tsk, cpu_possible_mask); + cpu = cpumask_any(cpu_active_mask); + } + + return cpu; } void cpuset_init_current_mems_allowed(void) diff --git a/trunk/kernel/mutex.c b/trunk/kernel/mutex.c index a307cc9c9526..89096dd8786f 100644 --- a/trunk/kernel/mutex.c +++ b/trunk/kernel/mutex.c @@ -240,7 +240,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); spin_lock_mutex(&lock->wait_lock, flags); } diff --git a/trunk/kernel/printk.c b/trunk/kernel/printk.c index b64ce71cb2e5..32690a0b7a18 100644 --- a/trunk/kernel/printk.c +++ b/trunk/kernel/printk.c @@ -1211,27 +1211,13 @@ int is_console_locked(void) return console_locked; } -/* - * Delayed printk facility, for scheduler-internal messages: - */ -#define PRINTK_BUF_SIZE 512 - -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 - static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); void printk_tick(void) { if (__this_cpu_read(printk_pending)) { - int pending = __this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); - } - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); + __this_cpu_write(printk_pending, 0); + wake_up_interruptible(&log_wait); } } @@ -1245,7 +1231,7 @@ int printk_needs_cpu(int cpu) void wake_up_klogd(void) { if (waitqueue_active(&log_wait)) - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + this_cpu_write(printk_pending, 1); } /** @@ -1638,26 +1624,6 @@ late_initcall(printk_late_init); #if defined CONFIG_PRINTK -int printk_sched(const char *fmt, ...) -{ - unsigned long flags; - va_list args; - char *buf; - int r; - - local_irq_save(flags); - buf = __get_cpu_var(printk_sched_buf); - - va_start(args, fmt); - r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); - va_end(args); - - __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); - local_irq_restore(flags); - - return r; -} - /* * printk rate limiting, lifted from the networking subsystem. * diff --git a/trunk/kernel/sched/auto_group.c b/trunk/kernel/sched/auto_group.c index 0984a21076a3..e8a1f83ee0e7 100644 --- a/trunk/kernel/sched/auto_group.c +++ b/trunk/kernel/sched/auto_group.c @@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup); #ifdef CONFIG_PROC_FS -int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) +int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) { static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; int err; - if (nice < -20 || nice > 19) + if (*nice < -20 || *nice > 19) return -EINVAL; - err = security_task_setnice(current, nice); + err = security_task_setnice(current, *nice); if (err) return err; - if (nice < 0 && !can_nice(current, nice)) + if (*nice < 0 && !can_nice(current, *nice)) return -EPERM; /* this is a heavy operation taking global locks.. 
*/ @@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) ag = autogroup_task_get(p); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); + err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); if (!err) - ag->nice = nice; + ag->nice = *nice; up_write(&ag->lock); autogroup_kref_put(ag); diff --git a/trunk/kernel/sched/core.c b/trunk/kernel/sched/core.c index 929fd857ef88..423f40f32a59 100644 --- a/trunk/kernel/sched/core.c +++ b/trunk/kernel/sched/core.c @@ -1263,59 +1263,29 @@ EXPORT_SYMBOL_GPL(kick_process); */ static int select_fallback_rq(int cpu, struct task_struct *p) { - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); - enum { cpuset, possible, fail } state = cpuset; int dest_cpu; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); /* Look for allowed, online CPU in same node. */ - for_each_cpu_mask(dest_cpu, *nodemask) { - if (!cpu_online(dest_cpu)) - continue; - if (!cpu_active(dest_cpu)) - continue; + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; - } - - for (;;) { - /* Any allowed, online CPU? */ - for_each_cpu_mask(dest_cpu, *tsk_cpus_allowed(p)) { - if (!cpu_online(dest_cpu)) - continue; - if (!cpu_active(dest_cpu)) - continue; - goto out; - } - switch (state) { - case cpuset: - /* No more Mr. Nice Guy. */ - cpuset_cpus_allowed_fallback(p); - state = possible; - break; + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); + if (dest_cpu < nr_cpu_ids) + return dest_cpu; - case possible: - do_set_cpus_allowed(p, cpu_possible_mask); - state = fail; - break; - - case fail: - BUG(); - break; - } - } - -out: - if (state != cpuset) { - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk_sched("process %d (%s) no longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); - } + /* No more Mr. Nice Guy. */ + dest_cpu = cpuset_cpus_allowed_fallback(p); + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. 
+ */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); } return dest_cpu; @@ -1537,7 +1507,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) } #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -bool cpus_share_cache(int this_cpu, int that_cpu) +static inline int ttwu_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } @@ -1548,7 +1518,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) struct rq *rq = cpu_rq(cpu); #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { + if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; @@ -1962,6 +1932,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); + finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); if (mm) @@ -2296,10 +2267,13 @@ calc_load_n(unsigned long load, unsigned long exp, * Once we've updated the global active value, we need to apply the exponential * weights adjusted to the number of cycles missed. */ -static void calc_global_nohz(void) +static void calc_global_nohz(unsigned long ticks) { long delta, active, n; + if (time_before(jiffies, calc_load_update)) + return; + /* * If we crossed a calc_load_update boundary, make sure to fold * any pending idle changes, the respective CPUs might have @@ -2311,25 +2285,31 @@ static void calc_global_nohz(void) atomic_long_add(delta, &calc_load_tasks); /* - * It could be the one fold was all it took, we done! + * If we were idle for multiple load cycles, apply them. */ - if (time_before(jiffies, calc_load_update + 10)) - return; + if (ticks >= LOAD_FREQ) { + n = ticks / LOAD_FREQ; - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + calc_load_update += n * LOAD_FREQ; + } - calc_load_update += n * LOAD_FREQ; + /* + * Its possible the remainder of the above division also crosses + * a LOAD_FREQ period, the regular check in calc_global_load() + * which comes after this will take care of that. + * + * Consider us being 11 ticks before a cycle completion, and us + * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will + * age us 4 cycles, and the test in calc_global_load() will + * pick up the final one. 
+ */ } #else void calc_load_account_idle(struct rq *this_rq) @@ -2341,7 +2321,7 @@ static inline long calc_load_fold_idle(void) return 0; } -static void calc_global_nohz(void) +static void calc_global_nohz(unsigned long ticks) { } #endif @@ -2369,6 +2349,8 @@ void calc_global_load(unsigned long ticks) { long active; + calc_global_nohz(ticks); + if (time_before(jiffies, calc_load_update + 10)) return; @@ -2380,16 +2362,6 @@ void calc_global_load(unsigned long ticks) avenrun[2] = calc_load(avenrun[2], EXP_15, active); calc_load_update += LOAD_FREQ; - - /* - * Account one period with whatever state we found before - * folding in the nohz state and ageing the entire idle period. - * - * This avoids loosing a sample when we go idle between - * calc_load_account_active() (10 ticks ago) and now and thus - * under-accounting. - */ - calc_global_nohz(); } /* @@ -3099,6 +3071,8 @@ EXPORT_SYMBOL(sub_preempt_count); */ static noinline void __schedule_bug(struct task_struct *prev) { + struct pt_regs *regs = get_irq_regs(); + if (oops_in_progress) return; @@ -3109,7 +3083,11 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); - dump_stack(); + + if (regs) + show_regs(regs); + else + dump_stack(); } /* @@ -3243,14 +3221,14 @@ static void __sched __schedule(void) post_schedule(rq); - sched_preempt_enable_no_resched(); + preempt_enable_no_resched(); if (need_resched()) goto need_resched; } static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state || tsk_is_pi_blocked(tsk)) + if (!tsk->state) return; /* * If we are going to sleep and we have plugged IO queued, @@ -3269,18 +3247,6 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); -/** - * schedule_preempt_disabled - called with preemption disabled - * - * Returns with preemption disabled. Note: preempt_count must be 1 - */ -void __sched schedule_preempt_disabled(void) -{ - sched_preempt_enable_no_resched(); - schedule(); - preempt_disable(); -} - #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline bool owner_running(struct mutex *lock, struct task_struct *owner) @@ -3441,9 +3407,9 @@ EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) { - __wake_up_common(q, mode, nr, 0, NULL); + __wake_up_common(q, mode, 1, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); @@ -3802,24 +3768,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) rq = __task_rq_lock(p); - /* - * Idle task boosting is a nono in general. There is one - * exception, when PREEMPT_RT and NOHZ is active: - * - * The idle task calls get_next_timer_interrupt() and holds - * the timer wheel base->lock on the CPU and another CPU wants - * to access the timer (probably to cancel it). We can safely - * ignore the boosting request, as the idle CPU runs this code - * with interrupts disabled and will complete the lock - * protected section without being interrupted. So there is no - * real need to boost. - */ - if (unlikely(p == rq->idle)) { - WARN_ON(p != rq->curr); - WARN_ON(p->pi_blocked_on); - goto out_unlock; - } - trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; @@ -3843,10 +3791,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(rq, p, oldprio < prio ? 
ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio); -out_unlock: __task_rq_unlock(rq); } + #endif + void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; @@ -4526,7 +4475,7 @@ SYSCALL_DEFINE0(sched_yield) __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); do_raw_spin_unlock(&rq->lock); - sched_preempt_enable_no_resched(); + preempt_enable_no_resched(); schedule(); @@ -4600,24 +4549,8 @@ EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. * - * Do not ever use this function, there's a 99% chance you're doing it wrong. - * - * The scheduler is at all times free to pick the calling task as the most - * eligible task to run, if removing the yield() call from your code breaks - * it, its already broken. - * - * Typical broken usage is: - * - * while (!event) - * yield(); - * - * where one assumes that yield() will let 'the other' process run that will - * make event true. If the current task is a SCHED_FIFO task that will never - * happen. Never use yield() as a progress guarantee!! - * - * If you want to use yield() to wait for something, use wait_event(). - * If you want to use yield() to be 'nice' for others, use cond_resched(). - * If you still want to use yield(), do not! + * This is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). */ void __sched yield(void) { @@ -5449,7 +5382,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: + case CPU_ONLINE: case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -5821,7 +5754,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if - * two cpus are in the same cache domain, see cpus_share_cache(). + * two cpus are in the same cache domain, see ttwu_share_cache(). 
*/ DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_id); @@ -6998,9 +6931,6 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - - INIT_LIST_HEAD(&rq->cfs_tasks); - rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ rq->nohz_flags = 0; diff --git a/trunk/kernel/sched/debug.c b/trunk/kernel/sched/debug.c index 09acaa15161d..2a075e10004b 100644 --- a/trunk/kernel/sched/debug.c +++ b/trunk/kernel/sched/debug.c @@ -288,6 +288,7 @@ static void print_cpu(struct seq_file *m, int cpu) P(yld_count); + P(sched_switch); P(sched_count); P(sched_goidle); #ifdef CONFIG_SMP diff --git a/trunk/kernel/sched/fair.c b/trunk/kernel/sched/fair.c index 258f430d71a5..aca16b843b7e 100644 --- a/trunk/kernel/sched/fair.c +++ b/trunk/kernel/sched/fair.c @@ -416,8 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #endif /* CONFIG_FAIR_GROUP_SCHED */ -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -776,16 +776,29 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ + cfs_rq->task_weight += weight; +} +#else +static inline void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +} +#endif + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); -#ifdef CONFIG_SMP - if (entity_is_task(se)) - list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); -#endif + if (entity_is_task(se)) { + add_cfs_task_weight(cfs_rq, se->load.weight); + list_add(&se->group_node, &cfs_rq->tasks); + } cfs_rq->nr_running++; } @@ -795,8 +808,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { + add_cfs_task_weight(cfs_rq, -se->load.weight); list_del_init(&se->group_node); + } cfs_rq->nr_running--; } @@ -1162,7 +1177,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) __clear_buddies_skip(se); } -static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -1546,8 +1561,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, resched_task(rq_of(cfs_rq)->curr); } -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) +static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) { if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; @@ -2073,11 +2088,11 @@ void unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} +static void account_cfs_rq_runtime(struct cfs_rq 
*cfs_rq, + unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} -static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { @@ -2657,6 +2672,8 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * Otherwise, iterate the domains and find an elegible idle cpu. */ + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, target)); for_each_lower_domain(sd) { sg = sd->groups; @@ -2678,6 +2695,8 @@ static int select_idle_sibling(struct task_struct *p, int target) } while (sg != sd->groups); } done: + rcu_read_unlock(); + return target; } @@ -2903,7 +2922,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* - * This is possible from callers such as move_task(), in which we + * This is possible from callers such as pull_task(), in which we * unconditionally check_prempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. @@ -3067,39 +3086,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * Fair scheduling class load-balancing methods: */ -static unsigned long __read_mostly max_load_balance_interval = HZ/10; - -#define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 - -struct lb_env { - struct sched_domain *sd; - - int src_cpu; - struct rq *src_rq; - - int dst_cpu; - struct rq *dst_rq; - - enum cpu_idle_type idle; - long load_move; - unsigned int flags; - - unsigned int loop; - unsigned int loop_break; - unsigned int loop_max; -}; - /* - * move_task - move a task from one runqueue to another runqueue. + * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static void move_task(struct task_struct *p, struct lb_env *env) +static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) { - deactivate_task(env->src_rq, p, 0); - set_task_cpu(p, env->dst_cpu); - activate_task(env->dst_rq, p, 0); - check_preempt_curr(env->dst_rq, p, 0); + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + check_preempt_curr(this_rq, p, 0); } /* @@ -3134,11 +3131,19 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } +#define LBF_ALL_PINNED 0x01 +#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ +#define LBF_HAD_BREAK 0x04 +#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ +#define LBF_ABORT 0x10 + /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static -int can_migrate_task(struct task_struct *p, struct lb_env *env) +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, + struct sched_domain *sd, enum cpu_idle_type idle, + int *lb_flags) { int tsk_cache_hot = 0; /* @@ -3147,13 +3152,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. 
*/ - if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } - env->flags &= ~LBF_ALL_PINNED; + *lb_flags &= ~LBF_ALL_PINNED; - if (task_running(env->src_rq, p)) { + if (task_running(rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } @@ -3164,12 +3169,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); + tsk_cache_hot = task_hot(p, rq->clock_task, sd); if (!tsk_cache_hot || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { + sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS if (tsk_cache_hot) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(sd, lb_hot_gained[idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } #endif @@ -3190,80 +3195,65 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * * Called with both runqueues locked. */ -static int move_one_task(struct lb_env *env) +static int +move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) { struct task_struct *p, *n; + struct cfs_rq *cfs_rq; + int pinned = 0; - list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) - continue; + for_each_leaf_cfs_rq(busiest, cfs_rq) { + list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), + busiest->cpu, this_cpu)) + break; - if (!can_migrate_task(p, env)) - continue; + if (!can_migrate_task(p, busiest, this_cpu, + sd, idle, &pinned)) + continue; - move_task(p, env); - /* - * Right now, this is only the second place move_task() - * is called, so we can safely collect move_task() - * stats here rather than inside move_task(). - */ - schedstat_inc(env->sd, lb_gained[env->idle]); - return 1; + pull_task(busiest, p, this_rq, this_cpu); + /* + * Right now, this is only the second place pull_task() + * is called, so we can safely collect pull_task() + * stats here rather than inside pull_task(). + */ + schedstat_inc(sd, lb_gained[idle]); + return 1; + } } + return 0; } -static unsigned long task_h_load(struct task_struct *p); - -/* - * move_tasks tries to move up to load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. 
- */ -static int move_tasks(struct lb_env *env) +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *lb_flags, + struct cfs_rq *busiest_cfs_rq) { - struct list_head *tasks = &env->src_rq->cfs_tasks; - struct task_struct *p; - unsigned long load; - int pulled = 0; - - if (env->load_move <= 0) - return 0; + int loops = 0, pulled = 0; + long rem_load_move = max_load_move; + struct task_struct *p, *n; - while (!list_empty(tasks)) { - p = list_first_entry(tasks, struct task_struct, se.group_node); + if (max_load_move == 0) + goto out; - env->loop++; - /* We've more or less seen every task there is, call it quits */ - if (env->loop > env->loop_max) - break; - - /* take a breather every nr_migrate tasks */ - if (env->loop > env->loop_break) { - env->loop_break += sysctl_sched_nr_migrate; - env->flags |= LBF_NEED_BREAK; + list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { + if (loops++ > sysctl_sched_nr_migrate) { + *lb_flags |= LBF_NEED_BREAK; break; } - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) - goto next; - - load = task_h_load(p); - - if (load < 16 && !env->sd->nr_balance_failed) - goto next; - - if ((load / 2) > env->load_move) - goto next; - - if (!can_migrate_task(p, env)) - goto next; + if ((p->se.load.weight >> 1) > rem_load_move || + !can_migrate_task(p, busiest, this_cpu, sd, idle, + lb_flags)) + continue; - move_task(p, env); + pull_task(busiest, p, this_rq, this_cpu); pulled++; - env->load_move -= load; + rem_load_move -= p->se.load.weight; #ifdef CONFIG_PREEMPT /* @@ -3271,30 +3261,28 @@ static int move_tasks(struct lb_env *env) * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (env->idle == CPU_NEWLY_IDLE) + if (idle == CPU_NEWLY_IDLE) { + *lb_flags |= LBF_ABORT; break; + } #endif /* * We only want to steal up to the prescribed amount of * weighted load. */ - if (env->load_move <= 0) + if (rem_load_move <= 0) break; - - continue; -next: - list_move_tail(&p->se.group_node, tasks); } - +out: /* - * Right now, this is one of only two places move_task() is called, - * so we can safely collect move_task() stats here rather than - * inside move_task(). + * Right now, this is one of only two places pull_task() is called, + * so we can safely collect pull_task() stats here rather than + * inside pull_task(). 
*/ - schedstat_add(env->sd, lb_gained[env->idle], pulled); + schedstat_add(sd, lb_gained[idle], pulled); - return pulled; + return max_load_move - rem_load_move; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3374,35 +3362,113 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_h_load(long cpu) { - rcu_read_lock(); walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); - rcu_read_unlock(); } -static unsigned long task_h_load(struct task_struct *p) +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *lb_flags) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); - unsigned long load; + long rem_load_move = max_load_move; + struct cfs_rq *busiest_cfs_rq; - load = p->se.load.weight; - load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); + rcu_read_lock(); + update_h_load(cpu_of(busiest)); - return load; + for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { + unsigned long busiest_h_load = busiest_cfs_rq->h_load; + unsigned long busiest_weight = busiest_cfs_rq->load.weight; + u64 rem_load, moved_load; + + if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + break; + + /* + * empty group or part of a throttled hierarchy + */ + if (!busiest_cfs_rq->task_weight || + throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) + continue; + + rem_load = (u64)rem_load_move * busiest_weight; + rem_load = div_u64(rem_load, busiest_h_load + 1); + + moved_load = balance_tasks(this_rq, this_cpu, busiest, + rem_load, sd, idle, lb_flags, + busiest_cfs_rq); + + if (!moved_load) + continue; + + moved_load *= busiest_h_load; + moved_load = div_u64(moved_load, busiest_weight + 1); + + rem_load_move -= moved_load; + if (rem_load_move < 0) + break; + } + rcu_read_unlock(); + + return max_load_move - rem_load_move; } #else static inline void update_shares(int cpu) { } -static inline void update_h_load(long cpu) +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *lb_flags) { + return balance_tasks(this_rq, this_cpu, busiest, + max_load_move, sd, idle, lb_flags, + &busiest->cfs); } +#endif -static unsigned long task_h_load(struct task_struct *p) +/* + * move_tasks tries to move up to max_load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *lb_flags) { - return p->se.load.weight; -} + unsigned long total_load_moved = 0, load_moved; + + do { + load_moved = load_balance_fair(this_rq, this_cpu, busiest, + max_load_move - total_load_moved, + sd, idle, lb_flags); + + total_load_moved += load_moved; + + if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + break; + +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible + * kernels will stop after the first task is pulled to minimize + * the critical section. 
+ */ + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { + *lb_flags |= LBF_ABORT; + break; + } #endif + } while (load_moved && max_load_move > total_load_moved); + + return total_load_moved > 0; +} /********** Helpers for find_busiest_group ************************/ /* @@ -3712,11 +3778,6 @@ void update_group_power(struct sched_domain *sd, int cpu) struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; unsigned long power; - unsigned long interval; - - interval = msecs_to_jiffies(sd->balance_interval); - interval = clamp(interval, 1UL, max_load_balance_interval); - sdg->sgp->next_update = jiffies + interval; if (!child) { update_cpu_power(sd, cpu); @@ -3824,15 +3885,12 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * domains. In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ - if (local_group) { - if (idle != CPU_NEWLY_IDLE) { - if (balance_cpu != this_cpu) { - *balance = 0; - return; - } - update_group_power(sd, this_cpu); - } else if (time_after_eq(jiffies, group->sgp->next_update)) - update_group_power(sd, this_cpu); + if (idle != CPU_NEWLY_IDLE && local_group) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); } /* Adjust by relative CPU power of the group */ @@ -4395,21 +4453,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, active_balance = 0; + int ld_moved, lb_flags = 0, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - struct lb_env env = { - .sd = sd, - .dst_cpu = this_cpu, - .dst_rq = this_rq, - .idle = idle, - .loop_break = sysctl_sched_nr_migrate, - }; - cpumask_copy(cpus, cpu_active_mask); schedstat_inc(sd, lb_count[idle]); @@ -4444,34 +4494,32 @@ static int load_balance(int this_cpu, struct rq *this_rq, * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - env.flags |= LBF_ALL_PINNED; - env.load_move = imbalance; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; - env.loop_max = busiest->nr_running; - -more_balance: + lb_flags |= LBF_ALL_PINNED; local_irq_save(flags); double_rq_lock(this_rq, busiest); - if (!env.loop) - update_h_load(env.src_cpu); - ld_moved += move_tasks(&env); + ld_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, idle, &lb_flags); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); - if (env.flags & LBF_NEED_BREAK) { - env.flags &= ~LBF_NEED_BREAK; - goto more_balance; - } - /* * some other cpu did the load balance for us. 
*/ if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); + if (lb_flags & LBF_ABORT) + goto out_balanced; + + if (lb_flags & LBF_NEED_BREAK) { + lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; + if (lb_flags & LBF_ABORT) + goto out_balanced; + goto redo; + } + /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(env.flags & LBF_ALL_PINNED)) { + if (unlikely(lb_flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); if (!cpumask_empty(cpus)) goto redo; @@ -4501,7 +4549,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, tsk_cpus_allowed(busiest->curr))) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - env.flags |= LBF_ALL_PINNED; + lb_flags |= LBF_ALL_PINNED; goto out_one_pinned; } @@ -4554,7 +4602,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, out_one_pinned: /* tune up the balancing interval */ - if (((env.flags & LBF_ALL_PINNED) && + if (((lb_flags & LBF_ALL_PINNED) && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; @@ -4664,18 +4712,10 @@ static int active_load_balance_cpu_stop(void *data) } if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; - schedstat_inc(sd, alb_count); - if (move_one_task(&env)) + if (move_one_task(target_rq, target_cpu, busiest_rq, + sd, CPU_IDLE)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -4907,6 +4947,8 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, static DEFINE_SPINLOCK(balancing); +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + /* * Scale the max load_balance interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. 
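For reference, the comment above only describes scaling the balance-interval cap with the CPU count; the hunk itself just shows the HZ/10 default. The stand-alone sketch below illustrates what such a rescale amounts to. It is not part of this patch: the helper name, the fixed HZ value and the "roughly 100 ms per online CPU" factor are all assumptions for illustration.

#include <stdio.h>

#define HZ 250          /* assumed CONFIG_HZ for this sketch */

/* Illustrative helper: ~100 ms worth of jiffies per online CPU. */
static unsigned long scaled_max_interval(unsigned int online_cpus)
{
        return (unsigned long)HZ * online_cpus / 10;
}

int main(void)
{
        unsigned int cpus;

        for (cpus = 1; cpus <= 64; cpus *= 4)
                printf("%2u CPUs: max_load_balance_interval = %4lu jiffies (~%lu ms)\n",
                       cpus, scaled_max_interval(cpus),
                       scaled_max_interval(cpus) * 1000 / HZ);
        return 0;
}

Larger machines thus tolerate a longer interval before the doubling in out_one_pinned above is considered excessive, which is the "latency for less cross talk" trade mentioned in the comment.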
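Back in the load_balance() hunk above, the line lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK turns each "need a break" request from move_tasks() into a small saturating counter: repeated breaks bump the HAD_BREAK bits until they carry into LBF_ABORT and the redo loop gives up. The stand-alone sketch below just exercises that arithmetic. The LBF_* numeric values are reproduced from memory of this file's definitions (they are not shown in the hunk), so treat them as assumptions.

#include <stdio.h>

/* Assumed LBF_* values (defined earlier in fair.c, not shown in this hunk). */
#define LBF_ALL_PINNED  0x01
#define LBF_NEED_BREAK  0x02
#define LBF_HAD_BREAK   0x04
#define LBF_HAD_BREAKS  0x0c    /* two-bit break counter */
#define LBF_ABORT       0x10

int main(void)
{
        int lb_flags = 0;
        int attempt;

        for (attempt = 1; attempt <= 5; attempt++) {
                /* move_tasks() hit the nr_migrate limit and asked for a break */
                lb_flags |= LBF_NEED_BREAK;

                /* the conversion done in load_balance() above */
                lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;

                printf("break %d: lb_flags=%#x -> %s\n", attempt, lb_flags,
                       (lb_flags & LBF_ABORT) ? "abort" : "redo");
                if (lb_flags & LBF_ABORT)
                        break;
        }
        return 0;
}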
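load_balance_fair(), shown earlier in this patch, converts the remaining imbalance between two units: rem_load_move is expressed in hierarchical h_load terms, while balance_tasks() subtracts raw se.load.weight, so the value is scaled by weight/(h_load + 1) on the way in and by h_load/(weight + 1) on the way back, with the +1 guarding empty groups. The following user-space sketch only exercises that round trip; the helper names and sample numbers are mine, not kernel code.

#include <stdio.h>
#include <stdint.h>

/* Sketch of the scaling in load_balance_fair(); helper names are illustrative. */
static uint64_t h_load_to_group_units(uint64_t rem_load, uint64_t weight,
                                      uint64_t h_load)
{
        return rem_load * weight / (h_load + 1);   /* "+1" guards empty groups */
}

static uint64_t group_to_h_load_units(uint64_t moved, uint64_t weight,
                                      uint64_t h_load)
{
        return moved * h_load / (weight + 1);
}

int main(void)
{
        uint64_t rem_load_move = 2048;  /* imbalance in h_load units */
        uint64_t weight = 3072;         /* busiest_cfs_rq->load.weight */
        uint64_t h_load = 1024;         /* busiest_cfs_rq->h_load */

        uint64_t rem_load = h_load_to_group_units(rem_load_move, weight, h_load);
        uint64_t moved_load = rem_load / 2;     /* pretend half of it was pulled */

        printf("ask balance_tasks() for %llu weight units\n",
               (unsigned long long)rem_load);
        printf("moved %llu weight units = %llu h_load units\n",
               (unsigned long long)moved_load,
               (unsigned long long)group_to_h_load_units(moved_load, weight, h_load));
        return 0;
}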
@@ -5300,6 +5342,7 @@ static void set_curr_task_fair(struct rq *rq) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; @@ -5571,7 +5614,6 @@ __init void init_sched_fair_class(void) open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #ifdef CONFIG_NO_HZ - nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); #endif diff --git a/trunk/kernel/sched/rt.c b/trunk/kernel/sched/rt.c index 44af55e6d5d0..f42ae7fb5ec5 100644 --- a/trunk/kernel/sched/rt.c +++ b/trunk/kernel/sched/rt.c @@ -778,9 +778,12 @@ static inline int balance_runtime(struct rt_rq *rt_rq) static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { - int i, idle = 1, throttled = 0; + int i, idle = 1; const struct cpumask *span; + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return 1; + span = sched_rt_period_mask(); for_each_cpu(i, span) { int enqueue = 0; @@ -815,17 +818,12 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (!rt_rq_throttled(rt_rq)) enqueue = 1; } - if (rt_rq->rt_throttled) - throttled = 1; if (enqueue) sched_rt_rq_enqueue(rt_rq); raw_spin_unlock(&rq->lock); } - if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) - return 1; - return idle; } @@ -857,30 +855,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; if (rt_rq->rt_time > runtime) { - struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - - /* - * Don't actually throttle groups that have no runtime assigned - * but accrue some time due to boosting. - */ - if (likely(rt_b->rt_runtime)) { - static bool once = false; - - rt_rq->rt_throttled = 1; - - if (!once) { - once = true; - printk_sched("sched: RT throttling activated\n"); - } - } else { - /* - * In case we did anyway, make it go away, - * replenishment is a joke, since it will replenish us - * with exactly 0 ns. 
- */ - rt_rq->rt_time = 0; - } - + rt_rq->rt_throttled = 1; + printk_once(KERN_WARNING "sched: RT throttling activated\n"); if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; @@ -908,8 +884,7 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); @@ -1428,7 +1403,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) next_idx: if (idx >= MAX_RT_PRIO) continue; - if (next && next->prio <= idx) + if (next && next->prio < idx) continue; list_for_each_entry(rt_se, array->queue + idx, run_list) { struct task_struct *p; @@ -1997,7 +1972,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) if (--p->rt.time_slice) return; - p->rt.time_slice = RR_TIMESLICE; + p->rt.time_slice = DEF_TIMESLICE; /* * Requeue to the end of queue if we are not the only element @@ -2025,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) * Time slice is 0 for SCHED_FIFO tasks */ if (task->policy == SCHED_RR) - return RR_TIMESLICE; + return DEF_TIMESLICE; else return 0; } diff --git a/trunk/kernel/sched/sched.h b/trunk/kernel/sched/sched.h index 753bdd567416..d72483d07c9f 100644 --- a/trunk/kernel/sched/sched.h +++ b/trunk/kernel/sched/sched.h @@ -36,7 +36,11 @@ extern __read_mostly int scheduler_running; /* * These are the 'tuning knobs' of the scheduler: + * + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. */ +#define DEF_TIMESLICE (100 * HZ / 1000) /* * single value that denotes runtime == period, ie unlimited time. @@ -212,6 +216,9 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; + struct list_head tasks; + struct list_head *balance_iterator; + /* * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). 
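DEF_TIMESLICE above is the SCHED_RR quantum expressed in jiffies (100 ms scaled by HZ), and task_tick_rt() in the rt.c hunk burns one unit per tick before requeueing the task. The stand-alone program below merely re-checks that arithmetic for common HZ values; it is illustrative only and not part of the patch.

#include <stdio.h>

/* DEF_TIMESLICE = (100 * HZ / 1000): 100 ms expressed in jiffies. */
static unsigned int def_timeslice(unsigned int hz)
{
        return 100 * hz / 1000;
}

int main(void)
{
        unsigned int hz[] = { 100, 250, 300, 1000 };
        unsigned int i;

        for (i = 0; i < sizeof(hz) / sizeof(hz[0]); i++)
                printf("HZ=%4u -> DEF_TIMESLICE = %3u jiffies (~100 ms per RR slice)\n",
                       hz[i], def_timeslice(hz[i]));
        return 0;
}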
@@ -238,6 +245,11 @@ struct cfs_rq { struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_SMP + /* + * the part of load.weight contributed by tasks + */ + unsigned long task_weight; + /* * h_load = weight * f(tg) * @@ -412,8 +424,6 @@ struct rq { int cpu; int online; - struct list_head cfs_tasks; - u64 rt_avg; u64 age_stamp; u64 idle_stamp; @@ -452,6 +462,7 @@ struct rq { unsigned int yld_count; /* schedule() stats */ + unsigned int sched_switch; unsigned int sched_count; unsigned int sched_goidle; @@ -681,6 +692,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p) #ifndef finish_arch_switch # define finish_arch_switch(prev) do { } while (0) #endif +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif #ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) diff --git a/trunk/kernel/sched/stats.c b/trunk/kernel/sched/stats.c index 903ffa9e8872..2a581ba8e190 100644 --- a/trunk/kernel/sched/stats.c +++ b/trunk/kernel/sched/stats.c @@ -32,9 +32,9 @@ static int show_schedstat(struct seq_file *seq, void *v) /* runqueue-specific stats */ seq_printf(seq, - "cpu%d %u 0 %u %u %u %u %llu %llu %lu", + "cpu%d %u %u %u %u %u %u %llu %llu %lu", cpu, rq->yld_count, - rq->sched_count, rq->sched_goidle, + rq->sched_switch, rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); diff --git a/trunk/kernel/softirq.c b/trunk/kernel/softirq.c index f268369ebe1f..4eb3a0fa351e 100644 --- a/trunk/kernel/softirq.c +++ b/trunk/kernel/softirq.c @@ -353,7 +353,7 @@ void irq_exit(void) tick_nohz_irq_exit(); #endif rcu_irq_exit(); - sched_preempt_enable_no_resched(); + preempt_enable_no_resched(); } /* @@ -744,7 +744,9 @@ static int run_ksoftirqd(void * __bind_cpu) while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } __set_current_state(TASK_RUNNING); @@ -759,7 +761,7 @@ static int run_ksoftirqd(void * __bind_cpu) if (local_softirq_pending()) __do_softirq(); local_irq_enable(); - sched_preempt_enable_no_resched(); + preempt_enable_no_resched(); cond_resched(); preempt_disable(); rcu_note_context_switch((long)__bind_cpu);
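With the show_schedstat() change above, the per-CPU line carries nine counters after the "cpuN" tag, with sched_switch restored between yld_count and sched_count. The small user-space reader below matches that layout; the sample values are made up and the variable names simply mirror the struct rq members touched by this patch.

#include <stdio.h>

int main(void)
{
        /* Example line in the format emitted above (values made up). */
        const char *line = "cpu0 37 12 48212 1033 22841 17901 91053 66102 531";
        unsigned int cpu, yld_count, sched_switch, sched_count, sched_goidle;
        unsigned int ttwu_count, ttwu_local;
        unsigned long long rq_cpu_time, run_delay;
        unsigned long pcount;

        if (sscanf(line, "cpu%u %u %u %u %u %u %u %llu %llu %lu",
                   &cpu, &yld_count, &sched_switch, &sched_count,
                   &sched_goidle, &ttwu_count, &ttwu_local,
                   &rq_cpu_time, &run_delay, &pcount) == 10)
                printf("cpu%u: yld=%u switch=%u sched=%u goidle=%u\n",
                       cpu, yld_count, sched_switch, sched_count, sched_goidle);
        return 0;
}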
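run_ksoftirqd() above goes back to the open-coded preempt_enable_no_resched(); schedule(); preempt_disable(); sequence. The point of the idiom is that the surrounding loop expects to keep running with preemption disabled, so the code drops the preempt count without taking the resched check, lets schedule() switch away, and restores the caller's state afterwards. The kernel-context sketch below is only a restatement of that sequence as a helper; the helper name is mine and it is not part of this patch.

#include <linux/preempt.h>
#include <linux/sched.h>

/*
 * Kernel-context sketch (not part of this patch): block from a section
 * that normally runs with preemption disabled, without letting the
 * preempt_enable() path trigger an extra reschedule on the way out.
 */
static inline void sleep_with_preempt_disabled(void)
{
        preempt_enable_no_resched();    /* drop the count, skip the resched check */
        schedule();                     /* schedule() performs the context switch */
        preempt_disable();              /* return in the state the caller expects */
}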