From e3334a482b78c90a8f89b0ef1118771cba526e1b Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Sun, 27 Nov 2011 21:43:10 +0000 Subject: [PATCH] --- yaml --- r: 298354 b: refs/heads/master c: 01f23e1630d944f7085cd8fd5793e31ea91c03d8 h: refs/heads/master v: v3 --- [refs] | 2 +- trunk/Documentation/scheduler/sched-stats.txt | 3 +- trunk/MAINTAINERS | 4 +- trunk/arch/arm/kernel/process.c | 4 +- trunk/arch/arm/kernel/smp.c | 7 + trunk/arch/avr32/kernel/process.c | 4 +- trunk/arch/blackfin/kernel/process.c | 4 +- trunk/arch/cris/kernel/process.c | 4 +- trunk/arch/frv/kernel/process.c | 4 +- trunk/arch/h8300/kernel/process.c | 4 +- trunk/arch/hexagon/kernel/smp.c | 2 + trunk/arch/ia64/kernel/process.c | 4 +- trunk/arch/m32r/kernel/process.c | 4 +- trunk/arch/m68k/kernel/process_mm.c | 4 +- trunk/arch/m68k/kernel/process_no.c | 4 +- trunk/arch/microblaze/kernel/process.c | 4 +- trunk/arch/mips/kernel/process.c | 4 +- trunk/arch/mn10300/kernel/process.c | 4 +- trunk/arch/parisc/kernel/process.c | 4 +- trunk/arch/powerpc/kernel/idle.c | 8 +- trunk/arch/powerpc/platforms/iseries/setup.c | 8 +- trunk/arch/s390/kernel/process.c | 4 +- trunk/arch/s390/kernel/smp.c | 6 + trunk/arch/score/kernel/process.c | 4 +- trunk/arch/sh/kernel/idle.c | 4 +- trunk/arch/sparc/kernel/process_32.c | 8 +- trunk/arch/sparc/kernel/process_64.c | 10 +- trunk/arch/tile/kernel/process.c | 4 +- trunk/arch/x86/include/asm/timer.h | 8 +- trunk/arch/x86/kernel/process_32.c | 4 +- trunk/arch/x86/kernel/process_64.c | 4 +- trunk/arch/x86/kernel/smpboot.c | 18 + trunk/arch/x86/kernel/tsc.c | 3 +- trunk/arch/xtensa/kernel/process.c | 4 +- trunk/block/blk-softirq.c | 16 +- trunk/block/blk.h | 16 + trunk/fs/proc/base.c | 3 +- trunk/include/linux/cpuset.h | 6 +- trunk/include/linux/init_task.h | 2 +- trunk/include/linux/kernel.h | 13 - trunk/include/linux/preempt.h | 5 +- trunk/include/linux/printk.h | 10 - trunk/include/linux/sched.h | 41 +- trunk/include/linux/wait.h | 5 +- trunk/init/main.c | 5 +- trunk/kernel/cpuset.c | 21 +- trunk/kernel/mutex.c | 4 +- trunk/kernel/printk.c | 40 +- trunk/kernel/sched/auto_group.c | 12 +- trunk/kernel/sched/core.c | 198 +++------ trunk/kernel/sched/debug.c | 1 + trunk/kernel/sched/fair.c | 408 ++++++++++-------- trunk/kernel/sched/rt.c | 45 +- trunk/kernel/sched/sched.h | 18 +- trunk/kernel/sched/stats.c | 4 +- trunk/kernel/softirq.c | 8 +- 56 files changed, 526 insertions(+), 526 deletions(-) diff --git a/[refs] b/[refs] index 2700e53c598c..bce2aa989b9d 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 6135fc1eb4b1c9ae5f535507ed59591bab51e630 +refs/heads/master: 01f23e1630d944f7085cd8fd5793e31ea91c03d8 diff --git a/trunk/Documentation/scheduler/sched-stats.txt b/trunk/Documentation/scheduler/sched-stats.txt index 8259b34a66ae..1cd5d51bc761 100644 --- a/trunk/Documentation/scheduler/sched-stats.txt +++ b/trunk/Documentation/scheduler/sched-stats.txt @@ -38,8 +38,7 @@ First field is a sched_yield() statistic: 1) # of times sched_yield() was called Next three are schedule() statistics: - 2) This field is a legacy array expiration count field used in the O(1) - scheduler. We kept it for ABI compatibility, but it is always set to zero. 
+ 2) # of times we switched to the expired queue and reused it 3) # of times schedule() was called 4) # of times schedule() left the processor idle diff --git a/trunk/MAINTAINERS b/trunk/MAINTAINERS index 1ad6a06b0180..3321d75c6c7f 100644 --- a/trunk/MAINTAINERS +++ b/trunk/MAINTAINERS @@ -5112,7 +5112,7 @@ F: kernel/delayacct.c PERFORMANCE EVENTS SUBSYSTEM M: Peter Zijlstra M: Paul Mackerras -M: Ingo Molnar +M: Ingo Molnar M: Arnaldo Carvalho de Melo T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core S: Supported @@ -5736,7 +5736,7 @@ S: Maintained F: drivers/watchdog/sc1200wdt.c SCHEDULER -M: Ingo Molnar +M: Ingo Molnar M: Peter Zijlstra T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core S: Maintained diff --git a/trunk/arch/arm/kernel/process.c b/trunk/arch/arm/kernel/process.c index c2ae3cd331fe..971d65c253a9 100644 --- a/trunk/arch/arm/kernel/process.c +++ b/trunk/arch/arm/kernel/process.c @@ -239,7 +239,9 @@ void cpu_idle(void) leds_event(led_idle_end); rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/arm/kernel/smp.c b/trunk/arch/arm/kernel/smp.c index d616ed51e7a7..cdeb727527d3 100644 --- a/trunk/arch/arm/kernel/smp.c +++ b/trunk/arch/arm/kernel/smp.c @@ -295,6 +295,13 @@ asmlinkage void __cpuinit secondary_start_kernel(void) */ percpu_timer_setup(); + while (!cpu_active(cpu)) + cpu_relax(); + + /* + * cpu_active bit is set, so it's safe to enalbe interrupts + * now. + */ local_irq_enable(); local_fiq_enable(); diff --git a/trunk/arch/avr32/kernel/process.c b/trunk/arch/avr32/kernel/process.c index 92c5af98a6f7..ea3395750324 100644 --- a/trunk/arch/avr32/kernel/process.c +++ b/trunk/arch/avr32/kernel/process.c @@ -40,7 +40,9 @@ void cpu_idle(void) cpu_idle_sleep(); rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/blackfin/kernel/process.c b/trunk/arch/blackfin/kernel/process.c index a80a643f3691..8dd0416673cb 100644 --- a/trunk/arch/blackfin/kernel/process.c +++ b/trunk/arch/blackfin/kernel/process.c @@ -94,7 +94,9 @@ void cpu_idle(void) idle(); rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/cris/kernel/process.c b/trunk/arch/cris/kernel/process.c index d8f50ff6fadd..aa585e4e979e 100644 --- a/trunk/arch/cris/kernel/process.c +++ b/trunk/arch/cris/kernel/process.c @@ -115,7 +115,9 @@ void cpu_idle (void) idle = default_idle; idle(); } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/frv/kernel/process.c b/trunk/arch/frv/kernel/process.c index 29cc49783787..3901df1213c0 100644 --- a/trunk/arch/frv/kernel/process.c +++ b/trunk/arch/frv/kernel/process.c @@ -92,7 +92,9 @@ void cpu_idle(void) idle(); } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/h8300/kernel/process.c b/trunk/arch/h8300/kernel/process.c index 1a173b35f475..933bd388efb2 100644 --- a/trunk/arch/h8300/kernel/process.c +++ b/trunk/arch/h8300/kernel/process.c @@ -81,7 +81,9 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git 
a/trunk/arch/hexagon/kernel/smp.c b/trunk/arch/hexagon/kernel/smp.c index 0123c63e9a3a..c871a2cffaef 100644 --- a/trunk/arch/hexagon/kernel/smp.c +++ b/trunk/arch/hexagon/kernel/smp.c @@ -179,6 +179,8 @@ void __cpuinit start_secondary(void) printk(KERN_INFO "%s cpu %d\n", __func__, current_thread_info()->cpu); set_cpu_online(cpu, true); + while (!cpumask_test_cpu(cpu, cpu_active_mask)) + cpu_relax(); local_irq_enable(); cpu_idle(); diff --git a/trunk/arch/ia64/kernel/process.c b/trunk/arch/ia64/kernel/process.c index 9dc52b63fc87..6d33c5cc94f0 100644 --- a/trunk/arch/ia64/kernel/process.c +++ b/trunk/arch/ia64/kernel/process.c @@ -330,7 +330,9 @@ cpu_idle (void) normal_xtp(); #endif } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); check_pgt_cache(); if (cpu_is_offline(cpu)) play_dead(); diff --git a/trunk/arch/m32r/kernel/process.c b/trunk/arch/m32r/kernel/process.c index 3a4a32b27208..422bea9f1dbc 100644 --- a/trunk/arch/m32r/kernel/process.c +++ b/trunk/arch/m32r/kernel/process.c @@ -90,7 +90,9 @@ void cpu_idle (void) idle(); } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/m68k/kernel/process_mm.c b/trunk/arch/m68k/kernel/process_mm.c index fe4186b5fc32..099283ee1a8f 100644 --- a/trunk/arch/m68k/kernel/process_mm.c +++ b/trunk/arch/m68k/kernel/process_mm.c @@ -78,7 +78,9 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/m68k/kernel/process_no.c b/trunk/arch/m68k/kernel/process_no.c index f7fe6c348595..5e1078cabe0e 100644 --- a/trunk/arch/m68k/kernel/process_no.c +++ b/trunk/arch/m68k/kernel/process_no.c @@ -73,7 +73,9 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { idle(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/microblaze/kernel/process.c b/trunk/arch/microblaze/kernel/process.c index 9155f7d92669..7dcb5bfffb75 100644 --- a/trunk/arch/microblaze/kernel/process.c +++ b/trunk/arch/microblaze/kernel/process.c @@ -110,7 +110,9 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); check_pgt_cache(); } } diff --git a/trunk/arch/mips/kernel/process.c b/trunk/arch/mips/kernel/process.c index 61f1cb45a1d5..7955409051c4 100644 --- a/trunk/arch/mips/kernel/process.c +++ b/trunk/arch/mips/kernel/process.c @@ -80,7 +80,9 @@ void __noreturn cpu_idle(void) #endif rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/mn10300/kernel/process.c b/trunk/arch/mn10300/kernel/process.c index cac401d37f75..28eec3102535 100644 --- a/trunk/arch/mn10300/kernel/process.c +++ b/trunk/arch/mn10300/kernel/process.c @@ -123,7 +123,9 @@ void cpu_idle(void) idle(); } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/parisc/kernel/process.c b/trunk/arch/parisc/kernel/process.c index d4b94b395c16..62c60b87d039 100644 --- a/trunk/arch/parisc/kernel/process.c +++ b/trunk/arch/parisc/kernel/process.c @@ -71,7 +71,9 @@ void cpu_idle(void) while (1) { while (!need_resched()) barrier(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + 
schedule(); + preempt_disable(); check_pgt_cache(); } } diff --git a/trunk/arch/powerpc/kernel/idle.c b/trunk/arch/powerpc/kernel/idle.c index c97fc60c790c..0a48bf5db6c8 100644 --- a/trunk/arch/powerpc/kernel/idle.c +++ b/trunk/arch/powerpc/kernel/idle.c @@ -101,11 +101,11 @@ void cpu_idle(void) ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); - if (cpu_should_die()) { - sched_preempt_enable_no_resched(); + preempt_enable_no_resched(); + if (cpu_should_die()) cpu_die(); - } - schedule_preempt_disabled(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/powerpc/platforms/iseries/setup.c b/trunk/arch/powerpc/platforms/iseries/setup.c index a5fbf4cb6329..8fc62586a973 100644 --- a/trunk/arch/powerpc/platforms/iseries/setup.c +++ b/trunk/arch/powerpc/platforms/iseries/setup.c @@ -584,7 +584,9 @@ static void iseries_shared_idle(void) if (hvlpevent_is_pending()) process_iSeries_events(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } @@ -613,7 +615,9 @@ static void iseries_dedicated_idle(void) ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/s390/kernel/process.c b/trunk/arch/s390/kernel/process.c index 7618085b4164..e795933eb2cb 100644 --- a/trunk/arch/s390/kernel/process.c +++ b/trunk/arch/s390/kernel/process.c @@ -97,7 +97,9 @@ void cpu_idle(void) tick_nohz_idle_exit(); if (test_thread_flag(TIF_MCCK_PENDING)) s390_handle_mcck(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/s390/kernel/smp.c b/trunk/arch/s390/kernel/smp.c index b0e28c47ab83..2398ce6b15ae 100644 --- a/trunk/arch/s390/kernel/smp.c +++ b/trunk/arch/s390/kernel/smp.c @@ -550,6 +550,12 @@ int __cpuinit start_secondary(void *cpuvoid) S390_lowcore.restart_psw.addr = PSW_ADDR_AMODE | (unsigned long) psw_restart_int_handler; __ctl_set_bit(0, 28); /* Enable lowcore protection */ + /* + * Wait until the cpu which brought this one up marked it + * active before enabling interrupts. 
+ */ + while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) + cpu_relax(); local_irq_enable(); /* cpu_idle will call schedule for us */ cpu_idle(); diff --git a/trunk/arch/score/kernel/process.c b/trunk/arch/score/kernel/process.c index 2707023c7563..25d08030a883 100644 --- a/trunk/arch/score/kernel/process.c +++ b/trunk/arch/score/kernel/process.c @@ -53,7 +53,9 @@ void __noreturn cpu_idle(void) while (!need_resched()) barrier(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/sh/kernel/idle.c b/trunk/arch/sh/kernel/idle.c index 7e4892826563..406508d4ce74 100644 --- a/trunk/arch/sh/kernel/idle.c +++ b/trunk/arch/sh/kernel/idle.c @@ -114,7 +114,9 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/sparc/kernel/process_32.c b/trunk/arch/sparc/kernel/process_32.c index 935fdbcd88c2..f793742eec2b 100644 --- a/trunk/arch/sparc/kernel/process_32.c +++ b/trunk/arch/sparc/kernel/process_32.c @@ -113,7 +113,9 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); check_pgt_cache(); } } @@ -136,7 +138,9 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); check_pgt_cache(); } } diff --git a/trunk/arch/sparc/kernel/process_64.c b/trunk/arch/sparc/kernel/process_64.c index 06b5b5fc20c7..39d8b05201a2 100644 --- a/trunk/arch/sparc/kernel/process_64.c +++ b/trunk/arch/sparc/kernel/process_64.c @@ -104,13 +104,15 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); + preempt_enable_no_resched(); + #ifdef CONFIG_HOTPLUG_CPU - if (cpu_is_offline(cpu)) { - sched_preempt_enable_no_resched(); + if (cpu_is_offline(cpu)) cpu_play_dead(); - } #endif - schedule_preempt_disabled(); + + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/tile/kernel/process.c b/trunk/arch/tile/kernel/process.c index 6ae495ef2b99..4c1ac6e5347a 100644 --- a/trunk/arch/tile/kernel/process.c +++ b/trunk/arch/tile/kernel/process.c @@ -108,7 +108,9 @@ void cpu_idle(void) } rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/x86/include/asm/timer.h b/trunk/arch/x86/include/asm/timer.h index 34baa0eb5d0c..431793e5d484 100644 --- a/trunk/arch/x86/include/asm/timer.h +++ b/trunk/arch/x86/include/asm/timer.h @@ -57,10 +57,14 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); static inline unsigned long long __cycles_2_ns(unsigned long long cyc) { + unsigned long long quot; + unsigned long long rem; int cpu = smp_processor_id(); unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), - (1UL << CYC2NS_SCALE_FACTOR)); + quot = (cyc >> CYC2NS_SCALE_FACTOR); + rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1); + ns += quot * per_cpu(cyc2ns, cpu) + + ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR); return ns; } diff --git a/trunk/arch/x86/kernel/process_32.c b/trunk/arch/x86/kernel/process_32.c index 49888fefe794..c08d1ff12b7c 100644 --- a/trunk/arch/x86/kernel/process_32.c +++ b/trunk/arch/x86/kernel/process_32.c @@ -119,7 +119,9 @@ void cpu_idle(void) } rcu_idle_exit(); tick_nohz_idle_exit(); - schedule_preempt_disabled(); + 
preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/x86/kernel/process_64.c b/trunk/arch/x86/kernel/process_64.c index e34257c70c28..cfa5c90c01db 100644 --- a/trunk/arch/x86/kernel/process_64.c +++ b/trunk/arch/x86/kernel/process_64.c @@ -156,7 +156,9 @@ void cpu_idle(void) } tick_nohz_idle_exit(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/arch/x86/kernel/smpboot.c b/trunk/arch/x86/kernel/smpboot.c index 89571a0c4a49..66d250c00d11 100644 --- a/trunk/arch/x86/kernel/smpboot.c +++ b/trunk/arch/x86/kernel/smpboot.c @@ -219,9 +219,14 @@ static void __cpuinit smp_callin(void) * Update loops_per_jiffy in cpu_data. Previous call to * smp_store_cpu_info() stored a value that is close but not as * accurate as the value just calculated. + * + * Need to enable IRQs because it can take longer and then + * the NMI watchdog might kill us. */ + local_irq_enable(); calibrate_delay(); cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; + local_irq_disable(); pr_debug("Stack at about %p\n", &cpuid); /* @@ -286,6 +291,19 @@ notrace static void __cpuinit start_secondary(void *unused) per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; x86_platform.nmi_init(); + /* + * Wait until the cpu which brought this one up marked it + * online before enabling interrupts. If we don't do that then + * we can end up waking up the softirq thread before this cpu + * reached the active state, which makes the scheduler unhappy + * and schedule the softirq thread on the wrong cpu. This is + * only observable with forced threaded interrupts, but in + * theory it could also happen w/o them. It's just way harder + * to achieve. + */ + while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) + cpu_relax(); + /* enable local interrupts */ local_irq_enable(); diff --git a/trunk/arch/x86/kernel/tsc.c b/trunk/arch/x86/kernel/tsc.c index 183c5925a9fe..a62c201c97ec 100644 --- a/trunk/arch/x86/kernel/tsc.c +++ b/trunk/arch/x86/kernel/tsc.c @@ -620,8 +620,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) if (cpu_khz) { *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; - *offset = ns_now - mult_frac(tsc_now, *scale, - (1UL << CYC2NS_SCALE_FACTOR)); + *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); } sched_clock_idle_wakeup_event(0); diff --git a/trunk/arch/xtensa/kernel/process.c b/trunk/arch/xtensa/kernel/process.c index 2c9004770c4e..47041e7c088c 100644 --- a/trunk/arch/xtensa/kernel/process.c +++ b/trunk/arch/xtensa/kernel/process.c @@ -113,7 +113,9 @@ void cpu_idle(void) while (1) { while (!need_resched()) platform_idle(); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } } diff --git a/trunk/block/blk-softirq.c b/trunk/block/blk-softirq.c index 467c8de88642..1366a89d8e66 100644 --- a/trunk/block/blk-softirq.c +++ b/trunk/block/blk-softirq.c @@ -8,7 +8,6 @@ #include #include #include -#include #include "blk.h" @@ -104,10 +103,9 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = { void __blk_complete_request(struct request *req) { - int ccpu, cpu; + int ccpu, cpu, group_cpu = NR_CPUS; struct request_queue *q = req->q; unsigned long flags; - bool shared = false; BUG_ON(!q->softirq_done_fn); @@ -119,20 +117,22 @@ void __blk_complete_request(struct request *req) */ if (req->cpu != -1) { ccpu = req->cpu; - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) - shared = cpus_share_cache(cpu, ccpu); + if 
(!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { + ccpu = blk_cpu_to_group(ccpu); + group_cpu = blk_cpu_to_group(cpu); + } } else ccpu = cpu; /* - * If current CPU and requested CPU share a cache, run the softirq on - * the current CPU. One might concern this is just like + * If current CPU and requested CPU are in the same group, running + * softirq in current CPU. One might concern this is just like * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is * running in interrupt handler, and currently I/O controller doesn't * support multiple interrupts, so current CPU is unique actually. This * avoids IPI sending from current CPU to the first CPU of a group. */ - if (ccpu == cpu || shared) { + if (ccpu == cpu || ccpu == group_cpu) { struct list_head *list; do_local: list = &__get_cpu_var(blk_cpu_done); diff --git a/trunk/block/blk.h b/trunk/block/blk.h index d45be871329e..9c12f80882b0 100644 --- a/trunk/block/blk.h +++ b/trunk/block/blk.h @@ -166,6 +166,22 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } +static inline int blk_cpu_to_group(int cpu) +{ + int group = NR_CPUS; +#ifdef CONFIG_SCHED_MC + const struct cpumask *mask = cpu_coregroup_mask(cpu); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_SMT) + group = cpumask_first(topology_thread_cpumask(cpu)); +#else + return cpu; +#endif + if (likely(group < NR_CPUS)) + return group; + return cpu; +} + /* * Contribute to IO statistics IFF: * diff --git a/trunk/fs/proc/base.c b/trunk/fs/proc/base.c index 965d4bde3a3b..d4548dd49b02 100644 --- a/trunk/fs/proc/base.c +++ b/trunk/fs/proc/base.c @@ -1310,7 +1310,8 @@ sched_autogroup_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - err = proc_sched_autogroup_set_nice(p, nice); + err = nice; + err = proc_sched_autogroup_set_nice(p, &err); if (err) count = err; diff --git a/trunk/include/linux/cpuset.h b/trunk/include/linux/cpuset.h index e0ffaf061ab7..e9eaec522655 100644 --- a/trunk/include/linux/cpuset.h +++ b/trunk/include/linux/cpuset.h @@ -22,7 +22,7 @@ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_update_active_cpus(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); -extern void cpuset_cpus_allowed_fallback(struct task_struct *p); +extern int cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -144,8 +144,10 @@ static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_copy(mask, cpu_possible_mask); } -static inline void cpuset_cpus_allowed_fallback(struct task_struct *p) +static inline int cpuset_cpus_allowed_fallback(struct task_struct *p) { + do_set_cpus_allowed(p, cpu_possible_mask); + return cpumask_any(cpu_active_mask); } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) diff --git a/trunk/include/linux/init_task.h b/trunk/include/linux/init_task.h index f994d51f70f2..9c66b1ada9d7 100644 --- a/trunk/include/linux/init_task.h +++ b/trunk/include/linux/init_task.h @@ -149,7 +149,7 @@ extern struct cred init_cred; }, \ .rt = { \ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ - .time_slice = RR_TIMESLICE, \ + .time_slice = HZ, \ .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ diff --git a/trunk/include/linux/kernel.h b/trunk/include/linux/kernel.h index d801acb5e680..e8343422240a 100644 
--- a/trunk/include/linux/kernel.h +++ b/trunk/include/linux/kernel.h @@ -85,19 +85,6 @@ } \ ) -/* - * Multiplies an integer by a fraction, while avoiding unnecessary - * overflow or loss of precision. - */ -#define mult_frac(x, numer, denom)( \ -{ \ - typeof(x) quot = (x) / (denom); \ - typeof(x) rem = (x) % (denom); \ - (quot * (numer)) + ((rem * (numer)) / (denom)); \ -} \ -) - - #define _RET_IP_ (unsigned long)__builtin_return_address(0) #define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) diff --git a/trunk/include/linux/preempt.h b/trunk/include/linux/preempt.h index 5a710b9c578e..58969b2a8a82 100644 --- a/trunk/include/linux/preempt.h +++ b/trunk/include/linux/preempt.h @@ -48,14 +48,12 @@ do { \ barrier(); \ } while (0) -#define sched_preempt_enable_no_resched() \ +#define preempt_enable_no_resched() \ do { \ barrier(); \ dec_preempt_count(); \ } while (0) -#define preempt_enable_no_resched() sched_preempt_enable_no_resched() - #define preempt_enable() \ do { \ preempt_enable_no_resched(); \ @@ -94,7 +92,6 @@ do { \ #else /* !CONFIG_PREEMPT_COUNT */ #define preempt_disable() do { } while (0) -#define sched_preempt_enable_no_resched() do { } while (0) #define preempt_enable_no_resched() do { } while (0) #define preempt_enable() do { } while (0) diff --git a/trunk/include/linux/printk.h b/trunk/include/linux/printk.h index 1f77a4174ee0..f0e22f75143f 100644 --- a/trunk/include/linux/printk.h +++ b/trunk/include/linux/printk.h @@ -100,11 +100,6 @@ int vprintk(const char *fmt, va_list args); asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); -/* - * Special printk facility for scheduler use only, _DO_NOT_USE_ ! - */ -__printf(1, 2) __cold int printk_sched(const char *fmt, ...); - /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use @@ -132,11 +127,6 @@ int printk(const char *s, ...) { return 0; } -static inline __printf(1, 2) __cold -int printk_sched(const char *s, ...) -{ - return 0; -} static inline int printk_ratelimit(void) { return 0; diff --git a/trunk/include/linux/sched.h b/trunk/include/linux/sched.h index 0cd002cc9e6a..0657368bd78f 100644 --- a/trunk/include/linux/sched.h +++ b/trunk/include/linux/sched.h @@ -361,7 +361,6 @@ extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); -extern void schedule_preempt_disabled(void); extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; @@ -906,7 +905,6 @@ struct sched_group_power { * single CPU. */ unsigned int power, power_orig; - unsigned long next_update; /* * Number of busy cpus in this group. 
*/ @@ -1054,8 +1052,6 @@ static inline int test_sd_parent(struct sched_domain *sd, int flag) unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); -bool cpus_share_cache(int this_cpu, int that_cpu); - #else /* CONFIG_SMP */ struct sched_domain_attr; @@ -1065,12 +1061,6 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { } - -static inline bool cpus_share_cache(int this_cpu, int that_cpu) -{ - return true; -} - #endif /* !CONFIG_SMP */ @@ -1235,12 +1225,6 @@ struct sched_rt_entity { #endif }; -/* - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. - */ -#define RR_TIMESLICE (100 * HZ / 1000) - struct rcu_node; enum perf_event_task_context { @@ -2064,7 +2048,7 @@ extern void sched_autogroup_fork(struct signal_struct *sig); extern void sched_autogroup_exit(struct signal_struct *sig); #ifdef CONFIG_PROC_FS extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); -extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); +extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice); #endif #else static inline void sched_autogroup_create_attach(struct task_struct *p) { } @@ -2081,20 +2065,12 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice; extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); extern void rt_mutex_adjust_pi(struct task_struct *p); -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) -{ - return tsk->pi_blocked_on != NULL; -} #else static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } # define rt_mutex_adjust_pi(p) do { } while (0) -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) -{ - return false; -} #endif extern bool yield_to(struct task_struct *p, bool preempt); @@ -2413,15 +2389,12 @@ static inline void task_unlock(struct task_struct *p) extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, unsigned long *flags); -static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, - unsigned long *flags) -{ - struct sighand_struct *ret; - - ret = __lock_task_sighand(tsk, flags); - (void)__cond_lock(&tsk->sighand->siglock, ret); - return ret; -} +#define lock_task_sighand(tsk, flags) \ +({ struct sighand_struct *__ss; \ + __cond_lock(&(tsk)->sighand->siglock, \ + (__ss = __lock_task_sighand(tsk, flags))); \ + __ss; \ +}) \ static inline void unlock_task_sighand(struct task_struct *tsk, unsigned long *flags) diff --git a/trunk/include/linux/wait.h b/trunk/include/linux/wait.h index 7d9a9e990ce6..a9ce45e8501c 100644 --- a/trunk/include/linux/wait.h +++ b/trunk/include/linux/wait.h @@ -157,7 +157,7 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_bit(wait_queue_head_t *, void *, int); int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); @@ -170,8 +170,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define 
wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) -#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) -#define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) +#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) diff --git a/trunk/init/main.c b/trunk/init/main.c index 4990f7ec776a..ff49a6dacfbb 100644 --- a/trunk/init/main.c +++ b/trunk/init/main.c @@ -374,8 +374,11 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + /* Call into cpu_idle with preempt disabled */ + preempt_disable(); cpu_idle(); } diff --git a/trunk/kernel/cpuset.c b/trunk/kernel/cpuset.c index 4ef4d7ecb9fb..a09ac2b9a661 100644 --- a/trunk/kernel/cpuset.c +++ b/trunk/kernel/cpuset.c @@ -2195,9 +2195,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) mutex_unlock(&callback_mutex); } -void cpuset_cpus_allowed_fallback(struct task_struct *tsk) +int cpuset_cpus_allowed_fallback(struct task_struct *tsk) { const struct cpuset *cs; + int cpu; rcu_read_lock(); cs = task_cs(tsk); @@ -2218,10 +2219,22 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary * set any mask even if it is not right from task_cs() pov, * the pending set_cpus_allowed_ptr() will fix things. - * - * select_fallback_rq() will fix things ups and set cpu_possible_mask - * if required. */ + + cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask); + if (cpu >= nr_cpu_ids) { + /* + * Either tsk->cpus_allowed is wrong (see above) or it + * is actually empty. The latter case is only possible + * if we are racing with remove_tasks_in_empty_cpuset(). + * Like above we can temporary set any mask and rely on + * set_cpus_allowed_ptr() as synchronization point. 
+ */ + do_set_cpus_allowed(tsk, cpu_possible_mask); + cpu = cpumask_any(cpu_active_mask); + } + + return cpu; } void cpuset_init_current_mems_allowed(void) diff --git a/trunk/kernel/mutex.c b/trunk/kernel/mutex.c index a307cc9c9526..89096dd8786f 100644 --- a/trunk/kernel/mutex.c +++ b/trunk/kernel/mutex.c @@ -240,7 +240,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); spin_lock_mutex(&lock->wait_lock, flags); } diff --git a/trunk/kernel/printk.c b/trunk/kernel/printk.c index b64ce71cb2e5..32690a0b7a18 100644 --- a/trunk/kernel/printk.c +++ b/trunk/kernel/printk.c @@ -1211,27 +1211,13 @@ int is_console_locked(void) return console_locked; } -/* - * Delayed printk facility, for scheduler-internal messages: - */ -#define PRINTK_BUF_SIZE 512 - -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 - static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); void printk_tick(void) { if (__this_cpu_read(printk_pending)) { - int pending = __this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); - } - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); + __this_cpu_write(printk_pending, 0); + wake_up_interruptible(&log_wait); } } @@ -1245,7 +1231,7 @@ int printk_needs_cpu(int cpu) void wake_up_klogd(void) { if (waitqueue_active(&log_wait)) - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + this_cpu_write(printk_pending, 1); } /** @@ -1638,26 +1624,6 @@ late_initcall(printk_late_init); #if defined CONFIG_PRINTK -int printk_sched(const char *fmt, ...) -{ - unsigned long flags; - va_list args; - char *buf; - int r; - - local_irq_save(flags); - buf = __get_cpu_var(printk_sched_buf); - - va_start(args, fmt); - r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); - va_end(args); - - __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); - local_irq_restore(flags); - - return r; -} - /* * printk rate limiting, lifted from the networking subsystem. * diff --git a/trunk/kernel/sched/auto_group.c b/trunk/kernel/sched/auto_group.c index 0984a21076a3..e8a1f83ee0e7 100644 --- a/trunk/kernel/sched/auto_group.c +++ b/trunk/kernel/sched/auto_group.c @@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup); #ifdef CONFIG_PROC_FS -int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) +int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) { static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; int err; - if (nice < -20 || nice > 19) + if (*nice < -20 || *nice > 19) return -EINVAL; - err = security_task_setnice(current, nice); + err = security_task_setnice(current, *nice); if (err) return err; - if (nice < 0 && !can_nice(current, nice)) + if (*nice < 0 && !can_nice(current, *nice)) return -EPERM; /* this is a heavy operation taking global locks.. 
*/ @@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) ag = autogroup_task_get(p); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); + err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); if (!err) - ag->nice = nice; + ag->nice = *nice; up_write(&ag->lock); autogroup_kref_put(ag); diff --git a/trunk/kernel/sched/core.c b/trunk/kernel/sched/core.c index 929fd857ef88..423f40f32a59 100644 --- a/trunk/kernel/sched/core.c +++ b/trunk/kernel/sched/core.c @@ -1263,59 +1263,29 @@ EXPORT_SYMBOL_GPL(kick_process); */ static int select_fallback_rq(int cpu, struct task_struct *p) { - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); - enum { cpuset, possible, fail } state = cpuset; int dest_cpu; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); /* Look for allowed, online CPU in same node. */ - for_each_cpu_mask(dest_cpu, *nodemask) { - if (!cpu_online(dest_cpu)) - continue; - if (!cpu_active(dest_cpu)) - continue; + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; - } - - for (;;) { - /* Any allowed, online CPU? */ - for_each_cpu_mask(dest_cpu, *tsk_cpus_allowed(p)) { - if (!cpu_online(dest_cpu)) - continue; - if (!cpu_active(dest_cpu)) - continue; - goto out; - } - switch (state) { - case cpuset: - /* No more Mr. Nice Guy. */ - cpuset_cpus_allowed_fallback(p); - state = possible; - break; + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); + if (dest_cpu < nr_cpu_ids) + return dest_cpu; - case possible: - do_set_cpus_allowed(p, cpu_possible_mask); - state = fail; - break; - - case fail: - BUG(); - break; - } - } - -out: - if (state != cpuset) { - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk_sched("process %d (%s) no longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); - } + /* No more Mr. Nice Guy. */ + dest_cpu = cpuset_cpus_allowed_fallback(p); + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. 
+ */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); } return dest_cpu; @@ -1537,7 +1507,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) } #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -bool cpus_share_cache(int this_cpu, int that_cpu) +static inline int ttwu_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } @@ -1548,7 +1518,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) struct rq *rq = cpu_rq(cpu); #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { + if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; @@ -1962,6 +1932,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); + finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); if (mm) @@ -2296,10 +2267,13 @@ calc_load_n(unsigned long load, unsigned long exp, * Once we've updated the global active value, we need to apply the exponential * weights adjusted to the number of cycles missed. */ -static void calc_global_nohz(void) +static void calc_global_nohz(unsigned long ticks) { long delta, active, n; + if (time_before(jiffies, calc_load_update)) + return; + /* * If we crossed a calc_load_update boundary, make sure to fold * any pending idle changes, the respective CPUs might have @@ -2311,25 +2285,31 @@ static void calc_global_nohz(void) atomic_long_add(delta, &calc_load_tasks); /* - * It could be the one fold was all it took, we done! + * If we were idle for multiple load cycles, apply them. */ - if (time_before(jiffies, calc_load_update + 10)) - return; + if (ticks >= LOAD_FREQ) { + n = ticks / LOAD_FREQ; - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + calc_load_update += n * LOAD_FREQ; + } - calc_load_update += n * LOAD_FREQ; + /* + * Its possible the remainder of the above division also crosses + * a LOAD_FREQ period, the regular check in calc_global_load() + * which comes after this will take care of that. + * + * Consider us being 11 ticks before a cycle completion, and us + * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will + * age us 4 cycles, and the test in calc_global_load() will + * pick up the final one. 
+ */ } #else void calc_load_account_idle(struct rq *this_rq) @@ -2341,7 +2321,7 @@ static inline long calc_load_fold_idle(void) return 0; } -static void calc_global_nohz(void) +static void calc_global_nohz(unsigned long ticks) { } #endif @@ -2369,6 +2349,8 @@ void calc_global_load(unsigned long ticks) { long active; + calc_global_nohz(ticks); + if (time_before(jiffies, calc_load_update + 10)) return; @@ -2380,16 +2362,6 @@ void calc_global_load(unsigned long ticks) avenrun[2] = calc_load(avenrun[2], EXP_15, active); calc_load_update += LOAD_FREQ; - - /* - * Account one period with whatever state we found before - * folding in the nohz state and ageing the entire idle period. - * - * This avoids loosing a sample when we go idle between - * calc_load_account_active() (10 ticks ago) and now and thus - * under-accounting. - */ - calc_global_nohz(); } /* @@ -3099,6 +3071,8 @@ EXPORT_SYMBOL(sub_preempt_count); */ static noinline void __schedule_bug(struct task_struct *prev) { + struct pt_regs *regs = get_irq_regs(); + if (oops_in_progress) return; @@ -3109,7 +3083,11 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); - dump_stack(); + + if (regs) + show_regs(regs); + else + dump_stack(); } /* @@ -3243,14 +3221,14 @@ static void __sched __schedule(void) post_schedule(rq); - sched_preempt_enable_no_resched(); + preempt_enable_no_resched(); if (need_resched()) goto need_resched; } static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state || tsk_is_pi_blocked(tsk)) + if (!tsk->state) return; /* * If we are going to sleep and we have plugged IO queued, @@ -3269,18 +3247,6 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); -/** - * schedule_preempt_disabled - called with preemption disabled - * - * Returns with preemption disabled. Note: preempt_count must be 1 - */ -void __sched schedule_preempt_disabled(void) -{ - sched_preempt_enable_no_resched(); - schedule(); - preempt_disable(); -} - #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline bool owner_running(struct mutex *lock, struct task_struct *owner) @@ -3441,9 +3407,9 @@ EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) { - __wake_up_common(q, mode, nr, 0, NULL); + __wake_up_common(q, mode, 1, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); @@ -3802,24 +3768,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) rq = __task_rq_lock(p); - /* - * Idle task boosting is a nono in general. There is one - * exception, when PREEMPT_RT and NOHZ is active: - * - * The idle task calls get_next_timer_interrupt() and holds - * the timer wheel base->lock on the CPU and another CPU wants - * to access the timer (probably to cancel it). We can safely - * ignore the boosting request, as the idle CPU runs this code - * with interrupts disabled and will complete the lock - * protected section without being interrupted. So there is no - * real need to boost. - */ - if (unlikely(p == rq->idle)) { - WARN_ON(p != rq->curr); - WARN_ON(p->pi_blocked_on); - goto out_unlock; - } - trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; @@ -3843,10 +3791,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(rq, p, oldprio < prio ? 
ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio); -out_unlock: __task_rq_unlock(rq); } + #endif + void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; @@ -4526,7 +4475,7 @@ SYSCALL_DEFINE0(sched_yield) __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); do_raw_spin_unlock(&rq->lock); - sched_preempt_enable_no_resched(); + preempt_enable_no_resched(); schedule(); @@ -4600,24 +4549,8 @@ EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. * - * Do not ever use this function, there's a 99% chance you're doing it wrong. - * - * The scheduler is at all times free to pick the calling task as the most - * eligible task to run, if removing the yield() call from your code breaks - * it, its already broken. - * - * Typical broken usage is: - * - * while (!event) - * yield(); - * - * where one assumes that yield() will let 'the other' process run that will - * make event true. If the current task is a SCHED_FIFO task that will never - * happen. Never use yield() as a progress guarantee!! - * - * If you want to use yield() to wait for something, use wait_event(). - * If you want to use yield() to be 'nice' for others, use cond_resched(). - * If you still want to use yield(), do not! + * This is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). */ void __sched yield(void) { @@ -5449,7 +5382,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: + case CPU_ONLINE: case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -5821,7 +5754,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if - * two cpus are in the same cache domain, see cpus_share_cache(). + * two cpus are in the same cache domain, see ttwu_share_cache(). 
*/ DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_id); @@ -6998,9 +6931,6 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - - INIT_LIST_HEAD(&rq->cfs_tasks); - rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ rq->nohz_flags = 0; diff --git a/trunk/kernel/sched/debug.c b/trunk/kernel/sched/debug.c index 09acaa15161d..2a075e10004b 100644 --- a/trunk/kernel/sched/debug.c +++ b/trunk/kernel/sched/debug.c @@ -288,6 +288,7 @@ static void print_cpu(struct seq_file *m, int cpu) P(yld_count); + P(sched_switch); P(sched_count); P(sched_goidle); #ifdef CONFIG_SMP diff --git a/trunk/kernel/sched/fair.c b/trunk/kernel/sched/fair.c index 258f430d71a5..aca16b843b7e 100644 --- a/trunk/kernel/sched/fair.c +++ b/trunk/kernel/sched/fair.c @@ -416,8 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #endif /* CONFIG_FAIR_GROUP_SCHED */ -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -776,16 +776,29 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ + cfs_rq->task_weight += weight; +} +#else +static inline void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +} +#endif + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); -#ifdef CONFIG_SMP - if (entity_is_task(se)) - list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); -#endif + if (entity_is_task(se)) { + add_cfs_task_weight(cfs_rq, se->load.weight); + list_add(&se->group_node, &cfs_rq->tasks); + } cfs_rq->nr_running++; } @@ -795,8 +808,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { + add_cfs_task_weight(cfs_rq, -se->load.weight); list_del_init(&se->group_node); + } cfs_rq->nr_running--; } @@ -1162,7 +1177,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) __clear_buddies_skip(se); } -static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -1546,8 +1561,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, resched_task(rq_of(cfs_rq)->curr); } -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) +static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) { if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; @@ -2073,11 +2088,11 @@ void unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} +static void account_cfs_rq_runtime(struct cfs_rq 
*cfs_rq, + unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} -static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { @@ -2657,6 +2672,8 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * Otherwise, iterate the domains and find an elegible idle cpu. */ + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, target)); for_each_lower_domain(sd) { sg = sd->groups; @@ -2678,6 +2695,8 @@ static int select_idle_sibling(struct task_struct *p, int target) } while (sg != sd->groups); } done: + rcu_read_unlock(); + return target; } @@ -2903,7 +2922,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* - * This is possible from callers such as move_task(), in which we + * This is possible from callers such as pull_task(), in which we * unconditionally check_prempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. @@ -3067,39 +3086,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * Fair scheduling class load-balancing methods: */ -static unsigned long __read_mostly max_load_balance_interval = HZ/10; - -#define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 - -struct lb_env { - struct sched_domain *sd; - - int src_cpu; - struct rq *src_rq; - - int dst_cpu; - struct rq *dst_rq; - - enum cpu_idle_type idle; - long load_move; - unsigned int flags; - - unsigned int loop; - unsigned int loop_break; - unsigned int loop_max; -}; - /* - * move_task - move a task from one runqueue to another runqueue. + * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static void move_task(struct task_struct *p, struct lb_env *env) +static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) { - deactivate_task(env->src_rq, p, 0); - set_task_cpu(p, env->dst_cpu); - activate_task(env->dst_rq, p, 0); - check_preempt_curr(env->dst_rq, p, 0); + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + check_preempt_curr(this_rq, p, 0); } /* @@ -3134,11 +3131,19 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } +#define LBF_ALL_PINNED 0x01 +#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ +#define LBF_HAD_BREAK 0x04 +#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ +#define LBF_ABORT 0x10 + /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static -int can_migrate_task(struct task_struct *p, struct lb_env *env) +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, + struct sched_domain *sd, enum cpu_idle_type idle, + int *lb_flags) { int tsk_cache_hot = 0; /* @@ -3147,13 +3152,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. 
*/ - if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } - env->flags &= ~LBF_ALL_PINNED; + *lb_flags &= ~LBF_ALL_PINNED; - if (task_running(env->src_rq, p)) { + if (task_running(rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } @@ -3164,12 +3169,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); + tsk_cache_hot = task_hot(p, rq->clock_task, sd); if (!tsk_cache_hot || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { + sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS if (tsk_cache_hot) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(sd, lb_hot_gained[idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } #endif @@ -3190,80 +3195,65 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * * Called with both runqueues locked. */ -static int move_one_task(struct lb_env *env) +static int +move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) { struct task_struct *p, *n; + struct cfs_rq *cfs_rq; + int pinned = 0; - list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) - continue; + for_each_leaf_cfs_rq(busiest, cfs_rq) { + list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), + busiest->cpu, this_cpu)) + break; - if (!can_migrate_task(p, env)) - continue; + if (!can_migrate_task(p, busiest, this_cpu, + sd, idle, &pinned)) + continue; - move_task(p, env); - /* - * Right now, this is only the second place move_task() - * is called, so we can safely collect move_task() - * stats here rather than inside move_task(). - */ - schedstat_inc(env->sd, lb_gained[env->idle]); - return 1; + pull_task(busiest, p, this_rq, this_cpu); + /* + * Right now, this is only the second place pull_task() + * is called, so we can safely collect pull_task() + * stats here rather than inside pull_task(). + */ + schedstat_inc(sd, lb_gained[idle]); + return 1; + } } + return 0; } -static unsigned long task_h_load(struct task_struct *p); - -/* - * move_tasks tries to move up to load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. 
- */ -static int move_tasks(struct lb_env *env) +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *lb_flags, + struct cfs_rq *busiest_cfs_rq) { - struct list_head *tasks = &env->src_rq->cfs_tasks; - struct task_struct *p; - unsigned long load; - int pulled = 0; - - if (env->load_move <= 0) - return 0; + int loops = 0, pulled = 0; + long rem_load_move = max_load_move; + struct task_struct *p, *n; - while (!list_empty(tasks)) { - p = list_first_entry(tasks, struct task_struct, se.group_node); + if (max_load_move == 0) + goto out; - env->loop++; - /* We've more or less seen every task there is, call it quits */ - if (env->loop > env->loop_max) - break; - - /* take a breather every nr_migrate tasks */ - if (env->loop > env->loop_break) { - env->loop_break += sysctl_sched_nr_migrate; - env->flags |= LBF_NEED_BREAK; + list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { + if (loops++ > sysctl_sched_nr_migrate) { + *lb_flags |= LBF_NEED_BREAK; break; } - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) - goto next; - - load = task_h_load(p); - - if (load < 16 && !env->sd->nr_balance_failed) - goto next; - - if ((load / 2) > env->load_move) - goto next; - - if (!can_migrate_task(p, env)) - goto next; + if ((p->se.load.weight >> 1) > rem_load_move || + !can_migrate_task(p, busiest, this_cpu, sd, idle, + lb_flags)) + continue; - move_task(p, env); + pull_task(busiest, p, this_rq, this_cpu); pulled++; - env->load_move -= load; + rem_load_move -= p->se.load.weight; #ifdef CONFIG_PREEMPT /* @@ -3271,30 +3261,28 @@ static int move_tasks(struct lb_env *env) * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (env->idle == CPU_NEWLY_IDLE) + if (idle == CPU_NEWLY_IDLE) { + *lb_flags |= LBF_ABORT; break; + } #endif /* * We only want to steal up to the prescribed amount of * weighted load. */ - if (env->load_move <= 0) + if (rem_load_move <= 0) break; - - continue; -next: - list_move_tail(&p->se.group_node, tasks); } - +out: /* - * Right now, this is one of only two places move_task() is called, - * so we can safely collect move_task() stats here rather than - * inside move_task(). + * Right now, this is one of only two places pull_task() is called, + * so we can safely collect pull_task() stats here rather than + * inside pull_task(). 
*/ - schedstat_add(env->sd, lb_gained[env->idle], pulled); + schedstat_add(sd, lb_gained[idle], pulled); - return pulled; + return max_load_move - rem_load_move; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3374,35 +3362,113 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_h_load(long cpu) { - rcu_read_lock(); walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); - rcu_read_unlock(); } -static unsigned long task_h_load(struct task_struct *p) +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *lb_flags) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); - unsigned long load; + long rem_load_move = max_load_move; + struct cfs_rq *busiest_cfs_rq; - load = p->se.load.weight; - load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); + rcu_read_lock(); + update_h_load(cpu_of(busiest)); - return load; + for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { + unsigned long busiest_h_load = busiest_cfs_rq->h_load; + unsigned long busiest_weight = busiest_cfs_rq->load.weight; + u64 rem_load, moved_load; + + if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + break; + + /* + * empty group or part of a throttled hierarchy + */ + if (!busiest_cfs_rq->task_weight || + throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) + continue; + + rem_load = (u64)rem_load_move * busiest_weight; + rem_load = div_u64(rem_load, busiest_h_load + 1); + + moved_load = balance_tasks(this_rq, this_cpu, busiest, + rem_load, sd, idle, lb_flags, + busiest_cfs_rq); + + if (!moved_load) + continue; + + moved_load *= busiest_h_load; + moved_load = div_u64(moved_load, busiest_weight + 1); + + rem_load_move -= moved_load; + if (rem_load_move < 0) + break; + } + rcu_read_unlock(); + + return max_load_move - rem_load_move; } #else static inline void update_shares(int cpu) { } -static inline void update_h_load(long cpu) +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *lb_flags) { + return balance_tasks(this_rq, this_cpu, busiest, + max_load_move, sd, idle, lb_flags, + &busiest->cfs); } +#endif -static unsigned long task_h_load(struct task_struct *p) +/* + * move_tasks tries to move up to max_load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *lb_flags) { - return p->se.load.weight; -} + unsigned long total_load_moved = 0, load_moved; + + do { + load_moved = load_balance_fair(this_rq, this_cpu, busiest, + max_load_move - total_load_moved, + sd, idle, lb_flags); + + total_load_moved += load_moved; + + if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + break; + +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible + * kernels will stop after the first task is pulled to minimize + * the critical section. 
+ */ + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { + *lb_flags |= LBF_ABORT; + break; + } #endif + } while (load_moved && max_load_move > total_load_moved); + + return total_load_moved > 0; +} /********** Helpers for find_busiest_group ************************/ /* @@ -3712,11 +3778,6 @@ void update_group_power(struct sched_domain *sd, int cpu) struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; unsigned long power; - unsigned long interval; - - interval = msecs_to_jiffies(sd->balance_interval); - interval = clamp(interval, 1UL, max_load_balance_interval); - sdg->sgp->next_update = jiffies + interval; if (!child) { update_cpu_power(sd, cpu); @@ -3824,15 +3885,12 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * domains. In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ - if (local_group) { - if (idle != CPU_NEWLY_IDLE) { - if (balance_cpu != this_cpu) { - *balance = 0; - return; - } - update_group_power(sd, this_cpu); - } else if (time_after_eq(jiffies, group->sgp->next_update)) - update_group_power(sd, this_cpu); + if (idle != CPU_NEWLY_IDLE && local_group) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); } /* Adjust by relative CPU power of the group */ @@ -4395,21 +4453,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, active_balance = 0; + int ld_moved, lb_flags = 0, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - struct lb_env env = { - .sd = sd, - .dst_cpu = this_cpu, - .dst_rq = this_rq, - .idle = idle, - .loop_break = sysctl_sched_nr_migrate, - }; - cpumask_copy(cpus, cpu_active_mask); schedstat_inc(sd, lb_count[idle]); @@ -4444,34 +4494,32 @@ static int load_balance(int this_cpu, struct rq *this_rq, * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - env.flags |= LBF_ALL_PINNED; - env.load_move = imbalance; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; - env.loop_max = busiest->nr_running; - -more_balance: + lb_flags |= LBF_ALL_PINNED; local_irq_save(flags); double_rq_lock(this_rq, busiest); - if (!env.loop) - update_h_load(env.src_cpu); - ld_moved += move_tasks(&env); + ld_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, idle, &lb_flags); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); - if (env.flags & LBF_NEED_BREAK) { - env.flags &= ~LBF_NEED_BREAK; - goto more_balance; - } - /* * some other cpu did the load balance for us. 
*/ if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); + if (lb_flags & LBF_ABORT) + goto out_balanced; + + if (lb_flags & LBF_NEED_BREAK) { + lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; + if (lb_flags & LBF_ABORT) + goto out_balanced; + goto redo; + } + /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(env.flags & LBF_ALL_PINNED)) { + if (unlikely(lb_flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); if (!cpumask_empty(cpus)) goto redo; @@ -4501,7 +4549,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, tsk_cpus_allowed(busiest->curr))) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - env.flags |= LBF_ALL_PINNED; + lb_flags |= LBF_ALL_PINNED; goto out_one_pinned; } @@ -4554,7 +4602,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, out_one_pinned: /* tune up the balancing interval */ - if (((env.flags & LBF_ALL_PINNED) && + if (((lb_flags & LBF_ALL_PINNED) && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; @@ -4664,18 +4712,10 @@ static int active_load_balance_cpu_stop(void *data) } if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; - schedstat_inc(sd, alb_count); - if (move_one_task(&env)) + if (move_one_task(target_rq, target_cpu, busiest_rq, + sd, CPU_IDLE)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -4907,6 +4947,8 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, static DEFINE_SPINLOCK(balancing); +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + /* * Scale the max load_balance interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. 
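For reference, the comment above only describes scaling the balance-interval cap with the CPU count; the hunk itself just shows the HZ/10 default. The stand-alone sketch below illustrates what such a rescale amounts to. It is not part of this patch: the helper name, the fixed HZ value and the "roughly 100 ms per online CPU" factor are all assumptions for illustration.

#include <stdio.h>

#define HZ 250          /* assumed CONFIG_HZ for this sketch */

/* Illustrative helper: ~100 ms worth of jiffies per online CPU. */
static unsigned long scaled_max_interval(unsigned int online_cpus)
{
        return (unsigned long)HZ * online_cpus / 10;
}

int main(void)
{
        unsigned int cpus;

        for (cpus = 1; cpus <= 64; cpus *= 4)
                printf("%2u CPUs: max_load_balance_interval = %4lu jiffies (~%lu ms)\n",
                       cpus, scaled_max_interval(cpus),
                       scaled_max_interval(cpus) * 1000 / HZ);
        return 0;
}

Larger machines thus tolerate a longer interval before the doubling in out_one_pinned above is considered excessive, which is the "latency for less cross talk" trade mentioned in the comment.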
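Back in the load_balance() hunk above, the line lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK turns each "need a break" request from move_tasks() into a small saturating counter: repeated breaks bump the HAD_BREAK bits until they carry into LBF_ABORT and the redo loop gives up. The stand-alone sketch below just exercises that arithmetic. The LBF_* numeric values are reproduced from memory of this file's definitions (they are not shown in the hunk), so treat them as assumptions.

#include <stdio.h>

/* Assumed LBF_* values (defined earlier in fair.c, not shown in this hunk). */
#define LBF_ALL_PINNED  0x01
#define LBF_NEED_BREAK  0x02
#define LBF_HAD_BREAK   0x04
#define LBF_HAD_BREAKS  0x0c    /* two-bit break counter */
#define LBF_ABORT       0x10

int main(void)
{
        int lb_flags = 0;
        int attempt;

        for (attempt = 1; attempt <= 5; attempt++) {
                /* move_tasks() hit the nr_migrate limit and asked for a break */
                lb_flags |= LBF_NEED_BREAK;

                /* the conversion done in load_balance() above */
                lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;

                printf("break %d: lb_flags=%#x -> %s\n", attempt, lb_flags,
                       (lb_flags & LBF_ABORT) ? "abort" : "redo");
                if (lb_flags & LBF_ABORT)
                        break;
        }
        return 0;
}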
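load_balance_fair(), shown earlier in this patch, converts the remaining imbalance between two units: rem_load_move is expressed in hierarchical h_load terms, while balance_tasks() subtracts raw se.load.weight, so the value is scaled by weight/(h_load + 1) on the way in and by h_load/(weight + 1) on the way back, with the +1 guarding empty groups. The following user-space sketch only exercises that round trip; the helper names and sample numbers are mine, not kernel code.

#include <stdio.h>
#include <stdint.h>

/* Sketch of the scaling in load_balance_fair(); helper names are illustrative. */
static uint64_t h_load_to_group_units(uint64_t rem_load, uint64_t weight,
                                      uint64_t h_load)
{
        return rem_load * weight / (h_load + 1);   /* "+1" guards empty groups */
}

static uint64_t group_to_h_load_units(uint64_t moved, uint64_t weight,
                                      uint64_t h_load)
{
        return moved * h_load / (weight + 1);
}

int main(void)
{
        uint64_t rem_load_move = 2048;  /* imbalance in h_load units */
        uint64_t weight = 3072;         /* busiest_cfs_rq->load.weight */
        uint64_t h_load = 1024;         /* busiest_cfs_rq->h_load */

        uint64_t rem_load = h_load_to_group_units(rem_load_move, weight, h_load);
        uint64_t moved_load = rem_load / 2;     /* pretend half of it was pulled */

        printf("ask balance_tasks() for %llu weight units\n",
               (unsigned long long)rem_load);
        printf("moved %llu weight units = %llu h_load units\n",
               (unsigned long long)moved_load,
               (unsigned long long)group_to_h_load_units(moved_load, weight, h_load));
        return 0;
}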
@@ -5300,6 +5342,7 @@ static void set_curr_task_fair(struct rq *rq) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; @@ -5571,7 +5614,6 @@ __init void init_sched_fair_class(void) open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #ifdef CONFIG_NO_HZ - nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); #endif diff --git a/trunk/kernel/sched/rt.c b/trunk/kernel/sched/rt.c index 44af55e6d5d0..f42ae7fb5ec5 100644 --- a/trunk/kernel/sched/rt.c +++ b/trunk/kernel/sched/rt.c @@ -778,9 +778,12 @@ static inline int balance_runtime(struct rt_rq *rt_rq) static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { - int i, idle = 1, throttled = 0; + int i, idle = 1; const struct cpumask *span; + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return 1; + span = sched_rt_period_mask(); for_each_cpu(i, span) { int enqueue = 0; @@ -815,17 +818,12 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (!rt_rq_throttled(rt_rq)) enqueue = 1; } - if (rt_rq->rt_throttled) - throttled = 1; if (enqueue) sched_rt_rq_enqueue(rt_rq); raw_spin_unlock(&rq->lock); } - if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) - return 1; - return idle; } @@ -857,30 +855,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; if (rt_rq->rt_time > runtime) { - struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - - /* - * Don't actually throttle groups that have no runtime assigned - * but accrue some time due to boosting. - */ - if (likely(rt_b->rt_runtime)) { - static bool once = false; - - rt_rq->rt_throttled = 1; - - if (!once) { - once = true; - printk_sched("sched: RT throttling activated\n"); - } - } else { - /* - * In case we did anyway, make it go away, - * replenishment is a joke, since it will replenish us - * with exactly 0 ns. 
- */ - rt_rq->rt_time = 0; - } - + rt_rq->rt_throttled = 1; + printk_once(KERN_WARNING "sched: RT throttling activated\n"); if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; @@ -908,8 +884,7 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); @@ -1428,7 +1403,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) next_idx: if (idx >= MAX_RT_PRIO) continue; - if (next && next->prio <= idx) + if (next && next->prio < idx) continue; list_for_each_entry(rt_se, array->queue + idx, run_list) { struct task_struct *p; @@ -1997,7 +1972,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) if (--p->rt.time_slice) return; - p->rt.time_slice = RR_TIMESLICE; + p->rt.time_slice = DEF_TIMESLICE; /* * Requeue to the end of queue if we are not the only element @@ -2025,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) * Time slice is 0 for SCHED_FIFO tasks */ if (task->policy == SCHED_RR) - return RR_TIMESLICE; + return DEF_TIMESLICE; else return 0; } diff --git a/trunk/kernel/sched/sched.h b/trunk/kernel/sched/sched.h index 753bdd567416..d72483d07c9f 100644 --- a/trunk/kernel/sched/sched.h +++ b/trunk/kernel/sched/sched.h @@ -36,7 +36,11 @@ extern __read_mostly int scheduler_running; /* * These are the 'tuning knobs' of the scheduler: + * + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. */ +#define DEF_TIMESLICE (100 * HZ / 1000) /* * single value that denotes runtime == period, ie unlimited time. @@ -212,6 +216,9 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; + struct list_head tasks; + struct list_head *balance_iterator; + /* * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). 
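DEF_TIMESLICE above is the SCHED_RR quantum expressed in jiffies (100 ms scaled by HZ), and task_tick_rt() in the rt.c hunk burns one unit per tick before requeueing the task. The stand-alone program below merely re-checks that arithmetic for common HZ values; it is illustrative only and not part of the patch.

#include <stdio.h>

/* DEF_TIMESLICE = (100 * HZ / 1000): 100 ms expressed in jiffies. */
static unsigned int def_timeslice(unsigned int hz)
{
        return 100 * hz / 1000;
}

int main(void)
{
        unsigned int hz[] = { 100, 250, 300, 1000 };
        unsigned int i;

        for (i = 0; i < sizeof(hz) / sizeof(hz[0]); i++)
                printf("HZ=%4u -> DEF_TIMESLICE = %3u jiffies (~100 ms per RR slice)\n",
                       hz[i], def_timeslice(hz[i]));
        return 0;
}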
@@ -238,6 +245,11 @@ struct cfs_rq { struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_SMP + /* + * the part of load.weight contributed by tasks + */ + unsigned long task_weight; + /* * h_load = weight * f(tg) * @@ -412,8 +424,6 @@ struct rq { int cpu; int online; - struct list_head cfs_tasks; - u64 rt_avg; u64 age_stamp; u64 idle_stamp; @@ -452,6 +462,7 @@ struct rq { unsigned int yld_count; /* schedule() stats */ + unsigned int sched_switch; unsigned int sched_count; unsigned int sched_goidle; @@ -681,6 +692,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p) #ifndef finish_arch_switch # define finish_arch_switch(prev) do { } while (0) #endif +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif #ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) diff --git a/trunk/kernel/sched/stats.c b/trunk/kernel/sched/stats.c index 903ffa9e8872..2a581ba8e190 100644 --- a/trunk/kernel/sched/stats.c +++ b/trunk/kernel/sched/stats.c @@ -32,9 +32,9 @@ static int show_schedstat(struct seq_file *seq, void *v) /* runqueue-specific stats */ seq_printf(seq, - "cpu%d %u 0 %u %u %u %u %llu %llu %lu", + "cpu%d %u %u %u %u %u %u %llu %llu %lu", cpu, rq->yld_count, - rq->sched_count, rq->sched_goidle, + rq->sched_switch, rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); diff --git a/trunk/kernel/softirq.c b/trunk/kernel/softirq.c index f268369ebe1f..4eb3a0fa351e 100644 --- a/trunk/kernel/softirq.c +++ b/trunk/kernel/softirq.c @@ -353,7 +353,7 @@ void irq_exit(void) tick_nohz_irq_exit(); #endif rcu_irq_exit(); - sched_preempt_enable_no_resched(); + preempt_enable_no_resched(); } /* @@ -744,7 +744,9 @@ static int run_ksoftirqd(void * __bind_cpu) while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { - schedule_preempt_disabled(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); } __set_current_state(TASK_RUNNING); @@ -759,7 +761,7 @@ static int run_ksoftirqd(void * __bind_cpu) if (local_softirq_pending()) __do_softirq(); local_irq_enable(); - sched_preempt_enable_no_resched(); + preempt_enable_no_resched(); cond_resched(); preempt_disable(); rcu_note_context_switch((long)__bind_cpu);
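With the show_schedstat() change above, the per-CPU line carries nine counters after the "cpuN" tag, with sched_switch restored between yld_count and sched_count. The small user-space reader below matches that layout; the sample values are made up and the variable names simply mirror the struct rq members touched by this patch.

#include <stdio.h>

int main(void)
{
        /* Example line in the format emitted above (values made up). */
        const char *line = "cpu0 37 12 48212 1033 22841 17901 91053 66102 531";
        unsigned int cpu, yld_count, sched_switch, sched_count, sched_goidle;
        unsigned int ttwu_count, ttwu_local;
        unsigned long long rq_cpu_time, run_delay;
        unsigned long pcount;

        if (sscanf(line, "cpu%u %u %u %u %u %u %u %llu %llu %lu",
                   &cpu, &yld_count, &sched_switch, &sched_count,
                   &sched_goidle, &ttwu_count, &ttwu_local,
                   &rq_cpu_time, &run_delay, &pcount) == 10)
                printf("cpu%u: yld=%u switch=%u sched=%u goidle=%u\n",
                       cpu, yld_count, sched_switch, sched_count, sched_goidle);
        return 0;
}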
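run_ksoftirqd() above goes back to the open-coded preempt_enable_no_resched(); schedule(); preempt_disable(); sequence. The point of the idiom is that the surrounding loop expects to keep running with preemption disabled, so the code drops the preempt count without taking the resched check, lets schedule() switch away, and restores the caller's state afterwards. The kernel-context sketch below is only a restatement of that sequence as a helper; the helper name is mine and it is not part of this patch.

#include <linux/preempt.h>
#include <linux/sched.h>

/*
 * Kernel-context sketch (not part of this patch): block from a section
 * that normally runs with preemption disabled, without letting the
 * preempt_enable() path trigger an extra reschedule on the way out.
 */
static inline void sleep_with_preempt_disabled(void)
{
        preempt_enable_no_resched();    /* drop the count, skip the resched check */
        schedule();                     /* schedule() performs the context switch */
        preempt_disable();              /* return in the state the caller expects */
}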