Skip to content

Commit

Permalink
[PATCH] Fix longstanding load balancing bug in the scheduler
Browse files Browse the repository at this point in the history
The scheduler will stop load balancing if the most busy processor contains
processes pinned via processor affinity.

The scheduler currently only does one search for busiest cpu.  If it cannot
pull any tasks away from the busiest cpu because they were pinned then the
scheduler goes into a corner and sulks leaving the idle processors idle.

F.e.  If you have processor 0 busy running four tasks pinned via taskset,
there are none on processor 1 and one just started two processes on
processor 2 then the scheduler will not move one of the two processes away
from processor 2.

This patch fixes that issue by forcing the scheduler to come out of its
corner and retrying the load balancing by considering other processors for
load balancing.

This patch was originally developed by John Hawkes and discussed at

    http://marc.theaimsgroup.com/?l=linux-kernel&m=113901368523205&w=2.

I have removed extraneous material and gone back to equipping struct rq
with the cpu the queue is associated with since this makes the patch much
easier and it is likely that others in the future will have the same
difficulty of figuring out which processor owns which runqueue.

The overhead added through these patches is a single word on the stack if
the kernel is configured to support 32 cpus or less (32 bit).  For 32 bit
environments the maximum number of cpus that can be configued is 255 which
would result in the use of 32 bytes additional on the stack.  On IA64 up to
1k cpus can be configured which will result in the use of 128 additional
bytes on the stack.  The maximum additional cache footprint is one
cacheline.  Typically memory use will be much less than a cacheline and the
additional cpumask will be placed on the stack in a cacheline that already
contains other local variable.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: John Hawkes <hawkes@sgi.com>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
  • Loading branch information
Christoph Lameter authored and Linus Torvalds committed Sep 26, 2006
1 parent 656ddf7 commit 0a2966b
Showing 1 changed file with 46 additions and 8 deletions.
54 changes: 46 additions & 8 deletions kernel/sched.c
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ struct rq {
/* For active balancing */
int active_balance;
int push_cpu;
int cpu; /* cpu of this runqueue */

struct task_struct *migration_thread;
struct list_head migration_queue;
Expand Down Expand Up @@ -267,6 +268,15 @@ struct rq {

static DEFINE_PER_CPU(struct rq, runqueues);

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
return rq->cpu;
#else
return 0;
#endif
}

/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
* See detach_destroy_domains: synchronize_sched for details.
Expand Down Expand Up @@ -2211,7 +2221,8 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum idle_type idle, int *sd_idle)
unsigned long *imbalance, enum idle_type idle, int *sd_idle,
cpumask_t *cpus)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
Expand Down Expand Up @@ -2248,7 +2259,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
sum_weighted_load = sum_nr_running = avg_load = 0;

for_each_cpu_mask(i, group->cpumask) {
struct rq *rq = cpu_rq(i);
struct rq *rq;

if (!cpu_isset(i, *cpus))
continue;

rq = cpu_rq(i);

if (*sd_idle && !idle_cpu(i))
*sd_idle = 0;
Expand Down Expand Up @@ -2466,13 +2482,17 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
*/
static struct rq *
find_busiest_queue(struct sched_group *group, enum idle_type idle,
unsigned long imbalance)
unsigned long imbalance, cpumask_t *cpus)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
int i;

for_each_cpu_mask(i, group->cpumask) {

if (!cpu_isset(i, *cpus))
continue;

rq = cpu_rq(i);

if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
Expand Down Expand Up @@ -2511,20 +2531,23 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
cpumask_t cpus = CPU_MASK_ALL;

if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
!sched_smt_power_savings)
sd_idle = 1;

schedstat_inc(sd, lb_cnt[idle]);

group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
redo:
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
&cpus);
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
}

busiest = find_busiest_queue(group, idle, imbalance);
busiest = find_busiest_queue(group, idle, imbalance, &cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
Expand All @@ -2549,8 +2572,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
double_rq_unlock(this_rq, busiest);

/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned))
if (unlikely(all_pinned)) {
cpu_clear(cpu_of(busiest), cpus);
if (!cpus_empty(cpus))
goto redo;
goto out_balanced;
}
}

if (!nr_moved) {
Expand Down Expand Up @@ -2639,18 +2666,22 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
unsigned long imbalance;
int nr_moved = 0;
int sd_idle = 0;
cpumask_t cpus = CPU_MASK_ALL;

if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
sd_idle = 1;

schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
redo:
group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
&sd_idle, &cpus);
if (!group) {
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
goto out_balanced;
}

busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
&cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
goto out_balanced;
Expand All @@ -2668,6 +2699,12 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
minus_1_or_zero(busiest->nr_running),
imbalance, sd, NEWLY_IDLE, NULL);
spin_unlock(&busiest->lock);

if (!nr_moved) {
cpu_clear(cpu_of(busiest), cpus);
if (!cpus_empty(cpus))
goto redo;
}
}

if (!nr_moved) {
Expand Down Expand Up @@ -6747,6 +6784,7 @@ void __init sched_init(void)
rq->cpu_load[j] = 0;
rq->active_balance = 0;
rq->push_cpu = 0;
rq->cpu = i;
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
#endif
Expand Down

0 comments on commit 0a2966b

Please sign in to comment.