sched: fix overload performance: buddy wakeups
Currently we schedule to the leftmost task in the runqueue. When
runtimes are very short because of some server/client ping-pong,
especially in over-saturated workloads, this cycles through all
tasks, thrashing the cache.

Reduce cache thrashing by keeping dependent tasks together: run newly
woken tasks first. However, by not running the leftmost task first we
could starve tasks, because the wakee could otherwise gain unlimited
runtime.

Therefore we only run the wakee if it is within a small
(wakeup_granularity) window of the leftmost task. This preserves
fairness while still alternating between the server/client task
groups.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
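
A minimal user-space sketch of the pick rule (illustrative only, not
the kernel code; buddy_runs_first and its parameters are made-up
names):

#include <stdint.h>

typedef int64_t s64;

/*
 * Nonzero when the newly woken "buddy" may run before the leftmost
 * task: its vruntime must trail the leftmost by at most gran (a
 * negative diff defensively falls back to the leftmost task, as the
 * kernel's pick_next() below does).
 */
static int buddy_runs_first(s64 buddy_vruntime, s64 leftmost_vruntime, s64 gran)
{
	s64 diff = buddy_vruntime - leftmost_vruntime;

	return diff >= 0 && diff <= gran;
}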
Peter Zijlstra authored and Ingo Molnar committed Mar 15, 2008
1 parent 27d1172 commit aa2ac25
Showing 2 changed files with 27 additions and 1 deletion.
kernel/sched.c (2 changes: 1 addition & 1 deletion)
@@ -301,7 +301,7 @@ struct cfs_rq {
 	/*
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr;
+	struct sched_entity *curr, *next;
 
 	unsigned long nr_spread_over;

kernel/sched_fair.c (26 changes: 26 additions & 0 deletions)
@@ -207,6 +207,9 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		}
 	}
 
+	if (cfs_rq->next == se)
+		cfs_rq->next = NULL;
+
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }

@@ -626,12 +629,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
+static struct sched_entity *
+pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	s64 diff, gran;
+
+	if (!cfs_rq->next)
+		return se;
+
+	diff = cfs_rq->next->vruntime - se->vruntime;
+	if (diff < 0)
+		return se;
+
+	gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
+	if (diff > gran)
+		return se;
+
+	return cfs_rq->next;
+}
+
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = NULL;
 
 	if (first_fair(cfs_rq)) {
 		se = __pick_next_entity(cfs_rq);
+		se = pick_next(cfs_rq, se);
 		set_next_entity(cfs_rq, se);
 	}

@@ -1070,6 +1093,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 		resched_task(curr);
 		return;
 	}
+
+	cfs_rq_of(pse)->next = pse;
+
 	/*
 	 * Batch tasks do not preempt (their preemption is driven by
 	 * the tick):
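
To illustrate the claimed behaviour (buddies alternate, fairness
bounded by the window), here is a toy user-space simulation under
simplifying assumptions: three equal-weight tasks, a fixed slice, and
the granularity taken directly in vruntime units instead of via
calc_delta_fair(). All names and numbers are made up:

#include <stdio.h>
#include <stdint.h>

typedef int64_t s64;

struct task { const char *name; s64 vruntime; int pingpong; };

static struct task *leftmost_of(struct task *t, int n)
{
	struct task *min = &t[0];

	for (int i = 1; i < n; i++)
		if (t[i].vruntime < min->vruntime)
			min = &t[i];
	return min;
}

/* mirrors pick_next(): take the buddy only inside the fairness window */
static struct task *pick(struct task *leftmost, struct task *buddy, s64 gran)
{
	s64 diff;

	if (!buddy)
		return leftmost;
	diff = buddy->vruntime - leftmost->vruntime;
	if (diff < 0 || diff > gran)
		return leftmost;
	return buddy;
}

int main(void)
{
	struct task t[3] = {
		{ "server", 0, 1 }, { "client", 1, 1 }, { "batch", 2, 0 },
	};
	struct task *buddy = NULL;
	const s64 slice = 4, gran = 5;

	for (int i = 0; i < 8; i++) {
		struct task *curr = pick(leftmost_of(t, 3), buddy, gran);

		curr->vruntime += slice;
		/* a ping-pong task wakes its peer, which becomes the buddy */
		if (curr->pingpong)
			buddy = (curr == &t[0]) ? &t[1] : &t[0];
		printf("%s ", curr->name);
	}
	printf("\n");
	return 0;
}

This prints "server client server client batch server client batch":
the pair alternates while the wakee stays within the window, and the
batch task gets the CPU as soon as the window would be exceeded, so
nobody starves.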
