sched/fair: Fix rework of find_idlest_group()
The task for which the scheduler looks for the idlest group of CPUs must be
discounted from all statistics in order to get a fair comparison between
groups. This includes utilization, load, nr_running and idle_cpus.
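
For illustration only, here is a minimal standalone C sketch of that discounting; it is not kernel code, and cpu_stats, sub_positive() and stats_without_task() are invented names for this example. The zero-clamped subtraction mirrors what the kernel's lsub_positive() helper does in the patch below, and nr_running is reduced by one only when the task is actually queued on that CPU.

#include <stdio.h>

/* Illustrative per-CPU statistics; not the kernel's real data structures. */
struct cpu_stats {
	unsigned long load;		/* load average */
	unsigned long util;		/* utilization average */
	unsigned int nr_running;	/* queued tasks */
};

/* Subtract b from a, clamping at zero, like the kernel's lsub_positive(). */
static unsigned long sub_positive(unsigned long a, unsigned long b)
{
	return a > b ? a - b : 0;
}

/*
 * Statistics of a CPU as they would look if the waking task's own
 * contribution were not accounted on it.
 */
static struct cpu_stats stats_without_task(struct cpu_stats cpu,
					   unsigned long task_load,
					   unsigned long task_util,
					   int task_queued_here)
{
	cpu.load = sub_positive(cpu.load, task_load);
	cpu.util = sub_positive(cpu.util, task_util);
	if (task_queued_here && cpu.nr_running)
		cpu.nr_running--;
	return cpu;
}

int main(void)
{
	/* A CPU whose only activity is the waking task itself. */
	struct cpu_stats raw = { .load = 512, .util = 300, .nr_running = 1 };
	struct cpu_stats adj = stats_without_task(raw, 512, 300, 1);

	printf("raw:        load=%lu util=%lu nr_running=%u\n",
	       raw.load, raw.util, raw.nr_running);
	printf("discounted: load=%lu util=%lu nr_running=%u (idle)\n",
	       adj.load, adj.util, adj.nr_running);
	return 0;
}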

Such unfairness can easily be highlighted with the unixbench 'execl 1' test.
This test continuously calls execve() and the scheduler looks for the idlest
group/CPU on which to place the task. Because the task runs on the local
group/CPU, the latter looks busy even when nothing else is running on it. As a
result, the scheduler always selects a group/CPU other than the local one.
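
A toy comparison with made-up numbers (not the actual find_idlest_group() logic, which also weighs load, utilization, spare capacity and idle CPUs) shows the effect: with raw statistics the local CPU never wins, while with the waking task discounted it can.

#include <stdio.h>

/*
 * Hypothetical numbers for the execl case: the exec'ing task is the only
 * thing on the local CPU, and a remote CPU in another group is idle.
 */
int main(void)
{
	unsigned int local_nr_running = 1;	/* the waking task itself */
	unsigned int remote_nr_running = 0;
	unsigned int task_on_local = 1;		/* task is queued locally */

	/* Raw statistics: local looks busier, so the remote CPU wins. */
	printf("raw pick:        %s\n",
	       local_nr_running <= remote_nr_running ? "local" : "remote");

	/* Discounted statistics: local is effectively idle and can win. */
	printf("discounted pick: %s\n",
	       (local_nr_running - task_on_local) <= remote_nr_running ?
	       "local" : "remote");
	return 0;
}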

This recovers most of the performance regression on my system from the
recent load-balancer rewrite.

[ mingo: Minor cleanups. ]

Reported-by: kernel test robot <rong.a.chen@intel.com>
Tested-by: kernel test robot <rong.a.chen@intel.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten.Rasmussen@arm.com
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dietmar.eggemann@arm.com
Cc: hdanton@sina.com
Cc: parth@linux.ibm.com
Cc: pauld@redhat.com
Cc: quentin.perret@arm.com
Cc: riel@surriel.com
Cc: srikar@linux.vnet.ibm.com
Cc: valentin.schneider@arm.com
Fixes: 57abff0 ("sched/fair: Rework find_idlest_group()")
Link: https://lkml.kernel.org/r/1571762798-25900-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Vincent Guittot authored and Ingo Molnar committed Nov 18, 2019
1 parent 7763baa commit 3318544
Showing 1 changed file with 84 additions and 7 deletions.
kernel/sched/fair.c
@@ -5391,6 +5391,37 @@ static unsigned long cpu_load(struct rq *rq)
	return cfs_rq_load_avg(&rq->cfs);
}

+/*
+ * cpu_load_without - compute CPU load without any contributions from *p
+ * @cpu: the CPU which load is requested
+ * @p: the task which load should be discounted
+ *
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
+ * CPU as well as tasks which are currently sleeping after an execution on that
+ * CPU.
+ *
+ * This method returns the load of the specified CPU by discounting the load of
+ * the specified task, whenever the task is currently contributing to the CPU
+ * load.
+ */
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq;
+	unsigned int load;

+	/* Task has no contribution or is new */
+	if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+		return cpu_load(rq);

+	cfs_rq = &rq->cfs;
+	load = READ_ONCE(cfs_rq->avg.load_avg);

+	/* Discount task's load from CPU's load */
+	lsub_positive(&load, task_h_load(p));

+	return load;
+}

static unsigned long capacity_of(int cpu)
{
	return cpu_rq(cpu)->cpu_capacity;
@@ -8141,11 +8172,56 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)

struct sg_lb_stats;

+/*
+ * task_running_on_cpu - return 1 if @p is running on @cpu.
+ */

+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
+{
+	/* Task has no contribution or is new */
+	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+		return 0;

+	if (task_on_rq_queued(p))
+		return 1;

+	return 0;
+}

+/**
+ * idle_cpu_without - would a given CPU be idle without p ?
+ * @cpu: the processor on which idleness is tested.
+ * @p: task which should be ignored.
+ *
+ * Return: 1 if the CPU would be idle. 0 otherwise.
+ */
+static int idle_cpu_without(int cpu, struct task_struct *p)
+{
+	struct rq *rq = cpu_rq(cpu);

+	if (rq->curr != rq->idle && rq->curr != p)
+		return 0;

+	/*
+	 * rq->nr_running can't be used but an updated version without the
+	 * impact of p on cpu must be used instead. The updated nr_running
+	 * must be computed and tested before calling idle_cpu_without().
+	 */

+#ifdef CONFIG_SMP
+	if (!llist_empty(&rq->wake_list))
+		return 0;
+#endif

+	return 1;
+}

/*
 * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
- * @denv: The ched_domain level to look for idlest group.
+ * @sd: The sched_domain level to look for idlest group.
 * @group: sched_group whose statistics are to be updated.
 * @sgs: variable to hold the statistics for this group.
+ * @p: The task for which we look for the idlest group/CPU.
 */
static inline void update_sg_wakeup_stats(struct sched_domain *sd,
					  struct sched_group *group,
@@ -8158,21 +8234,22 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,

	for_each_cpu(i, sched_group_span(group)) {
		struct rq *rq = cpu_rq(i);
+		unsigned int local;

-		sgs->group_load += cpu_load(rq);
+		sgs->group_load += cpu_load_without(rq, p);
		sgs->group_util += cpu_util_without(i, p);
-		sgs->sum_h_nr_running += rq->cfs.h_nr_running;
+		local = task_running_on_cpu(i, p);
+		sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;

-		nr_running = rq->nr_running;
+		nr_running = rq->nr_running - local;
		sgs->sum_nr_running += nr_running;

		/*
-		 * No need to call idle_cpu() if nr_running is not 0
+		 * No need to call idle_cpu_without() if nr_running is not 0
		 */
-		if (!nr_running && idle_cpu(i))
+		if (!nr_running && idle_cpu_without(i, p))
			sgs->idle_cpus++;

-
	}

	/* Check if task fits in the group */