Skip to content

Commit

Permalink
sched/numa: Replace runnable_load_avg by load_avg
Browse files Browse the repository at this point in the history
Similarly to what has been done for the normal load balancer, we can
replace runnable_load_avg by load_avg in numa load balancing and track the
other statistics like the utilization and the number of running tasks to
get a better view of the current state of a node.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: "Dietmar Eggemann <dietmar.eggemann@arm.com>"
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Hillf Danton <hdanton@sina.com>
Link: https://lore.kernel.org/r/20200224095223.13361-6-mgorman@techsingularity.net
  • Loading branch information
Vincent Guittot authored and Ingo Molnar committed Feb 24, 2020
1 parent 6d4d224 commit 6499b1b
Showing 1 changed file with 70 additions and 32 deletions.
102 changes: 70 additions & 32 deletions kernel/sched/fair.c
Original file line number Diff line number Diff line change
Expand Up @@ -1473,38 +1473,35 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}

static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);

/* Report the runnable load average of the CFS runqueue embedded in @rq. */
static unsigned long cpu_runnable_load(struct rq *rq)
{
	struct cfs_rq *cfs = &rq->cfs;

	return cfs_rq_runnable_load_avg(cfs);
}
/*
 * 'numa_type' classifies how busy a node is at the moment of load
 * balancing. The values grow with how loaded the node is.
 */
enum numa_type {
	/* Spare capacity is left on the node; it can run more tasks. */
	node_has_spare = 0,
	/*
	 * Every CPU cycle of the node is used, yet tasks are not competing
	 * for more. Some tasks might still wait before running.
	 */
	node_fully_busy = 1,
	/*
	 * More CPU cycles are wanted than the node can provide, so not all
	 * tasks get what they expect.
	 */
	node_overloaded = 2
};

/*
 * Cached statistics for all CPUs within a node.
 *
 * Filled in by update_numa_stats(), which zeroes the structure and then
 * sums the per-CPU values over every CPU of the node.
 */
struct numa_stats {
	/* Sum of cpu_load() over the node's runqueues */
	unsigned long load;

	/* Sum of cpu_util() over the node's CPUs */
	unsigned long util;
	/* Total compute capacity of CPUs on a node */
	unsigned long compute_capacity;
	/* Sum of rq->cfs.h_nr_running over the node's runqueues */
	unsigned int nr_running;
	/* cpumask_weight() of the node's CPU mask */
	unsigned int weight;
	/* Classification computed by numa_classify() from the fields above */
	enum numa_type node_type;
};

/*
 * XXX borrowed from update_sg_lb_stats
 *
 * Reset @ns and accumulate, over every CPU of node @nid, the runnable
 * load and the compute capacity.
 */
static void update_numa_stats(struct numa_stats *ns, int nid)
{
	int cpu;

	/* Start from zero so the sums below only reflect this node. */
	memset(ns, 0, sizeof(*ns));

	for_each_cpu(cpu, cpumask_of_node(nid)) {
		ns->compute_capacity += capacity_of(cpu);
		ns->load += cpu_runnable_load(cpu_rq(cpu));
	}
}

struct task_numa_env {
struct task_struct *p;

Expand All @@ -1521,6 +1518,47 @@ struct task_numa_env {
int best_cpu;
};

/*
 * Forward declarations for helpers used by update_numa_stats() below;
 * their definitions are not part of this hunk.
 */
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_util(int cpu);

/*
 * Classify a node from its cached statistics.
 *
 * @imbalance_pct: percentage applied to the node utilization before it is
 *                 compared against the compute capacity scaled by 100.
 * @ns:            node statistics; nr_running, weight, util and
 *                 compute_capacity are read, nothing is written.
 *
 * Returns node_overloaded when there are more running tasks than
 * ns->weight and the scaled capacity is below the scaled utilization,
 * node_has_spare when there are fewer running tasks than ns->weight or
 * the scaled capacity exceeds the scaled utilization, and
 * node_fully_busy otherwise.
 */
static inline enum numa_type
numa_classify(unsigned int imbalance_pct, struct numa_stats *ns)
{
	unsigned long scaled_capacity = ns->compute_capacity * 100;
	unsigned long scaled_util = ns->util * imbalance_pct;

	if (ns->nr_running > ns->weight && scaled_capacity < scaled_util)
		return node_overloaded;

	if (ns->nr_running < ns->weight || scaled_capacity > scaled_util)
		return node_has_spare;

	return node_fully_busy;
}

/*
 * XXX borrowed from update_sg_lb_stats
 *
 * Rebuild the cached statistics of node @nid in @ns: per-CPU load,
 * utilization, number of running CFS tasks and capacity are summed over
 * the node, then the node is classified using @env->imbalance_pct.
 */
static void update_numa_stats(struct task_numa_env *env,
			      struct numa_stats *ns, int nid)
{
	int cpu;

	/* Every field is an accumulator or is assigned below; start at zero. */
	memset(ns, 0, sizeof(*ns));

	for_each_cpu(cpu, cpumask_of_node(nid)) {
		struct rq *rq = cpu_rq(cpu);

		ns->compute_capacity += capacity_of(cpu);
		ns->nr_running += rq->cfs.h_nr_running;
		ns->util += cpu_util(cpu);
		ns->load += cpu_load(rq);
	}

	ns->weight = cpumask_weight(cpumask_of_node(nid));

	/* Classification needs the completed sums and the weight. */
	ns->node_type = numa_classify(env->imbalance_pct, ns);
}

static void task_numa_assign(struct task_numa_env *env,
struct task_struct *p, long imp)
{
Expand Down Expand Up @@ -1556,6 +1594,11 @@ static bool load_too_imbalanced(long src_load, long dst_load,
long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;


/* If dst node has spare capacity, there is no real load imbalance */
if (env->dst_stats.node_type == node_has_spare)
return false;

/*
* The load is corrected for the CPU capacity available on each node.
*
Expand Down Expand Up @@ -1788,10 +1831,10 @@ static int task_numa_migrate(struct task_struct *p)
dist = env.dist = node_distance(env.src_nid, env.dst_nid);
taskweight = task_weight(p, env.src_nid, dist);
groupweight = group_weight(p, env.src_nid, dist);
update_numa_stats(&env.src_stats, env.src_nid);
update_numa_stats(&env, &env.src_stats, env.src_nid);
taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
update_numa_stats(&env, &env.dst_stats, env.dst_nid);

/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
Expand Down Expand Up @@ -1824,7 +1867,7 @@ static int task_numa_migrate(struct task_struct *p)

env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
update_numa_stats(&env, &env.dst_stats, env.dst_nid);
task_numa_find_cpu(&env, taskimp, groupimp);
}
}
Expand Down Expand Up @@ -3686,11 +3729,6 @@ static void remove_entity_load_avg(struct sched_entity *se)
raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}

/* Read the runnable load average stored in @cfs_rq's 'avg' member. */
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
	unsigned long runnable_load = cfs_rq->avg.runnable_load_avg;

	return runnable_load;
}

static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
{
return cfs_rq->avg.load_avg;
Expand Down

0 comments on commit 6499b1b

Please sign in to comment.